/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 filters: add 2 before shifting right by 2.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
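
// The kShuf*/kMadd* pairs above implement the 3/4 horizontal filter: for
// every 4 source pixels s0..s3, pshufb gathers the (s0,s1) (s1,s2) (s2,s3)
// pairs and pmaddubsw weights them 3:1, 2:2 and 1:3, so the 3 outputs are
//   (3*s0 + s1 + 2) >> 2, (2*s1 + 2*s2 + 2) >> 2, (s2 + 3*s3 + 2) >> 2,
// with kRound34 supplying the +2.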

static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                               6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                               6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0, 0};
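
// The kScale* constants divide box sums by pixel count via pmulhuw:
// sum * (65536 / N) >> 16 is approximately sum / N. It can come out one
// low because 65536 / N is truncated, e.g. 9 * (65536 / 9) >> 16 == 0.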

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      ::"memory", "cc", "xmm0", "xmm1");
}
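
// A scalar sketch of the routine above (hypothetical name, intentionally
// not compiled): point-sample the odd byte of each source pair, which is
// what the psrlw $8 + packuswb sequence selects.
#if 0
static void ScaleRowDown2_C_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}
#endif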

void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
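
// A scalar sketch (hypothetical name, not compiled): xmm4 holds all-ones
// byte weights, so pmaddubsw forms src[2x] + src[2x+1], and pavgw against
// zero rounds the halving, i.e. a rounded 2-tap average.
#if 0
static void ScaleRowDown2Linear_C_Sketch(const uint8_t* src_ptr,
                                         uint8_t* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}
#endif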

void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "psrlw $0x1,%%xmm0 \n"
      "psrlw $0x1,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
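
// A scalar sketch (hypothetical name, not compiled): the psrlw $1 then
// pavgw-with-zero pair computes ((sum >> 1) + 1) >> 1, which equals the
// rounded 2x2 box average (sum + 2) >> 2 for any sum of four bytes.
#if 0
static void ScaleRowDown2Box_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride, uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + t[2 * x] +
                            t[2 * x + 1] + 2) >> 2);
  }
}
#endif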

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      ::"memory", "cc", "xmm0", "xmm1");
}

void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
      "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrld $0x18,%%xmm5 \n"
      "pslld $0x10,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm5");
}
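
// The mask built above keeps byte 2 of every source dword, so the net
// effect is dst_ptr[x] = src_ptr[4 * x + 2]: a point sample near the
// center of each group of four pixels.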

void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "psllw $0x3,%%xmm5 \n"
      "lea 0x00(%4,%4,2),%3 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "movdqu 0x00(%0,%4,2),%%xmm2 \n"
      "movdqu 0x10(%0,%4,2),%%xmm3 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "phaddw %%xmm1,%%xmm0 \n"
      "paddw %%xmm5,%%xmm0 \n"
      "psrlw $0x4,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width), // %2
        "=&r"(stridex3)  // %3
      : "r"((intptr_t)(src_stride)) // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
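
// A scalar sketch (hypothetical name, not compiled): pmaddubsw forms
// horizontal pair sums, four rows are accumulated, phaddw merges the pair
// sums into 4x4 box sums, and xmm5 (all 8s) supplies the rounding bias.
#if 0
static void ScaleRowDown4Box_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride, uint8_t* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_ptr + 4 * x;
    int sum = 0;
    int i, j;
    for (j = 0; j < 4; ++j) {
      for (i = 0; i < 4; ++i) {
        sum += s[(ptrdiff_t)j * src_stride + i];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}
#endif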

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrld $0x18,%%ymm5,%%ymm5 \n"
      "vpslld $0x10,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpsllw $0x3,%%ymm4,%%ymm5 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
      "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
      "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
      "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)),    // %3
        "r"((intptr_t)(src_stride * 3)) // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa %0,%%xmm3 \n"
      "movdqa %1,%%xmm4 \n"
      "movdqa %2,%%xmm5 \n"
      :
      : "m"(kShuf0), // %0
        "m"(kShuf1), // %1
        "m"(kShuf2)  // %2
  );
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm2 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "palignr $0x8,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
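
// A scalar sketch (hypothetical name, not compiled): the kShuf* tables
// drop every fourth byte, so each group of 4 source pixels yields 3
// point-sampled outputs.
#if 0
static void ScaleRowDown34_C_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}
#endif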

void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n" // kShuf01
      "movdqa %1,%%xmm3 \n" // kShuf11
      "movdqa %2,%%xmm4 \n" // kShuf21
      :
      : "m"(kShuf01), // %0
        "m"(kShuf11), // %1
        "m"(kShuf21)  // %2
  );
  asm volatile(
      "movdqa %0,%%xmm5 \n" // kMadd01
      "movdqa %1,%%xmm0 \n" // kMadd11
      "movdqa %2,%%xmm1 \n" // kRound34
      :
      : "m"(kMadd01), // %0
        "m"(kMadd11), // %1
        "m"(kRound34) // %2
  );
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x00(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm5,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,(%1) \n"
      "movdqu 0x8(%0),%%xmm6 \n"
      "movdqu 0x8(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "pmaddubsw %%xmm0,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x8(%1) \n"
      "movdqu 0x10(%0),%%xmm6 \n"
      "movdqu 0x10(%0,%3,1),%%xmm7 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "pmaddubsw %4,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)), // %3
        "m"(kMadd21)                 // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
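
// A scalar sketch (hypothetical name, not compiled): the two rows are
// first blended 1:1 with pavgb, then the 3/4 horizontal filter from the
// kShuf*/kMadd* tables is applied with kRound34 rounding.
#if 0
static void ScaleRowDown34_1_Box_C_Sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr, int dst_width) {
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; x += 3) {
    uint8_t a0 = (uint8_t)((src_ptr[0] + t[0] + 1) >> 1);
    uint8_t a1 = (uint8_t)((src_ptr[1] + t[1] + 1) >> 1);
    uint8_t a2 = (uint8_t)((src_ptr[2] + t[2] + 1) >> 1);
    uint8_t a3 = (uint8_t)((src_ptr[3] + t[3] + 1) >> 1);
    dst_ptr[0] = (uint8_t)((3 * a0 + a1 + 2) >> 2);
    dst_ptr[1] = (uint8_t)((2 * a1 + 2 * a2 + 2) >> 2);
    dst_ptr[2] = (uint8_t)((a2 + 3 * a3 + 2) >> 2);
    dst_ptr += 3;
    src_ptr += 4;
    t += 4;
  }
}
#endif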

void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n" // kShuf01
      "movdqa %1,%%xmm3 \n" // kShuf11
      "movdqa %2,%%xmm4 \n" // kShuf21
      :
      : "m"(kShuf01), // %0
        "m"(kShuf11), // %1
        "m"(kShuf21)  // %2
  );
  asm volatile(
      "movdqa %0,%%xmm5 \n" // kMadd01
      "movdqa %1,%%xmm0 \n" // kMadd11
      "movdqa %2,%%xmm1 \n" // kRound34
      :
      : "m"(kMadd01), // %0
        "m"(kMadd11), // %1
        "m"(kRound34) // %2
  );

  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x00(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm5,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,(%1) \n"
      "movdqu 0x8(%0),%%xmm6 \n"
      "movdqu 0x8(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "pmaddubsw %%xmm0,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x8(%1) \n"
      "movdqu 0x10(%0),%%xmm6 \n"
      "movdqu 0x10(%0,%3,1),%%xmm7 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "pmaddubsw %4,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)), // %3
        "m"(kMadd21)                 // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
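
// Note the double pavgb above: t = avg(row0, row1) followed by
// avg(row0, t) weights the two rows roughly 3:1 toward row0, which is the
// vertical phase this _0_Box variant needs; the _1_Box variant above uses
// a single 1:1 pavgb instead.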

void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movhlps %%xmm0,%%xmm1 \n"
      "movd %%xmm1,0x8(%1) \n"
      "lea 0xc(%1),%1 \n"
      "sub $0xc,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "m"(kShuf38a), // %3
        "m"(kShuf38b)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
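
// A scalar sketch (hypothetical name, not compiled): kShuf38a/kShuf38b
// point-sample 3 of every 8 source bytes.
#if 0
static void ScaleRowDown38_C_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr += 3;
    src_ptr += 8;
  }
}
#endif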

void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"
      "movdqa %1,%%xmm3 \n"
      "movdqa %2,%%xmm4 \n"
      "movdqa %3,%%xmm5 \n"
      :
      : "m"(kShufAb0), // %0
        "m"(kShufAb1), // %1
        "m"(kShufAb2), // %2
        "m"(kScaleAb2) // %3
  );
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%3,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "pavgb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "paddusw %%xmm6,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "paddusw %%xmm0,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movd %%xmm1,(%1) \n"
      "psrlq $0x10,%%xmm1 \n"
      "movd %%xmm1,0x2(%1) \n"
      "lea 0x6(%1),%1 \n"
      "sub $0x6,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
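
// Per 8 source columns the routine above produces 3 outputs from the
// two-row pavgb averages a[0..7]: approximately (a[0]+a[1]+a[2]) / 3,
// (a[3]+a[4]+a[5]) / 3 and (a[6]+a[7]) / 2, using the truncating pmulhuw
// division by kScaleAb2.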

void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"
      "movdqa %1,%%xmm3 \n"
      "movdqa %2,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      :
      : "m"(kShufAc),   // %0
        "m"(kShufAc3),  // %1
        "m"(kScaleAc33) // %2
  );
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%3,1),%%xmm6 \n"
      "movhlps %%xmm0,%%xmm1 \n"
      "movhlps %%xmm6,%%xmm7 \n"
      "punpcklbw %%xmm5,%%xmm0 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm6 \n"
      "punpcklbw %%xmm5,%%xmm7 \n"
      "paddusw %%xmm6,%%xmm0 \n"
      "paddusw %%xmm7,%%xmm1 \n"
      "movdqu 0x00(%0,%3,2),%%xmm6 \n"
      "lea 0x10(%0),%0 \n"
      "movhlps %%xmm6,%%xmm7 \n"
      "punpcklbw %%xmm5,%%xmm6 \n"
      "punpcklbw %%xmm5,%%xmm7 \n"
      "paddusw %%xmm6,%%xmm0 \n"
      "paddusw %%xmm7,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm6 \n"
      "psrldq $0x2,%%xmm0 \n"
      "paddusw %%xmm0,%%xmm6 \n"
      "psrldq $0x2,%%xmm0 \n"
      "paddusw %%xmm0,%%xmm6 \n"
      "pshufb %%xmm2,%%xmm6 \n"
      "movdqa %%xmm1,%%xmm7 \n"
      "psrldq $0x2,%%xmm1 \n"
      "paddusw %%xmm1,%%xmm7 \n"
      "psrldq $0x2,%%xmm1 \n"
      "paddusw %%xmm1,%%xmm7 \n"
      "pshufb %%xmm3,%%xmm7 \n"
      "paddusw %%xmm7,%%xmm6 \n"
      "pmulhuw %%xmm4,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movd %%xmm6,(%1) \n"
      "psrlq $0x10,%%xmm6 \n"
      "movd %%xmm6,0x2(%1) \n"
      "lea 0x6(%1),%1 \n"
      "sub $0x6,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
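
// Here three rows are summed, then groups of 3 (or 2) columns are summed
// horizontally, so each group of 8 source columns yields two 3x3 box
// means (divide by 9) and one 3x2 box mean (divide by 6), via the
// kScaleAc33 fixed-point reciprocals.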

static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
                                        10, 11, 8, 9, 14, 15, 12, 13};

static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
                                    3, 1, 1, 3, 3, 1, 1, 3};
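
// kLinearShuffleFar swaps the two 16-bit samples of each near/far pair so
// the "far" sample lines up with its "near" partner; kLinearMadd31 holds
// the byte-wise 3:1 near/far weights for the pmaddubsw-based 2x linear
// upsamplers.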

#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "pxor %%xmm0,%%xmm0 \n" // 0
      "pcmpeqw %%xmm6,%%xmm6 \n"
      "psrlw $15,%%xmm6 \n"
      "psllw $1,%%xmm6 \n" // all 2

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm1 \n" // 01234567
      "movq 1(%0),%%xmm2 \n" // 12345678
      "movdqa %%xmm1,%%xmm3 \n"
      "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
      "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
      "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
      "movdqa %%xmm1,%%xmm4 \n"
      "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
      "movdqa %%xmm2,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
      "paddw %%xmm5,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "paddw %%xmm6,%%xmm4 \n"
      "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
      "paddw %%xmm5,%%xmm5 \n"
      "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
      "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)

      "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
      "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
      "paddw %%xmm2,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
      "paddw %%xmm6,%%xmm1 \n"
      "paddw %%xmm3,%%xmm3 \n"
      "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
      "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)

      "packuswb %%xmm1,%%xmm5 \n"
      "movdqu %%xmm5,(%1) \n"

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n" // 8 samples to 16 samples
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
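
// A scalar sketch of the 2x linear kernel (hypothetical name, not
// compiled): every source pixel expands to two outputs weighted 3:1
// toward the nearer source sample, with round-to-nearest.
#if 0
static void ScaleRowUp2_Linear_C_Sketch(const uint8_t* src_ptr,
                                        uint8_t* dst_ptr, int dst_width) {
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[2 * x + 0] = (uint8_t)((3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2);
    dst_ptr[2 * x + 1] = (uint8_t)((src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2);
  }
}
#endif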

#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      LABELALIGN
      "1: \n"
      "pxor %%xmm0,%%xmm0 \n" // 0
      // above line
      "movq (%0),%%xmm1 \n" // 01234567
      "movq 1(%0),%%xmm2 \n" // 12345678
      "movdqa %%xmm1,%%xmm3 \n"
      "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
      "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
      "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788

      "movdqa %%xmm1,%%xmm4 \n"
      "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
      "movdqa %%xmm2,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
      "paddw %%xmm5,%%xmm4 \n" // near+far
      "movdqa %%xmm3,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
      "paddw %%xmm5,%%xmm5 \n" // 2*near
      "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)

      "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
      "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
      "paddw %%xmm2,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
      "paddw %%xmm3,%%xmm3 \n" // 2*near
      "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)

      // below line
      "movq (%0,%3),%%xmm6 \n" // 01234567
      "movq 1(%0,%3),%%xmm2 \n" // 12345678
      "movdqa %%xmm6,%%xmm3 \n"
      "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
      "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
      "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788

      "movdqa %%xmm6,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
      "movdqa %%xmm2,%%xmm7 \n"
      "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
      "paddw %%xmm7,%%xmm5 \n" // near+far
      "movdqa %%xmm3,%%xmm7 \n"
      "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
      "paddw %%xmm7,%%xmm7 \n" // 2*near
      "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)

      "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
      "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
      "paddw %%xmm6,%%xmm2 \n" // near+far
      "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
      "paddw %%xmm3,%%xmm3 \n" // 2*near
      "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)

      // xmm4 xmm1
      // xmm5 xmm2
      "pcmpeqw %%xmm0,%%xmm0 \n"
      "psrlw $15,%%xmm0 \n"
      "psllw $3,%%xmm0 \n" // all 8

      "movdqa %%xmm4,%%xmm3 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo)
      "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
      "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
      "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
      "psrlw $4,%%xmm3 \n" // ^ div by 16

      "movdqa %%xmm1,%%xmm7 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi)
      "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
      "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
      "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
      "psrlw $4,%%xmm7 \n" // ^ div by 16

      "packuswb %%xmm7,%%xmm3 \n"
      "movdqu %%xmm3,(%1) \n" // save above line

      "movdqa %%xmm5,%%xmm3 \n"
      "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
      "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo)
      "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
      "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
      "psrlw $4,%%xmm5 \n" // ^ div by 16

      "movdqa %%xmm2,%%xmm3 \n"
      "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
      "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi)
      "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
      "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
      "psrlw $4,%%xmm2 \n" // ^ div by 16

      "packuswb %%xmm2,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4) \n" // save below line

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n" // 8 samples to 16 samples
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)), // %3
        "r"((intptr_t)(dst_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
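
// A scalar sketch of the 2x bilinear kernel (hypothetical name, not
// compiled): for source rows s and t, each 2x2 output block is the
// 9:3:3:1 blend of the four surrounding source pixels, rounded by +8 then
// >> 4.
#if 0
static void ScaleRowUp2_Bilinear_C_Sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          ptrdiff_t dst_stride,
                                          int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  uint8_t* d = dst_ptr;
  uint8_t* e = dst_ptr + dst_stride;
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    d[2 * x + 0] =
        (uint8_t)((9 * s[x] + 3 * s[x + 1] + 3 * t[x] + t[x + 1] + 8) >> 4);
    d[2 * x + 1] =
        (uint8_t)((3 * s[x] + 9 * s[x + 1] + t[x] + 3 * t[x + 1] + 8) >> 4);
    e[2 * x + 0] =
        (uint8_t)((3 * s[x] + s[x + 1] + 9 * t[x] + 3 * t[x + 1] + 8) >> 4);
    e[2 * x + 1] =
        (uint8_t)((s[x] + 3 * s[x + 1] + 3 * t[x] + 9 * t[x + 1] + 8) >> 4);
  }
}
#endif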

#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
  asm volatile(
      "movdqa %3,%%xmm5 \n"
      "pcmpeqw %%xmm4,%%xmm4 \n"
      "psrlw $15,%%xmm4 \n"
      "psllw $1,%%xmm4 \n" // all 2

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n" // 01234567 (16)
      "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)

      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
      "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)

      "movdqa %%xmm2,%%xmm3 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far)
      "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far)

      "paddw %%xmm4,%%xmm1 \n" // far+2
      "paddw %%xmm4,%%xmm3 \n" // far+2
      "paddw %%xmm0,%%xmm1 \n" // near+far+2
      "paddw %%xmm2,%%xmm3 \n" // near+far+2
      "paddw %%xmm0,%%xmm0 \n" // 2*near
      "paddw %%xmm2,%%xmm2 \n" // 2*near
      "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo)
      "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi)

      "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far
      "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm2,16(%1) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n" // 8 samples to 16 samples
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "m"(kLinearShuffleFar) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif

#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint16_t* dst_ptr,
                                   ptrdiff_t dst_stride,
                                   int dst_width) {
  asm volatile(
      "pcmpeqw %%xmm7,%%xmm7 \n"
      "psrlw $15,%%xmm7 \n"
      "psllw $3,%%xmm7 \n" // all 8
      "movdqa %5,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      // above line
      "movdqu (%0),%%xmm0 \n" // 01234567 (16)
      "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
      "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
      "movdqa %%xmm2,%%xmm3 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far)
      "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far)
      "paddw %%xmm0,%%xmm1 \n" // near+far
      "paddw %%xmm2,%%xmm3 \n" // near+far
      "paddw %%xmm0,%%xmm0 \n" // 2*near
      "paddw %%xmm2,%%xmm2 \n" // 2*near
      "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo)
      "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi)

      // below line
      "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16)
      "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16)
      "movdqa %%xmm1,%%xmm3 \n"
      "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16)
      "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16)
      "movdqa %%xmm3,%%xmm5 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far)
      "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far)
      "paddw %%xmm1,%%xmm4 \n" // near+far
      "paddw %%xmm3,%%xmm5 \n" // near+far
      "paddw %%xmm1,%%xmm1 \n" // 2*near
      "paddw %%xmm3,%%xmm3 \n" // 2*near
      "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo)
      "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)

      // xmm0 xmm2
      // xmm1 xmm3

      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo)
      "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo)
      "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
      "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
      "psrlw $4,%%xmm4 \n" // ^ div by 16
      "movdqu %%xmm4,(%1) \n"

      "movdqa %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi)
      "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi)
      "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi)
      "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi)
      "psrlw $4,%%xmm4 \n" // ^ div by 16
      "movdqu %%xmm4,0x10(%1) \n"

      "movdqa %%xmm1,%%xmm4 \n"
      "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo)
      "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo)
      "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo)
      "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo)
      "psrlw $4,%%xmm1 \n" // ^ div by 16
      "movdqu %%xmm1,(%1,%4,2) \n"

      "movdqa %%xmm3,%%xmm4 \n"
      "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi)
      "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi)
      "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi)
      "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi)
      "psrlw $4,%%xmm3 \n" // ^ div by 16
      "movdqu %%xmm3,0x10(%1,%4,2) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n" // 8 samples to 16 samples
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)), // %3
        "r"((intptr_t)(dst_stride)), // %4
        "m"(kLinearShuffleFar)       // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
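
// The _12 variants can stay in 16-bit lanes throughout: with at most 12
// significant bits per sample, the largest intermediate (9*near + 3*far +
// 8, at most 12 * 4095 + 8) still fits in a uint16_t. The _16 variants
// below widen to 32 bits instead.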

#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "pxor %%xmm5,%%xmm5 \n"
      "pcmpeqd %%xmm4,%%xmm4 \n"
      "psrld $31,%%xmm4 \n"
      "pslld $1,%%xmm4 \n" // all 2

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n" // 0123 (16b)
      "movq 2(%0),%%xmm1 \n" // 1234 (16b)

      "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b)
      "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b)

      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"

      "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
      "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)

      "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
      "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
      "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
      "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
      "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
      "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
      "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
      "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)

      "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
      "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
      "packssdw %%xmm1,%%xmm0 \n"
      "pshufd $0b11011000,%%xmm0,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n" // 4 pixels to 8 pixels
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),  // %0
        "+r"(dst_ptr),  // %1
        "+r"(dst_width) // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
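
// Lane bookkeeping for the 16-bit path above: after the adds, xmm0 holds
// output samples 0,1,4,5 and xmm1 holds 2,3,6,7 (as dwords); packssdw
// merges them to words and the final pshufd $0b11011000 restores
// sequential order before the store.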
1152*4e366538SXin Li
1153*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
ScaleRowUp2_Bilinear_16_SSE2(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst_ptr,ptrdiff_t dst_stride,int dst_width)1154*4e366538SXin Li void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1155*4e366538SXin Li ptrdiff_t src_stride,
1156*4e366538SXin Li uint16_t* dst_ptr,
1157*4e366538SXin Li ptrdiff_t dst_stride,
1158*4e366538SXin Li int dst_width) {
1159*4e366538SXin Li asm volatile(
1160*4e366538SXin Li "pxor %%xmm7,%%xmm7 \n"
1161*4e366538SXin Li "pcmpeqd %%xmm6,%%xmm6 \n"
1162*4e366538SXin Li "psrld $31,%%xmm6 \n"
1163*4e366538SXin Li "pslld $3,%%xmm6 \n" // all 8
1164*4e366538SXin Li
1165*4e366538SXin Li LABELALIGN
1166*4e366538SXin Li "1: \n"
      "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
      "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
      "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
      "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
      "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
      "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
      "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
      "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
      "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
      "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
      "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
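      // Note: the xmm0..xmm3 results computed above are overwritten by the
      // reloads below before they are ever used, so this first block does
      // not affect the output.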

      "movq (%0),%%xmm0 \n" // 0123 (16b)
      "movq 2(%0),%%xmm1 \n" // 1234 (16b)
      "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b)
      "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b)
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
      "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
      "paddd %%xmm0,%%xmm2 \n" // near+far (lo)
      "paddd %%xmm1,%%xmm3 \n" // near+far (hi)
      "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
      "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
      "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
      "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)

      "movq (%0,%3,2),%%xmm2 \n"
      "movq 2(%0,%3,2),%%xmm3 \n"
      "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b)
      "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b)
      "movdqa %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far)
      "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far)
      "paddd %%xmm2,%%xmm4 \n" // near+far (lo)
      "paddd %%xmm3,%%xmm5 \n" // near+far (hi)
      "paddd %%xmm2,%%xmm2 \n" // 2*near (lo)
      "paddd %%xmm3,%%xmm3 \n" // 2*near (hi)
      "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
      "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)

      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
      "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
      "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
      "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
      "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)

      "movdqa %%xmm2,%%xmm5 \n"
      "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
      "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
      "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
      "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
      "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)

      "movdqa %%xmm1,%%xmm0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
      "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
      "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
      "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
      "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)

      "movdqa %%xmm3,%%xmm2 \n"
      "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
      "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
      "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
      "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
      "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)

      "packssdw %%xmm0,%%xmm4 \n"
      "pshufd $0b11011000,%%xmm4,%%xmm4 \n"
      "movdqu %%xmm4,(%1) \n" // store above
      "packssdw %%xmm2,%%xmm5 \n"
      "pshufd $0b11011000,%%xmm5,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4,2) \n" // store below

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n" // 4 pixels to 8 pixels
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif
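
// A scalar sketch of the 2D kernel above (hypothetical name, added for
// illustration). Each 2x2 source neighborhood is weighted 9:3:3:1 toward
// the nearest samples, with +8 for rounding before the divide by 16. Note
// the SSE2 code packs results with signed saturation (packssdw), so it
// effectively assumes filtered values fit in 15 bits.
static void ScaleRowUp2_Bilinear_16_Sketch(const uint16_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint16_t* dst_ptr,
                                           ptrdiff_t dst_stride,
                                           int dst_width) {
  const uint16_t* s = src_ptr;
  const uint16_t* t = src_ptr + src_stride;
  uint16_t* d = dst_ptr;               // "store above"
  uint16_t* e = dst_ptr + dst_stride;  // "store below"
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    d[2 * x + 0] = (9 * s[x] + 3 * s[x + 1] + 3 * t[x] + t[x + 1] + 8) >> 4;
    d[2 * x + 1] = (3 * s[x] + 9 * s[x + 1] + t[x] + 3 * t[x + 1] + 8) >> 4;
    e[2 * x + 0] = (3 * s[x] + s[x + 1] + 9 * t[x] + 3 * t[x + 1] + 8) >> 4;
    e[2 * x + 1] = (s[x] + 3 * s[x + 1] + 3 * t[x] + 9 * t[x + 1] + 8) >> 4;
  }
}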

#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
  asm volatile(
      "pcmpeqw %%xmm4,%%xmm4 \n"
      "psrlw $15,%%xmm4 \n"
      "psllw $1,%%xmm4 \n" // all 2
      "movdqa %3,%%xmm3 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n" // 01234567
      "movq 1(%0),%%xmm1 \n" // 12345678
      "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
      "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
      "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
      "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
      "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
      "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
      "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
      "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
      "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
      "packuswb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n" // 8 samples to 16 samples
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
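
// In the SSSE3 path the shuffles leave each dword holding a duplicated byte
// pair (p[i], p[i+1], p[i], p[i+1]); kLinearMadd31 then weights adjacent
// 16-bit lanes 3:1 and 1:3 so a single pmaddubsw yields both output pixels.
// A scalar model of the two lanes produced from one byte pair (hypothetical
// name, for illustration):
static void LinearMadd31_Sketch(uint8_t near_px, uint8_t far_px,
                                uint16_t lanes[2]) {
  lanes[0] = (uint16_t)(3 * near_px + far_px);  // pixel nearer p[i]
  lanes[1] = (uint16_t)(near_px + 3 * far_px);  // pixel nearer p[i+1]
  // Each lane then gets +2 and >>2, as in the asm above.
}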

#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                ptrdiff_t dst_stride,
                                int dst_width) {
  asm volatile(
      "pcmpeqw %%xmm6,%%xmm6 \n"
      "psrlw $15,%%xmm6 \n"
      "psllw $3,%%xmm6 \n" // all 8
      "movdqa %5,%%xmm7 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n" // 01234567
      "movq 1(%0),%%xmm1 \n" // 12345678
      "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
      "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
      "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
      "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
      "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)

      "movq (%0,%3),%%xmm1 \n"
      "movq 1(%0,%3),%%xmm4 \n"
      "punpcklwd %%xmm1,%%xmm1 \n"
      "punpcklwd %%xmm4,%%xmm4 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "punpckhdq %%xmm4,%%xmm3 \n"
      "punpckldq %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
      "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)

      // Register layout: row 1 in xmm0 (lo) / xmm2 (hi),
      //                  row 2 in xmm1 (lo) / xmm3 (hi).

      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
      "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
      "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
      "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
      "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)

      "movdqa %%xmm1,%%xmm5 \n"
      "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
      "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
      "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
      "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
      "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)

      "movdqa %%xmm2,%%xmm0 \n"
      "movdqa %%xmm3,%%xmm1 \n"
      "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
      "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
      "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
      "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
      "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)

      "movdqa %%xmm3,%%xmm1 \n"
      "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
      "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
      "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
      "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
      "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)

      "packuswb %%xmm0,%%xmm4 \n"
      "movdqu %%xmm4,(%1) \n" // store above
      "packuswb %%xmm1,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4) \n" // store below

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n" // 8 samples to 16 samples
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kLinearMadd31)            // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif
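
#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
// Usage sketch (hypothetical driver, added for illustration; the real
// callers in scale.cc also handle odd sizes and width granularity). Each
// call consumes two adjacent source rows and writes the two output rows
// between them; passing a zero stride makes both inputs the same row, which
// degenerates to the 1D linear filter and serves as edge handling.
static void UpsamplePlane2x_Sketch(const uint8_t* src, ptrdiff_t src_stride,
                                   uint8_t* dst, ptrdiff_t dst_stride,
                                   int src_width, int src_height) {
  int y;
  ScaleRowUp2_Bilinear_SSSE3(src, 0, dst, 0, 2 * src_width);  // top edge
  dst += dst_stride;
  for (y = 0; y < src_height - 1; ++y) {
    ScaleRowUp2_Bilinear_SSSE3(src, src_stride, dst, dst_stride,
                               2 * src_width);
    src += src_stride;
    dst += 2 * dst_stride;
  }
  ScaleRowUp2_Bilinear_SSSE3(src, 0, dst, 0, 2 * src_width);  // bottom edge
}
#endif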

#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $15,%%ymm4,%%ymm4 \n"
      "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
      "vbroadcastf128 %3,%%ymm3 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
      "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
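      // AVX2 unpack instructions operate within each 128-bit lane, so the
      // vpermq pair above first spreads the 16 source bytes so that bytes
      // 0..7 land in the low lane and 8..15 in the high lane.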
      "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
      "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
      "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
      "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
      "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
      "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
      "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
      "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n" // 16 samples to 32 samples
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
      "vpsrlw $15,%%ymm6,%%ymm6 \n"
      "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
      "vbroadcastf128 %5,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
      "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
      "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
      "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
      "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
      "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)

      "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
      "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
      "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
      "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
      "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
      "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
      "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
      "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)

      // Register layout: row 1 in ymm0 (lo) / ymm1 (hi),
      //                  row 2 in ymm2 (lo) / ymm3 (hi).

      "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
      "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
      "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
      "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
      "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)

      "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
      "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
      "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
      "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
      "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)

      "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
      "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
      "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
      "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
      "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)

      "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
      "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
      "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
      "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
      "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)

      "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
      "vmovdqu %%ymm4,(%1) \n" // store above
      "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
      "vmovdqu %%ymm5,(%1,%4) \n" // store below

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n" // 16 samples to 32 samples
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kLinearMadd31)            // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif

#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm5 \n"
      "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $15,%%ymm4,%%ymm4 \n"
      "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
      "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)

      "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0

      "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near)
      "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
      "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far)
      "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)

      "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2
      "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2
      "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2
      "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2
      "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
      "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near
      "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2
      "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2

      "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
      "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm2,32(%1) \n"

      "lea 0x20(%0),%0 \n"
      "lea 0x40(%1),%1 \n" // 16 samples to 32 samples
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kLinearShuffleFar)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
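
// Why "12" (editorial inference from the arithmetic): these variants filter
// in the 16-bit word domain, which only works if every intermediate sum
// fits in 16 bits. At 12 bits per sample the bilinear worst case is
//   9*4095 + 3*4095 + 3*4095 + 1*4095 + 8 = 16*4095 + 8 = 65528 < 65536,
// so 12-bit sources are safe; the *_16_* variants instead widen each
// sample to 32 bits to handle full 16-bit data.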

#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $15,%%ymm4,%%ymm4 \n"
      "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8

      LABELALIGN
      "1: \n"

      "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
      "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
      "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
      "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
      "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
      "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
      "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1)

      "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
      "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
      "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
      "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
      "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
      "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
      "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2)

      "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
      "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
      "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
      "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
      "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
      "vmovdqu %%ymm0,(%1) \n" // store above

      "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
      "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
      "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
      "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
      "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
      "vmovdqu %%ymm0,(%1,%4,2) \n" // store below

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n" // 8 samples to 16 samples
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kLinearShuffleFar)        // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif

#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $31,%%ymm4,%%ymm4 \n"
      "vpslld $1,%%ymm4,%%ymm4 \n" // all 2

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
      "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)

      "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
      "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)

      "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
      "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)

      "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
      "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
      "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
      "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)

      "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
      "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
      "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpshufd $0b11011000,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n" // 8 pixels to 16 pixels
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpsrld $31,%%ymm6,%%ymm6 \n"
      "vpslld $3,%%ymm6,%%ymm6 \n" // all 8

      LABELALIGN
      "1: \n"

      "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
      "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
      "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
      "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
      "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
      "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
      "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
      "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
      "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi)

      "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v)
      "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v)
      "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
      "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
      "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far)
      "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far)
      "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
      "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
      "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
      "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
      "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo)
      "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi)

      "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
      "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
      "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
      "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
      "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)

      "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
      "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
      "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
      "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
      "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)

      "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
      "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
      "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
      "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)

      "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
      "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
      "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
      "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
      "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)

      "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
      "vpshufd $0b11011000,%%ymm4,%%ymm4 \n"
      "vmovdqu %%ymm4,(%1) \n" // store above
      "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
      "vpshufd $0b11011000,%%ymm5,%%ymm5 \n"
      "vmovdqu %%ymm5,(%1,%4,2) \n" // store below

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n" // 8 pixels to 16 pixels
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif

// Reads 16 bytes and accumulates them into 16 shorts at a time; called once
// per source row, so 16xN bytes build up 16 column sums over N rows.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "pxor %%xmm5,%%xmm5 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm3 \n"
      "lea 0x10(%0),%0 \n" // src_ptr += 16
      "movdqu (%1),%%xmm0 \n"
      "movdqu 0x10(%1),%%xmm1 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm2 \n"
      "punpckhbw %%xmm5,%%xmm3 \n"
      "paddusw %%xmm2,%%xmm0 \n"
      "paddusw %%xmm3,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
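
// A scalar sketch of the accumulator above (hypothetical name, for
// illustration). The SIMD versions add with unsigned saturation
// (paddusw/vpaddusw), so column sums clamp at 65535 rather than wrapping.
static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                               uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // saturate
  }
}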
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm3 \n"
      "lea 0x20(%0),%0 \n" // src_ptr += 32
      "vpermq $0xd8,%%ymm3,%%ymm3 \n"
      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
      "vpaddusw (%1),%%ymm2,%%ymm0 \n"
      "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};
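
// How the two constants combine (worked through for one output byte; f is
// the 7-bit fraction, a and b the two source pixels): after kFsub80 the
// pixels are signed, so pmaddubsw computes
//   (a - 128) * (128 - f) + (b - 128) * f = a * (128 - f) + b * f - 16384.
// Adding kFadd40 = 0x4040 = 16384 + 64 undoes the bias and adds 64 (0.5 in
// Q7) for rounding, so the final psrlw $7 yields
//   (a * (128 - f) + b * f + 64) >> 7.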

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                           const uint8_t* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile(
      "movd %6,%%xmm2 \n"
      "movd %7,%%xmm3 \n"
      "movl $0x04040000,%k2 \n"
      "movd %k2,%%xmm5 \n"
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x9,%%xmm6 \n" // 0x007f007f
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $15,%%xmm7 \n" // 0x00010001

      "pextrw $0x1,%%xmm2,%k3 \n"
      "subl $0x2,%5 \n"
      "jl 29f \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "paddd %%xmm3,%%xmm0 \n"
      "punpckldq %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm3,%%xmm3 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"

      LABELALIGN
      "2: \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "movzwl 0x00(%1,%3,1),%k2 \n"
      "movd %k2,%%xmm0 \n"
      "psrlw $0x9,%%xmm1 \n"
      "movzwl 0x00(%1,%4,1),%k2 \n"
      "movd %k2,%%xmm4 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "punpcklwd %%xmm4,%%xmm0 \n"
      "psubb %8,%%xmm0 \n" // make pixels signed.
      "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127) + 1
      "paddusb %%xmm7,%%xmm1 \n"
      "pmaddubsw %%xmm0,%%xmm1 \n"
      "pextrw $0x1,%%xmm2,%k3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"
      "paddw %9,%%xmm1 \n" // make pixels unsigned.
      "psrlw $0x7,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movd %%xmm1,%k2 \n"
      "mov %w2,(%0) \n"
      "lea 0x2(%0),%0 \n"
      "subl $0x2,%5 \n"
      "jge 2b \n"

      LABELALIGN
      "29: \n"
      "addl $0x1,%5 \n"
      "jl 99f \n"
      "movzwl 0x00(%1,%3,1),%k2 \n"
      "movd %k2,%%xmm0 \n"
      "psrlw $0x9,%%xmm2 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "psubb %8,%%xmm0 \n" // make pixels signed.
      "pxor %%xmm6,%%xmm2 \n"
      "paddusb %%xmm7,%%xmm2 \n"
      "pmaddubsw %%xmm0,%%xmm2 \n"
      "paddw %9,%%xmm2 \n" // make pixels unsigned.
      "psrlw $0x7,%%xmm2 \n"
      "packuswb %%xmm2,%%xmm2 \n"
      "movd %%xmm2,%k2 \n"
      "mov %b2,(%0) \n"
      "99: \n"
      : "+r"(dst_ptr),      // %0
        "+r"(src_ptr),      // %1
        "=&a"(temp_pixel),  // %2
        "=&r"(x0),          // %3
        "=&r"(x1),          // %4
#if defined(__x86_64__)
        "+rm"(dst_width)  // %5
#else
        "+m"(dst_width)  // %5
#endif
      : "rm"(x),   // %6
        "rm"(dx),  // %7
#if defined(__x86_64__)
        "x"(kFsub80),  // %8
        "x"(kFadd40)   // %9
#else
        "m"(kFsub80),  // %8
        "m"(kFadd40)   // %9
#endif
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
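
// A scalar sketch of the column filter above (hypothetical name, for
// illustration). x and dx are 16.16 fixed point; the SSSE3 kernel keeps
// only the top 7 bits of the fraction.
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source index
    int f = (x >> 9) & 0x7f;  // top 7 bits of the fraction
    dst_ptr[j] =
        (uint8_t)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}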

// Reads 16 pixels, duplicates them and writes 32 pixels. Uses unaligned
// loads and stores, so there is no alignment requirement.
void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                       const uint8_t* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm1 \n"
      "movdqu %%xmm0,(%0) \n"
      "movdqu %%xmm1,0x10(%0) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      : "+r"(dst_ptr),   // %0
        "+r"(src_ptr),   // %1
        "+r"(dst_width)  // %2
      :: "memory", "cc", "xmm0", "xmm1");
}

void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                            ptrdiff_t src_stride,
                            uint8_t* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "shufps $0xdd,%%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :: "memory", "cc", "xmm0", "xmm1");
}

void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm2 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :: "memory", "cc", "xmm0", "xmm1", "xmm2");
}

void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               uint8_t* dst_argb,
                               int dst_width) {
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm2 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
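
// A scalar, exact-rounding sketch of the 2x2 box reduction above, per byte
// channel (hypothetical name, for illustration). The SSE2 code nests two
// pavgb averages, which may differ from this single-rounded sum by 1.
static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  int x, i;
  for (x = 0; x < dst_width; ++x) {
    for (i = 0; i < 4; ++i) {  // B, G, R, A
      dst_argb[4 * x + i] =
          (uint8_t)((src_argb[8 * x + i] + src_argb[8 * x + 4 + i] +
                     src_argb[8 * x + src_stride + i] +
                     src_argb[8 * x + src_stride + 4 + i] + 2) >> 2);
    }
  }
}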

// Reads 4 pixels at a time. Uses unaligned loads and stores, so there is
// no alignment requirement.
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile(
      "lea 0x00(,%1,4),%1 \n"
      "lea 0x00(%1,%1,2),%4 \n"

      LABELALIGN
      "1: \n"
      "movd (%0),%%xmm0 \n"
      "movd 0x00(%0,%1,1),%%xmm1 \n"
      "punpckldq %%xmm1,%%xmm0 \n"
      "movd 0x00(%0,%1,2),%%xmm2 \n"
      "movd 0x00(%0,%4,1),%%xmm3 \n"
      "lea 0x00(%0,%1,4),%0 \n"
      "punpckldq %%xmm3,%%xmm2 \n"
      "punpcklqdq %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),       // %0
        "+r"(src_stepx_x4),   // %1
        "+r"(dst_argb),       // %2
        "+r"(dst_width),      // %3
        "=&r"(src_stepx_x12)  // %4
      :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

// Blends four 2x2 pixel blocks to 4x1. Uses unaligned loads and stores, so
// there is no alignment requirement.
void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile(
      "lea 0x00(,%1,4),%1 \n"
      "lea 0x00(%1,%1,2),%4 \n"
      "lea 0x00(%0,%5,1),%5 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "movhps 0x00(%0,%1,1),%%xmm0 \n"
      "movq 0x00(%0,%1,2),%%xmm1 \n"
      "movhps 0x00(%0,%4,1),%%xmm1 \n"
      "lea 0x00(%0,%1,4),%0 \n"
      "movq (%5),%%xmm2 \n"
      "movhps 0x00(%5,%1,1),%%xmm2 \n"
      "movq 0x00(%5,%1,2),%%xmm3 \n"
      "movhps 0x00(%5,%4,1),%%xmm3 \n"
      "lea 0x00(%5,%1,4),%5 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm2 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),        // %0
        "+r"(src_stepx_x4),    // %1
        "+r"(dst_argb),        // %2
        "+rm"(dst_width),      // %3
        "=&r"(src_stepx_x12),  // %4
        "+r"(row1)             // %5
      :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movd %5,%%xmm2 \n"
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm2,%%xmm2 \n"
      "pshufd $0x11,%%xmm3,%%xmm0 \n"
      "paddd %%xmm0,%%xmm2 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pshufd $0x5,%%xmm3,%%xmm0 \n"
      "paddd %%xmm0,%%xmm2 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pextrw $0x1,%%xmm2,%k0 \n"
      "pextrw $0x3,%%xmm2,%k1 \n"
      "cmp $0x0,%4 \n"
      "jl 99f \n"
      "sub $0x4,%4 \n"
      "jl 49f \n"

      LABELALIGN
      "40: \n"
      "movd 0x00(%3,%0,4),%%xmm0 \n"
      "movd 0x00(%3,%1,4),%%xmm1 \n"
      "pextrw $0x5,%%xmm2,%k0 \n"
      "pextrw $0x7,%%xmm2,%k1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"
      "movd 0x00(%3,%0,4),%%xmm1 \n"
      "movd 0x00(%3,%1,4),%%xmm4 \n"
      "pextrw $0x1,%%xmm2,%k0 \n"
      "pextrw $0x3,%%xmm2,%k1 \n"
      "punpckldq %%xmm4,%%xmm1 \n"
      "punpcklqdq %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%4 \n"
      "jge 40b \n"

      "49: \n"
      "test $0x2,%4 \n"
      "je 29f \n"
      "movd 0x00(%3,%0,4),%%xmm0 \n"
      "movd 0x00(%3,%1,4),%%xmm1 \n"
      "pextrw $0x5,%%xmm2,%k0 \n"
      "punpckldq %%xmm1,%%xmm0 \n"
      "movq %%xmm0,(%2) \n"
      "lea 0x8(%2),%2 \n"
      "29: \n"
      "test $0x1,%4 \n"
      "je 99f \n"
      "movd 0x00(%3,%0,4),%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "99: \n"
      : "=&a"(x0),       // %0
        "=&d"(x1),       // %1
        "+r"(dst_argb),  // %2
        "+r"(src_argb),  // %3
        "+r"(dst_width)  // %4
      : "rm"(x),   // %5
        "rm"(dx)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
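
// Scalar equivalent of the gather above (hypothetical name, for
// illustration): each output pixel is the whole ARGB dword at the 16.16
// integer position, with no filtering.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}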

// Reads 4 pixels, duplicates them and writes 8 pixels. Uses unaligned
// loads and stores, so there is no alignment requirement.
void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                           const uint8_t* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpckldq %%xmm0,%%xmm0 \n"
      "punpckhdq %%xmm1,%%xmm1 \n"
      "movdqu %%xmm0,(%0) \n"
      "movdqu %%xmm1,0x10(%0) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),  // %0
        "+r"(src_argb),  // %1
        "+r"(dst_width)  // %2
      :: "memory", "cc", "xmm0", "xmm1");
}
2210*4e366538SXin Li
2211*4e366538SXin Li // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
2212*4e366538SXin Li static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,       // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                               const uint8_t* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movdqa %0,%%xmm4 \n"
      "movdqa %1,%%xmm5 \n"
      :
      : "m"(kShuffleColARGB),   // %0
        "m"(kShuffleFractions)  // %1
  );

  asm volatile(
      "movd %5,%%xmm2 \n"
      "movd %6,%%xmm3 \n"
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x9,%%xmm6 \n"
      "pextrw $0x1,%%xmm2,%k3 \n"
      "sub $0x2,%2 \n"
      "jl 29f \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "paddd %%xmm3,%%xmm0 \n"
      "punpckldq %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm3,%%xmm3 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"

      LABELALIGN
      "2: \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "movq 0x00(%1,%3,4),%%xmm0 \n"
      "psrlw $0x9,%%xmm1 \n"
      "movhps 0x00(%1,%4,4),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pxor %%xmm6,%%xmm1 \n"
      "pmaddubsw %%xmm1,%%xmm0 \n"
      "psrlw $0x7,%%xmm0 \n"
      "pextrw $0x1,%%xmm2,%k3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%0) \n"
      "lea 0x8(%0),%0 \n"
      "sub $0x2,%2 \n"
      "jge 2b \n"

      LABELALIGN
      "29: \n"
      "add $0x1,%2 \n"
      "jl 99f \n"
      "psrlw $0x9,%%xmm2 \n"
      "movq 0x00(%1,%3,4),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pxor %%xmm6,%%xmm2 \n"
      "pmaddubsw %%xmm2,%%xmm0 \n"
      "psrlw $0x7,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movd %%xmm0,(%0) \n"

      LABELALIGN
      "99: \n"  // clang-format error.

      : "+r"(dst_argb),    // %0
        "+r"(src_argb),    // %1
        "+rm"(dst_width),  // %2
        "=&r"(x0),         // %3
        "=&r"(x1)          // %4
      : "rm"(x),   // %5
        "rm"(dx)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
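
// For reference, a rough scalar model of the column filter above
// (a hypothetical sketch, not part of libyuv): x and dx are 16.16 fixed
// point source positions. The SSSE3 code keeps only the top 7 fraction
// bits and weights the two neighboring ARGB pixels by (127 - f) and f,
// dividing by 128 via psrlw $0x7. The sketch ignores the odd-width tail
// handling the asm does and, like the asm, reads the pixel after xi.
#if 0  // illustrative only
static void ScaleARGBFilterColsSketch_C(uint8_t* dst_argb,
                                        const uint8_t* src_argb,
                                        int dst_width,
                                        int x,
                                        int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source column
    int f = (x >> 9) & 0x7f;   // top 7 fraction bits
    const uint8_t* p = src_argb + xi * 4;
    int c;
    for (c = 0; c < 4; ++c) {  // blend B, G, R, A independently
      dst_argb[c] = (uint8_t)((p[c] * (127 - f) + p[4 + c] * f) >> 7);
    }
    dst_argb += 4;
    x += dx;
  }
}
#endif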

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq \n"
      "shld $0x10,%%eax,%%edx \n"
      "shl $0x10,%%eax \n"
      "idiv %1 \n"
      "mov %0, %%eax \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
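
// Portable equivalent (a minimal sketch): the asm widens num into edx:eax,
// shifts left 16 and performs a 64-bit signed divide, i.e.
// result = ((int64_t)num << 16) / div.
#if 0  // illustrative only
static int FixedDivSketch_C(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
#endif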

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq \n"
      "shld $0x10,%%eax,%%edx \n"
      "shl $0x10,%%eax \n"
      "sub $0x10001,%%eax \n"
      "sbb $0x0,%%edx \n"
      "sub $0x1,%1 \n"
      "idiv %1 \n"
      "mov %0, %%eax \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
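
// Portable equivalent (a minimal sketch), matching the asm arithmetic: the
// 64-bit subtraction of 0x10001 from (num << 16) before dividing by div - 1
// gives result = (((int64_t)num << 16) - 0x00010001) / (div - 1).
#if 0  // illustrative only
static int FixedDiv1Sketch_C(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}
#endif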

#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
    defined(HAS_SCALEUVROWDOWN2BOX_AVX2)

// Shuffle table for splitting UV into upper and lower part of register.
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
                                      1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
                                      6u,   14u,  0x80, 0x80, 0x80, 0x80,
                                      0x80, 0x80, 0x80, 0x80};
#endif

#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3

void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"  // 01010101
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"  // zero
      "movdqa %4,%%xmm1 \n"  // split shuffler
      "movdqa %5,%%xmm3 \n"  // merge shuffler

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // 8 UV row 0
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"  // 8 UV row 1
      "lea 0x10(%0),%0 \n"
      "pshufb %%xmm1,%%xmm0 \n"  // uuuuvvvv
      "pshufb %%xmm1,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"  // horizontal add
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "paddw %%xmm2,%%xmm0 \n"  // vertical add
      "psrlw $0x1,%%xmm0 \n"  // round
      "pavgw %%xmm5,%%xmm0 \n"
      "pshufb %%xmm3,%%xmm0 \n"  // merge uv
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"  // 4 UV
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kShuffleSplitUV),         // %4
        "m"(kShuffleMergeUV)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
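
// Scalar model of the 2x2 box filter above (a sketch, not part of the
// original): each output U and V is the rounded average of a 2x2 block.
// psrlw $1 followed by pavgw against zero computes (sum/2 + 1)/2, which
// equals (sum + 2)/4 for byte sums.
#if 0  // illustrative only
static void ScaleUVRowDown2BoxSketch_C(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8_t* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (uint8_t)((src_ptr[0] + src_ptr[2] + src_ptr[src_stride + 0] +
                            src_ptr[src_stride + 2] + 2) >> 2);  // U
    dst_ptr[1] = (uint8_t)((src_ptr[1] + src_ptr[3] + src_ptr[src_stride + 1] +
                            src_ptr[src_stride + 3] + 2) >> 2);  // V
    src_ptr += 4;  // consume 2 UV pairs
    dst_ptr += 2;  // produce 1 UV pair
  }
}
#endif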

#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // 01010101
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"  // zero
      "vbroadcastf128 %4,%%ymm1 \n"  // split shuffler
      "vbroadcastf128 %5,%%ymm3 \n"  // merge shuffler

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // 16 UV row 0
      "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"  // 16 UV row 1
      "lea 0x20(%0),%0 \n"
      "vpshufb %%ymm1,%%ymm0,%%ymm0 \n"  // uuuuvvvv
      "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // horizontal add
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"  // vertical add
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"  // round
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm3,%%ymm0,%%ymm0 \n"  // merge uv
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // combine qwords
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"  // 8 UV
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kShuffleSplitUV),         // %4
        "m"(kShuffleMergeUV)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
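
// Note on the vpermq $0xd8 above (an illustrative intrinsics fragment,
// assuming <immintrin.h>; not part of the original): AVX2 vpshufb and
// vpackuswb operate within each 128-bit lane, so the merged UV pairs land
// in qwords 0 and 2. Permuting the qwords into order 0,2,1,3 makes the low
// 16 bytes contiguous before the xmm store.
#if 0  // illustrative only
#include <immintrin.h>
static __m256i MergeUVLanesSketch(__m256i v) {
  return _mm256_permute4x64_epi64(v, 0xd8);  // 0b11011000: qwords 0,2,1,3
}
#endif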

// pmaddubsw taps: adjacent (near, far) byte pairs are weighted 3:1 for one
// output phase and 1:3 for the other, yielding 3*near+far per 16-bit lane
// for both U and V.
static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
                                      3, 1, 3, 1, 1, 3, 1, 3};

#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "pcmpeqw %%xmm4,%%xmm4 \n"
      "psrlw $15,%%xmm4 \n"
      "psllw $1,%%xmm4 \n"  // all 2
      "movdqa %3,%%xmm3 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // 00112233 (1u1v)
      "movq 2(%0),%%xmm1 \n"  // 11223344 (1u1v)
      "punpcklbw %%xmm1,%%xmm0 \n"  // 0101121223233434 (2u2v)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhdq %%xmm0,%%xmm2 \n"  // 2323232334343434 (2u2v)
      "punpckldq %%xmm0,%%xmm0 \n"  // 0101010112121212 (2u2v)
      "pmaddubsw %%xmm3,%%xmm2 \n"  // 3*near+far (1u1v16, hi)
      "pmaddubsw %%xmm3,%%xmm0 \n"  // 3*near+far (1u1v16, lo)
      "paddw %%xmm4,%%xmm0 \n"  // 3*near+far+2 (lo)
      "paddw %%xmm4,%%xmm2 \n"  // 3*near+far+2 (hi)
      "psrlw $2,%%xmm0 \n"  // 3/4*near+1/4*far (lo)
      "psrlw $2,%%xmm2 \n"  // 3/4*near+1/4*far (hi)
      "packuswb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 4 uv to 8 uv
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kUVLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
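
// Scalar model of the 2x horizontal upsample above (a sketch, not part of
// the original): each pair of output samples mixes the nearer and farther
// source sample as (3*near + far + 2) / 4 per channel, matching the
// kUVLinearMadd31 taps plus the +2 bias and psrlw $2. Like the asm, it
// reads one UV pair past dst_width/2, which callers must allow for.
#if 0  // illustrative only
static void ScaleUVRowUp2LinearSketch_C(const uint8_t* src_ptr,
                                        uint8_t* dst_ptr,
                                        int dst_width) {
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    int u0 = src_ptr[2 * x + 0], u1 = src_ptr[2 * x + 2];
    int v0 = src_ptr[2 * x + 1], v1 = src_ptr[2 * x + 3];
    dst_ptr[4 * x + 0] = (uint8_t)((u0 * 3 + u1 + 2) >> 2);  // left U
    dst_ptr[4 * x + 1] = (uint8_t)((v0 * 3 + v1 + 2) >> 2);  // left V
    dst_ptr[4 * x + 2] = (uint8_t)((u0 + u1 * 3 + 2) >> 2);  // right U
    dst_ptr[4 * x + 3] = (uint8_t)((v0 + v1 * 3 + 2) >> 2);  // right V
  }
}
#endif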

#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "pcmpeqw %%xmm6,%%xmm6 \n"
      "psrlw $15,%%xmm6 \n"
      "psllw $3,%%xmm6 \n"  // all 8
      "movdqa %5,%%xmm7 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // 00112233 (1u1v)
      "movq 2(%0),%%xmm1 \n"  // 11223344 (1u1v)
      "punpcklbw %%xmm1,%%xmm0 \n"  // 0101121223233434 (2u2v)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhdq %%xmm0,%%xmm2 \n"  // 2323232334343434 (2u2v)
      "punpckldq %%xmm0,%%xmm0 \n"  // 0101010112121212 (2u2v)
      "pmaddubsw %%xmm7,%%xmm2 \n"  // 3*near+far (1u1v16, hi)
      "pmaddubsw %%xmm7,%%xmm0 \n"  // 3*near+far (1u1v16, lo)

      "movq (%0,%3),%%xmm1 \n"
      "movq 2(%0,%3),%%xmm4 \n"
      "punpcklbw %%xmm4,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "punpckhdq %%xmm1,%%xmm3 \n"
      "punpckldq %%xmm1,%%xmm1 \n"
      "pmaddubsw %%xmm7,%%xmm3 \n"  // 3*near+far (2, hi)
      "pmaddubsw %%xmm7,%%xmm1 \n"  // 3*near+far (2, lo)

      // xmm0 xmm2
      // xmm1 xmm3

      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "paddw %%xmm0,%%xmm4 \n"  // 6*near+2*far (1, lo)
      "paddw %%xmm6,%%xmm5 \n"  // 3*near+far+8 (2, lo)
      "paddw %%xmm0,%%xmm4 \n"  // 9*near+3*far (1, lo)
      "paddw %%xmm5,%%xmm4 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw $4,%%xmm4 \n"  // ^ div by 16 (1, lo)

      "movdqa %%xmm1,%%xmm5 \n"
      "paddw %%xmm1,%%xmm5 \n"  // 6*near+2*far (2, lo)
      "paddw %%xmm6,%%xmm0 \n"  // 3*near+far+8 (1, lo)
      "paddw %%xmm1,%%xmm5 \n"  // 9*near+3*far (2, lo)
      "paddw %%xmm0,%%xmm5 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrlw $4,%%xmm5 \n"  // ^ div by 16 (2, lo)

      "movdqa %%xmm2,%%xmm0 \n"
      "movdqa %%xmm3,%%xmm1 \n"
      "paddw %%xmm2,%%xmm0 \n"  // 6*near+2*far (1, hi)
      "paddw %%xmm6,%%xmm1 \n"  // 3*near+far+8 (2, hi)
      "paddw %%xmm2,%%xmm0 \n"  // 9*near+3*far (1, hi)
      "paddw %%xmm1,%%xmm0 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw $4,%%xmm0 \n"  // ^ div by 16 (1, hi)

      "movdqa %%xmm3,%%xmm1 \n"
      "paddw %%xmm3,%%xmm1 \n"  // 6*near+2*far (2, hi)
      "paddw %%xmm6,%%xmm2 \n"  // 3*near+far+8 (1, hi)
      "paddw %%xmm3,%%xmm1 \n"  // 9*near+3*far (2, hi)
      "paddw %%xmm2,%%xmm1 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrlw $4,%%xmm1 \n"  // ^ div by 16 (2, hi)

      "packuswb %%xmm0,%%xmm4 \n"
      "movdqu %%xmm4,(%1) \n"  // store above
      "packuswb %%xmm1,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4) \n"  // store below

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 4 uv to 8 uv
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
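
// The kernel above is a 2x2 bilinear upsample: each output sample is
// (9*near + 3*far_h + 3*far_v + 1*far_hv + 8) / 16 across the two source
// rows. A scalar sketch (illustrative only; like the asm, it reads one UV
// pair beyond dst_width/2, which callers must allow for):
#if 0  // illustrative only
static void ScaleUVRowUp2BilinearSketch_C(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          ptrdiff_t dst_stride,
                                          int dst_width) {
  const uint8_t* s = src_ptr;               // near source row
  const uint8_t* t = src_ptr + src_stride;  // far source row
  uint8_t* d = dst_ptr;                     // output row above
  uint8_t* e = dst_ptr + dst_stride;        // output row below
  int src_width = dst_width >> 1;
  int x, i;
  for (x = 0; x < src_width; ++x) {
    for (i = 0; i < 2; ++i) {  // i == 0: U, i == 1: V
      int s0 = s[2 * x + i], s1 = s[2 * x + 2 + i];
      int t0 = t[2 * x + i], t1 = t[2 * x + 2 + i];
      d[4 * x + i] = (uint8_t)((s0 * 9 + s1 * 3 + t0 * 3 + t1 + 8) >> 4);
      d[4 * x + 2 + i] = (uint8_t)((s0 * 3 + s1 * 9 + t0 + t1 * 3 + 8) >> 4);
      e[4 * x + i] = (uint8_t)((s0 * 3 + s1 + t0 * 9 + t1 * 3 + 8) >> 4);
      e[4 * x + 2 + i] = (uint8_t)((s0 + s1 * 3 + t0 * 3 + t1 * 9 + 8) >> 4);
    }
  }
}
#endif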

#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2

void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $15,%%ymm4,%%ymm4 \n"
      "vpsllw $1,%%ymm4,%%ymm4 \n"  // all 2
      "vbroadcastf128 %3,%%ymm3 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n"
      "vmovdqu 2(%0),%%xmm1 \n"
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
      "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n"  // 3*near+far (hi)
      "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n"  // 3*near+far (lo)
      "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"  // 3*near+far+2 (lo)
      "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"  // 3*near+far+2 (hi)
      "vpsrlw $2,%%ymm0,%%ymm0 \n"  // 3/4*near+1/4*far (lo)
      "vpsrlw $2,%%ymm1,%%ymm1 \n"  // 3/4*near+1/4*far (hi)
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"  // 8 uv to 16 uv
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kUVLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  asm volatile(
      "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
      "vpsrlw $15,%%ymm6,%%ymm6 \n"
      "vpsllw $3,%%ymm6,%%ymm6 \n"  // all 8
      "vbroadcastf128 %5,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n"
      "vmovdqu 2(%0),%%xmm1 \n"
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
      "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n"  // 3*near+far (1, hi)
      "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n"  // 3*near+far (1, lo)

      "vmovdqu (%0,%3),%%xmm2 \n"  // 0123456789ABCDEF
      "vmovdqu 2(%0,%3),%%xmm3 \n"  // 123456789ABCDEF0
      "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
      "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
      "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
      "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
      "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n"  // 3*near+far (2, hi)
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n"  // 3*near+far (2, lo)

      // ymm0 ymm1
      // ymm2 ymm3

      "vpaddw %%ymm0,%%ymm0,%%ymm4 \n"  // 6*near+2*far (1, lo)
      "vpaddw %%ymm6,%%ymm2,%%ymm5 \n"  // 3*near+far+8 (2, lo)
      "vpaddw %%ymm4,%%ymm0,%%ymm4 \n"  // 9*near+3*far (1, lo)
      "vpaddw %%ymm4,%%ymm5,%%ymm4 \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrlw $4,%%ymm4,%%ymm4 \n"  // ^ div by 16 (1, lo)

      "vpaddw %%ymm2,%%ymm2,%%ymm5 \n"  // 6*near+2*far (2, lo)
      "vpaddw %%ymm6,%%ymm0,%%ymm0 \n"  // 3*near+far+8 (1, lo)
      "vpaddw %%ymm5,%%ymm2,%%ymm5 \n"  // 9*near+3*far (2, lo)
      "vpaddw %%ymm5,%%ymm0,%%ymm5 \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrlw $4,%%ymm5,%%ymm5 \n"  // ^ div by 16 (2, lo)

      "vpaddw %%ymm1,%%ymm1,%%ymm0 \n"  // 6*near+2*far (1, hi)
      "vpaddw %%ymm6,%%ymm3,%%ymm2 \n"  // 3*near+far+8 (2, hi)
      "vpaddw %%ymm0,%%ymm1,%%ymm0 \n"  // 9*near+3*far (1, hi)
      "vpaddw %%ymm0,%%ymm2,%%ymm0 \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrlw $4,%%ymm0,%%ymm0 \n"  // ^ div by 16 (1, hi)

      "vpaddw %%ymm3,%%ymm3,%%ymm2 \n"  // 6*near+2*far (2, hi)
      "vpaddw %%ymm6,%%ymm1,%%ymm1 \n"  // 3*near+far+8 (1, hi)
      "vpaddw %%ymm2,%%ymm3,%%ymm2 \n"  // 9*near+3*far (2, hi)
      "vpaddw %%ymm2,%%ymm1,%%ymm2 \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrlw $4,%%ymm2,%%ymm2 \n"  // ^ div by 16 (2, hi)

      "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
      "vmovdqu %%ymm4,(%1) \n"  // store above
      "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
      "vmovdqu %%ymm5,(%1,%4) \n"  // store below

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"  // 8 uv to 16 uv
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int dst_width) {
  asm volatile(
      "pxor %%xmm5,%%xmm5 \n"
      "pcmpeqd %%xmm4,%%xmm4 \n"
      "psrld $31,%%xmm4 \n"
      "pslld $1,%%xmm4 \n"  // all 2

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // 0011 (16b, 1u1v)
      "movq 4(%0),%%xmm1 \n"  // 1122 (16b, 1u1v)

      "punpcklwd %%xmm5,%%xmm0 \n"  // 0011 (32b, 1u1v)
      "punpcklwd %%xmm5,%%xmm1 \n"  // 1122 (32b, 1u1v)

      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"

      "pshufd $0b01001110,%%xmm2,%%xmm2 \n"  // 1100 (lo, far)
      "pshufd $0b01001110,%%xmm3,%%xmm3 \n"  // 2211 (hi, far)

      "paddd %%xmm4,%%xmm2 \n"  // far+2 (lo)
      "paddd %%xmm4,%%xmm3 \n"  // far+2 (hi)
      "paddd %%xmm0,%%xmm2 \n"  // near+far+2 (lo)
      "paddd %%xmm1,%%xmm3 \n"  // near+far+2 (hi)
      "paddd %%xmm0,%%xmm0 \n"  // 2*near (lo)
      "paddd %%xmm1,%%xmm1 \n"  // 2*near (hi)
      "paddd %%xmm2,%%xmm0 \n"  // 3*near+far+2 (lo)
      "paddd %%xmm3,%%xmm1 \n"  // 3*near+far+2 (hi)

      "psrld $2,%%xmm0 \n"  // 3/4*near+1/4*far (lo)
      "psrld $2,%%xmm1 \n"  // 3/4*near+1/4*far (hi)
      "packusdw %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 2 uv to 4 uv
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
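
// The 16-bit variant applies the same (3*near + far + 2) / 4 kernel with
// 32-bit intermediates so high-bit-depth UV cannot overflow. A scalar
// sketch (illustrative only; same one-UV-pair overread as above):
#if 0  // illustrative only
static void ScaleUVRowUp2Linear16Sketch_C(const uint16_t* src_ptr,
                                          uint16_t* dst_ptr,
                                          int dst_width) {
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t u0 = src_ptr[2 * x + 0], u1 = src_ptr[2 * x + 2];
    uint32_t v0 = src_ptr[2 * x + 1], v1 = src_ptr[2 * x + 3];
    dst_ptr[4 * x + 0] = (uint16_t)((u0 * 3 + u1 + 2) >> 2);  // left U
    dst_ptr[4 * x + 1] = (uint16_t)((v0 * 3 + v1 + 2) >> 2);  // left V
    dst_ptr[4 * x + 2] = (uint16_t)((u0 + u1 * 3 + 2) >> 2);  // right U
    dst_ptr[4 * x + 3] = (uint16_t)((v0 + v1 * 3 + 2) >> 2);  // right V
  }
}
#endif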

#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint16_t* dst_ptr,
                                     ptrdiff_t dst_stride,
                                     int dst_width) {
  asm volatile(
      "pxor %%xmm7,%%xmm7 \n"
      "pcmpeqd %%xmm6,%%xmm6 \n"
      "psrld $31,%%xmm6 \n"
      "pslld $3,%%xmm6 \n"  // all 8

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // 0011 (16b, 1u1v)
      "movq 4(%0),%%xmm1 \n"  // 1122 (16b, 1u1v)
      "punpcklwd %%xmm7,%%xmm0 \n"  // 0011 (near) (32b, 1u1v)
      "punpcklwd %%xmm7,%%xmm1 \n"  // 1122 (near) (32b, 1u1v)
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "pshufd $0b01001110,%%xmm2,%%xmm2 \n"  // 1100 (far) (1, lo)
      "pshufd $0b01001110,%%xmm3,%%xmm3 \n"  // 2211 (far) (1, hi)
      "paddd %%xmm0,%%xmm2 \n"  // near+far (1, lo)
      "paddd %%xmm1,%%xmm3 \n"  // near+far (1, hi)
      "paddd %%xmm0,%%xmm0 \n"  // 2*near (1, lo)
      "paddd %%xmm1,%%xmm1 \n"  // 2*near (1, hi)
      "paddd %%xmm2,%%xmm0 \n"  // 3*near+far (1, lo)
      "paddd %%xmm3,%%xmm1 \n"  // 3*near+far (1, hi)

      "movq (%0,%3,2),%%xmm2 \n"
      "movq 4(%0,%3,2),%%xmm3 \n"
      "punpcklwd %%xmm7,%%xmm2 \n"
      "punpcklwd %%xmm7,%%xmm3 \n"
      "movdqa %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "pshufd $0b01001110,%%xmm4,%%xmm4 \n"  // 1100 (far) (2, lo)
      "pshufd $0b01001110,%%xmm5,%%xmm5 \n"  // 2211 (far) (2, hi)
      "paddd %%xmm2,%%xmm4 \n"  // near+far (2, lo)
      "paddd %%xmm3,%%xmm5 \n"  // near+far (2, hi)
      "paddd %%xmm2,%%xmm2 \n"  // 2*near (2, lo)
      "paddd %%xmm3,%%xmm3 \n"  // 2*near (2, hi)
      "paddd %%xmm4,%%xmm2 \n"  // 3*near+far (2, lo)
      "paddd %%xmm5,%%xmm3 \n"  // 3*near+far (2, hi)

      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "paddd %%xmm0,%%xmm4 \n"  // 6*near+2*far (1, lo)
      "paddd %%xmm6,%%xmm5 \n"  // 3*near+far+8 (2, lo)
      "paddd %%xmm0,%%xmm4 \n"  // 9*near+3*far (1, lo)
      "paddd %%xmm5,%%xmm4 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrld $4,%%xmm4 \n"  // ^ div by 16 (1, lo)

      "movdqa %%xmm2,%%xmm5 \n"
      "paddd %%xmm2,%%xmm5 \n"  // 6*near+2*far (2, lo)
      "paddd %%xmm6,%%xmm0 \n"  // 3*near+far+8 (1, lo)
      "paddd %%xmm2,%%xmm5 \n"  // 9*near+3*far (2, lo)
      "paddd %%xmm0,%%xmm5 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrld $4,%%xmm5 \n"  // ^ div by 16 (2, lo)

      "movdqa %%xmm1,%%xmm0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "paddd %%xmm1,%%xmm0 \n"  // 6*near+2*far (1, hi)
      "paddd %%xmm6,%%xmm2 \n"  // 3*near+far+8 (2, hi)
      "paddd %%xmm1,%%xmm0 \n"  // 9*near+3*far (1, hi)
      "paddd %%xmm2,%%xmm0 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrld $4,%%xmm0 \n"  // ^ div by 16 (1, hi)

      "movdqa %%xmm3,%%xmm2 \n"
      "paddd %%xmm3,%%xmm2 \n"  // 6*near+2*far (2, hi)
      "paddd %%xmm6,%%xmm1 \n"  // 3*near+far+8 (1, hi)
      "paddd %%xmm3,%%xmm2 \n"  // 9*near+3*far (2, hi)
      "paddd %%xmm1,%%xmm2 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrld $4,%%xmm2 \n"  // ^ div by 16 (2, hi)

      "packusdw %%xmm0,%%xmm4 \n"
      "movdqu %%xmm4,(%1) \n"  // store above
      "packusdw %%xmm2,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4,2) \n"  // store below

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 2 uv to 4 uv
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  asm volatile(
      "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $31,%%ymm4,%%ymm4 \n"
      "vpslld $1,%%ymm4,%%ymm4 \n"  // all 2

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n"  // 00112233 (16b, 1u1v)
      "vmovdqu 4(%0),%%xmm1 \n"  // 11223344 (16b, 1u1v)

      "vpmovzxwd %%xmm0,%%ymm0 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd %%xmm1,%%ymm1 \n"  // 12345678 (32b, 1u1v)

      "vpshufd $0b01001110,%%ymm0,%%ymm2 \n"  // 11003322 (lo, far)
      "vpshufd $0b01001110,%%ymm1,%%ymm3 \n"  // 22114433 (hi, far)

      "vpaddd %%ymm4,%%ymm2,%%ymm2 \n"  // far+2 (lo)
      "vpaddd %%ymm4,%%ymm3,%%ymm3 \n"  // far+2 (hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm2 \n"  // near+far+2 (lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm3 \n"  // near+far+2 (hi)
      "vpaddd %%ymm0,%%ymm0,%%ymm0 \n"  // 2*near (lo)
      "vpaddd %%ymm1,%%ymm1,%%ymm1 \n"  // 2*near (hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm0 \n"  // 3*near+far+2 (lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm1 \n"  // 3*near+far+2 (hi)

      "vpsrld $2,%%ymm0,%%ymm0 \n"  // 3/4*near+1/4*far (lo)
      "vpsrld $2,%%ymm1,%%ymm1 \n"  // 3/4*near+1/4*far (hi)
      "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"  // 4 uv to 8 uv
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  asm volatile(
      "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpsrld $31,%%ymm6,%%ymm6 \n"
      "vpslld $3,%%ymm6,%%ymm6 \n"  // all 8

      LABELALIGN
      "1: \n"

      "vmovdqu (%0),%%xmm0 \n"  // 00112233 (16b, 1u1v)
      "vmovdqu 4(%0),%%xmm1 \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd %%xmm0,%%ymm0 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd %%xmm1,%%ymm1 \n"  // 12345678 (32b, 1u1v)
      "vpshufd $0b01001110,%%ymm0,%%ymm2 \n"  // 11003322 (lo, far)
      "vpshufd $0b01001110,%%ymm1,%%ymm3 \n"  // 22114433 (hi, far)
      "vpaddd %%ymm0,%%ymm2,%%ymm2 \n"  // near+far (lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm3 \n"  // near+far (hi)
      "vpaddd %%ymm0,%%ymm0,%%ymm0 \n"  // 2*near (lo)
      "vpaddd %%ymm1,%%ymm1,%%ymm1 \n"  // 2*near (hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm0 \n"  // 3*near+far (lo)
      "vpaddd %%ymm1,%%ymm3,%%ymm1 \n"  // 3*near+far (hi)

      "vmovdqu (%0,%3,2),%%xmm2 \n"  // 00112233 (16b, 1u1v)
      "vmovdqu 4(%0,%3,2),%%xmm3 \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd %%xmm2,%%ymm2 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd %%xmm3,%%ymm3 \n"  // 12345678 (32b, 1u1v)
      "vpshufd $0b01001110,%%ymm2,%%ymm4 \n"  // 11003322 (lo, far)
      "vpshufd $0b01001110,%%ymm3,%%ymm5 \n"  // 22114433 (hi, far)
      "vpaddd %%ymm2,%%ymm4,%%ymm4 \n"  // near+far (lo)
      "vpaddd %%ymm3,%%ymm5,%%ymm5 \n"  // near+far (hi)
      "vpaddd %%ymm2,%%ymm2,%%ymm2 \n"  // 2*near (lo)
      "vpaddd %%ymm3,%%ymm3,%%ymm3 \n"  // 2*near (hi)
      "vpaddd %%ymm2,%%ymm4,%%ymm2 \n"  // 3*near+far (lo)
      "vpaddd %%ymm3,%%ymm5,%%ymm3 \n"  // 3*near+far (hi)

      "vpaddd %%ymm0,%%ymm0,%%ymm4 \n"  // 6*near+2*far (1, lo)
      "vpaddd %%ymm6,%%ymm2,%%ymm5 \n"  // 3*near+far+8 (2, lo)
      "vpaddd %%ymm4,%%ymm0,%%ymm4 \n"  // 9*near+3*far (1, lo)
      "vpaddd %%ymm4,%%ymm5,%%ymm4 \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrld $4,%%ymm4,%%ymm4 \n"  // ^ div by 16 (1, lo)

      "vpaddd %%ymm2,%%ymm2,%%ymm5 \n"  // 6*near+2*far (2, lo)
      "vpaddd %%ymm6,%%ymm0,%%ymm0 \n"  // 3*near+far+8 (1, lo)
      "vpaddd %%ymm5,%%ymm2,%%ymm5 \n"  // 9*near+3*far (2, lo)
      "vpaddd %%ymm5,%%ymm0,%%ymm5 \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrld $4,%%ymm5,%%ymm5 \n"  // ^ div by 16 (2, lo)

      "vpaddd %%ymm1,%%ymm1,%%ymm0 \n"  // 6*near+2*far (1, hi)
      "vpaddd %%ymm6,%%ymm3,%%ymm2 \n"  // 3*near+far+8 (2, hi)
      "vpaddd %%ymm0,%%ymm1,%%ymm0 \n"  // 9*near+3*far (1, hi)
      "vpaddd %%ymm0,%%ymm2,%%ymm0 \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrld $4,%%ymm0,%%ymm0 \n"  // ^ div by 16 (1, hi)

      "vpaddd %%ymm3,%%ymm3,%%ymm2 \n"  // 6*near+2*far (2, hi)
      "vpaddd %%ymm6,%%ymm1,%%ymm1 \n"  // 3*near+far+8 (1, hi)
      "vpaddd %%ymm2,%%ymm3,%%ymm2 \n"  // 9*near+3*far (2, hi)
      "vpaddd %%ymm2,%%ymm1,%%ymm2 \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrld $4,%%ymm2,%%ymm2 \n"  // ^ div by 16 (2, hi)

      "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
      "vmovdqu %%ymm4,(%1) \n"  // store above
      "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
      "vmovdqu %%ymm5,(%1,%4,2) \n"  // store below

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"  // 4 uv to 8 uv
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif