/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant (2) added before the shift right by 2 in the 3/4 box filters
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                               6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                               6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0,         0};
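
// Note on the kScaleAc33/kScaleAb2 values above: pmulhuw computes
// (a * b) >> 16, so multiplying a box sum by 65536/N approximates division
// by N in 16-bit fixed point.  For example, nine pixels of 255 sum to 2295,
// and (2295 * (65536 / 9)) >> 16 = 254, i.e. the average truncates slightly
// rather than rounding.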

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}
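
// In scalar terms the loop above point-samples at 2:1 by keeping the odd
// source byte of each pair (psrlw $8 then packuswb), roughly:
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[2 * x + 1];
//   }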

void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
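
// Sketch of the loop above: xmm4 holds 0x01 in every byte, so pmaddubsw sums
// each horizontal pair and pavgw against zero computes (sum + 1) >> 1,
// i.e. roughly dst_ptr[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1.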

void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
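
// Sketch of the 2x2 box above: pmaddubsw sums horizontal pairs in both rows,
// paddw adds the rows, and psrlw $1 followed by pavgw with zero gives
// (a + b + c + d + 2) >> 2 for each 2x2 block of source pixels.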

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1");
}

void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SCALEROWDOWN2_AVX2
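
// The AVX2 variants above process twice as many pixels per iteration as the
// SSSE3 versions; vpackuswb packs within each 128-bit lane, so the extra
// vpermq $0xd8 restores the natural pixel order before the store.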

void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrld       $0x18,%%xmm5                  \n"
      "pslld       $0x10,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
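
// The mask built in xmm5 (0x00FF0000 per dword) keeps byte 2 of every group
// of four, so the loop above is roughly dst_ptr[x] = src_ptr[4 * x + 2].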

void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "psllw       $0x3,%%xmm5                   \n"
      "lea         0x00(%4,%4,2),%3              \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%4,2),%%xmm2          \n"
      "movdqu      0x10(%0,%4,2),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "psrlw       $0x4,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width),             // %2
        "=&r"(stridex3)              // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
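
// Sketch of the 4x4 box above: four rows of horizontal pair sums are
// accumulated, phaddw completes the 4-wide horizontal sum, and adding 8
// (xmm5) before psrlw $4 yields (sum of 16 pixels + 8) >> 4.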

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrld      $0x18,%%ymm5,%%ymm5           \n"
      "vpslld      $0x10,%%ymm5,%%ymm5           \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpsllw      $0x3,%%ymm4,%%ymm5            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%3,2),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,2),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x4,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                   // %0
        "+r"(dst_ptr),                   // %1
        "+r"(dst_width)                  // %2
      : "r"((intptr_t)(src_stride)),     // %3
        "r"((intptr_t)(src_stride * 3))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %0,%%xmm3                     \n"
      "movdqa      %1,%%xmm4                     \n"
      "movdqa      %2,%%xmm5                     \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm2               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "palignr     $0x8,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
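
// The kShuf0/kShuf1/kShuf2 shuffles above implement 3/4 point sampling:
// of every four source bytes, bytes 0, 1 and 3 are kept, roughly
//   dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[3];  // then src += 4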

void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}

void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );

  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}
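
// The two 3/4 box variants differ only in the vertical blend: the _1_ version
// uses a single pavgb, giving an even (1/2, 1/2) mix of the two rows, while
// the _0_ version above applies pavgb twice for an approximate (3/4, 1/4)
// mix.  Horizontally both use the kShuf*/kMadd* tables plus kRound34 and a
// shift right by 2 to weight neighbouring pixels 3:1, 2:2 and 1:3.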

void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movd        %%xmm1,0x8(%1)                \n"
      "lea         0xc(%1),%1                    \n"
      "sub         $0xc,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kShuf38a),   // %3
        "m"(kShuf38b)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
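
// kShuf38a/kShuf38b select bytes 0, 3, 6, 8, 11 and 14 of each 16-byte block,
// so the loop above emits 6 point-sampled pixels per 16 source pixels (3/8).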

void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "movdqa      %3,%%xmm5                     \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "paddusw     %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "paddusw     %%xmm0,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movd        %%xmm1,(%1)                   \n"
      "psrlq       $0x10,%%xmm1                  \n"
      "movd        %%xmm1,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6");
}

void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm6          \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,2),%%xmm6          \n"
      "lea         0x10(%0),%0                   \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "pshufb      %%xmm3,%%xmm7                 \n"
      "paddusw     %%xmm7,%%xmm6                 \n"
      "pmulhuw     %%xmm4,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movd        %%xmm6,(%1)                   \n"
      "psrlq       $0x10,%%xmm6                  \n"
      "movd        %%xmm6,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}
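
// Sketch of the 3/8 box above: three source rows are summed with paddusw,
// shifted copies add each group of three columns, and pmulhuw with
// kScaleAc33 (65536/9, or 65536/6 for the 2x3 partial box) performs the
// fixed-point division by the box area.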

static const uvec8 kLinearShuffleFar = {2,  3,  0, 1, 6,  7,  4,  5,
                                        10, 11, 8, 9, 14, 15, 12, 13};

static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
                                    3, 1, 1, 3, 3, 1, 1, 3};

#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
      "psrlw       $15,%%xmm6                    \n"
      "psllw       $1,%%xmm6                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm1                   \n"  // 01234567
      "movq        1(%0),%%xmm2                  \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
      "paddw       %%xmm5,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "paddw       %%xmm6,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
      "paddw       %%xmm5,%%xmm5                 \n"
      "paddw       %%xmm4,%%xmm5                 \n"  // 3*near+far+2 (lo)
      "psrlw       $2,%%xmm5                     \n"  // 3/4*near+1/4*far (lo)

      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm2,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm6,%%xmm1                 \n"
      "paddw       %%xmm3,%%xmm3                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
      "psrlw       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)

      "packuswb    %%xmm1,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
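
// Relative to the pointers it is given (edge handling is done by the caller),
// the loop above is roughly the (3,1)/(1,3) linear upsample
//   dst[2 * i]     = (3 * src[i] + src[i + 1] + 2) >> 2;
//   dst[2 * i + 1] = (src[i] + 3 * src[i + 1] + 2) >> 2;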

#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      LABELALIGN
      "1:                                        \n"
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
      // above line
      "movq        (%0),%%xmm1                   \n"  // 01234567
      "movq        1(%0),%%xmm2                  \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788

      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
      "paddw       %%xmm5,%%xmm4                 \n"  // near+far
      "movdqa      %%xmm3,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
      "paddw       %%xmm5,%%xmm5                 \n"  // 2*near
      "paddw       %%xmm5,%%xmm4                 \n"  // 3*near+far (1, lo)

      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm2,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)

      // below line
      "movq        (%0,%3),%%xmm6                \n"  // 01234567
      "movq        1(%0,%3),%%xmm2               \n"  // 12345678
      "movdqa      %%xmm6,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm6,%%xmm6                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788

      "movdqa      %%xmm6,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm7                 \n"
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 11223344 (16)
      "paddw       %%xmm7,%%xmm5                 \n"  // near+far
      "movdqa      %%xmm3,%%xmm7                 \n"
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 01122334 (16)
      "paddw       %%xmm7,%%xmm7                 \n"  // 2*near
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far (2, lo)

      "punpckhbw   %%xmm0,%%xmm6                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm6,%%xmm2                 \n"  // near+far
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (2, hi)

      // xmm4 xmm1
      // xmm5 xmm2
      "pcmpeqw     %%xmm0,%%xmm0                 \n"
      "psrlw       $15,%%xmm0                    \n"
      "psllw       $3,%%xmm0                     \n"  // all 8

      "movdqa      %%xmm4,%%xmm3                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (1, lo)
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, lo)
      "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (1, lo)
      "paddw       %%xmm6,%%xmm3                 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw       $4,%%xmm3                     \n"  // ^ div by 16

      "movdqa      %%xmm1,%%xmm7                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "paddw       %%xmm7,%%xmm7                 \n"  // 6*near+2*far (1, hi)
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, hi)
      "paddw       %%xmm1,%%xmm7                 \n"  // 9*near+3*far (1, hi)
      "paddw       %%xmm6,%%xmm7                 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw       $4,%%xmm7                     \n"  // ^ div by 16

      "packuswb    %%xmm7,%%xmm3                 \n"
      "movdqu      %%xmm3,(%1)                   \n"  // save above line

      "movdqa      %%xmm5,%%xmm3                 \n"
      "paddw       %%xmm0,%%xmm4                 \n"  // 3*near+far+8 (1, lo)
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, lo)
      "paddw       %%xmm3,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddw       %%xmm4,%%xmm5                 \n"  // 9 3 3 1 + 8 (lo)
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16

      "movdqa      %%xmm2,%%xmm3                 \n"
      "paddw       %%xmm0,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, hi)
      "paddw       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
      "paddw       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (hi)
      "psrlw       $4,%%xmm2                     \n"  // ^ div by 16

      "packuswb    %%xmm2,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4)                \n"  // save below line

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
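
// The bilinear version combines the same horizontal filter over two source
// rows with weights 9/3/3/1 (per the "9 3 3 1 + 8" comments), i.e. each
// output is (9*nearest + 3*horizontal + 3*vertical + 1*diagonal + 8) >> 4.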

#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
  asm volatile(
      "movdqa      %3,%%xmm5                     \n"
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
      "psrlw       $15,%%xmm4                    \n"
      "psllw       $1,%%xmm4                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
      "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)

      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
      "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)

      "movdqa      %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm3                 \n"  // 54657687 (far)
      "pshufb      %%xmm5,%%xmm1                 \n"  // 10213243 (far)

      "paddw       %%xmm4,%%xmm1                 \n"  // far+2
      "paddw       %%xmm4,%%xmm3                 \n"  // far+2
      "paddw       %%xmm0,%%xmm1                 \n"  // near+far+2
      "paddw       %%xmm2,%%xmm3                 \n"  // near+far+2
      "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
      "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
      "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far+2 (hi)

      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm2,16(%1)                 \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),          // %0
        "+r"(dst_ptr),          // %1
        "+r"(dst_width)         // %2
      : "m"(kLinearShuffleFar)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
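
// The _12_ variants stay in 16-bit arithmetic, which is safe for 12-bit
// samples: 3*4095 + 4095 + 2 = 16382, and the bilinear 9/3/3/1 sum plus 8 is
// at most 16*4095 + 8 = 65528, both within uint16_t.  Full 16-bit input uses
// the _16_ variants below, which widen to 32 bits first.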
1001 
1002 #ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst_ptr,ptrdiff_t dst_stride,int dst_width)1003 void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
1004                                    ptrdiff_t src_stride,
1005                                    uint16_t* dst_ptr,
1006                                    ptrdiff_t dst_stride,
1007                                    int dst_width) {
1008   asm volatile(
1009       "pcmpeqw     %%xmm7,%%xmm7                 \n"
1010       "psrlw       $15,%%xmm7                    \n"
1011       "psllw       $3,%%xmm7                     \n"  // all 8
1012       "movdqa      %5,%%xmm6                     \n"
1013 
1014       LABELALIGN
1015       "1:                                        \n"
1016       // above line
1017       "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
1018       "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
1019       "movdqa      %%xmm0,%%xmm2                 \n"
1020       "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
1021       "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
1022       "movdqa      %%xmm2,%%xmm3                 \n"
1023       "movdqa      %%xmm0,%%xmm1                 \n"
1024       "pshufb      %%xmm6,%%xmm3                 \n"  // 54657687 (far)
1025       "pshufb      %%xmm6,%%xmm1                 \n"  // 10213243 (far)
1026       "paddw       %%xmm0,%%xmm1                 \n"  // near+far
1027       "paddw       %%xmm2,%%xmm3                 \n"  // near+far
1028       "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
1029       "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
1030       "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far (1, lo)
1031       "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (1, hi)
1032 
1033       // below line
1034       "movdqu      (%0,%3,2),%%xmm1              \n"  // 01234567 (16)
1035       "movdqu      2(%0,%3,2),%%xmm4             \n"  // 12345678 (16)
1036       "movdqa      %%xmm1,%%xmm3                 \n"
1037       "punpckhwd   %%xmm4,%%xmm3                 \n"  // 45566778 (16)
1038       "punpcklwd   %%xmm4,%%xmm1                 \n"  // 01122334 (16)
1039       "movdqa      %%xmm3,%%xmm5                 \n"
1040       "movdqa      %%xmm1,%%xmm4                 \n"
1041       "pshufb      %%xmm6,%%xmm5                 \n"  // 54657687 (far)
1042       "pshufb      %%xmm6,%%xmm4                 \n"  // 10213243 (far)
1043       "paddw       %%xmm1,%%xmm4                 \n"  // near+far
1044       "paddw       %%xmm3,%%xmm5                 \n"  // near+far
1045       "paddw       %%xmm1,%%xmm1                 \n"  // 2*near
1046       "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
1047       "paddw       %%xmm4,%%xmm1                 \n"  // 3*near+far (2, lo)
1048       "paddw       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1049 
1050       // xmm0 xmm2
1051       // xmm1 xmm3
1052 
1053       "movdqa      %%xmm0,%%xmm4                 \n"
1054       "movdqa      %%xmm1,%%xmm5                 \n"
1055       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1056       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1057       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1058       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1059       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1060       "movdqu      %%xmm4,(%1)                   \n"
1061 
1062       "movdqa      %%xmm2,%%xmm4                 \n"
1063       "movdqa      %%xmm3,%%xmm5                 \n"
1064       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, hi)
1065       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, hi)
1066       "paddw       %%xmm2,%%xmm4                 \n"  // 9*near+3*far (1, hi)
1067       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, hi)
1068       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1069       "movdqu      %%xmm4,0x10(%1)               \n"
1070 
1071       "movdqa      %%xmm1,%%xmm4                 \n"
1072       "paddw       %%xmm7,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1073       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, lo)
1074       "paddw       %%xmm4,%%xmm1                 \n"  // 9*near+3*far (2, lo)
1075       "paddw       %%xmm0,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, lo)
1076       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16
1077       "movdqu      %%xmm1,(%1,%4,2)              \n"
1078 
1079       "movdqa      %%xmm3,%%xmm4                 \n"
1080       "paddw       %%xmm7,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1081       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, hi)
1082       "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (2, hi)
1083       "paddw       %%xmm2,%%xmm3                 \n"  // 9 3 3 1 + 8 (2, hi)
1084       "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
1085       "movdqu      %%xmm3,0x10(%1,%4,2)          \n"
1086 
1087       "lea         0x10(%0),%0                   \n"
1088       "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1089       "sub         $0x10,%2                      \n"
1090       "jg          1b                            \n"
1091       : "+r"(src_ptr),                // %0
1092         "+r"(dst_ptr),                // %1
1093         "+r"(dst_width)               // %2
1094       : "r"((intptr_t)(src_stride)),  // %3
1095         "r"((intptr_t)(dst_stride)),  // %4
1096         "m"(kLinearShuffleFar)        // %5
1097       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1098         "xmm7");
1099 }
1100 #endif
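
// The "9 3 3 1 + 8" / "div by 16" steps above implement a 2x bilinear
// upsample: each output is weighted 9/16 towards its nearest source sample,
// 3/16 towards each adjacent sample and 1/16 towards the diagonal one, with
// +8 for rounding. Keeping the sums in 16-bit lanes only works while
// 16 * the maximum sample value still fits in 16 bits, which is why the
// separate _16_ versions below widen to 32-bit lanes. A rough scalar model
// of one pair of output rows (illustrative sketch, not part of libyuv):
static void ScaleRowUp2_Bilinear_ScalarSketch(const uint16_t* s0,  // upper src row
                                              const uint16_t* s1,  // lower src row
                                              uint16_t* d0,        // "above" dst row
                                              uint16_t* d1,        // "below" dst row
                                              int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    // Horizontal 3*near+far pass for each source row, as in the asm.
    uint32_t t00 = 3 * s0[x] + s0[x + 1];
    uint32_t t01 = s0[x] + 3 * s0[x + 1];
    uint32_t t10 = 3 * s1[x] + s1[x + 1];
    uint32_t t11 = s1[x] + 3 * s1[x + 1];
    // Vertical 3:1 blend gives the 9/3/3/1 weights, +8 rounds, >>4 divides.
    d0[2 * x + 0] = (uint16_t)((3 * t00 + t10 + 8) >> 4);
    d0[2 * x + 1] = (uint16_t)((3 * t01 + t11 + 8) >> 4);
    d1[2 * x + 0] = (uint16_t)((3 * t10 + t00 + 8) >> 4);
    d1[2 * x + 1] = (uint16_t)((3 * t11 + t01 + 8) >> 4);
  }
}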
1101 
1102 #ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
1103 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
1104                                 uint16_t* dst_ptr,
1105                                 int dst_width) {
1106   asm volatile(
1107       "pxor        %%xmm5,%%xmm5                 \n"
1108       "pcmpeqd     %%xmm4,%%xmm4                 \n"
1109       "psrld       $31,%%xmm4                    \n"
1110       "pslld       $1,%%xmm4                     \n"  // all 2
1111 
1112       LABELALIGN
1113       "1:                                        \n"
1114       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1115       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1116 
1117       "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0123 (32b)
1118       "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1234 (32b)
1119 
1120       "movdqa      %%xmm0,%%xmm2                 \n"
1121       "movdqa      %%xmm1,%%xmm3                 \n"
1122 
1123       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1124       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1125 
1126       "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
1127       "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
1128       "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
1129       "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
1130       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1131       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1132       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
1133       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
1134 
1135       "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1136       "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
1137       "packssdw    %%xmm1,%%xmm0                 \n"
1138       "pshufd      $0b11011000,%%xmm0,%%xmm0     \n"
1139       "movdqu      %%xmm0,(%1)                   \n"
1140 
1141       "lea         0x8(%0),%0                    \n"
1142       "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1143       "sub         $0x8,%2                       \n"
1144       "jg          1b                            \n"
1145       : "+r"(src_ptr),   // %0
1146         "+r"(dst_ptr),   // %1
1147         "+r"(dst_width)  // %2
1148       :
1149       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1150 }
1151 #endif
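
// The 16-bit linear path above widens each sample to a 32-bit lane
// (punpcklwd against zero) so 3*near+far plus rounding cannot overflow even
// for a full 16-bit source. What it computes per output pair is roughly the
// following (illustrative sketch, not libyuv's portable C path):
static void ScaleRowUp2_Linear_16_ScalarSketch(const uint16_t* src_ptr,
                                               uint16_t* dst_ptr,
                                               int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    uint32_t n = src_ptr[x];      // "near" sample for dst[2x]
    uint32_t f = src_ptr[x + 1];  // "far" sample for dst[2x]
    dst_ptr[2 * x + 0] = (uint16_t)((3 * n + f + 2) >> 2);  // 3/4*near + 1/4*far
    dst_ptr[2 * x + 1] = (uint16_t)((n + 3 * f + 2) >> 2);  // roles swap for the odd output
  }
}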
1152 
1153 #ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
1154 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1155                                   ptrdiff_t src_stride,
1156                                   uint16_t* dst_ptr,
1157                                   ptrdiff_t dst_stride,
1158                                   int dst_width) {
1159   asm volatile(
1160       "pxor        %%xmm7,%%xmm7                 \n"
1161       "pcmpeqd     %%xmm6,%%xmm6                 \n"
1162       "psrld       $31,%%xmm6                    \n"
1163       "pslld       $3,%%xmm6                     \n"  // all 8
1164 
1165       LABELALIGN
1166       "1:                                        \n"
1167       "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
1168       "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
1169       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
1170       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
1171       "movdqa      %%xmm0,%%xmm2                 \n"
1172       "movdqa      %%xmm1,%%xmm3                 \n"
1173       "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
1174       "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
1175       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
1176       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
1177       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
1178       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
1179       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1180       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1181 
1182       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1183       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1184       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0123 (32b)
1185       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1234 (32b)
1186       "movdqa      %%xmm0,%%xmm2                 \n"
1187       "movdqa      %%xmm1,%%xmm3                 \n"
1188       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1189       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1190       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (lo)
1191       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (hi)
1192       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1193       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1194       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1195       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1196 
1197       "movq        (%0,%3,2),%%xmm2              \n"
1198       "movq        2(%0,%3,2),%%xmm3             \n"
1199       "punpcklwd   %%xmm7,%%xmm2                 \n"  // 0123 (32b)
1200       "punpcklwd   %%xmm7,%%xmm3                 \n"  // 1234 (32b)
1201       "movdqa      %%xmm2,%%xmm4                 \n"
1202       "movdqa      %%xmm3,%%xmm5                 \n"
1203       "pshufd      $0b10110001,%%xmm4,%%xmm4     \n"  // 1032 (even, far)
1204       "pshufd      $0b10110001,%%xmm5,%%xmm5     \n"  // 2143 (odd, far)
1205       "paddd       %%xmm2,%%xmm4                 \n"  // near+far (lo)
1206       "paddd       %%xmm3,%%xmm5                 \n"  // near+far (hi)
1207       "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (lo)
1208       "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (hi)
1209       "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
1210       "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1211 
1212       "movdqa      %%xmm0,%%xmm4                 \n"
1213       "movdqa      %%xmm2,%%xmm5                 \n"
1214       "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1215       "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1216       "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1217       "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1218       "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1219 
1220       "movdqa      %%xmm2,%%xmm5                 \n"
1221       "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1222       "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1223       "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1224       "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1225       "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1226 
1227       "movdqa      %%xmm1,%%xmm0                 \n"
1228       "movdqa      %%xmm3,%%xmm2                 \n"
1229       "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1230       "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
1231       "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1232       "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1233       "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1234 
1235       "movdqa      %%xmm3,%%xmm2                 \n"
1236       "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
1237       "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
1238       "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
1239       "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
1240       "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
1241 
1242       "packssdw    %%xmm0,%%xmm4                 \n"
1243       "pshufd      $0b11011000,%%xmm4,%%xmm4     \n"
1244       "movdqu      %%xmm4,(%1)                   \n"  // store above
1245       "packssdw    %%xmm2,%%xmm5                 \n"
1246       "pshufd      $0b11011000,%%xmm5,%%xmm5     \n"
1247       "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
1248 
1249       "lea         0x8(%0),%0                    \n"
1250       "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1251       "sub         $0x8,%2                       \n"
1252       "jg          1b                            \n"
1253       : "+r"(src_ptr),                // %0
1254         "+r"(dst_ptr),                // %1
1255         "+r"(dst_width)               // %2
1256       : "r"((intptr_t)(src_stride)),  // %3
1257         "r"((intptr_t)(dst_stride))   // %4
1258       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1259         "xmm7");
1260 }
1261 #endif
1262 
1263 #ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
1264 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
1265                               uint8_t* dst_ptr,
1266                               int dst_width) {
1267   asm volatile(
1268       "pcmpeqw     %%xmm4,%%xmm4                 \n"
1269       "psrlw       $15,%%xmm4                    \n"
1270       "psllw       $1,%%xmm4                     \n"  // all 2
1271       "movdqa      %3,%%xmm3                     \n"
1272 
1273       LABELALIGN
1274       "1:                                        \n"
1275       "movq        (%0),%%xmm0                   \n"  // 01234567
1276       "movq        1(%0),%%xmm1                  \n"  // 12345678
1277       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1278       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1279       "movdqa      %%xmm0,%%xmm2                 \n"
1280       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1281       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1282       "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (hi)
1283       "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (lo)
1284       "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
1285       "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
1286       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1287       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
1288       "packuswb    %%xmm2,%%xmm0                 \n"
1289       "movdqu      %%xmm0,(%1)                   \n"
1290       "lea         0x8(%0),%0                    \n"
1291       "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1292       "sub         $0x10,%2                      \n"
1293       "jg          1b                            \n"
1294       : "+r"(src_ptr),      // %0
1295         "+r"(dst_ptr),      // %1
1296         "+r"(dst_width)     // %2
1297       : "m"(kLinearMadd31)  // %3
1298       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1299 }
1300 #endif
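
// The 8-bit rows above lean on pmaddubsw: the punpck steps place each
// output's (near, far) bytes side by side, and kLinearMadd31 supplies the
// 3/1 weights implied by the "3*near+far" comments, so one instruction does
// the multiply and the horizontal add. A scalar model of pmaddubsw itself
// (illustrative sketch; the 3/1 weight layout is an assumption taken from
// the comments, not a quote of kLinearMadd31):
static void PmaddubswScalarSketch(const uint8_t u[16],  // unsigned bytes (pixels)
                                  const int8_t s[16],   // signed bytes (weights)
                                  int16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    int sum = u[2 * i + 0] * s[2 * i + 0] + u[2 * i + 1] * s[2 * i + 1];
    if (sum > 32767) sum = 32767;    // the instruction saturates to int16
    if (sum < -32768) sum = -32768;
    out[i] = (int16_t)sum;
  }
}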
1301 
1302 #ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
1303 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
1304                                 ptrdiff_t src_stride,
1305                                 uint8_t* dst_ptr,
1306                                 ptrdiff_t dst_stride,
1307                                 int dst_width) {
1308   asm volatile(
1309       "pcmpeqw     %%xmm6,%%xmm6                 \n"
1310       "psrlw       $15,%%xmm6                    \n"
1311       "psllw       $3,%%xmm6                     \n"  // all 8
1312       "movdqa      %5,%%xmm7                     \n"
1313 
1314       LABELALIGN
1315       "1:                                        \n"
1316       "movq        (%0),%%xmm0                   \n"  // 01234567
1317       "movq        1(%0),%%xmm1                  \n"  // 12345678
1318       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1319       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1320       "movdqa      %%xmm0,%%xmm2                 \n"
1321       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1322       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1323       "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1, hi)
1324       "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1, lo)
1325 
1326       "movq        (%0,%3),%%xmm1                \n"
1327       "movq        1(%0,%3),%%xmm4               \n"
1328       "punpcklwd   %%xmm1,%%xmm1                 \n"
1329       "punpcklwd   %%xmm4,%%xmm4                 \n"
1330       "movdqa      %%xmm1,%%xmm3                 \n"
1331       "punpckhdq   %%xmm4,%%xmm3                 \n"
1332       "punpckldq   %%xmm4,%%xmm1                 \n"
1333       "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
1334       "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
1335 
1336       // xmm0 xmm2
1337       // xmm1 xmm3
1338 
1339       "movdqa      %%xmm0,%%xmm4                 \n"
1340       "movdqa      %%xmm1,%%xmm5                 \n"
1341       "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1342       "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1343       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1344       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1345       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1346 
1347       "movdqa      %%xmm1,%%xmm5                 \n"
1348       "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1349       "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1350       "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1351       "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1352       "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1353 
1354       "movdqa      %%xmm2,%%xmm0                 \n"
1355       "movdqa      %%xmm3,%%xmm1                 \n"
1356       "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1357       "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
1358       "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1359       "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1360       "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1361 
1362       "movdqa      %%xmm3,%%xmm1                 \n"
1363       "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
1364       "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1365       "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
1366       "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
1367       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
1368 
1369       "packuswb    %%xmm0,%%xmm4                 \n"
1370       "movdqu      %%xmm4,(%1)                   \n"  // store above
1371       "packuswb    %%xmm1,%%xmm5                 \n"
1372       "movdqu      %%xmm5,(%1,%4)                \n"  // store below
1373 
1374       "lea         0x8(%0),%0                    \n"
1375       "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1376       "sub         $0x10,%2                      \n"
1377       "jg          1b                            \n"
1378       : "+r"(src_ptr),                // %0
1379         "+r"(dst_ptr),                // %1
1380         "+r"(dst_width)               // %2
1381       : "r"((intptr_t)(src_stride)),  // %3
1382         "r"((intptr_t)(dst_stride)),  // %4
1383         "m"(kLinearMadd31)            // %5
1384       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1385         "xmm7");
1386 }
1387 #endif
1388 
1389 #ifdef HAS_SCALEROWUP2_LINEAR_AVX2
1390 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
1391                              uint8_t* dst_ptr,
1392                              int dst_width) {
1393   asm volatile(
1394       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1395       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1396       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1397       "vbroadcastf128 %3,%%ymm3                  \n"
1398 
1399       LABELALIGN
1400       "1:                                        \n"
1401       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1402       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1403       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1404       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1405       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1406       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1407       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1408       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1409       "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
1410       "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
1411       "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
1412       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
1413       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1414       "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1415       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
1416       "vmovdqu     %%ymm0,(%1)                   \n"
1417 
1418       "lea         0x10(%0),%0                   \n"
1419       "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1420       "sub         $0x20,%2                      \n"
1421       "jg          1b                            \n"
1422       "vzeroupper                                \n"
1423       : "+r"(src_ptr),      // %0
1424         "+r"(dst_ptr),      // %1
1425         "+r"(dst_width)     // %2
1426       : "m"(kLinearMadd31)  // %3
1427       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1428 }
1429 #endif
1430 
1431 #ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
1432 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
1433                                ptrdiff_t src_stride,
1434                                uint8_t* dst_ptr,
1435                                ptrdiff_t dst_stride,
1436                                int dst_width) {
1437   asm volatile(
1438       "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
1439       "vpsrlw      $15,%%ymm6,%%ymm6             \n"
1440       "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
1441       "vbroadcastf128 %5,%%ymm7                  \n"
1442 
1443       LABELALIGN
1444       "1:                                        \n"
1445       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1446       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1447       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1448       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1449       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1450       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1451       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1452       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1453       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
1454       "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
1455 
1456       "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
1457       "vmovdqu     1(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
1458       "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
1459       "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
1460       "vpunpcklwd  %%ymm2,%%ymm2,%%ymm2          \n"
1461       "vpunpcklwd  %%ymm3,%%ymm3,%%ymm3          \n"
1462       "vpunpckhdq  %%ymm3,%%ymm2,%%ymm4          \n"
1463       "vpunpckldq  %%ymm3,%%ymm2,%%ymm2          \n"
1464       "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
1465       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
1466 
1467       // ymm0 ymm1
1468       // ymm2 ymm3
1469 
1470       "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1471       "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1472       "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1473       "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1474       "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1475 
1476       "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1477       "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1478       "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1479       "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1480       "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1481 
1482       "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1483       "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1484       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1485       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1486       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1487 
1488       "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1489       "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1490       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1491       "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1492       "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1493 
1494       "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
1495       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1496       "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
1497       "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
1498 
1499       "lea         0x10(%0),%0                   \n"
1500       "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1501       "sub         $0x20,%2                      \n"
1502       "jg          1b                            \n"
1503       "vzeroupper                                \n"
1504       : "+r"(src_ptr),                // %0
1505         "+r"(dst_ptr),                // %1
1506         "+r"(dst_width)               // %2
1507       : "r"((intptr_t)(src_stride)),  // %3
1508         "r"((intptr_t)(dst_stride)),  // %4
1509         "m"(kLinearMadd31)            // %5
1510       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1511         "xmm7");
1512 }
1513 #endif
1514 
1515 #ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
1516 void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
1517                                 uint16_t* dst_ptr,
1518                                 int dst_width) {
1519   asm volatile(
1520       "vbroadcastf128 %3,%%ymm5                  \n"
1521       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1522       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1523       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1524 
1525       LABELALIGN
1526       "1:                                        \n"
1527       "vmovdqu     (%0),%%ymm0                   \n"  // 0123456789ABCDEF (16b)
1528       "vmovdqu     2(%0),%%ymm1                  \n"  // 123456789ABCDEF0 (16b)
1529 
1530       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 012389AB4567CDEF
1531       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 12349ABC5678DEF0
1532 
1533       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"  // 899AABBCCDDEEFF0 (near)
1534       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1535       "vpshufb     %%ymm5,%%ymm2,%%ymm3          \n"  // 98A9BACBDCEDFE0F (far)
1536       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1537 
1538       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // far+2
1539       "vpaddw      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2
1540       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far+2
1541       "vpaddw      %%ymm2,%%ymm3,%%ymm3          \n"  // near+far+2
1542       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1543       "vpaddw      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near
1544       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 3*near+far+2
1545       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 3*near+far+2
1546 
1547       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far
1548       "vpsrlw      $2,%%ymm2,%%ymm2              \n"  // 3/4*near+1/4*far
1549       "vmovdqu     %%ymm0,(%1)                   \n"
1550       "vmovdqu     %%ymm2,32(%1)                 \n"
1551 
1552       "lea         0x20(%0),%0                   \n"
1553       "lea         0x40(%1),%1                   \n"  // 16 sample to 32 sample
1554       "sub         $0x20,%2                      \n"
1555       "jg          1b                            \n"
1556       "vzeroupper                                \n"
1557       : "+r"(src_ptr),          // %0
1558         "+r"(dst_ptr),          // %1
1559         "+r"(dst_width)         // %2
1560       : "m"(kLinearShuffleFar)  // %3
1561       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1562 }
1563 #endif
1564 
1565 #ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
1566 void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
1567                                   ptrdiff_t src_stride,
1568                                   uint16_t* dst_ptr,
1569                                   ptrdiff_t dst_stride,
1570                                   int dst_width) {
1571   asm volatile(
1572       "vbroadcastf128 %5,%%ymm5                  \n"
1573       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1574       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1575       "vpsllw      $3,%%ymm4,%%ymm4              \n"  // all 8
1576 
1577       LABELALIGN
1578       "1:                                        \n"
1579 
1580       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
1581       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
1582       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1583       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1584       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1585       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1586       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1587       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1588       "vpaddw      %%ymm0,%%ymm1,%%ymm2          \n"  // 3*near+far (1)
1589 
1590       "vmovdqu     (%0,%3,2),%%xmm0              \n"  // 01234567 (16b)
1591       "vmovdqu     2(%0,%3,2),%%xmm1             \n"  // 12345678 (16b)
1592       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1593       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1594       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1595       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1596       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1597       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1598       "vpaddw      %%ymm0,%%ymm1,%%ymm3          \n"  // 3*near+far (2)
1599 
1600       "vpaddw      %%ymm2,%%ymm2,%%ymm0          \n"  // 6*near+2*far (1)
1601       "vpaddw      %%ymm4,%%ymm3,%%ymm1          \n"  // 3*near+far+8 (2)
1602       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9*near+3*far (1)
1603       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (1)
1604       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1605       "vmovdqu     %%ymm0,(%1)                   \n"  // store above
1606 
1607       "vpaddw      %%ymm3,%%ymm3,%%ymm0          \n"  // 6*near+2*far (2)
1608       "vpaddw      %%ymm4,%%ymm2,%%ymm1          \n"  // 3*near+far+8 (1)
1609       "vpaddw      %%ymm0,%%ymm3,%%ymm0          \n"  // 9*near+3*far (2)
1610       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (2)
1611       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1612       "vmovdqu     %%ymm0,(%1,%4,2)              \n"  // store below
1613 
1614       "lea         0x10(%0),%0                   \n"
1615       "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1616       "sub         $0x10,%2                      \n"
1617       "jg          1b                            \n"
1618       "vzeroupper                                \n"
1619       : "+r"(src_ptr),                // %0
1620         "+r"(dst_ptr),                // %1
1621         "+r"(dst_width)               // %2
1622       : "r"((intptr_t)(src_stride)),  // %3
1623         "r"((intptr_t)(dst_stride)),  // %4
1624         "m"(kLinearShuffleFar)        // %5
1625       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1626 }
1627 #endif
1628 
1629 #ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
1630 void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
1631                                 uint16_t* dst_ptr,
1632                                 int dst_width) {
1633   asm volatile(
1634       "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
1635       "vpsrld      $31,%%ymm4,%%ymm4             \n"
1636       "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
1637 
1638       LABELALIGN
1639       "1:                                        \n"
1640       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
1641       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
1642 
1643       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b)
1644       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b)
1645 
1646       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1647       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1648 
1649       "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
1650       "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
1651       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
1652       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
1653       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1654       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1655       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
1656       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
1657 
1658       "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1659       "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1660       "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
1661       "vpshufd     $0b11011000,%%ymm0,%%ymm0     \n"
1662       "vmovdqu     %%ymm0,(%1)                   \n"
1663 
1664       "lea         0x10(%0),%0                   \n"
1665       "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1666       "sub         $0x10,%2                      \n"
1667       "jg          1b                            \n"
1668       "vzeroupper                                \n"
1669       : "+r"(src_ptr),   // %0
1670         "+r"(dst_ptr),   // %1
1671         "+r"(dst_width)  // %2
1672       :
1673       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1674 }
1675 #endif
1676 
1677 #ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
1678 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
1679                                   ptrdiff_t src_stride,
1680                                   uint16_t* dst_ptr,
1681                                   ptrdiff_t dst_stride,
1682                                   int dst_width) {
1683   asm volatile(
1684       "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
1685       "vpsrld      $31,%%ymm6,%%ymm6             \n"
1686       "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
1687 
1688       LABELALIGN
1689       "1:                                        \n"
1690 
1691       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
1692       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
1693       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b)
1694       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b)
1695       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1696       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1697       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
1698       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
1699       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1700       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1701       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (1, lo)
1702       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (1, hi)
1703 
1704       "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 01234567 (16b)
1705       "vmovdqu     2(%0,%3,2),%%xmm3             \n"  // 12345678 (16b)
1706       "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b)
1707       "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b)
1708       "vpshufd     $0b10110001,%%ymm2,%%ymm4     \n"  // 10325476 (lo, far)
1709       "vpshufd     $0b10110001,%%ymm3,%%ymm5     \n"  // 21436587 (hi, far)
1710       "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
1711       "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
1712       "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
1713       "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
1714       "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (2, lo)
1715       "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (2, hi)
1716 
1717       "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1718       "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1719       "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1720       "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1721       "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1722 
1723       "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1724       "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1725       "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1726       "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1727       "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1728 
1729       "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1730       "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1731       "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1732       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1733       "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1734 
1735       "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1736       "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1737       "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1738       "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1739       "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1740 
1741       "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
1742       "vpshufd     $0b11011000,%%ymm4,%%ymm4     \n"
1743       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1744       "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
1745       "vpshufd     $0b11011000,%%ymm5,%%ymm5     \n"
1746       "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
1747 
1748       "lea         0x10(%0),%0                   \n"
1749       "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1750       "sub         $0x10,%2                      \n"
1751       "jg          1b                            \n"
1752       "vzeroupper                                \n"
1753       : "+r"(src_ptr),                // %0
1754         "+r"(dst_ptr),                // %1
1755         "+r"(dst_width)               // %2
1756       : "r"((intptr_t)(src_stride)),  // %3
1757         "r"((intptr_t)(dst_stride))   // %4
1758       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1759 }
1760 #endif
1761 
1762 // Reads 16xN bytes and produces 16 shorts at a time.
1763 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
1764                       uint16_t* dst_ptr,
1765                       int src_width) {
1766       asm volatile("pxor        %%xmm5,%%xmm5                 \n"
1767 
1768                // 16 pixel loop.
1769                LABELALIGN
1770       "1:                                        \n"
1771       "movdqu      (%0),%%xmm3                   \n"
1772       "lea         0x10(%0),%0                   \n"  // src_ptr += 16
1773       "movdqu      (%1),%%xmm0                   \n"
1774       "movdqu      0x10(%1),%%xmm1               \n"
1775       "movdqa      %%xmm3,%%xmm2                 \n"
1776       "punpcklbw   %%xmm5,%%xmm2                 \n"
1777       "punpckhbw   %%xmm5,%%xmm3                 \n"
1778       "paddusw     %%xmm2,%%xmm0                 \n"
1779       "paddusw     %%xmm3,%%xmm1                 \n"
1780       "movdqu      %%xmm0,(%1)                   \n"
1781       "movdqu      %%xmm1,0x10(%1)               \n"
1782       "lea         0x20(%1),%1                   \n"
1783       "sub         $0x10,%2                      \n"
1784       "jg          1b                            \n"
1785                : "+r"(src_ptr),   // %0
1786                  "+r"(dst_ptr),   // %1
1787                  "+r"(src_width)  // %2
1788                :
1789                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1790 }
1791 
1792 #ifdef HAS_SCALEADDROW_AVX2
1793 // Reads 32 bytes and accumulates to 32 shorts at a time.
1794 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
1795                       uint16_t* dst_ptr,
1796                       int src_width) {
1797       asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
1798 
1799                LABELALIGN
1800       "1:                                        \n"
1801       "vmovdqu     (%0),%%ymm3                   \n"
1802       "lea         0x20(%0),%0                   \n"  // src_ptr += 32
1803       "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
1804       "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
1805       "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
1806       "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
1807       "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
1808       "vmovdqu     %%ymm0,(%1)                   \n"
1809       "vmovdqu     %%ymm1,0x20(%1)               \n"
1810       "lea         0x40(%1),%1                   \n"
1811       "sub         $0x20,%2                      \n"
1812       "jg          1b                            \n"
1813       "vzeroupper                                \n"
1814                : "+r"(src_ptr),   // %0
1815                  "+r"(dst_ptr),   // %1
1816                  "+r"(src_width)  // %2
1817                :
1818                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1819 }
1820 #endif  // HAS_SCALEADDROW_AVX2
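
// ScaleAddRow is the accumulation step of the box filters: each call adds
// one row of bytes into running 16-bit column sums. The SIMD paths widen the
// bytes against a zero register and use saturating adds; a rough scalar
// model (illustrative sketch, not libyuv's portable C path):
static void ScaleAddRow_ScalarSketch(const uint8_t* src_ptr,
                                     uint16_t* dst_ptr,
                                     int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // clamp like paddusw
  }
}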
1821 
1822 // Constant for making pixels signed to avoid pmaddubsw
1823 // saturation.
1824 static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1825                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1826 
1827 // Constant for making pixels unsigned and adding .5 for rounding.
1828 static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
1829                                0x4040, 0x4040, 0x4040, 0x4040};
1830 
1831 // Bilinear column filtering. SSSE3 version.
1832 void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
1833                            const uint8_t* src_ptr,
1834                            int dst_width,
1835                            int x,
1836                            int dx) {
1837   intptr_t x0, x1, temp_pixel;
1838   asm volatile(
1839       "movd        %6,%%xmm2                     \n"
1840       "movd        %7,%%xmm3                     \n"
1841       "movl        $0x04040000,%k2               \n"
1842       "movd        %k2,%%xmm5                    \n"
1843       "pcmpeqb     %%xmm6,%%xmm6                 \n"
1844       "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
1845       "pcmpeqb     %%xmm7,%%xmm7                 \n"
1846       "psrlw       $15,%%xmm7                    \n"  // 0x00010001
1847 
1848       "pextrw      $0x1,%%xmm2,%k3               \n"
1849       "subl        $0x2,%5                       \n"
1850       "jl          29f                           \n"
1851       "movdqa      %%xmm2,%%xmm0                 \n"
1852       "paddd       %%xmm3,%%xmm0                 \n"
1853       "punpckldq   %%xmm0,%%xmm2                 \n"
1854       "punpckldq   %%xmm3,%%xmm3                 \n"
1855       "paddd       %%xmm3,%%xmm3                 \n"
1856       "pextrw      $0x3,%%xmm2,%k4               \n"
1857 
1858       LABELALIGN
1859       "2:                                        \n"
1860       "movdqa      %%xmm2,%%xmm1                 \n"
1861       "paddd       %%xmm3,%%xmm2                 \n"
1862       "movzwl      0x00(%1,%3,1),%k2             \n"
1863       "movd        %k2,%%xmm0                    \n"
1864       "psrlw       $0x9,%%xmm1                   \n"
1865       "movzwl      0x00(%1,%4,1),%k2             \n"
1866       "movd        %k2,%%xmm4                    \n"
1867       "pshufb      %%xmm5,%%xmm1                 \n"
1868       "punpcklwd   %%xmm4,%%xmm0                 \n"
1869       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1870       "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127 ) +
1871                                                       // 1
1872       "paddusb     %%xmm7,%%xmm1                 \n"
1873       "pmaddubsw   %%xmm0,%%xmm1                 \n"
1874       "pextrw      $0x1,%%xmm2,%k3               \n"
1875       "pextrw      $0x3,%%xmm2,%k4               \n"
1876       "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
1877       "psrlw       $0x7,%%xmm1                   \n"
1878       "packuswb    %%xmm1,%%xmm1                 \n"
1879       "movd        %%xmm1,%k2                    \n"
1880       "mov         %w2,(%0)                      \n"
1881       "lea         0x2(%0),%0                    \n"
1882       "subl        $0x2,%5                       \n"
1883       "jge         2b                            \n"
1884 
1885       LABELALIGN
1886       "29:                                       \n"
1887       "addl        $0x1,%5                       \n"
1888       "jl          99f                           \n"
1889       "movzwl      0x00(%1,%3,1),%k2             \n"
1890       "movd        %k2,%%xmm0                    \n"
1891       "psrlw       $0x9,%%xmm2                   \n"
1892       "pshufb      %%xmm5,%%xmm2                 \n"
1893       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1894       "pxor        %%xmm6,%%xmm2                 \n"
1895       "paddusb     %%xmm7,%%xmm2                 \n"
1896       "pmaddubsw   %%xmm0,%%xmm2                 \n"
1897       "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
1898       "psrlw       $0x7,%%xmm2                   \n"
1899       "packuswb    %%xmm2,%%xmm2                 \n"
1900       "movd        %%xmm2,%k2                    \n"
1901       "mov         %b2,(%0)                      \n"
1902       "99:                                       \n"
1903       : "+r"(dst_ptr),      // %0
1904         "+r"(src_ptr),      // %1
1905         "=&a"(temp_pixel),  // %2
1906         "=&r"(x0),          // %3
1907         "=&r"(x1),          // %4
1908 #if defined(__x86_64__)
1909         "+rm"(dst_width)  // %5
1910 #else
1911         "+m"(dst_width)  // %5
1912 #endif
1913       : "rm"(x),   // %6
1914         "rm"(dx),  // %7
1915 #if defined(__x86_64__)
1916         "x"(kFsub80),  // %8
1917         "x"(kFadd40)   // %9
1918 #else
1919         "m"(kFsub80),    // %8
1920         "m"(kFadd40)     // %9
1921 #endif
1922       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1923         "xmm7");
1924 }
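
// The column filter above keeps x and dx in 16.16 fixed point, extracts a
// 7-bit blend fraction (psrlw $0x9), and biases pixels by -128 (kFsub80) so
// the pmaddubsw products cannot saturate; kFadd40 undoes that bias and adds
// the rounding term before the shift by 7. Ignoring the bias trick, each
// output works out to roughly the following (illustrative sketch, not the
// portable C path):
static void ScaleFilterCols_ScalarSketch(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width,
                                         int x,    // 16.16 fixed-point position
                                         int dx) { // 16.16 fixed-point step
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source index
    int f = (x >> 9) & 0x7f;   // top 7 bits of the fraction
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    // Weights (128 - f) and f, +64 rounding, divide by 128.
    dst_ptr[j] = (uint8_t)((a * (128 - f) + b * f + 64) >> 7);
    x += dx;
  }
}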
1925 
1926 // Reads 16 pixels, duplicates them and writes 32 pixels per loop.
1927 // Uses unaligned loads and stores (movdqu), so no particular alignment is required.
1928 void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
1929                        const uint8_t* src_ptr,
1930                        int dst_width,
1931                        int x,
1932                        int dx) {
1933   (void)x;
1934   (void)dx;
1935   asm volatile(LABELALIGN
1936       "1:                                        \n"
1937       "movdqu      (%1),%%xmm0                   \n"
1938       "lea         0x10(%1),%1                   \n"
1939       "movdqa      %%xmm0,%%xmm1                 \n"
1940       "punpcklbw   %%xmm0,%%xmm0                 \n"
1941       "punpckhbw   %%xmm1,%%xmm1                 \n"
1942       "movdqu      %%xmm0,(%0)                   \n"
1943       "movdqu      %%xmm1,0x10(%0)               \n"
1944       "lea         0x20(%0),%0                   \n"
1945       "sub         $0x20,%2                      \n"
1946       "jg          1b                            \n"
1947 
1948                : "+r"(dst_ptr),   // %0
1949                  "+r"(src_ptr),   // %1
1950                  "+r"(dst_width)  // %2
1951                  ::"memory",
1952                  "cc", "xmm0", "xmm1");
1953 }
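
// The unpack-with-self above simply repeats every source byte twice; a
// scalar equivalent (illustrative sketch):
static void ScaleColsUp2_ScalarSketch(uint8_t* dst_ptr,
                                      const uint8_t* src_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst_ptr[2 * x + 0] = src_ptr[x];
    dst_ptr[2 * x + 1] = src_ptr[x];
  }
}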
1954 
1955 void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1956                             ptrdiff_t src_stride,
1957                             uint8_t* dst_argb,
1958                             int dst_width) {
1959   (void)src_stride;
1960   asm volatile(LABELALIGN
1961       "1:                                        \n"
1962       "movdqu      (%0),%%xmm0                   \n"
1963       "movdqu      0x10(%0),%%xmm1               \n"
1964       "lea         0x20(%0),%0                   \n"
1965       "shufps      $0xdd,%%xmm1,%%xmm0           \n"
1966       "movdqu      %%xmm0,(%1)                   \n"
1967       "lea         0x10(%1),%1                   \n"
1968       "sub         $0x4,%2                       \n"
1969       "jg          1b                            \n"
1970                : "+r"(src_argb),  // %0
1971                  "+r"(dst_argb),  // %1
1972                  "+r"(dst_width)  // %2
1973                  ::"memory",
1974                  "cc", "xmm0", "xmm1");
1975 }
1976 
1977 void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1978                                   ptrdiff_t src_stride,
1979                                   uint8_t* dst_argb,
1980                                   int dst_width) {
1981   (void)src_stride;
1982   asm volatile(LABELALIGN
1983       "1:                                        \n"
1984       "movdqu      (%0),%%xmm0                   \n"
1985       "movdqu      0x10(%0),%%xmm1               \n"
1986       "lea         0x20(%0),%0                   \n"
1987       "movdqa      %%xmm0,%%xmm2                 \n"
1988       "shufps      $0x88,%%xmm1,%%xmm0           \n"
1989       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
1990       "pavgb       %%xmm2,%%xmm0                 \n"
1991       "movdqu      %%xmm0,(%1)                   \n"
1992       "lea         0x10(%1),%1                   \n"
1993       "sub         $0x4,%2                       \n"
1994       "jg          1b                            \n"
1995                : "+r"(src_argb),  // %0
1996                  "+r"(dst_argb),  // %1
1997                  "+r"(dst_width)  // %2
1998                  ::"memory",
1999                  "cc", "xmm0", "xmm1", "xmm2");
2000 }
2001 
2002 void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
2003                                ptrdiff_t src_stride,
2004                                uint8_t* dst_argb,
2005                                int dst_width) {
2006   asm volatile(LABELALIGN
2007       "1:                                        \n"
2008       "movdqu      (%0),%%xmm0                   \n"
2009       "movdqu      0x10(%0),%%xmm1               \n"
2010       "movdqu      0x00(%0,%3,1),%%xmm2          \n"
2011       "movdqu      0x10(%0,%3,1),%%xmm3          \n"
2012       "lea         0x20(%0),%0                   \n"
2013       "pavgb       %%xmm2,%%xmm0                 \n"
2014       "pavgb       %%xmm3,%%xmm1                 \n"
2015       "movdqa      %%xmm0,%%xmm2                 \n"
2016       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2017       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2018       "pavgb       %%xmm2,%%xmm0                 \n"
2019       "movdqu      %%xmm0,(%1)                   \n"
2020       "lea         0x10(%1),%1                   \n"
2021       "sub         $0x4,%2                       \n"
2022       "jg          1b                            \n"
2023                : "+r"(src_argb),              // %0
2024                  "+r"(dst_argb),              // %1
2025                  "+r"(dst_width)              // %2
2026                : "r"((intptr_t)(src_stride))  // %3
2027                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2028 }
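
// The Box variant averages the two rows with pavgb, splits even/odd pixels
// with shufps, then averages again, so each channel is a cascade of two
// rounding averages rather than a single (sum + 2) >> 2. (The plain Down2
// variant above just keeps the odd pixel of each pair, and the Linear
// variant does only the horizontal half of this.) A per-channel scalar
// model (illustrative sketch):
static void ScaleARGBRowDown2Box_ScalarSketch(const uint8_t* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      // Vertical rounding average of the two rows, then horizontal average.
      int even = (src_argb[8 * x + c] + src_argb[8 * x + c + src_stride] + 1) >> 1;
      int odd = (src_argb[8 * x + 4 + c] + src_argb[8 * x + 4 + c + src_stride] + 1) >> 1;
      dst_argb[4 * x + c] = (uint8_t)((even + odd + 1) >> 1);
    }
  }
}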
2029 
2030 // Reads 4 pixels at a time.
2031 // Uses an unaligned store (movdqu), so dst_argb need not be 16 byte aligned.
2032 void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
2033                                ptrdiff_t src_stride,
2034                                int src_stepx,
2035                                uint8_t* dst_argb,
2036                                int dst_width) {
2037   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2038   intptr_t src_stepx_x12;
2039   (void)src_stride;
2040   asm volatile(
2041       "lea         0x00(,%1,4),%1                \n"
2042       "lea         0x00(%1,%1,2),%4              \n"
2043 
2044       LABELALIGN
2045       "1:                                        \n"
2046       "movd        (%0),%%xmm0                   \n"
2047       "movd        0x00(%0,%1,1),%%xmm1          \n"
2048       "punpckldq   %%xmm1,%%xmm0                 \n"
2049       "movd        0x00(%0,%1,2),%%xmm2          \n"
2050       "movd        0x00(%0,%4,1),%%xmm3          \n"
2051       "lea         0x00(%0,%1,4),%0              \n"
2052       "punpckldq   %%xmm3,%%xmm2                 \n"
2053       "punpcklqdq  %%xmm2,%%xmm0                 \n"
2054       "movdqu      %%xmm0,(%2)                   \n"
2055       "lea         0x10(%2),%2                   \n"
2056       "sub         $0x4,%3                       \n"
2057       "jg          1b                            \n"
2058       : "+r"(src_argb),       // %0
2059         "+r"(src_stepx_x4),   // %1
2060         "+r"(dst_argb),       // %2
2061         "+r"(dst_width),      // %3
2062         "=&r"(src_stepx_x12)  // %4
2063         ::"memory",
2064         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2065 }
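
// The Even variant is a strided gather: output pixel x copies source pixel
// x * src_stepx, and the lea instructions above only precompute the
// src_stepx*4 and src_stepx*12 byte offsets used by the four loads. A
// scalar model (illustrative sketch):
static void ScaleARGBRowDownEven_ScalarSketch(const uint8_t* src_argb,
                                              int src_stepx,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // copy one 4-byte ARGB pixel
      dst_argb[4 * x + c] = src_argb[4 * x * src_stepx + c];
    }
  }
}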
2066 
2067 // Blends four 2x2 pixel blocks to 4x1.
2068 // Uses an unaligned store (movdqu), so dst_argb need not be 16 byte aligned.
2069 void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
2070                                   ptrdiff_t src_stride,
2071                                   int src_stepx,
2072                                   uint8_t* dst_argb,
2073                                   int dst_width) {
2074   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2075   intptr_t src_stepx_x12;
2076   intptr_t row1 = (intptr_t)(src_stride);
2077   asm volatile(
2078       "lea         0x00(,%1,4),%1                \n"
2079       "lea         0x00(%1,%1,2),%4              \n"
2080       "lea         0x00(%0,%5,1),%5              \n"
2081 
2082       LABELALIGN
2083       "1:                                        \n"
2084       "movq        (%0),%%xmm0                   \n"
2085       "movhps      0x00(%0,%1,1),%%xmm0          \n"
2086       "movq        0x00(%0,%1,2),%%xmm1          \n"
2087       "movhps      0x00(%0,%4,1),%%xmm1          \n"
2088       "lea         0x00(%0,%1,4),%0              \n"
2089       "movq        (%5),%%xmm2                   \n"
2090       "movhps      0x00(%5,%1,1),%%xmm2          \n"
2091       "movq        0x00(%5,%1,2),%%xmm3          \n"
2092       "movhps      0x00(%5,%4,1),%%xmm3          \n"
2093       "lea         0x00(%5,%1,4),%5              \n"
2094       "pavgb       %%xmm2,%%xmm0                 \n"
2095       "pavgb       %%xmm3,%%xmm1                 \n"
2096       "movdqa      %%xmm0,%%xmm2                 \n"
2097       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2098       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2099       "pavgb       %%xmm2,%%xmm0                 \n"
2100       "movdqu      %%xmm0,(%2)                   \n"
2101       "lea         0x10(%2),%2                   \n"
2102       "sub         $0x4,%3                       \n"
2103       "jg          1b                            \n"
2104       : "+r"(src_argb),        // %0
2105         "+r"(src_stepx_x4),    // %1
2106         "+r"(dst_argb),        // %2
2107         "+rm"(dst_width),      // %3
2108         "=&r"(src_stepx_x12),  // %4
2109         "+r"(row1)             // %5
2110         ::"memory",
2111         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2112 }
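
// A minimal plain-C sketch of the 2x2 box average computed above, for
// reference only.  ScaleARGBRowDownEvenBoxRef is an illustrative name, not a
// libyuv entry point; the SIMD path nests pavgb averages, so individual
// channels may round differently by 1 compared with the +2 bias used here.
static void ScaleARGBRowDownEvenBoxRef(const uint8_t* src_argb,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
                                       uint8_t* dst_argb,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {  // B, G, R, A channels of one output pixel.
      dst_argb[c] = (uint8_t)((src_argb[c] + src_argb[c + 4] +
                               src_argb[src_stride + c] +
                               src_argb[src_stride + c + 4] + 2) >>
                              2);
    }
    src_argb += src_stepx * 4;  // Step src_stepx ARGB pixels.
    dst_argb += 4;              // One ARGB pixel written.
  }
}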
2113 
2114 void ScaleARGBCols_SSE2(uint8_t* dst_argb,
2115                         const uint8_t* src_argb,
2116                         int dst_width,
2117                         int x,
2118                         int dx) {
2119   intptr_t x0, x1;
2120   asm volatile(
2121       "movd        %5,%%xmm2                     \n"
2122       "movd        %6,%%xmm3                     \n"
2123       "pshufd      $0x0,%%xmm2,%%xmm2            \n"
2124       "pshufd      $0x11,%%xmm3,%%xmm0           \n"
2125       "paddd       %%xmm0,%%xmm2                 \n"
2126       "paddd       %%xmm3,%%xmm3                 \n"
2127       "pshufd      $0x5,%%xmm3,%%xmm0            \n"
2128       "paddd       %%xmm0,%%xmm2                 \n"
2129       "paddd       %%xmm3,%%xmm3                 \n"
2130       "pshufd      $0x0,%%xmm3,%%xmm3            \n"
2131       "pextrw      $0x1,%%xmm2,%k0               \n"
2132       "pextrw      $0x3,%%xmm2,%k1               \n"
2133       "cmp         $0x0,%4                       \n"
2134       "jl          99f                           \n"
2135       "sub         $0x4,%4                       \n"
2136       "jl          49f                           \n"
2137 
2138       LABELALIGN
2139       "40:                                       \n"
2140       "movd        0x00(%3,%0,4),%%xmm0          \n"
2141       "movd        0x00(%3,%1,4),%%xmm1          \n"
2142       "pextrw      $0x5,%%xmm2,%k0               \n"
2143       "pextrw      $0x7,%%xmm2,%k1               \n"
2144       "paddd       %%xmm3,%%xmm2                 \n"
2145       "punpckldq   %%xmm1,%%xmm0                 \n"
2146       "movd        0x00(%3,%0,4),%%xmm1          \n"
2147       "movd        0x00(%3,%1,4),%%xmm4          \n"
2148       "pextrw      $0x1,%%xmm2,%k0               \n"
2149       "pextrw      $0x3,%%xmm2,%k1               \n"
2150       "punpckldq   %%xmm4,%%xmm1                 \n"
2151       "punpcklqdq  %%xmm1,%%xmm0                 \n"
2152       "movdqu      %%xmm0,(%2)                   \n"
2153       "lea         0x10(%2),%2                   \n"
2154       "sub         $0x4,%4                       \n"
2155       "jge         40b                           \n"
2156 
2157       "49:                                       \n"
2158       "test        $0x2,%4                       \n"
2159       "je          29f                           \n"
2160       "movd        0x00(%3,%0,4),%%xmm0          \n"
2161       "movd        0x00(%3,%1,4),%%xmm1          \n"
2162       "pextrw      $0x5,%%xmm2,%k0               \n"
2163       "punpckldq   %%xmm1,%%xmm0                 \n"
2164       "movq        %%xmm0,(%2)                   \n"
2165       "lea         0x8(%2),%2                    \n"
2166       "29:                                       \n"
2167       "test        $0x1,%4                       \n"
2168       "je          99f                           \n"
2169       "movd        0x00(%3,%0,4),%%xmm0          \n"
2170       "movd        %%xmm0,(%2)                   \n"
2171       "99:                                       \n"
2172       : "=&a"(x0),       // %0
2173         "=&d"(x1),       // %1
2174         "+r"(dst_argb),  // %2
2175         "+r"(src_argb),  // %3
2176         "+r"(dst_width)  // %4
2177       : "rm"(x),         // %5
2178         "rm"(dx)         // %6
2179       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2180 }
2181 
2182 // Reads 4 pixels, duplicates them and writes 8 pixels.
2183 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
2184 void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
2185                            const uint8_t* src_argb,
2186                            int dst_width,
2187                            int x,
2188                            int dx) {
2189   (void)x;
2190   (void)dx;
2191   asm volatile(LABELALIGN
2192       "1:                                        \n"
2193       "movdqu      (%1),%%xmm0                   \n"
2194       "lea         0x10(%1),%1                   \n"
2195       "movdqa      %%xmm0,%%xmm1                 \n"
2196       "punpckldq   %%xmm0,%%xmm0                 \n"
2197       "punpckhdq   %%xmm1,%%xmm1                 \n"
2198       "movdqu      %%xmm0,(%0)                   \n"
2199       "movdqu      %%xmm1,0x10(%0)               \n"
2200       "lea         0x20(%0),%0                   \n"
2201       "sub         $0x8,%2                       \n"
2202       "jg          1b                            \n"
2203 
2204                : "+r"(dst_argb),  // %0
2205                  "+r"(src_argb),  // %1
2206                  "+r"(dst_width)  // %2
2207                  ::"memory",
2208                  "cc", "xmm0", "xmm1");
2209 }
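
// For reference, the pixel doubling above is equivalent to this plain-C loop.
// ScaleARGBColsUp2Ref is an illustrative name only; the unused x/dx
// parameters of the SSE2 version are omitted.
static void ScaleARGBColsUp2Ref(uint8_t* dst_argb,
                                const uint8_t* src_argb,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 2) {
    int c;
    for (c = 0; c < 4; ++c) {  // Copy one ARGB pixel into two output pixels.
      dst_argb[c] = dst_argb[c + 4] = src_argb[c];
    }
    src_argb += 4;
    dst_argb += 8;
  }
}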
2210 
2211 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
2212 static const uvec8 kShuffleColARGB = {
2213     0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
2214     8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
2215 };
2216 
2217 // Shuffle table for duplicating 2 fractions into 8 bytes each
2218 static const uvec8 kShuffleFractions = {
2219     0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
2220 };
2221 
2222 // Bilinear column filtering. Blends two adjacent source ARGB pixels per output pixel. SSSE3 version
2223 void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
2224                                const uint8_t* src_argb,
2225                                int dst_width,
2226                                int x,
2227                                int dx) {
2228   intptr_t x0, x1;
2229   asm volatile(
2230       "movdqa      %0,%%xmm4                     \n"
2231       "movdqa      %1,%%xmm5                     \n"
2232       :
2233       : "m"(kShuffleColARGB),   // %0
2234         "m"(kShuffleFractions)  // %1
2235   );
2236 
2237   asm volatile(
2238       "movd        %5,%%xmm2                     \n"
2239       "movd        %6,%%xmm3                     \n"
2240       "pcmpeqb     %%xmm6,%%xmm6                 \n"
2241       "psrlw       $0x9,%%xmm6                   \n"
2242       "pextrw      $0x1,%%xmm2,%k3               \n"
2243       "sub         $0x2,%2                       \n"
2244       "jl          29f                           \n"
2245       "movdqa      %%xmm2,%%xmm0                 \n"
2246       "paddd       %%xmm3,%%xmm0                 \n"
2247       "punpckldq   %%xmm0,%%xmm2                 \n"
2248       "punpckldq   %%xmm3,%%xmm3                 \n"
2249       "paddd       %%xmm3,%%xmm3                 \n"
2250       "pextrw      $0x3,%%xmm2,%k4               \n"
2251 
2252       LABELALIGN
2253       "2:                                        \n"
2254       "movdqa      %%xmm2,%%xmm1                 \n"
2255       "paddd       %%xmm3,%%xmm2                 \n"
2256       "movq        0x00(%1,%3,4),%%xmm0          \n"
2257       "psrlw       $0x9,%%xmm1                   \n"
2258       "movhps      0x00(%1,%4,4),%%xmm0          \n"
2259       "pshufb      %%xmm5,%%xmm1                 \n"
2260       "pshufb      %%xmm4,%%xmm0                 \n"
2261       "pxor        %%xmm6,%%xmm1                 \n"
2262       "pmaddubsw   %%xmm1,%%xmm0                 \n"
2263       "psrlw       $0x7,%%xmm0                   \n"
2264       "pextrw      $0x1,%%xmm2,%k3               \n"
2265       "pextrw      $0x3,%%xmm2,%k4               \n"
2266       "packuswb    %%xmm0,%%xmm0                 \n"
2267       "movq        %%xmm0,(%0)                   \n"
2268       "lea         0x8(%0),%0                    \n"
2269       "sub         $0x2,%2                       \n"
2270       "jge         2b                            \n"
2271 
2272       LABELALIGN
2273       "29:                                       \n"
2274       "add         $0x1,%2                       \n"
2275       "jl          99f                           \n"
2276       "psrlw       $0x9,%%xmm2                   \n"
2277       "movq        0x00(%1,%3,4),%%xmm0          \n"
2278       "pshufb      %%xmm5,%%xmm2                 \n"
2279       "pshufb      %%xmm4,%%xmm0                 \n"
2280       "pxor        %%xmm6,%%xmm2                 \n"
2281       "pmaddubsw   %%xmm2,%%xmm0                 \n"
2282       "psrlw       $0x7,%%xmm0                   \n"
2283       "packuswb    %%xmm0,%%xmm0                 \n"
2284       "movd        %%xmm0,(%0)                   \n"
2285 
2286       LABELALIGN
2287       "99:                                       \n"  // clang-format error.
2288 
2289       : "+r"(dst_argb),    // %0
2290         "+r"(src_argb),    // %1
2291         "+rm"(dst_width),  // %2
2292         "=&r"(x0),         // %3
2293         "=&r"(x1)          // %4
2294       : "rm"(x),           // %5
2295         "rm"(dx)           // %6
2296       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2297 }
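
// The SSSE3 routine above walks a 16.16 fixed-point source position x by dx
// and, for each output pixel, blends the two neighbouring source pixels with
// a 7-bit fraction taken from bits 9..15 of x (the pmaddubsw/psrlw $7 pair).
// A hedged plain-C sketch of that per-channel blend follows; the name
// ScaleARGBFilterColsRef is illustrative, and the final-pixel special case of
// the SIMD loop is omitted, so the last blend may read one pixel past the
// final source column used.
static void ScaleARGBFilterColsRef(uint8_t* dst_argb,
                                   const uint8_t* src_argb,
                                   int dst_width,
                                   int x,
                                   int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // Integer source column.
    int f = (x >> 9) & 0x7f;  // 7-bit interpolation fraction.
    const uint8_t* a = src_argb + xi * 4;  // Left pixel.
    const uint8_t* b = a + 4;              // Right pixel.
    int c;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = (uint8_t)((a[c] * (0x7f ^ f) + b[c] * f) >> 7);
    }
    dst_argb += 4;
    x += dx;
  }
}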
2298 
2299 // Divide num by div and return as 16.16 fixed point result.
2300 int FixedDiv_X86(int num, int div) {
2301   asm volatile(
2302       "cdq                                       \n"
2303       "shld        $0x10,%%eax,%%edx             \n"
2304       "shl         $0x10,%%eax                   \n"
2305       "idiv        %1                            \n"
2306       "mov         %0, %%eax                     \n"
2307       : "+a"(num)  // %0
2308       : "c"(div)   // %1
2309       : "memory", "cc", "edx");
2310   return num;
2311 }
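
// A portable sketch of the same computation using 64-bit arithmetic instead
// of the 32:32 idiv above.  FixedDivRef is an illustrative name only.
static int FixedDivRef(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}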
2312 
2313 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
2314 int FixedDiv1_X86(int num, int div) {
2315   asm volatile(
2316       "cdq                                       \n"
2317       "shld        $0x10,%%eax,%%edx             \n"
2318       "shl         $0x10,%%eax                   \n"
2319       "sub         $0x10001,%%eax                \n"
2320       "sbb         $0x0,%%edx                    \n"
2321       "sub         $0x1,%1                       \n"
2322       "idiv        %1                            \n"
2323       "mov         %0, %%eax                     \n"
2324       : "+a"(num)  // %0
2325       : "c"(div)   // %1
2326       : "memory", "cc", "edx");
2327   return num;
2328 }
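
// Likewise, a portable sketch of the variant above: it divides
// (num << 16) - 0x10001 by (div - 1), which the asm builds with the sub/sbb
// pair.  FixedDiv1Ref is an illustrative name only.
static int FixedDiv1Ref(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}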
2329 
2330 #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
2331     defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
2332 
2333 // Shuffle table for splitting UV into upper and lower part of register.
2334 static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
2335                                       1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
2336 static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
2337                                       6u,   14u,  0x80, 0x80, 0x80, 0x80,
2338                                       0x80, 0x80, 0x80, 0x80};
2339 #endif
2340 
2341 #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
2342 
2343 void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
2344                               ptrdiff_t src_stride,
2345                               uint8_t* dst_ptr,
2346                               int dst_width) {
2347   asm volatile(
2348       "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
2349       "psrlw       $0xf,%%xmm4                   \n"
2350       "packuswb    %%xmm4,%%xmm4                 \n"
2351       "pxor        %%xmm5, %%xmm5                \n"  // zero
2352       "movdqa      %4,%%xmm1                     \n"  // split shuffler
2353       "movdqa      %5,%%xmm3                     \n"  // merge shuffler
2354 
2355       LABELALIGN
2356       "1:                                        \n"
2357       "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
2358       "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
2359       "lea         0x10(%0),%0                   \n"
2360       "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
2361       "pshufb      %%xmm1,%%xmm2                 \n"
2362       "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
2363       "pmaddubsw   %%xmm4,%%xmm2                 \n"
2364       "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
2365       "psrlw       $0x1,%%xmm0                   \n"  // round
2366       "pavgw       %%xmm5,%%xmm0                 \n"
2367       "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
2368       "movq        %%xmm0,(%1)                   \n"
2369       "lea         0x8(%1),%1                    \n"  // 4 UV
2370       "sub         $0x4,%2                       \n"
2371       "jg          1b                            \n"
2372       : "+r"(src_ptr),                // %0
2373         "+r"(dst_ptr),                // %1
2374         "+r"(dst_width)               // %2
2375       : "r"((intptr_t)(src_stride)),  // %3
2376         "m"(kShuffleSplitUV),         // %4
2377         "m"(kShuffleMergeUV)          // %5
2378       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2379 }
2380 #endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
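
// The SSSE3 routine above averages each 2x2 block of interleaved UV samples.
// A hedged plain-C sketch of that box filter follows (ScaleUVRowDown2BoxRef
// is an illustrative name; the psrlw/pavgw rounding above can differ from
// the +2 bias here by 1 when the 4-sample sum is odd).
static void ScaleUVRowDown2BoxRef(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (uint8_t)((src_ptr[0] + src_ptr[2] + src_ptr[src_stride] +
                            src_ptr[src_stride + 2] + 2) >>
                           2);  // U
    dst_ptr[1] = (uint8_t)((src_ptr[1] + src_ptr[3] + src_ptr[src_stride + 1] +
                            src_ptr[src_stride + 3] + 2) >>
                           2);  // V
    src_ptr += 4;  // Two source UV pairs consumed.
    dst_ptr += 2;  // One UV pair written.
  }
}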
2381 
2382 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
2383 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
2384                              ptrdiff_t src_stride,
2385                              uint8_t* dst_ptr,
2386                              int dst_width) {
2387   asm volatile(
2388       "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 01010101
2389       "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
2390       "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
2391       "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero
2392       "vbroadcastf128 %4,%%ymm1                  \n"  // split shuffler
2393       "vbroadcastf128 %5,%%ymm3                  \n"  // merge shuffler
2394 
2395       LABELALIGN
2396       "1:                                        \n"
2397       "vmovdqu     (%0),%%ymm0                   \n"  // 16 UV row 0
2398       "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"  // 16 UV row 1
2399       "lea         0x20(%0),%0                   \n"
2400       "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv
2401       "vpshufb     %%ymm1,%%ymm2,%%ymm2          \n"
2402       "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // horizontal add
2403       "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
2404       "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // vertical add
2405       "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // round
2406       "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
2407       "vpshufb     %%ymm3,%%ymm0,%%ymm0          \n"  // merge uv
2408       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // combine qwords
2409       "vmovdqu     %%xmm0,(%1)                   \n"
2410       "lea         0x10(%1),%1                   \n"  // 8 UV
2411       "sub         $0x8,%2                       \n"
2412       "jg          1b                            \n"
2413       "vzeroupper                                \n"
2414       : "+r"(src_ptr),                // %0
2415         "+r"(dst_ptr),                // %1
2416         "+r"(dst_width)               // %2
2417       : "r"((intptr_t)(src_stride)),  // %3
2418         "m"(kShuffleSplitUV),         // %4
2419         "m"(kShuffleMergeUV)          // %5
2420       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2421 }
2422 #endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
2423 
2424 static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
2425                                       3, 1, 3, 1, 1, 3, 1, 3};
2426 
2427 #ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
2428 void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
2429                                 uint8_t* dst_ptr,
2430                                 int dst_width) {
2431   asm volatile(
2432       "pcmpeqw     %%xmm4,%%xmm4                 \n"
2433       "psrlw       $15,%%xmm4                    \n"
2434       "psllw       $1,%%xmm4                     \n"  // all 2
2435       "movdqa      %3,%%xmm3                     \n"
2436 
2437       LABELALIGN
2438       "1:                                        \n"
2439       "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
2440       "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
2441       "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
2442       "movdqa      %%xmm0,%%xmm2                 \n"
2443       "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
2444       "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
2445       "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
2446       "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
2447       "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
2448       "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
2449       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
2450       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
2451       "packuswb    %%xmm2,%%xmm0                 \n"
2452       "movdqu      %%xmm0,(%1)                   \n"
2453 
2454       "lea         0x8(%0),%0                    \n"
2455       "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
2456       "sub         $0x8,%2                       \n"
2457       "jg          1b                            \n"
2458       : "+r"(src_ptr),        // %0
2459         "+r"(dst_ptr),        // %1
2460         "+r"(dst_width)       // %2
2461       : "m"(kUVLinearMadd31)  // %3
2462       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2463 }
2464 #endif
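
// The (3*near + far + 2) >> 2 filter above doubles a row of UV pairs
// horizontally.  A hedged plain-C sketch of the same weighting, assuming
// dst_width counts UV pairs as in the SIMD loop; ScaleUVRowUp2LinearRef is
// an illustrative name only.
static void ScaleUVRowUp2LinearRef(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   int dst_width) {
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {  // U then V of one source pair.
      int near_px = src_ptr[2 * x + c];
      int far_px = src_ptr[2 * x + 2 + c];
      dst_ptr[4 * x + c] = (uint8_t)((3 * near_px + far_px + 2) >> 2);
      dst_ptr[4 * x + 2 + c] = (uint8_t)((near_px + 3 * far_px + 2) >> 2);
    }
  }
}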
2465 
2466 #ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
2467 void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
2468                                   ptrdiff_t src_stride,
2469                                   uint8_t* dst_ptr,
2470                                   ptrdiff_t dst_stride,
2471                                   int dst_width) {
2472   asm volatile(
2473       "pcmpeqw     %%xmm6,%%xmm6                 \n"
2474       "psrlw       $15,%%xmm6                    \n"
2475       "psllw       $3,%%xmm6                     \n"  // all 8
2476       "movdqa      %5,%%xmm7                     \n"
2477 
2478       LABELALIGN
2479       "1:                                        \n"
2480       "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
2481       "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
2482       "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
2483       "movdqa      %%xmm0,%%xmm2                 \n"
2484       "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
2485       "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
2486       "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
2487       "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
2488 
2489       "movq        (%0,%3),%%xmm1                \n"
2490       "movq        2(%0,%3),%%xmm4               \n"
2491       "punpcklbw   %%xmm4,%%xmm1                 \n"
2492       "movdqa      %%xmm1,%%xmm3                 \n"
2493       "punpckhdq   %%xmm1,%%xmm3                 \n"
2494       "punpckldq   %%xmm1,%%xmm1                 \n"
2495       "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
2496       "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
2497 
2498       // xmm0 xmm2
2499       // xmm1 xmm3
2500 
2501       "movdqa      %%xmm0,%%xmm4                 \n"
2502       "movdqa      %%xmm1,%%xmm5                 \n"
2503       "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
2504       "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
2505       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
2506       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
2507       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
2508 
2509       "movdqa      %%xmm1,%%xmm5                 \n"
2510       "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
2511       "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
2512       "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
2513       "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
2514       "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
2515 
2516       "movdqa      %%xmm2,%%xmm0                 \n"
2517       "movdqa      %%xmm3,%%xmm1                 \n"
2518       "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
2519       "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
2520       "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
2521       "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
2522       "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
2523 
2524       "movdqa      %%xmm3,%%xmm1                 \n"
2525       "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
2526       "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
2527       "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
2528       "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
2529       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
2530 
2531       "packuswb    %%xmm0,%%xmm4                 \n"
2532       "movdqu      %%xmm4,(%1)                   \n"  // store above
2533       "packuswb    %%xmm1,%%xmm5                 \n"
2534       "movdqu      %%xmm5,(%1,%4)                \n"  // store below
2535 
2536       "lea         0x8(%0),%0                    \n"
2537       "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
2538       "sub         $0x8,%2                       \n"
2539       "jg          1b                            \n"
2540       : "+r"(src_ptr),                // %0
2541         "+r"(dst_ptr),                // %1
2542         "+r"(dst_width)               // %2
2543       : "r"((intptr_t)(src_stride)),  // %3
2544         "r"((intptr_t)(dst_stride)),  // %4
2545         "m"(kUVLinearMadd31)          // %5
2546       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2547         "xmm7");
2548 }
2549 #endif
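
// The bilinear path above upsamples 2x in both directions with 9:3:3:1
// weights (nearest, horizontal, vertical, diagonal), a +8 bias and a >>4.
// A hedged plain-C sketch follows, with BlendUV9331 and
// ScaleUVRowUp2BilinearRef as illustrative names only.
static uint8_t BlendUV9331(int nearest, int horiz, int vert, int diag) {
  return (uint8_t)((nearest * 9 + horiz * 3 + vert * 3 + diag + 8) >> 4);
}

static void ScaleUVRowUp2BilinearRef(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_ptr,
                                     ptrdiff_t dst_stride,
                                     int dst_width) {
  const uint8_t* s = src_ptr;               // Upper source row.
  const uint8_t* t = src_ptr + src_stride;  // Lower source row.
  uint8_t* d = dst_ptr;                     // "Above" output row.
  uint8_t* e = dst_ptr + dst_stride;        // "Below" output row.
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {  // U then V.
      int s0 = s[2 * x + c], s1 = s[2 * x + 2 + c];
      int t0 = t[2 * x + c], t1 = t[2 * x + 2 + c];
      d[4 * x + c] = BlendUV9331(s0, s1, t0, t1);
      d[4 * x + 2 + c] = BlendUV9331(s1, s0, t1, t0);
      e[4 * x + c] = BlendUV9331(t0, t1, s0, s1);
      e[4 * x + 2 + c] = BlendUV9331(t1, t0, s1, s0);
    }
  }
}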
2550 
2551 #ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
2552 
2553 void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
2554                                uint8_t* dst_ptr,
2555                                int dst_width) {
2556   asm volatile(
2557       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
2558       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
2559       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
2560       "vbroadcastf128 %3,%%ymm3                  \n"
2561 
2562       LABELALIGN
2563       "1:                                        \n"
2564       "vmovdqu     (%0),%%xmm0                   \n"
2565       "vmovdqu     2(%0),%%xmm1                  \n"
2566       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
2567       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
2568       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
2569       "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
2570       "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
2571       "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
2572       "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
2573       "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
2574       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
2575       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
2576       "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
2577       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
2578       "vmovdqu     %%ymm0,(%1)                   \n"
2579 
2580       "lea         0x10(%0),%0                   \n"
2581       "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
2582       "sub         $0x10,%2                      \n"
2583       "jg          1b                            \n"
2584       "vzeroupper                                \n"
2585       : "+r"(src_ptr),        // %0
2586         "+r"(dst_ptr),        // %1
2587         "+r"(dst_width)       // %2
2588       : "m"(kUVLinearMadd31)  // %3
2589       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2590 }
2591 #endif
2592 
2593 #ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
2594 void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
2595                                  ptrdiff_t src_stride,
2596                                  uint8_t* dst_ptr,
2597                                  ptrdiff_t dst_stride,
2598                                  int dst_width) {
2599   asm volatile(
2600       "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
2601       "vpsrlw      $15,%%ymm6,%%ymm6             \n"
2602       "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
2603       "vbroadcastf128 %5,%%ymm7                  \n"
2604 
2605       LABELALIGN
2606       "1:                                        \n"
2607       "vmovdqu     (%0),%%xmm0                   \n"
2608       "vmovdqu     2(%0),%%xmm1                  \n"
2609       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
2610       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
2611       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
2612       "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
2613       "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
2614       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
2615       "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
2616 
2617       "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
2618       "vmovdqu     2(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
2619       "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
2620       "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
2621       "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
2622       "vpunpckhdq  %%ymm2,%%ymm2,%%ymm4          \n"
2623       "vpunpckldq  %%ymm2,%%ymm2,%%ymm2          \n"
2624       "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
2625       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
2626 
2627       // ymm0 ymm1
2628       // ymm2 ymm3
2629 
2630       "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
2631       "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
2632       "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
2633       "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
2634       "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
2635 
2636       "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
2637       "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
2638       "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
2639       "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
2640       "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
2641 
2642       "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
2643       "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
2644       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
2645       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
2646       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
2647 
2648       "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
2649       "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
2650       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
2651       "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
2652       "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
2653 
2654       "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
2655       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
2656       "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
2657       "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
2658 
2659       "lea         0x10(%0),%0                   \n"
2660       "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
2661       "sub         $0x10,%2                      \n"
2662       "jg          1b                            \n"
2663       "vzeroupper                                \n"
2664       : "+r"(src_ptr),                // %0
2665         "+r"(dst_ptr),                // %1
2666         "+r"(dst_width)               // %2
2667       : "r"((intptr_t)(src_stride)),  // %3
2668         "r"((intptr_t)(dst_stride)),  // %4
2669         "m"(kUVLinearMadd31)          // %5
2670       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2671         "xmm7");
2672 }
2673 #endif
2674 
2675 #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
2676 void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
2677                                    uint16_t* dst_ptr,
2678                                    int dst_width) {
2679   asm volatile(
2680       "pxor        %%xmm5,%%xmm5                 \n"
2681       "pcmpeqd     %%xmm4,%%xmm4                 \n"
2682       "psrld       $31,%%xmm4                    \n"
2683       "pslld       $1,%%xmm4                     \n"  // all 2
2684 
2685       LABELALIGN
2686       "1:                                        \n"
2687       "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
2688       "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
2689 
2690       "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0011 (32b, 1u1v)
2691       "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1122 (32b, 1u1v)
2692 
2693       "movdqa      %%xmm0,%%xmm2                 \n"
2694       "movdqa      %%xmm1,%%xmm3                 \n"
2695 
2696       "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (lo, far)
2697       "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (hi, far)
2698 
2699       "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
2700       "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
2701       "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
2702       "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
2703       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
2704       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
2705       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
2706       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
2707 
2708       "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
2709       "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
2710       "packusdw    %%xmm1,%%xmm0                 \n"
2711       "movdqu      %%xmm0,(%1)                   \n"
2712 
2713       "lea         0x8(%0),%0                    \n"
2714       "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
2715       "sub         $0x4,%2                       \n"
2716       "jg          1b                            \n"
2717       : "+r"(src_ptr),   // %0
2718         "+r"(dst_ptr),   // %1
2719         "+r"(dst_width)  // %2
2720       :
2721       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2722 }
2723 #endif
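
// The 16-bit path applies the same (3*near + far + 2) >> 2 filter to
// uint16_t UV samples.  A hedged plain-C sketch, with
// ScaleUVRowUp2Linear16Ref as an illustrative name only.
static void ScaleUVRowUp2Linear16Ref(const uint16_t* src_ptr,
                                     uint16_t* dst_ptr,
                                     int dst_width) {
  int src_width = dst_width >> 1;
  int x;
  for (x = 0; x < src_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {  // U then V.
      uint32_t near_px = src_ptr[2 * x + c];
      uint32_t far_px = src_ptr[2 * x + 2 + c];
      dst_ptr[4 * x + c] = (uint16_t)((3 * near_px + far_px + 2) >> 2);
      dst_ptr[4 * x + 2 + c] = (uint16_t)((near_px + 3 * far_px + 2) >> 2);
    }
  }
}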
2724 
2725 #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
2726 void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
2727                                      ptrdiff_t src_stride,
2728                                      uint16_t* dst_ptr,
2729                                      ptrdiff_t dst_stride,
2730                                      int dst_width) {
2731   asm volatile(
2732       "pxor        %%xmm7,%%xmm7                 \n"
2733       "pcmpeqd     %%xmm6,%%xmm6                 \n"
2734       "psrld       $31,%%xmm6                    \n"
2735       "pslld       $3,%%xmm6                     \n"  // all 8
2736 
2737       LABELALIGN
2738       "1:                                        \n"
2739       "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
2740       "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
2741       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
2742       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
2743       "movdqa      %%xmm0,%%xmm2                 \n"
2744       "movdqa      %%xmm1,%%xmm3                 \n"
2745       "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
2746       "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
2747       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
2748       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
2749       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
2750       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
2751       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
2752       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
2753 
2754       "movq        (%0,%3,2),%%xmm2              \n"
2755       "movq        4(%0,%3,2),%%xmm3             \n"
2756       "punpcklwd   %%xmm7,%%xmm2                 \n"
2757       "punpcklwd   %%xmm7,%%xmm3                 \n"
2758       "movdqa      %%xmm2,%%xmm4                 \n"
2759       "movdqa      %%xmm3,%%xmm5                 \n"
2760       "pshufd      $0b01001110,%%xmm4,%%xmm4     \n"  // 1100 (far) (2, lo)
2761       "pshufd      $0b01001110,%%xmm5,%%xmm5     \n"  // 2211 (far) (2, hi)
2762       "paddd       %%xmm2,%%xmm4                 \n"  // near+far (2, lo)
2763       "paddd       %%xmm3,%%xmm5                 \n"  // near+far (2, hi)
2764       "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (2, lo)
2765       "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (2, hi)
2766       "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
2767       "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
2768 
2769       "movdqa      %%xmm0,%%xmm4                 \n"
2770       "movdqa      %%xmm2,%%xmm5                 \n"
2771       "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
2772       "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
2773       "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
2774       "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
2775       "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
2776 
2777       "movdqa      %%xmm2,%%xmm5                 \n"
2778       "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
2779       "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
2780       "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
2781       "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
2782       "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
2783 
2784       "movdqa      %%xmm1,%%xmm0                 \n"
2785       "movdqa      %%xmm3,%%xmm2                 \n"
2786       "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
2787       "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
2788       "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
2789       "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
2790       "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
2791 
2792       "movdqa      %%xmm3,%%xmm2                 \n"
2793       "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
2794       "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
2795       "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
2796       "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
2797       "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
2798 
2799       "packusdw    %%xmm0,%%xmm4                 \n"
2800       "movdqu      %%xmm4,(%1)                   \n"  // store above
2801       "packusdw    %%xmm2,%%xmm5                 \n"
2802       "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
2803 
2804       "lea         0x8(%0),%0                    \n"
2805       "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
2806       "sub         $0x4,%2                       \n"
2807       "jg          1b                            \n"
2808       : "+r"(src_ptr),                // %0
2809         "+r"(dst_ptr),                // %1
2810         "+r"(dst_width)               // %2
2811       : "r"((intptr_t)(src_stride)),  // %3
2812         "r"((intptr_t)(dst_stride))   // %4
2813       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2814         "xmm7");
2815 }
2816 #endif
2817 
2818 #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
2819 void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
2820                                   uint16_t* dst_ptr,
2821                                   int dst_width) {
2822   asm volatile(
2823       "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
2824       "vpsrld      $31,%%ymm4,%%ymm4             \n"
2825       "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
2826 
2827       LABELALIGN
2828       "1:                                        \n"
2829       "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
2830       "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
2831 
2832       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
2833       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
2834 
2835       "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
2836       "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
2837 
2838       "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
2839       "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
2840       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
2841       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
2842       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
2843       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
2844       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
2845       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
2846 
2847       "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
2848       "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
2849       "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
2850       "vmovdqu     %%ymm0,(%1)                   \n"
2851 
2852       "lea         0x10(%0),%0                   \n"
2853       "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
2854       "sub         $0x8,%2                       \n"
2855       "jg          1b                            \n"
2856       "vzeroupper                                \n"
2857       : "+r"(src_ptr),   // %0
2858         "+r"(dst_ptr),   // %1
2859         "+r"(dst_width)  // %2
2860       :
2861       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2862 }
2863 #endif
2864 
2865 #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
2866 void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
2867                                     ptrdiff_t src_stride,
2868                                     uint16_t* dst_ptr,
2869                                     ptrdiff_t dst_stride,
2870                                     int dst_width) {
2871   asm volatile(
2872       "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
2873       "vpsrld      $31,%%ymm6,%%ymm6             \n"
2874       "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
2875 
2876       LABELALIGN
2877       "1:                                        \n"
2878 
2879       "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
2880       "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
2881       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
2882       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
2883       "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
2884       "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
2885       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
2886       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
2887       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
2888       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
2889       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (lo)
2890       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (hi)
2891 
2892       "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 00112233 (16b, 1u1v)
2893       "vmovdqu     4(%0,%3,2),%%xmm3             \n"  // 11223344 (16b, 1u1v)
2894       "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
2895       "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
2896       "vpshufd     $0b01001110,%%ymm2,%%ymm4     \n"  // 11003322 (lo, far)
2897       "vpshufd     $0b01001110,%%ymm3,%%ymm5     \n"  // 22114433 (hi, far)
2898       "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
2899       "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
2900       "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
2901       "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
2902       "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (lo)
2903       "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (hi)
2904 
2905       "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
2906       "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
2907       "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
2908       "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
2909       "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
2910 
2911       "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
2912       "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
2913       "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
2914       "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
2915       "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
2916 
2917       "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
2918       "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
2919       "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
2920       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
2921       "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
2922 
2923       "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
2924       "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
2925       "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
2926       "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
2927       "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
2928 
2929       "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
2930       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
2931       "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
2932       "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
2933 
2934       "lea         0x10(%0),%0                   \n"
2935       "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
2936       "sub         $0x8,%2                       \n"
2937       "jg          1b                            \n"
2938       "vzeroupper                                \n"
2939       : "+r"(src_ptr),                // %0
2940         "+r"(dst_ptr),                // %1
2941         "+r"(dst_width)               // %2
2942       : "r"((intptr_t)(src_stride)),  // %3
2943         "r"((intptr_t)(dst_stride))   // %4
2944       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2945 }
2946 #endif
2947 
2948 #endif  // defined(__x86_64__) || defined(__i386__)
2949 
2950 #ifdef __cplusplus
2951 }  // extern "C"
2952 }  // namespace libyuv
2953 #endif
2954