1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/scale_row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21
22 // Offsets for source bytes 0 to 9
23 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
24 128, 128, 128, 128, 128, 128, 128, 128};
25
26 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
27 static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
28 128, 128, 128, 128, 128, 128, 128, 128};
29
30 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
31 static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
32 128, 128, 128, 128, 128, 128, 128, 128};
33
34 // Offsets for source bytes 0 to 10
35 static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
36
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
38 static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
39 8, 9, 9, 10, 10, 11, 12, 13};
40
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
42 static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
43 10, 11, 12, 13, 13, 14, 14, 15};
44
45 // Coefficients for source bytes 0 to 10
46 static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
47
48 // Coefficients for source bytes 10 to 21
49 static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
50
51 // Coefficients for source bytes 21 to 31
52 static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
53
54 // Rounding constant (2) added before the >>2 in the 3/4 filters.
55 static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
56
57 static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
58 128, 128, 128, 128, 128, 128, 128, 128};
59
60 static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
61 6, 8, 11, 14, 128, 128, 128, 128};
62
63 // Arrange words 0,3,6 into 0,1,2
64 static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
65 128, 128, 128, 128, 128, 128, 128, 128};
66
67 // Arrange words 0,3,6 into 3,4,5
68 static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
69 6, 7, 12, 13, 128, 128, 128, 128};
70
71 // Scaling values for boxes of 3x3 and 2x3
72 static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
73 65536 / 9, 65536 / 6, 0, 0};
74
75 // Arrange first value for pixels 0,1,2,3,4,5
76 static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
77 11, 128, 14, 128, 128, 128, 128, 128};
78
79 // Arrange second value for pixels 0,1,2,3,4,5
80 static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
81 12, 128, 15, 128, 128, 128, 128, 128};
82
83 // Arrange third value for pixels 0,1,2,3,4,5
84 static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
85 13, 128, 128, 128, 128, 128, 128, 128};
86
87 // Scaling values for boxes of 3x2 and 2x2
88 static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
89 65536 / 3, 65536 / 2, 0, 0};
90
91 // GCC versions of row functions are verbatim conversions from Visual C.
92 // Generated using gcc disassembly on Visual C object file:
93 // objdump -D yuvscaler.obj >yuvscaler.txt
94
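// ScaleRowDown2 point-samples the row, keeping the odd byte of each source
// pair (psrlw $8 then packuswb). A rough plain-C sketch of the loop below,
// for reference only (not the compiled path):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[2 * x + 1];
//   }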
95 void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
96 ptrdiff_t src_stride,
97 uint8_t* dst_ptr,
98 int dst_width) {
99 (void)src_stride;
100 asm volatile(
101 // 16 pixel loop.
102 LABELALIGN
103 "1: \n"
104 "movdqu (%0),%%xmm0 \n"
105 "movdqu 0x10(%0),%%xmm1 \n"
106 "lea 0x20(%0),%0 \n"
107 "psrlw $0x8,%%xmm0 \n"
108 "psrlw $0x8,%%xmm1 \n"
109 "packuswb %%xmm1,%%xmm0 \n"
110 "movdqu %%xmm0,(%1) \n"
111 "lea 0x10(%1),%1 \n"
112 "sub $0x10,%2 \n"
113 "jg 1b \n"
114 : "+r"(src_ptr), // %0
115 "+r"(dst_ptr), // %1
116 "+r"(dst_width) // %2
117 ::"memory",
118 "cc", "xmm0", "xmm1");
119 }
120
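// ScaleRowDown2Linear averages each horizontal pair of source bytes:
// pmaddubsw with an all-ones vector forms the 16-bit pair sums and pavgw
// against zero rounds the halving. Rough C sketch (reference only):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1;
//   }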
121 void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
122 ptrdiff_t src_stride,
123 uint8_t* dst_ptr,
124 int dst_width) {
125 (void)src_stride;
126 asm volatile(
127 "pcmpeqb %%xmm4,%%xmm4 \n"
128 "psrlw $0xf,%%xmm4 \n"
129 "packuswb %%xmm4,%%xmm4 \n"
130 "pxor %%xmm5,%%xmm5 \n"
131
132 LABELALIGN
133 "1: \n"
134 "movdqu (%0),%%xmm0 \n"
135 "movdqu 0x10(%0),%%xmm1 \n"
136 "lea 0x20(%0),%0 \n"
137 "pmaddubsw %%xmm4,%%xmm0 \n"
138 "pmaddubsw %%xmm4,%%xmm1 \n"
139 "pavgw %%xmm5,%%xmm0 \n"
140 "pavgw %%xmm5,%%xmm1 \n"
141 "packuswb %%xmm1,%%xmm0 \n"
142 "movdqu %%xmm0,(%1) \n"
143 "lea 0x10(%1),%1 \n"
144 "sub $0x10,%2 \n"
145 "jg 1b \n"
146 : "+r"(src_ptr), // %0
147 "+r"(dst_ptr), // %1
148 "+r"(dst_width) // %2
149 ::"memory",
150 "cc", "xmm0", "xmm1", "xmm4", "xmm5");
151 }
152
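// ScaleRowDown2Box averages a 2x2 box from two source rows; the pair sums of
// both rows are added, halved with psrlw, then rounded with pavgw, matching a
// rounded divide by 4. Rough C sketch (reference only):
//   for (int x = 0; x < dst_width; ++x) {
//     const uint8_t* s = src_ptr + 2 * x;
//     const uint8_t* t = s + src_stride;
//     dst_ptr[x] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
//   }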
153 void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
154 ptrdiff_t src_stride,
155 uint8_t* dst_ptr,
156 int dst_width) {
157 asm volatile(
158 "pcmpeqb %%xmm4,%%xmm4 \n"
159 "psrlw $0xf,%%xmm4 \n"
160 "packuswb %%xmm4,%%xmm4 \n"
161 "pxor %%xmm5,%%xmm5 \n"
162
163 LABELALIGN
164 "1: \n"
165 "movdqu (%0),%%xmm0 \n"
166 "movdqu 0x10(%0),%%xmm1 \n"
167 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
168 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
169 "lea 0x20(%0),%0 \n"
170 "pmaddubsw %%xmm4,%%xmm0 \n"
171 "pmaddubsw %%xmm4,%%xmm1 \n"
172 "pmaddubsw %%xmm4,%%xmm2 \n"
173 "pmaddubsw %%xmm4,%%xmm3 \n"
174 "paddw %%xmm2,%%xmm0 \n"
175 "paddw %%xmm3,%%xmm1 \n"
176 "psrlw $0x1,%%xmm0 \n"
177 "psrlw $0x1,%%xmm1 \n"
178 "pavgw %%xmm5,%%xmm0 \n"
179 "pavgw %%xmm5,%%xmm1 \n"
180 "packuswb %%xmm1,%%xmm0 \n"
181 "movdqu %%xmm0,(%1) \n"
182 "lea 0x10(%1),%1 \n"
183 "sub $0x10,%2 \n"
184 "jg 1b \n"
185 : "+r"(src_ptr), // %0
186 "+r"(dst_ptr), // %1
187 "+r"(dst_width) // %2
188 : "r"((intptr_t)(src_stride)) // %3
189       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
190 }
191
192 #ifdef HAS_SCALEROWDOWN2_AVX2
193 void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
194 ptrdiff_t src_stride,
195 uint8_t* dst_ptr,
196 int dst_width) {
197 (void)src_stride;
198 asm volatile(LABELALIGN
199 "1: \n"
200 "vmovdqu (%0),%%ymm0 \n"
201 "vmovdqu 0x20(%0),%%ymm1 \n"
202 "lea 0x40(%0),%0 \n"
203 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
204 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
205 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
206 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
207 "vmovdqu %%ymm0,(%1) \n"
208 "lea 0x20(%1),%1 \n"
209 "sub $0x20,%2 \n"
210 "jg 1b \n"
211 "vzeroupper \n"
212 : "+r"(src_ptr), // %0
213 "+r"(dst_ptr), // %1
214 "+r"(dst_width) // %2
215 ::"memory",
216 "cc", "xmm0", "xmm1");
217 }
218
219 void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
220 ptrdiff_t src_stride,
221 uint8_t* dst_ptr,
222 int dst_width) {
223 (void)src_stride;
224 asm volatile(
225 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
226 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
227 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
228 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
229
230 LABELALIGN
231 "1: \n"
232 "vmovdqu (%0),%%ymm0 \n"
233 "vmovdqu 0x20(%0),%%ymm1 \n"
234 "lea 0x40(%0),%0 \n"
235 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
236 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
237 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
238 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
239 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
240 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
241 "vmovdqu %%ymm0,(%1) \n"
242 "lea 0x20(%1),%1 \n"
243 "sub $0x20,%2 \n"
244 "jg 1b \n"
245 "vzeroupper \n"
246 : "+r"(src_ptr), // %0
247 "+r"(dst_ptr), // %1
248 "+r"(dst_width) // %2
249 ::"memory",
250 "cc", "xmm0", "xmm1", "xmm4", "xmm5");
251 }
252
253 void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
254 ptrdiff_t src_stride,
255 uint8_t* dst_ptr,
256 int dst_width) {
257 asm volatile(
258 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
259 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
260 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
261 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
262
263 LABELALIGN
264 "1: \n"
265 "vmovdqu (%0),%%ymm0 \n"
266 "vmovdqu 0x20(%0),%%ymm1 \n"
267 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
268 "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
269 "lea 0x40(%0),%0 \n"
270 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
271 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
272 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
273 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
274 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
275 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
276 "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
277 "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
278 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
279 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
280 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
281 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
282 "vmovdqu %%ymm0,(%1) \n"
283 "lea 0x20(%1),%1 \n"
284 "sub $0x20,%2 \n"
285 "jg 1b \n"
286 "vzeroupper \n"
287 : "+r"(src_ptr), // %0
288 "+r"(dst_ptr), // %1
289 "+r"(dst_width) // %2
290 : "r"((intptr_t)(src_stride)) // %3
291       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
292 }
293 #endif // HAS_SCALEROWDOWN2_AVX2
294
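// ScaleRowDown4 point-samples byte 2 of every 4 source bytes (the pand mask
// keeps bits 16..23 of each dword before packing). Rough C sketch
// (reference only):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[4 * x + 2];
//   }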
295 void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
296 ptrdiff_t src_stride,
297 uint8_t* dst_ptr,
298 int dst_width) {
299 (void)src_stride;
300 asm volatile(
301 "pcmpeqb %%xmm5,%%xmm5 \n"
302 "psrld $0x18,%%xmm5 \n"
303 "pslld $0x10,%%xmm5 \n"
304
305 LABELALIGN
306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqu 0x10(%0),%%xmm1 \n"
309 "lea 0x20(%0),%0 \n"
310 "pand %%xmm5,%%xmm0 \n"
311 "pand %%xmm5,%%xmm1 \n"
312 "packuswb %%xmm1,%%xmm0 \n"
313 "psrlw $0x8,%%xmm0 \n"
314 "packuswb %%xmm0,%%xmm0 \n"
315 "movq %%xmm0,(%1) \n"
316 "lea 0x8(%1),%1 \n"
317 "sub $0x8,%2 \n"
318 "jg 1b \n"
319 : "+r"(src_ptr), // %0
320 "+r"(dst_ptr), // %1
321 "+r"(dst_width) // %2
322 ::"memory",
323 "cc", "xmm0", "xmm1", "xmm5");
324 }
325
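// ScaleRowDown4Box averages a 4x4 box from four source rows: pmaddubsw/paddw
// accumulate the 16 bytes, phaddw folds the pair sums, and +8 followed by
// >>4 rounds the divide by 16. Rough C sketch (reference only):
//   for (int x = 0; x < dst_width; ++x) {
//     int sum = 0;
//     for (int r = 0; r < 4; ++r) {
//       for (int c = 0; c < 4; ++c) {
//         sum += src_ptr[r * src_stride + 4 * x + c];
//       }
//     }
//     dst_ptr[x] = (sum + 8) >> 4;
//   }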
326 void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
327 ptrdiff_t src_stride,
328 uint8_t* dst_ptr,
329 int dst_width) {
330 intptr_t stridex3;
331 asm volatile(
332 "pcmpeqb %%xmm4,%%xmm4 \n"
333 "psrlw $0xf,%%xmm4 \n"
334 "movdqa %%xmm4,%%xmm5 \n"
335 "packuswb %%xmm4,%%xmm4 \n"
336 "psllw $0x3,%%xmm5 \n"
337 "lea 0x00(%4,%4,2),%3 \n"
338
339 LABELALIGN
340 "1: \n"
341 "movdqu (%0),%%xmm0 \n"
342 "movdqu 0x10(%0),%%xmm1 \n"
343 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
344 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
345 "pmaddubsw %%xmm4,%%xmm0 \n"
346 "pmaddubsw %%xmm4,%%xmm1 \n"
347 "pmaddubsw %%xmm4,%%xmm2 \n"
348 "pmaddubsw %%xmm4,%%xmm3 \n"
349 "paddw %%xmm2,%%xmm0 \n"
350 "paddw %%xmm3,%%xmm1 \n"
351 "movdqu 0x00(%0,%4,2),%%xmm2 \n"
352 "movdqu 0x10(%0,%4,2),%%xmm3 \n"
353 "pmaddubsw %%xmm4,%%xmm2 \n"
354 "pmaddubsw %%xmm4,%%xmm3 \n"
355 "paddw %%xmm2,%%xmm0 \n"
356 "paddw %%xmm3,%%xmm1 \n"
357 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
358 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
359 "lea 0x20(%0),%0 \n"
360 "pmaddubsw %%xmm4,%%xmm2 \n"
361 "pmaddubsw %%xmm4,%%xmm3 \n"
362 "paddw %%xmm2,%%xmm0 \n"
363 "paddw %%xmm3,%%xmm1 \n"
364 "phaddw %%xmm1,%%xmm0 \n"
365 "paddw %%xmm5,%%xmm0 \n"
366 "psrlw $0x4,%%xmm0 \n"
367 "packuswb %%xmm0,%%xmm0 \n"
368 "movq %%xmm0,(%1) \n"
369 "lea 0x8(%1),%1 \n"
370 "sub $0x8,%2 \n"
371 "jg 1b \n"
372 : "+r"(src_ptr), // %0
373 "+r"(dst_ptr), // %1
374 "+r"(dst_width), // %2
375 "=&r"(stridex3) // %3
376 : "r"((intptr_t)(src_stride)) // %4
377 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
378 }
379
380 #ifdef HAS_SCALEROWDOWN4_AVX2
381 void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
382 ptrdiff_t src_stride,
383 uint8_t* dst_ptr,
384 int dst_width) {
385 (void)src_stride;
386 asm volatile(
387 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
388 "vpsrld $0x18,%%ymm5,%%ymm5 \n"
389 "vpslld $0x10,%%ymm5,%%ymm5 \n"
390
391 LABELALIGN
392 "1: \n"
393 "vmovdqu (%0),%%ymm0 \n"
394 "vmovdqu 0x20(%0),%%ymm1 \n"
395 "lea 0x40(%0),%0 \n"
396 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
397 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
398 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
399 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
400 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
401 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
402 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
403 "vmovdqu %%xmm0,(%1) \n"
404 "lea 0x10(%1),%1 \n"
405 "sub $0x10,%2 \n"
406 "jg 1b \n"
407 "vzeroupper \n"
408 : "+r"(src_ptr), // %0
409 "+r"(dst_ptr), // %1
410 "+r"(dst_width) // %2
411 ::"memory",
412 "cc", "xmm0", "xmm1", "xmm5");
413 }
414
415 void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
416 ptrdiff_t src_stride,
417 uint8_t* dst_ptr,
418 int dst_width) {
419 asm volatile(
420 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
421 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
422 "vpsllw $0x3,%%ymm4,%%ymm5 \n"
423 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
424
425 LABELALIGN
426 "1: \n"
427 "vmovdqu (%0),%%ymm0 \n"
428 "vmovdqu 0x20(%0),%%ymm1 \n"
429 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
430 "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
431 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
432 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
433 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
434 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
435 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
436 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
437 "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
438 "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
439 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
440 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
441 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
442 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
443 "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
444 "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
445 "lea 0x40(%0),%0 \n"
446 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
447 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
448 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
449 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
450 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
451 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
452 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
453 "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
454 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
455 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
456 "vmovdqu %%xmm0,(%1) \n"
457 "lea 0x10(%1),%1 \n"
458 "sub $0x10,%2 \n"
459 "jg 1b \n"
460 "vzeroupper \n"
461 : "+r"(src_ptr), // %0
462 "+r"(dst_ptr), // %1
463 "+r"(dst_width) // %2
464 : "r"((intptr_t)(src_stride)), // %3
465 "r"((intptr_t)(src_stride * 3)) // %4
466 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
467 }
468 #endif // HAS_SCALEROWDOWN4_AVX2
469
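// ScaleRowDown34 point-samples 3 pixels out of every 4 using the kShuf*
// byte shuffles. Rough C sketch (reference only):
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[0] = src_ptr[0];
//     dst_ptr[1] = src_ptr[1];
//     dst_ptr[2] = src_ptr[3];
//     dst_ptr += 3;
//     src_ptr += 4;
//   }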
470 void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
471 ptrdiff_t src_stride,
472 uint8_t* dst_ptr,
473 int dst_width) {
474 (void)src_stride;
475 asm volatile(
476 "movdqa %0,%%xmm3 \n"
477 "movdqa %1,%%xmm4 \n"
478 "movdqa %2,%%xmm5 \n"
479 :
480 : "m"(kShuf0), // %0
481 "m"(kShuf1), // %1
482 "m"(kShuf2) // %2
483 );
484 asm volatile(LABELALIGN
485 "1: \n"
486 "movdqu (%0),%%xmm0 \n"
487 "movdqu 0x10(%0),%%xmm2 \n"
488 "lea 0x20(%0),%0 \n"
489 "movdqa %%xmm2,%%xmm1 \n"
490 "palignr $0x8,%%xmm0,%%xmm1 \n"
491 "pshufb %%xmm3,%%xmm0 \n"
492 "pshufb %%xmm4,%%xmm1 \n"
493 "pshufb %%xmm5,%%xmm2 \n"
494 "movq %%xmm0,(%1) \n"
495 "movq %%xmm1,0x8(%1) \n"
496 "movq %%xmm2,0x10(%1) \n"
497 "lea 0x18(%1),%1 \n"
498 "sub $0x18,%2 \n"
499 "jg 1b \n"
500 : "+r"(src_ptr), // %0
501 "+r"(dst_ptr), // %1
502 "+r"(dst_width) // %2
503 ::"memory",
504 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
505 }
506
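// ScaleRowDown34_1_Box blends the two source rows 1:1 with pavgb, then
// filters each group of 4 blended pixels down to 3 using the kMadd
// coefficients plus the kRound34 bias. Roughly, with s[] the row-averaged
// pixels (reference only):
//   d[0] = (3 * s[0] + 1 * s[1] + 2) >> 2;
//   d[1] = (2 * s[1] + 2 * s[2] + 2) >> 2;
//   d[2] = (1 * s[2] + 3 * s[3] + 2) >> 2;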
507 void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
508 ptrdiff_t src_stride,
509 uint8_t* dst_ptr,
510 int dst_width) {
511 asm volatile(
512 "movdqa %0,%%xmm2 \n" // kShuf01
513 "movdqa %1,%%xmm3 \n" // kShuf11
514 "movdqa %2,%%xmm4 \n" // kShuf21
515 :
516 : "m"(kShuf01), // %0
517 "m"(kShuf11), // %1
518 "m"(kShuf21) // %2
519 );
520 asm volatile(
521 "movdqa %0,%%xmm5 \n" // kMadd01
522 "movdqa %1,%%xmm0 \n" // kMadd11
523 "movdqa %2,%%xmm1 \n" // kRound34
524 :
525 : "m"(kMadd01), // %0
526 "m"(kMadd11), // %1
527 "m"(kRound34) // %2
528 );
529 asm volatile(LABELALIGN
530 "1: \n"
531 "movdqu (%0),%%xmm6 \n"
532 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
533 "pavgb %%xmm7,%%xmm6 \n"
534 "pshufb %%xmm2,%%xmm6 \n"
535 "pmaddubsw %%xmm5,%%xmm6 \n"
536 "paddsw %%xmm1,%%xmm6 \n"
537 "psrlw $0x2,%%xmm6 \n"
538 "packuswb %%xmm6,%%xmm6 \n"
539 "movq %%xmm6,(%1) \n"
540 "movdqu 0x8(%0),%%xmm6 \n"
541 "movdqu 0x8(%0,%3,1),%%xmm7 \n"
542 "pavgb %%xmm7,%%xmm6 \n"
543 "pshufb %%xmm3,%%xmm6 \n"
544 "pmaddubsw %%xmm0,%%xmm6 \n"
545 "paddsw %%xmm1,%%xmm6 \n"
546 "psrlw $0x2,%%xmm6 \n"
547 "packuswb %%xmm6,%%xmm6 \n"
548 "movq %%xmm6,0x8(%1) \n"
549 "movdqu 0x10(%0),%%xmm6 \n"
550 "movdqu 0x10(%0,%3,1),%%xmm7 \n"
551 "lea 0x20(%0),%0 \n"
552 "pavgb %%xmm7,%%xmm6 \n"
553 "pshufb %%xmm4,%%xmm6 \n"
554 "pmaddubsw %4,%%xmm6 \n"
555 "paddsw %%xmm1,%%xmm6 \n"
556 "psrlw $0x2,%%xmm6 \n"
557 "packuswb %%xmm6,%%xmm6 \n"
558 "movq %%xmm6,0x10(%1) \n"
559 "lea 0x18(%1),%1 \n"
560 "sub $0x18,%2 \n"
561 "jg 1b \n"
562 : "+r"(src_ptr), // %0
563 "+r"(dst_ptr), // %1
564 "+r"(dst_width) // %2
565 : "r"((intptr_t)(src_stride)), // %3
566 "m"(kMadd21) // %4
567 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
568 "xmm6", "xmm7");
569 }
570
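// ScaleRowDown34_0_Box applies the same horizontal 3/4 filter, but the two
// pavgb steps first weight the source rows roughly 3:1 (reference only):
//   s_blend[i] ~= (3 * src_ptr[i] + src_ptr[i + src_stride] + 2) >> 2;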
571 void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
572 ptrdiff_t src_stride,
573 uint8_t* dst_ptr,
574 int dst_width) {
575 asm volatile(
576 "movdqa %0,%%xmm2 \n" // kShuf01
577 "movdqa %1,%%xmm3 \n" // kShuf11
578 "movdqa %2,%%xmm4 \n" // kShuf21
579 :
580 : "m"(kShuf01), // %0
581 "m"(kShuf11), // %1
582 "m"(kShuf21) // %2
583 );
584 asm volatile(
585 "movdqa %0,%%xmm5 \n" // kMadd01
586 "movdqa %1,%%xmm0 \n" // kMadd11
587 "movdqa %2,%%xmm1 \n" // kRound34
588 :
589 : "m"(kMadd01), // %0
590 "m"(kMadd11), // %1
591 "m"(kRound34) // %2
592 );
593
594 asm volatile(LABELALIGN
595 "1: \n"
596 "movdqu (%0),%%xmm6 \n"
597 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
598 "pavgb %%xmm6,%%xmm7 \n"
599 "pavgb %%xmm7,%%xmm6 \n"
600 "pshufb %%xmm2,%%xmm6 \n"
601 "pmaddubsw %%xmm5,%%xmm6 \n"
602 "paddsw %%xmm1,%%xmm6 \n"
603 "psrlw $0x2,%%xmm6 \n"
604 "packuswb %%xmm6,%%xmm6 \n"
605 "movq %%xmm6,(%1) \n"
606 "movdqu 0x8(%0),%%xmm6 \n"
607 "movdqu 0x8(%0,%3,1),%%xmm7 \n"
608 "pavgb %%xmm6,%%xmm7 \n"
609 "pavgb %%xmm7,%%xmm6 \n"
610 "pshufb %%xmm3,%%xmm6 \n"
611 "pmaddubsw %%xmm0,%%xmm6 \n"
612 "paddsw %%xmm1,%%xmm6 \n"
613 "psrlw $0x2,%%xmm6 \n"
614 "packuswb %%xmm6,%%xmm6 \n"
615 "movq %%xmm6,0x8(%1) \n"
616 "movdqu 0x10(%0),%%xmm6 \n"
617 "movdqu 0x10(%0,%3,1),%%xmm7 \n"
618 "lea 0x20(%0),%0 \n"
619 "pavgb %%xmm6,%%xmm7 \n"
620 "pavgb %%xmm7,%%xmm6 \n"
621 "pshufb %%xmm4,%%xmm6 \n"
622 "pmaddubsw %4,%%xmm6 \n"
623 "paddsw %%xmm1,%%xmm6 \n"
624 "psrlw $0x2,%%xmm6 \n"
625 "packuswb %%xmm6,%%xmm6 \n"
626 "movq %%xmm6,0x10(%1) \n"
627 "lea 0x18(%1),%1 \n"
628 "sub $0x18,%2 \n"
629 "jg 1b \n"
630 : "+r"(src_ptr), // %0
631 "+r"(dst_ptr), // %1
632 "+r"(dst_width) // %2
633 : "r"((intptr_t)(src_stride)), // %3
634 "m"(kMadd21) // %4
635 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
636 "xmm6", "xmm7");
637 }
638
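// ScaleRowDown38 point-samples 3 pixels out of every 8 (3/8 scale). Rough C
// sketch (reference only):
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[0] = src_ptr[0];
//     dst_ptr[1] = src_ptr[3];
//     dst_ptr[2] = src_ptr[6];
//     dst_ptr += 3;
//     src_ptr += 8;
//   }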
639 void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
640 ptrdiff_t src_stride,
641 uint8_t* dst_ptr,
642 int dst_width) {
643 (void)src_stride;
644 asm volatile(
645 "movdqa %3,%%xmm4 \n"
646 "movdqa %4,%%xmm5 \n"
647
648 LABELALIGN
649 "1: \n"
650 "movdqu (%0),%%xmm0 \n"
651 "movdqu 0x10(%0),%%xmm1 \n"
652 "lea 0x20(%0),%0 \n"
653 "pshufb %%xmm4,%%xmm0 \n"
654 "pshufb %%xmm5,%%xmm1 \n"
655 "paddusb %%xmm1,%%xmm0 \n"
656 "movq %%xmm0,(%1) \n"
657 "movhlps %%xmm0,%%xmm1 \n"
658 "movd %%xmm1,0x8(%1) \n"
659 "lea 0xc(%1),%1 \n"
660 "sub $0xc,%2 \n"
661 "jg 1b \n"
662 : "+r"(src_ptr), // %0
663 "+r"(dst_ptr), // %1
664 "+r"(dst_width) // %2
665 : "m"(kShuf38a), // %3
666 "m"(kShuf38b) // %4
667 : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
668 }
669
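// ScaleRowDown38_2_Box averages the two source rows with pavgb, sums groups
// of 3 columns (2 for the last output of each group of 8) via the kShufAb*
// shuffles, and divides by the column count with pmulhuw kScaleAb2.
// Roughly (reference only):
//   row[i] = (src_ptr[i] + src_ptr[i + src_stride] + 1) >> 1;
//   dst_ptr[0] ~= (row[0] + row[1] + row[2]) / 3;
//   dst_ptr[1] ~= (row[3] + row[4] + row[5]) / 3;
//   dst_ptr[2] ~= (row[6] + row[7]) / 2;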
670 void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
671 ptrdiff_t src_stride,
672 uint8_t* dst_ptr,
673 int dst_width) {
674 asm volatile(
675 "movdqa %0,%%xmm2 \n"
676 "movdqa %1,%%xmm3 \n"
677 "movdqa %2,%%xmm4 \n"
678 "movdqa %3,%%xmm5 \n"
679 :
680 : "m"(kShufAb0), // %0
681 "m"(kShufAb1), // %1
682 "m"(kShufAb2), // %2
683 "m"(kScaleAb2) // %3
684 );
685 asm volatile(LABELALIGN
686 "1: \n"
687 "movdqu (%0),%%xmm0 \n"
688 "movdqu 0x00(%0,%3,1),%%xmm1 \n"
689 "lea 0x10(%0),%0 \n"
690 "pavgb %%xmm1,%%xmm0 \n"
691 "movdqa %%xmm0,%%xmm1 \n"
692 "pshufb %%xmm2,%%xmm1 \n"
693 "movdqa %%xmm0,%%xmm6 \n"
694 "pshufb %%xmm3,%%xmm6 \n"
695 "paddusw %%xmm6,%%xmm1 \n"
696 "pshufb %%xmm4,%%xmm0 \n"
697 "paddusw %%xmm0,%%xmm1 \n"
698 "pmulhuw %%xmm5,%%xmm1 \n"
699 "packuswb %%xmm1,%%xmm1 \n"
700 "movd %%xmm1,(%1) \n"
701 "psrlq $0x10,%%xmm1 \n"
702 "movd %%xmm1,0x2(%1) \n"
703 "lea 0x6(%1),%1 \n"
704 "sub $0x6,%2 \n"
705 "jg 1b \n"
706 : "+r"(src_ptr), // %0
707 "+r"(dst_ptr), // %1
708 "+r"(dst_width) // %2
709 : "r"((intptr_t)(src_stride)) // %3
710 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
711 "xmm6");
712 }
713
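// ScaleRowDown38_3_Box forms 16-bit column sums over three source rows, adds
// groups of 3 columns (2 for the last output of each group of 8), and scales
// by kScaleAc33 with pmulhuw. Roughly (reference only):
//   dst_ptr[0] ~= (sum of the first 3x3 box) / 9;
//   dst_ptr[1] ~= (sum of the next 3x3 box) / 9;
//   dst_ptr[2] ~= (sum of the final 2x3 box) / 6;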
714 void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
715 ptrdiff_t src_stride,
716 uint8_t* dst_ptr,
717 int dst_width) {
718 asm volatile(
719 "movdqa %0,%%xmm2 \n"
720 "movdqa %1,%%xmm3 \n"
721 "movdqa %2,%%xmm4 \n"
722 "pxor %%xmm5,%%xmm5 \n"
723 :
724 : "m"(kShufAc), // %0
725 "m"(kShufAc3), // %1
726 "m"(kScaleAc33) // %2
727 );
728 asm volatile(LABELALIGN
729 "1: \n"
730 "movdqu (%0),%%xmm0 \n"
731 "movdqu 0x00(%0,%3,1),%%xmm6 \n"
732 "movhlps %%xmm0,%%xmm1 \n"
733 "movhlps %%xmm6,%%xmm7 \n"
734 "punpcklbw %%xmm5,%%xmm0 \n"
735 "punpcklbw %%xmm5,%%xmm1 \n"
736 "punpcklbw %%xmm5,%%xmm6 \n"
737 "punpcklbw %%xmm5,%%xmm7 \n"
738 "paddusw %%xmm6,%%xmm0 \n"
739 "paddusw %%xmm7,%%xmm1 \n"
740 "movdqu 0x00(%0,%3,2),%%xmm6 \n"
741 "lea 0x10(%0),%0 \n"
742 "movhlps %%xmm6,%%xmm7 \n"
743 "punpcklbw %%xmm5,%%xmm6 \n"
744 "punpcklbw %%xmm5,%%xmm7 \n"
745 "paddusw %%xmm6,%%xmm0 \n"
746 "paddusw %%xmm7,%%xmm1 \n"
747 "movdqa %%xmm0,%%xmm6 \n"
748 "psrldq $0x2,%%xmm0 \n"
749 "paddusw %%xmm0,%%xmm6 \n"
750 "psrldq $0x2,%%xmm0 \n"
751 "paddusw %%xmm0,%%xmm6 \n"
752 "pshufb %%xmm2,%%xmm6 \n"
753 "movdqa %%xmm1,%%xmm7 \n"
754 "psrldq $0x2,%%xmm1 \n"
755 "paddusw %%xmm1,%%xmm7 \n"
756 "psrldq $0x2,%%xmm1 \n"
757 "paddusw %%xmm1,%%xmm7 \n"
758 "pshufb %%xmm3,%%xmm7 \n"
759 "paddusw %%xmm7,%%xmm6 \n"
760 "pmulhuw %%xmm4,%%xmm6 \n"
761 "packuswb %%xmm6,%%xmm6 \n"
762 "movd %%xmm6,(%1) \n"
763 "psrlq $0x10,%%xmm6 \n"
764 "movd %%xmm6,0x2(%1) \n"
765 "lea 0x6(%1),%1 \n"
766 "sub $0x6,%2 \n"
767 "jg 1b \n"
768 : "+r"(src_ptr), // %0
769 "+r"(dst_ptr), // %1
770 "+r"(dst_width) // %2
771 : "r"((intptr_t)(src_stride)) // %3
772 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
773 "xmm6", "xmm7");
774 }
775
776 static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
777 10, 11, 8, 9, 14, 15, 12, 13};
778
779 static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
780 3, 1, 1, 3, 3, 1, 1, 3};
781
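// The 2x upsamplers below interpolate at 1/4 and 3/4 between source pixels,
// so each output pair is (3*near + far + 2) >> 2. Rough C sketch of the
// single-row (linear) case, for reference only:
//   for (int x = 0; x < dst_width / 2; ++x) {
//     dst_ptr[2 * x + 0] = (3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2;
//     dst_ptr[2 * x + 1] = (src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2;
//   }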
782 #ifdef HAS_SCALEROWUP2_LINEAR_SSE2
783 void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
784 uint8_t* dst_ptr,
785 int dst_width) {
786 asm volatile(
787 "pxor %%xmm0,%%xmm0 \n" // 0
788 "pcmpeqw %%xmm6,%%xmm6 \n"
789 "psrlw $15,%%xmm6 \n"
790 "psllw $1,%%xmm6 \n" // all 2
791
792 LABELALIGN
793 "1: \n"
794 "movq (%0),%%xmm1 \n" // 01234567
795 "movq 1(%0),%%xmm2 \n" // 12345678
796 "movdqa %%xmm1,%%xmm3 \n"
797 "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
798 "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
799 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
800 "movdqa %%xmm1,%%xmm4 \n"
801 "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
802 "movdqa %%xmm2,%%xmm5 \n"
803 "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
804 "paddw %%xmm5,%%xmm4 \n"
805 "movdqa %%xmm3,%%xmm5 \n"
806 "paddw %%xmm6,%%xmm4 \n"
807 "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
808 "paddw %%xmm5,%%xmm5 \n"
809 "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
810 "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
811
812 "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
813 "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
814 "paddw %%xmm2,%%xmm1 \n"
815 "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
816 "paddw %%xmm6,%%xmm1 \n"
817 "paddw %%xmm3,%%xmm3 \n"
818 "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
819 "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
820
821 "packuswb %%xmm1,%%xmm5 \n"
822 "movdqu %%xmm5,(%1) \n"
823
824 "lea 0x8(%0),%0 \n"
825 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
826 "sub $0x10,%2 \n"
827 "jg 1b \n"
828 : "+r"(src_ptr), // %0
829 "+r"(dst_ptr), // %1
830 "+r"(dst_width) // %2
831 :
832 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
833 }
834 #endif
835
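// The bilinear (two-row) variants combine the horizontal (3,1) filter of two
// rows with vertical weights 3:1, giving a 9/3/3/1 kernel with +8 rounding.
// Roughly, with s = this row and t = the next row (reference only):
//   above[2 * x] = (9 * s[x] + 3 * s[x + 1] + 3 * t[x] + 1 * t[x + 1] + 8) >> 4;
//   below[2 * x] = (3 * s[x] + 1 * s[x + 1] + 9 * t[x] + 3 * t[x + 1] + 8) >> 4;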
836 #ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
837 void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
838 ptrdiff_t src_stride,
839 uint8_t* dst_ptr,
840 ptrdiff_t dst_stride,
841 int dst_width) {
842 asm volatile(
843 LABELALIGN
844 "1: \n"
845 "pxor %%xmm0,%%xmm0 \n" // 0
846 // above line
847 "movq (%0),%%xmm1 \n" // 01234567
848 "movq 1(%0),%%xmm2 \n" // 12345678
849 "movdqa %%xmm1,%%xmm3 \n"
850 "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
851 "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
852 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
853
854 "movdqa %%xmm1,%%xmm4 \n"
855 "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
856 "movdqa %%xmm2,%%xmm5 \n"
857 "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
858 "paddw %%xmm5,%%xmm4 \n" // near+far
859 "movdqa %%xmm3,%%xmm5 \n"
860 "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
861 "paddw %%xmm5,%%xmm5 \n" // 2*near
862 "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
863
864 "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
865 "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
866 "paddw %%xmm2,%%xmm1 \n"
867 "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
868 "paddw %%xmm3,%%xmm3 \n" // 2*near
869 "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
870
871 // below line
872 "movq (%0,%3),%%xmm6 \n" // 01234567
873 "movq 1(%0,%3),%%xmm2 \n" // 12345678
874 "movdqa %%xmm6,%%xmm3 \n"
875 "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
876 "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
877 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
878
879 "movdqa %%xmm6,%%xmm5 \n"
880 "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
881 "movdqa %%xmm2,%%xmm7 \n"
882 "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
883 "paddw %%xmm7,%%xmm5 \n" // near+far
884 "movdqa %%xmm3,%%xmm7 \n"
885 "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
886 "paddw %%xmm7,%%xmm7 \n" // 2*near
887 "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
888
889 "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
890 "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
891 "paddw %%xmm6,%%xmm2 \n" // near+far
892 "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
893 "paddw %%xmm3,%%xmm3 \n" // 2*near
894 "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
895
896 // xmm4 xmm1
897 // xmm5 xmm2
898 "pcmpeqw %%xmm0,%%xmm0 \n"
899 "psrlw $15,%%xmm0 \n"
900 "psllw $3,%%xmm0 \n" // all 8
901
902 "movdqa %%xmm4,%%xmm3 \n"
903 "movdqa %%xmm5,%%xmm6 \n"
904 "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo)
905 "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
906 "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
907 "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
908 "psrlw $4,%%xmm3 \n" // ^ div by 16
909
910 "movdqa %%xmm1,%%xmm7 \n"
911 "movdqa %%xmm2,%%xmm6 \n"
912 "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi)
913 "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
914 "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
915 "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
916 "psrlw $4,%%xmm7 \n" // ^ div by 16
917
918 "packuswb %%xmm7,%%xmm3 \n"
919 "movdqu %%xmm3,(%1) \n" // save above line
920
921 "movdqa %%xmm5,%%xmm3 \n"
922 "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
923 "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo)
924 "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
925 "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
926 "psrlw $4,%%xmm5 \n" // ^ div by 16
927
928 "movdqa %%xmm2,%%xmm3 \n"
929 "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
930 "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi)
931 "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
932 "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
933 "psrlw $4,%%xmm2 \n" // ^ div by 16
934
935 "packuswb %%xmm2,%%xmm5 \n"
936 "movdqu %%xmm5,(%1,%4) \n" // save below line
937
938 "lea 0x8(%0),%0 \n"
939 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
940 "sub $0x10,%2 \n"
941 "jg 1b \n"
942 : "+r"(src_ptr), // %0
943 "+r"(dst_ptr), // %1
944 "+r"(dst_width) // %2
945 : "r"((intptr_t)(src_stride)), // %3
946 "r"((intptr_t)(dst_stride)) // %4
947 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
948 "xmm7");
949 }
950 #endif
951
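// The _12_ and _16_ variants below apply the same (3,1) and 9/3/3/1 kernels
// to uint16_t samples; 12-bit data fits in 16-bit words, while the 16-bit
// path widens to 32-bit intermediates before shifting. Rough sketch of one
// 16-bit linear output pair (reference only):
//   uint32_t near = src_ptr[x], far = src_ptr[x + 1];
//   dst_ptr[2 * x + 0] = (uint16_t)((3 * near + far + 2) >> 2);
//   dst_ptr[2 * x + 1] = (uint16_t)((near + 3 * far + 2) >> 2);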
952 #ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
953 void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
954 uint16_t* dst_ptr,
955 int dst_width) {
956 asm volatile(
957 "movdqa %3,%%xmm5 \n"
958 "pcmpeqw %%xmm4,%%xmm4 \n"
959 "psrlw $15,%%xmm4 \n"
960 "psllw $1,%%xmm4 \n" // all 2
961
962 LABELALIGN
963 "1: \n"
964 "movdqu (%0),%%xmm0 \n" // 01234567 (16)
965 "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
966
967 "movdqa %%xmm0,%%xmm2 \n"
968 "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
969 "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
970
971 "movdqa %%xmm2,%%xmm3 \n"
972 "movdqa %%xmm0,%%xmm1 \n"
973 "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far)
974 "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far)
975
976 "paddw %%xmm4,%%xmm1 \n" // far+2
977 "paddw %%xmm4,%%xmm3 \n" // far+2
978 "paddw %%xmm0,%%xmm1 \n" // near+far+2
979 "paddw %%xmm2,%%xmm3 \n" // near+far+2
980 "paddw %%xmm0,%%xmm0 \n" // 2*near
981 "paddw %%xmm2,%%xmm2 \n" // 2*near
982 "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo)
983 "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi)
984
985 "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far
986 "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far
987 "movdqu %%xmm0,(%1) \n"
988 "movdqu %%xmm2,16(%1) \n"
989
990 "lea 0x10(%0),%0 \n"
991 "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
992 "sub $0x10,%2 \n"
993 "jg 1b \n"
994 : "+r"(src_ptr), // %0
995 "+r"(dst_ptr), // %1
996 "+r"(dst_width) // %2
997 : "m"(kLinearShuffleFar) // %3
998 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
999 }
1000 #endif
1001
1002 #ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
1003 void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
1004 ptrdiff_t src_stride,
1005 uint16_t* dst_ptr,
1006 ptrdiff_t dst_stride,
1007 int dst_width) {
1008 asm volatile(
1009 "pcmpeqw %%xmm7,%%xmm7 \n"
1010 "psrlw $15,%%xmm7 \n"
1011 "psllw $3,%%xmm7 \n" // all 8
1012 "movdqa %5,%%xmm6 \n"
1013
1014 LABELALIGN
1015 "1: \n"
1016 // above line
1017 "movdqu (%0),%%xmm0 \n" // 01234567 (16)
1018 "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
1019 "movdqa %%xmm0,%%xmm2 \n"
1020 "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
1021 "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
1022 "movdqa %%xmm2,%%xmm3 \n"
1023 "movdqa %%xmm0,%%xmm1 \n"
1024 "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far)
1025 "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far)
1026 "paddw %%xmm0,%%xmm1 \n" // near+far
1027 "paddw %%xmm2,%%xmm3 \n" // near+far
1028 "paddw %%xmm0,%%xmm0 \n" // 2*near
1029 "paddw %%xmm2,%%xmm2 \n" // 2*near
1030 "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo)
1031 "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi)
1032
1033 // below line
1034 "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16)
1035 "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16)
1036 "movdqa %%xmm1,%%xmm3 \n"
1037 "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16)
1038 "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16)
1039 "movdqa %%xmm3,%%xmm5 \n"
1040 "movdqa %%xmm1,%%xmm4 \n"
1041 "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far)
1042 "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far)
1043 "paddw %%xmm1,%%xmm4 \n" // near+far
1044 "paddw %%xmm3,%%xmm5 \n" // near+far
1045 "paddw %%xmm1,%%xmm1 \n" // 2*near
1046 "paddw %%xmm3,%%xmm3 \n" // 2*near
1047 "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo)
1048 "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
1049
1050 // xmm0 xmm2
1051 // xmm1 xmm3
1052
1053 "movdqa %%xmm0,%%xmm4 \n"
1054 "movdqa %%xmm1,%%xmm5 \n"
1055 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo)
1056 "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo)
1057 "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
1058 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
1059 "psrlw $4,%%xmm4 \n" // ^ div by 16
1060 "movdqu %%xmm4,(%1) \n"
1061
1062 "movdqa %%xmm2,%%xmm4 \n"
1063 "movdqa %%xmm3,%%xmm5 \n"
1064 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi)
1065 "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi)
1066 "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi)
1067 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi)
1068 "psrlw $4,%%xmm4 \n" // ^ div by 16
1069 "movdqu %%xmm4,0x10(%1) \n"
1070
1071 "movdqa %%xmm1,%%xmm4 \n"
1072 "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo)
1073 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo)
1074 "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo)
1075 "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo)
1076 "psrlw $4,%%xmm1 \n" // ^ div by 16
1077 "movdqu %%xmm1,(%1,%4,2) \n"
1078
1079 "movdqa %%xmm3,%%xmm4 \n"
1080 "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi)
1081 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi)
1082 "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi)
1083 "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi)
1084 "psrlw $4,%%xmm3 \n" // ^ div by 16
1085 "movdqu %%xmm3,0x10(%1,%4,2) \n"
1086
1087 "lea 0x10(%0),%0 \n"
1088 "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
1089 "sub $0x10,%2 \n"
1090 "jg 1b \n"
1091 : "+r"(src_ptr), // %0
1092 "+r"(dst_ptr), // %1
1093 "+r"(dst_width) // %2
1094 : "r"((intptr_t)(src_stride)), // %3
1095 "r"((intptr_t)(dst_stride)), // %4
1096 "m"(kLinearShuffleFar) // %5
1097 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1098 "xmm7");
1099 }
1100 #endif
1101
1102 #ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
1103 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
1104 uint16_t* dst_ptr,
1105 int dst_width) {
1106 asm volatile(
1107 "pxor %%xmm5,%%xmm5 \n"
1108 "pcmpeqd %%xmm4,%%xmm4 \n"
1109 "psrld $31,%%xmm4 \n"
1110 "pslld $1,%%xmm4 \n" // all 2
1111
1112 LABELALIGN
1113 "1: \n"
1114 "movq (%0),%%xmm0 \n" // 0123 (16b)
1115 "movq 2(%0),%%xmm1 \n" // 1234 (16b)
1116
1117 "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b)
1118 "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b)
1119
1120 "movdqa %%xmm0,%%xmm2 \n"
1121 "movdqa %%xmm1,%%xmm3 \n"
1122
1123 "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
1124 "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
1125
1126 "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
1127 "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
1128 "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
1129 "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
1130 "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
1131 "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
1132 "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
1133 "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
1134
1135 "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
1136 "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
1137 "packssdw %%xmm1,%%xmm0 \n"
1138 "pshufd $0b11011000,%%xmm0,%%xmm0 \n"
1139 "movdqu %%xmm0,(%1) \n"
1140
1141 "lea 0x8(%0),%0 \n"
1142 "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
1143 "sub $0x8,%2 \n"
1144 "jg 1b \n"
1145 : "+r"(src_ptr), // %0
1146 "+r"(dst_ptr), // %1
1147 "+r"(dst_width) // %2
1148 :
1149 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1150 }
1151 #endif
1152
1153 #ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
1154 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1155 ptrdiff_t src_stride,
1156 uint16_t* dst_ptr,
1157 ptrdiff_t dst_stride,
1158 int dst_width) {
1159 asm volatile(
1160 "pxor %%xmm7,%%xmm7 \n"
1161 "pcmpeqd %%xmm6,%%xmm6 \n"
1162 "psrld $31,%%xmm6 \n"
1163 "pslld $3,%%xmm6 \n" // all 8
1164
1165 LABELALIGN
1166 "1: \n"
1167 "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
1168 "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
1169 "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
1170 "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
1171 "movdqa %%xmm0,%%xmm2 \n"
1172 "movdqa %%xmm1,%%xmm3 \n"
1173 "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
1174 "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
1175 "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
1176 "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
1177 "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
1178 "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
1179 "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
1180 "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
1181
1182 "movq (%0),%%xmm0 \n" // 0123 (16b)
1183 "movq 2(%0),%%xmm1 \n" // 1234 (16b)
1184 "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b)
1185 "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b)
1186 "movdqa %%xmm0,%%xmm2 \n"
1187 "movdqa %%xmm1,%%xmm3 \n"
1188 "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
1189 "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
1190 "paddd %%xmm0,%%xmm2 \n" // near+far (lo)
1191 "paddd %%xmm1,%%xmm3 \n" // near+far (hi)
1192 "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
1193 "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
1194 "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
1195 "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
1196
1197 "movq (%0,%3,2),%%xmm2 \n"
1198 "movq 2(%0,%3,2),%%xmm3 \n"
1199 "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b)
1200 "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b)
1201 "movdqa %%xmm2,%%xmm4 \n"
1202 "movdqa %%xmm3,%%xmm5 \n"
1203 "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far)
1204 "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far)
1205 "paddd %%xmm2,%%xmm4 \n" // near+far (lo)
1206 "paddd %%xmm3,%%xmm5 \n" // near+far (hi)
1207 "paddd %%xmm2,%%xmm2 \n" // 2*near (lo)
1208 "paddd %%xmm3,%%xmm3 \n" // 2*near (hi)
1209 "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
1210 "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
1211
1212 "movdqa %%xmm0,%%xmm4 \n"
1213 "movdqa %%xmm2,%%xmm5 \n"
1214 "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
1215 "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
1216 "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
1217 "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
1218 "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
1219
1220 "movdqa %%xmm2,%%xmm5 \n"
1221 "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
1222 "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
1223 "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
1224 "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
1225 "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
1226
1227 "movdqa %%xmm1,%%xmm0 \n"
1228 "movdqa %%xmm3,%%xmm2 \n"
1229 "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
1230 "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
1231 "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
1232 "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
1233 "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
1234
1235 "movdqa %%xmm3,%%xmm2 \n"
1236 "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
1237 "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
1238 "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
1239 "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
1240 "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
1241
1242 "packssdw %%xmm0,%%xmm4 \n"
1243 "pshufd $0b11011000,%%xmm4,%%xmm4 \n"
1244 "movdqu %%xmm4,(%1) \n" // store above
1245 "packssdw %%xmm2,%%xmm5 \n"
1246 "pshufd $0b11011000,%%xmm5,%%xmm5 \n"
1247 "movdqu %%xmm5,(%1,%4,2) \n" // store below
1248
1249 "lea 0x8(%0),%0 \n"
1250 "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
1251 "sub $0x8,%2 \n"
1252 "jg 1b \n"
1253 : "+r"(src_ptr), // %0
1254 "+r"(dst_ptr), // %1
1255 "+r"(dst_width) // %2
1256 : "r"((intptr_t)(src_stride)), // %3
1257 "r"((intptr_t)(dst_stride)) // %4
1258 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1259 "xmm7");
1260 }
1261 #endif
1262
1263 #ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
1264 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
1265 uint8_t* dst_ptr,
1266 int dst_width) {
1267 asm volatile(
1268 "pcmpeqw %%xmm4,%%xmm4 \n"
1269 "psrlw $15,%%xmm4 \n"
1270 "psllw $1,%%xmm4 \n" // all 2
1271 "movdqa %3,%%xmm3 \n"
1272
1273 LABELALIGN
1274 "1: \n"
1275 "movq (%0),%%xmm0 \n" // 01234567
1276 "movq 1(%0),%%xmm1 \n" // 12345678
1277 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
1278 "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
1279 "movdqa %%xmm0,%%xmm2 \n"
1280 "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
1281 "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
1282 "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
1283 "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
1284 "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
1285 "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
1286 "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
1287 "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
1288 "packuswb %%xmm2,%%xmm0 \n"
1289 "movdqu %%xmm0,(%1) \n"
1290 "lea 0x8(%0),%0 \n"
1291 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
1292 "sub $0x10,%2 \n"
1293 "jg 1b \n"
1294 : "+r"(src_ptr), // %0
1295 "+r"(dst_ptr), // %1
1296 "+r"(dst_width) // %2
1297 : "m"(kLinearMadd31) // %3
1298 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1299 }
1300 #endif
1301
1302 #ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
1303 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
1304 ptrdiff_t src_stride,
1305 uint8_t* dst_ptr,
1306 ptrdiff_t dst_stride,
1307 int dst_width) {
1308 asm volatile(
1309 "pcmpeqw %%xmm6,%%xmm6 \n"
1310 "psrlw $15,%%xmm6 \n"
1311 "psllw $3,%%xmm6 \n" // all 8
1312 "movdqa %5,%%xmm7 \n"
1313
1314 LABELALIGN
1315 "1: \n"
1316 "movq (%0),%%xmm0 \n" // 01234567
1317 "movq 1(%0),%%xmm1 \n" // 12345678
1318 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
1319 "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
1320 "movdqa %%xmm0,%%xmm2 \n"
1321 "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
1322 "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
1323 "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
1324 "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
1325
1326 "movq (%0,%3),%%xmm1 \n"
1327 "movq 1(%0,%3),%%xmm4 \n"
1328 "punpcklwd %%xmm1,%%xmm1 \n"
1329 "punpcklwd %%xmm4,%%xmm4 \n"
1330 "movdqa %%xmm1,%%xmm3 \n"
1331 "punpckhdq %%xmm4,%%xmm3 \n"
1332 "punpckldq %%xmm4,%%xmm1 \n"
1333 "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
1334 "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
1335
1336 // xmm0 xmm2
1337 // xmm1 xmm3
1338
1339 "movdqa %%xmm0,%%xmm4 \n"
1340 "movdqa %%xmm1,%%xmm5 \n"
1341 "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
1342 "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
1343 "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
1344 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
1345 "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
1346
1347 "movdqa %%xmm1,%%xmm5 \n"
1348 "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
1349 "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
1350 "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
1351 "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
1352 "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
1353
1354 "movdqa %%xmm2,%%xmm0 \n"
1355 "movdqa %%xmm3,%%xmm1 \n"
1356 "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
1357 "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
1358 "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
1359 "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
1360 "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
1361
1362 "movdqa %%xmm3,%%xmm1 \n"
1363 "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
1364 "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
1365 "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
1366 "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
1367 "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
1368
1369 "packuswb %%xmm0,%%xmm4 \n"
1370 "movdqu %%xmm4,(%1) \n" // store above
1371 "packuswb %%xmm1,%%xmm5 \n"
1372 "movdqu %%xmm5,(%1,%4) \n" // store below
1373
1374 "lea 0x8(%0),%0 \n"
1375 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
1376 "sub $0x10,%2 \n"
1377 "jg 1b \n"
1378 : "+r"(src_ptr), // %0
1379 "+r"(dst_ptr), // %1
1380 "+r"(dst_width) // %2
1381 : "r"((intptr_t)(src_stride)), // %3
1382 "r"((intptr_t)(dst_stride)), // %4
1383 "m"(kLinearMadd31) // %5
1384 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1385 "xmm7");
1386 }
1387 #endif
1388
1389 #ifdef HAS_SCALEROWUP2_LINEAR_AVX2
1390 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
1391 uint8_t* dst_ptr,
1392 int dst_width) {
1393 asm volatile(
1394 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
1395 "vpsrlw $15,%%ymm4,%%ymm4 \n"
1396 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
1397 "vbroadcastf128 %3,%%ymm3 \n"
1398
1399 LABELALIGN
1400 "1: \n"
1401 "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
1402 "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
1403 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
1404 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
1405 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1406 "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
1407 "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
1408 "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
1409 "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
1410 "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
1411 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
1412 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
1413 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
1414 "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
1415 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
1416 "vmovdqu %%ymm0,(%1) \n"
1417
1418 "lea 0x10(%0),%0 \n"
1419 "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
1420 "sub $0x20,%2 \n"
1421 "jg 1b \n"
1422 "vzeroupper \n"
1423 : "+r"(src_ptr), // %0
1424 "+r"(dst_ptr), // %1
1425 "+r"(dst_width) // %2
1426 : "m"(kLinearMadd31) // %3
1427 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1428 }
1429 #endif
1430
1431 #ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
1432 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
1433 ptrdiff_t src_stride,
1434 uint8_t* dst_ptr,
1435 ptrdiff_t dst_stride,
1436 int dst_width) {
1437 asm volatile(
1438 "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
1439 "vpsrlw $15,%%ymm6,%%ymm6 \n"
1440 "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
1441 "vbroadcastf128 %5,%%ymm7 \n"
1442
1443 LABELALIGN
1444 "1: \n"
1445 "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
1446 "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
1447 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
1448 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
1449 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1450 "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
1451 "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
1452 "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
1453 "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
1454 "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
1455
1456 "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
1457 "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
1458 "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
1459 "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
1460 "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
1461 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
1462 "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
1463 "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
1464 "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
1465 "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
1466
1467 // ymm0 ymm1
1468 // ymm2 ymm3
1469
1470 "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
1471 "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
1472 "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
1473 "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
1474 "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
1475
1476 "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
1477 "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
1478 "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
1479 "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
1480 "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
1481
1482 "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
1483 "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
1484 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
1485 "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
1486 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
1487
1488 "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
1489 "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
1490 "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
1491 "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
1492 "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
1493
1494 "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
1495 "vmovdqu %%ymm4,(%1) \n" // store above
1496 "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
1497 "vmovdqu %%ymm5,(%1,%4) \n" // store below
1498
1499 "lea 0x10(%0),%0 \n"
1500 "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
1501 "sub $0x20,%2 \n"
1502 "jg 1b \n"
1503 "vzeroupper \n"
1504 : "+r"(src_ptr), // %0
1505 "+r"(dst_ptr), // %1
1506 "+r"(dst_width) // %2
1507 : "r"((intptr_t)(src_stride)), // %3
1508 "r"((intptr_t)(dst_stride)), // %4
1509 "m"(kLinearMadd31) // %5
1510 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1511 "xmm7");
1512 }
1513 #endif
1514
1515 #ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
1516 void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
1517 uint16_t* dst_ptr,
1518 int dst_width) {
1519 asm volatile(
1520 "vbroadcastf128 %3,%%ymm5 \n"
1521 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
1522 "vpsrlw $15,%%ymm4,%%ymm4 \n"
1523 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
1524
1525 LABELALIGN
1526 "1: \n"
1527 "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
1528 "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)
1529
1530 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF
1531 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0
1532
1533 "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near)
1534 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
1535 "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far)
1536 "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
1537
1538 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2
1539 "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2
1540 "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2
1541 "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2
1542 "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
1543 "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near
1544 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2
1545 "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2
1546
1547 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
1548 "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far
1549 "vmovdqu %%ymm0,(%1) \n"
1550 "vmovdqu %%ymm2,32(%1) \n"
1551
1552 "lea 0x20(%0),%0 \n"
1553 "lea 0x40(%1),%1 \n" // 16 sample to 32 sample
1554 "sub $0x20,%2 \n"
1555 "jg 1b \n"
1556 "vzeroupper \n"
1557 : "+r"(src_ptr), // %0
1558 "+r"(dst_ptr), // %1
1559 "+r"(dst_width) // %2
1560 : "m"(kLinearShuffleFar) // %3
1561 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1562 }
1563 #endif
1564
1565 #ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
1566 void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
1567 ptrdiff_t src_stride,
1568 uint16_t* dst_ptr,
1569 ptrdiff_t dst_stride,
1570 int dst_width) {
1571 asm volatile(
1572 "vbroadcastf128 %5,%%ymm5 \n"
1573 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
1574 "vpsrlw $15,%%ymm4,%%ymm4 \n"
1575 "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
1576
1577 LABELALIGN
1578 "1: \n"
1579
1580 "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
1581 "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
1582 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
1583 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
1584 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
1585 "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
1586 "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
1587 "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
1588 "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1)
1589
1590 "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
1591 "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
1592 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
1593 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
1594 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
1595 "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
1596 "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
1597 "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
1598 "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2)
1599
1600 "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
1601 "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
1602 "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
1603 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
1604 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
1605 "vmovdqu %%ymm0,(%1) \n" // store above
1606
1607 "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
1608 "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
1609 "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
1610 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
1611 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
1612 "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
1613
1614 "lea 0x10(%0),%0 \n"
1615 "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
1616 "sub $0x10,%2 \n"
1617 "jg 1b \n"
1618 "vzeroupper \n"
1619 : "+r"(src_ptr), // %0
1620 "+r"(dst_ptr), // %1
1621 "+r"(dst_width) // %2
1622 : "r"((intptr_t)(src_stride)), // %3
1623 "r"((intptr_t)(dst_stride)), // %4
1624 "m"(kLinearShuffleFar) // %5
1625 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1626 }
1627 #endif
1628
1629 #ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
1630 void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
1631 uint16_t* dst_ptr,
1632 int dst_width) {
1633 asm volatile(
1634 "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
1635 "vpsrld $31,%%ymm4,%%ymm4 \n"
1636 "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
1637
1638 LABELALIGN
1639 "1: \n"
1640 "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
1641 "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
1642
1643 "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
1644 "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
1645
1646 "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
1647 "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
1648
1649 "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
1650 "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
1651 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
1652 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
1653 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
1654 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
1655 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
1656 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
1657
1658 "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
1659 "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
1660 "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
1661 "vpshufd $0b11011000,%%ymm0,%%ymm0 \n"
1662 "vmovdqu %%ymm0,(%1) \n"
1663
1664 "lea 0x10(%0),%0 \n"
1665 "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
1666 "sub $0x10,%2 \n"
1667 "jg 1b \n"
1668 "vzeroupper \n"
1669 : "+r"(src_ptr), // %0
1670 "+r"(dst_ptr), // %1
1671 "+r"(dst_width) // %2
1672 :
1673 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1674 }
1675 #endif
1676
1677 #ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
1678 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
1679 ptrdiff_t src_stride,
1680 uint16_t* dst_ptr,
1681 ptrdiff_t dst_stride,
1682 int dst_width) {
1683 asm volatile(
1684 "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
1685 "vpsrld $31,%%ymm6,%%ymm6 \n"
1686 "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
1687
1688 LABELALIGN
1689 "1: \n"
1690
1691 "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
1692 "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
1693 "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
1694 "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
1695 "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
1696 "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
1697 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
1698 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
1699 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
1700 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
1701 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo)
1702 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi)
1703
1704 "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v)
1705 "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v)
1706 "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
1707 "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
1708 "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far)
1709 "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far)
1710 "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
1711 "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
1712 "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
1713 "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
1714 "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo)
1715 "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi)
1716
1717 "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
1718 "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
1719 "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
1720 "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
1721 "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
1722
1723 "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
1724 "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
1725 "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
1726 "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
1727 "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
1728
1729 "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
1730 "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
1731 "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
1732 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
1733 "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
1734
1735 "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
1736 "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
1737 "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
1738 "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
1739 "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
1740
1741 "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
1742 "vpshufd $0b11011000,%%ymm4,%%ymm4 \n"
1743 "vmovdqu %%ymm4,(%1) \n" // store above
1744 "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
1745 "vpshufd $0b11011000,%%ymm5,%%ymm5 \n"
1746 "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
1747
1748 "lea 0x10(%0),%0 \n"
1749 "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
1750 "sub $0x10,%2 \n"
1751 "jg 1b \n"
1752 "vzeroupper \n"
1753 : "+r"(src_ptr), // %0
1754 "+r"(dst_ptr), // %1
1755 "+r"(dst_width) // %2
1756 : "r"((intptr_t)(src_stride)), // %3
1757 "r"((intptr_t)(dst_stride)) // %4
1758 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1759 }
1760 #endif
1761
1762 // Reads 16xN bytes and produces 16 shorts at a time.
1763 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
1764 uint16_t* dst_ptr,
1765 int src_width) {
1766 asm volatile("pxor %%xmm5,%%xmm5 \n"
1767
1768 // 16 pixel loop.
1769 LABELALIGN
1770 "1: \n"
1771 "movdqu (%0),%%xmm3 \n"
1772 "lea 0x10(%0),%0 \n" // src_ptr += 16
1773 "movdqu (%1),%%xmm0 \n"
1774 "movdqu 0x10(%1),%%xmm1 \n"
1775 "movdqa %%xmm3,%%xmm2 \n"
1776 "punpcklbw %%xmm5,%%xmm2 \n"
1777 "punpckhbw %%xmm5,%%xmm3 \n"
1778 "paddusw %%xmm2,%%xmm0 \n"
1779 "paddusw %%xmm3,%%xmm1 \n"
1780 "movdqu %%xmm0,(%1) \n"
1781 "movdqu %%xmm1,0x10(%1) \n"
1782 "lea 0x20(%1),%1 \n"
1783 "sub $0x10,%2 \n"
1784 "jg 1b \n"
1785 : "+r"(src_ptr), // %0
1786 "+r"(dst_ptr), // %1
1787 "+r"(src_width) // %2
1788 :
1789 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1790 }
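// A scalar view of the accumulation above: each source byte is widened and
// added into the running 16-bit sum. The SIMD loop uses paddusw (saturating),
// which this sketch does not model; illustrative only, not part of the build:
#if 0
static void ScaleAddRowSketch(const uint8_t* src, uint16_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = (uint16_t)(dst[i] + src[i]);
  }
}
#endif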
1791
1792 #ifdef HAS_SCALEADDROW_AVX2
1793 // Reads 32 bytes and accumulates to 32 shorts at a time.
1794 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
1795 uint16_t* dst_ptr,
1796 int src_width) {
1797 asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
1798
1799 LABELALIGN
1800 "1: \n"
1801 "vmovdqu (%0),%%ymm3 \n"
1802 "lea 0x20(%0),%0 \n" // src_ptr += 32
1803 "vpermq $0xd8,%%ymm3,%%ymm3 \n"
1804 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
1805 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
1806 "vpaddusw (%1),%%ymm2,%%ymm0 \n"
1807 "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
1808 "vmovdqu %%ymm0,(%1) \n"
1809 "vmovdqu %%ymm1,0x20(%1) \n"
1810 "lea 0x40(%1),%1 \n"
1811 "sub $0x20,%2 \n"
1812 "jg 1b \n"
1813 "vzeroupper \n"
1814 : "+r"(src_ptr), // %0
1815 "+r"(dst_ptr), // %1
1816 "+r"(src_width) // %2
1817 :
1818 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1819 }
1820 #endif // HAS_SCALEADDROW_AVX2
1821
1822 // Constant for making pixels signed to avoid pmaddubsw
1823 // saturation.
1824 static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1825 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1826
1827 // Constant for making pixels unsigned and adding .5 for rounding.
1828 static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
1829 0x4040, 0x4040, 0x4040, 0x4040};
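// How the two constants fit together: pmaddubsw multiplies unsigned bytes by
// signed bytes, and the pairwise sum saturates at 32767, which large pixel
// values can hit. Subtracting 0x80 (kFsub80) moves the pixels into signed
// range; because the two blend fractions always sum to 128, that bias shifts
// the product sum by exactly 128*128 = 16384, and kFadd40 (0x4040 = 16384 +
// 64) restores it and adds the +64 rounding term before the >>7. Worked
// example for one pixel pair (a sketch, not part of the build):
#if 0
static uint8_t BlendSketch(uint8_t a, uint8_t b, int frac /* 0..128 */) {
  // Desired result: (a * (128 - frac) + b * frac + 64) >> 7.
  int biased = (a - 128) * (128 - frac) + (b - 128) * frac;  // fits in int16
  return (uint8_t)((biased + 0x4040) >> 7);  // re-bias and round
}
#endif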
1830
1831 // Bilinear column filtering. SSSE3 version.
1832 void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
1833 const uint8_t* src_ptr,
1834 int dst_width,
1835 int x,
1836 int dx) {
1837 intptr_t x0, x1, temp_pixel;
1838 asm volatile(
1839 "movd %6,%%xmm2 \n"
1840 "movd %7,%%xmm3 \n"
1841 "movl $0x04040000,%k2 \n"
1842 "movd %k2,%%xmm5 \n"
1843 "pcmpeqb %%xmm6,%%xmm6 \n"
1844 "psrlw $0x9,%%xmm6 \n" // 0x007f007f
1845 "pcmpeqb %%xmm7,%%xmm7 \n"
1846 "psrlw $15,%%xmm7 \n" // 0x00010001
1847
1848 "pextrw $0x1,%%xmm2,%k3 \n"
1849 "subl $0x2,%5 \n"
1850 "jl 29f \n"
1851 "movdqa %%xmm2,%%xmm0 \n"
1852 "paddd %%xmm3,%%xmm0 \n"
1853 "punpckldq %%xmm0,%%xmm2 \n"
1854 "punpckldq %%xmm3,%%xmm3 \n"
1855 "paddd %%xmm3,%%xmm3 \n"
1856 "pextrw $0x3,%%xmm2,%k4 \n"
1857
1858 LABELALIGN
1859 "2: \n"
1860 "movdqa %%xmm2,%%xmm1 \n"
1861 "paddd %%xmm3,%%xmm2 \n"
1862 "movzwl 0x00(%1,%3,1),%k2 \n"
1863 "movd %k2,%%xmm0 \n"
1864 "psrlw $0x9,%%xmm1 \n"
1865 "movzwl 0x00(%1,%4,1),%k2 \n"
1866 "movd %k2,%%xmm4 \n"
1867 "pshufb %%xmm5,%%xmm1 \n"
1868 "punpcklwd %%xmm4,%%xmm0 \n"
1869 "psubb %8,%%xmm0 \n" // make pixels signed.
1870 "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
1871 // 1
1872 "paddusb %%xmm7,%%xmm1 \n"
1873 "pmaddubsw %%xmm0,%%xmm1 \n"
1874 "pextrw $0x1,%%xmm2,%k3 \n"
1875 "pextrw $0x3,%%xmm2,%k4 \n"
1876 "paddw %9,%%xmm1 \n" // make pixels unsigned.
1877 "psrlw $0x7,%%xmm1 \n"
1878 "packuswb %%xmm1,%%xmm1 \n"
1879 "movd %%xmm1,%k2 \n"
1880 "mov %w2,(%0) \n"
1881 "lea 0x2(%0),%0 \n"
1882 "subl $0x2,%5 \n"
1883 "jge 2b \n"
1884
1885 LABELALIGN
1886 "29: \n"
1887 "addl $0x1,%5 \n"
1888 "jl 99f \n"
1889 "movzwl 0x00(%1,%3,1),%k2 \n"
1890 "movd %k2,%%xmm0 \n"
1891 "psrlw $0x9,%%xmm2 \n"
1892 "pshufb %%xmm5,%%xmm2 \n"
1893 "psubb %8,%%xmm0 \n" // make pixels signed.
1894 "pxor %%xmm6,%%xmm2 \n"
1895 "paddusb %%xmm7,%%xmm2 \n"
1896 "pmaddubsw %%xmm0,%%xmm2 \n"
1897 "paddw %9,%%xmm2 \n" // make pixels unsigned.
1898 "psrlw $0x7,%%xmm2 \n"
1899 "packuswb %%xmm2,%%xmm2 \n"
1900 "movd %%xmm2,%k2 \n"
1901 "mov %b2,(%0) \n"
1902 "99: \n"
1903 : "+r"(dst_ptr), // %0
1904 "+r"(src_ptr), // %1
1905 "=&a"(temp_pixel), // %2
1906 "=&r"(x0), // %3
1907 "=&r"(x1), // %4
1908 #if defined(__x86_64__)
1909 "+rm"(dst_width) // %5
1910 #else
1911 "+m"(dst_width) // %5
1912 #endif
1913 : "rm"(x), // %6
1914 "rm"(dx), // %7
1915 #if defined(__x86_64__)
1916 "x"(kFsub80), // %8
1917 "x"(kFadd40) // %9
1918 #else
1919 "m"(kFsub80), // %8
1920 "m"(kFadd40) // %9
1921 #endif
1922 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1923 "xmm7");
1924 }
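// The column walk above is 16.16 fixed point: the integer part of x selects
// the source pixel pair and psrlw $0x9 keeps a 7-bit blend fraction. A scalar
// sketch of the same stepping with that 7-bit fraction (illustrative, not the
// exact C fallback, not part of the build):
#if 0
static void FilterColsSketch(uint8_t* dst, const uint8_t* src, int dst_width,
                             int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source position
    int f = (x >> 9) & 0x7f;  // 7-bit blend fraction
    dst[j] = (uint8_t)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
#endif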
1925
1926 // Reads 4 pixels, duplicates them and writes 8 pixels.
1927 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1928 void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
1929 const uint8_t* src_ptr,
1930 int dst_width,
1931 int x,
1932 int dx) {
1933 (void)x;
1934 (void)dx;
1935 asm volatile(LABELALIGN
1936 "1: \n"
1937 "movdqu (%1),%%xmm0 \n"
1938 "lea 0x10(%1),%1 \n"
1939 "movdqa %%xmm0,%%xmm1 \n"
1940 "punpcklbw %%xmm0,%%xmm0 \n"
1941 "punpckhbw %%xmm1,%%xmm1 \n"
1942 "movdqu %%xmm0,(%0) \n"
1943 "movdqu %%xmm1,0x10(%0) \n"
1944 "lea 0x20(%0),%0 \n"
1945 "sub $0x20,%2 \n"
1946 "jg 1b \n"
1947
1948 : "+r"(dst_ptr), // %0
1949 "+r"(src_ptr), // %1
1950 "+r"(dst_width) // %2
1951 ::"memory",
1952 "cc", "xmm0", "xmm1");
1953 }
1954
1955 void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1956 ptrdiff_t src_stride,
1957 uint8_t* dst_argb,
1958 int dst_width) {
1959 (void)src_stride;
1960 asm volatile(LABELALIGN
1961 "1: \n"
1962 "movdqu (%0),%%xmm0 \n"
1963 "movdqu 0x10(%0),%%xmm1 \n"
1964 "lea 0x20(%0),%0 \n"
1965 "shufps $0xdd,%%xmm1,%%xmm0 \n"
1966 "movdqu %%xmm0,(%1) \n"
1967 "lea 0x10(%1),%1 \n"
1968 "sub $0x4,%2 \n"
1969 "jg 1b \n"
1970 : "+r"(src_argb), // %0
1971 "+r"(dst_argb), // %1
1972 "+r"(dst_width) // %2
1973 ::"memory",
1974 "cc", "xmm0", "xmm1");
1975 }
1976
1977 void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1978 ptrdiff_t src_stride,
1979 uint8_t* dst_argb,
1980 int dst_width) {
1981 (void)src_stride;
1982 asm volatile(LABELALIGN
1983 "1: \n"
1984 "movdqu (%0),%%xmm0 \n"
1985 "movdqu 0x10(%0),%%xmm1 \n"
1986 "lea 0x20(%0),%0 \n"
1987 "movdqa %%xmm0,%%xmm2 \n"
1988 "shufps $0x88,%%xmm1,%%xmm0 \n"
1989 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1990 "pavgb %%xmm2,%%xmm0 \n"
1991 "movdqu %%xmm0,(%1) \n"
1992 "lea 0x10(%1),%1 \n"
1993 "sub $0x4,%2 \n"
1994 "jg 1b \n"
1995 : "+r"(src_argb), // %0
1996 "+r"(dst_argb), // %1
1997 "+r"(dst_width) // %2
1998 ::"memory",
1999 "cc", "xmm0", "xmm1", "xmm2");
2000 }
2001
2002 void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
2003 ptrdiff_t src_stride,
2004 uint8_t* dst_argb,
2005 int dst_width) {
2006 asm volatile(LABELALIGN
2007 "1: \n"
2008 "movdqu (%0),%%xmm0 \n"
2009 "movdqu 0x10(%0),%%xmm1 \n"
2010 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
2011 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
2012 "lea 0x20(%0),%0 \n"
2013 "pavgb %%xmm2,%%xmm0 \n"
2014 "pavgb %%xmm3,%%xmm1 \n"
2015 "movdqa %%xmm0,%%xmm2 \n"
2016 "shufps $0x88,%%xmm1,%%xmm0 \n"
2017 "shufps $0xdd,%%xmm1,%%xmm2 \n"
2018 "pavgb %%xmm2,%%xmm0 \n"
2019 "movdqu %%xmm0,(%1) \n"
2020 "lea 0x10(%1),%1 \n"
2021 "sub $0x4,%2 \n"
2022 "jg 1b \n"
2023 : "+r"(src_argb), // %0
2024 "+r"(dst_argb), // %1
2025 "+r"(dst_width) // %2
2026 : "r"((intptr_t)(src_stride)) // %3
2027 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2028 }
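// The 2x2 box above is two rounds of pavgb: rows are averaged first, then the
// two column results, so each channel is effectively (a+b+c+d+2)>>2 with a
// slight upward bias from the nested rounding. Scalar sketch for one channel
// (illustrative only, not part of the build):
#if 0
static uint8_t Box2x2Sketch(uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br) {
  uint8_t left = (uint8_t)((tl + bl + 1) >> 1);   // vertical pavgb
  uint8_t right = (uint8_t)((tr + br + 1) >> 1);  // vertical pavgb
  return (uint8_t)((left + right + 1) >> 1);      // horizontal pavgb
}
#endif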
2029
2030 // Reads 4 pixels at a time.
2031 // Alignment requirement: dst_argb 16 byte aligned.
2032 void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
2033 ptrdiff_t src_stride,
2034 int src_stepx,
2035 uint8_t* dst_argb,
2036 int dst_width) {
2037 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2038 intptr_t src_stepx_x12;
2039 (void)src_stride;
2040 asm volatile(
2041 "lea 0x00(,%1,4),%1 \n"
2042 "lea 0x00(%1,%1,2),%4 \n"
2043
2044 LABELALIGN
2045 "1: \n"
2046 "movd (%0),%%xmm0 \n"
2047 "movd 0x00(%0,%1,1),%%xmm1 \n"
2048 "punpckldq %%xmm1,%%xmm0 \n"
2049 "movd 0x00(%0,%1,2),%%xmm2 \n"
2050 "movd 0x00(%0,%4,1),%%xmm3 \n"
2051 "lea 0x00(%0,%1,4),%0 \n"
2052 "punpckldq %%xmm3,%%xmm2 \n"
2053 "punpcklqdq %%xmm2,%%xmm0 \n"
2054 "movdqu %%xmm0,(%2) \n"
2055 "lea 0x10(%2),%2 \n"
2056 "sub $0x4,%3 \n"
2057 "jg 1b \n"
2058 : "+r"(src_argb), // %0
2059 "+r"(src_stepx_x4), // %1
2060 "+r"(dst_argb), // %2
2061 "+r"(dst_width), // %3
2062 "=&r"(src_stepx_x12) // %4
2063 ::"memory",
2064 "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2065 }
2066
2067 // Blends four 2x2 to 4x1.
2068 // Alignment requirement: dst_argb 16 byte aligned.
2069 void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
2070 ptrdiff_t src_stride,
2071 int src_stepx,
2072 uint8_t* dst_argb,
2073 int dst_width) {
2074 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2075 intptr_t src_stepx_x12;
2076 intptr_t row1 = (intptr_t)(src_stride);
2077 asm volatile(
2078 "lea 0x00(,%1,4),%1 \n"
2079 "lea 0x00(%1,%1,2),%4 \n"
2080 "lea 0x00(%0,%5,1),%5 \n"
2081
2082 LABELALIGN
2083 "1: \n"
2084 "movq (%0),%%xmm0 \n"
2085 "movhps 0x00(%0,%1,1),%%xmm0 \n"
2086 "movq 0x00(%0,%1,2),%%xmm1 \n"
2087 "movhps 0x00(%0,%4,1),%%xmm1 \n"
2088 "lea 0x00(%0,%1,4),%0 \n"
2089 "movq (%5),%%xmm2 \n"
2090 "movhps 0x00(%5,%1,1),%%xmm2 \n"
2091 "movq 0x00(%5,%1,2),%%xmm3 \n"
2092 "movhps 0x00(%5,%4,1),%%xmm3 \n"
2093 "lea 0x00(%5,%1,4),%5 \n"
2094 "pavgb %%xmm2,%%xmm0 \n"
2095 "pavgb %%xmm3,%%xmm1 \n"
2096 "movdqa %%xmm0,%%xmm2 \n"
2097 "shufps $0x88,%%xmm1,%%xmm0 \n"
2098 "shufps $0xdd,%%xmm1,%%xmm2 \n"
2099 "pavgb %%xmm2,%%xmm0 \n"
2100 "movdqu %%xmm0,(%2) \n"
2101 "lea 0x10(%2),%2 \n"
2102 "sub $0x4,%3 \n"
2103 "jg 1b \n"
2104 : "+r"(src_argb), // %0
2105 "+r"(src_stepx_x4), // %1
2106 "+r"(dst_argb), // %2
2107 "+rm"(dst_width), // %3
2108 "=&r"(src_stepx_x12), // %4
2109 "+r"(row1) // %5
2110 ::"memory",
2111 "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2112 }
2113
2114 void ScaleARGBCols_SSE2(uint8_t* dst_argb,
2115 const uint8_t* src_argb,
2116 int dst_width,
2117 int x,
2118 int dx) {
2119 intptr_t x0, x1;
2120 asm volatile(
2121 "movd %5,%%xmm2 \n"
2122 "movd %6,%%xmm3 \n"
2123 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2124 "pshufd $0x11,%%xmm3,%%xmm0 \n"
2125 "paddd %%xmm0,%%xmm2 \n"
2126 "paddd %%xmm3,%%xmm3 \n"
2127 "pshufd $0x5,%%xmm3,%%xmm0 \n"
2128 "paddd %%xmm0,%%xmm2 \n"
2129 "paddd %%xmm3,%%xmm3 \n"
2130 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2131 "pextrw $0x1,%%xmm2,%k0 \n"
2132 "pextrw $0x3,%%xmm2,%k1 \n"
2133 "cmp $0x0,%4 \n"
2134 "jl 99f \n"
2135 "sub $0x4,%4 \n"
2136 "jl 49f \n"
2137
2138 LABELALIGN
2139 "40: \n"
2140 "movd 0x00(%3,%0,4),%%xmm0 \n"
2141 "movd 0x00(%3,%1,4),%%xmm1 \n"
2142 "pextrw $0x5,%%xmm2,%k0 \n"
2143 "pextrw $0x7,%%xmm2,%k1 \n"
2144 "paddd %%xmm3,%%xmm2 \n"
2145 "punpckldq %%xmm1,%%xmm0 \n"
2146 "movd 0x00(%3,%0,4),%%xmm1 \n"
2147 "movd 0x00(%3,%1,4),%%xmm4 \n"
2148 "pextrw $0x1,%%xmm2,%k0 \n"
2149 "pextrw $0x3,%%xmm2,%k1 \n"
2150 "punpckldq %%xmm4,%%xmm1 \n"
2151 "punpcklqdq %%xmm1,%%xmm0 \n"
2152 "movdqu %%xmm0,(%2) \n"
2153 "lea 0x10(%2),%2 \n"
2154 "sub $0x4,%4 \n"
2155 "jge 40b \n"
2156
2157 "49: \n"
2158 "test $0x2,%4 \n"
2159 "je 29f \n"
2160 "movd 0x00(%3,%0,4),%%xmm0 \n"
2161 "movd 0x00(%3,%1,4),%%xmm1 \n"
2162 "pextrw $0x5,%%xmm2,%k0 \n"
2163 "punpckldq %%xmm1,%%xmm0 \n"
2164 "movq %%xmm0,(%2) \n"
2165 "lea 0x8(%2),%2 \n"
2166 "29: \n"
2167 "test $0x1,%4 \n"
2168 "je 99f \n"
2169 "movd 0x00(%3,%0,4),%%xmm0 \n"
2170 "movd %%xmm0,(%2) \n"
2171 "99: \n"
2172 : "=&a"(x0), // %0
2173 "=&d"(x1), // %1
2174 "+r"(dst_argb), // %2
2175 "+r"(src_argb), // %3
2176 "+r"(dst_width) // %4
2177 : "rm"(x), // %5
2178 "rm"(dx) // %6
2179 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2180 }
2181
2182 // Reads 4 pixels, duplicates them and writes 8 pixels.
2183 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
2184 void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
2185 const uint8_t* src_argb,
2186 int dst_width,
2187 int x,
2188 int dx) {
2189 (void)x;
2190 (void)dx;
2191 asm volatile(LABELALIGN
2192 "1: \n"
2193 "movdqu (%1),%%xmm0 \n"
2194 "lea 0x10(%1),%1 \n"
2195 "movdqa %%xmm0,%%xmm1 \n"
2196 "punpckldq %%xmm0,%%xmm0 \n"
2197 "punpckhdq %%xmm1,%%xmm1 \n"
2198 "movdqu %%xmm0,(%0) \n"
2199 "movdqu %%xmm1,0x10(%0) \n"
2200 "lea 0x20(%0),%0 \n"
2201 "sub $0x8,%2 \n"
2202 "jg 1b \n"
2203
2204 : "+r"(dst_argb), // %0
2205 "+r"(src_argb), // %1
2206 "+r"(dst_width) // %2
2207 ::"memory",
2208 "cc", "xmm0", "xmm1");
2209 }
2210
2211 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
2212 static const uvec8 kShuffleColARGB = {
2213 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
2214 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
2215 };
2216
2217 // Shuffle table for duplicating 2 fractions into 8 bytes each
2218 static const uvec8 kShuffleFractions = {
2219 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
2220 };
2221
2222 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
2223 void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
2224 const uint8_t* src_argb,
2225 int dst_width,
2226 int x,
2227 int dx) {
2228 intptr_t x0, x1;
2229 asm volatile(
2230 "movdqa %0,%%xmm4 \n"
2231 "movdqa %1,%%xmm5 \n"
2232 :
2233 : "m"(kShuffleColARGB), // %0
2234 "m"(kShuffleFractions) // %1
2235 );
2236
2237 asm volatile(
2238 "movd %5,%%xmm2 \n"
2239 "movd %6,%%xmm3 \n"
2240 "pcmpeqb %%xmm6,%%xmm6 \n"
2241 "psrlw $0x9,%%xmm6 \n"
2242 "pextrw $0x1,%%xmm2,%k3 \n"
2243 "sub $0x2,%2 \n"
2244 "jl 29f \n"
2245 "movdqa %%xmm2,%%xmm0 \n"
2246 "paddd %%xmm3,%%xmm0 \n"
2247 "punpckldq %%xmm0,%%xmm2 \n"
2248 "punpckldq %%xmm3,%%xmm3 \n"
2249 "paddd %%xmm3,%%xmm3 \n"
2250 "pextrw $0x3,%%xmm2,%k4 \n"
2251
2252 LABELALIGN
2253 "2: \n"
2254 "movdqa %%xmm2,%%xmm1 \n"
2255 "paddd %%xmm3,%%xmm2 \n"
2256 "movq 0x00(%1,%3,4),%%xmm0 \n"
2257 "psrlw $0x9,%%xmm1 \n"
2258 "movhps 0x00(%1,%4,4),%%xmm0 \n"
2259 "pshufb %%xmm5,%%xmm1 \n"
2260 "pshufb %%xmm4,%%xmm0 \n"
2261 "pxor %%xmm6,%%xmm1 \n"
2262 "pmaddubsw %%xmm1,%%xmm0 \n"
2263 "psrlw $0x7,%%xmm0 \n"
2264 "pextrw $0x1,%%xmm2,%k3 \n"
2265 "pextrw $0x3,%%xmm2,%k4 \n"
2266 "packuswb %%xmm0,%%xmm0 \n"
2267 "movq %%xmm0,(%0) \n"
2268 "lea 0x8(%0),%0 \n"
2269 "sub $0x2,%2 \n"
2270 "jge 2b \n"
2271
2272 LABELALIGN
2273 "29: \n"
2274 "add $0x1,%2 \n"
2275 "jl 99f \n"
2276 "psrlw $0x9,%%xmm2 \n"
2277 "movq 0x00(%1,%3,4),%%xmm0 \n"
2278 "pshufb %%xmm5,%%xmm2 \n"
2279 "pshufb %%xmm4,%%xmm0 \n"
2280 "pxor %%xmm6,%%xmm2 \n"
2281 "pmaddubsw %%xmm2,%%xmm0 \n"
2282 "psrlw $0x7,%%xmm0 \n"
2283 "packuswb %%xmm0,%%xmm0 \n"
2284 "movd %%xmm0,(%0) \n"
2285
2286 LABELALIGN
2287 "99: \n" // clang-format error.
2288
2289 : "+r"(dst_argb), // %0
2290 "+r"(src_argb), // %1
2291 "+rm"(dst_width), // %2
2292 "=&r"(x0), // %3
2293 "=&r"(x1) // %4
2294 : "rm"(x), // %5
2295 "rm"(dx) // %6
2296 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2297 }
2298
2299 // Divide num by div and return as 16.16 fixed point result.
2300 int FixedDiv_X86(int num, int div) {
2301 asm volatile(
2302 "cdq \n"
2303 "shld $0x10,%%eax,%%edx \n"
2304 "shl $0x10,%%eax \n"
2305 "idiv %1 \n"
2306 "mov %0, %%eax \n"
2307 : "+a"(num) // %0
2308 : "c"(div) // %1
2309 : "memory", "cc", "edx");
2310 return num;
2311 }
2312
2313 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
2314 int FixedDiv1_X86(int num, int div) {
2315 asm volatile(
2316 "cdq \n"
2317 "shld $0x10,%%eax,%%edx \n"
2318 "shl $0x10,%%eax \n"
2319 "sub $0x10001,%%eax \n"
2320 "sbb $0x0,%%edx \n"
2321 "sub $0x1,%1 \n"
2322 "idiv %1 \n"
2323 "mov %0, %%eax \n"
2324 : "+a"(num) // %0
2325 : "c"(div) // %1
2326 : "memory", "cc", "edx");
2327 return num;
2328 }
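// Both helpers return a 16.16 fixed point quotient. In portable C the same
// results amount to the following (a sketch assuming div > 0, div > 1 for the
// second form, and a quotient that fits in 32 bits; not part of the build):
#if 0
static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
static int FixedDiv1Sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}
#endif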
2329
2330 #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
2331 defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
2332
2333 // Shuffle table for splitting UV into upper and lower part of register.
2334 static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
2335 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
2336 static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
2337 6u, 14u, 0x80, 0x80, 0x80, 0x80,
2338 0x80, 0x80, 0x80, 0x80};
2339 #endif
2340
2341 #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
2342
2343 void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
2344 ptrdiff_t src_stride,
2345 uint8_t* dst_ptr,
2346 int dst_width) {
2347 asm volatile(
2348 "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
2349 "psrlw $0xf,%%xmm4 \n"
2350 "packuswb %%xmm4,%%xmm4 \n"
2351 "pxor %%xmm5, %%xmm5 \n" // zero
2352 "movdqa %4,%%xmm1 \n" // split shuffler
2353 "movdqa %5,%%xmm3 \n" // merge shuffler
2354
2355 LABELALIGN
2356 "1: \n"
2357 "movdqu (%0),%%xmm0 \n" // 8 UV row 0
2358 "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
2359 "lea 0x10(%0),%0 \n"
2360 "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
2361 "pshufb %%xmm1,%%xmm2 \n"
2362 "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
2363 "pmaddubsw %%xmm4,%%xmm2 \n"
2364 "paddw %%xmm2,%%xmm0 \n" // vertical add
2365 "psrlw $0x1,%%xmm0 \n" // round
2366 "pavgw %%xmm5,%%xmm0 \n"
2367 "pshufb %%xmm3,%%xmm0 \n" // merge uv
2368 "movq %%xmm0,(%1) \n"
2369 "lea 0x8(%1),%1 \n" // 4 UV
2370 "sub $0x4,%2 \n"
2371 "jg 1b \n"
2372 : "+r"(src_ptr), // %0
2373 "+r"(dst_ptr), // %1
2374 "+r"(dst_width) // %2
2375 : "r"((intptr_t)(src_stride)), // %3
2376 "m"(kShuffleSplitUV), // %4
2377 "m"(kShuffleMergeUV) // %5
2378 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2379 }
2380 #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
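// Unlike the double-pavgb ARGB box above, the UV box forms the exact word sum
// of the four samples and rounds once: pmaddubsw adds the horizontal pair,
// paddw adds the two rows, and psrlw + pavgw together implement (sum+2)>>2.
// Scalar sketch for one U (or V) output (illustrative only, not part of the
// build):
#if 0
static uint8_t UVBox2x2Sketch(uint8_t u00, uint8_t u01, uint8_t u10,
                              uint8_t u11) {
  return (uint8_t)((u00 + u01 + u10 + u11 + 2) >> 2);
}
#endif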
2381
2382 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
2383 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
2384 ptrdiff_t src_stride,
2385 uint8_t* dst_ptr,
2386 int dst_width) {
2387 asm volatile(
2388 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
2389 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
2390 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
2391 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
2392 "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
2393 "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
2394
2395 LABELALIGN
2396 "1: \n"
2397 "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
2398 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
2399 "lea 0x20(%0),%0 \n"
2400 "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
2401 "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
2402 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
2403 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
2404 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
2405 "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
2406 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
2407 "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
2408 "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
2409 "vmovdqu %%xmm0,(%1) \n"
2410 "lea 0x10(%1),%1 \n" // 8 UV
2411 "sub $0x8,%2 \n"
2412 "jg 1b \n"
2413 "vzeroupper \n"
2414 : "+r"(src_ptr), // %0
2415 "+r"(dst_ptr), // %1
2416 "+r"(dst_width) // %2
2417 : "r"((intptr_t)(src_stride)), // %3
2418 "m"(kShuffleSplitUV), // %4
2419 "m"(kShuffleMergeUV) // %5
2420 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2421 }
2422 #endif // HAS_SCALEUVROWDOWN2BOX_AVX2
2423
2424 static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
2425 3, 1, 3, 1, 1, 3, 1, 3};
2426
2427 #ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
2428 void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
2429 uint8_t* dst_ptr,
2430 int dst_width) {
2431 asm volatile(
2432 "pcmpeqw %%xmm4,%%xmm4 \n"
2433 "psrlw $15,%%xmm4 \n"
2434 "psllw $1,%%xmm4 \n" // all 2
2435 "movdqa %3,%%xmm3 \n"
2436
2437 LABELALIGN
2438 "1: \n"
2439 "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
2440 "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
2441 "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
2442 "movdqa %%xmm0,%%xmm2 \n"
2443 "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
2444 "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
2445 "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
2446 "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
2447 "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
2448 "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
2449 "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
2450 "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
2451 "packuswb %%xmm2,%%xmm0 \n"
2452 "movdqu %%xmm0,(%1) \n"
2453
2454 "lea 0x8(%0),%0 \n"
2455 "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
2456 "sub $0x8,%2 \n"
2457 "jg 1b \n"
2458 : "+r"(src_ptr), // %0
2459 "+r"(dst_ptr), // %1
2460 "+r"(dst_width) // %2
2461 : "m"(kUVLinearMadd31) // %3
2462 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2463 }
2464 #endif
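// kUVLinearMadd31 applies the classic 2x linear weights to interleaved UV:
// each output takes 3/4 of the nearer source sample and 1/4 of the farther
// one, with +2 rounding before the >>2. Scalar sketch producing the two UV
// pairs that come from one pair of adjacent source pixels (illustrative only,
// not part of the build):
#if 0
static void UVLinearUp2Sketch(const uint8_t* src /* U0 V0 U1 V1 */,
                              uint8_t* dst /* 4 output UV bytes */) {
  dst[0] = (uint8_t)((3 * src[0] + src[2] + 2) >> 2);  // U nearer U0
  dst[1] = (uint8_t)((3 * src[1] + src[3] + 2) >> 2);  // V nearer V0
  dst[2] = (uint8_t)((3 * src[2] + src[0] + 2) >> 2);  // U nearer U1
  dst[3] = (uint8_t)((3 * src[3] + src[1] + 2) >> 2);  // V nearer V1
}
#endif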
2465
2466 #ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
2467 void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
2468 ptrdiff_t src_stride,
2469 uint8_t* dst_ptr,
2470 ptrdiff_t dst_stride,
2471 int dst_width) {
2472 asm volatile(
2473 "pcmpeqw %%xmm6,%%xmm6 \n"
2474 "psrlw $15,%%xmm6 \n"
2475 "psllw $3,%%xmm6 \n" // all 8
2476 "movdqa %5,%%xmm7 \n"
2477
2478 LABELALIGN
2479 "1: \n"
2480 "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
2481 "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
2482 "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
2483 "movdqa %%xmm0,%%xmm2 \n"
2484 "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
2485 "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
2486 "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
2487 "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
2488
2489 "movq (%0,%3),%%xmm1 \n"
2490 "movq 2(%0,%3),%%xmm4 \n"
2491 "punpcklbw %%xmm4,%%xmm1 \n"
2492 "movdqa %%xmm1,%%xmm3 \n"
2493 "punpckhdq %%xmm1,%%xmm3 \n"
2494 "punpckldq %%xmm1,%%xmm1 \n"
2495 "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
2496 "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
2497
2498 // xmm0 xmm2
2499 // xmm1 xmm3
2500
2501 "movdqa %%xmm0,%%xmm4 \n"
2502 "movdqa %%xmm1,%%xmm5 \n"
2503 "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
2504 "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
2505 "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
2506 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
2507 "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
2508
2509 "movdqa %%xmm1,%%xmm5 \n"
2510 "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
2511 "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
2512 "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
2513 "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
2514 "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
2515
2516 "movdqa %%xmm2,%%xmm0 \n"
2517 "movdqa %%xmm3,%%xmm1 \n"
2518 "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
2519 "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
2520 "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
2521 "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
2522 "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
2523
2524 "movdqa %%xmm3,%%xmm1 \n"
2525 "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
2526 "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
2527 "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
2528 "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
2529 "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
2530
2531 "packuswb %%xmm0,%%xmm4 \n"
2532 "movdqu %%xmm4,(%1) \n" // store above
2533 "packuswb %%xmm1,%%xmm5 \n"
2534 "movdqu %%xmm5,(%1,%4) \n" // store below
2535
2536 "lea 0x8(%0),%0 \n"
2537 "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
2538 "sub $0x8,%2 \n"
2539 "jg 1b \n"
2540 : "+r"(src_ptr), // %0
2541 "+r"(dst_ptr), // %1
2542 "+r"(dst_width) // %2
2543 : "r"((intptr_t)(src_stride)), // %3
2544 "r"((intptr_t)(dst_stride)), // %4
2545 "m"(kUVLinearMadd31) // %5
2546 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2547 "xmm7");
2548 }
2549 #endif
2550
2551 #ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
2552
2553 void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
2554 uint8_t* dst_ptr,
2555 int dst_width) {
2556 asm volatile(
2557 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
2558 "vpsrlw $15,%%ymm4,%%ymm4 \n"
2559 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
2560 "vbroadcastf128 %3,%%ymm3 \n"
2561
2562 LABELALIGN
2563 "1: \n"
2564 "vmovdqu (%0),%%xmm0 \n"
2565 "vmovdqu 2(%0),%%xmm1 \n"
2566 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
2567 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
2568 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
2569 "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
2570 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
2571 "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
2572 "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
2573 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
2574 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
2575 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
2576 "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
2577 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2578 "vmovdqu %%ymm0,(%1) \n"
2579
2580 "lea 0x10(%0),%0 \n"
2581 "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
2582 "sub $0x10,%2 \n"
2583 "jg 1b \n"
2584 "vzeroupper \n"
2585 : "+r"(src_ptr), // %0
2586 "+r"(dst_ptr), // %1
2587 "+r"(dst_width) // %2
2588 : "m"(kUVLinearMadd31) // %3
2589 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2590 }
2591 #endif
2592
2593 #ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
2594 void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
2595 ptrdiff_t src_stride,
2596 uint8_t* dst_ptr,
2597 ptrdiff_t dst_stride,
2598 int dst_width) {
2599 asm volatile(
2600 "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
2601 "vpsrlw $15,%%ymm6,%%ymm6 \n"
2602 "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
2603 "vbroadcastf128 %5,%%ymm7 \n"
2604
2605 LABELALIGN
2606 "1: \n"
2607 "vmovdqu (%0),%%xmm0 \n"
2608 "vmovdqu 2(%0),%%xmm1 \n"
2609 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
2610 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
2611 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
2612 "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
2613 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
2614 "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
2615 "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
2616
2617 "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
2618 "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
2619 "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
2620 "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
2621 "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
2622 "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
2623 "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
2624 "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
2625 "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
2626
2627 // ymm0 ymm1
2628 // ymm2 ymm3
2629
2630 "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
2631 "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
2632 "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
2633 "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
2634 "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
2635
2636 "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
2637 "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
2638 "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
2639 "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
2640 "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
2641
2642 "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
2643 "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
2644 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
2645 "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
2646 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
2647
2648 "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
2649 "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
2650 "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
2651 "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
2652 "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
2653
2654 "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
2655 "vmovdqu %%ymm4,(%1) \n" // store above
2656 "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
2657 "vmovdqu %%ymm5,(%1,%4) \n" // store below
2658
2659 "lea 0x10(%0),%0 \n"
2660 "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
2661 "sub $0x10,%2 \n"
2662 "jg 1b \n"
2663 "vzeroupper \n"
2664 : "+r"(src_ptr), // %0
2665 "+r"(dst_ptr), // %1
2666 "+r"(dst_width) // %2
2667 : "r"((intptr_t)(src_stride)), // %3
2668 "r"((intptr_t)(dst_stride)), // %4
2669 "m"(kUVLinearMadd31) // %5
2670 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2671 "xmm7");
2672 }
2673 #endif
2674
2675 #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
2676 void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
2677 uint16_t* dst_ptr,
2678 int dst_width) {
2679 asm volatile(
2680 "pxor %%xmm5,%%xmm5 \n"
2681 "pcmpeqd %%xmm4,%%xmm4 \n"
2682 "psrld $31,%%xmm4 \n"
2683 "pslld $1,%%xmm4 \n" // all 2
2684
2685 LABELALIGN
2686 "1: \n"
2687 "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
2688 "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
2689
2690 "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v)
2691 "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v)
2692
2693 "movdqa %%xmm0,%%xmm2 \n"
2694 "movdqa %%xmm1,%%xmm3 \n"
2695
2696 "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far)
2697 "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far)
2698
2699 "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
2700 "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
2701 "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
2702 "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
2703 "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
2704 "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
2705 "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
2706 "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
2707
2708 "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
2709 "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
2710 "packusdw %%xmm1,%%xmm0 \n"
2711 "movdqu %%xmm0,(%1) \n"
2712
2713 "lea 0x8(%0),%0 \n"
2714 "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
2715 "sub $0x4,%2 \n"
2716 "jg 1b \n"
2717 : "+r"(src_ptr), // %0
2718 "+r"(dst_ptr), // %1
2719 "+r"(dst_width) // %2
2720 :
2721 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2722 }
2723 #endif
2724
2725 #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
2726 void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
2727 ptrdiff_t src_stride,
2728 uint16_t* dst_ptr,
2729 ptrdiff_t dst_stride,
2730 int dst_width) {
2731 asm volatile(
2732 "pxor %%xmm7,%%xmm7 \n"
2733 "pcmpeqd %%xmm6,%%xmm6 \n"
2734 "psrld $31,%%xmm6 \n"
2735 "pslld $3,%%xmm6 \n" // all 8
2736
2737 LABELALIGN
2738 "1: \n"
2739 "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
2740 "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
2741 "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
2742 "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
2743 "movdqa %%xmm0,%%xmm2 \n"
2744 "movdqa %%xmm1,%%xmm3 \n"
2745 "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
2746 "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
2747 "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
2748 "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
2749 "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
2750 "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
2751 "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
2752 "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
2753
2754 "movq (%0,%3,2),%%xmm2 \n"
2755 "movq 4(%0,%3,2),%%xmm3 \n"
2756 "punpcklwd %%xmm7,%%xmm2 \n"
2757 "punpcklwd %%xmm7,%%xmm3 \n"
2758 "movdqa %%xmm2,%%xmm4 \n"
2759 "movdqa %%xmm3,%%xmm5 \n"
2760 "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo)
2761 "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi)
2762 "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo)
2763 "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi)
2764 "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo)
2765 "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi)
2766 "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
2767 "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
2768
2769 "movdqa %%xmm0,%%xmm4 \n"
2770 "movdqa %%xmm2,%%xmm5 \n"
2771 "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
2772 "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
2773 "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
2774 "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
2775 "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
2776
2777 "movdqa %%xmm2,%%xmm5 \n"
2778 "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
2779 "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
2780 "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
2781 "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
2782 "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
2783
2784 "movdqa %%xmm1,%%xmm0 \n"
2785 "movdqa %%xmm3,%%xmm2 \n"
2786 "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
2787 "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
2788 "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
2789 "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
2790 "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
2791
2792 "movdqa %%xmm3,%%xmm2 \n"
2793 "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
2794 "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
2795 "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
2796 "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
2797 "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
2798
2799 "packusdw %%xmm0,%%xmm4 \n"
2800 "movdqu %%xmm4,(%1) \n" // store above
2801 "packusdw %%xmm2,%%xmm5 \n"
2802 "movdqu %%xmm5,(%1,%4,2) \n" // store below
2803
2804 "lea 0x8(%0),%0 \n"
2805 "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
2806 "sub $0x4,%2 \n"
2807 "jg 1b \n"
2808 : "+r"(src_ptr), // %0
2809 "+r"(dst_ptr), // %1
2810 "+r"(dst_width) // %2
2811 : "r"((intptr_t)(src_stride)), // %3
2812 "r"((intptr_t)(dst_stride)) // %4
2813 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2814 "xmm7");
2815 }
2816 #endif
2817
2818 #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
2819 void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
2820 uint16_t* dst_ptr,
2821 int dst_width) {
2822 asm volatile(
2823 "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
2824 "vpsrld $31,%%ymm4,%%ymm4 \n"
2825 "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
2826
2827 LABELALIGN
2828 "1: \n"
2829 "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
2830 "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
2831
2832 "vpmovzxwd %%xmm0,%%ymm0 \n" // 00112233 (32b, 1u1v)
2833 "vpmovzxwd %%xmm1,%%ymm1 \n" // 11223344 (32b, 1u1v)
2834
2835 "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
2836 "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
2837
2838 "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
2839 "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
2840 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
2841 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
2842 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
2843 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
2844 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
2845 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
2846
2847 "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
2848 "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
2849 "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
2850 "vmovdqu %%ymm0,(%1) \n"
2851
2852 "lea 0x10(%0),%0 \n"
2853 "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
2854 "sub $0x8,%2 \n"
2855 "jg 1b \n"
2856 "vzeroupper \n"
2857 : "+r"(src_ptr), // %0
2858 "+r"(dst_ptr), // %1
2859 "+r"(dst_width) // %2
2860 :
2861 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2862 }
2863 #endif
2864
2865 #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
2866 void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
2867 ptrdiff_t src_stride,
2868 uint16_t* dst_ptr,
2869 ptrdiff_t dst_stride,
2870 int dst_width) {
2871 asm volatile(
2872 "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
2873 "vpsrld $31,%%ymm6,%%ymm6 \n"
2874 "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
2875
2876 LABELALIGN
2877 "1: \n"
2878
2879 "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
2880 "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
2881 "vpmovzxwd %%xmm0,%%ymm0 \n" // 00112233 (32b, 1u1v)
2882 "vpmovzxwd %%xmm1,%%ymm1 \n" // 11223344 (32b, 1u1v)
2883 "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
2884 "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
2885 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
2886 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
2887 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
2888 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
2889 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo)
2890 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi)
2891
2892 "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v)
2893 "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v)
2894 "vpmovzxwd %%xmm2,%%ymm2 \n" // 00112233 (32b, 1u1v)
2895 "vpmovzxwd %%xmm3,%%ymm3 \n" // 11223344 (32b, 1u1v)
2896 "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far)
2897 "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far)
2898 "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
2899 "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
2900 "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
2901 "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
2902 "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo)
2903 "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi)
2904
2905 "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
2906 "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
2907 "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
2908 "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
2909 "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
2910
2911 "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
2912 "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
2913 "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
2914 "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
2915 "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
2916
2917 "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
2918 "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
2919 "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
2920 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
2921 "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
2922
2923 "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
2924 "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
2925 "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
2926 "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
2927 "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
2928
2929 "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
2930 "vmovdqu %%ymm4,(%1) \n" // store above
2931 "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
2932 "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
2933
2934 "lea 0x10(%0),%0 \n"
2935 "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
2936 "sub $0x8,%2 \n"
2937 "jg 1b \n"
2938 "vzeroupper \n"
2939 : "+r"(src_ptr), // %0
2940 "+r"(dst_ptr), // %1
2941 "+r"(dst_width) // %2
2942 : "r"((intptr_t)(src_stride)), // %3
2943 "r"((intptr_t)(dst_stride)) // %4
2944 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2945 }
2946 #endif
2947
2948 #endif // defined(__x86_64__) || defined(__i386__)
2949
2950 #ifdef __cplusplus
2951 } // extern "C"
2952 } // namespace libyuv
2953 #endif
2954