/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

25*4e366538SXin Li // Read 32x1 throw away even pixels, and write 16x1.
ScaleRowDown2_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)26*4e366538SXin Li void ScaleRowDown2_NEON(const uint8_t* src_ptr,
27*4e366538SXin Li                         ptrdiff_t src_stride,
28*4e366538SXin Li                         uint8_t* dst,
29*4e366538SXin Li                         int dst_width) {
30*4e366538SXin Li   (void)src_stride;
31*4e366538SXin Li   asm volatile(
32*4e366538SXin Li       "1:                                        \n"
33*4e366538SXin Li       // load even pixels into q0, odd into q1
34*4e366538SXin Li       "vld2.8      {q0, q1}, [%0]!               \n"
35*4e366538SXin Li       "subs        %2, %2, #16                   \n"  // 16 processed per loop
36*4e366538SXin Li       "vst1.8      {q1}, [%1]!                   \n"  // store odd pixels
37*4e366538SXin Li       "bgt         1b                            \n"
38*4e366538SXin Li       : "+r"(src_ptr),   // %0
39*4e366538SXin Li         "+r"(dst),       // %1
40*4e366538SXin Li         "+r"(dst_width)  // %2
41*4e366538SXin Li       :
42*4e366538SXin Li       : "q0", "q1"  // Clobber List
43*4e366538SXin Li   );
44*4e366538SXin Li }
45*4e366538SXin Li 
46*4e366538SXin Li // Read 32x1 average down and write 16x1.
ScaleRowDown2Linear_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)47*4e366538SXin Li void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
48*4e366538SXin Li                               ptrdiff_t src_stride,
49*4e366538SXin Li                               uint8_t* dst,
50*4e366538SXin Li                               int dst_width) {
51*4e366538SXin Li   (void)src_stride;
52*4e366538SXin Li   asm volatile(
53*4e366538SXin Li       "1:                                        \n"
54*4e366538SXin Li       "vld2.8      {q0, q1}, [%0]!               \n"  // load 32 pixels
55*4e366538SXin Li       "subs        %2, %2, #16                   \n"  // 16 processed per loop
56*4e366538SXin Li       "vrhadd.u8   q0, q0, q1                    \n"  // rounding half add
57*4e366538SXin Li       "vst1.8      {q0}, [%1]!                   \n"
58*4e366538SXin Li       "bgt         1b                            \n"
59*4e366538SXin Li       : "+r"(src_ptr),   // %0
60*4e366538SXin Li         "+r"(dst),       // %1
61*4e366538SXin Li         "+r"(dst_width)  // %2
62*4e366538SXin Li       :
63*4e366538SXin Li       : "q0", "q1"  // Clobber List
64*4e366538SXin Li   );
65*4e366538SXin Li }
66*4e366538SXin Li 
67*4e366538SXin Li // Read 32x2 average down and write 16x1.
ScaleRowDown2Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)68*4e366538SXin Li void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
69*4e366538SXin Li                            ptrdiff_t src_stride,
70*4e366538SXin Li                            uint8_t* dst,
71*4e366538SXin Li                            int dst_width) {
72*4e366538SXin Li   asm volatile(
73*4e366538SXin Li       // change the stride to row 2 pointer
74*4e366538SXin Li       "add         %1, %0                        \n"
75*4e366538SXin Li       "1:                                        \n"
76*4e366538SXin Li       "vld1.8      {q0, q1}, [%0]!               \n"  // load row 1 and post inc
77*4e366538SXin Li       "vld1.8      {q2, q3}, [%1]!               \n"  // load row 2 and post inc
78*4e366538SXin Li       "subs        %3, %3, #16                   \n"  // 16 processed per loop
79*4e366538SXin Li       "vpaddl.u8   q0, q0                        \n"  // row 1 add adjacent
80*4e366538SXin Li       "vpaddl.u8   q1, q1                        \n"
81*4e366538SXin Li       "vpadal.u8   q0, q2                        \n"  // row 2 add adjacent +
82*4e366538SXin Li                                                       // row1
83*4e366538SXin Li       "vpadal.u8   q1, q3                        \n"
84*4e366538SXin Li       "vrshrn.u16  d0, q0, #2                    \n"  // downshift, round and
85*4e366538SXin Li                                                       // pack
86*4e366538SXin Li       "vrshrn.u16  d1, q1, #2                    \n"
87*4e366538SXin Li       "vst1.8      {q0}, [%2]!                   \n"
88*4e366538SXin Li       "bgt         1b                            \n"
89*4e366538SXin Li       : "+r"(src_ptr),     // %0
90*4e366538SXin Li         "+r"(src_stride),  // %1
91*4e366538SXin Li         "+r"(dst),         // %2
92*4e366538SXin Li         "+r"(dst_width)    // %3
93*4e366538SXin Li       :
94*4e366538SXin Li       : "q0", "q1", "q2", "q3"  // Clobber List
95*4e366538SXin Li   );
96*4e366538SXin Li }
97*4e366538SXin Li 
// Read 32x1, point-sample every 4th pixel, and write 8x1.
// dst_ptr[i] = src_ptr[4 * i + 2]: the vld4.8 de-interleave places
// bytes 2, 6, 10, ... in d2, which is the register stored.
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;  // single-row point sampler; stride unused
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "vst1.8      {d2}, [%1]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1", "memory", "cc");
}

// Read 16x4, box-average each 4x4 block, and write 4x1.
// vpaddl/vpadal accumulate each 4x4 box into 32-bit sums of 16 pixels,
// then vrshrn.u32 #4 divides by 16 with rounding.
void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  // Pointers to the 2nd, 3rd and 4th source rows.
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load up 16x4
      "vld1.8      {q1}, [%3]!                   \n"
      "vld1.8      {q2}, [%4]!                   \n"
      "vld1.8      {q3}, [%5]!                   \n"
      "subs        %2, %2, #4                    \n"
      "vpaddl.u8   q0, q0                        \n"
      "vpadal.u8   q0, q1                        \n"
      "vpadal.u8   q0, q2                        \n"
      "vpadal.u8   q0, q3                        \n"
      "vpaddl.u16  q0, q0                        \n"
      "vrshrn.u32  d0, q0, #4                    \n"  // divide by 16 w/rounding
      "vmovn.u16   d0, q0                        \n"
      "vst1.32     {d0[0]}, [%1]!                \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_ptr1),   // %3
        "+r"(src_ptr2),   // %4
        "+r"(src_ptr3)    // %5
      :
      : "q0", "q1", "q2", "q3", "memory", "cc");
}

// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
// Keeps source pixels 0, 1 and 3 of each group of 4: pixel 2 (d2) is
// overwritten by pixel 3 (d3) before the interleaved vst3 store.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;  // single-row point sampler; stride unused
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
      "subs        %2, %2, #24                   \n"
      "vmov        d2, d3                        \n"  // order d0, d1, d2
      "vst3.8      {d0, d1, d2}, [%1]!           \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "d0", "d1", "d2", "d3", "memory", "cc");
}

// 32 -> 24 with a 3:1 vertical blend toward row 0:
// first blends src rows 0 and 1 as (3 * row0 + row1 + 2) >> 2, then
// filters each group of 4 blended pixels down to 3 with horizontal
// weights 3:1, 1:1, 1:3 (see inline comments).
void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vmov.u8     d24, #3                       \n"  // filter weight
      "add         %3, %0                        \n"  // %3 = row 1 pointer
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"  // src line 1
      "subs        %2, %2, #24                   \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "vmovl.u8    q8, d4                        \n"
      "vmovl.u8    q9, d5                        \n"
      "vmovl.u8    q10, d6                       \n"
      "vmovl.u8    q11, d7                       \n"

      // 3 * line_0 + line_1
      "vmlal.u8    q8, d0, d24                   \n"
      "vmlal.u8    q9, d1, d24                   \n"
      "vmlal.u8    q10, d2, d24                  \n"
      "vmlal.u8    q11, d3, d24                  \n"

      // (3 * line_0 + line_1 + 2) >> 2
      "vqrshrn.u16 d0, q8, #2                    \n"
      "vqrshrn.u16 d1, q9, #2                    \n"
      "vqrshrn.u16 d2, q10, #2                   \n"
      "vqrshrn.u16 d3, q11, #2                   \n"

      // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
      "vmovl.u8    q8, d1                        \n"
      "vmlal.u8    q8, d0, d24                   \n"
      "vqrshrn.u16 d0, q8, #2                    \n"

      // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
      "vrhadd.u8   d1, d1, d2                    \n"

      // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
      "vmovl.u8    q8, d2                        \n"
      "vmlal.u8    q8, d3, d24                   \n"
      "vqrshrn.u16 d2, q8, #2                    \n"

      "vst3.8      {d0, d1, d2}, [%1]!           \n"

      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
        "cc");
}

ScaleRowDown34_1_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)228*4e366538SXin Li void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
229*4e366538SXin Li                                ptrdiff_t src_stride,
230*4e366538SXin Li                                uint8_t* dst_ptr,
231*4e366538SXin Li                                int dst_width) {
232*4e366538SXin Li   asm volatile(
233*4e366538SXin Li       "vmov.u8     d24, #3                       \n"
234*4e366538SXin Li       "add         %3, %0                        \n"
235*4e366538SXin Li       "1:                                        \n"
236*4e366538SXin Li       "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
237*4e366538SXin Li       "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"  // src line 1
238*4e366538SXin Li       "subs        %2, %2, #24                   \n"
239*4e366538SXin Li       // average src line 0 with src line 1
240*4e366538SXin Li       "vrhadd.u8   q0, q0, q2                    \n"
241*4e366538SXin Li       "vrhadd.u8   q1, q1, q3                    \n"
242*4e366538SXin Li 
243*4e366538SXin Li       // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
244*4e366538SXin Li       "vmovl.u8    q3, d1                        \n"
245*4e366538SXin Li       "vmlal.u8    q3, d0, d24                   \n"
246*4e366538SXin Li       "vqrshrn.u16 d0, q3, #2                    \n"
247*4e366538SXin Li 
248*4e366538SXin Li       // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
249*4e366538SXin Li       "vrhadd.u8   d1, d1, d2                    \n"
250*4e366538SXin Li 
251*4e366538SXin Li       // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
252*4e366538SXin Li       "vmovl.u8    q3, d2                        \n"
253*4e366538SXin Li       "vmlal.u8    q3, d3, d24                   \n"
254*4e366538SXin Li       "vqrshrn.u16 d2, q3, #2                    \n"
255*4e366538SXin Li 
256*4e366538SXin Li       "vst3.8      {d0, d1, d2}, [%1]!           \n"
257*4e366538SXin Li       "bgt         1b                            \n"
258*4e366538SXin Li       : "+r"(src_ptr),    // %0
259*4e366538SXin Li         "+r"(dst_ptr),    // %1
260*4e366538SXin Li         "+r"(dst_width),  // %2
261*4e366538SXin Li         "+r"(src_stride)  // %3
262*4e366538SXin Li       :
263*4e366538SXin Li       : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
264*4e366538SXin Li }
265*4e366538SXin Li 
#define HAS_SCALEROWDOWN38_NEON
// vtbl index table: selects 12 of every 32 bytes for the 3/8 point
// sampler (ScaleRowDown38_NEON). Trailing zeros are unused lanes.
static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
                              22, 24, 27, 30, 0,  0,  0,  0};
// vtbl index table used by the 38_*_Box filters to gather the 12
// filtered results from the three adjacent d registers {d0, d1, d2}.
static const uvec8 kShuf38_2 = {0,  8, 16, 2,  10, 17, 4, 12,
                                18, 6, 14, 19, 0,  0,  0, 0};
// Fixed-point reciprocals for vqrdmulh: 65536/12 divides a sum by 6 and
// 65536/18 divides by 9 (vqrdmulh doubles the product, supplying the
// missing factor of 2).
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};

// 32 -> 12: point samples 12 of every 32 pixels via the kShuf38 table
// lookup (vtbl), i.e. a 3/8 horizontal downscale with no filtering.
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;  // single-row point sampler; stride unused
  asm volatile(
      "vld1.8      {q3}, [%3]                    \n"  // load shuffle table
      "1:                                        \n"
      "vld1.8      {d0, d1, d2, d3}, [%0]!       \n"
      "subs        %2, %2, #12                   \n"
      "vtbl.u8     d4, {d0, d1, d2, d3}, d6      \n"
      "vtbl.u8     d5, {d0, d1, d2, d3}, d7      \n"
      "vst1.8      {d4}, [%1]!                   \n"  // store first 8
      "vst1.32     {d5[0]}, [%1]!                \n"  // store last 4
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
}

301*4e366538SXin Li // 32x3 -> 12x1
ScaleRowDown38_3_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)302*4e366538SXin Li void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
303*4e366538SXin Li                                       ptrdiff_t src_stride,
304*4e366538SXin Li                                       uint8_t* dst_ptr,
305*4e366538SXin Li                                       int dst_width) {
306*4e366538SXin Li   const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
307*4e366538SXin Li 
308*4e366538SXin Li   asm volatile(
309*4e366538SXin Li       "vld1.16     {q13}, [%5]                   \n"
310*4e366538SXin Li       "vld1.8      {q14}, [%6]                   \n"
311*4e366538SXin Li       "vld1.8      {q15}, [%7]                   \n"
312*4e366538SXin Li       "add         %3, %0                        \n"
313*4e366538SXin Li       "1:                                        \n"
314*4e366538SXin Li 
315*4e366538SXin Li       // d0 = 00 40 01 41 02 42 03 43
316*4e366538SXin Li       // d1 = 10 50 11 51 12 52 13 53
317*4e366538SXin Li       // d2 = 20 60 21 61 22 62 23 63
318*4e366538SXin Li       // d3 = 30 70 31 71 32 72 33 73
319*4e366538SXin Li       "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"
320*4e366538SXin Li       "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"
321*4e366538SXin Li       "vld4.8      {d16, d17, d18, d19}, [%4]!   \n"
322*4e366538SXin Li       "subs        %2, %2, #12                   \n"
323*4e366538SXin Li 
324*4e366538SXin Li       // Shuffle the input data around to get align the data
325*4e366538SXin Li       //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
326*4e366538SXin Li       // d0 = 00 10 01 11 02 12 03 13
327*4e366538SXin Li       // d1 = 40 50 41 51 42 52 43 53
328*4e366538SXin Li       "vtrn.u8     d0, d1                        \n"
329*4e366538SXin Li       "vtrn.u8     d4, d5                        \n"
330*4e366538SXin Li       "vtrn.u8     d16, d17                      \n"
331*4e366538SXin Li 
332*4e366538SXin Li       // d2 = 20 30 21 31 22 32 23 33
333*4e366538SXin Li       // d3 = 60 70 61 71 62 72 63 73
334*4e366538SXin Li       "vtrn.u8     d2, d3                        \n"
335*4e366538SXin Li       "vtrn.u8     d6, d7                        \n"
336*4e366538SXin Li       "vtrn.u8     d18, d19                      \n"
337*4e366538SXin Li 
338*4e366538SXin Li       // d0 = 00+10 01+11 02+12 03+13
339*4e366538SXin Li       // d2 = 40+50 41+51 42+52 43+53
340*4e366538SXin Li       "vpaddl.u8   q0, q0                        \n"
341*4e366538SXin Li       "vpaddl.u8   q2, q2                        \n"
342*4e366538SXin Li       "vpaddl.u8   q8, q8                        \n"
343*4e366538SXin Li 
344*4e366538SXin Li       // d3 = 60+70 61+71 62+72 63+73
345*4e366538SXin Li       "vpaddl.u8   d3, d3                        \n"
346*4e366538SXin Li       "vpaddl.u8   d7, d7                        \n"
347*4e366538SXin Li       "vpaddl.u8   d19, d19                      \n"
348*4e366538SXin Li 
349*4e366538SXin Li       // combine source lines
350*4e366538SXin Li       "vadd.u16    q0, q2                        \n"
351*4e366538SXin Li       "vadd.u16    q0, q8                        \n"
352*4e366538SXin Li       "vadd.u16    d4, d3, d7                    \n"
353*4e366538SXin Li       "vadd.u16    d4, d19                       \n"
354*4e366538SXin Li 
355*4e366538SXin Li       // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
356*4e366538SXin Li       //             + s[6 + st * 1] + s[7 + st * 1]
357*4e366538SXin Li       //             + s[6 + st * 2] + s[7 + st * 2]) / 6
358*4e366538SXin Li       "vqrdmulh.s16 q2, q2, q13                  \n"
359*4e366538SXin Li       "vmovn.u16   d4, q2                        \n"
360*4e366538SXin Li 
361*4e366538SXin Li       // Shuffle 2,3 reg around so that 2 can be added to the
362*4e366538SXin Li       //  0,1 reg and 3 can be added to the 4,5 reg. This
363*4e366538SXin Li       //  requires expanding from u8 to u16 as the 0,1 and 4,5
364*4e366538SXin Li       //  registers are already expanded. Then do transposes
365*4e366538SXin Li       //  to get aligned.
366*4e366538SXin Li       // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
367*4e366538SXin Li       "vmovl.u8    q1, d2                        \n"
368*4e366538SXin Li       "vmovl.u8    q3, d6                        \n"
369*4e366538SXin Li       "vmovl.u8    q9, d18                       \n"
370*4e366538SXin Li 
371*4e366538SXin Li       // combine source lines
372*4e366538SXin Li       "vadd.u16    q1, q3                        \n"
373*4e366538SXin Li       "vadd.u16    q1, q9                        \n"
374*4e366538SXin Li 
375*4e366538SXin Li       // d4 = xx 20 xx 30 xx 22 xx 32
376*4e366538SXin Li       // d5 = xx 21 xx 31 xx 23 xx 33
377*4e366538SXin Li       "vtrn.u32    d2, d3                        \n"
378*4e366538SXin Li 
379*4e366538SXin Li       // d4 = xx 20 xx 21 xx 22 xx 23
380*4e366538SXin Li       // d5 = xx 30 xx 31 xx 32 xx 33
381*4e366538SXin Li       "vtrn.u16    d2, d3                        \n"
382*4e366538SXin Li 
383*4e366538SXin Li       // 0+1+2, 3+4+5
384*4e366538SXin Li       "vadd.u16    q0, q1                        \n"
385*4e366538SXin Li 
386*4e366538SXin Li       // Need to divide, but can't downshift as the the value
387*4e366538SXin Li       //  isn't a power of 2. So multiply by 65536 / n
388*4e366538SXin Li       //  and take the upper 16 bits.
389*4e366538SXin Li       "vqrdmulh.s16 q0, q0, q15                  \n"
390*4e366538SXin Li 
391*4e366538SXin Li       // Align for table lookup, vtbl requires registers to
392*4e366538SXin Li       //  be adjacent
393*4e366538SXin Li       "vmov.u8     d2, d4                        \n"
394*4e366538SXin Li 
395*4e366538SXin Li       "vtbl.u8     d3, {d0, d1, d2}, d28         \n"
396*4e366538SXin Li       "vtbl.u8     d4, {d0, d1, d2}, d29         \n"
397*4e366538SXin Li 
398*4e366538SXin Li       "vst1.8      {d3}, [%1]!                   \n"
399*4e366538SXin Li       "vst1.32     {d4[0]}, [%1]!                \n"
400*4e366538SXin Li       "bgt         1b                            \n"
401*4e366538SXin Li       : "+r"(src_ptr),       // %0
402*4e366538SXin Li         "+r"(dst_ptr),       // %1
403*4e366538SXin Li         "+r"(dst_width),     // %2
404*4e366538SXin Li         "+r"(src_stride),    // %3
405*4e366538SXin Li         "+r"(src_ptr1)       // %4
406*4e366538SXin Li       : "r"(&kMult38_Div6),  // %5
407*4e366538SXin Li         "r"(&kShuf38_2),     // %6
408*4e366538SXin Li         "r"(&kMult38_Div9)   // %7
409*4e366538SXin Li       : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
410*4e366538SXin Li         "cc");
411*4e366538SXin Li }
412*4e366538SXin Li 
413*4e366538SXin Li // 32x2 -> 12x1
ScaleRowDown38_2_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)414*4e366538SXin Li void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
415*4e366538SXin Li                                ptrdiff_t src_stride,
416*4e366538SXin Li                                uint8_t* dst_ptr,
417*4e366538SXin Li                                int dst_width) {
418*4e366538SXin Li   asm volatile(
419*4e366538SXin Li       "vld1.16     {q13}, [%4]                   \n"
420*4e366538SXin Li       "vld1.8      {q14}, [%5]                   \n"
421*4e366538SXin Li       "add         %3, %0                        \n"
422*4e366538SXin Li       "1:                                        \n"
423*4e366538SXin Li 
424*4e366538SXin Li       // d0 = 00 40 01 41 02 42 03 43
425*4e366538SXin Li       // d1 = 10 50 11 51 12 52 13 53
426*4e366538SXin Li       // d2 = 20 60 21 61 22 62 23 63
427*4e366538SXin Li       // d3 = 30 70 31 71 32 72 33 73
428*4e366538SXin Li       "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"
429*4e366538SXin Li       "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"
430*4e366538SXin Li       "subs        %2, %2, #12                   \n"
431*4e366538SXin Li 
432*4e366538SXin Li       // Shuffle the input data around to get align the data
433*4e366538SXin Li       //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
434*4e366538SXin Li       // d0 = 00 10 01 11 02 12 03 13
435*4e366538SXin Li       // d1 = 40 50 41 51 42 52 43 53
436*4e366538SXin Li       "vtrn.u8     d0, d1                        \n"
437*4e366538SXin Li       "vtrn.u8     d4, d5                        \n"
438*4e366538SXin Li 
439*4e366538SXin Li       // d2 = 20 30 21 31 22 32 23 33
440*4e366538SXin Li       // d3 = 60 70 61 71 62 72 63 73
441*4e366538SXin Li       "vtrn.u8     d2, d3                        \n"
442*4e366538SXin Li       "vtrn.u8     d6, d7                        \n"
443*4e366538SXin Li 
444*4e366538SXin Li       // d0 = 00+10 01+11 02+12 03+13
445*4e366538SXin Li       // d2 = 40+50 41+51 42+52 43+53
446*4e366538SXin Li       "vpaddl.u8   q0, q0                        \n"
447*4e366538SXin Li       "vpaddl.u8   q2, q2                        \n"
448*4e366538SXin Li 
449*4e366538SXin Li       // d3 = 60+70 61+71 62+72 63+73
450*4e366538SXin Li       "vpaddl.u8   d3, d3                        \n"
451*4e366538SXin Li       "vpaddl.u8   d7, d7                        \n"
452*4e366538SXin Li 
453*4e366538SXin Li       // combine source lines
454*4e366538SXin Li       "vadd.u16    q0, q2                        \n"
455*4e366538SXin Li       "vadd.u16    d4, d3, d7                    \n"
456*4e366538SXin Li 
457*4e366538SXin Li       // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
458*4e366538SXin Li       "vqrshrn.u16 d4, q2, #2                    \n"
459*4e366538SXin Li 
460*4e366538SXin Li       // Shuffle 2,3 reg around so that 2 can be added to the
461*4e366538SXin Li       //  0,1 reg and 3 can be added to the 4,5 reg. This
462*4e366538SXin Li       //  requires expanding from u8 to u16 as the 0,1 and 4,5
463*4e366538SXin Li       //  registers are already expanded. Then do transposes
464*4e366538SXin Li       //  to get aligned.
465*4e366538SXin Li       // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
466*4e366538SXin Li       "vmovl.u8    q1, d2                        \n"
467*4e366538SXin Li       "vmovl.u8    q3, d6                        \n"
468*4e366538SXin Li 
469*4e366538SXin Li       // combine source lines
470*4e366538SXin Li       "vadd.u16    q1, q3                        \n"
471*4e366538SXin Li 
472*4e366538SXin Li       // d4 = xx 20 xx 30 xx 22 xx 32
473*4e366538SXin Li       // d5 = xx 21 xx 31 xx 23 xx 33
474*4e366538SXin Li       "vtrn.u32    d2, d3                        \n"
475*4e366538SXin Li 
476*4e366538SXin Li       // d4 = xx 20 xx 21 xx 22 xx 23
477*4e366538SXin Li       // d5 = xx 30 xx 31 xx 32 xx 33
478*4e366538SXin Li       "vtrn.u16    d2, d3                        \n"
479*4e366538SXin Li 
480*4e366538SXin Li       // 0+1+2, 3+4+5
481*4e366538SXin Li       "vadd.u16    q0, q1                        \n"
482*4e366538SXin Li 
483*4e366538SXin Li       // Need to divide, but can't downshift as the the value
484*4e366538SXin Li       //  isn't a power of 2. So multiply by 65536 / n
485*4e366538SXin Li       //  and take the upper 16 bits.
486*4e366538SXin Li       "vqrdmulh.s16 q0, q0, q13                  \n"
487*4e366538SXin Li 
488*4e366538SXin Li       // Align for table lookup, vtbl requires registers to
489*4e366538SXin Li       //  be adjacent
490*4e366538SXin Li       "vmov.u8     d2, d4                        \n"
491*4e366538SXin Li 
492*4e366538SXin Li       "vtbl.u8     d3, {d0, d1, d2}, d28         \n"
493*4e366538SXin Li       "vtbl.u8     d4, {d0, d1, d2}, d29         \n"
494*4e366538SXin Li 
495*4e366538SXin Li       "vst1.8      {d3}, [%1]!                   \n"
496*4e366538SXin Li       "vst1.32     {d4[0]}, [%1]!                \n"
497*4e366538SXin Li       "bgt         1b                            \n"
498*4e366538SXin Li       : "+r"(src_ptr),       // %0
499*4e366538SXin Li         "+r"(dst_ptr),       // %1
500*4e366538SXin Li         "+r"(dst_width),     // %2
501*4e366538SXin Li         "+r"(src_stride)     // %3
502*4e366538SXin Li       : "r"(&kMult38_Div6),  // %4
503*4e366538SXin Li         "r"(&kShuf38_2)      // %5
504*4e366538SXin Li       : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
505*4e366538SXin Li }
506*4e366538SXin Li 
// 1:2 horizontal upsample with a linear [3/4, 1/4] filter.
// src_temp reads one byte ahead of src_ptr so each iteration has both
// the "near" (d4) and "far" (d5) neighbor for every output sample;
// vst2 interleaves the even/odd results. dst_width counts output
// samples (16 per loop from 8 input bytes per pointer).
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  const uint8_t* src_temp = src_ptr + 1;
  asm volatile(
      "vmov.u8     d30, #3                       \n"  // filter weight

      "1:                                        \n"
      "vld1.8      {d4}, [%0]!                   \n"  // 01234567
      "vld1.8      {d5}, [%3]!                   \n"  // 12345678

      "vmovl.u8    q0, d4                        \n"  // 01234567 (16b)
      "vmovl.u8    q1, d5                        \n"  // 12345678 (16b)
      "vmlal.u8    q0, d5, d30                   \n"  // 3*near+far (odd)
      "vmlal.u8    q1, d4, d30                   \n"  // 3*near+far (even)

      "vrshrn.u16  d1, q0, #2                    \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u16  d0, q1, #2                    \n"  // 3/4*near+1/4*far (even)

      "vst2.8      {d0, d1}, [%1]!               \n"  // store
      "subs        %2, %2, #16                   \n"  // 8 sample -> 16 sample
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q15"  // Clobber List
  );
}

// 2x upsample in both directions of 8-bit pixels using the bilinear
// 9:3:3:1 kernel.  Each iteration reads 8 pixels from two adjacent
// source rows and writes 16 pixels to two destination rows.  The
// kernel is applied as a horizontal (3*near + far) pass per row,
// followed by a 3:1 vertical blend and a rounding shift by 4
// (i.e. divide by 16).
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint8_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint8_t* src_temp = src_ptr + 1;           // row 1, one pixel ahead
  const uint8_t* src_temp1 = src_ptr1 + 1;         // row 2, one pixel ahead

  asm volatile(
      "vmov.u16    q15, #3                       \n"  // vertical filter weight
      "vmov.u8     d28, #3                       \n"  // horizontal filter weight

      "1:                                        \n"
      "vld1.8      {d4}, [%0]!                   \n"  // 01234567
      "vld1.8      {d5}, [%5]!                   \n"  // 12345678

      "vmovl.u8    q0, d4                        \n"  // 01234567 (16b)
      "vmovl.u8    q1, d5                        \n"  // 12345678 (16b)
      "vmlal.u8    q0, d5, d28                   \n"  // 3*near+far (1, odd)
      "vmlal.u8    q1, d4, d28                   \n"  // 3*near+far (1, even)

      "vld1.8      {d8}, [%1]!                   \n"  // second row, 01234567
      "vld1.8      {d9}, [%6]!                   \n"  // second row, 12345678

      "vmovl.u8    q2, d8                        \n"
      "vmovl.u8    q3, d9                        \n"
      "vmlal.u8    q2, d9, d28                   \n"  // 3*near+far (2, odd)
      "vmlal.u8    q3, d8, d28                   \n"  // 3*near+far (2, even)

      // e  o
      // q1 q0
      // q3 q2

      "vmovq       q4, q2                        \n"
      "vmovq       q5, q3                        \n"
      "vmla.u16    q4, q0, q15                   \n"  // 9 3 3 1 (1, odd)
      "vmla.u16    q5, q1, q15                   \n"  // 9 3 3 1 (1, even)
      "vmla.u16    q0, q2, q15                   \n"  // 9 3 3 1 (2, odd)
      "vmla.u16    q1, q3, q15                   \n"  // 9 3 3 1 (2, even)

      // e  o
      // q5 q4
      // q1 q0

      "vrshrn.u16  d2, q1, #4                    \n"  // 2, even
      "vrshrn.u16  d3, q0, #4                    \n"  // 2, odd
      "vrshrn.u16  d0, q5, #4                    \n"  // 1, even
      "vrshrn.u16  d1, q4, #4                    \n"  // 1, odd

      "vst2.8      {d0, d1}, [%2]!               \n"  // store row 1
      "vst2.8      {d2, d3}, [%3]!               \n"  // store row 2
      "subs        %4, %4, #16                   \n"  // 8 sample -> 16 sample
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
        "q15"  // Clobber List
  );
}
605*4e366538SXin Li 
// 2x horizontal upsample of a 16-bit row using the linear filter
// (3*near + far + 2) / 4, computed directly in 16-bit lanes (no
// widening).  The "_12" name matches the bilinear variant, whose
// 9:3:3:1 intermediates reach 16x the input and therefore require
// samples to fit in 12 bits; for this linear pass the intermediates
// only reach 4x the input.
void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  const uint16_t* src_temp = src_ptr + 1;  // source shifted by one sample
  asm volatile(
      "vmov.u16    q15, #3                       \n"  // filter weight 3

      "1:                                        \n"
      "vld1.16     {q1}, [%0]!                   \n"  // 01234567 (16b)
      "vld1.16     {q0}, [%3]!                   \n"  // 12345678 (16b)

      "vmovq       q2, q0                        \n"  // keep a copy of 12345678
      "vmla.u16    q0, q1, q15                   \n"  // 3*near+far (odd)
      "vmla.u16    q1, q2, q15                   \n"  // 3*near+far (even)

      "vrshr.u16   q0, q0, #2                    \n"  // 3/4*near+1/4*far (odd)
      "vrshr.u16   q1, q1, #2                    \n"  // 3/4*near+1/4*far (even)

      "vst2.16     {d0, d1, d2, d3}, [%1]!       \n"  // store interleaved
      "subs        %2, %2, #16                   \n"  // 8 sample -> 16 sample
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q15"  // Clobber List
  );
}
635*4e366538SXin Li 
// 2x upsample in both directions of 16-bit pixels using the bilinear
// 9:3:3:1 kernel, computed entirely in 16-bit lanes.  The combined
// intermediate sums reach 16x the input value (16 * 4095 = 65520), so
// input samples must fit in 12 bits to avoid 16-bit overflow — hence
// the "_12" suffix.
void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint16_t* src_temp = src_ptr + 1;           // row 1, one sample ahead
  const uint16_t* src_temp1 = src_ptr1 + 1;         // row 2, one sample ahead

  asm volatile(
      "vmov.u16    q15, #3                       \n"  // filter weight 3

      "1:                                        \n"
      "vld1.16     {q0}, [%0]!                   \n"  // 01234567 (16b)
      "vld1.16     {q1}, [%5]!                   \n"  // 12345678 (16b)

      "vmovq       q2, q0                        \n"
      "vmla.u16    q0, q1, q15                   \n"  // 3*near+far (odd)
      "vmla.u16    q1, q2, q15                   \n"  // 3*near+far (even)

      "vld1.16     {q2}, [%1]!                   \n"  // 01234567 (16b)
      "vld1.16     {q3}, [%6]!                   \n"  // 12345678 (16b)

      "vmovq       q4, q2                        \n"
      "vmla.u16    q2, q3, q15                   \n"  // 3*near+far (odd)
      "vmla.u16    q3, q4, q15                   \n"  // 3*near+far (even)

      "vmovq       q4, q2                        \n"
      "vmovq       q5, q3                        \n"
      "vmla.u16    q4, q0, q15                   \n"  // 9 3 3 1 (1, odd)
      "vmla.u16    q5, q1, q15                   \n"  // 9 3 3 1 (1, even)
      "vmla.u16    q0, q2, q15                   \n"  // 9 3 3 1 (2, odd)
      "vmla.u16    q1, q3, q15                   \n"  // 9 3 3 1 (2, even)

      "vrshr.u16   q2, q1, #4                    \n"  // 2, even
      "vrshr.u16   q3, q0, #4                    \n"  // 2, odd
      "vrshr.u16   q0, q5, #4                    \n"  // 1, even
      "vrshr.u16   q1, q4, #4                    \n"  // 1, odd

      "vst2.16     {d0, d1, d2, d3}, [%2]!       \n"  // store row 1
      "vst2.16     {d4, d5, d6, d7}, [%3]!       \n"  // store row 2
      "subs        %4, %4, #16                   \n"  // 8 sample -> 16 sample
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
        "q15"  // Clobber List
  );
}
692*4e366538SXin Li 
ScaleRowUp2_Linear_16_NEON(const uint16_t * src_ptr,uint16_t * dst_ptr,int dst_width)693*4e366538SXin Li void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
694*4e366538SXin Li                                 uint16_t* dst_ptr,
695*4e366538SXin Li                                 int dst_width) {
696*4e366538SXin Li   const uint16_t* src_temp = src_ptr + 1;
697*4e366538SXin Li   asm volatile(
698*4e366538SXin Li       "vmov.u16    d31, #3                       \n"
699*4e366538SXin Li 
700*4e366538SXin Li       "1:                                        \n"
701*4e366538SXin Li       "vld1.16     {q0}, [%0]!                   \n"  // 01234567 (16b)
702*4e366538SXin Li       "vld1.16     {q1}, [%3]!                   \n"  // 12345678 (16b)
703*4e366538SXin Li 
704*4e366538SXin Li       "vmovl.u16   q2, d0                        \n"  // 0123 (32b)
705*4e366538SXin Li       "vmovl.u16   q3, d1                        \n"  // 4567 (32b)
706*4e366538SXin Li       "vmovl.u16   q4, d2                        \n"  // 1234 (32b)
707*4e366538SXin Li       "vmovl.u16   q5, d3                        \n"  // 5678 (32b)
708*4e366538SXin Li 
709*4e366538SXin Li       "vmlal.u16   q2, d2, d31                   \n"
710*4e366538SXin Li       "vmlal.u16   q3, d3, d31                   \n"
711*4e366538SXin Li       "vmlal.u16   q4, d0, d31                   \n"
712*4e366538SXin Li       "vmlal.u16   q5, d1, d31                   \n"
713*4e366538SXin Li 
714*4e366538SXin Li       "vrshrn.u32  d0, q4, #2                    \n"
715*4e366538SXin Li       "vrshrn.u32  d1, q5, #2                    \n"
716*4e366538SXin Li       "vrshrn.u32  d2, q2, #2                    \n"
717*4e366538SXin Li       "vrshrn.u32  d3, q3, #2                    \n"
718*4e366538SXin Li 
719*4e366538SXin Li       "vst2.16     {q0, q1}, [%1]!               \n"  // store
720*4e366538SXin Li       "subs        %2, %2, #16                   \n"  // 8 sample -> 16 sample
721*4e366538SXin Li       "bgt         1b                            \n"
722*4e366538SXin Li       : "+r"(src_ptr),    // %0
723*4e366538SXin Li         "+r"(dst_ptr),    // %1
724*4e366538SXin Li         "+r"(dst_width),  // %2
725*4e366538SXin Li         "+r"(src_temp)    // %3
726*4e366538SXin Li       :
727*4e366538SXin Li       : "memory", "cc", "q0", "q1", "q2", "q15"  // Clobber List
728*4e366538SXin Li   );
729*4e366538SXin Li }
730*4e366538SXin Li 
// 2x upsample in both directions of 16-bit pixels using the bilinear
// 9:3:3:1 kernel.  Samples are widened to 32 bits before the
// multiply-accumulates, so the full 16-bit input range is supported.
// Reads 4 samples from each of two adjacent source rows and writes 8
// samples to each of two destination rows per iteration.
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint16_t* src_temp = src_ptr + 1;           // row 1, one sample ahead
  const uint16_t* src_temp1 = src_ptr1 + 1;         // row 2, one sample ahead

  asm volatile(
      "vmov.u16    d31, #3                       \n"  // horizontal weight (16b)
      "vmov.u32    q14, #3                       \n"  // vertical weight (32b)

      "1:                                        \n"
      "vld1.16     {d0}, [%0]!                   \n"  // 0123 (16b)
      "vld1.16     {d1}, [%5]!                   \n"  // 1234 (16b)
      "vmovl.u16   q2, d0                        \n"  // 0123 (32b)
      "vmovl.u16   q3, d1                        \n"  // 1234 (32b)
      "vmlal.u16   q2, d1, d31                   \n"  // 3*near+far (1, odd)
      "vmlal.u16   q3, d0, d31                   \n"  // 3*near+far (1, even)

      "vld1.16     {d0}, [%1]!                   \n"  // 0123 (16b)
      "vld1.16     {d1}, [%6]!                   \n"  // 1234 (16b)
      "vmovl.u16   q4, d0                        \n"  // 0123 (32b)
      "vmovl.u16   q5, d1                        \n"  // 1234 (32b)
      "vmlal.u16   q4, d1, d31                   \n"  // 3*near+far (2, odd)
      "vmlal.u16   q5, d0, d31                   \n"  // 3*near+far (2, even)

      "vmovq       q0, q4                        \n"
      "vmovq       q1, q5                        \n"
      "vmla.u32    q4, q2, q14                   \n"  // 9 3 3 1 (1, odd)
      "vmla.u32    q5, q3, q14                   \n"  // 9 3 3 1 (1, even)
      "vmla.u32    q2, q0, q14                   \n"  // 9 3 3 1 (2, odd)
      "vmla.u32    q3, q1, q14                   \n"  // 9 3 3 1 (2, even)

      "vrshrn.u32  d1, q4, #4                    \n"  // 1, odd
      "vrshrn.u32  d0, q5, #4                    \n"  // 1, even
      "vrshrn.u32  d3, q2, #4                    \n"  // 2, odd
      "vrshrn.u32  d2, q3, #4                    \n"  // 2, even

      "vst2.16     {d0, d1}, [%2]!               \n"  // store row 1
      "vst2.16     {d2, d3}, [%3]!               \n"  // store row 2
      "subs        %4, %4, #8                    \n"  // 4 sample -> 8 sample
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
        "d31"  // Clobber List
  );
}
788*4e366538SXin Li 
// 2x horizontal upsample of an interleaved 8-bit UV row using the
// linear filter (3*near + far + 2) / 4.  Because U and V are
// interleaved, the "far" neighbour is 2 bytes (one UV pair) ahead,
// and vst2.16 interleaves whole 16-bit UV pairs on store.
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
  const uint8_t* src_temp = src_ptr + 2;  // source shifted by one UV pair
  asm volatile(
      "vmov.u8     d30, #3                       \n"  // filter weight 3

      "1:                                        \n"
      "vld1.8      {d4}, [%0]!                   \n"  // 00112233 (1u1v)
      "vld1.8      {d5}, [%3]!                   \n"  // 11223344 (1u1v)

      "vmovl.u8    q0, d4                        \n"  // 00112233 (1u1v, 16b)
      "vmovl.u8    q1, d5                        \n"  // 11223344 (1u1v, 16b)
      "vmlal.u8    q0, d5, d30                   \n"  // 3*near+far (odd)
      "vmlal.u8    q1, d4, d30                   \n"  // 3*near+far (even)

      "vrshrn.u16  d1, q0, #2                    \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u16  d0, q1, #2                    \n"  // 3/4*near+1/4*far (even)

      "vst2.16     {d0, d1}, [%1]!               \n"  // store interleaved UV pairs
      "subs        %2, %2, #8                    \n"  // 4 uv -> 8 uv
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "d30"  // Clobber List
  );
}
819*4e366538SXin Li 
// 2x upsample in both directions of interleaved 8-bit UV pixels using
// the bilinear 9:3:3:1 kernel.  Identical structure to
// ScaleRowUp2_Bilinear_NEON, except the horizontal "far" neighbour is
// one UV pair (2 bytes) ahead and stores interleave whole 16-bit UV
// pairs via vst2.16.
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint8_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint8_t* src_temp = src_ptr + 2;           // row 1, one UV pair ahead
  const uint8_t* src_temp1 = src_ptr1 + 2;         // row 2, one UV pair ahead

  asm volatile(
      "vmov.u16    q15, #3                       \n"  // vertical filter weight
      "vmov.u8     d28, #3                       \n"  // horizontal filter weight

      "1:                                        \n"
      "vld1.8      {d4}, [%0]!                   \n"  // 00112233 (1u1v)
      "vld1.8      {d5}, [%5]!                   \n"  // 11223344 (1u1v)

      "vmovl.u8    q0, d4                        \n"  // 00112233 (1u1v, 16b)
      "vmovl.u8    q1, d5                        \n"  // 11223344 (1u1v, 16b)
      "vmlal.u8    q0, d5, d28                   \n"  // 3*near+far (1, odd)
      "vmlal.u8    q1, d4, d28                   \n"  // 3*near+far (1, even)

      "vld1.8      {d8}, [%1]!                   \n"  // 00112233 (1u1v)
      "vld1.8      {d9}, [%6]!                   \n"  // 11223344 (1u1v)

      "vmovl.u8    q2, d8                        \n"  // 00112233 (1u1v, 16b)
      "vmovl.u8    q3, d9                        \n"  // 11223344 (1u1v, 16b)
      "vmlal.u8    q2, d9, d28                   \n"  // 3*near+far (2, odd)
      "vmlal.u8    q3, d8, d28                   \n"  // 3*near+far (2, even)

      // e  o
      // q1 q0
      // q3 q2

      "vmovq       q4, q2                        \n"
      "vmovq       q5, q3                        \n"
      "vmla.u16    q4, q0, q15                   \n"  // 9 3 3 1 (1, odd)
      "vmla.u16    q5, q1, q15                   \n"  // 9 3 3 1 (1, even)
      "vmla.u16    q0, q2, q15                   \n"  // 9 3 3 1 (2, odd)
      "vmla.u16    q1, q3, q15                   \n"  // 9 3 3 1 (2, even)

      // e  o
      // q5 q4
      // q1 q0

      "vrshrn.u16  d2, q1, #4                    \n"  // 2, even
      "vrshrn.u16  d3, q0, #4                    \n"  // 2, odd
      "vrshrn.u16  d0, q5, #4                    \n"  // 1, even
      "vrshrn.u16  d1, q4, #4                    \n"  // 1, odd

      "vst2.16     {d0, d1}, [%2]!               \n"  // store row 1
      "vst2.16     {d2, d3}, [%3]!               \n"  // store row 2
      "subs        %4, %4, #8                    \n"  // 4 uv -> 8 uv
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
        "q15"  // Clobber List
  );
}
887*4e366538SXin Li 
// 2x horizontal upsample of an interleaved 16-bit UV row using the
// linear filter (3*near + far + 2) / 4.  Samples are widened to
// 32 bits before the multiply-accumulate, so full 16-bit range is
// supported.  The "far" neighbour is one UV pair (2 samples) ahead;
// vst2.32 interleaves whole 32-bit UV pairs on store.
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  const uint16_t* src_temp = src_ptr + 2;  // source shifted by one UV pair
  asm volatile(
      "vmov.u16    d30, #3                       \n"  // filter weight 3

      "1:                                        \n"
      "vld1.16     {q0}, [%0]!                   \n"  // 00112233 (1u1v, 16)
      "vld1.16     {q1}, [%3]!                   \n"  // 11223344 (1u1v, 16)

      "vmovl.u16   q2, d0                        \n"  // 0011 (1u1v, 32b)
      "vmovl.u16   q3, d2                        \n"  // 1122 (1u1v, 32b)
      "vmovl.u16   q4, d1                        \n"  // 2233 (1u1v, 32b)
      "vmovl.u16   q5, d3                        \n"  // 3344 (1u1v, 32b)
      "vmlal.u16   q2, d2, d30                   \n"  // 3*near+far (odd)
      "vmlal.u16   q3, d0, d30                   \n"  // 3*near+far (even)
      "vmlal.u16   q4, d3, d30                   \n"  // 3*near+far (odd)
      "vmlal.u16   q5, d1, d30                   \n"  // 3*near+far (even)

      "vrshrn.u32  d1, q2, #2                    \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u32  d0, q3, #2                    \n"  // 3/4*near+1/4*far (even)
      "vrshrn.u32  d3, q4, #2                    \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u32  d2, q5, #2                    \n"  // 3/4*near+1/4*far (even)

      "vst2.32     {d0, d1}, [%1]!               \n"  // store low half
      "vst2.32     {d2, d3}, [%1]!               \n"  // store high half
      "subs        %2, %2, #8                    \n"  // 4 uv -> 8 uv
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
        "d30"  // Clobber List
  );
}
926*4e366538SXin Li 
// 2x upsample in both directions of interleaved 16-bit UV pixels using
// the bilinear 9:3:3:1 kernel.  Samples are widened to 32 bits before
// the multiply-accumulates.  Reads 2 UV pairs from each of two source
// rows and writes 4 UV pairs to each of two destination rows per loop.
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint16_t* src_temp = src_ptr + 2;           // row 1, one UV pair ahead
  const uint16_t* src_temp1 = src_ptr1 + 2;         // row 2, one UV pair ahead

  asm volatile(
      "vmov.u16    d30, #3                       \n"  // horizontal weight (16b)
      "vmov.u32    q14, #3                       \n"  // vertical weight (32b)

      "1:                                        \n"
      // NOTE(review): vld1.8 on 16-bit data loads the same 8 bytes as
      // vld1.16 on little-endian; element size only affects alignment
      // hints here — presumably intentional, but vld1.16 would be clearer.
      "vld1.8      {d0}, [%0]!                   \n"  // 0011 (1u1v)
      "vld1.8      {d1}, [%5]!                   \n"  // 1122 (1u1v)
      "vmovl.u16   q2, d0                        \n"  // 0011 (1u1v, 32b)
      "vmovl.u16   q3, d1                        \n"  // 1122 (1u1v, 32b)
      "vmlal.u16   q2, d1, d30                   \n"  // 3*near+far (1, odd)
      "vmlal.u16   q3, d0, d30                   \n"  // 3*near+far (1, even)

      "vld1.8      {d0}, [%1]!                   \n"  // 0011 (1u1v)
      "vld1.8      {d1}, [%6]!                   \n"  // 1122 (1u1v)
      "vmovl.u16   q4, d0                        \n"  // 0011 (1u1v, 32b)
      "vmovl.u16   q5, d1                        \n"  // 1122 (1u1v, 32b)
      "vmlal.u16   q4, d1, d30                   \n"  // 3*near+far (2, odd)
      "vmlal.u16   q5, d0, d30                   \n"  // 3*near+far (2, even)

      "vmovq       q0, q4                        \n"
      "vmovq       q1, q5                        \n"
      "vmla.u32    q4, q2, q14                   \n"  // 9 3 3 1 (1, odd)
      "vmla.u32    q5, q3, q14                   \n"  // 9 3 3 1 (1, even)
      "vmla.u32    q2, q0, q14                   \n"  // 9 3 3 1 (2, odd)
      "vmla.u32    q3, q1, q14                   \n"  // 9 3 3 1 (2, even)

      "vrshrn.u32  d1, q4, #4                    \n"  // 1, odd
      "vrshrn.u32  d0, q5, #4                    \n"  // 1, even
      "vrshrn.u32  d3, q2, #4                    \n"  // 2, odd
      "vrshrn.u32  d2, q3, #4                    \n"  // 2, even

      "vst2.32     {d0, d1}, [%2]!               \n"  // store row 1
      "vst2.32     {d2, d3}, [%3]!               \n"  // store row 2
      "subs        %4, %4, #4                    \n"  // 2 uv -> 4 uv
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
        "d30"  // Clobber List
  );
}
984*4e366538SXin Li 
// Add a row of bytes to a row of shorts.  Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
// Note: dst_ptr is both read (load accumulator) and written (store),
// so the destination pointer is only advanced on the store.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1:                                        \n"
      "vld1.16     {q1, q2}, [%1]                \n"  // load accumulator
      "vld1.8      {q0}, [%0]!                   \n"  // load 16 bytes
      "vaddw.u8    q2, q2, d1                    \n"  // add high 8 bytes
      "vaddw.u8    q1, q1, d0                    \n"  // add low 8 bytes
      "vst1.16     {q1, q2}, [%1]!               \n"  // store accumulator
      "subs        %2, %2, #16                   \n"  // 16 processed per loop
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2"  // Clobber List
  );
}
1006*4e366538SXin Li 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Loads the adjacent byte pair at src_ptr[x >> 16] into lane n of d6/d7,
// then advances x (%3) by dx (%4).  %5 is a scratch index register and
// %6 a scratch pointer.
#define LOAD2_DATA8_LANE(n)                      \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5                     \n" \
  "add        %3, %3, %4                     \n" \
  "vld2.8     {d6[" #n "], d7[" #n "]}, [%6] \n"
1014*4e366538SXin Li 
// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

// Scales one row horizontally with bilinear filtering.  x and dx are 16.16
// fixed point: x is the starting source position, dx the per-destination-
// pixel step.  Produces 8 pixels per iteration; assumes dst_width is a
// multiple of 8 — TODO confirm with callers.
void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // lane offsets used to build x + i * dx
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;  // scratch pointer for LOAD2_DATA8_LANE (%6)
  asm volatile (
      "vdup.32     q0, %3                        \n"  // x
      "vdup.32     q1, %4                        \n"  // dx
      "vld1.32     {q2}, [%5]                    \n"  // 0 1 2 3
      "vshl.i32    q3, q1, #2                    \n"  // 4 * dx
      "vmul.s32    q1, q1, q2                    \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "vadd.s32    q1, q1, q0                    \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "vadd.s32    q2, q1, q3                    \n"
      "vshl.i32    q0, q3, #1                    \n"  // 8 * dx
      "1:                                        \n"
    // Gather 8 (a, b) adjacent source byte pairs into d6 (a) and d7 (b).
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
      "vmov        q10, q1                       \n"
      "vmov        q11, q2                       \n"
      "vuzp.16     q10, q11                      \n"  // q10 = low 16 bits of each x (fraction f)
      "vmovl.u8    q8, d6                        \n"  // widen a to 16 bits
      "vmovl.u8    q9, d7                        \n"  // widen b to 16 bits
      "vsubl.s16   q11, d18, d16                 \n"  // b - a (low half)
      "vsubl.s16   q12, d19, d17                 \n"  // b - a (high half)
      "vmovl.u16   q13, d20                      \n"  // widen f to 32 bits
      "vmovl.u16   q10, d21                      \n"
      "vmul.s32    q11, q11, q13                 \n"  // f * (b - a)
      "vmul.s32    q12, q12, q10                 \n"
      "vrshrn.s32  d18, q11, #16                 \n"  // round and shift right 16
      "vrshrn.s32  d19, q12, #16                 \n"
      "vadd.s16    q8, q8, q9                    \n"  // a + rounded f*(b-a)>>16
      "vmovn.s16   d6, q8                        \n"  // narrow to 8 bytes

      "vst1.8      {d6}, [%0]!                   \n"  // store pixels
      "vadd.s32    q1, q1, q0                    \n"  // advance x values by 8*dx
      "vadd.s32    q2, q2, q0                    \n"
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "bgt         1b                            \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}
1080*4e366538SXin Li 
1081*4e366538SXin Li #undef LOAD2_DATA8_LANE
1082*4e366538SXin Li 
// 16x2 -> 16x1
// Blends two adjacent rows into one, weighting the second row (src_ptr +
// src_stride) by source_y_fraction/256.  Fractions 0, 64, 128 and 192 take
// fast paths.  NOTE: after the loop, one extra byte replicating the last
// output pixel (d1[7]) is stored at dst_ptr[dst_width] — the destination
// must have room for it.
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "cmp         %4, #0                        \n"  // fraction 0: copy row 0
      "beq         100f                          \n"
      "add         %2, %1                        \n"  // %2 -> second row
      "cmp         %4, #64                       \n"  // 64: 3/4 row0 + 1/4 row1
      "beq         75f                           \n"
      "cmp         %4, #128                      \n"  // 128: plain average
      "beq         50f                           \n"
      "cmp         %4, #192                      \n"  // 192: 1/4 row0 + 3/4 row1
      "beq         25f                           \n"

      "vdup.8      d5, %4                        \n"  // weight for row 1
      "rsb         %4, #256                      \n"  // 256 - fraction
      "vdup.8      d4, %4                        \n"  // weight for row 0
      // General purpose row blend.
      "1:                                        \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vmull.u8    q13, d0, d4                   \n"
      "vmull.u8    q14, d1, d4                   \n"
      "vmlal.u8    q13, d2, d5                   \n"
      "vmlal.u8    q14, d3, d5                   \n"
      "vrshrn.u16  d0, q13, #8                   \n"
      "vrshrn.u16  d1, q14, #8                   \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         1b                            \n"
      "b           99f                           \n"

      // Blend 25 / 75.
      "25:                                       \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vrhadd.u8   q0, q1                        \n"  // two rounding half-adds
      "vrhadd.u8   q0, q1                        \n"  // yield 1/4*q0 + 3/4*q1
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         25b                           \n"
      "b           99f                           \n"

      // Blend 50 / 50.
      "50:                                       \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vrhadd.u8   q0, q1                        \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         50b                           \n"
      "b           99f                           \n"

      // Blend 75 / 25.
      "75:                                       \n"
      "vld1.8      {q1}, [%1]!                   \n"  // rows swapped vs 25 case
      "vld1.8      {q0}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vrhadd.u8   q0, q1                        \n"
      "vrhadd.u8   q0, q1                        \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         75b                           \n"
      "b           99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100:                                      \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         100b                          \n"

      "99:                                       \n"
      "vst1.8      {d1[7]}, [%0]                 \n"  // duplicate last pixel
      : "+r"(dst_ptr),           // %0
        "+r"(src_ptr),           // %1
        "+r"(src_stride),        // %2
        "+r"(dst_width),         // %3
        "+r"(source_y_fraction)  // %4
      :
      : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
}
1167*4e366538SXin Li 
// Read 16 ARGB pixels, keep the odd-indexed ones, and write 8 ARGB pixels.
void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // vld4.32 de-interleaves by pixel index mod 4:
      // q0 = pixels 0,4,8,12; q1 = 1,5,9,13; q2 = 2,6,10,14; q3 = 3,7,11,15.
      "vld4.32     {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.32     {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "vmov        q2, q1                        \n"  // copy pixels 1,5,9,13
      "vst2.32     {q2, q3}, [%1]!               \n"  // store odd pixels
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
1188*4e366538SXin Li 
// Disassembly reference (inner loop of ScaleARGBRowDown2_C):
//  46:  f964 018d   vld4.32  {d16,d18,d20,d22}, [r4]!
//  4a:  3e04        subs  r6, #4
//  4c:  f964 118d   vld4.32  {d17,d19,d21,d23}, [r4]!
//  50:  ef64 21f4   vorr  q9, q10, q10
//  54:  f942 038d   vst2.32  {d16-d19}, [r2]!
//  58:  d1f5        bne.n  46 <ScaleARGBRowDown2_C+0x46>
1195*4e366538SXin Li 
// Read 16 ARGB pixels and average each adjacent pair to 8 ARGB pixels.
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // q0 = pixels 0,4,8,12; q1 = 1,5,9,13; q2 = 2,6,10,14; q3 = 3,7,11,15.
      "vld4.32     {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.32     {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "vrhadd.u8   q0, q0, q1                    \n"  // rounding half add
      "vrhadd.u8   q1, q2, q3                    \n"  // rounding half add
      "vst2.32     {q0, q1}, [%1]!               \n"  // interleave pair averages
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
1217*4e366538SXin Li 
// Read 16x2 ARGB pixels and box-average each 2x2 block to 8 ARGB pixels.
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      // vld4.8 splits channels: q0=B, q1=G, q2=R, q3=A.
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
      "vpaddl.u8   q3, q3                        \n"  // A 16 bytes -> 8 shorts.
      "vld4.8      {d16, d18, d20, d22}, [%1]!   \n"  // load 8 more ARGB
      "vld4.8      {d17, d19, d21, d23}, [%1]!   \n"  // load last 8 ARGB
      "vpadal.u8   q0, q8                        \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8   q1, q9                        \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8   q2, q10                       \n"  // R 16 bytes -> 8 shorts.
      "vpadal.u8   q3, q11                       \n"  // A 16 bytes -> 8 shorts.
      "vrshrn.u16  d0, q0, #2                    \n"  // round and pack to bytes
      "vrshrn.u16  d1, q1, #2                    \n"
      "vrshrn.u16  d2, q2, #2                    \n"
      "vrshrn.u16  d3, q3, #2                    \n"
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // re-interleave channels
      "bgt         1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
1252*4e366538SXin Li 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// Copies every src_stepx-th ARGB pixel to the destination (no filtering).
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "mov         r12, %3, lsl #2               \n"  // byte step = stepx * 4
      "1:                                        \n"
      "vld1.32     {d0[0]}, [%0], r12            \n"
      "vld1.32     {d0[1]}, [%0], r12            \n"
      "vld1.32     {d1[0]}, [%0], r12            \n"
      "vld1.32     {d1[1]}, [%0], r12            \n"
      "subs        %2, %2, #4                    \n"  // 4 pixels per loop.
      "vst1.8      {q0}, [%1]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      : "r"(src_stepx)   // %3
      : "memory", "cc", "r12", "q0");
}
1277*4e366538SXin Li 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// Box-averages 2x2 ARGB blocks whose columns are src_stepx pixels apart.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "mov         r12, %4, lsl #2               \n"  // byte step = stepx * 4
      "add         %1, %1, %0                    \n"  // %1 -> second row
      "1:                                        \n"
      "vld1.8      {d0}, [%0], r12               \n"  // 4 2x2 blocks -> 2x1
      "vld1.8      {d1}, [%1], r12               \n"
      "vld1.8      {d2}, [%0], r12               \n"
      "vld1.8      {d3}, [%1], r12               \n"
      "vld1.8      {d4}, [%0], r12               \n"
      "vld1.8      {d5}, [%1], r12               \n"
      "vld1.8      {d6}, [%0], r12               \n"
      "vld1.8      {d7}, [%1], r12               \n"
      "vaddl.u8    q0, d0, d1                    \n"  // vertical sums, 16-bit
      "vaddl.u8    q1, d2, d3                    \n"
      "vaddl.u8    q2, d4, d5                    \n"
      "vaddl.u8    q3, d6, d7                    \n"
      "vswp.8      d1, d2                        \n"  // ab_cd -> ac_bd
      "vswp.8      d5, d6                        \n"  // ef_gh -> eg_fh
      "vadd.u16    q0, q0, q1                    \n"  // (a+b)_(c+d)
      "vadd.u16    q2, q2, q3                    \n"  // (e+f)_(g+h)
      "vrshrn.u16  d0, q0, #2                    \n"  // first 2 pixels.
      "vrshrn.u16  d1, q2, #2                    \n"  // next 2 pixels.
      "subs        %3, %3, #4                    \n"  // 4 pixels per loop.
      "vst1.8      {q0}, [%2]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(src_stride),  // %1
        "+r"(dst_argb),    // %2
        "+r"(dst_width)    // %3
      : "r"(src_stepx)     // %4
      : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
}
1317*4e366538SXin Li 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Loads the 32-bit ARGB pixel at column (x >> 16) into lane n of register
// dn, then advances x (%3) by dx (%4).  %5 is a scratch index register and
// %6 a scratch pointer.
#define LOAD1_DATA32_LANE(dn, n)                 \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5, lsl #2             \n" \
  "add        %3, %3, %4                     \n" \
  "vld1.32    {" #dn "[" #n "]}, [%6]        \n"
1325*4e366538SXin Li 
// Nearest-neighbor horizontal scale of ARGB pixels.  x and dx are 16.16
// fixed point; gathers and stores 8 pixels per loop iteration.
// Assumes dst_width is a multiple of 8 — TODO confirm with callers.
void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  int tmp;                            // scratch index for LOAD1_DATA32_LANE (%5)
  const uint8_t* src_tmp = src_argb;  // scratch pointer (%6)
  asm volatile(
      "1:                                        \n"
      // clang-format off
      LOAD1_DATA32_LANE(d0, 0)
      LOAD1_DATA32_LANE(d0, 1)
      LOAD1_DATA32_LANE(d1, 0)
      LOAD1_DATA32_LANE(d1, 1)
      LOAD1_DATA32_LANE(d2, 0)
      LOAD1_DATA32_LANE(d2, 1)
      LOAD1_DATA32_LANE(d3, 0)
      LOAD1_DATA32_LANE(d3, 1)
      // clang-format on
      "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "bgt         1b                            \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x),          // %3
        "+r"(dx),         // %4
        "=&r"(tmp),       // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "q0", "q1");
}
1358*4e366538SXin Li 
1359*4e366538SXin Li #undef LOAD1_DATA32_LANE
1360*4e366538SXin Li 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Loads the adjacent 32-bit ARGB pixel pair at column (x >> 16) into lane n
// of dn1 (left pixel) and dn2 (right pixel), then advances x (%3) by dx
// (%4).  %5 is a scratch index register and %6 a scratch pointer.
#define LOAD2_DATA32_LANE(dn1, dn2, n)                       \
  "lsr        %5, %3, #16                                \n" \
  "add        %6, %1, %5, lsl #2                         \n" \
  "add        %3, %3, %4                                 \n" \
  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
1368*4e366538SXin Li 
// Bilinear horizontal scale of ARGB pixels using a 7-bit blend fraction:
// result = (a * (127 - f) + b * f) >> 7, where f = (x >> 9) & 0x7f.
// x and dx are 16.16 fixed point; 4 pixels are produced per loop iteration.
void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // lane offsets used to build x + i * dx
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;  // scratch pointer for the macro (%6)
  asm volatile (
      "vdup.32     q0, %3                        \n"  // x
      "vdup.32     q1, %4                        \n"  // dx
      "vld1.32     {q2}, [%5]                    \n"  // 0 1 2 3
      "vshl.i32    q9, q1, #2                    \n"  // 4 * dx
      "vmul.s32    q1, q1, q2                    \n"
      "vmov.i8     q3, #0x7f                     \n"  // 0x7F
      "vmov.i16    q15, #0x7f                    \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "vadd.s32    q8, q1, q0                    \n"
      "1:                                        \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32   d22, q8, #9                   \n"  // narrow x >> 9 to 16 bits
    "vand.16     d22, d22, d30                 \n"  // keep 7-bit fraction f
    "vdup.8      d24, d22[0]                   \n"  // splat f per pixel
    "vdup.8      d25, d22[2]                   \n"
    "vdup.8      d26, d22[4]                   \n"
    "vdup.8      d27, d22[6]                   \n"
    "vext.8      d4, d24, d25, #4              \n"
    "vext.8      d5, d26, d27, #4              \n"  // f
    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f  (== 127 - f)
    "vmull.u8    q11, d0, d20                  \n"  // a * (127 - f)
    "vmull.u8    q12, d1, d21                  \n"
    "vmull.u8    q13, d2, d4                   \n"  // b * f
    "vmull.u8    q14, d3, d5                   \n"
    "vadd.i16    q11, q11, q13                 \n"
    "vadd.i16    q12, q12, q14                 \n"
    "vshrn.i16   d0, q11, #7                   \n"  // >> 7 and narrow
    "vshrn.i16   d1, q12, #7                   \n"

    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
    "vadd.s32    q8, q8, q9                    \n"  // advance x by 4 * dx
    "subs        %2, %2, #4                    \n"  // 4 processed per loop
    "bgt         1b                            \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
1428*4e366538SXin Li 
1429*4e366538SXin Li #undef LOAD2_DATA32_LANE
1430*4e366538SXin Li 
// Read 16 UV pairs, keep the odd-indexed ones, and write 8 UV pairs.
void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // vld2.16 de-interleaves 16-bit UV pairs: q0 = even pairs, q1 = odd.
      "vld2.16     {d0, d2}, [%0]!               \n"  // load 8 UV pixels.
      "vld2.16     {d1, d3}, [%0]!               \n"  // load next 8 UV
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vst1.16     {q1}, [%1]!                   \n"  // store 8 odd UV pairs
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1");
}
1449*4e366538SXin Li 
// Read 16 UV pairs and average each adjacent pair to 8 UV pairs.
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst,
                                int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // vld2.16 de-interleaves 16-bit UV pairs: q0 = even pairs, q1 = odd.
      "vld2.16     {d0, d2}, [%0]!               \n"  // load 8 UV pixels.
      "vld2.16     {d1, d3}, [%0]!               \n"  // load next 8 UV
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vrhadd.u8   q0, q0, q1                    \n"  // rounding half add
      "vst1.16     {q0}, [%1]!                   \n"  // store 8 UV
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1");
}
1469*4e366538SXin Li 
// Read 16x2 UV pairs and box-average each 2x2 block to 8 UV pairs.
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      // vld2.8 splits channels: q0 = U bytes, q1 = V bytes.
      "vld2.8      {d0, d2}, [%0]!               \n"  // load 8 UV pixels.
      "vld2.8      {d1, d3}, [%0]!               \n"  // load next 8 UV
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vpaddl.u8   q0, q0                        \n"  // U 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // V 16 bytes -> 8 shorts.
      "vld2.8      {d16, d18}, [%1]!             \n"  // load 8 more UV
      "vld2.8      {d17, d19}, [%1]!             \n"  // load last 8 UV
      "vpadal.u8   q0, q8                        \n"  // U 16 bytes -> 8 shorts.
      "vpadal.u8   q1, q9                        \n"  // V 16 bytes -> 8 shorts.
      "vrshrn.u16  d0, q0, #2                    \n"  // round and pack to bytes
      "vrshrn.u16  d1, q1, #2                    \n"
      "vst2.8      {d0, d1}, [%2]!               \n"  // re-interleave U and V
      "bgt         1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "q0", "q1", "q8", "q9");
}
1498*4e366538SXin Li 
1499*4e366538SXin Li // Reads 4 pixels at a time.
// Subsampling scaler: copies every src_stepx-th UV pixel to dst_ptr with no
// filtering. src_stepx is in UV pixels; each interleaved UV pixel is 2 bytes.
// Four staggered pointers (src_ptr + 0/1/2/3 steps, in bytes: 0/2/4/6 *
// src_stepx) each load one 16-bit UV pair and advance by 4 steps
// (src_stepx * 8 bytes) per iteration, producing 4 UV pixels per loop.
// dst_width is in UV pixels and is assumed to be a multiple of 4.
ScaleUVRowDownEven_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,int src_stepx,uint8_t * dst_ptr,int dst_width)1500*4e366538SXin Li void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
1501*4e366538SXin Li                              ptrdiff_t src_stride,
1502*4e366538SXin Li                              int src_stepx,  // pixel step
1503*4e366538SXin Li                              uint8_t* dst_ptr,
1504*4e366538SXin Li                              int dst_width) {
1505*4e366538SXin Li   const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
1506*4e366538SXin Li   const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
1507*4e366538SXin Li   const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
1508*4e366538SXin Li   (void)src_stride;
1509*4e366538SXin Li   asm volatile(
1510*4e366538SXin Li       "1:                                        \n"
1511*4e366538SXin Li       "vld1.16     {d0[0]}, [%0], %6             \n"
1512*4e366538SXin Li       "vld1.16     {d0[1]}, [%1], %6             \n"
1513*4e366538SXin Li       "vld1.16     {d0[2]}, [%2], %6             \n"
1514*4e366538SXin Li       "vld1.16     {d0[3]}, [%3], %6             \n"
1515*4e366538SXin Li       "subs        %5, %5, #4                    \n"  // 4 pixels per loop.
1516*4e366538SXin Li       "vst1.8      {d0}, [%4]!                   \n"
1517*4e366538SXin Li       "bgt         1b                            \n"
1518*4e366538SXin Li       : "+r"(src_ptr),      // %0
1519*4e366538SXin Li         "+r"(src1_ptr),     // %1
1520*4e366538SXin Li         "+r"(src2_ptr),     // %2
1521*4e366538SXin Li         "+r"(src3_ptr),     // %3
1522*4e366538SXin Li         "+r"(dst_ptr),      // %4
1523*4e366538SXin Li         "+r"(dst_width)     // %5
1524*4e366538SXin Li       : "r"(src_stepx * 8)  // %6 per-pointer advance in bytes (4 steps)
1525*4e366538SXin Li       : "memory", "cc", "d0");
1526*4e366538SXin Li }
1527*4e366538SXin Li 
1528*4e366538SXin Li #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
1529*4e366538SXin Li 
1530*4e366538SXin Li #ifdef __cplusplus
1531*4e366538SXin Li }  // extern "C"
1532*4e366538SXin Li }  // namespace libyuv
1533*4e366538SXin Li #endif
1534