/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into q0, odd into q1
      "vld2.8 {q0, q1}, [%0]! \n"
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "vst1.8 {q1}, [%1]! \n"  // store odd pixels
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1"  // Clobber List
  );
}
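
// For reference, a rough scalar equivalent of the kernel above (an
// illustrative sketch; the portable fallback lives elsewhere in libyuv):
//   for (int x = 0; x < dst_width; ++x) {
//     dst[x] = src_ptr[x * 2 + 1];  // keep the odd pixel of each pair
//   }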

// Read 32x1, average down, and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 32 pixels
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "vrhadd.u8 q0, q0, q1 \n"  // rounding half add
      "vst1.8 {q0}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1"  // Clobber List
  );
}
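
// Scalar sketch of the rounding half add above (illustrative only):
//   for (int x = 0; x < dst_width; ++x) {
//     dst[x] = (src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1;
//   }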

// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %0 \n"
      "1: \n"
      "vld1.8 {q0, q1}, [%0]! \n"  // load row 1 and post inc
      "vld1.8 {q2, q3}, [%1]! \n"  // load row 2 and post inc
      "subs %3, %3, #16 \n"  // 16 processed per loop
      "vpaddl.u8 q0, q0 \n"  // row 1 add adjacent
      "vpaddl.u8 q1, q1 \n"
      "vpadal.u8 q0, q2 \n"  // row 2 add adjacent + row 1
      "vpadal.u8 q1, q3 \n"
      "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
      "vrshrn.u16 d1, q1, #2 \n"
      "vst1.8 {q0}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
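
// Scalar sketch of the 2x2 box filter above (illustrative only):
//   const uint8_t* s = src_ptr;
//   const uint8_t* t = src_ptr + src_stride;
//   for (int x = 0; x < dst_width; ++x, s += 2, t += 2) {
//     dst[x] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // vrshrn #2 rounds
//   }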

void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "subs %2, %2, #8 \n"  // 8 processed per loop
      "vst1.8 {d2}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1", "memory", "cc");
}

void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load up 16x4
      "vld1.8 {q1}, [%3]! \n"
      "vld1.8 {q2}, [%4]! \n"
      "vld1.8 {q3}, [%5]! \n"
      "subs %2, %2, #4 \n"
      "vpaddl.u8 q0, q0 \n"
      "vpadal.u8 q0, q1 \n"
      "vpadal.u8 q0, q2 \n"
      "vpadal.u8 q0, q3 \n"
      "vpaddl.u16 q0, q0 \n"
      "vrshrn.u32 d0, q0, #4 \n"  // divide by 16 w/rounding
      "vmovn.u16 d0, q0 \n"
      "vst1.32 {d0[0]}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_ptr1),   // %3
        "+r"(src_ptr2),   // %4
        "+r"(src_ptr3)    // %5
      :
      : "q0", "q1", "q2", "q3", "memory", "cc");
}
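
// Scalar sketch (illustrative only): each output pixel is the rounded
// average of a 4x4 source block,
//   dst_ptr[x] = (sum of 16 source pixels + 8) >> 4;  // vrshrn.u32 #4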

// Downscale from 4 to 3 pixels. Uses the NEON multilane read/write to
// load every 4th pixel into a different register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "subs %2, %2, #24 \n"
      "vmov d2, d3 \n"  // order d0, d1, d2
      "vst3.8 {d0, d1, d2}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "d0", "d1", "d2", "d3", "memory", "cc");
}

void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vmov.u8 d24, #3 \n"
      "add %3, %0 \n"
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
      "subs %2, %2, #24 \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "vmovl.u8 q8, d4 \n"
      "vmovl.u8 q9, d5 \n"
      "vmovl.u8 q10, d6 \n"
      "vmovl.u8 q11, d7 \n"

      // 3 * line_0 + line_1
      "vmlal.u8 q8, d0, d24 \n"
      "vmlal.u8 q9, d1, d24 \n"
      "vmlal.u8 q10, d2, d24 \n"
      "vmlal.u8 q11, d3, d24 \n"

      // (3 * line_0 + line_1 + 2) >> 2
      "vqrshrn.u16 d0, q8, #2 \n"
      "vqrshrn.u16 d1, q9, #2 \n"
      "vqrshrn.u16 d2, q10, #2 \n"
      "vqrshrn.u16 d3, q11, #2 \n"

      // a0 = (src[0] * 3 + src[1] * 1 + 2) >> 2
      "vmovl.u8 q8, d1 \n"
      "vmlal.u8 q8, d0, d24 \n"
      "vqrshrn.u16 d0, q8, #2 \n"

      // a1 = (src[1] * 1 + src[2] * 1 + 1) >> 1
      "vrhadd.u8 d1, d1, d2 \n"

      // a2 = (src[2] * 1 + src[3] * 3 + 2) >> 2
      "vmovl.u8 q8, d2 \n"
      "vmlal.u8 q8, d3, d24 \n"
      "vqrshrn.u16 d2, q8, #2 \n"

      "vst3.8 {d0, d1, d2}, [%1]! \n"

      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
        "cc");
}
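
// Scalar sketch of one 4 -> 3 group (illustrative only), after the two rows
// have been vertically blended into s[0..3] as (3 * row0 + row1 + 2) >> 2:
//   d[0] = (s[0] * 3 + s[1] + 2) >> 2;
//   d[1] = (s[1] + s[2] + 1) >> 1;
//   d[2] = (s[2] + s[3] * 3 + 2) >> 2;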

void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vmov.u8 d24, #3 \n"
      "add %3, %0 \n"
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
      "subs %2, %2, #24 \n"
      // average src line 0 with src line 1
      "vrhadd.u8 q0, q0, q2 \n"
      "vrhadd.u8 q1, q1, q3 \n"

      // a0 = (src[0] * 3 + src[1] * 1 + 2) >> 2
      "vmovl.u8 q3, d1 \n"
      "vmlal.u8 q3, d0, d24 \n"
      "vqrshrn.u16 d0, q3, #2 \n"

      // a1 = (src[1] * 1 + src[2] * 1 + 1) >> 1
      "vrhadd.u8 d1, d1, d2 \n"

      // a2 = (src[2] * 1 + src[3] * 3 + 2) >> 2
      "vmovl.u8 q3, d2 \n"
      "vmlal.u8 q3, d3, d24 \n"
      "vqrshrn.u16 d2, q3, #2 \n"

      "vst3.8 {d0, d1, d2}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
}

#define HAS_SCALEROWDOWN38_NEON
static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
                              22, 24, 27, 30, 0, 0, 0, 0};
static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
                                18, 6, 14, 19, 0, 0, 0, 0};
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};
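// These constants implement division through vqrdmulh.s16, which computes
// (2 * a * b + 0x8000) >> 16. Multiplying by 65536 / 12 therefore yields
// a * 2 / 12 = a / 6 (for sums of 6 pixels), and 65536 / 18 yields a / 9
// (for sums of 9 pixels).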

// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "vld1.8 {q3}, [%3] \n"
      "1: \n"
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
      "subs %2, %2, #12 \n"
      "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
      "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
      "vst1.8 {d4}, [%1]! \n"
      "vst1.32 {d5[0]}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
}
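
// Scalar sketch (illustrative only): kShuf38 point samples 3 outputs from
// every 8 inputs, e.g. for the first group
//   d[0] = s[0]; d[1] = s[3]; d[2] = s[6];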

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile(
      "vld1.16 {q13}, [%5] \n"
      "vld1.8 {q14}, [%6] \n"
      "vld1.8 {q15}, [%7] \n"
      "add %3, %0 \n"
      "1: \n"

      // d0 = 00 40 01 41 02 42 03 43
      // d1 = 10 50 11 51 12 52 13 53
      // d2 = 20 60 21 61 22 62 23 63
      // d3 = 30 70 31 71 32 72 33 73
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
      "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
      "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
      "subs %2, %2, #12 \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // d0 = 00 10 01 11 02 12 03 13
      // d1 = 40 50 41 51 42 52 43 53
      "vtrn.u8 d0, d1 \n"
      "vtrn.u8 d4, d5 \n"
      "vtrn.u8 d16, d17 \n"

      // d2 = 20 30 21 31 22 32 23 33
      // d3 = 60 70 61 71 62 72 63 73
      "vtrn.u8 d2, d3 \n"
      "vtrn.u8 d6, d7 \n"
      "vtrn.u8 d18, d19 \n"

      // d0 = 00+10 01+11 02+12 03+13
      // d2 = 40+50 41+51 42+52 43+53
      "vpaddl.u8 q0, q0 \n"
      "vpaddl.u8 q2, q2 \n"
      "vpaddl.u8 q8, q8 \n"

      // d3 = 60+70 61+71 62+72 63+73
      "vpaddl.u8 d3, d3 \n"
      "vpaddl.u8 d7, d7 \n"
      "vpaddl.u8 d19, d19 \n"

      // combine source lines
      "vadd.u16 q0, q2 \n"
      "vadd.u16 q0, q8 \n"
      "vadd.u16 d4, d3, d7 \n"
      "vadd.u16 d4, d19 \n"

      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
      //             + s[6 + st * 1] + s[7 + st * 1]
      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
      "vqrdmulh.s16 q2, q2, q13 \n"
      "vmovn.u16 d4, q2 \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "vmovl.u8 q1, d2 \n"
      "vmovl.u8 q3, d6 \n"
      "vmovl.u8 q9, d18 \n"

      // combine source lines
      "vadd.u16 q1, q3 \n"
      "vadd.u16 q1, q9 \n"

      // d4 = xx 20 xx 30 xx 22 xx 32
      // d5 = xx 21 xx 31 xx 23 xx 33
      "vtrn.u32 d2, d3 \n"

      // d4 = xx 20 xx 21 xx 22 xx 23
      // d5 = xx 30 xx 31 xx 32 xx 33
      "vtrn.u16 d2, d3 \n"

      // 0+1+2, 3+4+5
      "vadd.u16 q0, q1 \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "vqrdmulh.s16 q0, q0, q15 \n"

      // Align for table lookup, vtbl requires registers to
      // be adjacent
      "vmov.u8 d2, d4 \n"

      "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
      "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

      "vst1.8 {d3}, [%1]! \n"
      "vst1.32 {d4[0]}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),       // %0
        "+r"(dst_ptr),       // %1
        "+r"(dst_width),     // %2
        "+r"(src_stride),    // %3
        "+r"(src_ptr1)       // %4
      : "r"(&kMult38_Div6),  // %5
        "r"(&kShuf38_2),     // %6
        "r"(&kMult38_Div9)   // %7
      : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
        "cc");
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vld1.16 {q13}, [%4] \n"
      "vld1.8 {q14}, [%5] \n"
      "add %3, %0 \n"
      "1: \n"

      // d0 = 00 40 01 41 02 42 03 43
      // d1 = 10 50 11 51 12 52 13 53
      // d2 = 20 60 21 61 22 62 23 63
      // d3 = 30 70 31 71 32 72 33 73
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
      "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
      "subs %2, %2, #12 \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // d0 = 00 10 01 11 02 12 03 13
      // d1 = 40 50 41 51 42 52 43 53
      "vtrn.u8 d0, d1 \n"
      "vtrn.u8 d4, d5 \n"

      // d2 = 20 30 21 31 22 32 23 33
      // d3 = 60 70 61 71 62 72 63 73
      "vtrn.u8 d2, d3 \n"
      "vtrn.u8 d6, d7 \n"

      // d0 = 00+10 01+11 02+12 03+13
      // d2 = 40+50 41+51 42+52 43+53
      "vpaddl.u8 q0, q0 \n"
      "vpaddl.u8 q2, q2 \n"

      // d3 = 60+70 61+71 62+72 63+73
      "vpaddl.u8 d3, d3 \n"
      "vpaddl.u8 d7, d7 \n"

      // combine source lines
      "vadd.u16 q0, q2 \n"
      "vadd.u16 d4, d3, d7 \n"

      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
      "vqrshrn.u16 d4, q2, #2 \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "vmovl.u8 q1, d2 \n"
      "vmovl.u8 q3, d6 \n"

      // combine source lines
      "vadd.u16 q1, q3 \n"

      // d4 = xx 20 xx 30 xx 22 xx 32
      // d5 = xx 21 xx 31 xx 23 xx 33
      "vtrn.u32 d2, d3 \n"

      // d4 = xx 20 xx 21 xx 22 xx 23
      // d5 = xx 30 xx 31 xx 32 xx 33
      "vtrn.u16 d2, d3 \n"

      // 0+1+2, 3+4+5
      "vadd.u16 q0, q1 \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "vqrdmulh.s16 q0, q0, q13 \n"

      // Align for table lookup, vtbl requires registers to
      // be adjacent
      "vmov.u8 d2, d4 \n"

      "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
      "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

      "vst1.8 {d3}, [%1]! \n"
      "vst1.32 {d4[0]}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_ptr),       // %0
        "+r"(dst_ptr),       // %1
        "+r"(dst_width),     // %2
        "+r"(src_stride)     // %3
      : "r"(&kMult38_Div6),  // %4
        "r"(&kShuf38_2)      // %5
      : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}

void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  const uint8_t* src_temp = src_ptr + 1;
  asm volatile(
      "vmov.u8 d30, #3 \n"

      "1: \n"
      "vld1.8 {d4}, [%0]! \n"  // 01234567
      "vld1.8 {d5}, [%3]! \n"  // 12345678

      "vmovl.u8 q0, d4 \n"  // 01234567 (16b)
      "vmovl.u8 q1, d5 \n"  // 12345678 (16b)
      "vmlal.u8 q0, d5, d30 \n"  // 3*near+far (odd)
      "vmlal.u8 q1, d4, d30 \n"  // 3*near+far (even)

      "vrshrn.u16 d1, q0, #2 \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u16 d0, q1, #2 \n"  // 3/4*near+1/4*far (even)

      "vst2.8 {d0, d1}, [%1]! \n"  // store
      "subs %2, %2, #16 \n"  // 8 sample -> 16 sample
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q15"  // Clobber List
  );
}
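
// Scalar sketch of the 2x linear upsample above (illustrative only):
//   dst_ptr[2 * x + 0] = (3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2;  // even
//   dst_ptr[2 * x + 1] = (src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2;  // odd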

void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint8_t* src_temp = src_ptr + 1;
  const uint8_t* src_temp1 = src_ptr1 + 1;

  asm volatile(
      "vmov.u16 q15, #3 \n"
      "vmov.u8 d28, #3 \n"

      "1: \n"
      "vld1.8 {d4}, [%0]! \n"  // 01234567
      "vld1.8 {d5}, [%5]! \n"  // 12345678

      "vmovl.u8 q0, d4 \n"  // 01234567 (16b)
      "vmovl.u8 q1, d5 \n"  // 12345678 (16b)
      "vmlal.u8 q0, d5, d28 \n"  // 3*near+far (1, odd)
      "vmlal.u8 q1, d4, d28 \n"  // 3*near+far (1, even)

      "vld1.8 {d8}, [%1]! \n"
      "vld1.8 {d9}, [%6]! \n"

      "vmovl.u8 q2, d8 \n"
      "vmovl.u8 q3, d9 \n"
      "vmlal.u8 q2, d9, d28 \n"  // 3*near+far (2, odd)
      "vmlal.u8 q3, d8, d28 \n"  // 3*near+far (2, even)

      // e o
      // q1 q0
      // q3 q2

      "vmovq q4, q2 \n"
      "vmovq q5, q3 \n"
      "vmla.u16 q4, q0, q15 \n"  // 9 3 3 1 (1, odd)
      "vmla.u16 q5, q1, q15 \n"  // 9 3 3 1 (1, even)
      "vmla.u16 q0, q2, q15 \n"  // 9 3 3 1 (2, odd)
      "vmla.u16 q1, q3, q15 \n"  // 9 3 3 1 (2, even)

      // e o
      // q5 q4
      // q1 q0

      "vrshrn.u16 d2, q1, #4 \n"  // 2, even
      "vrshrn.u16 d3, q0, #4 \n"  // 2, odd
      "vrshrn.u16 d0, q5, #4 \n"  // 1, even
      "vrshrn.u16 d1, q4, #4 \n"  // 1, odd

      "vst2.8 {d0, d1}, [%2]! \n"  // store
      "vst2.8 {d2, d3}, [%3]! \n"  // store
      "subs %4, %4, #16 \n"  // 8 sample -> 16 sample
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
        "q15"  // Clobber List
  );
}
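
// Worked form of the 2x2 bilinear kernel above (illustrative only): with
// source rows r0/r1 and columns c0/c1, each output of destination row 1 is
//   (9 * r0c0 + 3 * r0c1 + 3 * r1c0 + 1 * r1c1 + 8) >> 4
// i.e. the usual 9:3:3:1 weighting; destination row 2 swaps r0 and r1.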

void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  const uint16_t* src_temp = src_ptr + 1;
  asm volatile(
      "vmov.u16 q15, #3 \n"

      "1: \n"
      "vld1.16 {q1}, [%0]! \n"  // 01234567 (16b)
      "vld1.16 {q0}, [%3]! \n"  // 12345678 (16b)

      "vmovq q2, q0 \n"
      "vmla.u16 q0, q1, q15 \n"  // 3*near+far (even)
      "vmla.u16 q1, q2, q15 \n"  // 3*near+far (odd)

      "vrshr.u16 q0, q0, #2 \n"  // 3/4*near+1/4*far (even)
      "vrshr.u16 q1, q1, #2 \n"  // 3/4*near+1/4*far (odd)

      "vst2.16 {d0, d1, d2, d3}, [%1]! \n"  // store
      "subs %2, %2, #16 \n"  // 8 sample -> 16 sample
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q15"  // Clobber List
  );
}

void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 1;
  const uint16_t* src_temp1 = src_ptr1 + 1;

  asm volatile(
      "vmov.u16 q15, #3 \n"

      "1: \n"
      "vld1.16 {q0}, [%0]! \n"  // 01234567 (16b)
      "vld1.16 {q1}, [%5]! \n"  // 12345678 (16b)

      "vmovq q2, q0 \n"
      "vmla.u16 q0, q1, q15 \n"  // 3*near+far (odd)
      "vmla.u16 q1, q2, q15 \n"  // 3*near+far (even)

      "vld1.16 {q2}, [%1]! \n"  // 01234567 (16b)
      "vld1.16 {q3}, [%6]! \n"  // 12345678 (16b)

      "vmovq q4, q2 \n"
      "vmla.u16 q2, q3, q15 \n"  // 3*near+far (odd)
      "vmla.u16 q3, q4, q15 \n"  // 3*near+far (even)

      "vmovq q4, q2 \n"
      "vmovq q5, q3 \n"
      "vmla.u16 q4, q0, q15 \n"  // 9 3 3 1 (1, odd)
      "vmla.u16 q5, q1, q15 \n"  // 9 3 3 1 (1, even)
      "vmla.u16 q0, q2, q15 \n"  // 9 3 3 1 (2, odd)
      "vmla.u16 q1, q3, q15 \n"  // 9 3 3 1 (2, even)

      "vrshr.u16 q2, q1, #4 \n"  // 2, even
      "vrshr.u16 q3, q0, #4 \n"  // 2, odd
      "vrshr.u16 q0, q5, #4 \n"  // 1, even
      "vrshr.u16 q1, q4, #4 \n"  // 1, odd

      "vst2.16 {d0, d1, d2, d3}, [%2]! \n"  // store
      "vst2.16 {d4, d5, d6, d7}, [%3]! \n"  // store
      "subs %4, %4, #16 \n"  // 8 sample -> 16 sample
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
        "q15"  // Clobber List
  );
}

void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  const uint16_t* src_temp = src_ptr + 1;
  asm volatile(
      "vmov.u16 d31, #3 \n"

      "1: \n"
      "vld1.16 {q0}, [%0]! \n"  // 01234567 (16b)
      "vld1.16 {q1}, [%3]! \n"  // 12345678 (16b)

      "vmovl.u16 q2, d0 \n"  // 0123 (32b)
      "vmovl.u16 q3, d1 \n"  // 4567 (32b)
      "vmovl.u16 q4, d2 \n"  // 1234 (32b)
      "vmovl.u16 q5, d3 \n"  // 5678 (32b)

      "vmlal.u16 q2, d2, d31 \n"
      "vmlal.u16 q3, d3, d31 \n"
      "vmlal.u16 q4, d0, d31 \n"
      "vmlal.u16 q5, d1, d31 \n"

      "vrshrn.u32 d0, q4, #2 \n"
      "vrshrn.u32 d1, q5, #2 \n"
      "vrshrn.u32 d2, q2, #2 \n"
      "vrshrn.u32 d3, q3, #2 \n"

      "vst2.16 {q0, q1}, [%1]! \n"  // store
      "subs %2, %2, #16 \n"  // 8 sample -> 16 sample
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
        "q15"  // Clobber List
  );
}

void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 1;
  const uint16_t* src_temp1 = src_ptr1 + 1;

  asm volatile(
      "vmov.u16 d31, #3 \n"
      "vmov.u32 q14, #3 \n"

      "1: \n"
      "vld1.16 {d0}, [%0]! \n"  // 0123 (16b)
      "vld1.16 {d1}, [%5]! \n"  // 1234 (16b)
      "vmovl.u16 q2, d0 \n"  // 0123 (32b)
      "vmovl.u16 q3, d1 \n"  // 1234 (32b)
      "vmlal.u16 q2, d1, d31 \n"
      "vmlal.u16 q3, d0, d31 \n"

      "vld1.16 {d0}, [%1]! \n"  // 0123 (16b)
      "vld1.16 {d1}, [%6]! \n"  // 1234 (16b)
      "vmovl.u16 q4, d0 \n"  // 0123 (32b)
      "vmovl.u16 q5, d1 \n"  // 1234 (32b)
      "vmlal.u16 q4, d1, d31 \n"
      "vmlal.u16 q5, d0, d31 \n"

      "vmovq q0, q4 \n"
      "vmovq q1, q5 \n"
      "vmla.u32 q4, q2, q14 \n"
      "vmla.u32 q5, q3, q14 \n"
      "vmla.u32 q2, q0, q14 \n"
      "vmla.u32 q3, q1, q14 \n"

      "vrshrn.u32 d1, q4, #4 \n"
      "vrshrn.u32 d0, q5, #4 \n"
      "vrshrn.u32 d3, q2, #4 \n"
      "vrshrn.u32 d2, q3, #4 \n"

      "vst2.16 {d0, d1}, [%2]! \n"  // store
      "vst2.16 {d2, d3}, [%3]! \n"  // store
      "subs %4, %4, #8 \n"  // 4 sample -> 8 sample
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
        "d31"  // Clobber List
  );
}

void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
  const uint8_t* src_temp = src_ptr + 2;
  asm volatile(
      "vmov.u8 d30, #3 \n"

      "1: \n"
      "vld1.8 {d4}, [%0]! \n"  // 00112233 (1u1v)
      "vld1.8 {d5}, [%3]! \n"  // 11223344 (1u1v)

      "vmovl.u8 q0, d4 \n"  // 00112233 (1u1v, 16b)
      "vmovl.u8 q1, d5 \n"  // 11223344 (1u1v, 16b)
      "vmlal.u8 q0, d5, d30 \n"  // 3*near+far (odd)
      "vmlal.u8 q1, d4, d30 \n"  // 3*near+far (even)

      "vrshrn.u16 d1, q0, #2 \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u16 d0, q1, #2 \n"  // 3/4*near+1/4*far (even)

      "vst2.16 {d0, d1}, [%1]! \n"  // store
      "subs %2, %2, #8 \n"  // 4 uv -> 8 uv
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "d30"  // Clobber List
  );
}
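
// Scalar sketch per channel c in {0 = U, 1 = V} (illustrative only; samples
// are interleaved UV pairs, so the horizontal neighbor is 2 bytes away):
//   dst_ptr[4 * x + c] =
//       (3 * src_ptr[2 * x + c] + src_ptr[2 * x + 2 + c] + 2) >> 2;  // even
//   dst_ptr[4 * x + 2 + c] =
//       (src_ptr[2 * x + c] + 3 * src_ptr[2 * x + 2 + c] + 2) >> 2;  // odd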

void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint8_t* src_temp = src_ptr + 2;
  const uint8_t* src_temp1 = src_ptr1 + 2;

  asm volatile(
      "vmov.u16 q15, #3 \n"
      "vmov.u8 d28, #3 \n"

      "1: \n"
      "vld1.8 {d4}, [%0]! \n"  // 00112233 (1u1v)
      "vld1.8 {d5}, [%5]! \n"  // 11223344 (1u1v)

      "vmovl.u8 q0, d4 \n"  // 00112233 (1u1v, 16b)
      "vmovl.u8 q1, d5 \n"  // 11223344 (1u1v, 16b)
      "vmlal.u8 q0, d5, d28 \n"  // 3*near+far (1, odd)
      "vmlal.u8 q1, d4, d28 \n"  // 3*near+far (1, even)

      "vld1.8 {d8}, [%1]! \n"  // 00112233 (1u1v)
      "vld1.8 {d9}, [%6]! \n"  // 11223344 (1u1v)

      "vmovl.u8 q2, d8 \n"  // 00112233 (1u1v, 16b)
      "vmovl.u8 q3, d9 \n"  // 11223344 (1u1v, 16b)
      "vmlal.u8 q2, d9, d28 \n"  // 3*near+far (2, odd)
      "vmlal.u8 q3, d8, d28 \n"  // 3*near+far (2, even)

      // e o
      // q1 q0
      // q3 q2

      "vmovq q4, q2 \n"
      "vmovq q5, q3 \n"
      "vmla.u16 q4, q0, q15 \n"  // 9 3 3 1 (1, odd)
      "vmla.u16 q5, q1, q15 \n"  // 9 3 3 1 (1, even)
      "vmla.u16 q0, q2, q15 \n"  // 9 3 3 1 (2, odd)
      "vmla.u16 q1, q3, q15 \n"  // 9 3 3 1 (2, even)

      // e o
      // q5 q4
      // q1 q0

      "vrshrn.u16 d2, q1, #4 \n"  // 2, even
      "vrshrn.u16 d3, q0, #4 \n"  // 2, odd
      "vrshrn.u16 d0, q5, #4 \n"  // 1, even
      "vrshrn.u16 d1, q4, #4 \n"  // 1, odd

      "vst2.16 {d0, d1}, [%2]! \n"  // store
      "vst2.16 {d2, d3}, [%3]! \n"  // store
      "subs %4, %4, #8 \n"  // 4 uv -> 8 uv
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
        "q15"  // Clobber List
  );
}

void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  const uint16_t* src_temp = src_ptr + 2;
  asm volatile(
      "vmov.u16 d30, #3 \n"

      "1: \n"
      "vld1.16 {q0}, [%0]! \n"  // 00112233 (1u1v, 16)
      "vld1.16 {q1}, [%3]! \n"  // 11223344 (1u1v, 16)

      "vmovl.u16 q2, d0 \n"  // 0011 (1u1v, 32b)
      "vmovl.u16 q3, d2 \n"  // 1122 (1u1v, 32b)
      "vmovl.u16 q4, d1 \n"  // 2233 (1u1v, 32b)
      "vmovl.u16 q5, d3 \n"  // 3344 (1u1v, 32b)
      "vmlal.u16 q2, d2, d30 \n"  // 3*near+far (odd)
      "vmlal.u16 q3, d0, d30 \n"  // 3*near+far (even)
      "vmlal.u16 q4, d3, d30 \n"  // 3*near+far (odd)
      "vmlal.u16 q5, d1, d30 \n"  // 3*near+far (even)

      "vrshrn.u32 d1, q2, #2 \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u32 d0, q3, #2 \n"  // 3/4*near+1/4*far (even)
      "vrshrn.u32 d3, q4, #2 \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u32 d2, q5, #2 \n"  // 3/4*near+1/4*far (even)

      "vst2.32 {d0, d1}, [%1]! \n"  // store
      "vst2.32 {d2, d3}, [%1]! \n"  // store
      "subs %2, %2, #8 \n"  // 4 uv -> 8 uv
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
        "d30"  // Clobber List
  );
}

void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 2;
  const uint16_t* src_temp1 = src_ptr1 + 2;

  asm volatile(
      "vmov.u16 d30, #3 \n"
      "vmov.u32 q14, #3 \n"

      "1: \n"
      "vld1.8 {d0}, [%0]! \n"  // 0011 (1u1v)
      "vld1.8 {d1}, [%5]! \n"  // 1122 (1u1v)
      "vmovl.u16 q2, d0 \n"  // 0011 (1u1v, 32b)
      "vmovl.u16 q3, d1 \n"  // 1122 (1u1v, 32b)
      "vmlal.u16 q2, d1, d30 \n"  // 3*near+far (1, odd)
      "vmlal.u16 q3, d0, d30 \n"  // 3*near+far (1, even)

      "vld1.8 {d0}, [%1]! \n"  // 0011 (1u1v)
      "vld1.8 {d1}, [%6]! \n"  // 1122 (1u1v)
      "vmovl.u16 q4, d0 \n"  // 0011 (1u1v, 32b)
      "vmovl.u16 q5, d1 \n"  // 1122 (1u1v, 32b)
      "vmlal.u16 q4, d1, d30 \n"  // 3*near+far (2, odd)
      "vmlal.u16 q5, d0, d30 \n"  // 3*near+far (2, even)

      "vmovq q0, q4 \n"
      "vmovq q1, q5 \n"
      "vmla.u32 q4, q2, q14 \n"  // 9 3 3 1 (1, odd)
      "vmla.u32 q5, q3, q14 \n"  // 9 3 3 1 (1, even)
      "vmla.u32 q2, q0, q14 \n"  // 9 3 3 1 (2, odd)
      "vmla.u32 q3, q1, q14 \n"  // 9 3 3 1 (2, even)

      "vrshrn.u32 d1, q4, #4 \n"  // 1, odd
      "vrshrn.u32 d0, q5, #4 \n"  // 1, even
      "vrshrn.u32 d3, q2, #4 \n"  // 2, odd
      "vrshrn.u32 d2, q3, #4 \n"  // 2, even

      "vst2.32 {d0, d1}, [%2]! \n"  // store
      "vst2.32 {d2, d3}, [%3]! \n"  // store
      "subs %4, %4, #4 \n"  // 2 uv -> 4 uv
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
        "d30"  // Clobber List
  );
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1: \n"
      "vld1.16 {q1, q2}, [%1] \n"  // load accumulator
      "vld1.8 {q0}, [%0]! \n"  // load 16 bytes
      "vaddw.u8 q2, q2, d1 \n"  // add
      "vaddw.u8 q1, q1, d0 \n"
      "vst1.16 {q1, q2}, [%1]! \n"  // store accumulator
      "subs %2, %2, #16 \n"  // 16 processed per loop
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2"  // Clobber List
  );
}
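
// Scalar sketch (illustrative only): widen each byte into the 16-bit
// accumulator row,
//   for (int x = 0; x < src_width; ++x) {
//     dst_ptr[x] += src_ptr[x];
//   }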

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5 \n" \
  "add %3, %3, %4 \n" \
  "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"

// The NEON version mimics this formula (from scale_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
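// Worked example of BLENDER (illustrative): with a = 10, b = 20 and
// f = 0x8000 (halfway), 10 + ((0x8000 * (20 - 10) + 0x8000) >> 16)
// = 10 + 5 = 15, the midpoint; the vrshrn.s32 #16 below performs the
// same round-to-nearest >> 16.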

void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  asm volatile(
      "vdup.32 q0, %3 \n"  // x
      "vdup.32 q1, %4 \n"  // dx
      "vld1.32 {q2}, [%5] \n"  // 0 1 2 3
      "vshl.i32 q3, q1, #2 \n"  // 4 * dx
      "vmul.s32 q1, q1, q2 \n"
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "vadd.s32 q1, q1, q0 \n"
      // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "vadd.s32 q2, q1, q3 \n"
      "vshl.i32 q0, q3, #1 \n"  // 8 * dx
      "1: \n"
      LOAD2_DATA8_LANE(0)
      LOAD2_DATA8_LANE(1)
      LOAD2_DATA8_LANE(2)
      LOAD2_DATA8_LANE(3)
      LOAD2_DATA8_LANE(4)
      LOAD2_DATA8_LANE(5)
      LOAD2_DATA8_LANE(6)
      LOAD2_DATA8_LANE(7)
      "vmov q10, q1 \n"
      "vmov q11, q2 \n"
      "vuzp.16 q10, q11 \n"
      "vmovl.u8 q8, d6 \n"
      "vmovl.u8 q9, d7 \n"
      "vsubl.s16 q11, d18, d16 \n"
      "vsubl.s16 q12, d19, d17 \n"
      "vmovl.u16 q13, d20 \n"
      "vmovl.u16 q10, d21 \n"
      "vmul.s32 q11, q11, q13 \n"
      "vmul.s32 q12, q12, q10 \n"
      "vrshrn.s32 d18, q11, #16 \n"
      "vrshrn.s32 d19, q12, #16 \n"
      "vadd.s16 q8, q8, q9 \n"
      "vmovn.s16 d6, q8 \n"

      "vst1.8 {d6}, [%0]! \n"  // store pixels
      "vadd.s32 q1, q1, q0 \n"
      "vadd.s32 q2, q2, q0 \n"
      "subs %2, %2, #8 \n"  // 8 processed per loop
      "bgt 1b \n"
      : "+r"(dst_ptr),    // %0
        "+r"(src_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(x),          // %3
        "+r"(dx),         // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3",
        "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "cmp %4, #0 \n"
      "beq 100f \n"
      "add %2, %1 \n"
      "cmp %4, #64 \n"
      "beq 75f \n"
      "cmp %4, #128 \n"
      "beq 50f \n"
      "cmp %4, #192 \n"
      "beq 25f \n"

      "vdup.8 d5, %4 \n"
      "rsb %4, #256 \n"
      "vdup.8 d4, %4 \n"
      // General purpose row blend.
      "1: \n"
      "vld1.8 {q0}, [%1]! \n"
      "vld1.8 {q1}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vmull.u8 q13, d0, d4 \n"
      "vmull.u8 q14, d1, d4 \n"
      "vmlal.u8 q13, d2, d5 \n"
      "vmlal.u8 q14, d3, d5 \n"
      "vrshrn.u16 d0, q13, #8 \n"
      "vrshrn.u16 d1, q14, #8 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 1b \n"
      "b 99f \n"

      // Blend 25 / 75.
      "25: \n"
      "vld1.8 {q0}, [%1]! \n"
      "vld1.8 {q1}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vrhadd.u8 q0, q1 \n"
      "vrhadd.u8 q0, q1 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 25b \n"
      "b 99f \n"

      // Blend 50 / 50.
      "50: \n"
      "vld1.8 {q0}, [%1]! \n"
      "vld1.8 {q1}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vrhadd.u8 q0, q1 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 50b \n"
      "b 99f \n"

      // Blend 75 / 25.
      "75: \n"
      "vld1.8 {q1}, [%1]! \n"
      "vld1.8 {q0}, [%2]! \n"
      "subs %3, %3, #16 \n"
      "vrhadd.u8 q0, q1 \n"
      "vrhadd.u8 q0, q1 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 75b \n"
      "b 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100: \n"
      "vld1.8 {q0}, [%1]! \n"
      "subs %3, %3, #16 \n"
      "vst1.8 {q0}, [%0]! \n"
      "bgt 100b \n"

      "99: \n"
      "vst1.8 {d1[7]}, [%0] \n"
      : "+r"(dst_ptr),           // %0
        "+r"(src_ptr),           // %1
        "+r"(src_stride),        // %2
        "+r"(dst_width),         // %3
        "+r"(source_y_fraction)  // %4
      :
      : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
}
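
// Note: source_y_fraction is a fraction of 256; 64, 128 and 192 take the
// specialized 75/25, 50/50 and 25/75 vrhadd paths above. Scalar sketch of
// the general blend (illustrative only):
//   dst_ptr[x] = (src_ptr[x] * (256 - f) +
//                 src_ptr[x + src_stride] * f + 128) >> 8;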

void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "vld4.32 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.32 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB
      "subs %2, %2, #8 \n"  // 8 processed per loop
      "vmov q2, q1 \n"  // copy odd pixels so q2/q3 are adjacent for vst2
      "vst2.32 {q2, q3}, [%1]! \n"  // store odd pixels
      "bgt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
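
// Scalar sketch (illustrative only), treating each ARGB pixel as uint32_t:
//   dst32[x] = src32[x * 2 + 1];  // keep odd pixels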

// 46:  f964 018d   vld4.32  {d16,d18,d20,d22}, [r4]!
// 4a:  3e04        subs     r6, #4
// 4c:  f964 118d   vld4.32  {d17,d19,d21,d23}, [r4]!
// 50:  ef64 21f4   vorr     q9, q10, q10
// 54:  f942 038d   vst2.32  {d16-d19}, [r2]!
// 58:  d1f5        bne.n    46 <ScaleARGBRowDown2_C+0x46>

void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld4.32    {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
      "vld4.32    {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
      "subs       %2, %2, #8                     \n"  // 8 processed per loop
      "vrhadd.u8  q0, q0, q1                     \n"  // rounding half add
      "vrhadd.u8  q1, q2, q3                     \n"  // rounding half add
      "vst2.32    {q0, q1}, [%1]!                \n"
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
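
// Scalar sketch of the pairwise rounding average (illustrative, not part of
// the build): each output channel is the rounded mean of the corresponding
// channels of two adjacent pixels.
#if 0
static void ScaleARGBRowDown2Linear_Sketch(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // 4 bytes per ARGB pixel
      int left = src_argb[8 * x + c];
      int right = src_argb[8 * x + 4 + c];
      dst_argb[4 * x + c] = (uint8_t)((left + right + 1) >> 1);  // vrhadd.u8
    }
  }
}
#endif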

void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %1, %0                     \n"
      "1:                                        \n"
      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
      "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
      "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
      "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB
      "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB
      "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
      "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
      "vrshrn.u16 d0, q0, #2                     \n"  // round and pack to bytes
      "vrshrn.u16 d1, q1, #2                     \n"
      "vrshrn.u16 d2, q2, #2                     \n"
      "vrshrn.u16 d3, q3, #2                     \n"
      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
      "bgt        1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
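
// Scalar sketch of the 2x2 box filter (illustrative, not part of the build):
// vpaddl widens and pair-adds within a row, vpadal accumulates the second
// row, and vrshrn #2 divides by 4 with rounding.
#if 0
static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  const uint8_t* row1 = src_argb + src_stride;
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {
      int sum = src_argb[8 * x + c] + src_argb[8 * x + 4 + c] +
                row1[8 * x + c] + row1[8 * x + 4 + c];
      dst_argb[4 * x + c] = (uint8_t)((sum + 2) >> 2);  // round, divide by 4
    }
  }
}
#endif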

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "mov        r12, %3, lsl #2                \n"  // byte step = src_stepx * 4
      "1:                                        \n"
      "vld1.32    {d0[0]}, [%0], r12             \n"
      "vld1.32    {d0[1]}, [%0], r12             \n"
      "vld1.32    {d1[0]}, [%0], r12             \n"
      "vld1.32    {d1[1]}, [%0], r12             \n"
      "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
      "vst1.8     {q0}, [%1]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      : "r"(src_stepx)   // %3
      : "memory", "cc", "r12", "q0");
}
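
// Scalar sketch (illustrative, not part of the build): copy every
// src_stepx-th ARGB pixel.
#if 0
static void ScaleARGBRowDownEven_Sketch(const uint8_t* src_argb,
                                        int src_stepx,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}
#endif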

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "mov        r12, %4, lsl #2                \n"  // byte step = src_stepx * 4
      "add        %1, %1, %0                     \n"  // row 2 pointer
      "1:                                        \n"
      "vld1.8     {d0}, [%0], r12                \n"  // 4 2x2 blocks -> 2x1
      "vld1.8     {d1}, [%1], r12                \n"
      "vld1.8     {d2}, [%0], r12                \n"
      "vld1.8     {d3}, [%1], r12                \n"
      "vld1.8     {d4}, [%0], r12                \n"
      "vld1.8     {d5}, [%1], r12                \n"
      "vld1.8     {d6}, [%0], r12                \n"
      "vld1.8     {d7}, [%1], r12                \n"
      "vaddl.u8   q0, d0, d1                     \n"
      "vaddl.u8   q1, d2, d3                     \n"
      "vaddl.u8   q2, d4, d5                     \n"
      "vaddl.u8   q3, d6, d7                     \n"
      "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
      "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
      "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
      "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
      "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
      "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
      "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
      "vst1.8     {q0}, [%2]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src_argb),    // %0
        "+r"(src_stride),  // %1
        "+r"(dst_argb),    // %2
        "+r"(dst_width)    // %3
      : "r"(src_stepx)     // %4
      : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
}
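
// Scalar sketch (illustrative, not part of the build): each output pixel is
// the rounded average of a 2x2 block taken every src_stepx input pixels.
#if 0
static void ScaleARGBRowDownEvenBox_Sketch(const uint8_t* src_argb,
                                           ptrdiff_t src_stride,
                                           int src_stepx,
                                           uint8_t* dst_argb,
                                           int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* p = src_argb + (ptrdiff_t)x * src_stepx * 4;
    for (c = 0; c < 4; ++c) {
      int sum = p[c] + p[4 + c] + p[src_stride + c] + p[src_stride + 4 + c];
      dst_argb[4 * x + c] = (uint8_t)((sum + 2) >> 2);  // vrshrn #2
    }
  }
}
#endif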

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n)                 \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5, lsl #2             \n" \
  "add        %3, %3, %4                     \n" \
  "vld1.32    {" #dn "[" #n "]}, [%6]        \n"

void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  int tmp;
  const uint8_t* src_tmp = src_argb;
  asm volatile(
      "1:                                        \n"
      // clang-format off
      LOAD1_DATA32_LANE(d0, 0)
      LOAD1_DATA32_LANE(d0, 1)
      LOAD1_DATA32_LANE(d1, 0)
      LOAD1_DATA32_LANE(d1, 1)
      LOAD1_DATA32_LANE(d2, 0)
      LOAD1_DATA32_LANE(d2, 1)
      LOAD1_DATA32_LANE(d3, 0)
      LOAD1_DATA32_LANE(d3, 1)
      // clang-format on
      "vst1.32    {q0, q1}, [%0]!                \n"  // store pixels
      "subs       %2, %2, #8                     \n"  // 8 processed per loop
      "bgt        1b                             \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x),          // %3
        "+r"(dx),         // %4
        "=&r"(tmp),       // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "q0", "q1");
}

#undef LOAD1_DATA32_LANE
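
// The x/dx walk above is 16.16 fixed point: each LOAD1_DATA32_LANE selects
// the pixel at x >> 16, then advances x by dx. A scalar sketch (illustrative,
// not part of the build):
#if 0
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];  // integer part selects the source pixel
    x += dx;                // 16.16 fixed-point step
  }
}
#endif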

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n)                       \
  "lsr        %5, %3, #16                                \n" \
  "add        %6, %1, %5, lsl #2                         \n" \
  "add        %3, %3, %4                                 \n" \
  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"

void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  asm volatile(
      "vdup.32    q0, %3                         \n"  // x
      "vdup.32    q1, %4                         \n"  // dx
      "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
      "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
      "vmul.s32   q1, q1, q2                     \n"
      "vmov.i8    q3, #0x7f                      \n"  // 0x7F
      "vmov.i16   q15, #0x7f                     \n"  // 0x7F
      // x, x + 1 * dx, x + 2 * dx, x + 3 * dx
      "vadd.s32   q8, q1, q0                     \n"
      "1:                                        \n"
      // d0, d1: a
      // d2, d3: b
      LOAD2_DATA32_LANE(d0, d2, 0)
      LOAD2_DATA32_LANE(d0, d2, 1)
      LOAD2_DATA32_LANE(d1, d3, 0)
      LOAD2_DATA32_LANE(d1, d3, 1)
      "vshrn.i32  d22, q8, #9                    \n"
      "vand.16    d22, d22, d30                  \n"
      "vdup.8     d24, d22[0]                    \n"
      "vdup.8     d25, d22[2]                    \n"
      "vdup.8     d26, d22[4]                    \n"
      "vdup.8     d27, d22[6]                    \n"
      "vext.8     d4, d24, d25, #4               \n"
      "vext.8     d5, d26, d27, #4               \n"  // f
      "veor.8     q10, q2, q3                    \n"  // 0x7f ^ f
      "vmull.u8   q11, d0, d20                   \n"
      "vmull.u8   q12, d1, d21                   \n"
      "vmull.u8   q13, d2, d4                    \n"
      "vmull.u8   q14, d3, d5                    \n"
      "vadd.i16   q11, q11, q13                  \n"
      "vadd.i16   q12, q12, q14                  \n"
      "vshrn.i16  d0, q11, #7                    \n"
      "vshrn.i16  d1, q12, #7                    \n"

      "vst1.32    {d0, d1}, [%0]!                \n"  // store pixels
      "vadd.s32   q8, q8, q9                     \n"
      "subs       %2, %2, #4                     \n"  // 4 processed per loop
      "bgt        1b                             \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x),          // %3
        "+r"(dx),         // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

#undef LOAD2_DATA32_LANE
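
// Scalar sketch of the bilinear column filter (illustrative, not part of the
// build): f is the top 7 bits of the 16-bit fraction of x (vshrn #9, vand
// 0x7f), and each channel blends the two neighboring pixels as
// (a * (127 - f) + b * f) >> 7, matching the veor-with-0x7f and
// vmull/vshrn sequence above (0x7f ^ f == 127 - f for f in [0, 127]).
#if 0
static void ScaleARGBFilterCols_Sketch(uint8_t* dst_argb,
                                       const uint8_t* src_argb,
                                       int dst_width,
                                       int x,
                                       int dx) {
  int i, c;
  for (i = 0; i < dst_width; ++i) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;  // 7-bit blend fraction
    const uint8_t* a = src_argb + 4 * xi;
    const uint8_t* b = a + 4;
    for (c = 0; c < 4; ++c) {
      dst_argb[4 * i + c] = (uint8_t)((a[c] * (0x7f ^ f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}
#endif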

void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld2.16    {d0, d2}, [%0]!                \n"  // load 8 UV pixels.
      "vld2.16    {d1, d3}, [%0]!                \n"  // load next 8 UV
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vst1.16    {q1}, [%1]!                    \n"  // store 8 UV
      "bgt        1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1");
}

void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst,
                                int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld2.16    {d0, d2}, [%0]!                \n"  // load 8 UV pixels.
      "vld2.16    {d1, d3}, [%0]!                \n"  // load next 8 UV
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vrhadd.u8  q0, q0, q1                     \n"  // rounding half add
      "vst1.16    {q0}, [%1]!                    \n"  // store 8 UV
      "bgt        1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1");
}
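
// Scalar sketch covering the two UV kernels above (illustrative, not part of
// the build; the combined `average` flag is hypothetical): Down2 keeps the
// odd UV pairs, Down2Linear averages each pair of UV pixels per byte with
// rounding.
#if 0
static void ScaleUVRowDown2_Sketch(const uint8_t* src_uv,
                                   uint8_t* dst_uv,
                                   int dst_width,
                                   int average) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 2; ++c) {  // U then V
      int even = src_uv[4 * x + c];
      int odd = src_uv[4 * x + 2 + c];
      dst_uv[2 * x + c] = (uint8_t)(average ? (even + odd + 1) >> 1 : odd);
    }
  }
}
#endif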

void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %1, %0                     \n"
      "1:                                        \n"
      "vld2.8     {d0, d2}, [%0]!                \n"  // load 8 UV pixels.
      "vld2.8     {d1, d3}, [%0]!                \n"  // load next 8 UV
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vpaddl.u8  q0, q0                         \n"  // U 16 bytes -> 8 shorts.
      "vpaddl.u8  q1, q1                         \n"  // V 16 bytes -> 8 shorts.
      "vld2.8     {d16, d18}, [%1]!              \n"  // load 8 more UV
      "vld2.8     {d17, d19}, [%1]!              \n"  // load last 8 UV
      "vpadal.u8  q0, q8                         \n"  // U 16 bytes -> 8 shorts.
      "vpadal.u8  q1, q9                         \n"  // V 16 bytes -> 8 shorts.
      "vrshrn.u16 d0, q0, #2                     \n"  // round and pack to bytes
      "vrshrn.u16 d1, q1, #2                     \n"
      "vst2.8     {d0, d1}, [%2]!                \n"
      "bgt        1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "q0", "q1", "q8", "q9");
}
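
// Scalar sketch (illustrative, not part of the build): the same 2x2 box
// average as the ARGB version, applied to interleaved 2-byte UV pixels.
#if 0
static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_uv,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_uv,
                                      int dst_width) {
  const uint8_t* row1 = src_uv + src_stride;
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 2; ++c) {  // U then V
      int sum = src_uv[4 * x + c] + src_uv[4 * x + 2 + c] +
                row1[4 * x + c] + row1[4 * x + 2 + c];
      dst_uv[2 * x + c] = (uint8_t)((sum + 2) >> 2);  // round, divide by 4
    }
  }
}
#endif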

// Reads 4 pixels at a time.
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             int src_stepx,  // pixel step
                             uint8_t* dst_ptr,
                             int dst_width) {
  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld1.16    {d0[0]}, [%0], %6              \n"
      "vld1.16    {d0[1]}, [%1], %6              \n"
      "vld1.16    {d0[2]}, [%2], %6              \n"
      "vld1.16    {d0[3]}, [%3], %6              \n"
      "subs       %5, %5, #4                     \n"  // 4 pixels per loop.
      "vst1.8     {d0}, [%4]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src_ptr),      // %0
        "+r"(src1_ptr),     // %1
        "+r"(src2_ptr),     // %2
        "+r"(src3_ptr),     // %3
        "+r"(dst_ptr),      // %4
        "+r"(dst_width)     // %5
      : "r"(src_stepx * 8)  // %6: 4 pixels * step * 2 bytes per UV pixel
      : "memory", "cc", "d0");
}
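
// Scalar sketch (illustrative, not part of the build): copy every
// src_stepx-th UV pair; the four staggered pointers above unroll this by 4.
#if 0
static void ScaleUVRowDownEven_Sketch(const uint8_t* src_uv,
                                      int src_stepx,
                                      uint8_t* dst_uv,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_uv[2 * x + 0] = src_uv[2 * x * src_stepx + 0];  // U
    dst_uv[2 * x + 1] = src_uv[2 * x * src_stepx + 1];  // V
  }
}
#endif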

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif