// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for the subtraction to *ref. See the comments there for algorithmic
// explanations.

static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually (see note below).

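// Note on the constants: vqdmulhq_n_s16(x, k) computes (2 * x * k) >> 16 with
// saturation, which is why kC2 is pre-halved above. For kC1, the
// vsraq_n_s16(x, ., 1) in TransformPass_NEON below halves the doubled product
// and adds x back, yielding x + ((x * kC1) >> 16). As a scalar sketch
// (illustrative helpers, not library functions):
//   static int Mul1(int a) { return ((a * WEBP_TRANSFORM_AC3_C1) >> 16) + a; }
//   static int Mul2(int a) { return (a * WEBP_TRANSFORM_AC3_C2) >> 16; }
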
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* const ref,
                                    uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}
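
// Note: vrsraq_n_s16(d, r, 3) above computes d + ((r + 4) >> 3), i.e. the
// rounding descale of the transform output before saturation. Per pixel
// (clip_8b denoting the unsigned 8-bit saturation done at store time):
//   dst[i] = clip_8b(ref[i] + ((row[i] + 4) >> 3));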

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}
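
// A scalar sketch of one pass over a single column, using the Mul1()/Mul2()
// helpers sketched above (this mirrors the decoder's butterfly):
//   a = in[0] + in[8];
//   b = in[0] - in[8];
//   c = Mul2(in[4]) - Mul1(in[12]);
//   d = Mul1(in[4]) + Mul2(in[12]);
//   tmp[0] = a + d;  tmp[1] = b + c;  tmp[2] = b - c;  tmp[3] = a - d;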

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* ref,
                               const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    // q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // (val + 4) >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating.
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)                          // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)    // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"    // clobbered
  );
}

#endif    // WEBP_USE_INTRINSICS

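// Performs one or two inverse transforms. When 'do_two' is set, a second
// 4x4 block is processed: the coefficients at in + 16 are applied to the
// pixels four columns over (ref + 4, dst + 4).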
static void ITransform_NEON(const uint8_t* ref,
                            const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

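// For reference, a scalar sketch of the forward transform that both
// implementations below compute (this mirrors the scalar fallback; the
// rounding constants 1812/937 and 12000/51000 all appear in the code):
//   // pass 1 (rows, i = 0..3), with d0..d3 = src - ref along row i:
//   //   a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2, a3 = d0 - d3
//   tmp[0 + i * 4] = (a0 + a1) * 8;
//   tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
//   tmp[2 + i * 4] = (a0 - a1) * 8;
//   tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
//   // pass 2 (columns, i = 0..3):
//   //   a0 = tmp[0 + i] + tmp[12 + i], a1 = tmp[4 + i] + tmp[8 + i],
//   //   a2 = tmp[4 + i] - tmp[8 + i],  a3 = tmp[0 + i] - tmp[12 + i]
//   out[0 + i]  = (a0 + a1 + 7) >> 4;
//   out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
//   out[8 + i]  = (a0 - a1 + 7) >> 4;
//   out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
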
#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // first pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // second pass
    // the (1 << 16) addition is for the replacement: a3 != 0  <->  1 - (a3 == 0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values into q4 and q6.
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8 (d16 = 5352, d17 = 2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load the first half of coeff32 into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load the second half of coeff32 into q11 = 12000, q12 = 51000
    "vld1.32     {q11, q12}, [%[coeff32]]     \n"

    // part 1
    // Transpose. Register dN is the same as dN in the C code.
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n"   // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n"   // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n"   // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n"   // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n"   // a0 + a1
    "vshl.s16        d0, d0, #3               \n"   // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n"   // a0 - a1
    "vshl.s16        d2, d2, #3               \n"   // temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n"   // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n"   // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n"   // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n"   // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n"   // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n"   // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n"   // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n"   // a1 + 7
    "vsub.s16        d7, d0, d3               \n"   // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n"   // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n"   // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n"   // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n"   // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n"   // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n"   // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n"   // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000) >> 16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1 != 0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12] = (d1*2217 - c1*5352 + 51000) >> 16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]       \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)            // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                     // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"         // clobbered
  );
}

#endif

#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);   // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);   // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);   // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);   // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

#define LOAD_LANE_32b(src, VALUE, LANE)                                  \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
                         const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b
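
// For reference, a sketch of the value Disto4x4_NEON computes (this mirrors
// the scalar fallback's definition): with the weighted Hadamard sum
//   T(x) = sum over k of w[k] * |hadamard4x4(x)[k]|,
// the function returns |T(b) - T(a)| >> 5.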

static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

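// Collects a histogram of the forward-transformed residuals for blocks
// 'start_block'..'end_block': each coefficient is binned as
// min(|coeff| >> 3, MAX_COEFF_THRESH).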
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
                                  int start_block, int end_block,
                                  VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
                                             const uint8_t* const b,
                                             uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if WEBP_AARCH64
  return (int)vaddvq_u32(sum);
#else
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
  return (int)vget_lane_u32(sum3, 0);
#endif
}

static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

static int16x8_t Quantize_NEON(int16_t* const in,
                               const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);    // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);   // this function can't work as-is if QFIX != 16+1
  return c3;
}
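
// A scalar sketch of the per-coefficient quantization performed above
// (the halving add plus the narrowing >> 16 provide the total >> QFIX,
// with QFIX == 17):
//   sign   = (in[j] < 0);
//   coeff  = abs(in[j]) + mtx->sharpen_[j];
//   level  = min((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX, MAX_LEVEL);
//   out[j] = sign ? -level : level;   // returned in 'c3'
//   in[j]  = out[j] * mtx->q_[j];     // dequantized value written back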

static const uint8_t kShuffles[4][8] = {
  {  0,  1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};
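// Read as int16 indices, the byte pairs above select coefficients
// { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 } from the 32-byte
// concatenation of 'out0' and 'out1' below, i.e. the VP8 zigzag scan order.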

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

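// Quantizes two horizontally adjacent 4x4 blocks. The return value is a
// bitmask: bit n is set iff block 'n' has at least one non-zero coefficient.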
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif   // !WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

#endif  // WEBP_USE_NEON