xref: /aosp_15_r20/external/webp/src/dsp/dec_neon.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of dsp functions and loop filtering.
11 //
12 // Authors: Somnath Banerjee ([email protected])
13 //          Johann Koenig ([email protected])
14 
15 #include "src/dsp/dsp.h"
16 
17 #if defined(WEBP_USE_NEON)
18 
19 #include "src/dsp/neon.h"
20 #include "src/dec/vp8i_dec.h"
21 
22 //------------------------------------------------------------------------------
23 // NxM Loading functions
24 
25 #if !defined(WORK_AROUND_GCC)
26 
27 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28 // (register alloc, probably). The variants somewhat mitigate the problem, but
29 // not quite. HFilter16i() remains problematic.
Load4x8_NEON(const uint8_t * const src,int stride)30 static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31                                             int stride) {
32   const uint8x8_t zero = vdup_n_u8(0);
33   uint8x8x4_t out;
34   INIT_VECTOR4(out, zero, zero, zero, zero);
35   out = vld4_lane_u8(src + 0 * stride, out, 0);
36   out = vld4_lane_u8(src + 1 * stride, out, 1);
37   out = vld4_lane_u8(src + 2 * stride, out, 2);
38   out = vld4_lane_u8(src + 3 * stride, out, 3);
39   out = vld4_lane_u8(src + 4 * stride, out, 4);
40   out = vld4_lane_u8(src + 5 * stride, out, 5);
41   out = vld4_lane_u8(src + 6 * stride, out, 6);
42   out = vld4_lane_u8(src + 7 * stride, out, 7);
43   return out;
44 }
45 
Load4x16_NEON(const uint8_t * const src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)46 static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47                                       uint8x16_t* const p1,
48                                       uint8x16_t* const p0,
49                                       uint8x16_t* const q0,
50                                       uint8x16_t* const q1) {
51   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
52   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
53   const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
54   const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
55   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
56   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
57   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
58   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
59 }
60 
61 #else  // WORK_AROUND_GCC
62 
63 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
64   (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
65   src += stride;                                                     \
66 } while (0)
67 
Load4x16_NEON(const uint8_t * src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)68 static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69                                       uint8x16_t* const p1,
70                                       uint8x16_t* const p0,
71                                       uint8x16_t* const q0,
72                                       uint8x16_t* const q1) {
73   const uint32x4_t zero = vdupq_n_u32(0);
74   uint32x4x4_t in;
75   INIT_VECTOR4(in, zero, zero, zero, zero);
76   src -= 2;
77   LOADQ_LANE_32b(in.val[0], 0);
78   LOADQ_LANE_32b(in.val[1], 0);
79   LOADQ_LANE_32b(in.val[2], 0);
80   LOADQ_LANE_32b(in.val[3], 0);
81   LOADQ_LANE_32b(in.val[0], 1);
82   LOADQ_LANE_32b(in.val[1], 1);
83   LOADQ_LANE_32b(in.val[2], 1);
84   LOADQ_LANE_32b(in.val[3], 1);
85   LOADQ_LANE_32b(in.val[0], 2);
86   LOADQ_LANE_32b(in.val[1], 2);
87   LOADQ_LANE_32b(in.val[2], 2);
88   LOADQ_LANE_32b(in.val[3], 2);
89   LOADQ_LANE_32b(in.val[0], 3);
90   LOADQ_LANE_32b(in.val[1], 3);
91   LOADQ_LANE_32b(in.val[2], 3);
92   LOADQ_LANE_32b(in.val[3], 3);
93   // Transpose four 4x4 parts:
94   {
95     const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
96                                         vreinterpretq_u8_u32(in.val[1]));
97     const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
98                                         vreinterpretq_u8_u32(in.val[3]));
99     const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
100                                          vreinterpretq_u16_u8(row23.val[0]));
101     const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
102                                          vreinterpretq_u16_u8(row23.val[1]));
103     *p1 = vreinterpretq_u8_u16(row02.val[0]);
104     *p0 = vreinterpretq_u8_u16(row13.val[0]);
105     *q0 = vreinterpretq_u8_u16(row02.val[1]);
106     *q1 = vreinterpretq_u8_u16(row13.val[1]);
107   }
108 }
109 #undef LOADQ_LANE_32b
110 
111 #endif  // !WORK_AROUND_GCC
112 
Load8x16_NEON(const uint8_t * const src,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)113 static WEBP_INLINE void Load8x16_NEON(
114     const uint8_t* const src, int stride,
115     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117     uint8x16_t* const q2, uint8x16_t* const q3) {
118   Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
119   Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
120 }
121 
Load16x4_NEON(const uint8_t * const src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)122 static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123                                       uint8x16_t* const p1,
124                                       uint8x16_t* const p0,
125                                       uint8x16_t* const q0,
126                                       uint8x16_t* const q1) {
127   *p1 = vld1q_u8(src - 2 * stride);
128   *p0 = vld1q_u8(src - 1 * stride);
129   *q0 = vld1q_u8(src + 0 * stride);
130   *q1 = vld1q_u8(src + 1 * stride);
131 }
132 
Load16x8_NEON(const uint8_t * const src,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)133 static WEBP_INLINE void Load16x8_NEON(
134     const uint8_t* const src, int stride,
135     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137     uint8x16_t* const q2, uint8x16_t* const q3) {
138   Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
139   Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
140 }
141 
Load8x8x2_NEON(const uint8_t * const u,const uint8_t * const v,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)142 static WEBP_INLINE void Load8x8x2_NEON(
143     const uint8_t* const u, const uint8_t* const v, int stride,
144     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146     uint8x16_t* const q2, uint8x16_t* const q3) {
147   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148   // and the v-samples on the higher half.
149   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
150   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
151   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
152   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
153   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
154   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
155   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
156   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
157 }
158 
159 #if !defined(WORK_AROUND_GCC)
160 
161 #define LOAD_UV_8(ROW) \
162   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
163 
Load8x8x2T_NEON(const uint8_t * const u,const uint8_t * const v,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)164 static WEBP_INLINE void Load8x8x2T_NEON(
165     const uint8_t* const u, const uint8_t* const v, int stride,
166     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
167     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
168     uint8x16_t* const q2, uint8x16_t* const q3) {
169   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
170   // and the v-samples on the higher half.
171   const uint8x16_t row0 = LOAD_UV_8(0);
172   const uint8x16_t row1 = LOAD_UV_8(1);
173   const uint8x16_t row2 = LOAD_UV_8(2);
174   const uint8x16_t row3 = LOAD_UV_8(3);
175   const uint8x16_t row4 = LOAD_UV_8(4);
176   const uint8x16_t row5 = LOAD_UV_8(5);
177   const uint8x16_t row6 = LOAD_UV_8(6);
178   const uint8x16_t row7 = LOAD_UV_8(7);
179   // Perform two side-by-side 8x8 transposes
180   // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
181   // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
182   // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
183   // u30 u31 u32 u33 u34 u35 u36 u37 | ...
184   // u40 u41 u42 u43 u44 u45 u46 u47 | ...
185   // u50 u51 u52 u53 u54 u55 u56 u57 | ...
186   // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
187   // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
188   const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
189                                                     // u01 u11 u03 u13 ...
190   const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
191                                                     // u21 u31 u23 u33 ...
192   const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
193   const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
194   const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
195                                        vreinterpretq_u16_u8(row23.val[0]));
196   const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
197                                        vreinterpretq_u16_u8(row23.val[1]));
198   const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
199                                        vreinterpretq_u16_u8(row67.val[0]));
200   const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
201                                        vreinterpretq_u16_u8(row67.val[1]));
202   const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
203                                        vreinterpretq_u32_u16(row46.val[0]));
204   const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
205                                        vreinterpretq_u32_u16(row46.val[1]));
206   const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
207                                        vreinterpretq_u32_u16(row57.val[0]));
208   const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
209                                        vreinterpretq_u32_u16(row57.val[1]));
210   *p3 = vreinterpretq_u8_u32(row04.val[0]);
211   *p2 = vreinterpretq_u8_u32(row15.val[0]);
212   *p1 = vreinterpretq_u8_u32(row26.val[0]);
213   *p0 = vreinterpretq_u8_u32(row37.val[0]);
214   *q0 = vreinterpretq_u8_u32(row04.val[1]);
215   *q1 = vreinterpretq_u8_u32(row15.val[1]);
216   *q2 = vreinterpretq_u8_u32(row26.val[1]);
217   *q3 = vreinterpretq_u8_u32(row37.val[1]);
218 }
219 #undef LOAD_UV_8
220 
221 #endif  // !WORK_AROUND_GCC
222 
Store2x8_NEON(const uint8x8x2_t v,uint8_t * const dst,int stride)223 static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224                                       uint8_t* const dst, int stride) {
225   vst2_lane_u8(dst + 0 * stride, v, 0);
226   vst2_lane_u8(dst + 1 * stride, v, 1);
227   vst2_lane_u8(dst + 2 * stride, v, 2);
228   vst2_lane_u8(dst + 3 * stride, v, 3);
229   vst2_lane_u8(dst + 4 * stride, v, 4);
230   vst2_lane_u8(dst + 5 * stride, v, 5);
231   vst2_lane_u8(dst + 6 * stride, v, 6);
232   vst2_lane_u8(dst + 7 * stride, v, 7);
233 }
234 
Store2x16_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const dst,int stride)235 static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236                                        uint8_t* const dst, int stride) {
237   uint8x8x2_t lo, hi;
238   lo.val[0] = vget_low_u8(p0);
239   lo.val[1] = vget_low_u8(q0);
240   hi.val[0] = vget_high_u8(p0);
241   hi.val[1] = vget_high_u8(q0);
242   Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243   Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
244 }
245 
246 #if !defined(WORK_AROUND_GCC)
Store4x8_NEON(const uint8x8x4_t v,uint8_t * const dst,int stride)247 static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248                                       uint8_t* const dst, int stride) {
249   vst4_lane_u8(dst + 0 * stride, v, 0);
250   vst4_lane_u8(dst + 1 * stride, v, 1);
251   vst4_lane_u8(dst + 2 * stride, v, 2);
252   vst4_lane_u8(dst + 3 * stride, v, 3);
253   vst4_lane_u8(dst + 4 * stride, v, 4);
254   vst4_lane_u8(dst + 5 * stride, v, 5);
255   vst4_lane_u8(dst + 6 * stride, v, 6);
256   vst4_lane_u8(dst + 7 * stride, v, 7);
257 }
258 
Store4x16_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const dst,int stride)259 static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260                                        const uint8x16_t q0, const uint8x16_t q1,
261                                        uint8_t* const dst, int stride) {
262   uint8x8x4_t lo, hi;
263   INIT_VECTOR4(lo,
264                vget_low_u8(p1), vget_low_u8(p0),
265                vget_low_u8(q0), vget_low_u8(q1));
266   INIT_VECTOR4(hi,
267                vget_high_u8(p1), vget_high_u8(p0),
268                vget_high_u8(q0), vget_high_u8(q1));
269   Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270   Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
271 }
272 #endif  // !WORK_AROUND_GCC
273 
Store16x2_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const dst,int stride)274 static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275                                        uint8_t* const dst, int stride) {
276   vst1q_u8(dst - stride, p0);
277   vst1q_u8(dst, q0);
278 }
279 
Store16x4_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const dst,int stride)280 static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281                                        const uint8x16_t q0, const uint8x16_t q1,
282                                        uint8_t* const dst, int stride) {
283   Store16x2_NEON(p1, p0, dst - stride, stride);
284   Store16x2_NEON(q0, q1, dst + stride, stride);
285 }
286 
Store8x2x2_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const u,uint8_t * const v,int stride)287 static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288                                         const uint8x16_t q0,
289                                         uint8_t* const u, uint8_t* const v,
290                                         int stride) {
291   // p0 and q0 contain the u+v samples packed in low/high halves.
292   vst1_u8(u - stride, vget_low_u8(p0));
293   vst1_u8(u,          vget_low_u8(q0));
294   vst1_u8(v - stride, vget_high_u8(p0));
295   vst1_u8(v,          vget_high_u8(q0));
296 }
297 
Store8x4x2_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const u,uint8_t * const v,int stride)298 static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299                                         const uint8x16_t p0,
300                                         const uint8x16_t q0,
301                                         const uint8x16_t q1,
302                                         uint8_t* const u, uint8_t* const v,
303                                         int stride) {
304   // The p1...q1 registers contain the u+v samples packed in low/high halves.
305   Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306   Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307 }
308 
309 #if !defined(WORK_AROUND_GCC)
310 
311 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
312   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
313   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
314   (DST) += stride;                                \
315 } while (0)
316 
Store6x8x2_NEON(const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,uint8_t * u,uint8_t * v,int stride)317 static WEBP_INLINE void Store6x8x2_NEON(
318     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320     uint8_t* u, uint8_t* v, int stride) {
321   uint8x8x3_t u0, u1, v0, v1;
322   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326   STORE6_LANE(u, u0, u1, 0);
327   STORE6_LANE(u, u0, u1, 1);
328   STORE6_LANE(u, u0, u1, 2);
329   STORE6_LANE(u, u0, u1, 3);
330   STORE6_LANE(u, u0, u1, 4);
331   STORE6_LANE(u, u0, u1, 5);
332   STORE6_LANE(u, u0, u1, 6);
333   STORE6_LANE(u, u0, u1, 7);
334   STORE6_LANE(v, v0, v1, 0);
335   STORE6_LANE(v, v0, v1, 1);
336   STORE6_LANE(v, v0, v1, 2);
337   STORE6_LANE(v, v0, v1, 3);
338   STORE6_LANE(v, v0, v1, 4);
339   STORE6_LANE(v, v0, v1, 5);
340   STORE6_LANE(v, v0, v1, 6);
341   STORE6_LANE(v, v0, v1, 7);
342 }
343 #undef STORE6_LANE
344 
Store4x8x2_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const u,uint8_t * const v,int stride)345 static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346                                         const uint8x16_t p0,
347                                         const uint8x16_t q0,
348                                         const uint8x16_t q1,
349                                         uint8_t* const u, uint8_t* const v,
350                                         int stride) {
351   uint8x8x4_t u0, v0;
352   INIT_VECTOR4(u0,
353                vget_low_u8(p1), vget_low_u8(p0),
354                vget_low_u8(q0), vget_low_u8(q1));
355   INIT_VECTOR4(v0,
356                vget_high_u8(p1), vget_high_u8(p0),
357                vget_high_u8(q0), vget_high_u8(q1));
358   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
359   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
360   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
361   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
362   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
363   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
364   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
365   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
366   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
367   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
368   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
369   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
370   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
371   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
372   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
373   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
374 }
375 
376 #endif  // !WORK_AROUND_GCC
377 
378 // Zero extend 'v' to an int16x8_t.
ConvertU8ToS16_NEON(uint8x8_t v)379 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380   return vreinterpretq_s16_u16(vmovl_u8(v));
381 }
382 
383 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384 // to the corresponding rows of 'dst'.
SaturateAndStore4x4_NEON(uint8_t * const dst,const int16x8_t dst01,const int16x8_t dst23)385 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386                                                  const int16x8_t dst01,
387                                                  const int16x8_t dst23) {
388   // Unsigned saturate to 8b.
389   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391 
392   // Store the results.
393   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
397 }
398 
Add4x4_NEON(const int16x8_t row01,const int16x8_t row23,uint8_t * const dst)399 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400                                     const int16x8_t row23,
401                                     uint8_t* const dst) {
402   uint32x2_t dst01 = vdup_n_u32(0);
403   uint32x2_t dst23 = vdup_n_u32(0);
404 
405   // Load the source pixels.
406   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
410 
411   {
412     // Convert to 16b.
413     const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414     const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415 
416     // Descale with rounding.
417     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
418     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
419     // Add the inverse transform.
420     SaturateAndStore4x4_NEON(dst, out01, out23);
421   }
422 }
423 
424 //-----------------------------------------------------------------------------
425 // Simple In-loop filtering (Paragraph 15.2)
426 
// Returns a per-lane mask that is all-ones where
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// and zero elsewhere. The sum is computed with saturating 8-bit arithmetic
// and 'thresh' is truncated to 8 bits.
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   int thresh) {
  const uint8x16_t diff_p0q0 = vabdq_u8(p0, q0);
  const uint8x16_t diff_p1q1 = vabdq_u8(p1, q1);
  const uint8x16_t twice_p0q0 = vqaddq_u8(diff_p0q0, diff_p0q0);
  const uint8x16_t half_p1q1 = vshrq_n_u8(diff_p1q1, 1);
  const uint8x16_t activity = vqaddq_u8(twice_p0q0, half_p1q1);
  const uint8x16_t limit = vdupq_n_u8((uint8_t)thresh);
  return vcgeq_u8(limit, activity);
}
439 
FlipSign_NEON(const uint8x16_t v)440 static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
442   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443 }
444 
FlipSignBack_NEON(const int8x16_t v)445 static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446   const int8x16_t sign_bit = vdupq_n_s8(0x80);
447   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448 }
449 
GetBaseDelta_NEON(const int8x16_t p1,const int8x16_t p0,const int8x16_t q0,const int8x16_t q1)450 static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451                                    const int8x16_t q0, const int8x16_t q1) {
452   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
453   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
454   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
455   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
456   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
457   return s3;
458 }
459 
// Computes 3 * (q0 - p0) per lane on sign-flipped samples, using saturating
// int8 arithmetic at every step.
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t step = vqsubq_s8(q0, p0);       // (q0 - p0)
  const int8x16_t twice = vqaddq_s8(step, step);  // 2 * (q0 - p0)
  return vqaddq_s8(step, twice);                  // 3 * (q0 - p0)
}
466 
467 //------------------------------------------------------------------------------
468 
// Applies the 2-pixel filter on sign-flipped samples, leaving the outputs in
// the signed domain:
//   *op0 = p0s + ((delta + 3) >> 3)
//   *oq0 = q0s - ((delta + 4) >> 3)
// All additions/subtractions saturate to int8; the shifts are arithmetic.
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
                                    const int8x16_t delta,
                                    int8x16_t* const op0,
                                    int8x16_t* const oq0) {
  const int8x16_t three = vdupq_n_s8(0x03);
  const int8x16_t four = vdupq_n_s8(0x04);
  const int8x16_t adjust_p0 = vshrq_n_s8(vqaddq_s8(delta, three), 3);
  const int8x16_t adjust_q0 = vshrq_n_s8(vqaddq_s8(delta, four), 3);
  *op0 = vqaddq_s8(p0s, adjust_p0);
  *oq0 = vqsubq_s8(q0s, adjust_q0);
}
482 
483 #if defined(WEBP_USE_INTRINSICS)
484 
// Applies the 2-pixel filter on sign-flipped samples and converts the results
// back to unsigned pixels:
//   *op0 = unflip(p0s + ((delta + 3) >> 3))
//   *oq0 = unflip(q0s - ((delta + 4) >> 3))
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
                              const int8x16_t delta,
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
  int8x16_t filtered_p0, filtered_q0;
  // The signed-domain arithmetic is shared with the no-flip variant.
  ApplyFilter2NoFlip_NEON(p0s, q0s, delta, &filtered_p0, &filtered_q0);
  *op0 = FlipSignBack_NEON(filtered_p0);
  *oq0 = FlipSignBack_NEON(filtered_q0);
}
499 
// Runs the simple filter on 16 edge positions at once. The samples are moved
// to the signed domain, the base delta (p1 - q1) + 3 * (q0 - p0) is computed
// and zeroed in the lanes where 'mask' is clear, and the filtered p0/q0 are
// returned as unsigned pixels in *op0 / *oq0.
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           const uint8x16_t mask,
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t signed_p1 = FlipSign_NEON(p1);
  const int8x16_t signed_p0 = FlipSign_NEON(p0);
  const int8x16_t signed_q0 = FlipSign_NEON(q0);
  const int8x16_t signed_q1 = FlipSign_NEON(q1);
  const int8x16_t base_delta =
      GetBaseDelta_NEON(signed_p1, signed_p0, signed_q0, signed_q1);
  // A zero delta shifts to zero, so masked-out lanes pass through unchanged.
  const int8x16_t masked_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(mask));
  ApplyFilter2_NEON(signed_p0, signed_q0, masked_delta, op0, oq0);
}
512 
SimpleVFilter16_NEON(uint8_t * p,int stride,int thresh)513 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514   uint8x16_t p1, p0, q0, q1, op0, oq0;
515   Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516   {
517     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519   }
520   Store16x2_NEON(op0, oq0, p, stride);
521 }
522 
SimpleHFilter16_NEON(uint8_t * p,int stride,int thresh)523 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524   uint8x16_t p1, p0, q0, q1, oq0, op0;
525   Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526   {
527     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529   }
530   Store2x16_NEON(op0, oq0, p, stride);
531 }
532 
533 #else
534 
535 // Load/Store vertical edge
// LOAD8x4 expands to a string of eight "vld4.8" single-lane loads, one for
// each of lanes 0..7 of the four registers c1..c4. The address operand
// alternates between b1 (even lanes) and b2 (odd lanes), and 'stride' is
// pasted verbatim as the trailing operand of every load.
// NOTE(review): b1/b2 are presumably "[reg]" address operands and 'stride' a
// post-increment register; the call sites are outside this view -- confirm.
536 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
537   "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
538   "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
539   "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
540   "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
541   "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
542   "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
543   "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
544   "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
545 
// STORE8x2 expands to a string of eight "vst2.8" single-lane stores, one for
// each of lanes 0..7 of the register pair c1/c2. Every store uses address
// operand 'p' followed by 'stride', both pasted verbatim.
// NOTE(review): 'p' is presumably a "[reg]" operand post-incremented by
// 'stride'; the call sites are outside this view -- confirm.
546 #define STORE8x2(c1, c2, p, stride)                                            \
547   "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
548   "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
549   "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
550   "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
551   "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
552   "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
553   "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
554   "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
555 
// Comma-separated list of NEON quad-register names (q0-q3 and q8-q15).
// NOTE(review): presumably used as the clobber list of the asm statements
// below; the uses are outside this view -- confirm.
556 #define QRegs "q0", "q1", "q2", "q3",                                          \
557               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558 
// XORs registers 'a' and 'b' in place with register 's'. DO_FILTER2 passes a
// register filled with 0x80 as 's', which toggles the sign bit of every byte.
559 #define FLIP_SIGN_BIT2(a, b, s)                                                \
560   "veor     " #a "," #a "," #s "               \n"                             \
561   "veor     " #b "," #b "," #s "               \n"                             \
562 
// Four-register form of FLIP_SIGN_BIT2: XORs 'a', 'b', 'c' and 'd' in place
// with register 's'.
563 #define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
564   FLIP_SIGN_BIT2(a, b, s)                                                      \
565   FLIP_SIGN_BIT2(c, d, s)                                                      \
566 
// Computes the filter mask into register 'mask': a byte lane is all-ones
// where 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh (saturating unsigned
// arithmetic). 'thresh' is broadcast to every byte lane with vdup.8.
// Overwrites q14 and q15 as scratch registers.
567 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
568   "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
569   "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
570   "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
571   "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
572   "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
573   "vdup.8     q14, " #thresh "            \n"  /* broadcast thresh */          \
574   "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (sum <= thresh) */
575 
// Computes o = (p1 - q1) + 3 * (q0 - p0) on signed bytes, saturating at every
// step. Overwrites q15 as a scratch register holding (q0 - p0).
576 #define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
577   "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
578   "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
579   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
580   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
581   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
582 
// Applies the filter value 'fl' to the two pixels next to the edge:
//   p0 += (fl + 3) >> 3   and   q0 -= (fl + 4) >> 3
// All additions/subtractions are signed and saturating. q15 is overwritten as
// a scratch register.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
593 
// Simple loop filter: updates the two pixels next to the edge (p0 and q0).
// Scratch registers: q9 (mask, then masked filter value), q10 (sign bit),
// q11 (filter value), plus q14/q15 inside the helper macros.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* q9 = filter mask */          \
  "vmov.i8    q10, #0x80                  \n"  /* q10 = sign bit */            \
  FLIP_SIGN_BIT2(p1, p0, q10)                  /* to signed: p1, p0 */         \
  FLIP_SIGN_BIT2(q0, q1, q10)                  /* to signed: q0, q1 */         \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* q11 = filter level */        \
  "vand       q9, q9, q11                 \n"  /* keep it where mask is set */ \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* update p0 and q0 */          \
  FLIP_SIGN_BIT2(p0, q0, q10)                  /* back to unsigned */
603 
SimpleVFilter16_NEON(uint8_t * p,int stride,int thresh)604 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
605   __asm__ volatile (
606     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
607 
608     "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
609     "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
610     "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
611     "vld1.u8    {q12}, [%[p]]                  \n"  // q1
612 
613     DO_FILTER2(q1, q2, q3, q12, %[thresh])
614 
615     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
616 
617     "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
618     "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
619     : [p] "+r"(p)
620     : [stride] "r"(stride), [thresh] "r"(thresh)
621     : "memory", QRegs
622   );
623 }
624 
SimpleHFilter16_NEON(uint8_t * p,int stride,int thresh)625 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
626   __asm__ volatile (
627     "sub        r4, %[p], #2                   \n"  // base1 = p - 2
628     "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
629     "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
630 
631     LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
632     LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
633     "vswp       d3, d24                        \n"  // p1:q1 p0:q3
634     "vswp       d5, d26                        \n"  // q0:q2 q1:q4
635     "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
636 
637     DO_FILTER2(q1, q2, q12, q13, %[thresh])
638 
639     "sub        %[p], %[p], #1                 \n"  // p - 1
640 
641     "vswp        d5, d24                       \n"
642     STORE8x2(d4, d5, [%[p]], %[stride])
643     STORE8x2(d24, d25, [%[p]], %[stride])
644 
645     : [p] "+r"(p)
646     : [stride] "r"(stride), [thresh] "r"(thresh)
647     : "memory", "r4", "r5", "r6", QRegs
648   );
649 }
650 
651 #undef LOAD8x4
652 #undef STORE8x2
653 
654 #endif    // WEBP_USE_INTRINSICS
655 
// Runs SimpleVFilter16_NEON() on the three horizontal edges located 4, 8 and
// 12 rows below 'p'.
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleVFilter16_NEON(p + edge * 4 * stride, stride, thresh);
  }
}
663 
// Runs SimpleHFilter16_NEON() on the three vertical edges located 4, 8 and
// 12 columns to the right of 'p'.
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleHFilter16_NEON(p + edge * 4, stride, thresh);
  }
}
671 
672 //------------------------------------------------------------------------------
673 // Complex In-loop filtering (Paragraph 15.3)
674 
// Returns a per-lane mask that is set where
//   max(abs(p1 - p0), abs(q1 - q0)) > hev_thresh
// (hev_thresh is truncated to 8 bits).
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                const uint8x16_t q0, const uint8x16_t q1,
                                int hev_thresh) {
  const uint8x16_t largest_step =
      vmaxq_u8(vabdq_u8(p1, p0), vabdq_u8(q1, q0));
  const uint8x16_t limit = vdupq_n_u8((uint8_t)hev_thresh);
  return vcgtq_u8(largest_step, limit);
}
685 
// Returns a per-lane mask that is set where both conditions hold:
//  - none of the six differences between neighbouring pixels (p3-p2, p2-p1,
//    p1-p0, q1-q0, q2-q1, q3-q2) exceeds ithresh (truncated to 8 bits),
//  - NeedsFilter_NEON(p1, p0, q0, q1, thresh) selects the lane.
static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
                                    const uint8x16_t p1, const uint8x16_t p0,
                                    const uint8x16_t q0, const uint8x16_t q1,
                                    const uint8x16_t q2, const uint8x16_t q3,
                                    int ithresh, int thresh) {
  // Pairwise maxima of the absolute differences, reduced as a small tree.
  const uint8x16_t max_a = vmaxq_u8(vabdq_u8(p3, p2), vabdq_u8(p2, p1));
  const uint8x16_t max_b = vmaxq_u8(vabdq_u8(p1, p0), vabdq_u8(q3, q2));
  const uint8x16_t max_c = vmaxq_u8(vabdq_u8(q2, q1), vabdq_u8(q1, q0));
  const uint8x16_t max_all = vmaxq_u8(vmaxq_u8(max_a, max_b), max_c);
  const uint8x16_t limit = vdupq_n_u8((uint8_t)ithresh);
  const uint8x16_t interior_ok = vcgeq_u8(limit, max_all);
  const uint8x16_t edge_ok = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
  return vandq_u8(edge_ok, interior_ok);
}
708 
709 //  4-points filter
710 
// Applies the 4-point filter to the sign-flipped pixels p1, p0, q0, q1 using
// the filter value 'delta0'. The four results go through FlipSignBack_NEON()
// before being stored in *op1, *op0, *oq0 and *oq1.
static void ApplyFilter4_NEON(
    const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1,
    const int8x16_t delta0,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  // a1 = (delta0 + 4) >> 3 and a2 = (delta0 + 3) >> 3 (saturating additions).
  const int8x16_t a1 = vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(0x04)), 3);
  const int8x16_t a2 = vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(0x03)), 3);
  // a3 = (a1 + 1) >> 1, obtained with a rounding shift.
  const int8x16_t a3 = vrshrq_n_s8(a1, 1);
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
}
729 
// Filters the four pixels p1, p0, q0, q1 around an edge.
// Lanes selected by both 'mask' and 'hev_mask' get the simple 2-pixel filter;
// lanes selected by 'mask' only get the 4-point filter. The two steps are
// chained: the 4-point step works on the p0/q0 values produced by the first.
static void DoFilter4_NEON(
    const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const uint8x16_t hev_lanes = vandq_u8(mask, hev_mask);
  // (mask & hev_mask) ^ mask == mask & !hev_mask
  const uint8x16_t smooth_lanes = veorq_u8(hev_lanes, mask);
  const int8x16_t base_delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
  const int8x16_t hev_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(hev_lanes));
  int8x16_t smooth_delta;

  // Simple loop filter on the pixels with hev; p0s/q0s are updated in place.
  ApplyFilter2NoFlip_NEON(p0s, q0s, hev_delta, &p0s, &q0s);

  // Complex loop filter on the pixels without hev, computed from the updated
  // p0s/q0s.
  smooth_delta = vandq_s8(GetBaseDelta0_NEON(p0s, q0s),
                          vreinterpretq_s8_u8(smooth_lanes));
  ApplyFilter4_NEON(p1s, p0s, q0s, q1s, smooth_delta, op1, op0, oq0, oq1);
}
761 
762 //  6-points filter
763 
// Applies the 6-point filter to the sign-flipped pixels p2..q2 using the
// filter value 'delta'. Each result goes through FlipSignBack_NEON() before
// being stored in the matching output (*op2 .. *oq2).
static void ApplyFilter6_NEON(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int16x8_t kCstm1 = vdupq_n_s16(-1);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  // The pixels next to the edge move by a1, the next ones by a2, the outer
  // ones by a3; p-side pixels are increased, q-side pixels decreased.
  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}
800 
// Filters the six pixels p2..q2 around an edge.
// Lanes selected by both 'mask' and 'hev_mask' get the simple 2-pixel filter;
// lanes selected by 'mask' only get the 6-point filter. Both steps use the
// same base delta, computed from the unfiltered pixels.
static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  const int8x16_t p2s = FlipSign_NEON(p2);
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);
  int8x16_t q0s = FlipSign_NEON(q0);
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t q2s = FlipSign_NEON(q2);
  const uint8x16_t hev_lanes = vandq_u8(mask, hev_mask);
  // (mask & hev_mask) ^ mask == mask & !hev_mask
  const uint8x16_t smooth_lanes = veorq_u8(hev_lanes, mask);
  const int8x16_t base_delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
  const int8x16_t hev_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(hev_lanes));
  const int8x16_t smooth_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(smooth_lanes));

  // Simple loop filter on the pixels with hev; p0s/q0s are updated in place.
  ApplyFilter2NoFlip_NEON(p0s, q0s, hev_delta, &p0s, &q0s);

  // Complex loop filter on the pixels without hev.
  ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, smooth_delta,
                    op2, op1, op0, oq0, oq1, oq2);
}
834 
835 // on macroblock edges
836 
VFilter16_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)837 static void VFilter16_NEON(uint8_t* p, int stride,
838                            int thresh, int ithresh, int hev_thresh) {
839   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840   Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841   {
842     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843                                               ithresh, thresh);
844     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
848     Store16x2_NEON(op2, op1, p - 2 * stride, stride);
849     Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
850     Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
851   }
852 }
853 
HFilter16_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)854 static void HFilter16_NEON(uint8_t* p, int stride,
855                            int thresh, int ithresh, int hev_thresh) {
856   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857   Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858   {
859     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860                                               ithresh, thresh);
861     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
865     Store2x16_NEON(op2, op1, p - 2, stride);
866     Store2x16_NEON(op0, oq0, p + 0, stride);
867     Store2x16_NEON(oq1, oq2, p + 2, stride);
868   }
869 }
870 
871 // on three inner edges
VFilter16i_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)872 static void VFilter16i_NEON(uint8_t* p, int stride,
873                             int thresh, int ithresh, int hev_thresh) {
874   uint32_t k;
875   uint8x16_t p3, p2, p1, p0;
876   Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
877   for (k = 3; k != 0; --k) {
878     uint8x16_t q0, q1, q2, q3;
879     p += 4 * stride;
880     Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
881     {
882       const uint8x16_t mask =
883           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885       // p3 and p2 are not just temporary variables here: they will be
886       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
887       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888       Store16x4_NEON(p1, p0, p3, p2, p, stride);
889       p1 = q2;
890       p0 = q3;
891     }
892   }
893 }
894 
895 #if !defined(WORK_AROUND_GCC)
HFilter16i_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)896 static void HFilter16i_NEON(uint8_t* p, int stride,
897                             int thresh, int ithresh, int hev_thresh) {
898   uint32_t k;
899   uint8x16_t p3, p2, p1, p0;
900   Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
901   for (k = 3; k != 0; --k) {
902     uint8x16_t q0, q1, q2, q3;
903     p += 4;
904     Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
905     {
906       const uint8x16_t mask =
907           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910       Store4x16_NEON(p1, p0, p3, p2, p, stride);
911       p1 = q2;
912       p0 = q3;
913     }
914   }
915 }
916 #endif  // !WORK_AROUND_GCC
917 
918 // 8-pixels wide variant, for chroma filtering
VFilter8_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)919 static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
920                           int thresh, int ithresh, int hev_thresh) {
921   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
922   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
923   {
924     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
925                                               ithresh, thresh);
926     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
927     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
928     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
929                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
930     Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
931     Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
932     Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
933   }
934 }
VFilter8i_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)935 static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
936                            int thresh, int ithresh, int hev_thresh) {
937   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
938   u += 4 * stride;
939   v += 4 * stride;
940   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
941   {
942     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
943                                               ithresh, thresh);
944     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
945     uint8x16_t op1, op0, oq0, oq1;
946     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
947     Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
948   }
949 }
950 
951 #if !defined(WORK_AROUND_GCC)
HFilter8_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)952 static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
953                           int thresh, int ithresh, int hev_thresh) {
954   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
955   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
956   {
957     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
958                                               ithresh, thresh);
959     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
960     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
961     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
962                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
963     Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
964   }
965 }
966 
HFilter8i_NEON(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)967 static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
968                            int thresh, int ithresh, int hev_thresh) {
969   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
970   u += 4;
971   v += 4;
972   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
973   {
974     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
975                                               ithresh, thresh);
976     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
977     uint8x16_t op1, op0, oq0, oq1;
978     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
979     Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
980   }
981 }
982 #endif  // !WORK_AROUND_GCC
983 
984 //-----------------------------------------------------------------------------
985 // Inverse transforms (Paragraph 14.4)
986 
// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
// changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, the high bit is set so treating it as signed will give incorrect
// results. We avoid this by down shifting by 1 here to clear the highest bit.
// Combined with the doubling effect of vqdmulh we get >> 16.
// This cannot be applied to kC1 because the lowest bit is set. Down shifting
// the constant would reduce precision.

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same high-bit issue with kC1 and vqdmulh that we work around for kC2 by
// down shifting it. The NEON code therefore keeps the 16-bit kC1 and adds the
// input back explicitly after the multiplication.
1002 
1003 static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
1004 static const int16_t kC2 =
1005     WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
1006 
1007 #if defined(WEBP_USE_INTRINSICS)
Transpose8x2_NEON(const int16x8_t in0,const int16x8_t in1,int16x8x2_t * const out)1008 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1009                                           const int16x8_t in1,
1010                                           int16x8x2_t* const out) {
1011   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
1012   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
1013   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
1014                                                   // b0 d0 b1 d1 b2 d2 ...
1015   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1016 }
1017 
TransformPass_NEON(int16x8x2_t * const rows)1018 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1019   // {rows} = in0 | in4
1020   //          in8 | in12
1021   // B1 = in4 | in12
1022   const int16x8_t B1 =
1023       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1024   // C0 = kC1 * in4 | kC1 * in12
1025   // C1 = kC2 * in4 | kC2 * in12
1026   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1027   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1028   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1029                                 vget_low_s16(rows->val[1]));   // in0 + in8
1030   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1031                                 vget_low_s16(rows->val[1]));   // in0 - in8
1032   // c = kC2 * in4 - kC1 * in12
1033   // d = kC1 * in4 + kC2 * in12
1034   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1035   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1036   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
1037   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
1038   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
1039   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
1040   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1041   Transpose8x2_NEON(E0, E1, rows);
1042 }
1043 
TransformOne_NEON(const int16_t * in,uint8_t * dst)1044 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1045   int16x8x2_t rows;
1046   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1047   TransformPass_NEON(&rows);
1048   TransformPass_NEON(&rows);
1049   Add4x4_NEON(rows.val[0], rows.val[1], dst);
1050 }
1051 
1052 #else
1053 
TransformOne_NEON(const int16_t * in,uint8_t * dst)1054 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1055   const int kBPS = BPS;
1056   // kC1, kC2. Padded because vld1.16 loads 8 bytes
1057   const int16_t constants[4] = { kC1, kC2, 0, 0 };
1058   /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1059   __asm__ volatile (
1060     "vld1.16         {q1, q2}, [%[in]]           \n"
1061     "vld1.16         {d0}, [%[constants]]        \n"
1062 
1063     /* d2: in[0]
1064      * d3: in[8]
1065      * d4: in[4]
1066      * d5: in[12]
1067      */
1068     "vswp            d3, d4                      \n"
1069 
1070     /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1071      * q9 = {in[4], in[12]} * kC2 >> 16
1072      */
1073     "vqdmulh.s16     q8, q2, d0[0]               \n"
1074     "vqdmulh.s16     q9, q2, d0[1]               \n"
1075 
1076     /* d22 = a = in[0] + in[8]
1077      * d23 = b = in[0] - in[8]
1078      */
1079     "vqadd.s16       d22, d2, d3                 \n"
1080     "vqsub.s16       d23, d2, d3                 \n"
1081 
1082     /* The multiplication should be x * kC1 >> 16
1083      * However, with vqdmulh we get x * kC1 * 2 >> 16
1084      * (multiply, double, return high half)
1085      * We avoided this in kC2 by pre-shifting the constant.
1086      * q8 = in[4]/[12] * kC1 >> 16
1087      */
1088     "vshr.s16        q8, q8, #1                  \n"
1089 
1090     /* Add {in[4], in[12]} back after the multiplication. This is handled by
1091      * adding 1 << 16 to kC1 in the libwebp C code.
1092      */
1093     "vqadd.s16       q8, q2, q8                  \n"
1094 
1095     /* d20 = c = in[4]*kC2 - in[12]*kC1
1096      * d21 = d = in[4]*kC1 + in[12]*kC2
1097      */
1098     "vqsub.s16       d20, d18, d17               \n"
1099     "vqadd.s16       d21, d19, d16               \n"
1100 
1101     /* d2 = tmp[0] = a + d
1102      * d3 = tmp[1] = b + c
1103      * d4 = tmp[2] = b - c
1104      * d5 = tmp[3] = a - d
1105      */
1106     "vqadd.s16       d2, d22, d21                \n"
1107     "vqadd.s16       d3, d23, d20                \n"
1108     "vqsub.s16       d4, d23, d20                \n"
1109     "vqsub.s16       d5, d22, d21                \n"
1110 
1111     "vzip.16         q1, q2                      \n"
1112     "vzip.16         q1, q2                      \n"
1113 
1114     "vswp            d3, d4                      \n"
1115 
1116     /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1117      * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1118      */
1119     "vqdmulh.s16     q8, q2, d0[0]               \n"
1120     "vqdmulh.s16     q9, q2, d0[1]               \n"
1121 
1122     /* d22 = a = tmp[0] + tmp[8]
1123      * d23 = b = tmp[0] - tmp[8]
1124      */
1125     "vqadd.s16       d22, d2, d3                 \n"
1126     "vqsub.s16       d23, d2, d3                 \n"
1127 
1128     /* See long winded explanations prior */
1129     "vshr.s16        q8, q8, #1                  \n"
1130     "vqadd.s16       q8, q2, q8                  \n"
1131 
1132     /* d20 = c = in[4]*kC2 - in[12]*kC1
1133      * d21 = d = in[4]*kC1 + in[12]*kC2
1134      */
1135     "vqsub.s16       d20, d18, d17               \n"
1136     "vqadd.s16       d21, d19, d16               \n"
1137 
1138     /* d2 = tmp[0] = a + d
1139      * d3 = tmp[1] = b + c
1140      * d4 = tmp[2] = b - c
1141      * d5 = tmp[3] = a - d
1142      */
1143     "vqadd.s16       d2, d22, d21                \n"
1144     "vqadd.s16       d3, d23, d20                \n"
1145     "vqsub.s16       d4, d23, d20                \n"
1146     "vqsub.s16       d5, d22, d21                \n"
1147 
1148     "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
1149     "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
1150     "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
1151     "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
1152 
1153     "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
1154 
1155     /* (val) + 4 >> 3 */
1156     "vrshr.s16       d2, d2, #3                  \n"
1157     "vrshr.s16       d3, d3, #3                  \n"
1158     "vrshr.s16       d4, d4, #3                  \n"
1159     "vrshr.s16       d5, d5, #3                  \n"
1160 
1161     "vzip.16         q1, q2                      \n"
1162     "vzip.16         q1, q2                      \n"
1163 
1164     /* Must accumulate before saturating */
1165     "vmovl.u8        q8, d6                      \n"
1166     "vmovl.u8        q9, d7                      \n"
1167 
1168     "vqadd.s16       q1, q1, q8                  \n"
1169     "vqadd.s16       q2, q2, q9                  \n"
1170 
1171     "vqmovun.s16     d0, q1                      \n"
1172     "vqmovun.s16     d1, q2                      \n"
1173 
1174     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1175     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1176     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1177     "vst1.32         d1[1], [%[dst]]             \n"
1178 
1179     : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
1180     : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
1181     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
1182   );
1183 }
1184 
1185 #endif    // WEBP_USE_INTRINSICS
1186 
// Transforms the block at 'in' into 'dst' and, when 'do_two' is non-zero,
// also the next block (in + 16) into the pixels four columns to the right.
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (!do_two) return;
  TransformOne_NEON(in + 16, dst + 4);
}
1193 
TransformDC_NEON(const int16_t * in,uint8_t * dst)1194 static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1195   const int16x8_t DC = vdupq_n_s16(in[0]);
1196   Add4x4_NEON(DC, DC, dst);
1197 }
1198 
1199 //------------------------------------------------------------------------------
1200 
// Writes lane 'col' of the four vectors rows.val[0..3] to dst[0], dst[16],
// dst[32] and dst[48], and leaves 'dst' advanced by 64 elements.
// 'dst' must be a modifiable pointer lvalue and 'col' a constant lane index.
// Every argument is parenthesized so that expressions can be passed safely.
#define STORE_WHT(dst, col, rows) do {                          \
  *(dst) = vgetq_lane_s32((rows).val[0], (col)); (dst) += 16;   \
  *(dst) = vgetq_lane_s32((rows).val[1], (col)); (dst) += 16;   \
  *(dst) = vgetq_lane_s32((rows).val[2], (col)); (dst) += 16;   \
  *(dst) = vgetq_lane_s32((rows).val[3], (col)); (dst) += 16;   \
} while (0)
1207 
TransformWHT_NEON(const int16_t * in,int16_t * out)1208 static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
1209   int32x4x4_t tmp;
1210 
1211   {
1212     // Load the source.
1213     const int16x4_t in00_03 = vld1_s16(in + 0);
1214     const int16x4_t in04_07 = vld1_s16(in + 4);
1215     const int16x4_t in08_11 = vld1_s16(in + 8);
1216     const int16x4_t in12_15 = vld1_s16(in + 12);
1217     const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
1218     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
1219     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
1220     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
1221     tmp.val[0] = vaddq_s32(a0, a1);
1222     tmp.val[1] = vaddq_s32(a3, a2);
1223     tmp.val[2] = vsubq_s32(a0, a1);
1224     tmp.val[3] = vsubq_s32(a3, a2);
1225     // Arrange the temporary results column-wise.
1226     tmp = Transpose4x4_NEON(tmp);
1227   }
1228 
1229   {
1230     const int32x4_t kCst3 = vdupq_n_s32(3);
1231     const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
1232     const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1233     const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1234     const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1235     const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1236 
1237     tmp.val[0] = vaddq_s32(a0, a1);
1238     tmp.val[1] = vaddq_s32(a3, a2);
1239     tmp.val[2] = vsubq_s32(a0, a1);
1240     tmp.val[3] = vsubq_s32(a3, a2);
1241 
1242     // right shift the results by 3.
1243     tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1244     tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1245     tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1246     tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1247 
1248     STORE_WHT(out, 0, tmp);
1249     STORE_WHT(out, 1, tmp);
1250     STORE_WHT(out, 2, tmp);
1251     STORE_WHT(out, 3, tmp);
1252   }
1253 }
1254 
1255 #undef STORE_WHT
1256 
1257 //------------------------------------------------------------------------------
1258 
TransformAC3_NEON(const int16_t * in,uint8_t * dst)1259 static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
1260   const int16x4_t A = vld1_dup_s16(in);
1261   const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
1262   const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
1263   const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
1264   const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
1265   const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
1266                       (uint64_t)( c1 & 0xffff) << 16 |
1267                       (uint64_t)(-c1 & 0xffff) << 32 |
1268                       (uint64_t)(-d1 & 0xffff) << 48;
1269   const int16x4_t CD = vcreate_s16(cd);
1270   const int16x4_t B = vqadd_s16(A, CD);
1271   const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1272   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1273   Add4x4_NEON(m0_m1, m2_m3, dst);
1274 }
1275 
1276 //------------------------------------------------------------------------------
1277 // 4x4
1278 
DC4_NEON(uint8_t * dst)1279 static void DC4_NEON(uint8_t* dst) {    // DC
1280   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
1281   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
1282   const uint16x4_t p1 = vpadd_u16(p0, p0);
1283   const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1284   const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1285   const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1286   const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1287   const uint16x8_t s0 = vaddl_u8(L0, L1);
1288   const uint16x8_t s1 = vaddl_u8(L2, L3);
1289   const uint16x8_t s01 = vaddq_u16(s0, s1);
1290   const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
1291   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
1292   const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1293   int i;
1294   for (i = 0; i < 4; ++i) {
1295     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
1296   }
1297 }
1298 
1299 // TrueMotion (4x4 + 8x8)
TrueMotion_NEON(uint8_t * dst,int size)1300 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1301   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1302   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
1303   const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
1304   int y;
1305   for (y = 0; y < size; y += 4) {
1306     // left edge
1307     const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1308     const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1309     const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1310     const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1311     const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
1312     const int16x8_t r1 = vaddq_s16(L1, d);
1313     const int16x8_t r2 = vaddq_s16(L2, d);
1314     const int16x8_t r3 = vaddq_s16(L3, d);
1315     // Saturate and store the result.
1316     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1317     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1318     const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1319     const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1320     if (size == 4) {
1321       vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1322       vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1323       vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1324       vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1325     } else {
1326       vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1327       vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1328       vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1329       vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1330     }
1331     dst += 4 * BPS;
1332   }
1333 }
1334 
// 4x4 entry point: TrueMotion_NEON() with a block size of 4.
static void TM4_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 4);
}
1336 
VE4_NEON(uint8_t * dst)1337 static void VE4_NEON(uint8_t* dst) {    // vertical
1338   // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
1339   const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
1340   const uint64x1_t A1 = vshr_n_u64(A0, 8);
1341   const uint64x1_t A2 = vshr_n_u64(A0, 16);
1342   const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
1343   const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
1344   const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
1345   const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
1346   const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
1347   int i;
1348   for (i = 0; i < 4; ++i) {
1349     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
1350   }
1351 }
1352 
RD4_NEON(uint8_t * dst)1353 static void RD4_NEON(uint8_t* dst) {   // Down-right
1354   const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1355   const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1356   const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
1357   const uint32_t I = dst[-1 + 0 * BPS];
1358   const uint32_t J = dst[-1 + 1 * BPS];
1359   const uint32_t K = dst[-1 + 2 * BPS];
1360   const uint32_t L = dst[-1 + 3 * BPS];
1361   const uint64x1_t LKJI____ =
1362       vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
1363   const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1364   const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
1365   const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
1366   const uint8_t D = vget_lane_u8(XABCD_u8, 4);
1367   const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
1368   const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1369   const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1370   const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1371   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1372   const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1373   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1374   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1375   const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1376   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1377   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1378   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1379   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1380 }
1381 
LD4_NEON(uint8_t * dst)1382 static void LD4_NEON(uint8_t* dst) {    // Down-left
1383   // Note using the same shift trick as VE4() is slower here.
1384   const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
1385   const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
1386   const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
1387   const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
1388   const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
1389   const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
1390   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1391   const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
1392   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1393   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1394   const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1395   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1396   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1397   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1398   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1399 }
1400 
1401 //------------------------------------------------------------------------------
1402 // Chroma
1403 
VE8uv_NEON(uint8_t * dst)1404 static void VE8uv_NEON(uint8_t* dst) {    // vertical
1405   const uint8x8_t top = vld1_u8(dst - BPS);
1406   int j;
1407   for (j = 0; j < 8; ++j) {
1408     vst1_u8(dst + j * BPS, top);
1409   }
1410 }
1411 
HE8uv_NEON(uint8_t * dst)1412 static void HE8uv_NEON(uint8_t* dst) {    // horizontal
1413   int j;
1414   for (j = 0; j < 8; ++j) {
1415     const uint8x8_t left = vld1_dup_u8(dst - 1);
1416     vst1_u8(dst, left);
1417     dst += BPS;
1418   }
1419 }
1420 
DC8_NEON(uint8_t * dst,int do_top,int do_left)1421 static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
1422   uint16x8_t sum_top;
1423   uint16x8_t sum_left;
1424   uint8x8_t dc0;
1425 
1426   if (do_top) {
1427     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
1428 #if WEBP_AARCH64
1429     const uint16_t p2 = vaddlv_u8(A);
1430     sum_top = vdupq_n_u16(p2);
1431 #else
1432     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
1433     const uint16x4_t p1 = vpadd_u16(p0, p0);
1434     const uint16x4_t p2 = vpadd_u16(p1, p1);
1435     sum_top = vcombine_u16(p2, p2);
1436 #endif
1437   }
1438 
1439   if (do_left) {
1440     const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1441     const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1442     const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1443     const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1444     const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
1445     const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
1446     const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
1447     const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
1448     const uint16x8_t s0 = vaddl_u8(L0, L1);
1449     const uint16x8_t s1 = vaddl_u8(L2, L3);
1450     const uint16x8_t s2 = vaddl_u8(L4, L5);
1451     const uint16x8_t s3 = vaddl_u8(L6, L7);
1452     const uint16x8_t s01 = vaddq_u16(s0, s1);
1453     const uint16x8_t s23 = vaddq_u16(s2, s3);
1454     sum_left = vaddq_u16(s01, s23);
1455   }
1456 
1457   if (do_top && do_left) {
1458     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1459     dc0 = vrshrn_n_u16(sum, 4);
1460   } else if (do_top) {
1461     dc0 = vrshrn_n_u16(sum_top, 3);
1462   } else if (do_left) {
1463     dc0 = vrshrn_n_u16(sum_left, 3);
1464   } else {
1465     dc0 = vdup_n_u8(0x80);
1466   }
1467 
1468   {
1469     const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1470     int i;
1471     for (i = 0; i < 8; ++i) {
1472       vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
1473     }
1474   }
1475 }
1476 
// 8x8 DC fill using both the top row and the left column.
static void DC8uv_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}
// 8x8 DC fill using the left column only.
static void DC8uvNoTop_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}
// 8x8 DC fill using the top row only.
static void DC8uvNoLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}
// 8x8 DC fill using neither edge (constant value).
static void DC8uvNoTopLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1481 
// 8x8 entry point: TrueMotion_NEON() with a block size of 8.
static void TM8uv_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 8);
}
1483 
1484 //------------------------------------------------------------------------------
1485 // 16x16
1486 
VE16_NEON(uint8_t * dst)1487 static void VE16_NEON(uint8_t* dst) {     // vertical
1488   const uint8x16_t top = vld1q_u8(dst - BPS);
1489   int j;
1490   for (j = 0; j < 16; ++j) {
1491     vst1q_u8(dst + j * BPS, top);
1492   }
1493 }
1494 
HE16_NEON(uint8_t * dst)1495 static void HE16_NEON(uint8_t* dst) {     // horizontal
1496   int j;
1497   for (j = 0; j < 16; ++j) {
1498     const uint8x16_t left = vld1q_dup_u8(dst - 1);
1499     vst1q_u8(dst, left);
1500     dst += BPS;
1501   }
1502 }
1503 
DC16_NEON(uint8_t * dst,int do_top,int do_left)1504 static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
1505   uint16x8_t sum_top;
1506   uint16x8_t sum_left;
1507   uint8x8_t dc0;
1508 
1509   if (do_top) {
1510     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
1511 #if WEBP_AARCH64
1512     const uint16_t p3 = vaddlvq_u8(A);
1513     sum_top = vdupq_n_u16(p3);
1514 #else
1515     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
1516     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
1517     const uint16x4_t p2 = vpadd_u16(p1, p1);
1518     const uint16x4_t p3 = vpadd_u16(p2, p2);
1519     sum_top = vcombine_u16(p3, p3);
1520 #endif
1521   }
1522 
1523   if (do_left) {
1524     int i;
1525     sum_left = vdupq_n_u16(0);
1526     for (i = 0; i < 16; i += 8) {
1527       const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
1528       const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
1529       const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
1530       const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
1531       const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
1532       const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
1533       const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
1534       const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
1535       const uint16x8_t s0 = vaddl_u8(L0, L1);
1536       const uint16x8_t s1 = vaddl_u8(L2, L3);
1537       const uint16x8_t s2 = vaddl_u8(L4, L5);
1538       const uint16x8_t s3 = vaddl_u8(L6, L7);
1539       const uint16x8_t s01 = vaddq_u16(s0, s1);
1540       const uint16x8_t s23 = vaddq_u16(s2, s3);
1541       const uint16x8_t sum = vaddq_u16(s01, s23);
1542       sum_left = vaddq_u16(sum_left, sum);
1543     }
1544   }
1545 
1546   if (do_top && do_left) {
1547     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1548     dc0 = vrshrn_n_u16(sum, 5);
1549   } else if (do_top) {
1550     dc0 = vrshrn_n_u16(sum_top, 4);
1551   } else if (do_left) {
1552     dc0 = vrshrn_n_u16(sum_left, 4);
1553   } else {
1554     dc0 = vdup_n_u8(0x80);
1555   }
1556 
1557   {
1558     const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
1559     int i;
1560     for (i = 0; i < 16; ++i) {
1561       vst1q_u8(dst + i * BPS, dc);
1562     }
1563   }
1564 }
1565 
// 16x16 DC fill using both the top row and the left column.
static void DC16TopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}
// 16x16 DC fill using the left column only.
static void DC16NoTop_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}
// 16x16 DC fill using the top row only.
static void DC16NoLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}
// 16x16 DC fill using neither edge (constant value).
static void DC16NoTopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1570 
TM16_NEON(uint8_t * dst)1571 static void TM16_NEON(uint8_t* dst) {
1572   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1573   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
1574   // A[c] - A[-1]
1575   const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
1576   const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
1577   int y;
1578   for (y = 0; y < 16; y += 4) {
1579     // left edge
1580     const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1581     const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1582     const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1583     const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1584     const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
1585     const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
1586     const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
1587     const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
1588     const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
1589     const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
1590     const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
1591     const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
1592     // Saturate and store the result.
1593     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
1594     const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
1595     const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
1596     const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
1597     vst1q_u8(dst + 0 * BPS, row0);
1598     vst1q_u8(dst + 1 * BPS, row1);
1599     vst1q_u8(dst + 2 * BPS, row2);
1600     vst1q_u8(dst + 3 * BPS, row3);
1601     dst += 4 * BPS;
1602   }
1603 }
1604 
1605 //------------------------------------------------------------------------------
1606 // Entry point
1607 
1608 extern void VP8DspInitNEON(void);
1609 
VP8DspInitNEON(void)1610 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1611   VP8Transform = TransformTwo_NEON;
1612   VP8TransformAC3 = TransformAC3_NEON;
1613   VP8TransformDC = TransformDC_NEON;
1614   VP8TransformWHT = TransformWHT_NEON;
1615 
1616   VP8VFilter16 = VFilter16_NEON;
1617   VP8VFilter16i = VFilter16i_NEON;
1618   VP8HFilter16 = HFilter16_NEON;
1619 #if !defined(WORK_AROUND_GCC)
1620   VP8HFilter16i = HFilter16i_NEON;
1621 #endif
1622   VP8VFilter8 = VFilter8_NEON;
1623   VP8VFilter8i = VFilter8i_NEON;
1624 #if !defined(WORK_AROUND_GCC)
1625   VP8HFilter8 = HFilter8_NEON;
1626   VP8HFilter8i = HFilter8i_NEON;
1627 #endif
1628   VP8SimpleVFilter16 = SimpleVFilter16_NEON;
1629   VP8SimpleHFilter16 = SimpleHFilter16_NEON;
1630   VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
1631   VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
1632 
1633   VP8PredLuma4[0] = DC4_NEON;
1634   VP8PredLuma4[1] = TM4_NEON;
1635   VP8PredLuma4[2] = VE4_NEON;
1636   VP8PredLuma4[4] = RD4_NEON;
1637   VP8PredLuma4[6] = LD4_NEON;
1638 
1639   VP8PredLuma16[0] = DC16TopLeft_NEON;
1640   VP8PredLuma16[1] = TM16_NEON;
1641   VP8PredLuma16[2] = VE16_NEON;
1642   VP8PredLuma16[3] = HE16_NEON;
1643   VP8PredLuma16[4] = DC16NoTop_NEON;
1644   VP8PredLuma16[5] = DC16NoLeft_NEON;
1645   VP8PredLuma16[6] = DC16NoTopLeft_NEON;
1646 
1647   VP8PredChroma8[0] = DC8uv_NEON;
1648   VP8PredChroma8[1] = TM8uv_NEON;
1649   VP8PredChroma8[2] = VE8uv_NEON;
1650   VP8PredChroma8[3] = HE8uv_NEON;
1651   VP8PredChroma8[4] = DC8uvNoTop_NEON;
1652   VP8PredChroma8[5] = DC8uvNoLeft_NEON;
1653   VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
1654 }
1655 
1656 #else  // !WEBP_USE_NEON
1657 
1658 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1659 
1660 #endif  // WEBP_USE_NEON
1661