/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)

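// Fallback definitions of the missing xN intrinsics for the deficient
// toolchains detected above. Each one simply performs N consecutive
// whole-vector loads; e.g. vld1q_u8_x2(p) yields the same result as
//   uint8x16x2_t r = { { vld1q_u8(p), vld1q_u8(p + 16) } };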
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
  return res;
}

static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9

// vld1q_u16_x4 is defined from GCC 8.5.0 onwards.
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif  // defined(__GNUC__) && !defined(__clang__)

static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
}

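// Load 2 rows of 8 bytes and return them combined in a single 16-byte vector.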
static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}

// Load four bytes into the low half of a uint8x8_t, zero the upper half.
static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  return ret;
}

static inline uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  p += stride;
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
  return ret;
}

static inline uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
  uint16x4_t ret = vdup_n_u16(0);
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
  p += stride;
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
  return ret;
}

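// Each load_<type>_<N>x<M> helper below reads M rows of N elements, with
// consecutive rows separated by the pointer stride p.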
static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
}

static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
}

static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
}

static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
}

static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
}

static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
}

static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
}

static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
  s += p;
  *s11 = vld1_s16(s);
}

static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
}

static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
}

static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
}

static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
}

static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
}

static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
}

static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
}

static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

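// The store_<type>_<N>x<M> helpers below mirror the loads: they write M rows
// of N elements at the given stride.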
static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
}

static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
}

static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
}

static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3,
                                 const int16x4_t s4, const int16x4_t s5,
                                 const int16x4_t s6, const int16x4_t s7) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
  s += dst_stride;
  vst1_s16(s, s4);
  s += dst_stride;
  vst1_s16(s, s5);
  s += dst_stride;
  vst1_s16(s, s6);
  s += dst_stride;
  vst1_s16(s, s7);
}

static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
}

static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
  s += p;
  *s8 = vld1_u8(s);
  s += p;
  *s9 = vld1_u8(s);
  s += p;
  *s10 = vld1_u8(s);
}

static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
}

static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
}

static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}

static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}

static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}

static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}

static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}

static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

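// On AArch64 unaligned accesses are permitted, so the lane load below is safe
// even for a misaligned p. On 32-bit Arm the cast to uint32_t * may violate
// the alignment requirements of vld1_lane_u32, so go via memcpy instead.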
#if AOM_ARCH_AARCH64
#define load_unaligned_u32_2x1_lane(v, p, lane)              \
  do {                                                       \
    (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)               \
  do {                                                        \
    (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)
#else
#define load_unaligned_u32_2x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vset_lane_u32(tmp, (v), (lane));      \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vsetq_lane_u32(tmp, (v), (lane));     \
  } while (0)
#endif

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
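  // If the four rows are contiguous (stride == 4), a single 16-byte load
  // suffices.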
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

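// Load 2 sets of 2 bytes when alignment is not guaranteed.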
static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}

static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

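// Load 4 bytes and duplicate them into both halves of the vector.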
static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}

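// Load 2 bytes and duplicate them across all four 16-bit lanes of the vector.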
static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  a_u16 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u16);
}

static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

static inline void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}

static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
}

static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
}

static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}

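// Load 4 rows of 16 uint16s; each row is returned as a pair of vectors.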
static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + 8);
  s += p;
  *s2 = vld1q_u16(s);
  *s3 = vld1q_u16(s + 8);
  s += p;
  *s4 = vld1q_u16(s);
  *s5 = vld1q_u16(s + 8);
  s += p;
  *s6 = vld1q_u16(s);
  *s7 = vld1q_u16(s + 8);
}

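// Load 2 sets of 2 uint16s when alignment is not guaranteed.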
static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
                                                int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u16_u32(a_u32);
}

static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                uint32_t stride) {
  uint64_t a;
  uint64x2_t a_u64;

  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vdupq_n_u64(0);
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf,
                                               uint32_t stride) {
  int64_t a;
  int64x2_t a_s64;
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vdupq_n_s64(0);
  a_s64 = vsetq_lane_s64(a, a_s64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vsetq_lane_s64(a, a_s64, 1);
  return vreinterpretq_s16_s64(a_s64);
}

static inline void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint16x8_t *tu0, uint16x8_t *tu1) {
  *tu0 = load_unaligned_u16_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u16_4x2(buf, stride);
}

static inline void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static inline void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static inline void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static inline void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

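// tran_low_t is 32 bits wide; these helpers narrow to 16 bits on load and
// widen back on store.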
static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
  const int32x4_t v0 = vmovl_s16(a);
  vst1q_s32(buf, v0);
}

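// Gather 8 bytes from src at the byte offsets given by the lanes of indices.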
static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                              int16x8_t indices) {
  // Recent Clang and GCC versions correctly identify that this zero-broadcast
  // is redundant. Alternatively we could load and broadcast the zeroth element
  // and then replace the other lanes, however this is slower than loading a
  // single element without broadcast on some micro-architectures.
  uint8x8_t ret = vdup_n_u8(0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
  return ret;
}

// The `lane` parameter here must be an immediate.
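// Each macro extracts one lane to a scalar and writes it out with memcpy, so
// the destination pointer needs no particular alignment.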
#define store_u8_2x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                         \
  } while (0)

#define store_u8_4x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                         \
  } while (0)

#define store_u16_2x1_lane(dst, src, lane)                       \
  do {                                                           \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
    memcpy(dst, &a, 4);                                          \
  } while (0)

#define store_u16_4x1_lane(dst, src, lane)                         \
  do {                                                             \
    uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
    memcpy(dst, &a, 8);                                            \
  } while (0)

#define store_s16_4x1_lane(dst, src, lane)                        \
  do {                                                            \
    int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
    memcpy(dst, &a, 8);                                           \
  } while (0)

// Store the low 16 bits from a single vector.
static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
}

// Store the low 32 bits from a single vector.
static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
}

// Store two blocks of 16 bits from a single vector.
static inline void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
}

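// Store four blocks of 16 bits from a single vector.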
static inline void store_u8x2_strided_x4(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 2);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 3);
}

// Store two blocks of 32 bits from a single vector.
static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
  dst += stride;
  store_u8_4x1_lane(dst, src, 1);
}

// Store four blocks of 32 bits from a single vector.
static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  store_u8_4x1_lane(dst, vget_low_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_low_u8(src), 1);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 1);
}

// Store the low 32 bits from a single vector.
static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
}

// Store two blocks of 32 bits from a single vector.
static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_2x1_lane(dst, src, 1);
}

// Store two blocks of 64 bits from a single vector.
static inline void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x8_t src) {
  store_u16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_4x1_lane(dst, src, 1);
}

// Store two blocks of 64 bits from a single vector.
static inline void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
                                          int16x8_t src) {
  store_s16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_s16_4x1_lane(dst, src, 1);
}

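// The lane-store macros above are implementation details of these helpers;
// undefine them so they do not leak out of the header.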
#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_