/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)

static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
  return res;
}

static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9

// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif  // defined(__GNUC__) && !defined(__clang__)
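
// Illustrative sketch only (not part of the original interface), assuming
// `src` points to at least 32 readable bytes: with the fallbacks above,
// callers can use the xN intrinsics the same way whether they come from the
// compiler or from this header.
static inline uint8x16_t example_sum_two_u8x16(const uint8_t *src) {
  const uint8x16x2_t v = vld1q_u8_x2(src);
  return vaddq_u8(v.val[0], v.val[1]);
}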

static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}

// Load four bytes into the low half of a uint8x8_t, zero the upper half.
static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  return ret;
}

static inline uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  p += stride;
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
  return ret;
}

static inline uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
  uint16x4_t ret = vdup_n_u16(0);
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
  p += stride;
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
  return ret;
}

static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
}

static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
}

static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
}

static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
}

static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
}

static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
}

static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
  s += p;
  *s11 = vld1_s16(s);
}

static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
}

static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
}

static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
}

static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
}

static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
}

static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
}

static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
}

static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
}

static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
}

static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
}

static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3,
                                 const int16x4_t s4, const int16x4_t s5,
                                 const int16x4_t s6, const int16x4_t s7) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
  s += dst_stride;
  vst1_s16(s, s4);
  s += dst_stride;
  vst1_s16(s, s5);
  s += dst_stride;
  vst1_s16(s, s6);
  s += dst_stride;
  vst1_s16(s, s7);
}

static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
}

static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
  s += p;
  *s8 = vld1_u8(s);
  s += p;
  *s9 = vld1_u8(s);
  s += p;
  *s10 = vld1_u8(s);
}

static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
}

static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
}

static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}

static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}

static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}

static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}

static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}

static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

#if AOM_ARCH_AARCH64
#define load_unaligned_u32_2x1_lane(v, p, lane)                \
  do {                                                         \
    (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane));   \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)                \
  do {                                                         \
    (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane));  \
  } while (0)
#else
#define load_unaligned_u32_2x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vset_lane_u32(tmp, (v), (lane));      \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vsetq_lane_u32(tmp, (v), (lane));     \
  } while (0)
#endif
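
// A minimal usage sketch (an assumption, not part of the original header):
// gather two possibly unaligned 4-byte rows into one vector with the
// lane-load macros above. `buf` and `stride` are hypothetical caller values.
static inline uint8x8_t example_load_u8_4x2_lanes(const uint8_t *buf,
                                                  int stride) {
  uint32x2_t acc = vdup_n_u32(0);
  load_unaligned_u32_2x1_lane(acc, buf, 0);
  load_unaligned_u32_2x1_lane(acc, buf + stride, 1);
  return vreinterpret_u8_u32(acc);
}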

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}
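
// Illustrative only (assumed usage, not from the original source): average
// the top and bottom halves of a 4x4 block whose rows may be unaligned.
// `buf` and `stride` are hypothetical.
static inline uint8x8_t example_average_4x4_halves(const uint8_t *buf,
                                                   int stride) {
  const uint8x16_t block = load_unaligned_u8q(buf, stride);
  return vrhadd_u8(vget_low_u8(block), vget_high_u8(block));
}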

static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}

static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}

static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  uint16x4_t a_u32;

  memcpy(&a, buf, 2);
  a_u32 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u32);
}

static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

static inline void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}

static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
}

static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
}

static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}

static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + 8);
  s += p;
  *s2 = vld1q_u16(s);
  *s3 = vld1q_u16(s + 8);
  s += p;
  *s4 = vld1q_u16(s);
  *s5 = vld1q_u16(s + 8);
  s += p;
  *s6 = vld1q_u16(s);
  *s7 = vld1q_u16(s + 8);
}

static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
                                                int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u16_u32(a_u32);
}

static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                uint32_t stride) {
  uint64_t a;
  uint64x2_t a_u64;

  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vdupq_n_u64(0);
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf,
                                               uint32_t stride) {
  int64_t a;
  int64x2_t a_s64;
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vdupq_n_s64(0);
  a_s64 = vsetq_lane_s64(a, a_s64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vsetq_lane_s64(a, a_s64, 1);
  return vreinterpretq_s16_s64(a_s64);
}

static inline void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint16x8_t *tu0, uint16x8_t *tu1) {
  *tu0 = load_unaligned_u16_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u16_4x2(buf, stride);
}

static inline void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static inline void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static inline void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static inline void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
  const int32x4_t v0 = vmovl_s16(a);
  vst1q_s32(buf, v0);
}
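
// A hedged sketch (not part of the original header): stage an int16x8_t
// through tran_low_t storage and back, as a transform helper might do when
// spilling coefficients. `coeffs` is a hypothetical scratch buffer holding
// at least 8 tran_low_t values.
static inline int16x8_t example_tran_low_round_trip(tran_low_t *coeffs,
                                                    int16x8_t a) {
  store_s16q_to_tran_low(coeffs, a);
  return load_tran_low_to_s16q(coeffs);
}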

static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                              int16x8_t indices) {
  // Recent Clang and GCC versions correctly identify that this zero-broadcast
  // is redundant. Alternatively we could load and broadcast the zeroth element
  // and then replace the other lanes, however this is slower than loading a
  // single element without broadcast on some micro-architectures.
  uint8x8_t ret = vdup_n_u8(0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
  return ret;
}
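
// Usage sketch only (an illustration, not original code): gather eight bytes
// at per-lane horizontal offsets, e.g. for a nearest-neighbour horizontal
// resample. `src` and `offsets` are hypothetical; each offset must stay
// within the source row.
static inline uint8x8_t example_gather_row(const uint8_t *src,
                                           const int16_t *offsets) {
  const int16x8_t idx = vld1q_s16(offsets);
  return load_u8_gather_s16_x8(src, idx);
}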

// The `lane` parameter here must be an immediate.
#define store_u8_2x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                         \
  } while (0)

#define store_u8_4x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                         \
  } while (0)

#define store_u16_2x1_lane(dst, src, lane)                       \
  do {                                                           \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
    memcpy(dst, &a, 4);                                          \
  } while (0)

#define store_u16_4x1_lane(dst, src, lane)                         \
  do {                                                             \
    uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
    memcpy(dst, &a, 8);                                            \
  } while (0)

#define store_s16_4x1_lane(dst, src, lane)                        \
  do {                                                            \
    int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
    memcpy(dst, &a, 8);                                           \
  } while (0)

// Store the low 16-bits from a single vector.
static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
}

// Store the low 32-bits from a single vector.
static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
}

// Store two blocks of 16-bits from a single vector.
static inline void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
}

static inline void store_u8x2_strided_x4(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 2);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 3);
}

// Store two blocks of 32-bits from a single vector.
static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
  dst += stride;
  store_u8_4x1_lane(dst, src, 1);
}

// Store four blocks of 32-bits from a single vector.
static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  store_u8_4x1_lane(dst, vget_low_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_low_u8(src), 1);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 1);
}

// Store the low 32-bits from a single vector.
static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
}

// Store two blocks of 32-bits from a single vector.
static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_2x1_lane(dst, src, 1);
}

// Store two blocks of 64-bits from a single vector.
static inline void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x8_t src) {
  store_u16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_4x1_lane(dst, src, 1);
}

// Store two blocks of 64-bits from a single vector.
static inline void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
                                          int16x8_t src) {
  store_s16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_s16_4x1_lane(dst, src, 1);
}
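
// Illustrative only (assumed usage, not part of the original header): copy a
// 4x4 block of bytes by pairing an unaligned gather-load with the strided
// 4-byte scatter-store above. `dst`, `dst_stride`, `src` and `src_stride`
// are hypothetical.
static inline void example_copy_4x4(uint8_t *dst, ptrdiff_t dst_stride,
                                    const uint8_t *src, int src_stride) {
  const uint8x16_t rows = load_unaligned_u8q(src, src_stride);
  store_u8x4_strided_x4(dst, dst_stride, rows);
}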

#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_