/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_
#define VPX_VPX_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"

// Support for these xN intrinsics is lacking in older versions of GCC and in
// GCC targeting 32-bit Arm, hence the fallback implementations below.
#if defined(__GNUC__) && !defined(__clang__)
#if __GNUC__ < 8 || defined(__arm__)
static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}
#endif

#if __GNUC__ < 9 || defined(__arm__)
static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif
#endif

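// Create vectors from scalar constants. The scalars are packed into a 64-bit
// value with c0 in the least significant bits, so c0 lands in lane 0, c1 in
// lane 1, and so on.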
static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
                                          const int16_t c2, const int16_t c3) {
  return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
                     ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48));
}

static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) {
  return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32));
}

static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1,
                                          const int32_t c2, const int32_t c3) {
  return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3));
}

// Helper functions used to load tran_low_t into int16, narrowing if necessary.
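// Note that the x2q variant below de-interleaves even- and odd-indexed
// elements, matching the layout produced by vld2q_s16() in the
// non-high-bitdepth path.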
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4x2_t v0 = vld2q_s32(buf);
  const int32x4x2_t v1 = vld2q_s32(buf + 8);
  const int16x4_t s0 = vmovn_s32(v0.val[0]);
  const int16x4_t s1 = vmovn_s32(v0.val[1]);
  const int16x4_t s2 = vmovn_s32(v1.val[0]);
  const int16x4_t s3 = vmovn_s32(v1.val[1]);
  int16x8x2_t res;
  res.val[0] = vcombine_s16(s0, s2);
  res.val[1] = vcombine_s16(s1, s3);
  return res;
#else
  return vld2q_s16(buf);
#endif
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
#else
  return vld1q_s16(buf);
#endif
}

static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  return vmovn_s32(v0);
#else
  return vld1_s16(buf);
#endif
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
#else
  vst1q_s16(buf, a);
#endif
}

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) {
  vst1q_s32(buf, a);
}

static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) {
  return vld1q_s32(buf);
}
#endif

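// Illustrative sketch, not part of the original header: the helpers above can
// be paired to copy a row of 8 coefficients through int16 regardless of
// CONFIG_VP9_HIGHBITDEPTH. The narrowing load would truncate values outside
// the int16 range, which transform coefficients do not exceed. The function
// name is hypothetical.
static INLINE void copy_tran_low_8(tran_low_t *dst, const tran_low_t *src) {
  const int16x8_t row = load_tran_low_to_s16q(src);
  store_s16q_to_tran_low(dst, row);
}
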
// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of uint32_t (4 bytes) and add alignment hints
// to the memory access.
//
// This is used for functions operating on uint8_t which wish to load or store
// 4 values at a time but which may not be on 4 byte boundaries.
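// For example, store_unaligned_u8() and store_unaligned_u8q() below use this
// helper to write rows of 4 bytes.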
static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
  memcpy(buf, &a, 4);
}

// Load 4 contiguous bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;
  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 contiguous bytes and replicate across a vector when alignment is not
// guaranteed.
static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  memcpy(&a, buf, 4);
  return vreinterpret_u8_u32(vdup_n_u32(a));
}

// Store 4 contiguous bytes from the low half of an 8x8 vector.
static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) {
  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0);
}

// Store 4 contiguous bytes from the high half of an 8x8 vector.
static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
                                          ptrdiff_t stride) {
  uint32_t a;
  uint32x2_t a_u32 = vdup_n_u32(0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vset_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 8 bytes when alignment is not guaranteed.
static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

// Load 2 sets of 8 bytes when alignment is not guaranteed.
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
                                             ptrdiff_t stride) {
  uint64_t a;
  uint64x2_t a_u64 = vdupq_n_u64(0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

// Store 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
                                      const uint8x8_t a) {
  const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
  uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
                                            ptrdiff_t stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

// Store 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
                                       const uint8x16_t a) {
  const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
}

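// Illustrative sketch, not part of the original header: the unaligned
// q-register helpers above can be combined to copy a 4x4 block of bytes
// between buffers whose rows need not be 4-byte aligned. The function name is
// hypothetical.
static INLINE void copy_unaligned_u8_4x4(uint8_t *dst, ptrdiff_t dst_stride,
                                         const uint8_t *src,
                                         ptrdiff_t src_stride) {
  const uint8x16_t rows = load_unaligned_u8q(src, src_stride);
  store_unaligned_u8q(dst, dst_stride, rows);
}
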
// Load 2 sets of 4 bytes when alignment is guaranteed.
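// These aligned variants use 32-bit lane loads and stores through uint32_t
// pointers, so the compiler may add alignment hints; the asserts document the
// required 4-byte alignment of both the pointer and the stride.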
static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
  uint32x2_t a = vdup_n_u32(0);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
  buf += stride;
  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
  return vreinterpret_u8_u32(a);
}

// Store 2 sets of 4 bytes when alignment is guaranteed.
static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
  uint32x2_t a_u32 = vreinterpret_u32_u8(a);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
  buf += stride;
  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}

static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
}

static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
                                 const uint8x16_t s0, const uint8x16_t s1,
                                 const uint8x16_t s2, const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3,
                                const uint8x8_t s4, const uint8x8_t s5,
                                const uint8x8_t s6, const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
                                 const uint8x16_t s0, const uint8x16_t s1,
                                 const uint8x16_t s2, const uint8x16_t s3,
                                 const uint8x16_t s4, const uint8x16_t s5,
                                 const uint8x16_t s6, const uint8x16_t s7) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
  s += p;
  vst1q_u8(s, s4);
  s += p;
  vst1q_u8(s, s5);
  s += p;
  vst1q_u8(s, s6);
  s += p;
  vst1q_u8(s, s7);
}

static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  s += p;
  vst1_u16(s, s1);
  s += p;
  vst1_u16(s, s2);
}

static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
                                int16x4_t *s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += p;
  vst1_u16(s, s1);
  s += p;
  vst1_u16(s, s2);
  s += p;
  vst1_u16(s, s3);
}

static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
                                int16x4_t *s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
                                int16x8_t *s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
}

static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
}

static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
                                int16x8_t *s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}

static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
                                int16x4_t *s6, int16x4_t *s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
                                int16x8_t *s6, int16x8_t *s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

#endif  // VPX_VPX_DSP_ARM_MEM_NEON_H_