/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_
#define VPX_VPX_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"

// Support for these xN intrinsics is lacking in older versions of GCC.
#if defined(__GNUC__) && !defined(__clang__)
#if __GNUC__ < 8 || defined(__arm__)
static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}
#endif

#if __GNUC__ < 9 || defined(__arm__)
static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif
#endif
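
// Illustrative sketch (hypothetical helper, not part of the upstream API):
// with the fallbacks above in place, 32 contiguous bytes can be loaded with a
// single x2 load on any supported toolchain and stored with plain vst1q_u8.
static INLINE void copy_32_u8_example(uint8_t *dst, const uint8_t *src) {
  const uint8x16x2_t v = vld1q_u8_x2(src);
  vst1q_u8(dst + 0 * 16, v.val[0]);
  vst1q_u8(dst + 1 * 16, v.val[1]);
}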

static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
                                          const int16_t c2, const int16_t c3) {
  return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
                     ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48));
}

static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) {
  return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32));
}

static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1,
                                          const int32_t c2, const int32_t c3) {
  return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3));
}

// Helper functions used to load tran_low_t into int16, narrowing if necessary.
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4x2_t v0 = vld2q_s32(buf);
  const int32x4x2_t v1 = vld2q_s32(buf + 8);
  const int16x4_t s0 = vmovn_s32(v0.val[0]);
  const int16x4_t s1 = vmovn_s32(v0.val[1]);
  const int16x4_t s2 = vmovn_s32(v1.val[0]);
  const int16x4_t s3 = vmovn_s32(v1.val[1]);
  int16x8x2_t res;
  res.val[0] = vcombine_s16(s0, s2);
  res.val[1] = vcombine_s16(s1, s3);
  return res;
#else
  return vld2q_s16(buf);
#endif
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
#else
  return vld1q_s16(buf);
#endif
}

static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  return vmovn_s32(v0);
#else
  return vld1_s16(buf);
#endif
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
#else
  vst1q_s16(buf, a);
#endif
}

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) {
  vst1q_s32(buf, a);
}

static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) {
  return vld1q_s32(buf);
}
#endif
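
// Illustrative sketch (hypothetical helper, not part of the upstream API):
// the tran_low_t helpers above let coefficient code work in int16 lanes
// regardless of CONFIG_VP9_HIGHBITDEPTH, e.g. negating 8 coefficients in
// place with one load/store round trip.
static INLINE void negate_8_coeffs_example(tran_low_t *buf) {
  const int16x8_t a = load_tran_low_to_s16q(buf);
  store_s16q_to_tran_low(buf, vnegq_s16(a));
}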

// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of uint32_t (4 bytes) and add alignment hints
// to the memory access.
//
// This is used by functions operating on uint8_t that load or store 4 values
// at a time but whose pointers may not be on 4-byte boundaries.
static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
  memcpy(buf, &a, 4);
}
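
// Illustrative sketch (hypothetical helper): write the low 32-bit lane of a
// uint8x8_t to a byte pointer with no alignment assumptions, using
// uint32_to_mem() rather than a pointer cast.
static INLINE void store_low_4_bytes_example(uint8_t *buf, uint8x8_t a) {
  uint32_to_mem(buf, vget_lane_u32(vreinterpret_u32_u8(a), 0));
}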

// Load 4 contiguous bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;
  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 contiguous bytes and replicate across a vector when alignment is not
// guaranteed.
static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  memcpy(&a, buf, 4);
  return vreinterpret_u8_u32(vdup_n_u32(a));
}

// Store 4 contiguous bytes from the low half of an 8x8 vector.
static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) {
  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0);
}

// Store 4 contiguous bytes from the high half of an 8x8 vector.
static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
                                          ptrdiff_t stride) {
  uint32_t a;
  uint32x2_t a_u32 = vdup_n_u32(0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vset_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 8 bytes when alignment is not guaranteed.
static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

// Load 2 sets of 8 bytes when alignment is not guaranteed.
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
                                             ptrdiff_t stride) {
  uint64_t a;
  uint64x2_t a_u64 = vdupq_n_u64(0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

// Store 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
                                      const uint8x8_t a) {
  const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
  uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
}
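
// Illustrative sketch (hypothetical helper): copy a 4x2 block between two
// byte buffers whose pointers and strides carry no alignment guarantees,
// pairing load_unaligned_u8() with store_unaligned_u8().
static INLINE void copy_4x2_unaligned_example(uint8_t *dst,
                                              ptrdiff_t dst_stride,
                                              const uint8_t *src,
                                              ptrdiff_t src_stride) {
  const uint8x8_t rows = load_unaligned_u8(src, src_stride);
  store_unaligned_u8(dst, dst_stride, rows);
}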

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
                                            ptrdiff_t stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

// Store 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
                                       const uint8x16_t a) {
  const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
}

// Load 2 sets of 4 bytes when alignment is guaranteed.
static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
  uint32x2_t a = vdup_n_u32(0);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
  buf += stride;
  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
  return vreinterpret_u8_u32(a);
}

// Store 2 sets of 4 bytes when alignment is guaranteed.
static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
  uint32x2_t a_u32 = vreinterpret_u32_u8(a);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
  buf += stride;
  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
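
// Illustrative sketch (hypothetical helper): when both pointers and strides
// are known to be 4-byte aligned, the asserting load_u8()/store_u8() pair
// above can copy a 4x2 block without the memcpy-based unaligned paths.
static INLINE void copy_4x2_aligned_example(uint8_t *dst, ptrdiff_t dst_stride,
                                            const uint8_t *src,
                                            ptrdiff_t src_stride) {
  store_u8(dst, dst_stride, load_u8(src, src_stride));
}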

static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
}

static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
                                 const uint8x16_t s0, const uint8x16_t s1,
                                 const uint8x16_t s2, const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3,
                                const uint8x8_t s4, const uint8x8_t s5,
                                const uint8x8_t s6, const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}
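
// Illustrative sketch (hypothetical helper): the strided multi-row helpers
// above make whole-block operations compact, e.g. copying an 8x8 block of
// bytes between two strided buffers.
static INLINE void copy_8x8_example(uint8_t *dst, ptrdiff_t dst_stride,
                                    const uint8_t *src, ptrdiff_t src_stride) {
  uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
  load_u8_8x8(src, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  store_u8_8x8(dst, dst_stride, r0, r1, r2, r3, r4, r5, r6, r7);
}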

static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
                                 const uint8x16_t s0, const uint8x16_t s1,
                                 const uint8x16_t s2, const uint8x16_t s3,
                                 const uint8x16_t s4, const uint8x16_t s5,
                                 const uint8x16_t s6, const uint8x16_t s7) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
  s += p;
  vst1q_u8(s, s4);
  s += p;
  vst1q_u8(s, s5);
  s += p;
  vst1q_u8(s, s6);
  s += p;
  vst1q_u8(s, s7);
}

static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  s += p;
  vst1_u16(s, s1);
  s += p;
  vst1_u16(s, s2);
}

static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
                                int16x4_t *s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += p;
  vst1_u16(s, s1);
  s += p;
  vst1_u16(s, s2);
  s += p;
  vst1_u16(s, s3);
}

static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
                                int16x4_t *s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
                                int16x8_t *s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
}
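
// Illustrative sketch (hypothetical helper): in high bitdepth paths, 8-bit
// rows are often widened into the uint16x8_t lanes that the u16 helpers
// above store, e.g. writing four widened 8-value rows with store_u16_8x4().
static INLINE void widen_and_store_u16_8x4_example(uint16_t *dst, ptrdiff_t p,
                                                   const uint8x8_t r0,
                                                   const uint8x8_t r1,
                                                   const uint8x8_t r2,
                                                   const uint8x8_t r3) {
  store_u16_8x4(dst, p, vmovl_u8(r0), vmovl_u8(r1), vmovl_u8(r2),
                vmovl_u8(r3));
}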

static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
}

static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
                                int16x8_t *s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}

static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
                                int16x4_t *s6, int16x4_t *s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
                                int16x8_t *s6, int16x8_t *s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

#endif  // VPX_VPX_DSP_ARM_MEM_NEON_H_