xref: /aosp_15_r20/external/libaom/aom_dsp/arm/mem_neon.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AOM_DSP_ARM_MEM_NEON_H_
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
16*77c1e3ccSAndroid Build Coastguard Worker #include <string.h>
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/aom_dsp_common.h"
18*77c1e3ccSAndroid Build Coastguard Worker 
// Support for xN Neon intrinsics is lacking in some compilers. The macros
// below classify the toolchain so that fallbacks can be provided.

// ARM_32_BIT: set when either __arm__ or _M_ARM is predefined.
#if defined(_M_ARM) || defined(__arm__)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT: 32-bit Arm build with Clang (this includes
// clang-cl) at major version 6 or lower, or 7 or lower when __ANDROID__ is
// defined.
#if defined(ARM_32_BIT) && defined(__clang__) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT
#endif

// GCC_32_BIT: 32-bit Arm build where __GNUC__ is defined and __clang__ is
// not.
#if defined(ARM_32_BIT) && defined(__GNUC__) && !defined(__clang__)
#define GCC_32_BIT
#endif
33*77c1e3ccSAndroid Build Coastguard Worker 
34*77c1e3ccSAndroid Build Coastguard Worker #if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
35*77c1e3ccSAndroid Build Coastguard Worker 
// Fallback for the vld1q_u8_x3 intrinsic: issues three vld1q_u8 loads at
// ptr, ptr + 16 and ptr + 32 and returns them as val[0], val[1] and val[2].
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res;
  res.val[0] = vld1q_u8(ptr);
  res.val[1] = vld1q_u8(ptr + 16);
  res.val[2] = vld1q_u8(ptr + 32);
  return res;
}
41*77c1e3ccSAndroid Build Coastguard Worker 
// Fallback for the vld1q_u8_x2 intrinsic: issues two vld1q_u8 loads at ptr
// and ptr + 16 and returns them as val[0] and val[1].
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res;
  res.val[0] = vld1q_u8(ptr);
  res.val[1] = vld1q_u8(ptr + 16);
  return res;
}
46*77c1e3ccSAndroid Build Coastguard Worker 
// Fallback for the vld1q_u16_x2 intrinsic: issues two vld1q_u16 loads at ptr
// and ptr + 8 (offsets counted in uint16_t elements) and returns them as
// val[0] and val[1].
static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res;
  res.val[0] = vld1q_u16(ptr);
  res.val[1] = vld1q_u16(ptr + 8);
  return res;
}
51*77c1e3ccSAndroid Build Coastguard Worker 
// Fallback for the vld1q_u16_x4 intrinsic: issues four vld1q_u16 loads at
// ptr, ptr + 8, ptr + 16 and ptr + 24 (offsets counted in uint16_t elements)
// and returns them as val[0] through val[3].
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res;
  res.val[0] = vld1q_u16(ptr);
  res.val[1] = vld1q_u16(ptr + 8);
  res.val[2] = vld1q_u16(ptr + 16);
  res.val[3] = vld1q_u16(ptr + 24);
  return res;
}
57*77c1e3ccSAndroid Build Coastguard Worker 
58*77c1e3ccSAndroid Build Coastguard Worker #elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8
// Fallback for the vld1q_u8_x2 intrinsic, compiled only when __GNUC__ is
// below 8: issues two vld1q_u8 loads at ptr and ptr + 16 and returns them as
// val[0] and val[1].
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res;
  res.val[0] = vld1q_u8(ptr);
  res.val[1] = vld1q_u8(ptr + 16);
  return res;
}
#endif  // __GNUC__ < 8
65*77c1e3ccSAndroid Build Coastguard Worker 
#if __GNUC__ < 9
// Fallback for the vld1q_u8_x3 intrinsic, compiled only when __GNUC__ is
// below 9: issues three vld1q_u8 loads at ptr, ptr + 16 and ptr + 32 and
// returns them as val[0], val[1] and val[2].
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res;
  res.val[0] = vld1q_u8(ptr);
  res.val[1] = vld1q_u8(ptr + 16);
  res.val[2] = vld1q_u8(ptr + 32);
  return res;
}
#endif  // __GNUC__ < 9
73*77c1e3ccSAndroid Build Coastguard Worker 
// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
// The guard packs the major and minor version into one value (major in bits
// 8 and up, minor below) and compares it against 8.5.
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
// Fallback: issues four vld1q_u16 loads at ptr, ptr + 8, ptr + 16 and
// ptr + 24 (offsets counted in uint16_t elements) and returns them as val[0]
// through val[3].
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res;
  res.val[0] = vld1q_u16(ptr);
  res.val[1] = vld1q_u16(ptr + 8);
  res.val[2] = vld1q_u16(ptr + 16);
  res.val[3] = vld1q_u16(ptr + 24);
  return res;
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
82*77c1e3ccSAndroid Build Coastguard Worker #endif  // defined(__GNUC__) && !defined(__clang__)
83*77c1e3ccSAndroid Build Coastguard Worker 
// Stores two rows of eight bytes.
//   s:  destination of the first row.
//   p:  row stride in bytes; the second row is written at s + p.
//   s0: vector stored at s.
//   s1: vector stored at s + p.
static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  // The pointer is not advanced past the last row: the advanced value was
  // never read, and s + 2 * p need not point into the destination buffer.
  vst1_u8(s + p, s1);
}
91*77c1e3ccSAndroid Build Coastguard Worker 
// Loads eight bytes from s and eight bytes from s + p (p is the row stride in
// bytes) and combines them into one 16-byte vector, with the row at s passed
// as the first (low) argument of vcombine_u8.
static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  const uint8x8_t row0 = vld1_u8(s);
  const uint8x8_t row1 = vld1_u8(s + p);
  return vcombine_u8(row0, row1);
}
95*77c1e3ccSAndroid Build Coastguard Worker 
// Load four bytes into the low half of a uint8x8_t, zero the upper half.
// NOTE(review): p is reinterpreted as a const uint32_t * for the lane load;
// presumably callers must pass a 4-byte-aligned pointer - confirm.
static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
  const uint32x2_t zero = vdup_n_u32(0);
  // Lane 0 receives the four bytes at p; lane 1 keeps its zero value.
  return vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)p, zero, 0));
}
103*77c1e3ccSAndroid Build Coastguard Worker 
// Loads two rows of four bytes into one uint8x8_t: the four bytes at p fill
// 32-bit lane 0 and the four bytes at p + stride fill 32-bit lane 1.
// NOTE(review): both addresses are reinterpreted as const uint32_t *;
// presumably p and stride must keep them 4-byte aligned - confirm.
static inline uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
  uint32x2_t rows = vdup_n_u32(0);
  rows = vld1_lane_u32((const uint32_t *)p, rows, 0);
  rows = vld1_lane_u32((const uint32_t *)(p + stride), rows, 1);
  return vreinterpret_u8_u32(rows);
}
113*77c1e3ccSAndroid Build Coastguard Worker 
// Loads two rows of two uint16_t values into one uint16x4_t: the pair at p
// fills 32-bit lane 0 and the pair at p + stride fills 32-bit lane 1. stride
// is counted in uint16_t elements.
// NOTE(review): both addresses are reinterpreted as const uint32_t *;
// presumably p and stride must keep them 4-byte aligned - confirm.
static inline uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
  uint32x2_t rows = vdup_n_u32(0);
  rows = vld1_lane_u32((const uint32_t *)p, rows, 0);
  rows = vld1_lane_u32((const uint32_t *)(p + stride), rows, 1);
  return vreinterpret_u16_u32(rows);
}
123*77c1e3ccSAndroid Build Coastguard Worker 
// Loads eight rows of eight bytes. Row i is read from s + i * p (p is the row
// stride in bytes) and written to *s<i>, in order from s0 to s7.
static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
  *s5 = vld1_u8(s + 5 * p);
  *s6 = vld1_u8(s + 6 * p);
  *s7 = vld1_u8(s + 7 * p);
}
145*77c1e3ccSAndroid Build Coastguard Worker 
// Loads seven rows of eight bytes. Row i is read from s + i * p (p is the row
// stride in bytes) and written to *s<i>, in order from s0 to s6.
static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
  *s5 = vld1_u8(s + 5 * p);
  *s6 = vld1_u8(s + 6 * p);
}
165*77c1e3ccSAndroid Build Coastguard Worker 
// Loads six rows of eight bytes. Row i is read from s + i * p (p is the row
// stride in bytes) and written to *s<i>, in order from s0 to s5.
static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
  *s5 = vld1_u8(s + 5 * p);
}
182*77c1e3ccSAndroid Build Coastguard Worker 
// Loads four rows of eight bytes. Row i is read from s + i * p (p is the row
// stride in bytes) and written to *s<i>, in order from s0 to s3.
static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
}
194*77c1e3ccSAndroid Build Coastguard Worker 
// Loads three rows of eight bytes. Row i is read from s + i * p (p is the row
// stride in bytes) and written to *s<i>, in order from s0 to s2.
static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
}
204*77c1e3ccSAndroid Build Coastguard Worker 
// Loads four rows of four uint16_t values.
//   s:      address of the first row.
//   p:      row stride in uint16_t elements; row i is read from s + i * p.
//   s0..s3: outputs; *s<i> receives row i, written in order from s0 to s3.
static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  *s1 = vld1_u16(s + p);
  *s2 = vld1_u16(s + 2 * p);
  // The pointer is not advanced past the last row: the advanced value was
  // never read, and s + 4 * p need not point into the source buffer.
  *s3 = vld1_u16(s + 3 * p);
}
217*77c1e3ccSAndroid Build Coastguard Worker 
// Loads seven rows of four uint16_t values. Row i is read from s + i * p (p
// is the row stride in uint16_t elements) and written to *s<i>, in order from
// s0 to s6.
static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  *s1 = vld1_u16(s + p);
  *s2 = vld1_u16(s + 2 * p);
  *s3 = vld1_u16(s + 3 * p);
  *s4 = vld1_u16(s + 4 * p);
  *s5 = vld1_u16(s + 5 * p);
  *s6 = vld1_u16(s + 6 * p);
}
237*77c1e3ccSAndroid Build Coastguard Worker 
// Loads two rows of eight int16_t values: *s0 from s and *s1 from s + p,
// where p is the row stride in int16_t elements.
static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  *s1 = vld1q_s16(s + p);
}
244*77c1e3ccSAndroid Build Coastguard Worker 
// Loads two rows of eight uint16_t values: *s0 from s and *s1 from s + p,
// where p is the row stride in uint16_t elements.
static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + p);
}
251*77c1e3ccSAndroid Build Coastguard Worker 
// Loads three rows of eight uint16_t values. Row i is read from s + i * p (p
// is the row stride in uint16_t elements) and written to *s<i>, in order from
// s0 to s2.
static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + p);
  *s2 = vld1q_u16(s + 2 * p);
}
261*77c1e3ccSAndroid Build Coastguard Worker 
// Loads four rows of eight uint16_t values.
//   s:      address of the first row.
//   p:      row stride in uint16_t elements; row i is read from s + i * p.
//   s0..s3: outputs; *s<i> receives row i, written in order from s0 to s3.
static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + p);
  *s2 = vld1q_u16(s + 2 * p);
  // The pointer is not advanced past the last row: the advanced value was
  // never read, and s + 4 * p need not point into the source buffer.
  *s3 = vld1q_u16(s + 3 * p);
}
274*77c1e3ccSAndroid Build Coastguard Worker 
// Loads twelve rows of four int16_t values. Row i is read from s + i * p (p
// is the row stride in int16_t elements) and written to *s<i>, in order from
// s0 to s11.
static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
  *s4 = vld1_s16(s + 4 * p);
  *s5 = vld1_s16(s + 5 * p);
  *s6 = vld1_s16(s + 6 * p);
  *s7 = vld1_s16(s + 7 * p);
  *s8 = vld1_s16(s + 8 * p);
  *s9 = vld1_s16(s + 9 * p);
  *s10 = vld1_s16(s + 10 * p);
  *s11 = vld1_s16(s + 11 * p);
}
306*77c1e3ccSAndroid Build Coastguard Worker 
// Loads eleven rows of four int16_t values. Row i is read from s + i * p (p
// is the row stride in int16_t elements) and written to *s<i>, in order from
// s0 to s10.
static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
  *s4 = vld1_s16(s + 4 * p);
  *s5 = vld1_s16(s + 5 * p);
  *s6 = vld1_s16(s + 6 * p);
  *s7 = vld1_s16(s + 7 * p);
  *s8 = vld1_s16(s + 8 * p);
  *s9 = vld1_s16(s + 9 * p);
  *s10 = vld1_s16(s + 10 * p);
}
336*77c1e3ccSAndroid Build Coastguard Worker 
// Loads eleven rows of four uint16_t values. Row i is read from s + i * p (p
// is the row stride in uint16_t elements) and written to *s<i>, in order from
// s0 to s10.
static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  *s1 = vld1_u16(s + p);
  *s2 = vld1_u16(s + 2 * p);
  *s3 = vld1_u16(s + 3 * p);
  *s4 = vld1_u16(s + 4 * p);
  *s5 = vld1_u16(s + 5 * p);
  *s6 = vld1_u16(s + 6 * p);
  *s7 = vld1_u16(s + 7 * p);
  *s8 = vld1_u16(s + 8 * p);
  *s9 = vld1_u16(s + 9 * p);
  *s10 = vld1_u16(s + 10 * p);
}
366*77c1e3ccSAndroid Build Coastguard Worker 
// Loads eight rows of four int16_t values. Row i is read from s + i * p (p is
// the row stride in int16_t elements) and written to *s<i>, in order from s0
// to s7.
static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
  *s4 = vld1_s16(s + 4 * p);
  *s5 = vld1_s16(s + 5 * p);
  *s6 = vld1_s16(s + 6 * p);
  *s7 = vld1_s16(s + 7 * p);
}
388*77c1e3ccSAndroid Build Coastguard Worker 
// Reads seven rows of four int16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in int16_t elements, and the rows are written
// to *s0 .. *s6 in order.
static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
  *s4 = vld1_s16(s + 4 * p);
  *s5 = vld1_s16(s + 5 * p);
  *s6 = vld1_s16(s + 6 * p);
}
408*77c1e3ccSAndroid Build Coastguard Worker 
// Reads six rows of four int16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in int16_t elements, and the rows are written
// to *s0 .. *s5 in order.
static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
  *s4 = vld1_s16(s + 4 * p);
  *s5 = vld1_s16(s + 5 * p);
}
425*77c1e3ccSAndroid Build Coastguard Worker 
// Reads five rows of four int16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in int16_t elements, and the rows are written
// to *s0 .. *s4 in order.
static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
  *s4 = vld1_s16(s + 4 * p);
}
440*77c1e3ccSAndroid Build Coastguard Worker 
// Reads five rows of four uint16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in uint16_t elements, and the rows are written
// to *s0 .. *s4 in order.
//
// Every row address is derived from the base pointer, so no pointer beyond
// the last row read (s + 4 * p) is ever formed.
static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  *s1 = vld1_u16(s + p);
  *s2 = vld1_u16(s + 2 * p);
  *s3 = vld1_u16(s + 3 * p);
  *s4 = vld1_u16(s + 4 * p);
}
456*77c1e3ccSAndroid Build Coastguard Worker 
// Reads five rows of eight bytes. Row k is fetched from s + k * p, where p is
// a stride in bytes, and the rows are written to *s0 .. *s4 in order.
static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
}
471*77c1e3ccSAndroid Build Coastguard Worker 
// Reads five rows of eight uint16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in uint16_t elements, and the rows are written
// to *s0 .. *s4 in order.
//
// Every row address is derived from the base pointer, so no pointer beyond
// the last row read (s + 4 * p) is ever formed.
static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + p);
  *s2 = vld1q_u16(s + 2 * p);
  *s3 = vld1q_u16(s + 3 * p);
  *s4 = vld1q_u16(s + 4 * p);
}
487*77c1e3ccSAndroid Build Coastguard Worker 
// Reads four rows of four int16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in int16_t elements, and the rows are written
// to *s0 .. *s3 in order.
static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
}
499*77c1e3ccSAndroid Build Coastguard Worker 
// Reads three rows of four int16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in int16_t elements, and the rows are written
// to *s0 .. *s2 in order.
static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
}
509*77c1e3ccSAndroid Build Coastguard Worker 
// Writes eight rows of eight bytes. Vector sk is stored at s + k * p, where p
// is a stride in bytes; rows are written in order s0 .. s7.
static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  vst1_u8(s + p, s1);
  vst1_u8(s + 2 * p, s2);
  vst1_u8(s + 3 * p, s3);
  vst1_u8(s + 4 * p, s4);
  vst1_u8(s + 5 * p, s5);
  vst1_u8(s + 6 * p, s6);
  vst1_u8(s + 7 * p, s7);
}
531*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four rows of eight bytes. Vector sk is stored at s + k * p, where p
// is a stride in bytes; rows are written in order s0 .. s3.
static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  vst1_u8(s + p, s1);
  vst1_u8(s + 2 * p, s2);
  vst1_u8(s + 3 * p, s3);
}
543*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four rows of sixteen bytes. Vector sk is stored at s + k * p, where
// p is a stride in bytes; rows are written in order s0 .. s3.
static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  vst1q_u8(s + p, s1);
  vst1q_u8(s + 2 * p, s2);
  vst1q_u8(s + 3 * p, s3);
}
555*77c1e3ccSAndroid Build Coastguard Worker 
// Writes eight rows of eight uint16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in uint16_t elements; rows
// are written in order s0 .. s7.
static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  vst1q_u16(s + dst_stride, s1);
  vst1q_u16(s + 2 * dst_stride, s2);
  vst1q_u16(s + 3 * dst_stride, s3);
  vst1q_u16(s + 4 * dst_stride, s4);
  vst1q_u16(s + 5 * dst_stride, s5);
  vst1q_u16(s + 6 * dst_stride, s6);
  vst1q_u16(s + 7 * dst_stride, s7);
}
577*77c1e3ccSAndroid Build Coastguard Worker 
// Writes three rows of four uint16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in uint16_t elements; rows
// are written in order s0 .. s2.
static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  vst1_u16(s + dst_stride, s1);
  vst1_u16(s + 2 * dst_stride, s2);
}
587*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four rows of four uint16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in uint16_t elements; rows
// are written in order s0 .. s3.
static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  vst1_u16(s + dst_stride, s1);
  vst1_u16(s + 2 * dst_stride, s2);
  vst1_u16(s + 3 * dst_stride, s3);
}
599*77c1e3ccSAndroid Build Coastguard Worker 
// Writes two rows of eight uint16_t lanes: s0 at s, then s1 at
// s + dst_stride, where dst_stride is counted in uint16_t elements.
static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  vst1q_u16(s + dst_stride, s1);
}
606*77c1e3ccSAndroid Build Coastguard Worker 
// Writes three rows of eight uint16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in uint16_t elements; rows
// are written in order s0 .. s2.
static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  vst1q_u16(s + dst_stride, s1);
  vst1q_u16(s + 2 * dst_stride, s2);
}
616*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four rows of eight uint16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in uint16_t elements; rows
// are written in order s0 .. s3.
static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  vst1q_u16(s + dst_stride, s1);
  vst1q_u16(s + 2 * dst_stride, s2);
  vst1q_u16(s + 3 * dst_stride, s3);
}
628*77c1e3ccSAndroid Build Coastguard Worker 
// Writes eight rows of eight int16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in int16_t elements; rows
// are written in order s0 .. s7.
static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  vst1q_s16(s + dst_stride, s1);
  vst1q_s16(s + 2 * dst_stride, s2);
  vst1q_s16(s + 3 * dst_stride, s3);
  vst1q_s16(s + 4 * dst_stride, s4);
  vst1q_s16(s + 5 * dst_stride, s5);
  vst1q_s16(s + 6 * dst_stride, s6);
  vst1q_s16(s + 7 * dst_stride, s7);
}
650*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four rows of four int16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in int16_t elements; rows
// are written in order s0 .. s3.
static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  vst1_s16(s + dst_stride, s1);
  vst1_s16(s + 2 * dst_stride, s2);
  vst1_s16(s + 3 * dst_stride, s3);
}
662*77c1e3ccSAndroid Build Coastguard Worker 
// Writes eight rows of four int16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in int16_t elements; rows
// are written in order s0 .. s7.
static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3,
                                 const int16x4_t s4, const int16x4_t s5,
                                 const int16x4_t s6, const int16x4_t s7) {
  vst1_s16(s, s0);
  vst1_s16(s + dst_stride, s1);
  vst1_s16(s + 2 * dst_stride, s2);
  vst1_s16(s + 3 * dst_stride, s3);
  vst1_s16(s + 4 * dst_stride, s4);
  vst1_s16(s + 5 * dst_stride, s5);
  vst1_s16(s + 6 * dst_stride, s6);
  vst1_s16(s + 7 * dst_stride, s7);
}
684*77c1e3ccSAndroid Build Coastguard Worker 
// Writes four rows of eight int16_t lanes. Vector sk is stored at
// s + k * dst_stride, where dst_stride is counted in int16_t elements; rows
// are written in order s0 .. s3.
static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  vst1q_s16(s + dst_stride, s1);
  vst1q_s16(s + 2 * dst_stride, s2);
  vst1q_s16(s + 3 * dst_stride, s3);
}
696*77c1e3ccSAndroid Build Coastguard Worker 
// Writes two rows of eight int16_t lanes: s0 at s, then s1 at
// s + dst_stride, where dst_stride is counted in int16_t elements.
static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1) {
  vst1q_s16(s, s0);
  vst1q_s16(s + dst_stride, s1);
}
703*77c1e3ccSAndroid Build Coastguard Worker 
// Reads eleven rows of eight bytes. Row k is fetched from s + k * p, where p
// is a stride in bytes, and the rows are written to *s0 .. *s10 in order.
static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
  *s5 = vld1_u8(s + 5 * p);
  *s6 = vld1_u8(s + 6 * p);
  *s7 = vld1_u8(s + 7 * p);
  *s8 = vld1_u8(s + 8 * p);
  *s9 = vld1_u8(s + 9 * p);
  *s10 = vld1_u8(s + 10 * p);
}
733*77c1e3ccSAndroid Build Coastguard Worker 
// Reads ten rows of eight int16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in int16_t elements, and the rows are written
// to *s0 .. *s9 in order.
static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  *s1 = vld1q_s16(s + p);
  *s2 = vld1q_s16(s + 2 * p);
  *s3 = vld1q_s16(s + 3 * p);
  *s4 = vld1q_s16(s + 4 * p);
  *s5 = vld1q_s16(s + 5 * p);
  *s6 = vld1q_s16(s + 6 * p);
  *s7 = vld1q_s16(s + 7 * p);
  *s8 = vld1q_s16(s + 8 * p);
  *s9 = vld1q_s16(s + 9 * p);
}
760*77c1e3ccSAndroid Build Coastguard Worker 
// Reads eleven rows of eight int16_t lanes. Row k is fetched from s + k * p,
// where p is a stride counted in int16_t elements, and the rows are written
// to *s0 .. *s10 in order.
static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  *s1 = vld1q_s16(s + p);
  *s2 = vld1q_s16(s + 2 * p);
  *s3 = vld1q_s16(s + 3 * p);
  *s4 = vld1q_s16(s + 4 * p);
  *s5 = vld1q_s16(s + 5 * p);
  *s6 = vld1q_s16(s + 6 * p);
  *s7 = vld1q_s16(s + 7 * p);
  *s8 = vld1q_s16(s + 8 * p);
  *s9 = vld1q_s16(s + 9 * p);
  *s10 = vld1q_s16(s + 10 * p);
}
790*77c1e3ccSAndroid Build Coastguard Worker 
// Load 12 rows of eight int16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}
822*77c1e3ccSAndroid Build Coastguard Worker 
// Load 11 rows of eight uint16_t values. Row k is read from s + k * p (p is
// the row stride in elements) and stored to *sk.
static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}
852*77c1e3ccSAndroid Build Coastguard Worker 
// Load 8 rows of eight int16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}
874*77c1e3ccSAndroid Build Coastguard Worker 
// Load 7 rows of eight uint16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}
894*77c1e3ccSAndroid Build Coastguard Worker 
// Load 7 rows of eight int16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}
914*77c1e3ccSAndroid Build Coastguard Worker 
// Load 6 rows of eight int16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}
931*77c1e3ccSAndroid Build Coastguard Worker 
// Load 5 rows of eight int16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}
946*77c1e3ccSAndroid Build Coastguard Worker 
// Load 4 rows of eight int16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}
958*77c1e3ccSAndroid Build Coastguard Worker 
// Load 3 rows of eight int16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}
968*77c1e3ccSAndroid Build Coastguard Worker 
// Load 4 bytes from p into 32-bit lane `lane` of the vector v (uint32x2_t for
// the 2x1 variant, uint32x4_t for the 4x1 variant); the other lanes of v are
// kept. `lane` must be a compile-time constant, as the lane intrinsics require.
// When AOM_ARCH_AARCH64 is set the load goes straight through a uint32_t
// pointer cast; otherwise the bytes are staged through memcpy.
#if AOM_ARCH_AARCH64
#define load_unaligned_u32_2x1_lane(v, p, lane)              \
  do {                                                       \
    (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)               \
  do {                                                        \
    (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)
#else
// The staging variable has a deliberately unusual name so that it cannot
// shadow an identifier appearing in the caller's v or p argument.
#define load_unaligned_u32_2x1_lane(v, p, lane)                   \
  do {                                                            \
    uint32_t load_unaligned_u32_tmp_;                             \
    memcpy(&load_unaligned_u32_tmp_, (p), 4);                     \
    (v) = vset_lane_u32(load_unaligned_u32_tmp_, (v), (lane));    \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)                   \
  do {                                                            \
    uint32_t load_unaligned_u32_tmp_;                             \
    memcpy(&load_unaligned_u32_tmp_, (p), 4);                     \
    (v) = vsetq_lane_u32(load_unaligned_u32_tmp_, (v), (lane));   \
  } while (0)
#endif
994*77c1e3ccSAndroid Build Coastguard Worker 
// Load 2 sets of 4 bytes when alignment is not guaranteed.
// The first 4 bytes come from buf and fill the low 32-bit lane; the next 4
// come from buf + stride and fill the high lane. memcpy is used for each read
// so no aligned access is required.
static inline uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}
1005*77c1e3ccSAndroid Build Coastguard Worker 
// Load 4 sets of 4 bytes when alignment is not guaranteed.
// Row k (4 bytes at buf + k * stride) fills 32-bit lane k of the result. When
// stride == 4 the rows are contiguous, so a single 16-byte load is used.
static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}
1024*77c1e3ccSAndroid Build Coastguard Worker 
// Load 2 rows of 2 bytes when alignment is not guaranteed. The first row (at
// buf) lands in 16-bit lane 0 and the second (at buf + stride) in lane 1.
// Lanes 2 and 3 keep the copy of the first row left there by the initial dup.
static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}
1036*77c1e3ccSAndroid Build Coastguard Worker 
// Load 4 bytes from buf (no alignment required) into the low 32-bit lane of
// the result; the high lane is zero.
static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}
1046*77c1e3ccSAndroid Build Coastguard Worker 
// Load 4 bytes from buf (no alignment required) and duplicate them into both
// 32-bit lanes of the result.
static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}
1055*77c1e3ccSAndroid Build Coastguard Worker 
// Load 2 bytes from buf (no alignment required) and duplicate them into all
// four 16-bit lanes of the result.
static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  a_u16 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u16);
}
1064*77c1e3ccSAndroid Build Coastguard Worker 
// Load 2 rows of 4 bytes when alignment is not guaranteed. The row at buf
// fills the low 32-bit lane and the row at buf + stride fills the high lane.
static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}
1076*77c1e3ccSAndroid Build Coastguard Worker 
// Load 4 rows of 4 bytes when alignment is not guaranteed. Rows 0-1 are packed
// into *tu0 and rows 2-3 into *tu1, each via load_unaligned_u8_4x2.
static inline void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}
1083*77c1e3ccSAndroid Build Coastguard Worker 
// Load rows of 4 bytes into three vectors when alignment is not guaranteed.
// As written, this reads six rows: rows 0-3 go to *tu0 and *tu1 via
// load_unaligned_u8_4x4, and rows 4-5 go to *tu2 via load_unaligned_u8_4x2.
// NOTE(review): the "3x8" in the name appears to count the three 8-byte output
// vectors rather than a 3-wide block - confirm against callers.
static inline void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}
1091*77c1e3ccSAndroid Build Coastguard Worker 
// Load 8 rows of 4 bytes when alignment is not guaranteed. Each output vector
// holds two consecutive rows: rows 0-3 go to *tu0 and *tu1, rows 4-7 to *tu2
// and *tu3, using two load_unaligned_u8_4x4 calls.
static inline void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}
1099*77c1e3ccSAndroid Build Coastguard Worker 
// Load 8 rows of sixteen uint8_t values. Row k is read from s + k * p (p is
// the row stride in bytes) and stored to *sk.
static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}
1121*77c1e3ccSAndroid Build Coastguard Worker 
// Load 5 rows of sixteen uint8_t values. Row k is read from s + k * p (p is
// the row stride in bytes) and stored to *sk.
static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
}
1136*77c1e3ccSAndroid Build Coastguard Worker 
// Load 4 rows of sixteen uint8_t values. Row k is read from s + k * p (p is
// the row stride in bytes) and stored to *sk.
static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}
1148*77c1e3ccSAndroid Build Coastguard Worker 
// Load 3 rows of sixteen uint8_t values. Row k is read from s + k * p (p is
// the row stride in bytes) and stored to *sk.
static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
}
1158*77c1e3ccSAndroid Build Coastguard Worker 
// Load 8 rows of eight uint16_t values. Row k is read from s + k * p (p is the
// row stride in elements) and stored to *sk.
static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}
1179*77c1e3ccSAndroid Build Coastguard Worker 
// Load four rows of sixteen uint16_t values each. Every row is split into two
// 8-lane vectors: the first eight elements go to the even-numbered output
// (s0, s2, s4, s6) and the next eight to the odd-numbered one (s1, s3, s5, s7).
// Consecutive rows are `p` uint16_t elements apart.
static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7) {
  const uint16_t *const row1 = s + p;
  const uint16_t *const row2 = row1 + p;
  const uint16_t *const row3 = row2 + p;

  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + 8);

  *s2 = vld1q_u16(row1);
  *s3 = vld1q_u16(row1 + 8);

  *s4 = vld1q_u16(row2);
  *s5 = vld1q_u16(row2 + 8);

  *s6 = vld1q_u16(row3);
  *s7 = vld1q_u16(row3 + 8);
}
1197*77c1e3ccSAndroid Build Coastguard Worker 
// Load two rows of two uint16_t values each into a single 4-lane vector.
// Each row is fetched as 4 bytes with memcpy, so `buf` does not need 32-bit
// alignment. The row at `buf` fills the low 32 bits and the row at
// `buf + stride` fills the high 32 bits.
static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
                                                int stride) {
  uint32_t row0;
  uint32_t row1;

  memcpy(&row0, buf, sizeof(row0));
  memcpy(&row1, buf + stride, sizeof(row1));

  return vreinterpret_u16_u32(vset_lane_u32(row1, vdup_n_u32(row0), 1));
}
1210*77c1e3ccSAndroid Build Coastguard Worker 
// Load four consecutive uint16_t values (8 bytes) from `buf` into a 4-lane
// vector. The bytes are fetched with memcpy, so `buf` does not need 64-bit
// alignment.
static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
  uint64_t bits;
  memcpy(&bits, buf, sizeof(bits));
  return vreinterpret_u16_u64(vset_lane_u64(bits, vdup_n_u64(0), 0));
}
1218*77c1e3ccSAndroid Build Coastguard Worker 
// Load two rows of four uint16_t values each into a single 8-lane vector.
// Each row is fetched as 8 bytes with memcpy, so `buf` does not need 64-bit
// alignment. The row at `buf` fills the low half of the result and the row at
// `buf + stride` fills the high half.
//
// Only the two addresses that are actually read are formed: the pointer is
// not advanced past the second row, so no out-of-bounds pointer is computed
// when the second row is the last row of the buffer.
static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                uint32_t stride) {
  uint64_t row0;
  uint64_t row1;
  uint64x2_t rows;

  memcpy(&row0, buf, sizeof(row0));
  memcpy(&row1, buf + stride, sizeof(row1));

  rows = vdupq_n_u64(0);
  rows = vsetq_lane_u64(row0, rows, 0);
  rows = vsetq_lane_u64(row1, rows, 1);
  return vreinterpretq_u16_u64(rows);
}
1233*77c1e3ccSAndroid Build Coastguard Worker 
// Load two rows of four int16_t values each into a single 8-lane vector.
// Each row is fetched as 8 bytes with memcpy, so `buf` does not need 64-bit
// alignment. The row at `buf` fills the low half of the result and the row at
// `buf + stride` fills the high half.
//
// Only the two addresses that are actually read are formed: the pointer is
// not advanced past the second row, so no out-of-bounds pointer is computed
// when the second row is the last row of the buffer.
static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf,
                                               uint32_t stride) {
  int64_t row0;
  int64_t row1;
  int64x2_t rows;

  memcpy(&row0, buf, sizeof(row0));
  memcpy(&row1, buf + stride, sizeof(row1));

  rows = vdupq_n_s64(0);
  rows = vsetq_lane_s64(row0, rows, 0);
  rows = vsetq_lane_s64(row1, rows, 1);
  return vreinterpretq_s16_s64(rows);
}
1247*77c1e3ccSAndroid Build Coastguard Worker 
// Load four rows of four uint16_t values each as two 8-lane vectors: rows 0-1
// go to `*tu0` and rows 2-3 go to `*tu1`. Each pair is fetched with
// load_unaligned_u16_4x2().
static inline void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint16x8_t *tu0, uint16x8_t *tu1) {
  const uint16_t *const lower_rows = buf + 2 * stride;

  *tu0 = load_unaligned_u16_4x2(buf, stride);
  *tu1 = load_unaligned_u16_4x2(lower_rows, stride);
}
1254*77c1e3ccSAndroid Build Coastguard Worker 
// Load four rows of four int32_t values each. Consecutive rows are `p`
// int32_t elements apart; row k is written to the k-th output pointer
// (s1..s4). The source is only read, so it is taken as a pointer to const,
// matching the other load_* helpers.
static inline void load_s32_4x4(const int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}
1265*77c1e3ccSAndroid Build Coastguard Worker 
// Store four int32x4_t vectors as four rows starting at `s`, with consecutive
// rows `p` int32_t elements apart.
static inline void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  int32_t *const row1 = s + p;
  int32_t *const row2 = row1 + p;
  int32_t *const row3 = row2 + p;

  vst1q_s32(s, s1);
  vst1q_s32(row1, s2);
  vst1q_s32(row2, s3);
  vst1q_s32(row3, s4);
}
1276*77c1e3ccSAndroid Build Coastguard Worker 
// Load four rows of four uint32_t values each. Consecutive rows are `p`
// uint32_t elements apart; row k is written to the k-th output pointer
// (s1..s4). The source is only read, so it is taken as a pointer to const,
// matching the other load_* helpers.
static inline void load_u32_4x4(const uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}
1288*77c1e3ccSAndroid Build Coastguard Worker 
// Store four uint32x4_t vectors as four rows starting at `s`, with consecutive
// rows `p` uint32_t elements apart.
static inline void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  uint32_t *const row1 = s + p;
  uint32_t *const row2 = row1 + p;
  uint32_t *const row3 = row2 + p;

  vst1q_u32(s, s1);
  vst1q_u32(row1, s2);
  vst1q_u32(row2, s3);
  vst1q_u32(row3, s4);
}
1299*77c1e3ccSAndroid Build Coastguard Worker 
// Load eight coefficients from `buf` and narrow each one to 16 bits, keeping
// only the low half of every 32-bit value (vmovn does not saturate).
// NOTE(review): `buf` is handed straight to vld1q_s32, so this presumes
// tran_low_t is a 32-bit signed integer type - confirm against its typedef.
static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int16x4_t narrow_lo = vmovn_s32(vld1q_s32(buf));
  const int16x4_t narrow_hi = vmovn_s32(vld1q_s32(buf + 4));
  return vcombine_s16(narrow_lo, narrow_hi);
}
1307*77c1e3ccSAndroid Build Coastguard Worker 
// Sign-extend the eight 16-bit lanes of `a` to 32 bits and store them as eight
// consecutive coefficients starting at `buf`.
// NOTE(review): `buf` is handed straight to vst1q_s32, so this presumes
// tran_low_t is a 32-bit signed integer type - confirm against its typedef.
static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  vst1q_s32(buf, vmovl_s16(vget_low_s16(a)));
  vst1q_s32(buf + 4, vmovl_s16(vget_high_s16(a)));
}
1314*77c1e3ccSAndroid Build Coastguard Worker 
// Sign-extend the four 16-bit lanes of `a` to 32 bits and store them as four
// consecutive coefficients starting at `buf`.
// NOTE(review): `buf` is handed straight to vst1q_s32, so this presumes
// tran_low_t is a 32-bit signed integer type - confirm against its typedef.
static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
  vst1q_s32(buf, vmovl_s16(a));
}
1319*77c1e3ccSAndroid Build Coastguard Worker 
// Gather eight bytes from `src`: lane k of the result is `src[indices[k]]`,
// where the indices are signed 16-bit element offsets.
static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                              int16x8_t indices) {
  const int16x4_t idx_lo = vget_low_s16(indices);
  const int16x4_t idx_hi = vget_high_s16(indices);

  // Recent Clang and GCC versions correctly identify that this zero-broadcast
  // is redundant. Alternatively we could load and broadcast the zeroth element
  // and then replace the other lanes, however this is slower than loading a
  // single element without broadcast on some micro-architectures.
  uint8x8_t gathered = vdup_n_u8(0);

  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 0), gathered, 0);
  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 1), gathered, 1);
  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 2), gathered, 2);
  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 3), gathered, 3);
  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 0), gathered, 4);
  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 1), gathered, 5);
  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 2), gathered, 6);
  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 3), gathered, 7);
  return gathered;
}
1337*77c1e3ccSAndroid Build Coastguard Worker 
// The `lane` parameter here must be an immediate.
// Write 16-bit lane `lane` of the uint8x8_t `src` (i.e. two adjacent bytes) to
// `dst`. The copy goes through memcpy, so `dst` needs no particular alignment.
#define store_u8_2x1_lane(dst, src, lane)                               \
  do {                                                                  \
    uint16_t lane_bits = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &lane_bits, 2);                                         \
  } while (0)
1344*77c1e3ccSAndroid Build Coastguard Worker 
// Write 32-bit lane `lane` (an immediate) of the uint8x8_t `src`, i.e. four
// adjacent bytes, to `dst`. The copy goes through memcpy, so `dst` needs no
// particular alignment.
#define store_u8_4x1_lane(dst, src, lane)                               \
  do {                                                                  \
    uint32_t lane_bits = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &lane_bits, 4);                                         \
  } while (0)
1350*77c1e3ccSAndroid Build Coastguard Worker 
// Write 32-bit lane `lane` (an immediate) of the uint16x4_t `src`, i.e. two
// adjacent uint16_t values, to `dst`. The copy goes through memcpy, so `dst`
// needs no particular alignment.
#define store_u16_2x1_lane(dst, src, lane)                               \
  do {                                                                   \
    uint32_t lane_bits = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
    memcpy(dst, &lane_bits, 4);                                          \
  } while (0)
1356*77c1e3ccSAndroid Build Coastguard Worker 
// Write 64-bit lane `lane` (an immediate) of the uint16x8_t `src`, i.e. four
// adjacent uint16_t values, to `dst`. The copy goes through memcpy, so `dst`
// needs no particular alignment.
#define store_u16_4x1_lane(dst, src, lane)                                 \
  do {                                                                     \
    uint64_t lane_bits = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
    memcpy(dst, &lane_bits, 8);                                            \
  } while (0)
1362*77c1e3ccSAndroid Build Coastguard Worker 
// Write 64-bit lane `lane` (an immediate) of the int16x8_t `src`, i.e. four
// adjacent int16_t values, to `dst`. The copy goes through memcpy, so `dst`
// needs no particular alignment.
#define store_s16_4x1_lane(dst, src, lane)                                \
  do {                                                                    \
    int64_t lane_bits = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
    memcpy(dst, &lane_bits, 8);                                           \
  } while (0)
1368*77c1e3ccSAndroid Build Coastguard Worker 
// Store the low 16 bits (the first two bytes) of `src` to `dst`. The write
// goes through memcpy, so `dst` needs no particular alignment.
static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  const uint16_t low_bits = vget_lane_u16(vreinterpret_u16_u8(src), 0);
  memcpy(dst, &low_bits, 2);
}
1373*77c1e3ccSAndroid Build Coastguard Worker 
// Store the low 32 bits (the first four bytes) of `src` to `dst`. The write
// goes through memcpy, so `dst` needs no particular alignment.
static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  const uint32_t low_bits = vget_lane_u32(vreinterpret_u32_u8(src), 0);
  memcpy(dst, &low_bits, 4);
}
1378*77c1e3ccSAndroid Build Coastguard Worker 
// Store two 16-bit blocks (two bytes each) from a single vector: bytes 0-1 of
// `src` go to `dst` and bytes 2-3 go to `dst + dst_stride`.
static inline void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  uint8_t *const row1 = dst + dst_stride;

  store_u8_2x1_lane(dst, src, 0);
  store_u8_2x1_lane(row1, src, 1);
}
1386*77c1e3ccSAndroid Build Coastguard Worker 
// Store four 16-bit blocks (two bytes each) from a single vector: byte pair k
// of `src` is written to row k, where consecutive rows are `dst_stride` bytes
// apart.
static inline void store_u8x2_strided_x4(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  uint8_t *const row1 = dst + dst_stride;
  uint8_t *const row2 = row1 + dst_stride;
  uint8_t *const row3 = row2 + dst_stride;

  store_u8_2x1_lane(dst, src, 0);
  store_u8_2x1_lane(row1, src, 1);
  store_u8_2x1_lane(row2, src, 2);
  store_u8_2x1_lane(row3, src, 3);
}
1397*77c1e3ccSAndroid Build Coastguard Worker 
// Store two 32-bit blocks (four bytes each) from a single vector: bytes 0-3 of
// `src` go to `dst` and bytes 4-7 go to `dst + stride`.
static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  uint8_t *const row1 = dst + stride;

  store_u8_4x1_lane(dst, src, 0);
  store_u8_4x1_lane(row1, src, 1);
}
1405*77c1e3ccSAndroid Build Coastguard Worker 
// Store four 32-bit blocks (four bytes each) from a single 16-byte vector:
// 4-byte group k of `src` is written to row k, where consecutive rows are
// `stride` bytes apart.
static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  const uint8x8_t src_lo = vget_low_u8(src);
  const uint8x8_t src_hi = vget_high_u8(src);
  uint8_t *const row1 = dst + stride;
  uint8_t *const row2 = row1 + stride;
  uint8_t *const row3 = row2 + stride;

  store_u8_4x1_lane(dst, src_lo, 0);
  store_u8_4x1_lane(row1, src_lo, 1);
  store_u8_4x1_lane(row2, src_hi, 0);
  store_u8_4x1_lane(row3, src_hi, 1);
}
1417*77c1e3ccSAndroid Build Coastguard Worker 
// Store the low 32 bits (the first two uint16_t lanes) of `src` to `dst`. The
// write goes through memcpy, so `dst` needs no 32-bit alignment.
static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  const uint32_t low_bits = vget_lane_u32(vreinterpret_u32_u16(src), 0);
  memcpy(dst, &low_bits, 4);
}
1422*77c1e3ccSAndroid Build Coastguard Worker 
// Store two 32-bit blocks (two uint16_t values each) from a single vector:
// lanes 0-1 of `src` go to `dst` and lanes 2-3 go to `dst + dst_stride`, with
// the stride counted in uint16_t elements.
static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x4_t src) {
  uint16_t *const row1 = dst + dst_stride;

  store_u16_2x1_lane(dst, src, 0);
  store_u16_2x1_lane(row1, src, 1);
}
1430*77c1e3ccSAndroid Build Coastguard Worker 
// Store two 64-bit blocks (four uint16_t values each) from a single vector:
// lanes 0-3 of `src` go to `dst` and lanes 4-7 go to `dst + dst_stride`, with
// the stride counted in uint16_t elements.
static inline void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x8_t src) {
  uint16_t *const row1 = dst + dst_stride;

  store_u16_4x1_lane(dst, src, 0);
  store_u16_4x1_lane(row1, src, 1);
}
1438*77c1e3ccSAndroid Build Coastguard Worker 
// Store two 64-bit blocks (four int16_t values each) from a single vector:
// lanes 0-3 of `src` go to `dst` and lanes 4-7 go to `dst + dst_stride`, with
// the (signed) stride counted in int16_t elements.
static inline void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
                                          int16x8_t src) {
  int16_t *const row1 = dst + dst_stride;

  store_s16_4x1_lane(dst, src, 0);
  store_s16_4x1_lane(row1, src, 1);
}
1446*77c1e3ccSAndroid Build Coastguard Worker 
1447*77c1e3ccSAndroid Build Coastguard Worker #undef store_u8_2x1_lane
1448*77c1e3ccSAndroid Build Coastguard Worker #undef store_u8_4x1_lane
1449*77c1e3ccSAndroid Build Coastguard Worker #undef store_u16_2x1_lane
1450*77c1e3ccSAndroid Build Coastguard Worker #undef store_u16_4x1_lane
1451*77c1e3ccSAndroid Build Coastguard Worker #undef store_s16_4x1_lane
1452*77c1e3ccSAndroid Build Coastguard Worker 
1453*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
1454