1*fb1b10abSAndroid Build Coastguard Worker /*
2*fb1b10abSAndroid Build Coastguard Worker * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker *
4*fb1b10abSAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker * that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker * tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker * in the file PATENTS. All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker * be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_ARM_MEM_NEON_H_
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include <arm_neon.h>
15*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
16*fb1b10abSAndroid Build Coastguard Worker #include <string.h>
17*fb1b10abSAndroid Build Coastguard Worker
18*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
19*fb1b10abSAndroid Build Coastguard Worker #include "vpx/vpx_integer.h"
20*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/vpx_dsp_common.h"
21*fb1b10abSAndroid Build Coastguard Worker
22*fb1b10abSAndroid Build Coastguard Worker // Support for these xN intrinsics is lacking in older versions of GCC.
23*fb1b10abSAndroid Build Coastguard Worker #if defined(__GNUC__) && !defined(__clang__)
24*fb1b10abSAndroid Build Coastguard Worker #if __GNUC__ < 8 || defined(__arm__)
vld1q_u8_x2(uint8_t const * ptr)25*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
26*fb1b10abSAndroid Build Coastguard Worker uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
27*fb1b10abSAndroid Build Coastguard Worker return res;
28*fb1b10abSAndroid Build Coastguard Worker }
29*fb1b10abSAndroid Build Coastguard Worker #endif
30*fb1b10abSAndroid Build Coastguard Worker
31*fb1b10abSAndroid Build Coastguard Worker #if __GNUC__ < 9 || defined(__arm__)
vld1q_u8_x3(uint8_t const * ptr)32*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
33*fb1b10abSAndroid Build Coastguard Worker uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
34*fb1b10abSAndroid Build Coastguard Worker vld1q_u8(ptr + 2 * 16) } };
35*fb1b10abSAndroid Build Coastguard Worker return res;
36*fb1b10abSAndroid Build Coastguard Worker }
37*fb1b10abSAndroid Build Coastguard Worker #endif
38*fb1b10abSAndroid Build Coastguard Worker #endif
39*fb1b10abSAndroid Build Coastguard Worker
create_s16x4_neon(const int16_t c0,const int16_t c1,const int16_t c2,const int16_t c3)40*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
41*fb1b10abSAndroid Build Coastguard Worker const int16_t c2, const int16_t c3) {
42*fb1b10abSAndroid Build Coastguard Worker return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
43*fb1b10abSAndroid Build Coastguard Worker ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48));
44*fb1b10abSAndroid Build Coastguard Worker }
45*fb1b10abSAndroid Build Coastguard Worker
create_s32x2_neon(const int32_t c0,const int32_t c1)46*fb1b10abSAndroid Build Coastguard Worker static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) {
47*fb1b10abSAndroid Build Coastguard Worker return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32));
48*fb1b10abSAndroid Build Coastguard Worker }
49*fb1b10abSAndroid Build Coastguard Worker
create_s32x4_neon(const int32_t c0,const int32_t c1,const int32_t c2,const int32_t c3)50*fb1b10abSAndroid Build Coastguard Worker static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1,
51*fb1b10abSAndroid Build Coastguard Worker const int32_t c2, const int32_t c3) {
52*fb1b10abSAndroid Build Coastguard Worker return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3));
53*fb1b10abSAndroid Build Coastguard Worker }
54*fb1b10abSAndroid Build Coastguard Worker
55*fb1b10abSAndroid Build Coastguard Worker // Helper functions used to load tran_low_t into int16, narrowing if necessary.
load_tran_low_to_s16x2q(const tran_low_t * buf)56*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
57*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
58*fb1b10abSAndroid Build Coastguard Worker const int32x4x2_t v0 = vld2q_s32(buf);
59*fb1b10abSAndroid Build Coastguard Worker const int32x4x2_t v1 = vld2q_s32(buf + 8);
60*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s0 = vmovn_s32(v0.val[0]);
61*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s1 = vmovn_s32(v0.val[1]);
62*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s2 = vmovn_s32(v1.val[0]);
63*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s3 = vmovn_s32(v1.val[1]);
64*fb1b10abSAndroid Build Coastguard Worker int16x8x2_t res;
65*fb1b10abSAndroid Build Coastguard Worker res.val[0] = vcombine_s16(s0, s2);
66*fb1b10abSAndroid Build Coastguard Worker res.val[1] = vcombine_s16(s1, s3);
67*fb1b10abSAndroid Build Coastguard Worker return res;
68*fb1b10abSAndroid Build Coastguard Worker #else
69*fb1b10abSAndroid Build Coastguard Worker return vld2q_s16(buf);
70*fb1b10abSAndroid Build Coastguard Worker #endif
71*fb1b10abSAndroid Build Coastguard Worker }
72*fb1b10abSAndroid Build Coastguard Worker
load_tran_low_to_s16q(const tran_low_t * buf)73*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
74*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
75*fb1b10abSAndroid Build Coastguard Worker const int32x4_t v0 = vld1q_s32(buf);
76*fb1b10abSAndroid Build Coastguard Worker const int32x4_t v1 = vld1q_s32(buf + 4);
77*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s0 = vmovn_s32(v0);
78*fb1b10abSAndroid Build Coastguard Worker const int16x4_t s1 = vmovn_s32(v1);
79*fb1b10abSAndroid Build Coastguard Worker return vcombine_s16(s0, s1);
80*fb1b10abSAndroid Build Coastguard Worker #else
81*fb1b10abSAndroid Build Coastguard Worker return vld1q_s16(buf);
82*fb1b10abSAndroid Build Coastguard Worker #endif
83*fb1b10abSAndroid Build Coastguard Worker }
84*fb1b10abSAndroid Build Coastguard Worker
load_tran_low_to_s16d(const tran_low_t * buf)85*fb1b10abSAndroid Build Coastguard Worker static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
86*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
87*fb1b10abSAndroid Build Coastguard Worker const int32x4_t v0 = vld1q_s32(buf);
88*fb1b10abSAndroid Build Coastguard Worker return vmovn_s32(v0);
89*fb1b10abSAndroid Build Coastguard Worker #else
90*fb1b10abSAndroid Build Coastguard Worker return vld1_s16(buf);
91*fb1b10abSAndroid Build Coastguard Worker #endif
92*fb1b10abSAndroid Build Coastguard Worker }
93*fb1b10abSAndroid Build Coastguard Worker
store_s16q_to_tran_low(tran_low_t * buf,const int16x8_t a)94*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
95*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
96*fb1b10abSAndroid Build Coastguard Worker const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
97*fb1b10abSAndroid Build Coastguard Worker const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
98*fb1b10abSAndroid Build Coastguard Worker vst1q_s32(buf, v0);
99*fb1b10abSAndroid Build Coastguard Worker vst1q_s32(buf + 4, v1);
100*fb1b10abSAndroid Build Coastguard Worker #else
101*fb1b10abSAndroid Build Coastguard Worker vst1q_s16(buf, a);
102*fb1b10abSAndroid Build Coastguard Worker #endif
103*fb1b10abSAndroid Build Coastguard Worker }
104*fb1b10abSAndroid Build Coastguard Worker
105*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
store_s32q_to_tran_low(tran_low_t * buf,const int32x4_t a)106*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) {
107*fb1b10abSAndroid Build Coastguard Worker vst1q_s32(buf, a);
108*fb1b10abSAndroid Build Coastguard Worker }
109*fb1b10abSAndroid Build Coastguard Worker
load_tran_low_to_s32q(const tran_low_t * buf)110*fb1b10abSAndroid Build Coastguard Worker static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) {
111*fb1b10abSAndroid Build Coastguard Worker return vld1q_s32(buf);
112*fb1b10abSAndroid Build Coastguard Worker }
113*fb1b10abSAndroid Build Coastguard Worker #endif
114*fb1b10abSAndroid Build Coastguard Worker
// Writes the 4 bytes of |a| to |buf| using memcpy() so that the destination is
// treated as plain bytes. Storing through a uint32_t pointer instead would let
// the compiler assume the 4-byte alignment required by uint32_t and attach
// alignment hints to the memory access.
//
// Intended for code operating on uint8_t data that wants to store 4 values at
// a time at addresses which may not sit on a 4-byte boundary.
static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
  memcpy(buf, &a, sizeof(a));
}
124*fb1b10abSAndroid Build Coastguard Worker
125*fb1b10abSAndroid Build Coastguard Worker // Load 4 contiguous bytes when alignment is not guaranteed.
load_unaligned_u8_4x1(const uint8_t * buf)126*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
127*fb1b10abSAndroid Build Coastguard Worker uint32_t a;
128*fb1b10abSAndroid Build Coastguard Worker uint32x2_t a_u32;
129*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
130*fb1b10abSAndroid Build Coastguard Worker a_u32 = vdup_n_u32(0);
131*fb1b10abSAndroid Build Coastguard Worker a_u32 = vset_lane_u32(a, a_u32, 0);
132*fb1b10abSAndroid Build Coastguard Worker return vreinterpret_u8_u32(a_u32);
133*fb1b10abSAndroid Build Coastguard Worker }
134*fb1b10abSAndroid Build Coastguard Worker
135*fb1b10abSAndroid Build Coastguard Worker // Load 4 contiguous bytes and replicate across a vector when alignment is not
136*fb1b10abSAndroid Build Coastguard Worker // guaranteed.
load_replicate_u8_4x1(const uint8_t * buf)137*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) {
138*fb1b10abSAndroid Build Coastguard Worker uint32_t a;
139*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
140*fb1b10abSAndroid Build Coastguard Worker return vreinterpret_u8_u32(vdup_n_u32(a));
141*fb1b10abSAndroid Build Coastguard Worker }
142*fb1b10abSAndroid Build Coastguard Worker
143*fb1b10abSAndroid Build Coastguard Worker // Store 4 contiguous bytes from the low half of an 8x8 vector.
store_u8_4x1(uint8_t * buf,uint8x8_t a)144*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) {
145*fb1b10abSAndroid Build Coastguard Worker vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0);
146*fb1b10abSAndroid Build Coastguard Worker }
147*fb1b10abSAndroid Build Coastguard Worker
148*fb1b10abSAndroid Build Coastguard Worker // Store 4 contiguous bytes from the high half of an 8x8 vector.
store_u8_4x1_high(uint8_t * buf,uint8x8_t a)149*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
150*fb1b10abSAndroid Build Coastguard Worker vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1);
151*fb1b10abSAndroid Build Coastguard Worker }
152*fb1b10abSAndroid Build Coastguard Worker
153*fb1b10abSAndroid Build Coastguard Worker // Load 2 sets of 4 bytes when alignment is not guaranteed.
load_unaligned_u8(const uint8_t * buf,ptrdiff_t stride)154*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
155*fb1b10abSAndroid Build Coastguard Worker ptrdiff_t stride) {
156*fb1b10abSAndroid Build Coastguard Worker uint32_t a;
157*fb1b10abSAndroid Build Coastguard Worker uint32x2_t a_u32 = vdup_n_u32(0);
158*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
159*fb1b10abSAndroid Build Coastguard Worker buf += stride;
160*fb1b10abSAndroid Build Coastguard Worker a_u32 = vset_lane_u32(a, a_u32, 0);
161*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
162*fb1b10abSAndroid Build Coastguard Worker a_u32 = vset_lane_u32(a, a_u32, 1);
163*fb1b10abSAndroid Build Coastguard Worker return vreinterpret_u8_u32(a_u32);
164*fb1b10abSAndroid Build Coastguard Worker }
165*fb1b10abSAndroid Build Coastguard Worker
166*fb1b10abSAndroid Build Coastguard Worker // Load 8 bytes when alignment is not guaranteed.
load_unaligned_u16(const uint16_t * buf)167*fb1b10abSAndroid Build Coastguard Worker static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
168*fb1b10abSAndroid Build Coastguard Worker uint64_t a;
169*fb1b10abSAndroid Build Coastguard Worker uint64x1_t a_u64 = vdup_n_u64(0);
170*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 8);
171*fb1b10abSAndroid Build Coastguard Worker a_u64 = vset_lane_u64(a, a_u64, 0);
172*fb1b10abSAndroid Build Coastguard Worker return vreinterpret_u16_u64(a_u64);
173*fb1b10abSAndroid Build Coastguard Worker }
174*fb1b10abSAndroid Build Coastguard Worker
175*fb1b10abSAndroid Build Coastguard Worker // Load 2 sets of 8 bytes when alignment is not guaranteed.
load_unaligned_u16q(const uint16_t * buf,ptrdiff_t stride)176*fb1b10abSAndroid Build Coastguard Worker static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
177*fb1b10abSAndroid Build Coastguard Worker ptrdiff_t stride) {
178*fb1b10abSAndroid Build Coastguard Worker uint64_t a;
179*fb1b10abSAndroid Build Coastguard Worker uint64x2_t a_u64 = vdupq_n_u64(0);
180*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 8);
181*fb1b10abSAndroid Build Coastguard Worker buf += stride;
182*fb1b10abSAndroid Build Coastguard Worker a_u64 = vsetq_lane_u64(a, a_u64, 0);
183*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 8);
184*fb1b10abSAndroid Build Coastguard Worker a_u64 = vsetq_lane_u64(a, a_u64, 1);
185*fb1b10abSAndroid Build Coastguard Worker return vreinterpretq_u16_u64(a_u64);
186*fb1b10abSAndroid Build Coastguard Worker }
187*fb1b10abSAndroid Build Coastguard Worker
188*fb1b10abSAndroid Build Coastguard Worker // Store 2 sets of 4 bytes when alignment is not guaranteed.
store_unaligned_u8(uint8_t * buf,ptrdiff_t stride,const uint8x8_t a)189*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
190*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t a) {
191*fb1b10abSAndroid Build Coastguard Worker const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
192*fb1b10abSAndroid Build Coastguard Worker uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
193*fb1b10abSAndroid Build Coastguard Worker buf += stride;
194*fb1b10abSAndroid Build Coastguard Worker uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
195*fb1b10abSAndroid Build Coastguard Worker }
196*fb1b10abSAndroid Build Coastguard Worker
197*fb1b10abSAndroid Build Coastguard Worker // Load 4 sets of 4 bytes when alignment is not guaranteed.
load_unaligned_u8q(const uint8_t * buf,ptrdiff_t stride)198*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
199*fb1b10abSAndroid Build Coastguard Worker ptrdiff_t stride) {
200*fb1b10abSAndroid Build Coastguard Worker uint32_t a;
201*fb1b10abSAndroid Build Coastguard Worker uint32x4_t a_u32 = vdupq_n_u32(0);
202*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
203*fb1b10abSAndroid Build Coastguard Worker buf += stride;
204*fb1b10abSAndroid Build Coastguard Worker a_u32 = vsetq_lane_u32(a, a_u32, 0);
205*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
206*fb1b10abSAndroid Build Coastguard Worker buf += stride;
207*fb1b10abSAndroid Build Coastguard Worker a_u32 = vsetq_lane_u32(a, a_u32, 1);
208*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
209*fb1b10abSAndroid Build Coastguard Worker buf += stride;
210*fb1b10abSAndroid Build Coastguard Worker a_u32 = vsetq_lane_u32(a, a_u32, 2);
211*fb1b10abSAndroid Build Coastguard Worker memcpy(&a, buf, 4);
212*fb1b10abSAndroid Build Coastguard Worker buf += stride;
213*fb1b10abSAndroid Build Coastguard Worker a_u32 = vsetq_lane_u32(a, a_u32, 3);
214*fb1b10abSAndroid Build Coastguard Worker return vreinterpretq_u8_u32(a_u32);
215*fb1b10abSAndroid Build Coastguard Worker }
216*fb1b10abSAndroid Build Coastguard Worker
217*fb1b10abSAndroid Build Coastguard Worker // Store 4 sets of 4 bytes when alignment is not guaranteed.
store_unaligned_u8q(uint8_t * buf,ptrdiff_t stride,const uint8x16_t a)218*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
219*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t a) {
220*fb1b10abSAndroid Build Coastguard Worker const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
221*fb1b10abSAndroid Build Coastguard Worker uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
222*fb1b10abSAndroid Build Coastguard Worker buf += stride;
223*fb1b10abSAndroid Build Coastguard Worker uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
224*fb1b10abSAndroid Build Coastguard Worker buf += stride;
225*fb1b10abSAndroid Build Coastguard Worker uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
226*fb1b10abSAndroid Build Coastguard Worker buf += stride;
227*fb1b10abSAndroid Build Coastguard Worker uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
228*fb1b10abSAndroid Build Coastguard Worker }
229*fb1b10abSAndroid Build Coastguard Worker
230*fb1b10abSAndroid Build Coastguard Worker // Load 2 sets of 4 bytes when alignment is guaranteed.
load_u8(const uint8_t * buf,ptrdiff_t stride)231*fb1b10abSAndroid Build Coastguard Worker static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
232*fb1b10abSAndroid Build Coastguard Worker uint32x2_t a = vdup_n_u32(0);
233*fb1b10abSAndroid Build Coastguard Worker
234*fb1b10abSAndroid Build Coastguard Worker assert(!((intptr_t)buf % sizeof(uint32_t)));
235*fb1b10abSAndroid Build Coastguard Worker assert(!(stride % sizeof(uint32_t)));
236*fb1b10abSAndroid Build Coastguard Worker
237*fb1b10abSAndroid Build Coastguard Worker a = vld1_lane_u32((const uint32_t *)buf, a, 0);
238*fb1b10abSAndroid Build Coastguard Worker buf += stride;
239*fb1b10abSAndroid Build Coastguard Worker a = vld1_lane_u32((const uint32_t *)buf, a, 1);
240*fb1b10abSAndroid Build Coastguard Worker return vreinterpret_u8_u32(a);
241*fb1b10abSAndroid Build Coastguard Worker }
242*fb1b10abSAndroid Build Coastguard Worker
243*fb1b10abSAndroid Build Coastguard Worker // Store 2 sets of 4 bytes when alignment is guaranteed.
store_u8(uint8_t * buf,ptrdiff_t stride,const uint8x8_t a)244*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
245*fb1b10abSAndroid Build Coastguard Worker uint32x2_t a_u32 = vreinterpret_u32_u8(a);
246*fb1b10abSAndroid Build Coastguard Worker
247*fb1b10abSAndroid Build Coastguard Worker assert(!((intptr_t)buf % sizeof(uint32_t)));
248*fb1b10abSAndroid Build Coastguard Worker assert(!(stride % sizeof(uint32_t)));
249*fb1b10abSAndroid Build Coastguard Worker
250*fb1b10abSAndroid Build Coastguard Worker vst1_lane_u32((uint32_t *)buf, a_u32, 0);
251*fb1b10abSAndroid Build Coastguard Worker buf += stride;
252*fb1b10abSAndroid Build Coastguard Worker vst1_lane_u32((uint32_t *)buf, a_u32, 1);
253*fb1b10abSAndroid Build Coastguard Worker }
254*fb1b10abSAndroid Build Coastguard Worker
store_u8_8x3(uint8_t * s,const ptrdiff_t p,const uint8x8_t s0,const uint8x8_t s1,const uint8x8_t s2)255*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p,
256*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s0, const uint8x8_t s1,
257*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s2) {
258*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s0);
259*fb1b10abSAndroid Build Coastguard Worker s += p;
260*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s1);
261*fb1b10abSAndroid Build Coastguard Worker s += p;
262*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s2);
263*fb1b10abSAndroid Build Coastguard Worker }
264*fb1b10abSAndroid Build Coastguard Worker
load_u8_8x3(const uint8_t * s,const ptrdiff_t p,uint8x8_t * const s0,uint8x8_t * const s1,uint8x8_t * const s2)265*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
266*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s0, uint8x8_t *const s1,
267*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s2) {
268*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_u8(s);
269*fb1b10abSAndroid Build Coastguard Worker s += p;
270*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_u8(s);
271*fb1b10abSAndroid Build Coastguard Worker s += p;
272*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_u8(s);
273*fb1b10abSAndroid Build Coastguard Worker }
274*fb1b10abSAndroid Build Coastguard Worker
load_u8_8x4(const uint8_t * s,const ptrdiff_t p,uint8x8_t * const s0,uint8x8_t * const s1,uint8x8_t * const s2,uint8x8_t * const s3)275*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
276*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s0, uint8x8_t *const s1,
277*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s2, uint8x8_t *const s3) {
278*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_u8(s);
279*fb1b10abSAndroid Build Coastguard Worker s += p;
280*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_u8(s);
281*fb1b10abSAndroid Build Coastguard Worker s += p;
282*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_u8(s);
283*fb1b10abSAndroid Build Coastguard Worker s += p;
284*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1_u8(s);
285*fb1b10abSAndroid Build Coastguard Worker }
286*fb1b10abSAndroid Build Coastguard Worker
store_u8_8x4(uint8_t * s,const ptrdiff_t p,const uint8x8_t s0,const uint8x8_t s1,const uint8x8_t s2,const uint8x8_t s3)287*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
288*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s0, const uint8x8_t s1,
289*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s2, const uint8x8_t s3) {
290*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s0);
291*fb1b10abSAndroid Build Coastguard Worker s += p;
292*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s1);
293*fb1b10abSAndroid Build Coastguard Worker s += p;
294*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s2);
295*fb1b10abSAndroid Build Coastguard Worker s += p;
296*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s3);
297*fb1b10abSAndroid Build Coastguard Worker }
298*fb1b10abSAndroid Build Coastguard Worker
load_u8_16x3(const uint8_t * s,const ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2)299*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p,
300*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s0, uint8x16_t *const s1,
301*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s2) {
302*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_u8(s);
303*fb1b10abSAndroid Build Coastguard Worker s += p;
304*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_u8(s);
305*fb1b10abSAndroid Build Coastguard Worker s += p;
306*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_u8(s);
307*fb1b10abSAndroid Build Coastguard Worker }
308*fb1b10abSAndroid Build Coastguard Worker
load_u8_16x4(const uint8_t * s,const ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2,uint8x16_t * const s3)309*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
310*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s0, uint8x16_t *const s1,
311*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s2, uint8x16_t *const s3) {
312*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_u8(s);
313*fb1b10abSAndroid Build Coastguard Worker s += p;
314*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_u8(s);
315*fb1b10abSAndroid Build Coastguard Worker s += p;
316*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_u8(s);
317*fb1b10abSAndroid Build Coastguard Worker s += p;
318*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1q_u8(s);
319*fb1b10abSAndroid Build Coastguard Worker }
320*fb1b10abSAndroid Build Coastguard Worker
store_u8_16x4(uint8_t * s,const ptrdiff_t p,const uint8x16_t s0,const uint8x16_t s1,const uint8x16_t s2,const uint8x16_t s3)321*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
322*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t s0, const uint8x16_t s1,
323*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t s2, const uint8x16_t s3) {
324*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s0);
325*fb1b10abSAndroid Build Coastguard Worker s += p;
326*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s1);
327*fb1b10abSAndroid Build Coastguard Worker s += p;
328*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s2);
329*fb1b10abSAndroid Build Coastguard Worker s += p;
330*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s3);
331*fb1b10abSAndroid Build Coastguard Worker }
332*fb1b10abSAndroid Build Coastguard Worker
load_u8_8x7(const uint8_t * s,const ptrdiff_t p,uint8x8_t * const s0,uint8x8_t * const s1,uint8x8_t * const s2,uint8x8_t * const s3,uint8x8_t * const s4,uint8x8_t * const s5,uint8x8_t * const s6)333*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
334*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s0, uint8x8_t *const s1,
335*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s2, uint8x8_t *const s3,
336*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s4, uint8x8_t *const s5,
337*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s6) {
338*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_u8(s);
339*fb1b10abSAndroid Build Coastguard Worker s += p;
340*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_u8(s);
341*fb1b10abSAndroid Build Coastguard Worker s += p;
342*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_u8(s);
343*fb1b10abSAndroid Build Coastguard Worker s += p;
344*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1_u8(s);
345*fb1b10abSAndroid Build Coastguard Worker s += p;
346*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1_u8(s);
347*fb1b10abSAndroid Build Coastguard Worker s += p;
348*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1_u8(s);
349*fb1b10abSAndroid Build Coastguard Worker s += p;
350*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1_u8(s);
351*fb1b10abSAndroid Build Coastguard Worker }
352*fb1b10abSAndroid Build Coastguard Worker
load_u8_8x8(const uint8_t * s,const ptrdiff_t p,uint8x8_t * const s0,uint8x8_t * const s1,uint8x8_t * const s2,uint8x8_t * const s3,uint8x8_t * const s4,uint8x8_t * const s5,uint8x8_t * const s6,uint8x8_t * const s7)353*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
354*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s0, uint8x8_t *const s1,
355*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s2, uint8x8_t *const s3,
356*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s4, uint8x8_t *const s5,
357*fb1b10abSAndroid Build Coastguard Worker uint8x8_t *const s6, uint8x8_t *const s7) {
358*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_u8(s);
359*fb1b10abSAndroid Build Coastguard Worker s += p;
360*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_u8(s);
361*fb1b10abSAndroid Build Coastguard Worker s += p;
362*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_u8(s);
363*fb1b10abSAndroid Build Coastguard Worker s += p;
364*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1_u8(s);
365*fb1b10abSAndroid Build Coastguard Worker s += p;
366*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1_u8(s);
367*fb1b10abSAndroid Build Coastguard Worker s += p;
368*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1_u8(s);
369*fb1b10abSAndroid Build Coastguard Worker s += p;
370*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1_u8(s);
371*fb1b10abSAndroid Build Coastguard Worker s += p;
372*fb1b10abSAndroid Build Coastguard Worker *s7 = vld1_u8(s);
373*fb1b10abSAndroid Build Coastguard Worker }
374*fb1b10abSAndroid Build Coastguard Worker
store_u8_8x8(uint8_t * s,const ptrdiff_t p,const uint8x8_t s0,const uint8x8_t s1,const uint8x8_t s2,const uint8x8_t s3,const uint8x8_t s4,const uint8x8_t s5,const uint8x8_t s6,const uint8x8_t s7)375*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
376*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s0, const uint8x8_t s1,
377*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s2, const uint8x8_t s3,
378*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s4, const uint8x8_t s5,
379*fb1b10abSAndroid Build Coastguard Worker const uint8x8_t s6, const uint8x8_t s7) {
380*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s0);
381*fb1b10abSAndroid Build Coastguard Worker s += p;
382*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s1);
383*fb1b10abSAndroid Build Coastguard Worker s += p;
384*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s2);
385*fb1b10abSAndroid Build Coastguard Worker s += p;
386*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s3);
387*fb1b10abSAndroid Build Coastguard Worker s += p;
388*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s4);
389*fb1b10abSAndroid Build Coastguard Worker s += p;
390*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s5);
391*fb1b10abSAndroid Build Coastguard Worker s += p;
392*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s6);
393*fb1b10abSAndroid Build Coastguard Worker s += p;
394*fb1b10abSAndroid Build Coastguard Worker vst1_u8(s, s7);
395*fb1b10abSAndroid Build Coastguard Worker }
396*fb1b10abSAndroid Build Coastguard Worker
load_u8_16x8(const uint8_t * s,const ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2,uint8x16_t * const s3,uint8x16_t * const s4,uint8x16_t * const s5,uint8x16_t * const s6,uint8x16_t * const s7)397*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
398*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s0, uint8x16_t *const s1,
399*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s2, uint8x16_t *const s3,
400*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s4, uint8x16_t *const s5,
401*fb1b10abSAndroid Build Coastguard Worker uint8x16_t *const s6, uint8x16_t *const s7) {
402*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_u8(s);
403*fb1b10abSAndroid Build Coastguard Worker s += p;
404*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_u8(s);
405*fb1b10abSAndroid Build Coastguard Worker s += p;
406*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_u8(s);
407*fb1b10abSAndroid Build Coastguard Worker s += p;
408*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1q_u8(s);
409*fb1b10abSAndroid Build Coastguard Worker s += p;
410*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1q_u8(s);
411*fb1b10abSAndroid Build Coastguard Worker s += p;
412*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1q_u8(s);
413*fb1b10abSAndroid Build Coastguard Worker s += p;
414*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1q_u8(s);
415*fb1b10abSAndroid Build Coastguard Worker s += p;
416*fb1b10abSAndroid Build Coastguard Worker *s7 = vld1q_u8(s);
417*fb1b10abSAndroid Build Coastguard Worker }
418*fb1b10abSAndroid Build Coastguard Worker
store_u8_16x8(uint8_t * s,const ptrdiff_t p,const uint8x16_t s0,const uint8x16_t s1,const uint8x16_t s2,const uint8x16_t s3,const uint8x16_t s4,const uint8x16_t s5,const uint8x16_t s6,const uint8x16_t s7)419*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
420*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t s0, const uint8x16_t s1,
421*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t s2, const uint8x16_t s3,
422*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t s4, const uint8x16_t s5,
423*fb1b10abSAndroid Build Coastguard Worker const uint8x16_t s6, const uint8x16_t s7) {
424*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s0);
425*fb1b10abSAndroid Build Coastguard Worker s += p;
426*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s1);
427*fb1b10abSAndroid Build Coastguard Worker s += p;
428*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s2);
429*fb1b10abSAndroid Build Coastguard Worker s += p;
430*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s3);
431*fb1b10abSAndroid Build Coastguard Worker s += p;
432*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s4);
433*fb1b10abSAndroid Build Coastguard Worker s += p;
434*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s5);
435*fb1b10abSAndroid Build Coastguard Worker s += p;
436*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s6);
437*fb1b10abSAndroid Build Coastguard Worker s += p;
438*fb1b10abSAndroid Build Coastguard Worker vst1q_u8(s, s7);
439*fb1b10abSAndroid Build Coastguard Worker }
440*fb1b10abSAndroid Build Coastguard Worker
store_u16_4x3(uint16_t * s,const ptrdiff_t p,const uint16x4_t s0,const uint16x4_t s1,const uint16x4_t s2)441*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
442*fb1b10abSAndroid Build Coastguard Worker const uint16x4_t s0, const uint16x4_t s1,
443*fb1b10abSAndroid Build Coastguard Worker const uint16x4_t s2) {
444*fb1b10abSAndroid Build Coastguard Worker vst1_u16(s, s0);
445*fb1b10abSAndroid Build Coastguard Worker s += p;
446*fb1b10abSAndroid Build Coastguard Worker vst1_u16(s, s1);
447*fb1b10abSAndroid Build Coastguard Worker s += p;
448*fb1b10abSAndroid Build Coastguard Worker vst1_u16(s, s2);
449*fb1b10abSAndroid Build Coastguard Worker }
450*fb1b10abSAndroid Build Coastguard Worker
load_s16_4x3(const int16_t * s,const ptrdiff_t p,int16x4_t * s0,int16x4_t * s1,int16x4_t * s2)451*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
452*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
453*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_s16(s);
454*fb1b10abSAndroid Build Coastguard Worker s += p;
455*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_s16(s);
456*fb1b10abSAndroid Build Coastguard Worker s += p;
457*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_s16(s);
458*fb1b10abSAndroid Build Coastguard Worker }
459*fb1b10abSAndroid Build Coastguard Worker
load_s16_4x4(const int16_t * s,const ptrdiff_t p,int16x4_t * s0,int16x4_t * s1,int16x4_t * s2,int16x4_t * s3)460*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
461*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
462*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s3) {
463*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_s16(s);
464*fb1b10abSAndroid Build Coastguard Worker s += p;
465*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_s16(s);
466*fb1b10abSAndroid Build Coastguard Worker s += p;
467*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_s16(s);
468*fb1b10abSAndroid Build Coastguard Worker s += p;
469*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1_s16(s);
470*fb1b10abSAndroid Build Coastguard Worker }
471*fb1b10abSAndroid Build Coastguard Worker
store_u16_4x4(uint16_t * s,const ptrdiff_t p,const uint16x4_t s0,const uint16x4_t s1,const uint16x4_t s2,const uint16x4_t s3)472*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
473*fb1b10abSAndroid Build Coastguard Worker const uint16x4_t s0, const uint16x4_t s1,
474*fb1b10abSAndroid Build Coastguard Worker const uint16x4_t s2, const uint16x4_t s3) {
475*fb1b10abSAndroid Build Coastguard Worker vst1_u16(s, s0);
476*fb1b10abSAndroid Build Coastguard Worker s += p;
477*fb1b10abSAndroid Build Coastguard Worker vst1_u16(s, s1);
478*fb1b10abSAndroid Build Coastguard Worker s += p;
479*fb1b10abSAndroid Build Coastguard Worker vst1_u16(s, s2);
480*fb1b10abSAndroid Build Coastguard Worker s += p;
481*fb1b10abSAndroid Build Coastguard Worker vst1_u16(s, s3);
482*fb1b10abSAndroid Build Coastguard Worker }
483*fb1b10abSAndroid Build Coastguard Worker
load_s16_4x7(const int16_t * s,const ptrdiff_t p,int16x4_t * s0,int16x4_t * s1,int16x4_t * s2,int16x4_t * s3,int16x4_t * s4,int16x4_t * s5,int16x4_t * s6)484*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
485*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
486*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
487*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s6) {
488*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_s16(s);
489*fb1b10abSAndroid Build Coastguard Worker s += p;
490*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_s16(s);
491*fb1b10abSAndroid Build Coastguard Worker s += p;
492*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_s16(s);
493*fb1b10abSAndroid Build Coastguard Worker s += p;
494*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1_s16(s);
495*fb1b10abSAndroid Build Coastguard Worker s += p;
496*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1_s16(s);
497*fb1b10abSAndroid Build Coastguard Worker s += p;
498*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1_s16(s);
499*fb1b10abSAndroid Build Coastguard Worker s += p;
500*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1_s16(s);
501*fb1b10abSAndroid Build Coastguard Worker }
502*fb1b10abSAndroid Build Coastguard Worker
load_s16_8x3(const int16_t * s,const ptrdiff_t p,int16x8_t * s0,int16x8_t * s1,int16x8_t * s2)503*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
504*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
505*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_s16(s);
506*fb1b10abSAndroid Build Coastguard Worker s += p;
507*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_s16(s);
508*fb1b10abSAndroid Build Coastguard Worker s += p;
509*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_s16(s);
510*fb1b10abSAndroid Build Coastguard Worker }
511*fb1b10abSAndroid Build Coastguard Worker
load_s16_8x4(const int16_t * s,const ptrdiff_t p,int16x8_t * s0,int16x8_t * s1,int16x8_t * s2,int16x8_t * s3)512*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
513*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
514*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s3) {
515*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_s16(s);
516*fb1b10abSAndroid Build Coastguard Worker s += p;
517*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_s16(s);
518*fb1b10abSAndroid Build Coastguard Worker s += p;
519*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_s16(s);
520*fb1b10abSAndroid Build Coastguard Worker s += p;
521*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1q_s16(s);
522*fb1b10abSAndroid Build Coastguard Worker }
523*fb1b10abSAndroid Build Coastguard Worker
load_u16_8x4(const uint16_t * s,const ptrdiff_t p,uint16x8_t * s0,uint16x8_t * s1,uint16x8_t * s2,uint16x8_t * s3)524*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
525*fb1b10abSAndroid Build Coastguard Worker uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
526*fb1b10abSAndroid Build Coastguard Worker uint16x8_t *s3) {
527*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_u16(s);
528*fb1b10abSAndroid Build Coastguard Worker s += p;
529*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_u16(s);
530*fb1b10abSAndroid Build Coastguard Worker s += p;
531*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_u16(s);
532*fb1b10abSAndroid Build Coastguard Worker s += p;
533*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1q_u16(s);
534*fb1b10abSAndroid Build Coastguard Worker }
535*fb1b10abSAndroid Build Coastguard Worker
store_u16_8x4(uint16_t * s,const ptrdiff_t p,const uint16x8_t s0,const uint16x8_t s1,const uint16x8_t s2,const uint16x8_t s3)536*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
537*fb1b10abSAndroid Build Coastguard Worker const uint16x8_t s0, const uint16x8_t s1,
538*fb1b10abSAndroid Build Coastguard Worker const uint16x8_t s2, const uint16x8_t s3) {
539*fb1b10abSAndroid Build Coastguard Worker vst1q_u16(s, s0);
540*fb1b10abSAndroid Build Coastguard Worker s += p;
541*fb1b10abSAndroid Build Coastguard Worker vst1q_u16(s, s1);
542*fb1b10abSAndroid Build Coastguard Worker s += p;
543*fb1b10abSAndroid Build Coastguard Worker vst1q_u16(s, s2);
544*fb1b10abSAndroid Build Coastguard Worker s += p;
545*fb1b10abSAndroid Build Coastguard Worker vst1q_u16(s, s3);
546*fb1b10abSAndroid Build Coastguard Worker }
547*fb1b10abSAndroid Build Coastguard Worker
store_u16_8x3(uint16_t * s,const ptrdiff_t p,const uint16x8_t s0,const uint16x8_t s1,const uint16x8_t s2)548*fb1b10abSAndroid Build Coastguard Worker static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
549*fb1b10abSAndroid Build Coastguard Worker const uint16x8_t s0, const uint16x8_t s1,
550*fb1b10abSAndroid Build Coastguard Worker const uint16x8_t s2) {
551*fb1b10abSAndroid Build Coastguard Worker vst1q_u16(s, s0);
552*fb1b10abSAndroid Build Coastguard Worker s += p;
553*fb1b10abSAndroid Build Coastguard Worker vst1q_u16(s, s1);
554*fb1b10abSAndroid Build Coastguard Worker s += p;
555*fb1b10abSAndroid Build Coastguard Worker vst1q_u16(s, s2);
556*fb1b10abSAndroid Build Coastguard Worker }
557*fb1b10abSAndroid Build Coastguard Worker
load_s16_8x7(const int16_t * s,const ptrdiff_t p,int16x8_t * s0,int16x8_t * s1,int16x8_t * s2,int16x8_t * s3,int16x8_t * s4,int16x8_t * s5,int16x8_t * s6)558*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
559*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
560*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
561*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s6) {
562*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_s16(s);
563*fb1b10abSAndroid Build Coastguard Worker s += p;
564*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_s16(s);
565*fb1b10abSAndroid Build Coastguard Worker s += p;
566*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_s16(s);
567*fb1b10abSAndroid Build Coastguard Worker s += p;
568*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1q_s16(s);
569*fb1b10abSAndroid Build Coastguard Worker s += p;
570*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1q_s16(s);
571*fb1b10abSAndroid Build Coastguard Worker s += p;
572*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1q_s16(s);
573*fb1b10abSAndroid Build Coastguard Worker s += p;
574*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1q_s16(s);
575*fb1b10abSAndroid Build Coastguard Worker }
576*fb1b10abSAndroid Build Coastguard Worker
load_u16_8x8(const uint16_t * s,const ptrdiff_t p,uint16x8_t * s0,uint16x8_t * s1,uint16x8_t * s2,uint16x8_t * s3,uint16x8_t * s4,uint16x8_t * s5,uint16x8_t * s6,uint16x8_t * s7)577*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
578*fb1b10abSAndroid Build Coastguard Worker uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
579*fb1b10abSAndroid Build Coastguard Worker uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
580*fb1b10abSAndroid Build Coastguard Worker uint16x8_t *s6, uint16x8_t *s7) {
581*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_u16(s);
582*fb1b10abSAndroid Build Coastguard Worker s += p;
583*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_u16(s);
584*fb1b10abSAndroid Build Coastguard Worker s += p;
585*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_u16(s);
586*fb1b10abSAndroid Build Coastguard Worker s += p;
587*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1q_u16(s);
588*fb1b10abSAndroid Build Coastguard Worker s += p;
589*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1q_u16(s);
590*fb1b10abSAndroid Build Coastguard Worker s += p;
591*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1q_u16(s);
592*fb1b10abSAndroid Build Coastguard Worker s += p;
593*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1q_u16(s);
594*fb1b10abSAndroid Build Coastguard Worker s += p;
595*fb1b10abSAndroid Build Coastguard Worker *s7 = vld1q_u16(s);
596*fb1b10abSAndroid Build Coastguard Worker }
597*fb1b10abSAndroid Build Coastguard Worker
load_s16_4x8(const int16_t * s,const ptrdiff_t p,int16x4_t * s0,int16x4_t * s1,int16x4_t * s2,int16x4_t * s3,int16x4_t * s4,int16x4_t * s5,int16x4_t * s6,int16x4_t * s7)598*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
599*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
600*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
601*fb1b10abSAndroid Build Coastguard Worker int16x4_t *s6, int16x4_t *s7) {
602*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1_s16(s);
603*fb1b10abSAndroid Build Coastguard Worker s += p;
604*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1_s16(s);
605*fb1b10abSAndroid Build Coastguard Worker s += p;
606*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1_s16(s);
607*fb1b10abSAndroid Build Coastguard Worker s += p;
608*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1_s16(s);
609*fb1b10abSAndroid Build Coastguard Worker s += p;
610*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1_s16(s);
611*fb1b10abSAndroid Build Coastguard Worker s += p;
612*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1_s16(s);
613*fb1b10abSAndroid Build Coastguard Worker s += p;
614*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1_s16(s);
615*fb1b10abSAndroid Build Coastguard Worker s += p;
616*fb1b10abSAndroid Build Coastguard Worker *s7 = vld1_s16(s);
617*fb1b10abSAndroid Build Coastguard Worker }
618*fb1b10abSAndroid Build Coastguard Worker
load_s16_8x8(const int16_t * s,const ptrdiff_t p,int16x8_t * s0,int16x8_t * s1,int16x8_t * s2,int16x8_t * s3,int16x8_t * s4,int16x8_t * s5,int16x8_t * s6,int16x8_t * s7)619*fb1b10abSAndroid Build Coastguard Worker static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
620*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
621*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
622*fb1b10abSAndroid Build Coastguard Worker int16x8_t *s6, int16x8_t *s7) {
623*fb1b10abSAndroid Build Coastguard Worker *s0 = vld1q_s16(s);
624*fb1b10abSAndroid Build Coastguard Worker s += p;
625*fb1b10abSAndroid Build Coastguard Worker *s1 = vld1q_s16(s);
626*fb1b10abSAndroid Build Coastguard Worker s += p;
627*fb1b10abSAndroid Build Coastguard Worker *s2 = vld1q_s16(s);
628*fb1b10abSAndroid Build Coastguard Worker s += p;
629*fb1b10abSAndroid Build Coastguard Worker *s3 = vld1q_s16(s);
630*fb1b10abSAndroid Build Coastguard Worker s += p;
631*fb1b10abSAndroid Build Coastguard Worker *s4 = vld1q_s16(s);
632*fb1b10abSAndroid Build Coastguard Worker s += p;
633*fb1b10abSAndroid Build Coastguard Worker *s5 = vld1q_s16(s);
634*fb1b10abSAndroid Build Coastguard Worker s += p;
635*fb1b10abSAndroid Build Coastguard Worker *s6 = vld1q_s16(s);
636*fb1b10abSAndroid Build Coastguard Worker s += p;
637*fb1b10abSAndroid Build Coastguard Worker *s7 = vld1q_s16(s);
638*fb1b10abSAndroid Build Coastguard Worker }
639*fb1b10abSAndroid Build Coastguard Worker
640*fb1b10abSAndroid Build Coastguard Worker #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
641