/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include <assert.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

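// Accumulate the absolute differences of two 16-byte rows into eight 16-bit
// running sums.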
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
                              uint16x8_t *const sad_sum) {
  uint8x16_t abs_diff = vabdq_u8(src, ref);
  *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
}

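// 64-wide SAD against four references: each row is processed as four 16-byte
// chunks, with alternating chunks folded into sum_lo/sum_hi so that the
// 16-bit accumulators cannot overflow for block heights up to 64.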
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                           vdupq_n_u16(0) };
  uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                           vdupq_n_u16(0) };

  int i = 0;
  do {
    uint8x16_t s0, s1, s2, s3;

    s0 = vld1q_u8(src + i * src_stride);
    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);

    s1 = vld1q_u8(src + i * src_stride + 16);
    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);

    s2 = vld1q_u8(src + i * src_stride + 32);
    sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
    sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
    sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
    sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);

    s3 = vld1q_u8(src + i * src_stride + 48);
    sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
    sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
    sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
    sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);

    i++;
  } while (i < h);

  vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
}

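// 32-wide SAD against four references: the two 16-byte halves of each row are
// accumulated separately and combined in the final widening reduction.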
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                           vdupq_n_u16(0) };
  uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                           vdupq_n_u16(0) };

  int i = 0;
  do {
    uint8x16_t s0, s1;

    s0 = vld1q_u8(src + i * src_stride);
    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);

    s1 = vld1q_u8(src + i * src_stride + 16);
    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);

    i++;
  } while (i < h);

  vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
}

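// 16-wide SAD against four references, one 16-byte load per row.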
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  int i = 0;
  do {
    const uint8x16_t s = vld1q_u8(src + i * src_stride);
    sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
    sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
    sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
    sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);

    i++;
  } while (i < h);

  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
}

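// Accumulate the absolute differences of two 8-byte rows into eight 16-bit
// running sums using a widening add.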
static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
                             uint16x8_t *const sad_sum) {
  uint8x8_t abs_diff = vabd_u8(src, ref);
  *sad_sum = vaddw_u8(*sad_sum, abs_diff);
}

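// 8-wide SAD against four references, one 8-byte load per row.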
static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  int i = 0;
  do {
    const uint8x8_t s = vld1_u8(src + i * src_stride);
    sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
    sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
    sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
    sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);

    i++;
  } while (i < h);

  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
}

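// 4-wide SAD against four references: load_unaligned_u8 packs two 4-byte rows
// into a single 8-byte vector, so each iteration consumes two rows.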
static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  int i = 0;
  do {
    uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
    uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
    uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
    uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
    uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);

    sad8_neon(s, r0, &sum[0]);
    sad8_neon(s, r1, &sum[1]);
    sad8_neon(s, r2, &sum[2]);
    sad8_neon(s, r3, &sum[3]);

    i += 2;
  } while (i < h);

  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
}

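// Define the public vpx_sad<w>x<h>x4d_neon entry points in terms of the
// width-specialized helpers above.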
#define SAD_WXH_4D_NEON(w, h)                                                 \
  void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride,     \
                                  const uint8_t *const ref_array[4],          \
                                  int ref_stride, uint32_t sad_array[4]) {    \
    sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \
                       (h));                                                  \
  }

SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)

SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)

SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)

SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)

SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)

#undef SAD_WXH_4D_NEON

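// The "skip" variants compute the SAD over every other row (by doubling the
// strides and halving the height) and then double the result to approximate
// the full-block SAD.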
#define SAD_SKIP_WXH_4D_NEON(w, h)                                          \
  void vpx_sad_skip_##w##x##h##x4d_neon(                                    \
      const uint8_t *src_ptr, int src_stride,                               \
      const uint8_t *const ref_array[4], int ref_stride,                    \
      uint32_t sad_array[4]) {                                              \
    sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride,  \
                       sad_array, ((h) >> 1));                              \
    sad_array[0] <<= 1;                                                     \
    sad_array[1] <<= 1;                                                     \
    sad_array[2] <<= 1;                                                     \
    sad_array[3] <<= 1;                                                     \
  }

SAD_SKIP_WXH_4D_NEON(4, 4)
SAD_SKIP_WXH_4D_NEON(4, 8)

SAD_SKIP_WXH_4D_NEON(8, 4)
SAD_SKIP_WXH_4D_NEON(8, 8)
SAD_SKIP_WXH_4D_NEON(8, 16)

SAD_SKIP_WXH_4D_NEON(16, 8)
SAD_SKIP_WXH_4D_NEON(16, 16)
SAD_SKIP_WXH_4D_NEON(16, 32)

SAD_SKIP_WXH_4D_NEON(32, 16)
SAD_SKIP_WXH_4D_NEON(32, 32)
SAD_SKIP_WXH_4D_NEON(32, 64)

SAD_SKIP_WXH_4D_NEON(64, 32)
SAD_SKIP_WXH_4D_NEON(64, 64)

#undef SAD_SKIP_WXH_4D_NEON