1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16
17 #include "aom/aom_integer.h"
18 #include "aom_dsp/arm/mem_neon.h"
19 #include "aom_dsp/arm/sum_neon.h"
20
// Adds the absolute differences between the 16 bytes of `src` and `ref` to the
// running total in `*sad_sum`. Adjacent pairs of byte differences are summed
// and accumulated into the eight 16-bit lanes, so each call adds at most
// 2 * 255 to any one lane.
static inline void sad16_neon(uint8x16_t src, uint8x16_t ref,
                              uint16x8_t *const sad_sum) {
  *sad_sum = vpadalq_u8(*sad_sum, vabdq_u8(src, ref));
}
26
// Computes the SAD of a w x h source block against three reference blocks and
// stores the three totals in res[0..2].
//
// Each row is consumed 32 columns at a time (the column loop always runs at
// least once). Partial sums are kept in 16-bit lanes and folded into 32-bit
// accumulators after every batch of at most `h_overflow` rows; callers
// presumably pick `h_overflow` so the 16-bit lanes cannot wrap within a batch.
// All three references are addressed with the same `ref_stride`.
static inline void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *const ref[3],
                                        int ref_stride, uint32_t res[3], int w,
                                        int h, int h_overflow) {
  uint32x4_t acc32[3];
  for (int k = 0; k < 3; k++) {
    acc32[k] = vdupq_n_u32(0);
  }

  // Row index (exclusive) at which the current batch ends.
  int batch_end = (h_overflow < h) ? h_overflow : h;

  int ref_pos = 0;
  int row = 0;
  do {
    // Fresh 16-bit accumulators for this batch: one set for the low 16 bytes
    // of every 32-byte step and one for the high 16 bytes.
    uint16x8_t acc16_lo[3];
    uint16x8_t acc16_hi[3];
    for (int k = 0; k < 3; k++) {
      acc16_lo[k] = vdupq_n_u16(0);
      acc16_hi[k] = vdupq_n_u16(0);
    }

    do {
      int col = 0;
      do {
        const uint8x16_t src_lo = vld1q_u8(src + col);
        const uint8x16_t src_hi = vld1q_u8(src + col + 16);

        for (int k = 0; k < 3; k++) {
          const uint8_t *const ref_row = ref[k] + ref_pos + col;
          sad16_neon(src_lo, vld1q_u8(ref_row), &acc16_lo[k]);
          sad16_neon(src_hi, vld1q_u8(ref_row + 16), &acc16_hi[k]);
        }

        col += 32;
      } while (col < w);

      src += src_stride;
      ref_pos += ref_stride;
      row++;
    } while (row < batch_end);

    // Widen the batch totals into the 32-bit accumulators.
    for (int k = 0; k < 3; k++) {
      acc32[k] = vpadalq_u16(vpadalq_u16(acc32[k], acc16_lo[k]), acc16_hi[k]);
    }

    batch_end += h_overflow;
  } while (row < h);

  for (int k = 0; k < 3; k++) {
    res[k] = horizontal_add_u32x4(acc32[k]);
  }
}
74
// SAD of a 128-wide, h-tall block against three references (res[0..2]).
//
// With 128 columns each row adds at most 4 * 2 * 255 = 2040 to a 16-bit lane
// of the shared kernel, so the lanes are flushed every 32 rows
// (32 * 2040 = 65280 still fits in 16 bits).
static inline void sad128xhx3d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[3], int ref_stride,
                                    uint32_t res[3], int h) {
  const int block_width = 128;
  const int rows_per_batch = 32;
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
                       rows_per_batch);
}
80
// SAD of a 64-wide, h-tall block against three references (res[0..2]).
//
// With 64 columns each row adds at most 2 * 2 * 255 = 1020 to a 16-bit lane
// of the shared kernel, so the lanes are flushed every 64 rows
// (64 * 1020 = 65280 still fits in 16 bits).
static inline void sad64xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  const int block_width = 64;
  const int rows_per_batch = 64;
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
                       rows_per_batch);
}
86
// SAD of a 32-wide, h-tall block against three references (res[0..2]).
//
// The low and high 16 bytes of each row are accumulated separately in 16-bit
// lanes (at most 2 * 255 per lane per row) and only widened in the final
// reduction. The row loop tests its counter after the body, so h must be at
// least 1.
static inline void sad32xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  uint16x8_t acc_lo[3];
  uint16x8_t acc_hi[3];
  for (int k = 0; k < 3; k++) {
    acc_lo[k] = vdupq_n_u16(0);
    acc_hi[k] = vdupq_n_u16(0);
  }

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_lo = vld1q_u8(src);
    const uint8x16_t src_hi = vld1q_u8(src + 16);

    for (int k = 0; k < 3; k++) {
      const uint8_t *const ref_row = ref[k] + ref_pos;
      sad16_neon(src_lo, vld1q_u8(ref_row), &acc_lo[k]);
      sad16_neon(src_hi, vld1q_u8(ref_row + 16), &acc_hi[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  for (int k = 0; k < 3; k++) {
    res[k] = horizontal_long_add_u16x8(acc_lo[k], acc_hi[k]);
  }
}
114
// SAD of a 16-wide, h-tall block against three references (res[0..2]).
//
// One 16-byte load per row per buffer; the differences are accumulated in
// 16-bit lanes (at most 2 * 255 per lane per row). The row loop tests its
// counter after the body, so h must be at least 1.
static inline void sad16xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  uint16x8_t acc[3];
  for (int k = 0; k < 3; k++) {
    acc[k] = vdupq_n_u16(0);
  }

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_row = vld1q_u8(src);

    for (int k = 0; k < 3; k++) {
      sad16_neon(src_row, vld1q_u8(ref[k] + ref_pos), &acc[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  for (int k = 0; k < 3; k++) {
    res[k] = horizontal_add_u16x8(acc[k]);
  }
}
136
// SAD of an 8-wide, h-tall block against three references (res[0..2]).
//
// The first row initialises the 16-bit accumulators with a widening absolute
// difference; the remaining h - 1 rows are added with widening
// absolute-difference-accumulate. The loop over the remaining rows tests its
// counter after the body, so h must be at least 2.
static inline void sad8xhx3d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[3], int ref_stride,
                                  uint32_t res[3], int h) {
  uint16x8_t acc[3];

  // Row 0 seeds the accumulators.
  uint8x8_t src_row = vld1_u8(src);
  for (int k = 0; k < 3; k++) {
    acc[k] = vabdl_u8(src_row, vld1_u8(ref[k]));
  }
  src += src_stride;

  // Rows 1 .. h - 1.
  int ref_pos = ref_stride;
  int rows_left = h - 1;
  do {
    src_row = vld1_u8(src);
    for (int k = 0; k < 3; k++) {
      acc[k] = vabal_u8(acc[k], src_row, vld1_u8(ref[k] + ref_pos));
    }

    src += src_stride;
    ref_pos += ref_stride;
  } while (--rows_left != 0);

  for (int k = 0; k < 3; k++) {
    res[k] = horizontal_add_u16x8(acc[k]);
  }
}
164
// SAD of a 4-wide, h-tall block against three references (res[0..2]).
//
// Rows are processed two at a time: load_unaligned_u8(ptr, stride) presumably
// packs the 4-byte rows at ptr and ptr + stride into one 8-byte vector
// (NOTE(review): confirm against mem_neon.h). h must therefore be even.
//
// The first row pair seeds the 16-bit accumulators; the remaining h / 2 - 1
// pairs are added in a pre-tested loop, so h == 2 is handled without reading
// past the block (matching the loop shape used by sad4xhx4d_neon).
static inline void sad4xhx3d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[3], int ref_stride,
                                  uint32_t res[3], int h) {
  assert(h % 2 == 0);
  uint16x8_t sum[3];

  // Rows 0 and 1.
  uint8x8_t s = load_unaligned_u8(src, src_stride);
  uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
  uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
  uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);

  sum[0] = vabdl_u8(s, r0);
  sum[1] = vabdl_u8(s, r1);
  sum[2] = vabdl_u8(s, r2);

  src += 2 * src_stride;
  int ref_offset = 2 * ref_stride;

  // Remaining row pairs. The counter is checked before the body so that a
  // two-row block performs no further loads.
  for (int i = (h / 2) - 1; i > 0; i--) {
    s = load_unaligned_u8(src, src_stride);
    r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
    r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
    r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);

    sum[0] = vabal_u8(sum[0], s, r0);
    sum[1] = vabal_u8(sum[1], s, r1);
    sum[2] = vabal_u8(sum[2], s, r2);

    src += 2 * src_stride;
    ref_offset += 2 * ref_stride;
  }

  res[0] = horizontal_add_u16x8(sum[0]);
  res[1] = horizontal_add_u16x8(sum[1]);
  res[2] = horizontal_add_u16x8(sum[2]);
}
201
// Defines the public entry point aom_sad<width>x<height>x3d_neon, which
// forwards to the matching fixed-width helper with the height as a constant.
// The prototype declares four-entry arrays, but the three-reference helpers
// only read ref[0..2] and only write res[0..2].
#define SAD_WXH_3D_NEON(width, height)                                       \
  void aom_sad##width##x##height##x3d_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],       \
      int ref_stride, uint32_t res[4]) {                                     \
    sad##width##xhx3d_neon(src, src_stride, ref, ref_stride, res, (height)); \
  }

// 4-wide blocks.
SAD_WXH_3D_NEON(4, 4)
SAD_WXH_3D_NEON(4, 8)

// 8-wide blocks.
SAD_WXH_3D_NEON(8, 4)
SAD_WXH_3D_NEON(8, 8)
SAD_WXH_3D_NEON(8, 16)

// 16-wide blocks.
SAD_WXH_3D_NEON(16, 8)
SAD_WXH_3D_NEON(16, 16)
SAD_WXH_3D_NEON(16, 32)

// 32-wide blocks.
SAD_WXH_3D_NEON(32, 16)
SAD_WXH_3D_NEON(32, 32)
SAD_WXH_3D_NEON(32, 64)

// 64-wide blocks.
SAD_WXH_3D_NEON(64, 32)
SAD_WXH_3D_NEON(64, 64)
SAD_WXH_3D_NEON(64, 128)

// 128-wide blocks.
SAD_WXH_3D_NEON(128, 64)
SAD_WXH_3D_NEON(128, 128)

// 1:4 and 4:1 block shapes, compiled out of realtime-only builds.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_3D_NEON(4, 16)
SAD_WXH_3D_NEON(8, 32)
SAD_WXH_3D_NEON(16, 4)
SAD_WXH_3D_NEON(16, 64)
SAD_WXH_3D_NEON(32, 8)
SAD_WXH_3D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_3D_NEON
241
// Computes the SAD of a w x h source block against four reference blocks and
// stores the four totals in res[0..3].
//
// Each row is consumed 32 columns at a time (the column loop always runs at
// least once). Partial sums are kept in 16-bit lanes and folded into 32-bit
// accumulators after every batch of at most `h_overflow` rows; callers
// presumably pick `h_overflow` so the 16-bit lanes cannot wrap within a batch.
// All four references are addressed with the same `ref_stride`.
static inline void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *const ref[4],
                                        int ref_stride, uint32_t res[4], int w,
                                        int h, int h_overflow) {
  uint32x4_t acc32[4];
  for (int k = 0; k < 4; k++) {
    acc32[k] = vdupq_n_u32(0);
  }

  // Row index (exclusive) at which the current batch ends.
  int batch_end = (h_overflow < h) ? h_overflow : h;

  int ref_pos = 0;
  int row = 0;
  do {
    // Fresh 16-bit accumulators for this batch: one set for the low 16 bytes
    // of every 32-byte step and one for the high 16 bytes.
    uint16x8_t acc16_lo[4];
    uint16x8_t acc16_hi[4];
    for (int k = 0; k < 4; k++) {
      acc16_lo[k] = vdupq_n_u16(0);
      acc16_hi[k] = vdupq_n_u16(0);
    }

    do {
      int col = 0;
      do {
        const uint8x16_t src_lo = vld1q_u8(src + col);
        const uint8x16_t src_hi = vld1q_u8(src + col + 16);

        for (int k = 0; k < 4; k++) {
          const uint8_t *const ref_row = ref[k] + ref_pos + col;
          sad16_neon(src_lo, vld1q_u8(ref_row), &acc16_lo[k]);
          sad16_neon(src_hi, vld1q_u8(ref_row + 16), &acc16_hi[k]);
        }

        col += 32;
      } while (col < w);

      src += src_stride;
      ref_pos += ref_stride;
      row++;
    } while (row < batch_end);

    // Widen the batch totals into the 32-bit accumulators.
    for (int k = 0; k < 4; k++) {
      acc32[k] = vpadalq_u16(vpadalq_u16(acc32[k], acc16_lo[k]), acc16_hi[k]);
    }

    batch_end += h_overflow;
  } while (row < h);

  // Reduce each accumulator to a scalar and store all four results at once.
  vst1q_u32(res, horizontal_add_4d_u32x4(acc32));
}
294
// SAD of a 128-wide, h-tall block against four references (res[0..3]).
//
// With 128 columns each row adds at most 4 * 2 * 255 = 2040 to a 16-bit lane
// of the shared kernel, so the lanes are flushed every 32 rows
// (32 * 2040 = 65280 still fits in 16 bits).
static inline void sad128xhx4d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4], int ref_stride,
                                    uint32_t res[4], int h) {
  const int block_width = 128;
  const int rows_per_batch = 32;
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
                       rows_per_batch);
}
300
// SAD of a 64-wide, h-tall block against four references (res[0..3]).
//
// With 64 columns each row adds at most 2 * 2 * 255 = 1020 to a 16-bit lane
// of the shared kernel, so the lanes are flushed every 64 rows
// (64 * 1020 = 65280 still fits in 16 bits).
static inline void sad64xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  const int block_width = 64;
  const int rows_per_batch = 64;
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
                       rows_per_batch);
}
306
// SAD of a 32-wide, h-tall block against four references (res[0..3]).
//
// The low and high 16 bytes of each row are accumulated separately in 16-bit
// lanes (at most 2 * 255 per lane per row) and only widened in the final
// reduction. The row loop tests its counter after the body, so h must be at
// least 1.
static inline void sad32xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  uint16x8_t acc_lo[4];
  uint16x8_t acc_hi[4];
  for (int k = 0; k < 4; k++) {
    acc_lo[k] = vdupq_n_u16(0);
    acc_hi[k] = vdupq_n_u16(0);
  }

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_lo = vld1q_u8(src);
    const uint8x16_t src_hi = vld1q_u8(src + 16);

    for (int k = 0; k < 4; k++) {
      const uint8_t *const ref_row = ref[k] + ref_pos;
      sad16_neon(src_lo, vld1q_u8(ref_row), &acc_lo[k]);
      sad16_neon(src_hi, vld1q_u8(ref_row + 16), &acc_hi[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  // Widen, reduce and store all four results at once.
  vst1q_u32(res, horizontal_long_add_4d_u16x8(acc_lo, acc_hi));
}
336
// SAD of a 16-wide, h-tall block against four references (res[0..3]).
//
// One 16-byte load per row per buffer; the differences are accumulated in
// 16-bit lanes (at most 2 * 255 per lane per row), widened to 32 bits once
// after the last row, then reduced. The row loop tests its counter after the
// body, so h must be at least 1.
static inline void sad16xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  uint16x8_t acc16[4];
  for (int k = 0; k < 4; k++) {
    acc16[k] = vdupq_n_u16(0);
  }

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_row = vld1q_u8(src);

    for (int k = 0; k < 4; k++) {
      sad16_neon(src_row, vld1q_u8(ref[k] + ref_pos), &acc16[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    rows_left--;
  } while (rows_left != 0);

  // Pairwise-widen each accumulator to 32-bit lanes before the reduction.
  uint32x4_t acc32[4];
  for (int k = 0; k < 4; k++) {
    acc32[k] = vpaddlq_u16(acc16[k]);
  }

  vst1q_u32(res, horizontal_add_4d_u32x4(acc32));
}
364
// SAD of an 8-wide, h-tall block against four references (res[0..3]).
//
// The first row initialises the 16-bit accumulators with a widening absolute
// difference; the remaining h - 1 rows are added with widening
// absolute-difference-accumulate. The loop over the remaining rows tests its
// counter after the body, so h must be at least 2.
static inline void sad8xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  uint16x8_t acc[4];

  // Row 0 seeds the accumulators.
  uint8x8_t src_row = vld1_u8(src);
  for (int k = 0; k < 4; k++) {
    acc[k] = vabdl_u8(src_row, vld1_u8(ref[k]));
  }
  src += src_stride;

  // Rows 1 .. h - 1.
  int ref_pos = ref_stride;
  int rows_left = h - 1;
  do {
    src_row = vld1_u8(src);
    for (int k = 0; k < 4; k++) {
      acc[k] = vabal_u8(acc[k], src_row, vld1_u8(ref[k] + ref_pos));
    }

    src += src_stride;
    ref_pos += ref_stride;
  } while (--rows_left != 0);

  vst1q_u32(res, horizontal_add_4d_u16x8(acc));
}
392
// SAD of a 4-wide, h-tall block against four references (res[0..3]).
//
// Rows are processed two at a time: load_unaligned_u8(ptr, stride) presumably
// packs the 4-byte rows at ptr and ptr + stride into one 8-byte vector
// (NOTE(review): confirm against mem_neon.h).
//
// The first row pair seeds the 16-bit accumulators. The remaining h / 2 - 1
// pairs are handled by a loop that tests its counter before the body, so a
// two-row block (as requested by the skip variants for 4x4) does no further
// loads.
static inline void sad4xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  uint16x8_t acc[4];

  // Rows 0 and 1.
  uint8x8_t src_rows = load_unaligned_u8(src, src_stride);
  for (int k = 0; k < 4; k++) {
    acc[k] = vabdl_u8(src_rows, load_unaligned_u8(ref[k], ref_stride));
  }
  src += 2 * src_stride;

  // Remaining row pairs.
  int ref_pos = 2 * ref_stride;
  for (int pairs_left = h / 2 - 1; pairs_left != 0; pairs_left--) {
    src_rows = load_unaligned_u8(src, src_stride);
    for (int k = 0; k < 4; k++) {
      const uint8x8_t ref_rows =
          load_unaligned_u8(ref[k] + ref_pos, ref_stride);
      acc[k] = vabal_u8(acc[k], src_rows, ref_rows);
    }

    src += 2 * src_stride;
    ref_pos += 2 * ref_stride;
  }

  vst1q_u32(res, horizontal_add_4d_u16x8(acc));
}
430
// Defines the public entry point aom_sad<width>x<height>x4d_neon, which
// forwards to the matching fixed-width four-reference helper with the height
// as a constant.
#define SAD_WXH_4D_NEON(width, height)                                       \
  void aom_sad##width##x##height##x4d_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],       \
      int ref_stride, uint32_t res[4]) {                                     \
    sad##width##xhx4d_neon(src, src_stride, ref, ref_stride, res, (height)); \
  }

// 4-wide blocks.
SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)

// 8-wide blocks.
SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)

// 16-wide blocks.
SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)

// 32-wide blocks.
SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)

// 64-wide blocks.
SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)
SAD_WXH_4D_NEON(64, 128)

// 128-wide blocks.
SAD_WXH_4D_NEON(128, 64)
SAD_WXH_4D_NEON(128, 128)

// 1:4 and 4:1 block shapes, compiled out of realtime-only builds.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_4D_NEON(4, 16)
SAD_WXH_4D_NEON(8, 32)
SAD_WXH_4D_NEON(16, 4)
SAD_WXH_4D_NEON(16, 64)
SAD_WXH_4D_NEON(32, 8)
SAD_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_4D_NEON
470
// Defines the public entry point aom_sad_skip_<width>x<height>x4d_neon.
// The "skip" variant samples every other row: both strides are doubled and
// only height / 2 rows are visited, then each of the four results is doubled
// to approximate the SAD of the full block.
#define SAD_SKIP_WXH_4D_NEON(width, height)                                  \
  void aom_sad_skip_##width##x##height##x4d_neon(                            \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],       \
      int ref_stride, uint32_t res[4]) {                                     \
    sad##width##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res,    \
                           ((height) >> 1));                                 \
    for (int k = 0; k < 4; k++) {                                            \
      res[k] <<= 1;                                                          \
    }                                                                        \
  }

// 4-wide blocks.
SAD_SKIP_WXH_4D_NEON(4, 4)
SAD_SKIP_WXH_4D_NEON(4, 8)

// 8-wide blocks.
SAD_SKIP_WXH_4D_NEON(8, 4)
SAD_SKIP_WXH_4D_NEON(8, 8)
SAD_SKIP_WXH_4D_NEON(8, 16)

// 16-wide blocks.
SAD_SKIP_WXH_4D_NEON(16, 8)
SAD_SKIP_WXH_4D_NEON(16, 16)
SAD_SKIP_WXH_4D_NEON(16, 32)

// 32-wide blocks.
SAD_SKIP_WXH_4D_NEON(32, 16)
SAD_SKIP_WXH_4D_NEON(32, 32)
SAD_SKIP_WXH_4D_NEON(32, 64)

// 64-wide blocks.
SAD_SKIP_WXH_4D_NEON(64, 32)
SAD_SKIP_WXH_4D_NEON(64, 64)
SAD_SKIP_WXH_4D_NEON(64, 128)

// 128-wide blocks.
SAD_SKIP_WXH_4D_NEON(128, 64)
SAD_SKIP_WXH_4D_NEON(128, 128)

// 1:4 and 4:1 block shapes, compiled out of realtime-only builds.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_4D_NEON(4, 16)
SAD_SKIP_WXH_4D_NEON(8, 32)
SAD_SKIP_WXH_4D_NEON(16, 4)
SAD_SKIP_WXH_4D_NEON(16, 64)
SAD_SKIP_WXH_4D_NEON(32, 8)
SAD_SKIP_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_4D_NEON
515