xref: /aosp_15_r20/external/libaom/aom_dsp/arm/sadxd_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "aom/aom_integer.h"
18 #include "aom_dsp/arm/mem_neon.h"
19 #include "aom_dsp/arm/sum_neon.h"
20 
// Adds the absolute differences of the 16 bytes in src and ref to *sad_sum.
// Neighbouring byte differences are summed in pairs and widened, so each of
// the eight 16-bit lanes of *sad_sum grows by at most 2 * 255 per call.
static inline void sad16_neon(uint8x16_t src, uint8x16_t ref,
                              uint16x8_t *const sad_sum) {
  *sad_sum = vpadalq_u8(*sad_sum, vabdq_u8(src, ref));
}
26 
sadwxhx3d_large_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[3],int ref_stride,uint32_t res[3],int w,int h,int h_overflow)27 static inline void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
28                                         const uint8_t *const ref[3],
29                                         int ref_stride, uint32_t res[3], int w,
30                                         int h, int h_overflow) {
31   uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
32   int h_limit = h > h_overflow ? h_overflow : h;
33 
34   int ref_offset = 0;
35   int i = 0;
36   do {
37     uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
38     uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
39 
40     do {
41       int j = 0;
42       do {
43         const uint8x16_t s0 = vld1q_u8(src + j);
44         sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
45         sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
46         sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
47 
48         const uint8x16_t s1 = vld1q_u8(src + j + 16);
49         sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
50         sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
51         sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
52 
53         j += 32;
54       } while (j < w);
55 
56       src += src_stride;
57       ref_offset += ref_stride;
58     } while (++i < h_limit);
59 
60     sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
61     sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
62     sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
63     sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
64     sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
65     sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
66 
67     h_limit += h_overflow;
68   } while (i < h);
69 
70   res[0] = horizontal_add_u32x4(sum[0]);
71   res[1] = horizontal_add_u32x4(sum[1]);
72   res[2] = horizontal_add_u32x4(sum[2]);
73 }
74 
// SAD of a 128-wide, h-tall block against three references.
// A 128-byte row adds up to 4 * 510 = 2040 to each 16-bit accumulator lane, so
// the 16-bit partial sums are widened every 32 rows (32 * 2040 = 65280).
static inline void sad128xhx3d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[3], int ref_stride,
                                    uint32_t res[3], int h) {
  const int width = 128;
  const int rows_per_batch = 32;
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, width, h,
                       rows_per_batch);
}
80 
// SAD of a 64-wide, h-tall block against three references.
// A 64-byte row adds up to 2 * 510 = 1020 to each 16-bit accumulator lane, so
// the 16-bit partial sums are widened every 64 rows (64 * 1020 = 65280).
static inline void sad64xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  const int width = 64;
  const int rows_per_batch = 64;
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, width, h,
                       rows_per_batch);
}
86 
sad32xhx3d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[3],int ref_stride,uint32_t res[3],int h)87 static inline void sad32xhx3d_neon(const uint8_t *src, int src_stride,
88                                    const uint8_t *const ref[3], int ref_stride,
89                                    uint32_t res[3], int h) {
90   uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
91   uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
92 
93   int ref_offset = 0;
94   int i = h;
95   do {
96     const uint8x16_t s0 = vld1q_u8(src);
97     sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
98     sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
99     sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
100 
101     const uint8x16_t s1 = vld1q_u8(src + 16);
102     sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
103     sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
104     sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
105 
106     src += src_stride;
107     ref_offset += ref_stride;
108   } while (--i != 0);
109 
110   res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
111   res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
112   res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
113 }
114 
sad16xhx3d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[3],int ref_stride,uint32_t res[3],int h)115 static inline void sad16xhx3d_neon(const uint8_t *src, int src_stride,
116                                    const uint8_t *const ref[3], int ref_stride,
117                                    uint32_t res[3], int h) {
118   uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
119 
120   int ref_offset = 0;
121   int i = h;
122   do {
123     const uint8x16_t s = vld1q_u8(src);
124     sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
125     sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
126     sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
127 
128     src += src_stride;
129     ref_offset += ref_stride;
130   } while (--i != 0);
131 
132   res[0] = horizontal_add_u16x8(sum[0]);
133   res[1] = horizontal_add_u16x8(sum[1]);
134   res[2] = horizontal_add_u16x8(sum[2]);
135 }
136 
sad8xhx3d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[3],int ref_stride,uint32_t res[3],int h)137 static inline void sad8xhx3d_neon(const uint8_t *src, int src_stride,
138                                   const uint8_t *const ref[3], int ref_stride,
139                                   uint32_t res[3], int h) {
140   uint16x8_t sum[3];
141 
142   uint8x8_t s = vld1_u8(src);
143   sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
144   sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
145   sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
146 
147   src += src_stride;
148   int ref_offset = ref_stride;
149   int i = h - 1;
150   do {
151     s = vld1_u8(src);
152     sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
153     sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
154     sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
155 
156     src += src_stride;
157     ref_offset += ref_stride;
158   } while (--i != 0);
159 
160   res[0] = horizontal_add_u16x8(sum[0]);
161   res[1] = horizontal_add_u16x8(sum[1]);
162   res[2] = horizontal_add_u16x8(sum[2]);
163 }
164 
sad4xhx3d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[3],int ref_stride,uint32_t res[3],int h)165 static inline void sad4xhx3d_neon(const uint8_t *src, int src_stride,
166                                   const uint8_t *const ref[3], int ref_stride,
167                                   uint32_t res[3], int h) {
168   assert(h % 2 == 0);
169   uint16x8_t sum[3];
170 
171   uint8x8_t s = load_unaligned_u8(src, src_stride);
172   uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
173   uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
174   uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
175 
176   sum[0] = vabdl_u8(s, r0);
177   sum[1] = vabdl_u8(s, r1);
178   sum[2] = vabdl_u8(s, r2);
179 
180   src += 2 * src_stride;
181   int ref_offset = 2 * ref_stride;
182   int i = (h / 2) - 1;
183   do {
184     s = load_unaligned_u8(src, src_stride);
185     r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
186     r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
187     r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
188 
189     sum[0] = vabal_u8(sum[0], s, r0);
190     sum[1] = vabal_u8(sum[1], s, r1);
191     sum[2] = vabal_u8(sum[2], s, r2);
192 
193     src += 2 * src_stride;
194     ref_offset += 2 * ref_stride;
195   } while (--i != 0);
196 
197   res[0] = horizontal_add_u16x8(sum[0]);
198   res[1] = horizontal_add_u16x8(sum[1]);
199   res[2] = horizontal_add_u16x8(sum[2]);
200 }
201 
// Defines the public aom_sad<bw>x<bh>x3d_neon() entry point, which forwards to
// the width-specific sad<bw>xhx3d_neon() helper with the block height bh.
// The ref and res parameters are declared with four elements, but the helpers
// only read ref[0..2] and write res[0..2].
#define SAD_WXH_3D_NEON(bw, bh)                                        \
  void aom_sad##bw##x##bh##x3d_neon(                                   \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4], \
      int ref_stride, uint32_t res[4]) {                               \
    sad##bw##xhx3d_neon(src, src_stride, ref, ref_stride, res, (bh));  \
  }

SAD_WXH_3D_NEON(4, 4)
SAD_WXH_3D_NEON(4, 8)

SAD_WXH_3D_NEON(8, 4)
SAD_WXH_3D_NEON(8, 8)
SAD_WXH_3D_NEON(8, 16)

SAD_WXH_3D_NEON(16, 8)
SAD_WXH_3D_NEON(16, 16)
SAD_WXH_3D_NEON(16, 32)

SAD_WXH_3D_NEON(32, 16)
SAD_WXH_3D_NEON(32, 32)
SAD_WXH_3D_NEON(32, 64)

SAD_WXH_3D_NEON(64, 32)
SAD_WXH_3D_NEON(64, 64)
SAD_WXH_3D_NEON(64, 128)

SAD_WXH_3D_NEON(128, 64)
SAD_WXH_3D_NEON(128, 128)

// Block sizes that are only built when the encoder is not realtime-only.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_3D_NEON(4, 16)
SAD_WXH_3D_NEON(8, 32)
SAD_WXH_3D_NEON(16, 4)
SAD_WXH_3D_NEON(16, 64)
SAD_WXH_3D_NEON(32, 8)
SAD_WXH_3D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_3D_NEON
241 
sadwxhx4d_large_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int w,int h,int h_overflow)242 static inline void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
243                                         const uint8_t *const ref[4],
244                                         int ref_stride, uint32_t res[4], int w,
245                                         int h, int h_overflow) {
246   uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
247                         vdupq_n_u32(0) };
248   int h_limit = h > h_overflow ? h_overflow : h;
249 
250   int ref_offset = 0;
251   int i = 0;
252   do {
253     uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
254                              vdupq_n_u16(0) };
255     uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
256                              vdupq_n_u16(0) };
257 
258     do {
259       int j = 0;
260       do {
261         const uint8x16_t s0 = vld1q_u8(src + j);
262         sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
263         sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
264         sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
265         sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
266 
267         const uint8x16_t s1 = vld1q_u8(src + j + 16);
268         sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
269         sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
270         sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
271         sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
272 
273         j += 32;
274       } while (j < w);
275 
276       src += src_stride;
277       ref_offset += ref_stride;
278     } while (++i < h_limit);
279 
280     sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
281     sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
282     sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
283     sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
284     sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
285     sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
286     sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
287     sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
288 
289     h_limit += h_overflow;
290   } while (i < h);
291 
292   vst1q_u32(res, horizontal_add_4d_u32x4(sum));
293 }
294 
// SAD of a 128-wide, h-tall block against four references.
// A 128-byte row adds up to 4 * 510 = 2040 to each 16-bit accumulator lane, so
// the 16-bit partial sums are widened every 32 rows (32 * 2040 = 65280).
static inline void sad128xhx4d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4], int ref_stride,
                                    uint32_t res[4], int h) {
  const int width = 128;
  const int rows_per_batch = 32;
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, width, h,
                       rows_per_batch);
}
300 
// SAD of a 64-wide, h-tall block against four references.
// A 64-byte row adds up to 2 * 510 = 1020 to each 16-bit accumulator lane, so
// the 16-bit partial sums are widened every 64 rows (64 * 1020 = 65280).
static inline void sad64xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  const int width = 64;
  const int rows_per_batch = 64;
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, width, h,
                       rows_per_batch);
}
306 
sad32xhx4d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int h)307 static inline void sad32xhx4d_neon(const uint8_t *src, int src_stride,
308                                    const uint8_t *const ref[4], int ref_stride,
309                                    uint32_t res[4], int h) {
310   uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
311                            vdupq_n_u16(0) };
312   uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
313                            vdupq_n_u16(0) };
314 
315   int ref_offset = 0;
316   int i = h;
317   do {
318     const uint8x16_t s0 = vld1q_u8(src);
319     sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
320     sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
321     sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
322     sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]);
323 
324     const uint8x16_t s1 = vld1q_u8(src + 16);
325     sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
326     sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
327     sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
328     sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]);
329 
330     src += src_stride;
331     ref_offset += ref_stride;
332   } while (--i != 0);
333 
334   vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
335 }
336 
sad16xhx4d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int h)337 static inline void sad16xhx4d_neon(const uint8_t *src, int src_stride,
338                                    const uint8_t *const ref[4], int ref_stride,
339                                    uint32_t res[4], int h) {
340   uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
341                             vdupq_n_u16(0) };
342   uint32x4_t sum_u32[4];
343 
344   int ref_offset = 0;
345   int i = h;
346   do {
347     const uint8x16_t s = vld1q_u8(src);
348     sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]);
349     sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]);
350     sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]);
351     sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]);
352 
353     src += src_stride;
354     ref_offset += ref_stride;
355   } while (--i != 0);
356 
357   sum_u32[0] = vpaddlq_u16(sum_u16[0]);
358   sum_u32[1] = vpaddlq_u16(sum_u16[1]);
359   sum_u32[2] = vpaddlq_u16(sum_u16[2]);
360   sum_u32[3] = vpaddlq_u16(sum_u16[3]);
361 
362   vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
363 }
364 
sad8xhx4d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int h)365 static inline void sad8xhx4d_neon(const uint8_t *src, int src_stride,
366                                   const uint8_t *const ref[4], int ref_stride,
367                                   uint32_t res[4], int h) {
368   uint16x8_t sum[4];
369 
370   uint8x8_t s = vld1_u8(src);
371   sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
372   sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
373   sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
374   sum[3] = vabdl_u8(s, vld1_u8(ref[3]));
375 
376   src += src_stride;
377   int ref_offset = ref_stride;
378   int i = h - 1;
379   do {
380     s = vld1_u8(src);
381     sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
382     sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
383     sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
384     sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));
385 
386     src += src_stride;
387     ref_offset += ref_stride;
388   } while (--i != 0);
389 
390   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
391 }
392 
sad4xhx4d_neon(const uint8_t * src,int src_stride,const uint8_t * const ref[4],int ref_stride,uint32_t res[4],int h)393 static inline void sad4xhx4d_neon(const uint8_t *src, int src_stride,
394                                   const uint8_t *const ref[4], int ref_stride,
395                                   uint32_t res[4], int h) {
396   uint16x8_t sum[4];
397 
398   uint8x8_t s = load_unaligned_u8(src, src_stride);
399   uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
400   uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
401   uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
402   uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride);
403 
404   sum[0] = vabdl_u8(s, r0);
405   sum[1] = vabdl_u8(s, r1);
406   sum[2] = vabdl_u8(s, r2);
407   sum[3] = vabdl_u8(s, r3);
408 
409   src += 2 * src_stride;
410   int ref_offset = 2 * ref_stride;
411   int i = h / 2;
412   while (--i != 0) {
413     s = load_unaligned_u8(src, src_stride);
414     r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
415     r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
416     r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
417     r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
418 
419     sum[0] = vabal_u8(sum[0], s, r0);
420     sum[1] = vabal_u8(sum[1], s, r1);
421     sum[2] = vabal_u8(sum[2], s, r2);
422     sum[3] = vabal_u8(sum[3], s, r3);
423 
424     src += 2 * src_stride;
425     ref_offset += 2 * ref_stride;
426   }
427 
428   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
429 }
430 
// Defines the public aom_sad<bw>x<bh>x4d_neon() entry point, which forwards to
// the width-specific sad<bw>xhx4d_neon() helper with the block height bh.
#define SAD_WXH_4D_NEON(bw, bh)                                        \
  void aom_sad##bw##x##bh##x4d_neon(                                   \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4], \
      int ref_stride, uint32_t res[4]) {                               \
    sad##bw##xhx4d_neon(src, src_stride, ref, ref_stride, res, (bh));  \
  }

SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)

SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)

SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)

SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)

SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)
SAD_WXH_4D_NEON(64, 128)

SAD_WXH_4D_NEON(128, 64)
SAD_WXH_4D_NEON(128, 128)

// Block sizes that are only built when the encoder is not realtime-only.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_4D_NEON(4, 16)
SAD_WXH_4D_NEON(8, 32)
SAD_WXH_4D_NEON(16, 4)
SAD_WXH_4D_NEON(16, 64)
SAD_WXH_4D_NEON(32, 8)
SAD_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_4D_NEON
470 
// Defines aom_sad_skip_<bw>x<bh>x4d_neon(): a SAD over every other row. The
// strides are doubled and the height halved when calling the regular
// sad<bw>xhx4d_neon() helper, and each of the four results is then doubled to
// stand in for the rows that were skipped.
#define SAD_SKIP_WXH_4D_NEON(bw, bh)                                     \
  void aom_sad_skip_##bw##x##bh##x4d_neon(                               \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],   \
      int ref_stride, uint32_t res[4]) {                                 \
    sad##bw##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res,   \
                        ((bh) >> 1));                                    \
    for (int k = 0; k < 4; ++k) {                                        \
      res[k] <<= 1;                                                      \
    }                                                                    \
  }

SAD_SKIP_WXH_4D_NEON(4, 4)
SAD_SKIP_WXH_4D_NEON(4, 8)

SAD_SKIP_WXH_4D_NEON(8, 4)
SAD_SKIP_WXH_4D_NEON(8, 8)
SAD_SKIP_WXH_4D_NEON(8, 16)

SAD_SKIP_WXH_4D_NEON(16, 8)
SAD_SKIP_WXH_4D_NEON(16, 16)
SAD_SKIP_WXH_4D_NEON(16, 32)

SAD_SKIP_WXH_4D_NEON(32, 16)
SAD_SKIP_WXH_4D_NEON(32, 32)
SAD_SKIP_WXH_4D_NEON(32, 64)

SAD_SKIP_WXH_4D_NEON(64, 32)
SAD_SKIP_WXH_4D_NEON(64, 64)
SAD_SKIP_WXH_4D_NEON(64, 128)

SAD_SKIP_WXH_4D_NEON(128, 64)
SAD_SKIP_WXH_4D_NEON(128, 128)

// Block sizes that are only built when the encoder is not realtime-only.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_4D_NEON(4, 16)
SAD_SKIP_WXH_4D_NEON(8, 32)
SAD_SKIP_WXH_4D_NEON(16, 4)
SAD_SKIP_WXH_4D_NEON(16, 64)
SAD_SKIP_WXH_4D_NEON(32, 8)
SAD_SKIP_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_4D_NEON
515