/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"
#include "vpx_dsp/arm/mem_neon.h"

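// All of the bilinear filter helpers below compute a two-tap filter with
// taps (8 - filter_offset) and filter_offset, followed by a rounding shift:
//   out = (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
// e.g. filter_offset == 3 gives out = (5 * s0 + 3 * s1 + 4) >> 3.
// The multiply-accumulate widens to 16 bits, so it cannot overflow: the
// worst case is 255 * 8 = 2040. pixel_step selects the filter direction:
// 1 for horizontal (blend with the pixel to the right) and src_stride for
// vertical (blend with the pixel in the row below).
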
// Process a block exactly 4 wide and a multiple of 2 high.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    vst1_u8(dst_ptr, blend_u8);

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    i -= 2;
  } while (i != 0);
}

// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = vld1_u8(src_ptr);
    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    vst1_u8(dst_ptr, blend_u8);

    src_ptr += src_stride;
    dst_ptr += 8;
  } while (--i != 0);
}

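// The "large" helper below consumes 16 pixels per inner-loop iteration.
// vmull_u8/vmlal_u8 only operate on 8-lane vectors, so each uint8x16_t is
// split into low and high halves, filtered separately, and recombined for
// a single 16-byte store.
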
// Process a block which is a multiple of 16 wide and any height.
static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
                                         uint8_t *dst_ptr, int src_stride,
                                         int pixel_step, int dst_width,
                                         int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint16x8_t blend_l =
          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
      uint16x8_t blend_h =
          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
      uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
      uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
      vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));

      j += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *dst_ptr, int src_stride,
                                       int pixel_step, int dst_height,
                                       int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
                               dst_height, filter_offset);
}
static void var_filter_block2d_bil_w32(const uint8_t *src_ptr,
                                       uint8_t *dst_ptr, int src_stride,
                                       int pixel_step, int dst_height,
                                       int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
                               dst_height, filter_offset);
}
static void var_filter_block2d_bil_w64(const uint8_t *src_ptr,
                                       uint8_t *dst_ptr, int src_stride,
                                       int pixel_step, int dst_height,
                                       int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
                               dst_height, filter_offset);
}

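// When filter_offset == 4 the two bilinear taps are equal, so the filter
// collapses to a rounding average: (4 * s0 + 4 * s1 + 4) >> 3 is exactly
// (s0 + s1 + 1) >> 1, i.e. vrhaddq_u8. This helper implements that
// half-pel fast path for blocks that are a multiple of 16 wide.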
static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                   int src_stride, int pixel_step,
                                   int dst_width, int dst_height) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint8x16_t avg = vrhaddq_u8(s0, s1);
      vst1q_u8(dst_ptr + j, avg);

      j += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

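// Generic case: apply the separable bilinear filter in two passes. The
// horizontal pass (pixel_step = 1) writes h + padding rows into tmp0; the
// vertical pass then filters tmp0 (stride w, hence pixel_step = w) into
// tmp1 before computing the variance against ref.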
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                   \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {               \
    uint8_t tmp0[w * (h + padding)];                                     \
    uint8_t tmp1[w * h];                                                 \
    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                xoffset);                                \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
  }

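// For large blocks it pays to branch on the filter offsets: offset 0 needs
// no filtering in that direction and offset 4 reduces to the rounding
// average above, so only the final else branches pay for the full bilinear
// filter. The padded scratch buffer is only needed when a vertical pass
// consumes extra rows.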
#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
    if (xoffset == 0) {                                                      \
      if (yoffset == 0) {                                                    \
        return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      } else {                                                               \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
                                    yoffset);                                \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      }                                                                      \
    } else if (xoffset == 4) {                                               \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    } else {                                                                 \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    }                                                                        \
  }

// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)

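// The avg_pred_* helpers below fuse the rounding average against
// second_pred (the vpx_comp_avg_pred operation) into the filter's final
// store, avoiding a separate pass over the block.
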
// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);

    uint8x8_t p = vld1_u8(second_pred);
    uint8x8_t avg = vrhadd_u8(blend_u8, p);

    vst1_u8(dst_ptr, avg);

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    second_pred += 2 * 4;
    i -= 2;
  } while (i != 0);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = vld1_u8(src_ptr);
    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);

    uint8x8_t p = vld1_u8(second_pred);
    uint8x8_t avg = vrhadd_u8(blend_u8, p);

    vst1_u8(dst_ptr, avg);

    src_ptr += src_stride;
    dst_ptr += 8;
    second_pred += 8;
  } while (--i > 0);
}

// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
static void avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint16x8_t blend_l =
          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
      uint16x8_t blend_h =
          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));

      uint8x16_t p = vld1q_u8(second_pred);
      uint8x16_t avg = vrhaddq_u8(blend_u8, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
static void avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 16, dst_height,
                                        filter_offset, second_pred);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
static void avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 32, dst_height,
                                        filter_offset, second_pred);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
static void avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 64, dst_height,
                                        filter_offset, second_pred);
}

// Combine averaging subpel filter with vpx_comp_avg_pred.
static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
                                            uint8_t *dst_ptr, int src_stride,
                                            int pixel_step, int dst_width,
                                            int dst_height,
                                            const uint8_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint8x16_t avg = vrhaddq_u8(s0, s1);

      uint8x16_t p = vld1q_u8(second_pred);
      avg = vrhaddq_u8(avg, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
                     int dst_width, int dst_height,
                     const uint8_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint8x16_t s = vld1q_u8(src_ptr + j);
      uint8x16_t p = vld1q_u8(second_pred);

      uint8x16_t avg = vrhaddq_u8(s, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

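// Like SUBPEL_VARIANCE_WXH_NEON above, but the second (vertical) pass uses
// the fused helpers so that averaging with second_pred adds no extra pass.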
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                         \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                  \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
      const uint8_t *second_pred) {                                         \
    uint8_t tmp0[w * (h + padding)];                                        \
    uint8_t tmp1[w * h];                                                    \
    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
                                xoffset);                                   \
    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,      \
                                         second_pred);                      \
    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
  }

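// As with SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON, offsets 0 and 4 take the
// copy/average fast paths; every branch folds the average against
// second_pred into its last filtering pass before the variance is computed.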
#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                     \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
      const uint8_t *second_pred) {                                            \
    if (xoffset == 0) {                                                        \
      uint8_t tmp[w * h];                                                      \
      if (yoffset == 0) {                                                      \
        avg_pred(src, tmp, source_stride, w, h, second_pred);                  \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
      } else if (yoffset == 4) {                                               \
        avg_pred_var_filter_block2d_avg(src, tmp, source_stride,               \
                                        source_stride, w, h, second_pred);     \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
      } else {                                                                 \
        avg_pred_var_filter_block2d_bil_w##w(                                  \
            src, tmp, source_stride, source_stride, h, yoffset, second_pred);  \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint8_t tmp0[w * (h + padding)];                                         \
      if (yoffset == 0) {                                                      \
        avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,     \
                                        second_pred);                          \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp1[w * (h + padding)];                                       \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      } else {                                                                 \
        uint8_t tmp1[w * (h + padding)];                                       \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
                                             second_pred);                     \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      }                                                                        \
    } else {                                                                   \
      uint8_t tmp0[w * (h + padding)];                                         \
      if (yoffset == 0) {                                                      \
        avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,   \
                                             xoffset, second_pred);            \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
                                    (h + padding), xoffset);                   \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      } else {                                                                 \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
                                    (h + padding), xoffset);                   \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
                                             second_pred);                     \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      }                                                                        \
    }                                                                          \
  }

// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)