/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"
#include "vpx_dsp/arm/mem_neon.h"
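// The helpers below apply a first-order (bilinear) filter along one
// direction at a time. 'pixel_step' selects that direction: 1 blends each
// pixel with its horizontal neighbour, while a step of one source row blends
// it with its vertical neighbour. Each output pixel is
//   out = (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
// which the NEON code computes with vmull_u8/vmlal_u8 followed by a rounding
// narrowing shift (vrshrn_n_u16) by 3.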
// Process a block exactly 4 wide and a multiple of 2 high.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    vst1_u8(dst_ptr, blend_u8);

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    i -= 2;
  } while (i != 0);
}

// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = vld1_u8(src_ptr);
    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    vst1_u8(dst_ptr, blend_u8);

    src_ptr += src_stride;
    dst_ptr += 8;
  } while (--i != 0);
}

// Process a block which is a multiple of 16 wide and any height.
static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
                                         uint8_t *dst_ptr, int src_stride,
                                         int pixel_step, int dst_width,
                                         int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint16x8_t blend_l =
          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
      uint16x8_t blend_h =
          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
      uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
      uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
      vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));

      j += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *dst_ptr, int src_stride,
                                       int pixel_step, int dst_height,
                                       int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
                               dst_height, filter_offset);
}
static void var_filter_block2d_bil_w32(const uint8_t *src_ptr,
                                       uint8_t *dst_ptr, int src_stride,
                                       int pixel_step, int dst_height,
                                       int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
                               dst_height, filter_offset);
}
static void var_filter_block2d_bil_w64(const uint8_t *src_ptr,
                                       uint8_t *dst_ptr, int src_stride,
                                       int pixel_step, int dst_height,
                                       int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
                               dst_height, filter_offset);
}

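// For an offset of 4 (half-pel) both filter taps are 4, so the bilinear
// blend above reduces to a rounded average:
//   (s0 * 4 + s1 * 4 + 4) >> 3 == (s0 + s1 + 1) >> 1
// which is exactly what vrhaddq_u8 computes. The function below exploits
// this for the wide (multiple-of-16) block sizes.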
static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                   int src_stride, int pixel_step,
                                   int dst_width, int dst_height) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint8x16_t avg = vrhaddq_u8(s0, s1);
      vst1q_u8(dst_ptr + j, avg);

      j += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                              \
  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                   \
    uint8_t tmp0[w * (h + padding)];                                         \
    uint8_t tmp1[w * h];                                                     \
    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),     \
                                xoffset);                                    \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);               \
    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);             \
  }
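// As an illustration, SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) defines
// vpx_sub_pixel_variance8x8_neon: the first pass filters horizontally
// (pixel_step 1) into a 9-row temporary, the second pass filters that
// temporary vertically (pixel_step w), and the result is handed to
// vpx_variance8x8 against the reference block.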

#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
    if (xoffset == 0) {                                                      \
      if (yoffset == 0) {                                                    \
        return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      } else {                                                               \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
                                    yoffset);                                \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      }                                                                      \
    } else if (xoffset == 4) {                                               \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    } else {                                                                 \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    }                                                                        \
  }
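// The sub-pel offsets are in 1/8-pel units, so an offset of 0 means the
// position is pel-aligned in that direction (no filtering needed) and an
// offset of 4 is exactly half-pel (a rounded average suffices). The
// specialized macro above dispatches on those cases and only falls back to
// the general bilinear filter for the remaining offsets.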

// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)

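// The *_avg_* helpers below fuse the second filtering pass with
// vpx_comp_avg_pred: the bilinearly filtered (or averaged) result is
// rounded-averaged with 'second_pred' before being written out, saving a
// separate pass over the block in the compound-prediction variance paths.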
// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);

    uint8x8_t p = vld1_u8(second_pred);
    uint8x8_t avg = vrhadd_u8(blend_u8, p);

    vst1_u8(dst_ptr, avg);

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    second_pred += 2 * 4;
    i -= 2;
  } while (i != 0);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = vld1_u8(src_ptr);
    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);

    uint8x8_t p = vld1_u8(second_pred);
    uint8x8_t avg = vrhadd_u8(blend_u8, p);

    vst1_u8(dst_ptr, avg);

    src_ptr += src_stride;
    dst_ptr += 8;
    second_pred += 8;
  } while (--i > 0);
}

// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
static void avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint16x8_t blend_l =
          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
      uint16x8_t blend_h =
          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));

      uint8x16_t p = vld1q_u8(second_pred);
      uint8x16_t avg = vrhaddq_u8(blend_u8, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
static void avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 16, dst_height,
                                        filter_offset, second_pred);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
static void avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 32, dst_height,
                                        filter_offset, second_pred);
}

// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
static void avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 64, dst_height,
                                        filter_offset, second_pred);
}

// Combine averaging subpel filter with vpx_comp_avg_pred.
static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
                                            uint8_t *dst_ptr, int src_stride,
                                            int pixel_step, int dst_width,
                                            int dst_height,
                                            const uint8_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint8x16_t avg = vrhaddq_u8(s0, s1);

      uint8x16_t p = vld1q_u8(second_pred);
      avg = vrhaddq_u8(avg, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
                     int dst_width, int dst_height,
                     const uint8_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint8x16_t s = vld1q_u8(src_ptr + j);
      uint8x16_t p = vld1q_u8(second_pred);

      uint8x16_t avg = vrhaddq_u8(s, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                          \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                   \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,       \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint8_t tmp0[w * (h + padding)];                                         \
    uint8_t tmp1[w * h];                                                     \
    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding),  \
                                xoffset);                                    \
    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,       \
                                         second_pred);                       \
    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);             \
  }
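// This mirrors SUBPEL_VARIANCE_WXH_NEON above, except that the second
// (vertical) filtering pass also averages with second_pred, so the variance
// is measured against the compound prediction rather than the filtered
// source alone.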

#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)               \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                    \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,        \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                  \
      const uint8_t *second_pred) {                                           \
    if (xoffset == 0) {                                                       \
      uint8_t tmp[w * h];                                                     \
      if (yoffset == 0) {                                                     \
        avg_pred(src, tmp, source_stride, w, h, second_pred);                 \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                              \
        avg_pred_var_filter_block2d_avg(src, tmp, source_stride,              \
                                        source_stride, w, h, second_pred);    \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else {                                                                \
        avg_pred_var_filter_block2d_bil_w##w(                                 \
            src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,    \
                                        second_pred);                         \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * (h + padding)];                                      \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w,                \
                               (h + padding));                                \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * (h + padding)];                                      \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w,                \
                               (h + padding));                                \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,    \
                                             second_pred);                    \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    } else {                                                                  \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,  \
                                             xoffset, second_pred);           \
        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,    \
                                             second_pred);                    \
        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    }                                                                         \
  }

// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)