1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <assert.h>
12 #include <stdlib.h>
13
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16
17 #include "aom/aom_integer.h"
18 #include "aom_ports/mem.h"
19
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_dsp/blend.h"
22 #include "aom_dsp/variance.h"
23
24 #include "av1/common/filter.h"
25 #include "av1/common/reconinter.h"
26
27 #if !CONFIG_REALTIME_ONLY
// Returns the sum of squares of the 256 int16_t residual values in 'a'
// (one 16x16 macroblock), truncated to 32 bits.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int ss = 0;
  for (unsigned int k = 0; k < 256; ++k) ss += a[k] * a[k];
  return ss;
}
37 #endif // !CONFIG_REALTIME_ONLY
38
// Computes the sum of pixel differences (*sum) and the sum of squared
// differences (*sse) between the w x h block at 'a' and the co-located
// block at 'b'.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int acc_sum = 0;
  uint32_t acc_sse = 0;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int d = a[col] - b[col];
      acc_sum += d;
      acc_sse += (uint32_t)(d * d);  // d*d <= 255^2, no overflow in int
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = acc_sum;
  *sse = acc_sse;
}
57
// Returns only the SSE between two blocks of arbitrary (including odd)
// dimensions; the block sum is computed by the helper but discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse = 0;
  int unused_sum = 0;
  variance(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  (void)unused_sum;
  return sse;
}
65
66 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
67 // or vertical direction to produce the filtered output block. Used to implement
68 // the first-pass of 2-D separable filter.
69 //
70 // Produces int16_t output to retain precision for the next pass. Two filter
71 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
72 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
73 // It defines the offset required to move from one input to the next.
var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)74 static void var_filter_block2d_bil_first_pass_c(
75 const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
76 unsigned int pixel_step, unsigned int output_height,
77 unsigned int output_width, const uint8_t *filter) {
78 unsigned int i, j;
79
80 for (i = 0; i < output_height; ++i) {
81 for (j = 0; j < output_width; ++j) {
82 b[j] = ROUND_POWER_OF_TWO(
83 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
84
85 ++a;
86 }
87
88 a += src_pixels_per_line - output_width;
89 b += output_width;
90 }
91 }
92
93 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
94 // or vertical direction to produce the filtered output block. Used to implement
95 // the second-pass of 2-D separable filter.
96 //
97 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
98 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
99 // filter is applied horizontally (pixel_step = 1) or vertically
100 // (pixel_step = stride). It defines the offset required to move from one input
101 // to the next. Output is 8-bit.
var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)102 static void var_filter_block2d_bil_second_pass_c(
103 const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
104 unsigned int pixel_step, unsigned int output_height,
105 unsigned int output_width, const uint8_t *filter) {
106 unsigned int i, j;
107
108 for (i = 0; i < output_height; ++i) {
109 for (j = 0; j < output_width; ++j) {
110 b[j] = ROUND_POWER_OF_TWO(
111 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
112 ++a;
113 }
114
115 a += src_pixels_per_line - output_width;
116 b += output_width;
117 }
118 }
119
// Generates aom_varianceWxH_c(): writes the block SSE through *sse and
// returns the variance, i.e. sse - sum^2 / (W * H).
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
128
// Generates aom_sub_pixel_varianceWxH_c(): interpolates the source block at
// sub-pixel position (xoffset, yoffset) with the two separable bilinear
// passes above, then measures the variance against the reference block 'b'.
#define SUBPIX_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
      const uint8_t *b, int b_stride, uint32_t *sse) {                    \
    uint16_t fdata3[(H + 1) * W];                                         \
    uint8_t temp2[H * W];                                                 \
                                                                          \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                         bilinear_filters_2t[yoffset]);   \
                                                                          \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);         \
  }
143
// Generates aom_sub_pixel_avg_varianceWxH_c() and
// aom_dist_wtd_sub_pixel_avg_varianceWxH_c(): like SUBPIX_VAR, but the
// filtered block is first combined with second_pred (plain average or
// distance-weighted average) before the variance is measured.
// NOTE(review): the dist_wtd variant returns via aom_varianceWxH (no _c
// suffix, i.e. the dispatched symbol) unlike the _c call above —
// presumably intentional reuse of the optimized kernel; confirm.
#define SUBPIX_AVG_VAR(W, H)                                                   \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                            \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
      const uint8_t *b, int b_stride, uint32_t *sse,                           \
      const uint8_t *second_pred) {                                            \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint8_t temp2[H * W];                                                      \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
                                                                               \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,      \
                                        bilinear_filters_2t[xoffset]);         \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,            \
                                         bilinear_filters_2t[yoffset]);        \
                                                                               \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                     \
                                                                               \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
  }                                                                            \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                   \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
      const uint8_t *b, int b_stride, uint32_t *sse,                           \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint8_t temp2[H * W];                                                      \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
                                                                               \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,      \
                                        bilinear_filters_2t[xoffset]);         \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,            \
                                         bilinear_filters_2t[yoffset]);        \
                                                                               \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                               \
    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                  \
  }
179
// Computes SSE, sum and variance for four horizontally adjacent 8x8 blocks
// (one 8x32 region), and accumulates the region totals into *tot_sse and
// *tot_sum.
void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    uint32_t *sse8x8, int *sum8x8,
                                    unsigned int *tot_sse, int *tot_sum,
                                    uint32_t *var8x8) {
  // Per-block SSE and sum.
  for (int blk = 0; blk < 4; ++blk)
    variance(a + blk * 8, a_stride, b + blk * 8, b_stride, 8, 8, &sse8x8[blk],
             &sum8x8[blk]);

  // Fold the four blocks into the running 8x32 totals.
  for (int blk = 0; blk < 4; ++blk) {
    *tot_sse += sse8x8[blk];
    *tot_sum += sum8x8[blk];
  }

  // variance = sse - sum^2 / 64 (8 * 8 pixels, hence >> 6).
  for (int blk = 0; blk < 4; ++blk)
    var8x8[blk] =
        sse8x8[blk] - (uint32_t)(((int64_t)sum8x8[blk] * sum8x8[blk]) >> 6);
}
197
// Computes SSE and variance for two horizontally adjacent 16x16 blocks
// (one 16x32 region), and accumulates the region totals into *tot_sse and
// *tot_sum.
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse16x16, unsigned int *tot_sse,
                                      int *tot_sum, uint32_t *var16x16) {
  int sum16x16[2] = { 0 };

  // Per-block SSE and sum.
  for (int blk = 0; blk < 2; ++blk)
    variance(src_ptr + blk * 16, source_stride, ref_ptr + blk * 16, ref_stride,
             16, 16, &sse16x16[blk], &sum16x16[blk]);

  // Fold both blocks into the running 16x32 totals.
  for (int blk = 0; blk < 2; ++blk) {
    *tot_sse += sse16x16[blk];
    *tot_sum += sum16x16[blk];
  }

  // variance = sse - sum^2 / 256 (16 * 16 pixels, hence >> 8).
  for (int blk = 0; blk < 2; ++blk)
    var16x16[blk] = sse16x16[blk] -
                    (uint32_t)(((int64_t)sum16x16[blk] * sum16x16[blk]) >> 8);
}
216
/* Identical to the variance call except it does not subtract the
 * sum^2 / (w*h) term: it returns the SSE in addition to storing it in the
 * passed-in variable.
 */
// Generates aom_mseWxH_c(): SSE only (the block sum is computed by the
// helper but unused here).
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }
229
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

// Realtime mode doesn't use rectangular blocks.
#if !CONFIG_REALTIME_ONLY
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#endif

// MSE kernels are generated only for these block sizes.
MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
267
// Writes the rounded pixel-wise average of 'pred' (packed, stride == width)
// and 'ref' (stride == ref_stride) into 'comp_pred' (packed).
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col)
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + ref[col], 1);
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
282
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)283 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
284 int width, int height, const uint8_t *ref,
285 int ref_stride,
286 const DIST_WTD_COMP_PARAMS *jcp_param) {
287 int i, j;
288 const int fwd_offset = jcp_param->fwd_offset;
289 const int bck_offset = jcp_param->bck_offset;
290
291 for (i = 0; i < height; ++i) {
292 for (j = 0; j < width; ++j) {
293 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
294 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
295 comp_pred[j] = (uint8_t)tmp;
296 }
297 comp_pred += width;
298 pred += width;
299 ref += ref_stride;
300 }
301 }
302
303 #if CONFIG_AV1_HIGHBITDEPTH
// Computes 64-bit SSE and sum for a w x h high-bitdepth block. Input
// pointers are CONVERT_TO_SHORTPTR-style uint8_t aliases of uint16_t data.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t total_sum = 0;
  uint64_t total_sse = 0;

  for (int row = 0; row < h; ++row) {
    // Row-local 32-bit sum keeps the inner loop narrow; it is folded into
    // the 64-bit total once per row.
    int32_t row_sum = 0;
    for (int col = 0; col < w; ++col) {
      const int diff = a[col] - b[col];
      row_sum += diff;
      total_sse += (uint32_t)(diff * diff);
    }
    total_sum += row_sum;
    a += a_stride;
    b += b_stride;
  }

  *sum = total_sum;
  *sse = total_sse;
}
325
// High-bitdepth SSE for blocks of arbitrary (including odd) dimensions;
// the block sum is computed by the helper but discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse = 0;
  int64_t unused_sum = 0;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  (void)unused_sum;
  return sse;
}
333
// 8-bit-depth wrapper: the 64-bit totals fit the narrower output types
// without any rescaling.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)sse64;
  *sum = (int)sum64;
}
343
// 10-bit-depth wrapper: rescales the statistics to the 8-bit scale
// (sse by 2^4 since it is quadratic in the 2 extra bits, sum by 2^2).
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
}
353
// 12-bit-depth wrapper: rescales the statistics to the 8-bit scale
// (sse by 2^8 since it is quadratic in the 4 extra bits, sum by 2^4).
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
}
363
// Generates aom_highbd_{8,10,12}_varianceWxH_c(). The 10/12-bit variants
// clamp a negative result (possible due to the rounding in the bit-depth
// rescale) to 0; the 8-bit variant has no rescale so no clamp is applied.
#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int sum;                                                                   \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
392
// Generates aom_highbd_{8,10,12}_mseWxH_c(): SSE only, normalized for the
// given bit depth by the highbd_*_variance helpers; the sum is unused.
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }
417
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)418 void aom_highbd_var_filter_block2d_bil_first_pass(
419 const uint8_t *src_ptr8, uint16_t *output_ptr,
420 unsigned int src_pixels_per_line, int pixel_step,
421 unsigned int output_height, unsigned int output_width,
422 const uint8_t *filter) {
423 unsigned int i, j;
424 uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
425 for (i = 0; i < output_height; ++i) {
426 for (j = 0; j < output_width; ++j) {
427 output_ptr[j] = ROUND_POWER_OF_TWO(
428 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
429 FILTER_BITS);
430
431 ++src_ptr;
432 }
433
434 // Next row...
435 src_ptr += src_pixels_per_line - output_width;
436 output_ptr += output_width;
437 }
438 }
439
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)440 void aom_highbd_var_filter_block2d_bil_second_pass(
441 const uint16_t *src_ptr, uint16_t *output_ptr,
442 unsigned int src_pixels_per_line, unsigned int pixel_step,
443 unsigned int output_height, unsigned int output_width,
444 const uint8_t *filter) {
445 unsigned int i, j;
446
447 for (i = 0; i < output_height; ++i) {
448 for (j = 0; j < output_width; ++j) {
449 output_ptr[j] = ROUND_POWER_OF_TWO(
450 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
451 FILTER_BITS);
452 ++src_ptr;
453 }
454
455 src_ptr += src_pixels_per_line - output_width;
456 output_ptr += output_width;
457 }
458 }
459
// High-bitdepth counterparts of SUBPIX_VAR for 8-, 10- and 12-bit inputs:
// sub-pixel bilinear interpolation followed by the matching bit-depth
// variance.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
505
// High-bitdepth counterparts of SUBPIX_AVG_VAR: the interpolated block is
// combined with second_pred (plain or distance-weighted compound average)
// before the matching bit-depth variance, for 8-, 10- and 12-bit inputs.
// NOTE(review): as in SUBPIX_AVG_VAR, the dist_wtd variants return via the
// non-_c (dispatched) variance symbol — presumably intentional; confirm.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              dst, dst_stride, sse);          \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
                                          dst_stride, sse);                   \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }
629
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)

HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#endif

// High-bitdepth MSE kernels are generated only for these block sizes.
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
667
// High-bitdepth rounded pixel-wise average of 'pred8' (packed) and 'ref8'
// into 'comp_pred8'; all three are CONVERT_TO_SHORTPTR-style aliases of
// uint16_t data.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col)
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + ref[col], 1);
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
685
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)686 void aom_highbd_dist_wtd_comp_avg_pred_c(
687 uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
688 const uint8_t *ref8, int ref_stride,
689 const DIST_WTD_COMP_PARAMS *jcp_param) {
690 int i, j;
691 const int fwd_offset = jcp_param->fwd_offset;
692 const int bck_offset = jcp_param->bck_offset;
693 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
694 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
695 uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
696
697 for (i = 0; i < height; ++i) {
698 for (j = 0; j < width; ++j) {
699 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
700 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
701 comp_pred[j] = (uint16_t)tmp;
702 }
703 comp_pred += width;
704 pred += width;
705 ref += ref_stride;
706 }
707 }
708 #endif // CONFIG_AV1_HIGHBITDEPTH
709
// Mask-weighted blend of 'pred' (packed, stride == width) and 'ref' into
// 'comp_pred'. The mask weights the first blend operand; invert_mask swaps
// which source that is.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  const uint8_t *first = invert_mask ? pred : ref;
  const uint8_t *second = invert_mask ? ref : pred;
  const int first_stride = invert_mask ? width : ref_stride;
  const int second_stride = invert_mask ? ref_stride : width;

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col)
      comp_pred[col] = AOM_BLEND_A64(mask[col], first[col], second[col]);
    comp_pred += width;
    first += first_stride;
    second += second_stride;
    mask += mask_stride;
  }
}
729
// Generates aom_masked_sub_pixel_varianceWxH_c(): sub-pixel bilinear
// interpolation, then a mask-weighted blend with second_pred, then the
// variance against 'ref'.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }
749
// Instantiate the masked sub-pixel variance kernels for every block size.
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#endif
776
777 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth mask-weighted blend of 'pred8' (packed) and 'ref8' into
// 'comp_pred8'; pointers are CONVERT_TO_SHORTPTR-style aliases of uint16_t
// data. The mask weights 'ref' normally; invert_mask swaps the operands.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = invert_mask
                           ? AOM_BLEND_A64(mask[col], pred[col], ref[col])
                           : AOM_BLEND_A64(mask[col], ref[col], pred[col]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
799
// High-bitdepth masked sub-pixel variance for 8-, 10- and 12-bit inputs:
// bilinear interpolation, mask-weighted blend with second_pred, then the
// matching bit-depth variance against 'ref'.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              ref, ref_stride, sse);         \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse);        \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse);        \
  }
866
// Instantiate the masked sub-pixel variance kernels for every supported
// block size; the extended (rectangular 1:4) sizes are encoder features
// excluded from realtime-only builds.
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#if !CONFIG_REALTIME_ONLY
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
#endif  // CONFIG_AV1_HIGHBITDEPTH
892
893 #if !CONFIG_REALTIME_ONLY
// Accumulates the OBMC prediction error over a w x h block.
// For each pixel the error is ROUND_POWER_OF_TWO_SIGNED(wsrc - pre * mask, 12)
// (wsrc and mask rows are packed with stride w). The error sum goes to *sum
// and the sum of squared errors to *sse.
static inline void obmc_variance(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse, int *sum) {
  unsigned int sse_acc = 0;
  int sum_acc = 0;

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const int diff =
          ROUND_POWER_OF_TWO_SIGNED(wsrc[c] - pre[c] * mask[c], 12);
      sum_acc += diff;
      sse_acc += diff * diff;
    }
    pre += pre_stride;
    wsrc += w;
    mask += w;
  }

  *sse = sse_acc;
  *sum = sum_acc;
}
914
// Defines aom_obmc_variance<W>x<H>_c: the variance of the OBMC prediction
// error, computed as sse - sum^2 / (W * H); the raw SSE is also returned
// through *sse.
#define OBMC_VAR(W, H)                                                \
  unsigned int aom_obmc_variance##W##x##H##_c(                        \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,        \
      const int32_t *mask, unsigned int *sse) {                       \
    int sum;                                                          \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));     \
  }
923
// Defines aom_obmc_sub_pixel_variance<W>x<H>_c: bilinearly filters the
// prediction by (xoffset, yoffset) — horizontal pass over H+1 rows, then
// vertical pass — and returns the OBMC variance of the filtered block.
#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
  }
938
// Instantiate the OBMC variance and sub-pixel variance kernels for every
// supported block size.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

// Extended rectangular (1:4) block sizes.
OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
999
1000 #if CONFIG_AV1_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1001 static inline void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1002 const int32_t *wsrc,
1003 const int32_t *mask, int w, int h,
1004 uint64_t *sse, int64_t *sum) {
1005 int i, j;
1006 uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1007
1008 *sse = 0;
1009 *sum = 0;
1010
1011 for (i = 0; i < h; i++) {
1012 for (j = 0; j < w; j++) {
1013 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1014 *sum += diff;
1015 *sse += diff * diff;
1016 }
1017
1018 pre += pre_stride;
1019 wsrc += w;
1020 mask += w;
1021 }
1022 }
1023
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1024 static inline void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1025 const int32_t *wsrc,
1026 const int32_t *mask, int w, int h,
1027 unsigned int *sse, int *sum) {
1028 int64_t sum64;
1029 uint64_t sse64;
1030 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1031 *sum = (int)sum64;
1032 *sse = (unsigned int)sse64;
1033 }
1034
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1035 static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1036 const int32_t *wsrc,
1037 const int32_t *mask, int w, int h,
1038 unsigned int *sse, int *sum) {
1039 int64_t sum64;
1040 uint64_t sse64;
1041 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1042 *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1043 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1044 }
1045
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1046 static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1047 const int32_t *wsrc,
1048 const int32_t *mask, int w, int h,
1049 unsigned int *sse, int *sum) {
1050 int64_t sum64;
1051 uint64_t sse64;
1052 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1053 *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1054 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1055 }
1056
// Defines the high-bitdepth OBMC variance functions (8/10/12 bit) for a
// fixed W x H block: variance = sse - sum^2 / (W * H), with the raw SSE
// returned through *sse. The 10- and 12-bit variants clamp a negative
// intermediate variance (possible after the bit-depth rounding of sse and
// sum) to zero.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_8_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
1085
// Defines the high-bitdepth OBMC sub-pixel variance functions (8/10/12 bit)
// for a fixed W x H block: bilinearly filters the prediction by
// (xoffset, yoffset) — horizontal pass over H+1 rows, then vertical pass —
// and returns the bit-depth-appropriate OBMC variance of the filtered block.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),\
                                                   W, wsrc, mask, sse);      \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }
1131
// Instantiate the high-bitdepth OBMC variance and sub-pixel variance
// kernels for every supported block size.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

// Extended rectangular (1:4) block sizes.
HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH
#endif  // !CONFIG_REALTIME_ONLY
1194
// Returns the sum of squared errors between a w x h block of 8-bit pixels
// in 'dst' (stride 'dstride') and a w x h block of 16-bit values in 'src'
// (stride 'sstride').
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535, so a 32-bit `e * e`
      // would overflow signed int (undefined behavior).
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
1206
// Sums the SSE over a 16-pixel-wide strip by processing 16/w adjacent
// w x w-stride sub-blocks of 'dst'; 'src' stores each sub-block
// contiguously, one after another, occupying w * h elements apiece.
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
                              int h) {
  const int num_blks = 16 / w;
  int64_t total = 0;
  uint8_t *dst_blk = dst;
  uint16_t *src_blk = src;
  for (int blk = 0; blk < num_blks; ++blk) {
    total += aom_mse_wxh_16bit_c(dst_blk, dstride, src_blk, w, w, h);
    dst_blk += w;
    src_blk += w * h;
  }
  return total;
}
1220
1221 #if CONFIG_AV1_HIGHBITDEPTH
// Returns the sum of squared errors between two w x h blocks of 16-bit
// pixels: 'dst' (stride 'dstride') and 'src' (stride 'sstride').
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535, so a 32-bit `e * e`
      // would overflow signed int (undefined behavior).
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
1233 #endif // CONFIG_AV1_HIGHBITDEPTH
1234