xref: /aosp_15_r20/external/libaom/aom_dsp/variance.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <assert.h>
12 #include <stdlib.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "aom/aom_integer.h"
18 #include "aom_ports/mem.h"
19 
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_dsp/blend.h"
22 #include "aom_dsp/variance.h"
23 
24 #include "av1/common/filter.h"
25 #include "av1/common/reconinter.h"
26 
27 #if !CONFIG_REALTIME_ONLY
// Returns the sum of the squares of the 256 int16_t values starting at a.
// The total is accumulated in an unsigned int, so it wraps around if it
// exceeds the range of that type.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  const int16_t *const end = a + 256;
  unsigned int sum_sq = 0;

  for (const int16_t *p = a; p != end; ++p) {
    const int v = *p;
    sum_sq += v * v;
  }

  return sum_sq;
}
37 #endif  // !CONFIG_REALTIME_ONLY
38 
// Computes difference statistics between two w x h blocks of 8-bit samples.
//
// a, b           : top-left samples of the two blocks.
// a_stride,
// b_stride       : distance, in samples, between consecutive rows.
// sse (out)      : sum of squared differences (a - b)^2.
// sum (out)      : sum of the signed differences (a - b).
//
// Both outputs are 0 when w or h is not positive.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int sum_acc = 0;
  uint32_t sse_acc = 0;

  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
    for (int col = 0; col < w; ++col) {
      const int diff = (int)a[col] - (int)b[col];
      sum_acc += diff;
      sse_acc += (uint32_t)(diff * diff);
    }
  }

  *sum = sum_acc;
  *sse = sse_acc;
}
57 
// Returns the sum of squared differences between two w x h blocks of 8-bit
// samples. Delegates to variance() and discards the sum of differences it
// also produces.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  int unused_sum;
  uint32_t total_sse;

  variance(a, a_stride, b, b_stride, w, h, &total_sse, &unused_sum);
  return total_sse;
}
65 
66 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
67 // or vertical direction to produce the filtered output block. Used to implement
68 // the first-pass of 2-D separable filter.
69 //
70 // Produces int16_t output to retain precision for the next pass. Two filter
71 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
72 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
73 // It defines the offset required to move from one input to the next.
var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)74 static void var_filter_block2d_bil_first_pass_c(
75     const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
76     unsigned int pixel_step, unsigned int output_height,
77     unsigned int output_width, const uint8_t *filter) {
78   unsigned int i, j;
79 
80   for (i = 0; i < output_height; ++i) {
81     for (j = 0; j < output_width; ++j) {
82       b[j] = ROUND_POWER_OF_TWO(
83           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
84 
85       ++a;
86     }
87 
88     a += src_pixels_per_line - output_width;
89     b += output_width;
90   }
91 }
92 
93 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
94 // or vertical direction to produce the filtered output block. Used to implement
95 // the second-pass of 2-D separable filter.
96 //
97 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
98 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
99 // filter is applied horizontally (pixel_step = 1) or vertically
100 // (pixel_step = stride). It defines the offset required to move from one input
101 // to the next. Output is 8-bit.
var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)102 static void var_filter_block2d_bil_second_pass_c(
103     const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
104     unsigned int pixel_step, unsigned int output_height,
105     unsigned int output_width, const uint8_t *filter) {
106   unsigned int i, j;
107 
108   for (i = 0; i < output_height; ++i) {
109     for (j = 0; j < output_width; ++j) {
110       b[j] = ROUND_POWER_OF_TWO(
111           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
112       ++a;
113     }
114 
115     a += src_pixels_per_line - output_width;
116     b += output_width;
117   }
118 }
119 
// Defines aom_variance<W>x<H>_c(). The generated function stores the sum of
// squared differences between blocks a and b in *sse and returns *sse minus
// (sum of differences)^2 / (W * H). The subtraction is carried out in
// uint32_t.
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int diff_sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum;             \
    const uint32_t mean_term = (uint32_t)(sum_sq / (W * H));         \
    return *sse - mean_term;                                         \
  }
128 
// Defines aom_sub_pixel_variance<W>x<H>_c(). The generated function filters
// block a with the 2-tap bilinear kernels selected by xoffset (horizontal
// pass, H + 1 rows) and yoffset (vertical pass, H rows), then returns the
// variance of the filtered W x H block against block b, storing the SSE in
// *sse.
#define SUBPIX_VAR(W, H)                                                 \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                          \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
      const uint8_t *b, int b_stride, uint32_t *sse) {                   \
    uint16_t horiz[(H + 1) * W];                                         \
    uint8_t filtered[H * W];                                             \
    const uint8_t *const x_taps = bilinear_filters_2t[xoffset];          \
    const uint8_t *const y_taps = bilinear_filters_2t[yoffset];          \
                                                                         \
    var_filter_block2d_bil_first_pass_c(a, horiz, a_stride, 1, H + 1, W, \
                                        x_taps);                         \
    var_filter_block2d_bil_second_pass_c(horiz, filtered, W, W, H, W,    \
                                         y_taps);                        \
                                                                         \
    return aom_variance##W##x##H##_c(filtered, W, b, b_stride, sse);     \
  }
143 
// Defines two functions for a W x H block:
//
//  * aom_sub_pixel_avg_variance<W>x<H>_c(): bilinearly filters block a using
//    xoffset / yoffset, combines the result with second_pred through
//    aom_comp_avg_pred(), and returns the variance of that combination
//    against block b.
//  * aom_dist_wtd_sub_pixel_avg_variance<W>x<H>_c(): same, but combines with
//    second_pred through aom_dist_wtd_comp_avg_pred() using jcp_param.
//
// Both store the SSE in *sse.
//
// NOTE(review): the first function finishes with aom_variance<W>x<H>_c()
// while the second calls aom_variance<W>x<H>() without the _c suffix; confirm
// that this difference is intentional.
#define SUBPIX_AVG_VAR(W, H)                                                 \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                          \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,              \
      const uint8_t *b, int b_stride, uint32_t *sse,                         \
      const uint8_t *second_pred) {                                          \
    uint16_t horiz[(H + 1) * W];                                             \
    uint8_t filtered[H * W];                                                 \
    DECLARE_ALIGNED(16, uint8_t, averaged[H * W]);                           \
    const uint8_t *const x_taps = bilinear_filters_2t[xoffset];              \
    const uint8_t *const y_taps = bilinear_filters_2t[yoffset];              \
                                                                             \
    var_filter_block2d_bil_first_pass_c(a, horiz, a_stride, 1, H + 1, W,     \
                                        x_taps);                             \
    var_filter_block2d_bil_second_pass_c(horiz, filtered, W, W, H, W,        \
                                         y_taps);                            \
                                                                             \
    aom_comp_avg_pred(averaged, second_pred, W, H, filtered, W);             \
                                                                             \
    return aom_variance##W##x##H##_c(averaged, W, b, b_stride, sse);         \
  }                                                                          \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,              \
      const uint8_t *b, int b_stride, uint32_t *sse,                         \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t horiz[(H + 1) * W];                                             \
    uint8_t filtered[H * W];                                                 \
    DECLARE_ALIGNED(16, uint8_t, averaged[H * W]);                           \
    const uint8_t *const x_taps = bilinear_filters_2t[xoffset];              \
    const uint8_t *const y_taps = bilinear_filters_2t[yoffset];              \
                                                                             \
    var_filter_block2d_bil_first_pass_c(a, horiz, a_stride, 1, H + 1, W,     \
                                        x_taps);                             \
    var_filter_block2d_bil_second_pass_c(horiz, filtered, W, W, H, W,        \
                                         y_taps);                            \
                                                                             \
    aom_dist_wtd_comp_avg_pred(averaged, second_pred, W, H, filtered, W,     \
                               jcp_param);                                   \
                                                                             \
    return aom_variance##W##x##H(averaged, W, b, b_stride, sse);             \
  }
179 
// Computes SSE, sum and variance for four horizontally adjacent 8x8 blocks
// (a strip 32 samples wide and 8 rows tall).
//
// sse8x8, sum8x8, var8x8 : per-block outputs, four entries each.
// tot_sse, tot_sum       : running totals; the four per-block values are
//                          added to whatever the caller already stored there.
void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    uint32_t *sse8x8, int *sum8x8,
                                    unsigned int *tot_sse, int *tot_sum,
                                    uint32_t *var8x8) {
  // Per-block statistics; block blk starts 8 * blk samples into the row.
  for (int blk = 0; blk < 4; ++blk) {
    const int x_off = blk * 8;
    variance(a + x_off, a_stride, b + x_off, b_stride, 8, 8, &sse8x8[blk],
             &sum8x8[blk]);
  }

  // Fold the per-block results into the caller's running totals.
  unsigned int strip_sse = 0;
  for (int blk = 0; blk < 4; ++blk) strip_sse += sse8x8[blk];
  *tot_sse += strip_sse;

  int strip_sum = 0;
  for (int blk = 0; blk < 4; ++blk) strip_sum += sum8x8[blk];
  *tot_sum += strip_sum;

  // Per-block variance: sse - sum^2 / 64 (an 8x8 block has 64 samples).
  for (int blk = 0; blk < 4; ++blk) {
    const int64_t sum_sq = (int64_t)sum8x8[blk] * sum8x8[blk];
    var8x8[blk] = sse8x8[blk] - (uint32_t)(sum_sq >> 6);
  }
}
197 
// Computes SSE and variance for two horizontally adjacent 16x16 blocks
// (a strip 32 samples wide and 16 rows tall).
//
// sse16x16, var16x16 : per-block outputs, two entries each.
// tot_sse, tot_sum   : running totals; the two per-block values are added to
//                      whatever the caller already stored there.
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse16x16, unsigned int *tot_sse,
                                      int *tot_sum, uint32_t *var16x16) {
  int block_sum[2] = { 0 };

  // Per-block statistics; block blk starts 16 * blk samples into the row.
  for (int blk = 0; blk < 2; ++blk) {
    const int x_off = blk * 16;
    variance(src_ptr + x_off, source_stride, ref_ptr + x_off, ref_stride, 16,
             16, &sse16x16[blk], &block_sum[blk]);
  }

  // Fold the per-block results into the caller's running totals.
  *tot_sse += sse16x16[0] + sse16x16[1];
  *tot_sum += block_sum[0] + block_sum[1];

  // Per-block variance: sse - sum^2 / 256 (a 16x16 block has 256 samples).
  for (int blk = 0; blk < 2; ++blk) {
    const int64_t sum_sq = (int64_t)block_sum[blk] * block_sum[blk];
    var16x16[blk] = sse16x16[blk] - (uint32_t)(sum_sq >> 8);
  }
}
216 
/* Defines aom_mse<W>x<H>_c(). Same as the variance functions except that the
 * sum^2 / (W * H) term is not subtracted: the sum of squared differences is
 * both stored in *sse and returned.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int unused_sum;                                             \
    variance(a, a_stride, b, b_stride, W, H, sse, &unused_sum); \
    return *sse;                                                \
  }
229 
/* Expands to the plain, sub-pixel and sub-pixel-average variance functions
 * for one W x H block size, so all three forms exist for the same sizes. */
#define VARIANCES(W, H) VAR(W, H) SUBPIX_VAR(W, H) SUBPIX_AVG_VAR(W, H)
235 
236 VARIANCES(128, 128)
237 VARIANCES(128, 64)
238 VARIANCES(64, 128)
239 VARIANCES(64, 64)
240 VARIANCES(64, 32)
241 VARIANCES(32, 64)
242 VARIANCES(32, 32)
243 VARIANCES(32, 16)
244 VARIANCES(16, 32)
245 VARIANCES(16, 16)
246 VARIANCES(16, 8)
247 VARIANCES(8, 16)
248 VARIANCES(8, 8)
249 VARIANCES(8, 4)
250 VARIANCES(4, 8)
251 VARIANCES(4, 4)
252 
253 // Realtime mode doesn't use rectangular blocks.
254 #if !CONFIG_REALTIME_ONLY
255 VARIANCES(4, 16)
256 VARIANCES(16, 4)
257 VARIANCES(8, 32)
258 VARIANCES(32, 8)
259 VARIANCES(16, 64)
260 VARIANCES(64, 16)
261 #endif
262 
263 MSE(16, 16)
264 MSE(16, 8)
265 MSE(8, 16)
266 MSE(8, 8)
267 
// Writes the rounded average of pred and ref into comp_pred for a
// width x height block. comp_pred and pred are packed with a row pitch of
// width, while ref rows are ref_stride samples apart.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = pred[col] + ref[col];
      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }

    ref += ref_stride;
    pred += width;
    comp_pred += width;
  }
}
282 
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)283 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
284                                   int width, int height, const uint8_t *ref,
285                                   int ref_stride,
286                                   const DIST_WTD_COMP_PARAMS *jcp_param) {
287   int i, j;
288   const int fwd_offset = jcp_param->fwd_offset;
289   const int bck_offset = jcp_param->bck_offset;
290 
291   for (i = 0; i < height; ++i) {
292     for (j = 0; j < width; ++j) {
293       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
294       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
295       comp_pred[j] = (uint8_t)tmp;
296     }
297     comp_pred += width;
298     pred += width;
299     ref += ref_stride;
300   }
301 }
302 
303 #if CONFIG_AV1_HIGHBITDEPTH
// Computes difference statistics between two w x h blocks of 16-bit samples.
// a8 and b8 are converted to uint16_t pointers with CONVERT_TO_SHORTPTR();
// the strides count 16-bit samples.
//
// sse (out) : sum of squared differences, accumulated in 64 bits.
// sum (out) : sum of the signed differences, accumulated per row in 32 bits
//             and across rows in 64 bits.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
  uint64_t sse_acc = 0;
  int64_t sum_acc = 0;

  for (int row = 0; row < h; ++row, a_row += a_stride, b_row += b_stride) {
    int32_t row_sum = 0;

    for (int col = 0; col < w; ++col) {
      const int diff = a_row[col] - b_row[col];
      row_sum += diff;
      sse_acc += (uint32_t)(diff * diff);
    }

    sum_acc += row_sum;
  }

  *sum = sum_acc;
  *sse = sse_acc;
}
325 
// Returns the 64-bit sum of squared differences between two w x h blocks of
// high-bitdepth samples. Delegates to highbd_variance64() and discards the
// sum of differences it also produces.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  int64_t unused_sum;
  uint64_t total_sse;

  highbd_variance64(a, a_stride, b, b_stride, w, h, &total_sse, &unused_sum);
  return total_sse;
}
333 
// Runs highbd_variance64() and narrows its 64-bit results without any
// scaling: *sse receives the SSE cast to uint32_t and *sum the sum cast to
// int.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);

  *sse = (uint32_t)sse64;
  *sum = (int)sum64;
}
343 
// Runs highbd_variance64() and scales its results down with rounding before
// narrowing them: the SSE by 4 bits and the sum by 2 bits.
// NOTE(review): presumably this brings 10-bit statistics onto the 8-bit
// scale; confirm against the callers.
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
}
353 
// Runs highbd_variance64() and scales its results down with rounding before
// narrowing them: the SSE by 8 bits and the sum by 4 bits.
// NOTE(review): presumably this brings 12-bit statistics onto the 8-bit
// scale; confirm against the callers.
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
}
363 
// Defines aom_highbd_{8,10,12}_variance<W>x<H>_c(). Each generated function
// stores the SSE from the matching highbd_*_variance() helper in *sse and
// returns *sse minus (sum of differences)^2 / (W * H).
//
// The 8-bit variant does the subtraction in uint32_t. The 10- and 12-bit
// variants do it in int64_t and return 0 when the result is negative.
#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int diff_sum;                                                              \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);         \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum;                       \
    return *sse - (uint32_t)(sum_sq / (W * H));                                \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int diff_sum;                                                              \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t mean_term = ((int64_t)diff_sum * diff_sum) / (W * H);        \
    const int64_t var = (int64_t)(*sse) - mean_term;                           \
    if (var < 0) return 0;                                                     \
    return (uint32_t)var;                                                      \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int diff_sum;                                                              \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t mean_term = ((int64_t)diff_sum * diff_sum) / (W * H);        \
    const int64_t var = (int64_t)(*sse) - mean_term;                           \
    if (var < 0) return 0;                                                     \
    return (uint32_t)var;                                                      \
  }
392 
// Defines aom_highbd_{8,10,12}_mse<W>x<H>_c(). Each generated function calls
// the matching highbd_*_variance() helper, which stores the SSE in *sse, and
// returns that value; the sum of differences is discarded.
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int unused_sum;                                                           \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse,            \
                      &unused_sum);                                           \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int unused_sum;                                                           \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse,           \
                       &unused_sum);                                          \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int unused_sum;                                                           \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse,           \
                       &unused_sum);                                          \
    return *sse;                                                              \
  }
417 
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)418 void aom_highbd_var_filter_block2d_bil_first_pass(
419     const uint8_t *src_ptr8, uint16_t *output_ptr,
420     unsigned int src_pixels_per_line, int pixel_step,
421     unsigned int output_height, unsigned int output_width,
422     const uint8_t *filter) {
423   unsigned int i, j;
424   uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
425   for (i = 0; i < output_height; ++i) {
426     for (j = 0; j < output_width; ++j) {
427       output_ptr[j] = ROUND_POWER_OF_TWO(
428           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
429           FILTER_BITS);
430 
431       ++src_ptr;
432     }
433 
434     // Next row...
435     src_ptr += src_pixels_per_line - output_width;
436     output_ptr += output_width;
437   }
438 }
439 
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)440 void aom_highbd_var_filter_block2d_bil_second_pass(
441     const uint16_t *src_ptr, uint16_t *output_ptr,
442     unsigned int src_pixels_per_line, unsigned int pixel_step,
443     unsigned int output_height, unsigned int output_width,
444     const uint8_t *filter) {
445   unsigned int i, j;
446 
447   for (i = 0; i < output_height; ++i) {
448     for (j = 0; j < output_width; ++j) {
449       output_ptr[j] = ROUND_POWER_OF_TWO(
450           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
451           FILTER_BITS);
452       ++src_ptr;
453     }
454 
455     src_ptr += src_pixels_per_line - output_width;
456     output_ptr += output_width;
457   }
458 }
459 
// Defines aom_highbd_{8,10,12}_sub_pixel_variance<W>x<H>_c(). Each generated
// function filters src with the 2-tap bilinear kernels selected by xoffset
// (horizontal pass, H + 1 rows) and yoffset (vertical pass, H rows), then
// returns the matching aom_highbd_*_variance<W>x<H>_c() of the filtered
// W x H block against dst, storing the SSE in *sse.
#define HIGHBD_SUBPIX_VAR(W, H)                                               \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                    \
    uint16_t horiz[(H + 1) * W];                                              \
    uint16_t filtered[H * W];                                                 \
    const uint8_t *const x_taps = bilinear_filters_2t[xoffset];               \
    const uint8_t *const y_taps = bilinear_filters_2t[yoffset];               \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(src, horiz, src_stride, 1,   \
                                                 H + 1, W, x_taps);           \
    aom_highbd_var_filter_block2d_bil_second_pass(horiz, filtered, W, W, H,   \
                                                  W, y_taps);                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered),   \
                                              W, dst, dst_stride, sse);       \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                    \
    uint16_t horiz[(H + 1) * W];                                              \
    uint16_t filtered[H * W];                                                 \
    const uint8_t *const x_taps = bilinear_filters_2t[xoffset];               \
    const uint8_t *const y_taps = bilinear_filters_2t[yoffset];               \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(src, horiz, src_stride, 1,   \
                                                 H + 1, W, x_taps);           \
    aom_highbd_var_filter_block2d_bil_second_pass(horiz, filtered, W, W, H,   \
                                                  W, y_taps);                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered),  \
                                               W, dst, dst_stride, sse);      \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                    \
    uint16_t horiz[(H + 1) * W];                                              \
    uint16_t filtered[H * W];                                                 \
    const uint8_t *const x_taps = bilinear_filters_2t[xoffset];               \
    const uint8_t *const y_taps = bilinear_filters_2t[yoffset];               \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(src, horiz, src_stride, 1,   \
                                                 H + 1, W, x_taps);           \
    aom_highbd_var_filter_block2d_bil_second_pass(horiz, filtered, W, W, H,   \
                                                  W, y_taps);                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered),  \
                                               W, dst, dst_stride, sse);      \
  }
505 
506 #define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
507   uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
508       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
509       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
510       const uint8_t *second_pred) {                                           \
511     uint16_t fdata3[(H + 1) * W];                                             \
512     uint16_t temp2[H * W];                                                    \
513     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
514                                                                               \
515     aom_highbd_var_filter_block2d_bil_first_pass(                             \
516         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
517     aom_highbd_var_filter_block2d_bil_second_pass(                            \
518         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
519                                                                               \
520     aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
521                                CONVERT_TO_BYTEPTR(temp2), W);                 \
522                                                                               \
523     return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
524                                               dst, dst_stride, sse);          \
525   }                                                                           \
526                                                                               \
527   uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
528       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
529       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
530       const uint8_t *second_pred) {                                           \
531     uint16_t fdata3[(H + 1) * W];                                             \
532     uint16_t temp2[H * W];                                                    \
533     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
534                                                                               \
535     aom_highbd_var_filter_block2d_bil_first_pass(                             \
536         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
537     aom_highbd_var_filter_block2d_bil_second_pass(                            \
538         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
539                                                                               \
540     aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
541                                CONVERT_TO_BYTEPTR(temp2), W);                 \
542                                                                               \
543     return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
544                                                dst, dst_stride, sse);         \
545   }                                                                           \
546                                                                               \
547   uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
548       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
549       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
550       const uint8_t *second_pred) {                                           \
551     uint16_t fdata3[(H + 1) * W];                                             \
552     uint16_t temp2[H * W];                                                    \
553     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
554                                                                               \
555     aom_highbd_var_filter_block2d_bil_first_pass(                             \
556         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
557     aom_highbd_var_filter_block2d_bil_second_pass(                            \
558         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
559                                                                               \
560     aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
561                                CONVERT_TO_BYTEPTR(temp2), W);                 \
562                                                                               \
563     return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
564                                                dst, dst_stride, sse);         \
565   }                                                                           \
566                                                                               \
567   uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
568       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
569       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
570       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
571     uint16_t fdata3[(H + 1) * W];                                             \
572     uint16_t temp2[H * W];                                                    \
573     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
574                                                                               \
575     aom_highbd_var_filter_block2d_bil_first_pass(                             \
576         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
577     aom_highbd_var_filter_block2d_bil_second_pass(                            \
578         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
579                                                                               \
580     aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
581                                       W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
582                                       jcp_param);                             \
583                                                                               \
584     return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
585                                           dst_stride, sse);                   \
586   }                                                                           \
587                                                                               \
588   uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
589       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
590       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
591       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
592     uint16_t fdata3[(H + 1) * W];                                             \
593     uint16_t temp2[H * W];                                                    \
594     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
595                                                                               \
596     aom_highbd_var_filter_block2d_bil_first_pass(                             \
597         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
598     aom_highbd_var_filter_block2d_bil_second_pass(                            \
599         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
600                                                                               \
601     aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
602                                       W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
603                                       jcp_param);                             \
604                                                                               \
605     return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
606                                            dst_stride, sse);                  \
607   }                                                                           \
608                                                                               \
609   uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
610       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
611       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
612       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
613     uint16_t fdata3[(H + 1) * W];                                             \
614     uint16_t temp2[H * W];                                                    \
615     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
616                                                                               \
617     aom_highbd_var_filter_block2d_bil_first_pass(                             \
618         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
619     aom_highbd_var_filter_block2d_bil_second_pass(                            \
620         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
621                                                                               \
622     aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
623                                       W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
624                                       jcp_param);                             \
625                                                                               \
626     return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
627                                            dst_stride, sse);                  \
628   }
629 
630 /* All three forms of the variance are available in the same sizes.
 * HIGHBD_VARIANCES(W, H) expands, for one W x H block size, to HIGHBD_VAR,
 * HIGHBD_SUBPIX_VAR and HIGHBD_SUBPIX_AVG_VAR in that order. */
631 #define HIGHBD_VARIANCES(W, H) \
632   HIGHBD_VAR(W, H)             \
633   HIGHBD_SUBPIX_VAR(W, H)      \
634   HIGHBD_SUBPIX_AVG_VAR(W, H)
635 
// Instantiate the high-bitdepth variance families (through HIGHBD_VARIANCES)
// once per supported block size.
636 HIGHBD_VARIANCES(128, 128)
637 HIGHBD_VARIANCES(128, 64)
638 HIGHBD_VARIANCES(64, 128)
639 HIGHBD_VARIANCES(64, 64)
640 HIGHBD_VARIANCES(64, 32)
641 HIGHBD_VARIANCES(32, 64)
642 HIGHBD_VARIANCES(32, 32)
643 HIGHBD_VARIANCES(32, 16)
644 HIGHBD_VARIANCES(16, 32)
645 HIGHBD_VARIANCES(16, 16)
646 HIGHBD_VARIANCES(16, 8)
647 HIGHBD_VARIANCES(8, 16)
648 HIGHBD_VARIANCES(8, 8)
649 HIGHBD_VARIANCES(8, 4)
650 HIGHBD_VARIANCES(4, 8)
651 HIGHBD_VARIANCES(4, 4)
652 
653 // Realtime mode doesn't use 4x rectangular blocks (the sizes below all have
// a 4:1 or 1:4 aspect ratio).
654 #if !CONFIG_REALTIME_ONLY
655 HIGHBD_VARIANCES(4, 16)
656 HIGHBD_VARIANCES(16, 4)
657 HIGHBD_VARIANCES(8, 32)
658 HIGHBD_VARIANCES(32, 8)
659 HIGHBD_VARIANCES(16, 64)
660 HIGHBD_VARIANCES(64, 16)
661 #endif  // !CONFIG_REALTIME_ONLY
662 
// HIGHBD_MSE is instantiated only for the four sizes listed here.
663 HIGHBD_MSE(16, 16)
664 HIGHBD_MSE(16, 8)
665 HIGHBD_MSE(8, 16)
666 HIGHBD_MSE(8, 8)
667 
// Builds a compound prediction from two high-bitdepth blocks.
//
// All three 8-bit-typed pointers are converted with CONVERT_TO_SHORTPTR and
// accessed as uint16_t samples. Each output sample is
// ROUND_POWER_OF_TWO(pred + ref, 1). comp_pred8 and pred8 are walked with a
// row pitch of `width` samples; ref8 is walked with a row pitch of
// `ref_stride` samples.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
  uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = pred_row[col] + ref_row[col];
      out_row[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
    out_row += width;
    pred_row += width;
    ref_row += ref_stride;
  }
}
685 
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)686 void aom_highbd_dist_wtd_comp_avg_pred_c(
687     uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
688     const uint8_t *ref8, int ref_stride,
689     const DIST_WTD_COMP_PARAMS *jcp_param) {
690   int i, j;
691   const int fwd_offset = jcp_param->fwd_offset;
692   const int bck_offset = jcp_param->bck_offset;
693   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
694   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
695   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
696 
697   for (i = 0; i < height; ++i) {
698     for (j = 0; j < width; ++j) {
699       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
700       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
701       comp_pred[j] = (uint16_t)tmp;
702     }
703     comp_pred += width;
704     pred += width;
705     ref += ref_stride;
706   }
707 }
708 #endif  // CONFIG_AV1_HIGHBITDEPTH
709 
// Blends `ref` and `pred` into `comp_pred` under a per-pixel mask.
//
// With invert_mask == 0 each output is AOM_BLEND_A64(mask, ref, pred); with a
// non-zero invert_mask the two operands are swapped. `comp_pred` and `pred`
// use a row pitch of `width`, `ref` uses `ref_stride`, and `mask` uses
// `mask_stride`.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  // Default (non-inverted) operand order: ref first, pred second.
  const uint8_t *first = ref;
  const uint8_t *second = pred;
  int first_pitch = ref_stride;
  int second_pitch = width;

  if (invert_mask) {
    first = pred;
    second = ref;
    first_pitch = width;
    second_pitch = ref_stride;
  }

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = AOM_BLEND_A64(mask[col], first[col], second[col]);
    }
    comp_pred += width;
    first += first_pitch;
    second += second_pitch;
    mask += mask_stride;
  }
}
729 
// Defines aom_masked_sub_pixel_variance<W>x<H>_c.
//
// The generated function runs the two bilinear filter passes over `src`
// (first pass with the filter selected by xoffset, second pass with the
// filter selected by yoffset), blends the filtered block with `second_pred`
// through aom_comp_mask_pred_c, and returns aom_variance<W>x<H>_c of the
// blended block against `ref`.
#define MASK_SUBPIX_VAR(W, H) \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint8_t pass2_out[H * W]; \
    DECLARE_ALIGNED(16, uint8_t, masked_pred[H * W]); \
    \
    var_filter_block2d_bil_first_pass_c(src, pass1_out, src_stride, 1, \
                                        H + 1, W, \
                                        bilinear_filters_2t[xoffset]); \
    var_filter_block2d_bil_second_pass_c(pass1_out, pass2_out, W, W, H, W, \
                                         bilinear_filters_2t[yoffset]); \
    \
    aom_comp_mask_pred_c(masked_pred, second_pred, W, H, pass2_out, W, msk, \
                         msk_stride, invert_mask); \
    return aom_variance##W##x##H##_c(masked_pred, W, ref, ref_stride, sse); \
  }
749 
// Instantiate aom_masked_sub_pixel_variance<W>x<H>_c for each block size.
750 MASK_SUBPIX_VAR(4, 4)
751 MASK_SUBPIX_VAR(4, 8)
752 MASK_SUBPIX_VAR(8, 4)
753 MASK_SUBPIX_VAR(8, 8)
754 MASK_SUBPIX_VAR(8, 16)
755 MASK_SUBPIX_VAR(16, 8)
756 MASK_SUBPIX_VAR(16, 16)
757 MASK_SUBPIX_VAR(16, 32)
758 MASK_SUBPIX_VAR(32, 16)
759 MASK_SUBPIX_VAR(32, 32)
760 MASK_SUBPIX_VAR(32, 64)
761 MASK_SUBPIX_VAR(64, 32)
762 MASK_SUBPIX_VAR(64, 64)
763 MASK_SUBPIX_VAR(64, 128)
764 MASK_SUBPIX_VAR(128, 64)
765 MASK_SUBPIX_VAR(128, 128)
766 
767 // Realtime mode doesn't use 4x rectangular blocks (the sizes below all have
// a 4:1 or 1:4 aspect ratio).
768 #if !CONFIG_REALTIME_ONLY
769 MASK_SUBPIX_VAR(4, 16)
770 MASK_SUBPIX_VAR(16, 4)
771 MASK_SUBPIX_VAR(8, 32)
772 MASK_SUBPIX_VAR(32, 8)
773 MASK_SUBPIX_VAR(16, 64)
774 MASK_SUBPIX_VAR(64, 16)
775 #endif  // !CONFIG_REALTIME_ONLY
776 
777 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth masked blend of `ref8` and `pred8` into `comp_pred8`.
//
// The three sample pointers are converted with CONVERT_TO_SHORTPTR and
// accessed as uint16_t. With invert_mask == 0 each output is
// AOM_BLEND_A64(mask, ref, pred); otherwise the two operands are swapped.
// The output and `pred` use a row pitch of `width`, `ref` uses `ref_stride`,
// and `mask` uses `mask_stride`.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  uint16_t *const pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);

  // The operand order depends only on invert_mask, so pick it once up front
  // instead of testing the flag for every pixel.
  const uint16_t *first = invert_mask ? pred : ref;
  const uint16_t *second = invert_mask ? ref : pred;
  const int first_pitch = invert_mask ? width : ref_stride;
  const int second_pitch = invert_mask ? ref_stride : width;

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      out_row[col] = AOM_BLEND_A64(mask[col], first[col], second[col]);
    }
    out_row += width;
    first += first_pitch;
    second += second_pitch;
    mask += mask_stride;
  }
}
799 
// Defines aom_highbd_{8,10,12}_masked_sub_pixel_variance<W>x<H>_c.
//
// Each generated function runs the two high-bitdepth bilinear filter passes
// over `src` (first pass with the filter selected by xoffset, second pass
// with the filter selected by yoffset), blends the filtered block with
// `second_pred` through aom_highbd_comp_mask_pred_c, and returns the
// matching aom_highbd_{8,10,12}_variance<W>x<H>_c of the blended block
// against `ref`. The three functions differ only in that final call.
#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint16_t pass2_out[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, masked_pred[H * W]); \
    \
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, pass1_out, src_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(masked_pred), \
                                second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(pass2_out), W, msk, \
                                msk_stride, invert_mask); \
    \
    return aom_highbd_8_variance##W##x##H##_c( \
        CONVERT_TO_BYTEPTR(masked_pred), W, ref, ref_stride, sse); \
  } \
  \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint16_t pass2_out[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, masked_pred[H * W]); \
    \
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, pass1_out, src_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(masked_pred), \
                                second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(pass2_out), W, msk, \
                                msk_stride, invert_mask); \
    \
    return aom_highbd_10_variance##W##x##H##_c( \
        CONVERT_TO_BYTEPTR(masked_pred), W, ref, ref_stride, sse); \
  } \
  \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint16_t pass2_out[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, masked_pred[H * W]); \
    \
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, pass1_out, src_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(masked_pred), \
                                second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(pass2_out), W, msk, \
                                msk_stride, invert_mask); \
    \
    return aom_highbd_12_variance##W##x##H##_c( \
        CONVERT_TO_BYTEPTR(masked_pred), W, ref, ref_stride, sse); \
  }
866 
// Instantiate the 8/10/12-bit high-bitdepth masked sub-pixel variance
// functions for each block size.
867 HIGHBD_MASK_SUBPIX_VAR(4, 4)
868 HIGHBD_MASK_SUBPIX_VAR(4, 8)
869 HIGHBD_MASK_SUBPIX_VAR(8, 4)
870 HIGHBD_MASK_SUBPIX_VAR(8, 8)
871 HIGHBD_MASK_SUBPIX_VAR(8, 16)
872 HIGHBD_MASK_SUBPIX_VAR(16, 8)
873 HIGHBD_MASK_SUBPIX_VAR(16, 16)
874 HIGHBD_MASK_SUBPIX_VAR(16, 32)
875 HIGHBD_MASK_SUBPIX_VAR(32, 16)
876 HIGHBD_MASK_SUBPIX_VAR(32, 32)
877 HIGHBD_MASK_SUBPIX_VAR(32, 64)
878 HIGHBD_MASK_SUBPIX_VAR(64, 32)
879 HIGHBD_MASK_SUBPIX_VAR(64, 64)
880 HIGHBD_MASK_SUBPIX_VAR(64, 128)
881 HIGHBD_MASK_SUBPIX_VAR(128, 64)
882 HIGHBD_MASK_SUBPIX_VAR(128, 128)
// The 4:1 and 1:4 block sizes below are compiled only when the build is not
// realtime-only.
883 #if !CONFIG_REALTIME_ONLY
884 HIGHBD_MASK_SUBPIX_VAR(4, 16)
885 HIGHBD_MASK_SUBPIX_VAR(16, 4)
886 HIGHBD_MASK_SUBPIX_VAR(8, 32)
887 HIGHBD_MASK_SUBPIX_VAR(32, 8)
888 HIGHBD_MASK_SUBPIX_VAR(16, 64)
889 HIGHBD_MASK_SUBPIX_VAR(64, 16)
890 #endif  // !CONFIG_REALTIME_ONLY
891 #endif  // CONFIG_AV1_HIGHBITDEPTH
892 
893 #if !CONFIG_REALTIME_ONLY
// Accumulates the sum and sum of squares of the OBMC prediction error over a
// w x h block.
//
// The per-pixel error is
//   ROUND_POWER_OF_TWO_SIGNED(wsrc - pre * mask, 12).
// `pre` is walked with a row pitch of `pre_stride`; `wsrc` and `mask` are
// walked with a row pitch of `w`. Both *sse and *sum are reset to zero before
// accumulation starts.
static inline void obmc_variance(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse, int *sum) {
  *sse = 0;
  *sum = 0;

  for (int row = 0; row < h; row++) {
    for (int col = 0; col < w; col++) {
      const int weighted_err = wsrc[col] - pre[col] * mask[col];
      const int err = ROUND_POWER_OF_TWO_SIGNED(weighted_err, 12);
      *sum += err;
      *sse += err * err;
    }

    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
}
914 
// Defines aom_obmc_variance<W>x<H>_c: stores the OBMC sum of squared errors
// in *sse and returns *sse minus (sum * sum) / (W * H), with the square taken
// in 64 bits.
#define OBMC_VAR(W, H) \
  unsigned int aom_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int err_sum; \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &err_sum); \
    const int64_t err_sum_sq = (int64_t)err_sum * err_sum; \
    const unsigned int mean_term = (unsigned int)(err_sum_sq / (W * H)); \
    return *sse - mean_term; \
  }
923 
// Defines aom_obmc_sub_pixel_variance<W>x<H>_c: runs the two bilinear filter
// passes over `pre` (first pass with the filter selected by xoffset, second
// pass with the filter selected by yoffset) and returns
// aom_obmc_variance<W>x<H>_c of the filtered block.
#define OBMC_SUBPIX_VAR(W, H) \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint8_t pass2_out[H * W]; \
    \
    var_filter_block2d_bil_first_pass_c(pre, pass1_out, pre_stride, 1, \
                                        H + 1, W, \
                                        bilinear_filters_2t[xoffset]); \
    var_filter_block2d_bil_second_pass_c(pass1_out, pass2_out, W, W, H, W, \
                                         bilinear_filters_2t[yoffset]); \
    \
    return aom_obmc_variance##W##x##H##_c(pass2_out, W, wsrc, mask, sse); \
  }
938 
// Instantiate aom_obmc_variance<W>x<H>_c and
// aom_obmc_sub_pixel_variance<W>x<H>_c for each block size. This section is
// already enclosed in #if !CONFIG_REALTIME_ONLY, so the 4:1 and 1:4 sizes at
// the end carry no guard of their own.
939 OBMC_VAR(4, 4)
940 OBMC_SUBPIX_VAR(4, 4)
941 
942 OBMC_VAR(4, 8)
943 OBMC_SUBPIX_VAR(4, 8)
944 
945 OBMC_VAR(8, 4)
946 OBMC_SUBPIX_VAR(8, 4)
947 
948 OBMC_VAR(8, 8)
949 OBMC_SUBPIX_VAR(8, 8)
950 
951 OBMC_VAR(8, 16)
952 OBMC_SUBPIX_VAR(8, 16)
953 
954 OBMC_VAR(16, 8)
955 OBMC_SUBPIX_VAR(16, 8)
956 
957 OBMC_VAR(16, 16)
958 OBMC_SUBPIX_VAR(16, 16)
959 
960 OBMC_VAR(16, 32)
961 OBMC_SUBPIX_VAR(16, 32)
962 
963 OBMC_VAR(32, 16)
964 OBMC_SUBPIX_VAR(32, 16)
965 
966 OBMC_VAR(32, 32)
967 OBMC_SUBPIX_VAR(32, 32)
968 
969 OBMC_VAR(32, 64)
970 OBMC_SUBPIX_VAR(32, 64)
971 
972 OBMC_VAR(64, 32)
973 OBMC_SUBPIX_VAR(64, 32)
974 
975 OBMC_VAR(64, 64)
976 OBMC_SUBPIX_VAR(64, 64)
977 
978 OBMC_VAR(64, 128)
979 OBMC_SUBPIX_VAR(64, 128)
980 
981 OBMC_VAR(128, 64)
982 OBMC_SUBPIX_VAR(128, 64)
983 
984 OBMC_VAR(128, 128)
985 OBMC_SUBPIX_VAR(128, 128)
986 
// Block sizes with a 4:1 or 1:4 aspect ratio.
987 OBMC_VAR(4, 16)
988 OBMC_SUBPIX_VAR(4, 16)
989 OBMC_VAR(16, 4)
990 OBMC_SUBPIX_VAR(16, 4)
991 OBMC_VAR(8, 32)
992 OBMC_SUBPIX_VAR(8, 32)
993 OBMC_VAR(32, 8)
994 OBMC_SUBPIX_VAR(32, 8)
995 OBMC_VAR(16, 64)
996 OBMC_SUBPIX_VAR(16, 64)
997 OBMC_VAR(64, 16)
998 OBMC_SUBPIX_VAR(64, 16)
999 
1000 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth OBMC error statistics with 64-bit accumulators.
//
// `pre8` is converted with CONVERT_TO_SHORTPTR and read as uint16_t samples.
// The per-pixel error is
//   ROUND_POWER_OF_TWO_SIGNED(wsrc - pre * mask, 12);
// its running sum goes to *sum and its running sum of squares to *sse, both
// reset to zero first. `pre` is walked with a row pitch of `pre_stride`;
// `wsrc` and `mask` are walked with a row pitch of `w`.
static inline void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          uint64_t *sse, int64_t *sum) {
  uint16_t *pre_row = CONVERT_TO_SHORTPTR(pre8);

  *sse = 0;
  *sum = 0;

  for (int row = 0; row < h; row++) {
    for (int col = 0; col < w; col++) {
      const int weighted_err = wsrc[col] - pre_row[col] * mask[col];
      const int err = ROUND_POWER_OF_TWO_SIGNED(weighted_err, 12);
      *sum += err;
      *sse += err * err;
    }

    pre_row += pre_stride;
    wsrc += w;
    mask += w;
  }
}
1023 
// Variant of highbd_obmc_variance64 used by the 8-bit entry points: the
// 64-bit sum and sum of squares are narrowed to int / unsigned int by plain
// casts, with no rescaling.
static inline void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  uint64_t wide_sse;
  int64_t wide_sum;

  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
                         &wide_sum);
  *sum = (int)wide_sum;
  *sse = (unsigned int)wide_sse;
}
1034 
// Variant of highbd_obmc_variance64 used by the 10-bit entry points: the sum
// is scaled down with ROUND_POWER_OF_TWO(., 2) and the sum of squares with
// ROUND_POWER_OF_TWO(., 4) before being narrowed.
// NOTE(review): presumably this brings 10-bit statistics onto an 8-bit scale
// - confirm against the callers.
static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  uint64_t wide_sse;
  int64_t wide_sum;

  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
                         &wide_sum);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(wide_sse, 4);
}
1045 
// Variant of highbd_obmc_variance64 used by the 12-bit entry points: the sum
// is scaled down with ROUND_POWER_OF_TWO(., 4) and the sum of squares with
// ROUND_POWER_OF_TWO(., 8) before being narrowed.
// NOTE(review): presumably this brings 12-bit statistics onto an 8-bit scale
// - confirm against the callers.
static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  uint64_t wide_sse;
  int64_t wide_sum;

  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
                         &wide_sum);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(wide_sse, 8);
}
1056 
// Defines aom_highbd_{8,10,12}_obmc_variance<W>x<H>_c.
//
// Each function stores the OBMC sum of squared errors in *sse and returns
// *sse minus (sum * sum) / (W * H). The 8-bit variant returns that difference
// directly in unsigned arithmetic. The 10- and 12-bit variants compute it as
// a signed 64-bit value and return 0 when it is negative.
#define HIGHBD_OBMC_VAR(W, H) \
  unsigned int aom_highbd_8_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int err_sum; \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &err_sum); \
    const int64_t err_sum_sq = (int64_t)err_sum * err_sum; \
    const unsigned int mean_term = (unsigned int)(err_sum_sq / (W * H)); \
    return *sse - mean_term; \
  } \
  \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int err_sum; \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, \
                            &err_sum); \
    const int64_t err_sum_sq = (int64_t)err_sum * err_sum; \
    const int64_t var = (int64_t)(*sse) - (err_sum_sq / (W * H)); \
    if (var < 0) return 0; \
    return (uint32_t)var; \
  } \
  \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int err_sum; \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, \
                            &err_sum); \
    const int64_t err_sum_sq = (int64_t)err_sum * err_sum; \
    const int64_t var = (int64_t)(*sse) - (err_sum_sq / (W * H)); \
    if (var < 0) return 0; \
    return (uint32_t)var; \
  }
1085 
// Defines aom_highbd_{8,10,12}_obmc_sub_pixel_variance<W>x<H>_c.
//
// Each function runs the two high-bitdepth bilinear filter passes over `pre`
// (first pass with the filter selected by xoffset, second pass with the
// filter selected by yoffset) and returns the matching
// aom_highbd_{8,10,12}_obmc_variance<W>x<H>_c of the filtered block. The
// three functions differ only in that final call.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
  unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint16_t pass2_out[H * W]; \
    \
    aom_highbd_var_filter_block2d_bil_first_pass( \
        pre, pass1_out, pre_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    return aom_highbd_8_obmc_variance##W##x##H##_c( \
        CONVERT_TO_BYTEPTR(pass2_out), W, wsrc, mask, sse); \
  } \
  \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint16_t pass2_out[H * W]; \
    \
    aom_highbd_var_filter_block2d_bil_first_pass( \
        pre, pass1_out, pre_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    return aom_highbd_10_obmc_variance##W##x##H##_c( \
        CONVERT_TO_BYTEPTR(pass2_out), W, wsrc, mask, sse); \
  } \
  \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t pass1_out[(H + 1) * W]; \
    uint16_t pass2_out[H * W]; \
    \
    aom_highbd_var_filter_block2d_bil_first_pass( \
        pre, pass1_out, pre_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    return aom_highbd_12_obmc_variance##W##x##H##_c( \
        CONVERT_TO_BYTEPTR(pass2_out), W, wsrc, mask, sse); \
  }
1131 
// Instantiate the 8/10/12-bit high-bitdepth OBMC variance and sub-pixel
// variance functions for each block size.
1132 HIGHBD_OBMC_VAR(4, 4)
1133 HIGHBD_OBMC_SUBPIX_VAR(4, 4)
1134 
1135 HIGHBD_OBMC_VAR(4, 8)
1136 HIGHBD_OBMC_SUBPIX_VAR(4, 8)
1137 
1138 HIGHBD_OBMC_VAR(8, 4)
1139 HIGHBD_OBMC_SUBPIX_VAR(8, 4)
1140 
1141 HIGHBD_OBMC_VAR(8, 8)
1142 HIGHBD_OBMC_SUBPIX_VAR(8, 8)
1143 
1144 HIGHBD_OBMC_VAR(8, 16)
1145 HIGHBD_OBMC_SUBPIX_VAR(8, 16)
1146 
1147 HIGHBD_OBMC_VAR(16, 8)
1148 HIGHBD_OBMC_SUBPIX_VAR(16, 8)
1149 
1150 HIGHBD_OBMC_VAR(16, 16)
1151 HIGHBD_OBMC_SUBPIX_VAR(16, 16)
1152 
1153 HIGHBD_OBMC_VAR(16, 32)
1154 HIGHBD_OBMC_SUBPIX_VAR(16, 32)
1155 
1156 HIGHBD_OBMC_VAR(32, 16)
1157 HIGHBD_OBMC_SUBPIX_VAR(32, 16)
1158 
1159 HIGHBD_OBMC_VAR(32, 32)
1160 HIGHBD_OBMC_SUBPIX_VAR(32, 32)
1161 
1162 HIGHBD_OBMC_VAR(32, 64)
1163 HIGHBD_OBMC_SUBPIX_VAR(32, 64)
1164 
1165 HIGHBD_OBMC_VAR(64, 32)
1166 HIGHBD_OBMC_SUBPIX_VAR(64, 32)
1167 
1168 HIGHBD_OBMC_VAR(64, 64)
1169 HIGHBD_OBMC_SUBPIX_VAR(64, 64)
1170 
1171 HIGHBD_OBMC_VAR(64, 128)
1172 HIGHBD_OBMC_SUBPIX_VAR(64, 128)
1173 
1174 HIGHBD_OBMC_VAR(128, 64)
1175 HIGHBD_OBMC_SUBPIX_VAR(128, 64)
1176 
1177 HIGHBD_OBMC_VAR(128, 128)
1178 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
1179 
// Block sizes with a 4:1 or 1:4 aspect ratio.
1180 HIGHBD_OBMC_VAR(4, 16)
1181 HIGHBD_OBMC_SUBPIX_VAR(4, 16)
1182 HIGHBD_OBMC_VAR(16, 4)
1183 HIGHBD_OBMC_SUBPIX_VAR(16, 4)
1184 HIGHBD_OBMC_VAR(8, 32)
1185 HIGHBD_OBMC_SUBPIX_VAR(8, 32)
1186 HIGHBD_OBMC_VAR(32, 8)
1187 HIGHBD_OBMC_SUBPIX_VAR(32, 8)
1188 HIGHBD_OBMC_VAR(16, 64)
1189 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
1190 HIGHBD_OBMC_VAR(64, 16)
1191 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1192 #endif  // CONFIG_AV1_HIGHBITDEPTH
1193 #endif  // !CONFIG_REALTIME_ONLY
1194 
// Returns the sum of squared differences between a w x h block of 8-bit
// samples in dst and a w x h block of 16-bit samples in src.
//
//   dst, dstride: 8-bit block and its row stride, in samples.
//   src, sstride: 16-bit block and its row stride, in samples.
//   w, h:         block width and height; a non-positive value yields 0.
//
// The result is the raw sum; it is not divided by the number of samples.
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      // The difference lies in [-65535, 255], so its square can exceed
      // INT_MAX. Square in 64 bits to avoid signed-overflow undefined
      // behaviour.
      const int64_t e = (int64_t)dst[i * dstride + j] - src[i * sstride + j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1206 
// Returns the total sum of squared differences over 16 / w blocks of size
// w x h.
//
//   dst, dstride: 8-bit samples; consecutive blocks sit side by side, each
//                 starting w samples after the previous one and sharing the
//                 row stride dstride.
//   src:          16-bit samples; each block is stored contiguously with a
//                 row stride of w, so consecutive blocks are w * h samples
//                 apart.
//   w, h:         width and height of a single block.
//
// A non-positive w yields 0 (no blocks), which also avoids dividing by zero.
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
                              int h) {
  if (w <= 0) return 0;

  const int num_blks = 16 / w;
  // Unsigned accumulator, matching both the callee's and this function's
  // return type.
  uint64_t sum = 0;
  for (int i = 0; i < num_blks; i++) {
    sum += aom_mse_wxh_16bit_c(dst, dstride, src, w, w, h);
    dst += w;
    src += w * h;
  }
  return sum;
}
1220 
1221 #if CONFIG_AV1_HIGHBITDEPTH
// Returns the sum of squared differences between two w x h blocks of 16-bit
// samples.
//
//   dst, dstride: first block and its row stride, in samples.
//   src, sstride: second block and its row stride, in samples.
//   w, h:         block width and height; a non-positive value yields 0.
//
// The result is the raw sum; it is not divided by the number of samples.
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      // The difference lies in [-65535, 65535], so its square can exceed
      // INT_MAX. Square in 64 bits to avoid signed-overflow undefined
      // behaviour.
      const int64_t e = (int64_t)dst[i * dstride + j] - src[i * sstride + j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1233 #endif  // CONFIG_AV1_HIGHBITDEPTH
1234