xref: /aosp_15_r20/external/libaom/av1/common/convolve.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string.h>
14 
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25 
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27                              int dst_stride, int w, int h,
28                              const int16_t *x_filters, int x0_qn,
29                              int x_step_qn) {
30   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31   for (int y = 0; y < h; ++y) {
32     int x_qn = x0_qn;
33     for (int x = 0; x < w; ++x) {
34       const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35       const int x_filter_idx =
36           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37       assert(x_filter_idx <= RS_SUBPEL_MASK);
38       const int16_t *const x_filter =
39           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40       int sum = 0;
41       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42         sum += src_x[k] * x_filter[k];
43       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44       x_qn += x_step_qn;
45     }
46     src += src_stride;
47     dst += dst_stride;
48   }
49 }
50 
51 #if CONFIG_AV1_HIGHBITDEPTH
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)52 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
53                                     uint16_t *dst, int dst_stride, int w, int h,
54                                     const int16_t *x_filters, int x0_qn,
55                                     int x_step_qn, int bd) {
56   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
57   for (int y = 0; y < h; ++y) {
58     int x_qn = x0_qn;
59     for (int x = 0; x < w; ++x) {
60       const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
61       const int x_filter_idx =
62           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
63       assert(x_filter_idx <= RS_SUBPEL_MASK);
64       const int16_t *const x_filter =
65           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
66       int sum = 0;
67       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
68         sum += src_x[k] * x_filter[k];
69       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
70       x_qn += x_step_qn;
71     }
72     src += src_stride;
73     dst += dst_stride;
74   }
75 }
76 #endif  // CONFIG_AV1_HIGHBITDEPTH
77 
// Separable 2-D sub-pixel filter for 8-bit single-reference prediction.
//
// Pass 1 filters h + taps_y - 1 source rows horizontally into the int16_t
// scratch buffer im_block, rounding by conv_params->round_0. Pass 2 filters
// the scratch columns vertically, rounds by conv_params->round_1, removes the
// offsets that were added to keep the sums non-negative, applies the remaining
// rounding and stores clipped pixels to dst.
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;

  // Pass 1: horizontal filter. Start fo_vert rows above the block so the
  // vertical pass has the rows it needs.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    const uint8_t *const in =
        src + (row - fo_vert) * src_stride - fo_horiz;
    int16_t *const im_row = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * in[col + tap];
      }

      // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can
      // be beyond the following range. For better prediction, a clamping can be
      // added for 12 tap filter to ensure the horizontal filtering result is
      // within 16 bit. The same applies to the vertical filtering.
      assert(taps_x > 8 ||
             (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // Pass 2: vertical filter. Output row r reads scratch rows r .. r+taps_y-1.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int res_offset = (1 << (offset_bits - conv_params->round_1)) +
                         (1 << (offset_bits - conv_params->round_1 - 1));
  for (int row = 0; row < h; ++row) {
    const int16_t *const im_top = im_block + row * im_stride;
    uint8_t *const out = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_top[tap * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      const int16_t res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1) - res_offset;
      out[col] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
136 
av1_convolve_y_sr_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)137 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
138                          int dst_stride, int w, int h,
139                          const InterpFilterParams *filter_params_y,
140                          const int subpel_y_qn) {
141   const int fo_vert = filter_params_y->taps / 2 - 1;
142 
143   // vertical filter
144   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
145       filter_params_y, subpel_y_qn & SUBPEL_MASK);
146   for (int y = 0; y < h; ++y) {
147     for (int x = 0; x < w; ++x) {
148       int32_t res = 0;
149       for (int k = 0; k < filter_params_y->taps; ++k) {
150         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
151       }
152       dst[y * dst_stride + x] =
153           clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
154     }
155   }
156 }
157 
// Horizontal-only sub-pixel filter for 8-bit single-reference prediction.
// The kernel sum is rounded in two steps, first by conv_params->round_0 and
// then by the remaining FILTER_BITS - round_0 bits, before being clipped with
// clip_pixel() and stored to dst.
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const int subpel_x_qn, ConvolveParams *conv_params) {
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // Leftmost sample read for column 0 of this row.
    const uint8_t *const in = src + row * src_stride - fo_horiz;
    uint8_t *const out = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        acc += kernel[tap] * in[col + tap];
      }
      acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      out[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, bits));
    }
  }
}
184 
185 // This function is exactly the same as av1_convolve_2d_sr_c, and is an
186 // optimized version for intrabc. Use the following 2-tap filter:
187 // DECLARE_ALIGNED(256, static const int16_t,
188 //                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
189 //   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 //   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 // };
av1_convolve_2d_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params)192 void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
193                                   uint8_t *dst, int dst_stride, int w, int h,
194                                   const InterpFilterParams *filter_params_x,
195                                   const InterpFilterParams *filter_params_y,
196                                   const int subpel_x_qn, const int subpel_y_qn,
197                                   ConvolveParams *conv_params) {
198   assert(subpel_x_qn == 8);
199   assert(subpel_y_qn == 8);
200   assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
201   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
202   (void)filter_params_x;
203   (void)subpel_x_qn;
204   (void)filter_params_y;
205   (void)subpel_y_qn;
206   (void)conv_params;
207 
208   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
209   int im_h = h + 1;
210   int im_stride = w;
211   assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
212   const int bd = 8;
213 
214   // horizontal filter
215   // explicitly operate for subpel_x_qn = 8.
216   int16_t *im = im_block;
217   for (int y = 0; y < im_h; ++y) {
218     for (int x = 0; x < w; ++x) {
219       const int32_t sum = (1 << bd) + src[x] + src[x + 1];
220       assert(0 <= sum && sum < (1 << (bd + 2)));
221       im[x] = sum;
222     }
223     src += src_stride;
224     im += im_stride;
225   }
226 
227   // vertical filter
228   // explicitly operate for subpel_y_qn = 8.
229   int16_t *src_vert = im_block;
230   for (int y = 0; y < h; ++y) {
231     for (int x = 0; x < w; ++x) {
232       const int32_t sum =
233           (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
234       assert(0 <= sum && sum < (1 << (bd + 4)));
235       const int16_t res =
236           ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
237       dst[x] = clip_pixel(res);
238     }
239     src_vert += im_stride;
240     dst += dst_stride;
241   }
242 }
243 
244 // This function is exactly the same as av1_convolve_y_sr_c, and is an
245 // optimized version for intrabc.
av1_convolve_y_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)246 void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
247                                  uint8_t *dst, int dst_stride, int w, int h,
248                                  const InterpFilterParams *filter_params_y,
249                                  const int subpel_y_qn) {
250   assert(subpel_y_qn == 8);
251   assert(filter_params_y->taps == 2);
252   (void)filter_params_y;
253   (void)subpel_y_qn;
254 
255   // vertical filter
256   // explicitly operate for subpel_y_qn = 8.
257   for (int y = 0; y < h; ++y) {
258     for (int x = 0; x < w; ++x) {
259       const int32_t res = src[x] + src[src_stride + x];
260       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
261     }
262     src += src_stride;
263     dst += dst_stride;
264   }
265 }
266 
267 // This function is exactly the same as av1_convolve_x_sr_c, and is an
268 // optimized version for intrabc.
av1_convolve_x_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params)269 void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
270                                  uint8_t *dst, int dst_stride, int w, int h,
271                                  const InterpFilterParams *filter_params_x,
272                                  const int subpel_x_qn,
273                                  ConvolveParams *conv_params) {
274   assert(subpel_x_qn == 8);
275   assert(filter_params_x->taps == 2);
276   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
277   (void)filter_params_x;
278   (void)subpel_x_qn;
279   (void)conv_params;
280 
281   // horizontal filter
282   // explicitly operate for subpel_x_qn = 8.
283   for (int y = 0; y < h; ++y) {
284     for (int x = 0; x < w; ++x) {
285       const int32_t res = src[x] + src[x + 1];
286       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
287     }
288     src += src_stride;
289     dst += dst_stride;
290   }
291 }
292 
// Separable 2-D sub-pixel filter for 8-bit compound prediction.
//
// Pass 1 filters h + taps_y - 1 source rows horizontally into the int16_t
// scratch buffer im_block, rounding by conv_params->round_0. Pass 2 filters
// the scratch columns vertically and rounds by conv_params->round_1.
//
// What happens to each pass-2 result depends on conv_params:
//  - do_average == 0: the offset-carrying result is stored in
//    conv_params->dst (stride conv_params->dst_stride); dst is not written.
//  - do_average != 0: the result is combined with the value already in
//    conv_params->dst, either weighted by fwd_offset / bck_offset and shifted
//    by DIST_PRECISION_BITS (use_dist_wtd_comp_avg) or as a plain average;
//    the offsets are then removed and the rounded, clipped pixel goes to dst.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  // im_block lives on the stack with a fixed size; guard its bounds the same
  // way av1_convolve_2d_sr_c does for its identically sized buffer.
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  // horizontal filter
  // Start fo_vert rows above the block so the vertical pass has every row it
  // reads.
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // The initial offset keeps the sum non-negative (checked below for
      // filters of up to 8 taps).
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        // Combine with the value already held in the compound buffer.
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        // Remove the offsets carried through both passes before the final
        // rounding.
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
               (1 << (offset_bits - conv_params->round_1 - 1));
        dst[y * dst_stride + x] =
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
360 
// Vertical-only sub-pixel filter for 8-bit compound prediction.
//
// The kernel sum is scaled up by FILTER_BITS - round_0 bits, rounded by
// conv_params->round_1 and biased by round_offset. When
// conv_params->do_average is clear that value is stored in conv_params->dst;
// otherwise it is combined with the value already there (weighted or plain
// average), the bias is removed, and the rounded, clipped pixel goes to dst.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps = filter_params_y->taps;
  const int fo_vert = taps / 2 - 1;
  const int bd = 8;
  const int bits = FILTER_BITS - conv_params->round_0;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // First source row touched by the kernel for this output row.
    const uint8_t *const top = src + (row - fo_vert) * src_stride;
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < taps; ++tap) {
        res += kernel[tap] * top[tap * src_stride + col];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;

      if (!conv_params->do_average) {
        // First prediction of the pair: keep it in the compound buffer.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend =
            blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend += res;
        blend = blend >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
    }
  }
}
407 
// Horizontal-only sub-pixel filter for 8-bit compound prediction.
//
// The kernel sum is rounded by conv_params->round_0, scaled up by
// FILTER_BITS - round_1 bits and biased by round_offset. When
// conv_params->do_average is clear that value is stored in conv_params->dst;
// otherwise it is combined with the value already there (weighted or plain
// average), the bias is removed, and the rounded, clipped pixel goes to dst.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const int subpel_x_qn,
                               ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int bd = 8;
  const int bits = FILTER_BITS - conv_params->round_1;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // Leftmost sample read for column 0 of this row.
    const uint8_t *const in = src + row * src_stride - fo_horiz;
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < taps; ++tap) {
        res += kernel[tap] * in[col + tap];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        // First prediction of the pair: keep it in the compound buffer.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend =
            blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend += res;
        blend = blend >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
    }
  }
}
454 
// Unfiltered (no sub-pixel offset) path for 8-bit compound prediction.
//
// Each source pixel is shifted left by 2 * FILTER_BITS - round_0 - round_1
// bits and biased by round_offset. When conv_params->do_average is clear that
// value is stored in conv_params->dst; otherwise it is combined with the value
// already there (weighted or plain average), the bias is removed, and the
// rounded, clipped pixel goes to dst.
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bd = 8;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE res = in[col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        // First prediction of the pair: keep it in the compound buffer.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend =
            blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend += res;
        blend = blend >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, bits));
    }
  }
}
489 
// Separable 2-D filter for 8-bit prediction from a scaled reference.
//
// Unlike the unscaled paths, the source position advances by x_step_qn /
// y_step_qn per output sample, so the kernel is re-selected for every sample
// from the fractional part of the running position.
//
// Pass 1 filters im_h source rows horizontally into im_block (rounded by
// conv_params->round_0). Pass 2 walks each output column, filters vertically
// and rounds by conv_params->round_1. The result is then either stored in
// conv_params->dst (compound, do_average clear), averaged with the value
// already there and written to dst (compound, do_average set), or written
// straight to dst (non-compound); the two dst paths first remove the offsets
// and apply the remaining rounding.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  // Number of source rows the vertical pass can reach for h output rows.
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  assert(bits >= 0);
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bd = 8;

  // Pass 1: horizontal filter, starting fo_vert rows above the block.
  for (int row = 0; row < im_h; ++row) {
    const uint8_t *const in_row =
        src + (row - fo_vert) * src_stride - fo_horiz;
    int16_t *const im_row = im_block + row * im_stride;
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const uint8_t *const window = in_row + (pos_qn >> SCALE_SUBPEL_BITS);
      const int phase = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, phase);
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel[tap] * window[tap];
      }
      assert(taps_x > 8 ||
             (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      pos_qn += x_step_qn;
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Combined offset carried through both passes; removed before writing dst.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  for (int col = 0; col < w; ++col) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, pos_qn += y_step_qn) {
      // Topmost scratch sample read by the kernel for this output sample.
      const int16_t *const window =
          im_block + (pos_qn >> SCALE_SUBPEL_BITS) * im_stride + col;
      const int phase = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, phase);
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * window[tap * im_stride];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        // First prediction of a compound pair: keep it in the 16-bit buffer.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t value;
      if (conv_params->is_compound) {
        value = dst16[row * dst16_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          value =
              value * conv_params->fwd_offset + res * conv_params->bck_offset;
          value = value >> DIST_PRECISION_BITS;
        } else {
          value += res;
          value = value >> 1;
        }
      } else {
        value = res;
      }
      /* Subtract round offset and convolve round */
      value -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(value, bits));
    }
  }
}
577 
// Forwards to av1_convolve_2d_scale() with all arguments unchanged. In debug
// builds it first checks that a compound call has been given an intermediate
// buffer in conv_params->dst.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
591 
// Picks the compound-prediction kernel that matches which sub-pixel offsets
// are non-zero: copy (neither), x-only, y-only, or full 2-D (both).
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (subpel_x_qn == 0) {
    if (subpel_y_qn == 0) {
      av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                    conv_params);
    } else {
      av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                              filter_params_y, subpel_y_qn, conv_params);
    }
    return;
  }

  if (subpel_y_qn == 0) {
    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, subpel_x_qn, conv_params);
    return;
  }

  // Both offsets are non-zero here.
  av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                           filter_params_x, filter_params_y, subpel_x_qn,
                           subpel_y_qn, conv_params);
}
615 
// Picks the single-reference kernel that matches which sub-pixel offsets are
// non-zero: plain copy (neither), x-only, y-only, or full 2-D (both).
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (subpel_x_qn == 0) {
    if (subpel_y_qn == 0) {
      aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
    } else {
      av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                        subpel_y_qn);
    }
    return;
  }

  if (subpel_y_qn == 0) {
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                      subpel_x_qn, conv_params);
    return;
  }

  // Both offsets are non-zero here.
  av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                     filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
}
637 
// Top-level 8-bit convolve entry point. interp_filters[0] is the horizontal
// filter and interp_filters[1] the vertical one. Dispatch order:
//   1. 2-tap (IntraBC) filters with a non-zero subpel offset use the
//      dedicated *_intrabc kernels.
//   2. Otherwise: scaled -> scale wrapper, compound -> compound facade,
//      else -> single-prediction facade.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params) {
  const InterpFilterParams *const filter_x = interp_filters[0];
  const InterpFilterParams *const filter_y = interp_filters[1];

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // 2-tap filter indicates that it is for IntraBC.
  const bool two_tap = filter_x->taps == 2 || filter_y->taps == 2;
  if (two_tap) {
    assert(filter_x->taps == 2 && filter_y->taps == 2);
    assert(!scaled);
    const bool frac_x = subpel_x_qn != 0;
    const bool frac_y = subpel_y_qn != 0;
    if (frac_x && frac_y) {
      av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
                                 filter_x, filter_y, subpel_x_qn, subpel_y_qn,
                                 conv_params);
      return;
    }
    if (frac_x) {
      av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
                                filter_x, subpel_x_qn, conv_params);
      return;
    }
    if (frac_y) {
      av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
                                filter_y, subpel_y_qn);
      return;
    }
    // Both offsets are zero: continue with the generic dispatch below.
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, filter_x,
                              filter_y, subpel_x_qn, x_step_q4, subpel_y_qn,
                              y_step_q4, conv_params);
    return;
  }
  if (conv_params->is_compound) {
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                filter_x, filter_y, subpel_x_qn, subpel_y_qn,
                                conv_params);
    return;
  }
  convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, filter_x,
                            filter_y, subpel_x_qn, subpel_y_qn, conv_params);
}
687 
688 #if CONFIG_AV1_HIGHBITDEPTH
// High bit-depth horizontal-only single-reference convolution (C reference).
// Each output sample is the tap-weighted sum of the source row around it,
// rounded first by conv_params->round_0 and then by
// (FILTER_BITS - round_0), and finally passed through clip_pixel_highbd().
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const int subpel_x_qn,
                                ConvolveParams *conv_params, int bd) {
  const int num_taps = filter_params_x->taps;
  const int round_0 = conv_params->round_0;
  const int final_shift = FILTER_BITS - round_0;

  assert(final_shift >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  // Start (taps / 2 - 1) samples to the left so tap 0 lines up with index 0.
  const uint16_t *src_row = src - (num_taps / 2 - 1);
  uint16_t *dst_row = dst;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * src_row[col + tap];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      dst_row[col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, final_shift), bd);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
716 
av1_highbd_convolve_y_sr_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)717 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
718                                 uint16_t *dst, int dst_stride, int w, int h,
719                                 const InterpFilterParams *filter_params_y,
720                                 const int subpel_y_qn, int bd) {
721   const int fo_vert = filter_params_y->taps / 2 - 1;
722   // vertical filter
723   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
724       filter_params_y, subpel_y_qn & SUBPEL_MASK);
725   for (int y = 0; y < h; ++y) {
726     for (int x = 0; x < w; ++x) {
727       int32_t res = 0;
728       for (int k = 0; k < filter_params_y->taps; ++k) {
729         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
730       }
731       dst[y * dst_stride + x] =
732           clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
733     }
734   }
735 }
736 
// High bit-depth separable 2D single-reference convolution (C reference).
// Pass 1 filters horizontally into the 16-bit scratch buffer im_block
// (h + taps_y - 1 rows, stride w); pass 2 filters that buffer vertically.
// Both passes add a positive offset before rounding; the accumulated offset
// is removed again before the final rounding and clip.
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // horizontal filter
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  // Top-left source sample touched by the first tap of the first output.
  const uint16_t *src_row =
      src - (taps_y / 2 - 1) * src_stride - (taps_x / 2 - 1);
  int16_t *im_row = im_block;
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * src_row[col + tap];
      }
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[col] = ROUND_POWER_OF_TWO(acc, round_0);
    }
    src_row += src_stride;
    im_row += im_stride;
  }

  // vertical filter
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Offset contributed by both passes, expressed after the round_1 shift.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      const int32_t res = ROUND_POWER_OF_TWO(acc, round_1) - round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
791 
792 // This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an
793 // optimized version for intrabc. Use the following 2-tap filter:
794 // DECLARE_ALIGNED(256, static const int16_t,
795 //                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
796 //   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
797 //   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
798 // };
av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params,int bd)799 void av1_highbd_convolve_2d_sr_intrabc_c(
800     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
801     int h, const InterpFilterParams *filter_params_x,
802     const InterpFilterParams *filter_params_y, const int subpel_x_qn,
803     const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
804   const int bits =
805       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
806   assert(bits >= 0);
807   assert(subpel_x_qn == 8);
808   assert(subpel_y_qn == 8);
809   assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
810   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
811   (void)filter_params_x;
812   (void)subpel_x_qn;
813   (void)filter_params_y;
814   (void)subpel_y_qn;
815   (void)conv_params;
816 
817   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
818   int im_h = h + 1;
819   int im_stride = w;
820   assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
821 
822   // horizontal filter
823   // explicitly operate for subpel_x_qn = 8.
824   int16_t *im = im_block;
825   for (int y = 0; y < im_h; ++y) {
826     for (int x = 0; x < w; ++x) {
827       int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
828       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
829       sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
830       im[x] = sum;
831     }
832     src += src_stride;
833     im += im_stride;
834   }
835 
836   // vertical filter
837   // explicitly operate for subpel_y_qn = 8.
838   int16_t *src_vert = im_block;
839   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
840   for (int y = 0; y < h; ++y) {
841     for (int x = 0; x < w; ++x) {
842       const int32_t sum =
843           (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
844       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
845       const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
846                           ((1 << (offset_bits - conv_params->round_1)) +
847                            (1 << (offset_bits - conv_params->round_1 - 1)));
848 
849       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
850     }
851     src_vert += im_stride;
852     dst += dst_stride;
853   }
854 }
855 
856 // This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an
857 // optimized version for intrabc.
av1_highbd_convolve_y_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)858 void av1_highbd_convolve_y_sr_intrabc_c(
859     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
860     int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
861     int bd) {
862   assert(subpel_y_qn == 8);
863   assert(filter_params_y->taps == 2);
864   (void)filter_params_y;
865   (void)subpel_y_qn;
866 
867   // vertical filter
868   // explicitly operate for subpel_y_qn = 8.
869   for (int y = 0; y < h; ++y) {
870     for (int x = 0; x < w; ++x) {
871       const int32_t res = src[x] + src[src_stride + x];
872       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
873     }
874     src += src_stride;
875     dst += dst_stride;
876   }
877 }
878 
879 // This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an
880 // optimized version for intrabc.
av1_highbd_convolve_x_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params,int bd)881 void av1_highbd_convolve_x_sr_intrabc_c(
882     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
883     int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
884     ConvolveParams *conv_params, int bd) {
885   const int bits = FILTER_BITS - conv_params->round_0;
886   assert(bits >= 0);
887   assert(subpel_x_qn == 8);
888   assert(filter_params_x->taps == 2);
889   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
890   (void)filter_params_x;
891   (void)subpel_x_qn;
892 
893   // horizontal filter
894   // explicitly operate for subpel_x_qn = 8.
895   for (int y = 0; y < h; ++y) {
896     for (int x = 0; x < w; ++x) {
897       int32_t res = 64 * (src[x] + src[x + 1]);
898       res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
899       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
900     }
901     src += src_stride;
902     dst += dst_stride;
903   }
904 }
905 
// High bit-depth separable 2D convolution for compound prediction
// (C reference).
//
// Pass 1 filters horizontally into the 16-bit scratch buffer im_block
// (h + taps_y - 1 rows, stride w); pass 2 filters that buffer vertically and
// rounds by conv_params->round_1, leaving the rounding offset in place.
//
// If conv_params->do_average is 0 the offset result is stored in
// conv_params->dst (stride conv_params->dst_stride) and dst is not written.
// Otherwise the result is combined with the value already in
// conv_params->dst, either weighted by fwd_offset / bck_offset
// (use_dist_wtd_comp_avg) or as a plain average; the offset is then removed
// and the rounded, clipped sample is written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  // im_block is a fixed-size stack buffer; guard its dimensions the same way
  // av1_highbd_convolve_2d_sr_c does.
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  assert(round_bits >= 0);

  // horizontal filter
  const uint16_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // The offset keeps the intermediate sum non-negative (asserted below).
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        // Remove the offset accumulated by the two filter passes.
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
               (1 << (offset_bits - conv_params->round_1 - 1));
        dst[y * dst_stride + x] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
974 
// High bit-depth horizontal-only convolution for compound prediction
// (C reference). The filtered value is rounded by round_0, scaled by
// 2^(FILTER_BITS - round_1) and biased by round_offset. With do_average == 0
// it is stored in conv_params->dst; otherwise it is combined with the value
// already there (weighted or plain average), the bias is removed and the
// rounded, clipped sample is written to dst.
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_x,
                                      const int subpel_x_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_x->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  // horizontal filter
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const uint16_t *src_row = src - (num_taps / 2 - 1);
  for (int row = 0; row < h; ++row) {
    CONV_BUF_TYPE *const buf_row = dst16 + row * dst16_stride;
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += kernel[tap] * src_row[col + tap];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        buf_row[col] = res;
        continue;
      }

      int32_t avg = buf_row[col];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = (avg * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        avg = (avg + res) >> 1;
      }
      avg -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, round_bits), bd);
    }
    src_row += src_stride;
  }
}
1022 
// High bit-depth vertical-only convolution for compound prediction
// (C reference). The filtered value is scaled by 2^(FILTER_BITS - round_0),
// rounded by round_1 and biased by round_offset. With do_average == 0 it is
// stored in conv_params->dst; otherwise it is combined with the value already
// there (weighted or plain average), the bias is removed and the rounded,
// clipped sample is written to dst.
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_y,
                                      const int subpel_y_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  // vertical filter
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const uint16_t *src_row = src - (num_taps / 2 - 1) * src_stride;
  for (int row = 0; row < h; ++row) {
    CONV_BUF_TYPE *const buf_row = dst16 + row * dst16_stride;
    for (int col = 0; col < w; ++col) {
      const uint16_t *const column = src_row + col;
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += kernel[tap] * column[tap * src_stride];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      if (!conv_params->do_average) {
        buf_row[col] = res;
        continue;
      }

      int32_t avg = buf_row[col];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = (avg * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        avg = (avg + res) >> 1;
      }
      avg -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, round_bits), bd);
    }
    src_row += src_stride;
  }
}
1070 
// High bit-depth compound "copy" kernel for the case with no filtering in
// either direction (C reference). Each source sample is shifted left by
// (2 * FILTER_BITS - round_0 - round_1) and biased by round_offset. With
// do_average == 0 it is stored in conv_params->dst; otherwise it is combined
// with the value already there (weighted or plain average), the bias is
// removed and the rounded, clipped sample is written to dst.
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
                                            uint16_t *dst, int dst_stride,
                                            int w, int h,
                                            ConvolveParams *conv_params,
                                            int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const src_row = src + row * src_stride;
    CONV_BUF_TYPE *const buf_row = dst16 + row * dst16_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE res = src_row[col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        buf_row[col] = res;
        continue;
      }

      int32_t avg = buf_row[col];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = (avg * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        avg = (avg + res) >> 1;
      }
      avg -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, bits), bd);
    }
  }
}
1107 
// High bit-depth separable 2D convolution with scaling (C reference).
// The source position advances by x_step_qn / y_step_qn per output sample,
// starting at subpel_x_qn / subpel_y_qn; the integer part (>>
// SCALE_SUBPEL_BITS) selects the source sample and the fractional part
// selects the filter kernel.
//
// Pass 1 filters every needed source row horizontally into im_block
// (stride w). Pass 2 walks im_block column by column and filters vertically.
// Output handling:
//   - compound, do_average == 0: offset result stored in conv_params->dst.
//   - compound, do_average != 0: combined with conv_params->dst (weighted or
//     plain average), offset removed, rounded, clipped, written to dst.
//   - non-compound: offset removed, rounded, clipped, written to dst.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  // Rows of intermediate data needed to cover the last output row's taps.
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // horizontal filter
  const uint16_t *src_row = src - fo_vert * src_stride;
  int16_t *im_row = im_block;
  for (int row = 0; row < im_rows; ++row) {
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      // First source sample touched by tap 0 for this output column.
      const uint16_t *const first_tap =
          src_row + (x_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      const int phase = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, phase);
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel[tap] * first_tap[tap];
      }
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
      x_qn += x_step_qn;
    }
    src_row += src_stride;
    im_row += im_stride;
  }

  // vertical filter
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row) {
      // First intermediate sample touched by tap 0 for this output row.
      const int16_t *const first_tap =
          im_block + (y_qn >> SCALE_SUBPEL_BITS) * im_stride + col;
      const int phase = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, phase);
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * first_tap[tap * im_stride];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
      } else {
        int32_t val;
        if (conv_params->is_compound) {
          val = dst16[row * dst16_stride + col];
          if (conv_params->use_dist_wtd_comp_avg) {
            val = (val * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
          } else {
            val = (val + res) >> 1;
          }
        } else {
          val = res;
        }
        /* Subtract round offset and convolve round */
        val -= round_offset;
        dst[row * dst_stride + col] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(val, bits), bd);
      }
      y_qn += y_step_qn;
    }
  }
}
1195 
// High bit-depth compound dispatcher. The kernel is chosen by which subpel
// offsets are non-zero: both -> 2D, x only -> horizontal, y only -> vertical,
// neither -> the compound copy kernel.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool frac_x = subpel_x_qn != 0;
  const bool frac_y = subpel_y_qn != 0;

  if (frac_x && frac_y) {
    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                    filter_params_x, filter_params_y,
                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }
  if (frac_x) {
    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  if (frac_y) {
    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, conv_params,
                                   bd);
    return;
  }
  av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                       conv_params, bd);
}
1221 
// High bit-depth single-prediction dispatcher. The kernel is chosen by which
// subpel offsets are non-zero: both -> 2D, x only -> horizontal,
// y only -> vertical, neither -> plain copy.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool frac_x = subpel_x_qn != 0;
  const bool frac_y = subpel_y_qn != 0;

  if (frac_x && frac_y) {
    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params, bd);
    return;
  }
  if (frac_x) {
    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }
  if (frac_y) {
    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_y, subpel_y_qn, bd);
    return;
  }
  aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
1245 
// Top-level dispatcher for high bit-depth inter-prediction convolution.
//
// src8/dst8 are converted with CONVERT_TO_SHORTPTR before use.
// interp_filters[0] is the horizontal filter, interp_filters[1] the vertical
// one. The call is routed, in order of precedence, to:
//   1. an IntraBC kernel when the filters are 2-tap and at least one sub-pel
//      offset is non-zero,
//   2. the scaled 2D convolve when `scaled` is non-zero,
//   3. the compound facade when conv_params->is_compound is set,
//   4. the single-reference facade otherwise.
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  const InterpFilterParams *const horiz_params = interp_filters[0];
  const InterpFilterParams *const vert_params = interp_filters[1];
  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  // 2-tap filter indicates that it is for IntraBC.
  if (horiz_params->taps == 2 || vert_params->taps == 2) {
    assert(horiz_params->taps == 2 && vert_params->taps == 2);
    assert(!scaled);

    const bool frac_x = subpel_x_qn != 0;
    const bool frac_y = subpel_y_qn != 0;

    if (frac_x && frac_y) {
      av1_highbd_convolve_2d_sr_intrabc_c(
          src, src_stride, dst, dst_stride, w, h, horiz_params, vert_params,
          subpel_x_qn, subpel_y_qn, conv_params, bd);
      return;
    }
    if (frac_x) {
      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                         horiz_params, subpel_x_qn, conv_params,
                                         bd);
      return;
    }
    if (frac_y) {
      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                         vert_params, subpel_y_qn, bd);
      return;
    }
    // Both offsets are zero: fall through to the generic paths below.
  }

  if (scaled) {
    // A compound prediction needs its intermediate destination buffer.
    assert(!conv_params->is_compound || conv_params->dst != NULL);
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                                 horiz_params, vert_params, subpel_x_qn,
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                 bd);
    return;
  }

  if (conv_params->is_compound) {
    highbd_convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                       horiz_params, vert_params, subpel_x_qn,
                                       subpel_y_qn, conv_params, bd);
    return;
  }

  highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                                   horiz_params, vert_params, subpel_x_qn,
                                   subpel_y_qn, conv_params, bd);
}
1301 #endif  // CONFIG_AV1_HIGHBITDEPTH
1302 
1303 // Note: Fixed size intermediate buffers, place limits on parameters
1304 // of some functions. 2d filtering proceeds in 2 steps:
1305 //   (1) Interpolate horizontally into an intermediate buffer, temp.
1306 //   (2) Interpolate temp vertically to derive the sub-pixel result.
1307 // Deriving the maximum number of rows in the temp buffer (263):
1308 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1309 // --Largest block size is 128x128 pixels.
1310 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1311 //   original frame (in 1/16th pixel units).
1312 // --Must round-up because block may be located at sub-pixel position.
1313 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1314 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1315 #define WIENER_MAX_EXT_SIZE 263
1316 
1317 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
horz_scalar_product(const uint8_t * a,const int16_t * b)1318 static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1319   int sum = 0;
1320   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1321   return sum;
1322 }
1323 
1324 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1325 static inline int highbd_horz_scalar_product(const uint16_t *a,
1326                                              const int16_t *b) {
1327   int sum = 0;
1328   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1329   return sum;
1330 }
1331 #endif
1332 
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1333 static inline int highbd_vert_scalar_product(const uint16_t *a,
1334                                              ptrdiff_t a_stride,
1335                                              const int16_t *b) {
1336   int sum = 0;
1337   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1338   return sum;
1339 }
1340 
get_filter_base(const int16_t * filter)1341 static const InterpKernel *get_filter_base(const int16_t *filter) {
1342   // NOTE: This assumes that the filter table is 256-byte aligned.
1343   // TODO(agrange) Modify to make independent of table alignment.
1344   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1345 }
1346 
// Returns the index, counted in whole InterpKernel entries, of kernel `f`
// relative to the table start `base`.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  const ptrdiff_t index = kernel - base;
  return (int)index;
}
1350 
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1351 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1352                                        uint16_t *dst, ptrdiff_t dst_stride,
1353                                        const InterpKernel *x_filters, int x0_q4,
1354                                        int x_step_q4, int w, int h,
1355                                        int round0_bits) {
1356   const int bd = 8;
1357   src -= SUBPEL_TAPS / 2 - 1;
1358   for (int y = 0; y < h; ++y) {
1359     int x_q4 = x0_q4;
1360     for (int x = 0; x < w; ++x) {
1361       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1362       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1363       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1364                            (1 << (bd + FILTER_BITS - 1));
1365       const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1366       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1367                                WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1368       x_q4 += x_step_q4;
1369     }
1370     src += src_stride;
1371     dst += dst_stride;
1372   }
1373 }
1374 
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1375 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1376                                       uint8_t *dst, ptrdiff_t dst_stride,
1377                                       const InterpKernel *y_filters, int y0_q4,
1378                                       int y_step_q4, int w, int h,
1379                                       int round1_bits) {
1380   const int bd = 8;
1381   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1382 
1383   for (int x = 0; x < w; ++x) {
1384     int y_q4 = y0_q4;
1385     for (int y = 0; y < h; ++y) {
1386       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1387       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1388       const int rounding =
1389           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1390           (1 << (bd + round1_bits - 1));
1391       const int sum =
1392           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1393       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1394       y_q4 += y_step_q4;
1395     }
1396     ++src;
1397     ++dst;
1398   }
1399 }
1400 
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params)1401 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1402                                    uint8_t *dst, ptrdiff_t dst_stride,
1403                                    const int16_t *filter_x, int x_step_q4,
1404                                    const int16_t *filter_y, int y_step_q4,
1405                                    int w, int h,
1406                                    const WienerConvolveParams *conv_params) {
1407   const InterpKernel *const filters_x = get_filter_base(filter_x);
1408   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1409 
1410   const InterpKernel *const filters_y = get_filter_base(filter_y);
1411   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1412 
1413   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1414   const int intermediate_height =
1415       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1416   memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1417 
1418   assert(w <= MAX_SB_SIZE);
1419   assert(h <= MAX_SB_SIZE);
1420   assert(y_step_q4 <= 32);
1421   assert(x_step_q4 <= 32);
1422 
1423   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1424                              src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1425                              x_step_q4, w, intermediate_height,
1426                              conv_params->round_0);
1427   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1428                             MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1429                             y_step_q4, w, h, conv_params->round_1);
1430 }
1431 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1432 
1433 #if CONFIG_AV1_HIGHBITDEPTH
1434 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1435 static void highbd_convolve_add_src_horiz_hip(
1436     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1437     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1438     int x_step_q4, int w, int h, int round0_bits, int bd) {
1439   const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1440   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1441   src -= SUBPEL_TAPS / 2 - 1;
1442   for (int y = 0; y < h; ++y) {
1443     int x_q4 = x0_q4;
1444     for (int x = 0; x < w; ++x) {
1445       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1446       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1447       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1448                            (1 << (bd + FILTER_BITS - 1));
1449       const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1450       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1451                                extraprec_clamp_limit - 1);
1452       x_q4 += x_step_q4;
1453     }
1454     src += src_stride;
1455     dst += dst_stride;
1456   }
1457 }
1458 
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1459 static void highbd_convolve_add_src_vert_hip(
1460     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1461     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1462     int y_step_q4, int w, int h, int round1_bits, int bd) {
1463   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1464   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1465   for (int x = 0; x < w; ++x) {
1466     int y_q4 = y0_q4;
1467     for (int y = 0; y < h; ++y) {
1468       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1469       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1470       const int rounding =
1471           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1472           (1 << (bd + round1_bits - 1));
1473       const int sum =
1474           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1475       dst[y * dst_stride] =
1476           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1477       y_q4 += y_step_q4;
1478     }
1479     ++src;
1480     ++dst;
1481   }
1482 }
1483 
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params,int bd)1484 void av1_highbd_wiener_convolve_add_src_c(
1485     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1486     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1487     const int16_t *filter_y, int y_step_q4, int w, int h,
1488     const WienerConvolveParams *conv_params, int bd) {
1489   const InterpKernel *const filters_x = get_filter_base(filter_x);
1490   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1491 
1492   const InterpKernel *const filters_y = get_filter_base(filter_y);
1493   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1494 
1495   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1496   const int intermediate_height =
1497       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1498 
1499   assert(w <= MAX_SB_SIZE);
1500   assert(h <= MAX_SB_SIZE);
1501   assert(y_step_q4 <= 32);
1502   assert(x_step_q4 <= 32);
1503   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1504 
1505   highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1506                                     src_stride, temp, MAX_SB_SIZE, filters_x,
1507                                     x0_q4, x_step_q4, w, intermediate_height,
1508                                     conv_params->round_0, bd);
1509   highbd_convolve_add_src_vert_hip(
1510       temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1511       filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1512 }
1513 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1514 #endif  // CONFIG_AV1_HIGHBITDEPTH
1515