1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <string.h>
14
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27 int dst_stride, int w, int h,
28 const int16_t *x_filters, int x0_qn,
29 int x_step_qn) {
30 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31 for (int y = 0; y < h; ++y) {
32 int x_qn = x0_qn;
33 for (int x = 0; x < w; ++x) {
34 const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35 const int x_filter_idx =
36 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37 assert(x_filter_idx <= RS_SUBPEL_MASK);
38 const int16_t *const x_filter =
39 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40 int sum = 0;
41 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42 sum += src_x[k] * x_filter[k];
43 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44 x_qn += x_step_qn;
45 }
46 src += src_stride;
47 dst += dst_stride;
48 }
49 }
50
51 #if CONFIG_AV1_HIGHBITDEPTH
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)52 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
53 uint16_t *dst, int dst_stride, int w, int h,
54 const int16_t *x_filters, int x0_qn,
55 int x_step_qn, int bd) {
56 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
57 for (int y = 0; y < h; ++y) {
58 int x_qn = x0_qn;
59 for (int x = 0; x < w; ++x) {
60 const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
61 const int x_filter_idx =
62 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
63 assert(x_filter_idx <= RS_SUBPEL_MASK);
64 const int16_t *const x_filter =
65 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
66 int sum = 0;
67 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
68 sum += src_x[k] * x_filter[k];
69 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
70 x_qn += x_step_qn;
71 }
72 src += src_stride;
73 dst += dst_stride;
74 }
75 }
76 #endif // CONFIG_AV1_HIGHBITDEPTH
77
// 8-bit single-reference separable 2D convolution. The horizontal pass
// writes offset sums, rounded by round_0, into a 16-bit intermediate block of
// h + taps_y - 1 rows. The vertical pass filters that block, rounds by
// round_1, removes the accumulated offsets and clips to pixel range.
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int im_stride = w;
  const int im_h = h + taps_y - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;

  // Horizontal pass. Input starts fo_vert rows above and fo_horiz columns to
  // the left of src, so tap k of output column c reads in_row[c + k].
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const uint8_t *in_row = src - fo_vert * src_stride - fo_horiz;
  int16_t *im_row = im_block;
  for (int r = 0; r < im_h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int k = 0; k < taps_x; ++k) {
        acc += x_filter[k] * in_row[c + k];
      }

      // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result
      // can be beyond the following range. For better prediction, a clamping
      // can be added for 12 tap filter to ensure the horizontal filtering
      // result is within 16 bit. The same applies to the vertical filtering.
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
    in_row += src_stride;
    im_row += im_stride;
  }

  // Vertical pass. Output row r reads intermediate rows r .. r + taps_y - 1.
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  uint8_t *out_row = dst;
  for (int r = 0; r < h; ++r) {
    const int16_t *const im_top = im_block + r * im_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        acc += y_filter[k] * im_top[k * im_stride + c];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      // Combined offset contributed by the two passes after round_1.
      const int round_offset = (1 << (offset_bits - round_1)) +
                               (1 << (offset_bits - round_1 - 1));
      const int16_t res =
          (int16_t)(ROUND_POWER_OF_TWO(acc, round_1) - round_offset);
      out_row[c] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
    out_row += dst_stride;
  }
}
136
av1_convolve_y_sr_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)137 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
138 int dst_stride, int w, int h,
139 const InterpFilterParams *filter_params_y,
140 const int subpel_y_qn) {
141 const int fo_vert = filter_params_y->taps / 2 - 1;
142
143 // vertical filter
144 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
145 filter_params_y, subpel_y_qn & SUBPEL_MASK);
146 for (int y = 0; y < h; ++y) {
147 for (int x = 0; x < w; ++x) {
148 int32_t res = 0;
149 for (int k = 0; k < filter_params_y->taps; ++k) {
150 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
151 }
152 dst[y * dst_stride + x] =
153 clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
154 }
155 }
156 }
157
// 8-bit single-reference horizontal-only convolution. The filtered sum is
// rounded in two steps, first by round_0 and then by the remaining
// FILTER_BITS - round_0 bits, before being clipped to pixel range.
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const int subpel_x_qn, ConvolveParams *conv_params) {
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  // in_row[c + k] is the sample under tap k for output column c.
  const uint8_t *in_row = src - fo_horiz;
  uint8_t *out_row = dst;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int k = 0; k < taps; ++k) {
        acc += x_filter[k] * in_row[c + k];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out_row[c] = clip_pixel(ROUND_POWER_OF_TWO(acc, bits));
    }
    in_row += src_stride;
    out_row += dst_stride;
  }
}
184
185 // This function is exactly the same as av1_convolve_2d_sr_c, and is an
186 // optimized version for intrabc. Use the following 2-tap filter:
187 // DECLARE_ALIGNED(256, static const int16_t,
188 // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
189 // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 // };
av1_convolve_2d_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params)192 void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
193 uint8_t *dst, int dst_stride, int w, int h,
194 const InterpFilterParams *filter_params_x,
195 const InterpFilterParams *filter_params_y,
196 const int subpel_x_qn, const int subpel_y_qn,
197 ConvolveParams *conv_params) {
198 assert(subpel_x_qn == 8);
199 assert(subpel_y_qn == 8);
200 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
201 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
202 (void)filter_params_x;
203 (void)subpel_x_qn;
204 (void)filter_params_y;
205 (void)subpel_y_qn;
206 (void)conv_params;
207
208 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
209 int im_h = h + 1;
210 int im_stride = w;
211 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
212 const int bd = 8;
213
214 // horizontal filter
215 // explicitly operate for subpel_x_qn = 8.
216 int16_t *im = im_block;
217 for (int y = 0; y < im_h; ++y) {
218 for (int x = 0; x < w; ++x) {
219 const int32_t sum = (1 << bd) + src[x] + src[x + 1];
220 assert(0 <= sum && sum < (1 << (bd + 2)));
221 im[x] = sum;
222 }
223 src += src_stride;
224 im += im_stride;
225 }
226
227 // vertical filter
228 // explicitly operate for subpel_y_qn = 8.
229 int16_t *src_vert = im_block;
230 for (int y = 0; y < h; ++y) {
231 for (int x = 0; x < w; ++x) {
232 const int32_t sum =
233 (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
234 assert(0 <= sum && sum < (1 << (bd + 4)));
235 const int16_t res =
236 ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
237 dst[x] = clip_pixel(res);
238 }
239 src_vert += im_stride;
240 dst += dst_stride;
241 }
242 }
243
244 // This function is exactly the same as av1_convolve_y_sr_c, and is an
245 // optimized version for intrabc.
av1_convolve_y_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)246 void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
247 uint8_t *dst, int dst_stride, int w, int h,
248 const InterpFilterParams *filter_params_y,
249 const int subpel_y_qn) {
250 assert(subpel_y_qn == 8);
251 assert(filter_params_y->taps == 2);
252 (void)filter_params_y;
253 (void)subpel_y_qn;
254
255 // vertical filter
256 // explicitly operate for subpel_y_qn = 8.
257 for (int y = 0; y < h; ++y) {
258 for (int x = 0; x < w; ++x) {
259 const int32_t res = src[x] + src[src_stride + x];
260 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
261 }
262 src += src_stride;
263 dst += dst_stride;
264 }
265 }
266
267 // This function is exactly the same as av1_convolve_x_sr_c, and is an
268 // optimized version for intrabc.
av1_convolve_x_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params)269 void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
270 uint8_t *dst, int dst_stride, int w, int h,
271 const InterpFilterParams *filter_params_x,
272 const int subpel_x_qn,
273 ConvolveParams *conv_params) {
274 assert(subpel_x_qn == 8);
275 assert(filter_params_x->taps == 2);
276 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
277 (void)filter_params_x;
278 (void)subpel_x_qn;
279 (void)conv_params;
280
281 // horizontal filter
282 // explicitly operate for subpel_x_qn = 8.
283 for (int y = 0; y < h; ++y) {
284 for (int x = 0; x < w; ++x) {
285 const int32_t res = src[x] + src[x + 1];
286 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
287 }
288 src += src_stride;
289 dst += dst_stride;
290 }
291 }
292
// 8-bit compound separable 2D convolution. The horizontal pass fills a 16-bit
// intermediate block; the vertical pass produces an offset, round_1-rounded
// value per pixel. When conv_params->do_average is 0 that value is stored in
// conv_params->dst. Otherwise it is combined with the value already stored
// there (weighted by fwd_offset / bck_offset when use_dist_wtd_comp_avg is
// set, plain average otherwise), the offset is removed, and the clipped
// result is written to dst.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = w;
  // im_block is sized for blocks of at most MAX_SB_SIZE x MAX_SB_SIZE; this is
  // the same guard av1_convolve_2d_sr_c applies to its identical buffer.
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Offset keeps the sum non-negative for filters of up to 8 taps (see
      // the assert below).
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        // Second prediction: blend with the stored first prediction.
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        // Remove the offsets both passes left in the rounded value.
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
               (1 << (offset_bits - conv_params->round_1 - 1));
        dst[y * dst_stride + x] =
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
      } else {
        // First prediction: keep the offset value for the later blend.
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
360
// 8-bit compound vertical-only convolution. The filtered sum is scaled up by
// FILTER_BITS - round_0 bits, rounded by round_1 and offset by round_offset.
// It is then either stored in conv_params->dst (do_average == 0) or blended
// with the value already stored there and written, clipped, to dst.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps = filter_params_y->taps;
  const int fo_vert = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;

  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Row of the first tap for output row 0.
  const uint8_t *const in_top = src - fo_vert * src_stride;
  for (int r = 0; r < h; ++r) {
    const uint8_t *const in_rows = in_top + r * src_stride;
    for (int c = 0; c < w; ++c) {
      int32_t res = 0;
      for (int k = 0; k < taps; ++k) {
        res += y_filter[k] * in_rows[k * src_stride + c];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t blended = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended >>= DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended >>= 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
407
// 8-bit compound horizontal-only convolution. The filtered sum is rounded by
// round_0, scaled up by FILTER_BITS - round_1 bits and offset by
// round_offset. It is then either stored in conv_params->dst
// (do_average == 0) or blended with the value already stored there and
// written, clipped, to dst.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const int subpel_x_qn,
                               ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;

  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    // in_row[c + k] is the sample under tap k for output column c.
    const uint8_t *const in_row = src + r * src_stride - fo_horiz;
    for (int c = 0; c < w; ++c) {
      int32_t res = 0;
      for (int k = 0; k < taps; ++k) {
        res += x_filter[k] * in_row[c + k];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t blended = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended >>= DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended >>= 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
454
// 8-bit compound "copy" path, with no filtering. Each source pixel is shifted
// up by 2 * FILTER_BITS - round_0 - round_1 bits and offset by round_offset.
// The value is then either stored in conv_params->dst (do_average == 0) or
// blended with the value already stored there and written, clipped, to dst.
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));

  for (int r = 0; r < h; ++r) {
    const uint8_t *const in_row = src + r * src_stride;
    for (int c = 0; c < w; ++c) {
      CONV_BUF_TYPE scaled = in_row[c] << bits;
      scaled += round_offset;

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = scaled;
        continue;
      }

      int32_t blended = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  scaled * conv_params->bck_offset;
        blended >>= DIST_PRECISION_BITS;
      } else {
        blended += scaled;
        blended >>= 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(blended, bits));
    }
  }
}
489
// 8-bit separable 2D convolution with scaling. Source positions advance by
// x_step_qn / y_step_qn per output pixel (SCALE_SUBPEL_BITS fractional bits)
// and the fractional part of each position selects the kernel for that pixel.
// The horizontal pass fills a 16-bit intermediate block of im_h rows, which
// the vertical pass then filters column by column.
//
// Output handling depends on conv_params:
//  - is_compound == 0: remove offsets, round, clip and write to dst.
//  - is_compound, do_average == 0: store the offset value in conv_params->dst.
//  - is_compound, do_average != 0: blend with the value stored in
//    conv_params->dst, then remove offsets, round, clip and write to dst.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  // Number of source rows touched: the integer position of the last output
  // row plus the vertical kernel length.
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;
  // The intermediate block lives on the stack with a fixed capacity; both
  // passes index up to w * im_h - 1, so larger products would overflow it.
  assert(w * im_h <= (int)(sizeof(im_block) / sizeof(im_block[0])));
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  assert(bits >= 0);
  int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  for (int y = 0; y < im_h; ++y) {
    int x_qn = subpel_x_qn;
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *x_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_x[k - fo_horiz];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
    src_horiz += src_stride;
  }

  // vertical filter (column-major: src_vert advances one column per x)
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int x = 0; x < w; ++x) {
    int y_qn = subpel_y_qn;
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *y_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->is_compound) {
        if (conv_params->do_average) {
          int32_t tmp = dst16[y * dst16_stride + x];
          if (conv_params->use_dist_wtd_comp_avg) {
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
            tmp = tmp >> DIST_PRECISION_BITS;
          } else {
            tmp += res;
            tmp = tmp >> 1;
          }
          /* Subtract round offset and convolve round */
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
                       (1 << (offset_bits - conv_params->round_1 - 1)));
          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
        } else {
          dst16[y * dst16_stride + x] = res;
        }
      } else {
        /* Subtract round offset and convolve round */
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
                             (1 << (offset_bits - conv_params->round_1 - 1)));
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
      }
    }
    src_vert++;
  }
}
577
// Thin wrapper around av1_convolve_2d_scale() that, in debug builds, checks
// that a compound prediction has a conv_params->dst buffer before
// dispatching.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
591
// Picks the compound (dist_wtd) kernel for an unscaled block according to
// which of the two sub-pixel offsets are non-zero: copy, x-only, y-only or
// full 2D.
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (subpel_x_qn == 0 && subpel_y_qn == 0) {
    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                  conv_params);
    return;
  }
  if (subpel_y_qn == 0) {
    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, subpel_x_qn, conv_params);
    return;
  }
  if (subpel_x_qn == 0) {
    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn, conv_params);
    return;
  }
  assert(subpel_y_qn != 0 && subpel_x_qn != 0);
  av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                           filter_params_x, filter_params_y, subpel_x_qn,
                           subpel_y_qn, conv_params);
}
615
// Picks the single-reference kernel for an unscaled block according to which
// of the two sub-pixel offsets are non-zero: plain copy, x-only, y-only or
// full 2D.
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (subpel_x_qn == 0 && subpel_y_qn == 0) {
    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
    return;
  }
  if (subpel_y_qn == 0) {
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                      subpel_x_qn, conv_params);
    return;
  }
  if (subpel_x_qn == 0) {
    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                      subpel_y_qn);
    return;
  }
  assert(subpel_x_qn != 0 && subpel_y_qn != 0);
  av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                     filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
}
637
// Top-level 8-bit convolution dispatcher.
//
// interp_filters[0] / interp_filters[1] are the horizontal / vertical filter
// parameters. Dispatch order:
//  1. If either filter has 2 taps (IntraBC) and at least one sub-pixel offset
//     is non-zero, call the matching *_intrabc kernel and return.
//  2. Otherwise, if scaled is non-zero, use the scaling kernel with
//     x_step_q4 / y_step_q4 as the per-pixel steps.
//  3. Otherwise use the compound or single-reference facade depending on
//     conv_params->is_compound.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params) {
  const InterpFilterParams *const filter_params_x = interp_filters[0];
  const InterpFilterParams *const filter_params_y = interp_filters[1];

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, filter_params_y, subpel_x_qn,
                                 subpel_y_qn, conv_params);
      return;
    }
    if (subpel_x_qn) {
      av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, subpel_x_qn, conv_params);
      return;
    }
    if (subpel_y_qn) {
      av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h,
                                filter_params_y, subpel_y_qn);
      return;
    }
    // Both offsets are zero: fall through to the generic dispatch below.
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  } else if (conv_params->is_compound) {
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params);
  } else {
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params);
  }
}
687
688 #if CONFIG_AV1_HIGHBITDEPTH
// High bit-depth single-reference horizontal-only convolution. The filtered
// sum is rounded by round_0, then by the remaining FILTER_BITS - round_0
// bits, and clipped by clip_pixel_highbd() for bit depth bd.
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const int subpel_x_qn,
                                ConvolveParams *conv_params, int bd) {
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  // in_row[c + k] is the sample under tap k for output column c.
  const uint16_t *in_row = src - fo_horiz;
  uint16_t *out_row = dst;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int k = 0; k < taps; ++k) {
        acc += x_filter[k] * in_row[c + k];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out_row[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
    }
    in_row += src_stride;
    out_row += dst_stride;
  }
}
716
av1_highbd_convolve_y_sr_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)717 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
718 uint16_t *dst, int dst_stride, int w, int h,
719 const InterpFilterParams *filter_params_y,
720 const int subpel_y_qn, int bd) {
721 const int fo_vert = filter_params_y->taps / 2 - 1;
722 // vertical filter
723 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
724 filter_params_y, subpel_y_qn & SUBPEL_MASK);
725 for (int y = 0; y < h; ++y) {
726 for (int x = 0; x < w; ++x) {
727 int32_t res = 0;
728 for (int k = 0; k < filter_params_y->taps; ++k) {
729 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
730 }
731 dst[y * dst_stride + x] =
732 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
733 }
734 }
735 }
736
// Separable 2D single-reference convolution for high bit-depth pixels.
//
// Pass 1 filters horizontally into the int16_t scratch buffer im_block,
// producing h + taps_y - 1 rows (the extra rows feed the vertical filter
// tails) and rounding by conv_params->round_0. Pass 2 filters im_block
// vertically, rounds by conv_params->round_1, removes the offsets that were
// added in both passes to keep the sums non-negative, applies the remaining
// rounding (2 * FILTER_BITS - round_0 - round_1) and clips to bit depth bd.
// w and h must not exceed MAX_SB_SIZE.
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter, starting fo_vert rows above the block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int16_t *const x_kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    const uint16_t *const in = src_top + row * src_stride - fo_horiz;
    int16_t *const out = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      // Offset keeps the accumulated sum non-negative.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += x_kernel[tap] * in[col + tap];
      }
      assert(taps_x > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      out[col] = ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter over the intermediate block.
  const int16_t *const y_kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Combined contribution of the pass-1 and pass-2 offsets after rounding.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int row = 0; row < h; ++row) {
    const int16_t *const in = im_block + row * im_stride;
    uint16_t *const out = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t sum = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += y_kernel[tap] * in[tap * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= sum && sum < (1 << (offset_bits + 2))));
      const int32_t res = ROUND_POWER_OF_TWO(sum, round_1) - round_offset;
      out[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
791
792 // This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an
793 // optimized version for intrabc. Use the following 2-tap filter:
794 // DECLARE_ALIGNED(256, static const int16_t,
795 // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
796 // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
797 // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
798 // };
av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params,int bd)799 void av1_highbd_convolve_2d_sr_intrabc_c(
800 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
801 int h, const InterpFilterParams *filter_params_x,
802 const InterpFilterParams *filter_params_y, const int subpel_x_qn,
803 const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
804 const int bits =
805 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
806 assert(bits >= 0);
807 assert(subpel_x_qn == 8);
808 assert(subpel_y_qn == 8);
809 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
810 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
811 (void)filter_params_x;
812 (void)subpel_x_qn;
813 (void)filter_params_y;
814 (void)subpel_y_qn;
815 (void)conv_params;
816
817 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
818 int im_h = h + 1;
819 int im_stride = w;
820 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
821
822 // horizontal filter
823 // explicitly operate for subpel_x_qn = 8.
824 int16_t *im = im_block;
825 for (int y = 0; y < im_h; ++y) {
826 for (int x = 0; x < w; ++x) {
827 int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
828 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
829 sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
830 im[x] = sum;
831 }
832 src += src_stride;
833 im += im_stride;
834 }
835
836 // vertical filter
837 // explicitly operate for subpel_y_qn = 8.
838 int16_t *src_vert = im_block;
839 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
840 for (int y = 0; y < h; ++y) {
841 for (int x = 0; x < w; ++x) {
842 const int32_t sum =
843 (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
844 assert(0 <= sum && sum < (1 << (offset_bits + 2)));
845 const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
846 ((1 << (offset_bits - conv_params->round_1)) +
847 (1 << (offset_bits - conv_params->round_1 - 1)));
848
849 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
850 }
851 src_vert += im_stride;
852 dst += dst_stride;
853 }
854 }
855
856 // This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an
857 // optimized version for intrabc.
av1_highbd_convolve_y_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)858 void av1_highbd_convolve_y_sr_intrabc_c(
859 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
860 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
861 int bd) {
862 assert(subpel_y_qn == 8);
863 assert(filter_params_y->taps == 2);
864 (void)filter_params_y;
865 (void)subpel_y_qn;
866
867 // vertical filter
868 // explicitly operate for subpel_y_qn = 8.
869 for (int y = 0; y < h; ++y) {
870 for (int x = 0; x < w; ++x) {
871 const int32_t res = src[x] + src[src_stride + x];
872 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
873 }
874 src += src_stride;
875 dst += dst_stride;
876 }
877 }
878
879 // This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an
880 // optimized version for intrabc.
av1_highbd_convolve_x_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params,int bd)881 void av1_highbd_convolve_x_sr_intrabc_c(
882 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
883 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
884 ConvolveParams *conv_params, int bd) {
885 const int bits = FILTER_BITS - conv_params->round_0;
886 assert(bits >= 0);
887 assert(subpel_x_qn == 8);
888 assert(filter_params_x->taps == 2);
889 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
890 (void)filter_params_x;
891 (void)subpel_x_qn;
892
893 // horizontal filter
894 // explicitly operate for subpel_x_qn = 8.
895 for (int y = 0; y < h; ++y) {
896 for (int x = 0; x < w; ++x) {
897 int32_t res = 64 * (src[x] + src[x + 1]);
898 res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
899 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
900 }
901 src += src_stride;
902 dst += dst_stride;
903 }
904 }
905
// Separable 2D convolution for compound prediction, high bit-depth.
//
// Pass 1 filters horizontally into im_block (rounded by round_0); pass 2
// filters vertically and rounds by round_1, yielding an offset intermediate
// value. When conv_params->do_average is zero that value is stored in
// conv_params->dst for later use. Otherwise it is blended with the value
// already in conv_params->dst (fwd_offset/bck_offset weights when
// use_dist_wtd_comp_avg is set, plain average otherwise), the rounding offset
// is removed, and the clipped pixel is written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter, starting fo_vert rows above the block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int16_t *const x_kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    const uint16_t *const in = src_top + row * src_stride - fo_horiz;
    int16_t *const out = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += x_kernel[tap] * in[col + tap];
      }
      assert(taps_x > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      out[col] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter over the intermediate block.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int16_t *const y_kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    const int16_t *const in = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t sum = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += y_kernel[tap] * in[tap * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= sum && sum < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      if (!conv_params->do_average) {
        // First prediction: keep the offset intermediate for the second one.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
974
// Horizontal-only convolution for compound prediction, high bit-depth.
//
// The filtered value is rounded by round_0, scaled up by
// (FILTER_BITS - round_1) bits and offset by round_offset. When
// conv_params->do_average is zero the result is stored in conv_params->dst;
// otherwise it is blended with the value already stored there, the offset is
// removed, and the rounded, clipped pixel is written to dst.
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_x,
                                      const int subpel_x_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_x->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int fo_horiz = num_taps / 2 - 1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    const uint16_t *const in = src + row * src_stride - fo_horiz;
    for (int col = 0; col < w; ++col) {
      int32_t sum = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        sum += kernel[tap] * in[col + tap];
      }
      const int32_t res =
          (1 << bits) * ROUND_POWER_OF_TWO(sum, round_0) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
1022
// Vertical-only convolution for compound prediction, high bit-depth.
//
// The filtered value is scaled up by (FILTER_BITS - round_0) bits, rounded by
// round_1 and offset by round_offset. When conv_params->do_average is zero the
// result is stored in conv_params->dst; otherwise it is blended with the value
// already stored there, the offset is removed, and the rounded, clipped pixel
// is written to dst.
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_y,
                                      const int subpel_y_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int fo_vert = num_taps / 2 - 1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    // Row of the first filter tap for this output row.
    const uint16_t *const in = src + (row - fo_vert) * src_stride;
    for (int col = 0; col < w; ++col) {
      int32_t sum = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        sum += kernel[tap] * in[tap * src_stride + col];
      }
      const int32_t scaled = sum * (1 << bits);
      const int32_t res = ROUND_POWER_OF_TWO(scaled, round_1) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
1070
// Integer-position (no filtering) path for compound prediction, high
// bit-depth.
//
// Each source pixel is shifted up by (2 * FILTER_BITS - round_0 - round_1)
// bits and offset by round_offset so that it is on the same scale as the
// filtered compound paths. When conv_params->do_average is zero the value is
// stored in conv_params->dst; otherwise it is blended with the value already
// stored there, the offset is removed, and the rounded, clipped pixel is
// written to dst.
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
                                            uint16_t *dst, int dst_stride,
                                            int w, int h,
                                            ConvolveParams *conv_params,
                                            int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const in = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      const CONV_BUF_TYPE res =
          (CONV_BUF_TYPE)((in[col] << bits) + round_offset);

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, bits), bd);
    }
  }
}
1107
// Separable 2D convolution with per-pixel position stepping (scaled
// reference), high bit-depth.
//
// The source position advances by x_step_qn / y_step_qn per output pixel in
// SCALE_SUBPEL_BITS fixed point, starting at subpel_x_qn / subpel_y_qn; the
// kernel is re-selected for every output sample from the fractional part.
// Pass 1 filters horizontally into im_block; pass 2 filters vertically, column
// by column. For compound prediction (conv_params->is_compound) the result is
// either stored in conv_params->dst or blended with the value already there;
// otherwise it is rounded, clipped and written straight to dst.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  // Source rows spanned by the h output rows, plus the vertical filter taps.
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter, starting fo_vert rows above the block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  for (int row = 0; row < im_rows; ++row) {
    const uint16_t *const in_row = src_top + row * src_stride;
    int16_t *const out = im_block + row * im_stride;
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
      const uint16_t *const in =
          in_row + (x_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += x_kernel[tap] * in[tap];
      }
      assert(taps_x > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      out[col] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Offset carried by the rounded pass-2 sum; removed before the final round.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    const int16_t *const in_col = im_block + col;
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      const int16_t *const in =
          in_col + (y_qn >> SCALE_SUBPEL_BITS) * im_stride;
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const y_kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      int32_t sum = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += y_kernel[tap] * in[tap * im_stride];
      }
      assert(taps_y > 8 || (0 <= sum && sum < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        // First compound prediction: keep the offset intermediate value.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t value = res;
      if (conv_params->is_compound) {
        const int32_t prev = dst16[row * dst16_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          value = prev * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
          value = value >> DIST_PRECISION_BITS;
        } else {
          value = prev + res;
          value = value >> 1;
        }
      }
      // Subtract round offset and convolve round.
      value -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(value, bits), bd);
    }
  }
}
1195
// Dispatches an unscaled compound prediction to the cheapest dist_wtd kernel:
// 2D when both subpel offsets are non-zero, x-only or y-only when just one is,
// and the copy kernel when both are zero.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_horiz = subpel_x_qn != 0;
  const bool filter_vert = subpel_y_qn != 0;

  if (filter_horiz && filter_vert) {
    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                    filter_params_x, filter_params_y,
                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }
  if (filter_horiz) {
    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  if (filter_vert) {
    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, conv_params,
                                   bd);
    return;
  }
  av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                       conv_params, bd);
}
1221
// Dispatches an unscaled single-reference prediction to the cheapest kernel:
// 2D when both subpel offsets are non-zero, x-only or y-only when just one is,
// and a plain block copy when both are zero.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_horiz = subpel_x_qn != 0;
  const bool filter_vert = subpel_y_qn != 0;

  if (filter_horiz && filter_vert) {
    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params, bd);
    return;
  }
  if (filter_horiz) {
    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }
  if (filter_vert) {
    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_y, subpel_y_qn, bd);
    return;
  }
  aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
1245
// Top-level entry point for high bit-depth inter prediction convolution.
//
// src8/dst8 are converted with CONVERT_TO_SHORTPTR before use.
// interp_filters[0] is the horizontal filter and interp_filters[1] the
// vertical one. Dispatch order:
//   1. 2-tap filters (IntraBC) with a non-zero subpel offset go to the
//      dedicated intrabc C kernels.
//   2. Scaled references go to av1_highbd_convolve_2d_scale.
//   3. Otherwise the compound or single-reference facade is used.
// A 2-tap request with both subpel offsets zero falls through to steps 2/3.
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  (void)x_step_q4;
  (void)y_step_q4;
  (void)dst_stride;

  const InterpFilterParams *const filter_params_x = interp_filters[0];
  const InterpFilterParams *const filter_params_y = interp_filters[1];
  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  // 2-tap filter indicates that it is for IntraBC.
  const bool is_intrabc =
      filter_params_x->taps == 2 || filter_params_y->taps == 2;
  if (is_intrabc) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_highbd_convolve_2d_sr_intrabc_c(
          src, src_stride, dst, dst_stride, w, h, filter_params_x,
          filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
      return;
    }
    if (subpel_x_qn) {
      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                         filter_params_x, subpel_x_qn,
                                         conv_params, bd);
      return;
    }
    if (subpel_y_qn) {
      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                         filter_params_y, subpel_y_qn, bd);
      return;
    }
  }

  if (scaled) {
    if (conv_params->is_compound) {
      assert(conv_params->dst != NULL);
    }
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, filter_params_y, subpel_x_qn,
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                 bd);
    return;
  }

  if (conv_params->is_compound) {
    highbd_convolve_2d_facade_compound(
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
  } else {
    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, filter_params_y,
                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
  }
}
1301 #endif // CONFIG_AV1_HIGHBITDEPTH
1302
// Note: Fixed-size intermediate buffers place limits on the parameters of
// some functions. 2d filtering proceeds in 2 steps:
//   (1) Interpolate horizontally into an intermediate buffer, temp.
//   (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 128x128 pixels.
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
//   original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --(((128 - 1) * 32 + 15) >> 4) + 8 = 262.
// NOTE(review): the formula above evaluates to 262 while the buffer is sized
// for 263 rows; presumably the extra row is deliberate headroom -- confirm.
#define WIENER_MAX_EXT_SIZE 263
1316
1317 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
horz_scalar_product(const uint8_t * a,const int16_t * b)1318 static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1319 int sum = 0;
1320 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1321 return sum;
1322 }
1323
1324 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1325 static inline int highbd_horz_scalar_product(const uint16_t *a,
1326 const int16_t *b) {
1327 int sum = 0;
1328 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1329 return sum;
1330 }
1331 #endif
1332
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1333 static inline int highbd_vert_scalar_product(const uint16_t *a,
1334 ptrdiff_t a_stride,
1335 const int16_t *b) {
1336 int sum = 0;
1337 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1338 return sum;
1339 }
1340
get_filter_base(const int16_t * filter)1341 static const InterpKernel *get_filter_base(const int16_t *filter) {
1342 // NOTE: This assumes that the filter table is 256-byte aligned.
1343 // TODO(agrange) Modify to make independent of table alignment.
1344 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1345 }
1346
// Returns the index of the kernel f within the kernel table starting at base,
// computed as a pointer difference in units of InterpKernel.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1350
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1351 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1352 uint16_t *dst, ptrdiff_t dst_stride,
1353 const InterpKernel *x_filters, int x0_q4,
1354 int x_step_q4, int w, int h,
1355 int round0_bits) {
1356 const int bd = 8;
1357 src -= SUBPEL_TAPS / 2 - 1;
1358 for (int y = 0; y < h; ++y) {
1359 int x_q4 = x0_q4;
1360 for (int x = 0; x < w; ++x) {
1361 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1362 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1363 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1364 (1 << (bd + FILTER_BITS - 1));
1365 const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1366 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1367 WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1368 x_q4 += x_step_q4;
1369 }
1370 src += src_stride;
1371 dst += dst_stride;
1372 }
1373 }
1374
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1375 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1376 uint8_t *dst, ptrdiff_t dst_stride,
1377 const InterpKernel *y_filters, int y0_q4,
1378 int y_step_q4, int w, int h,
1379 int round1_bits) {
1380 const int bd = 8;
1381 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1382
1383 for (int x = 0; x < w; ++x) {
1384 int y_q4 = y0_q4;
1385 for (int y = 0; y < h; ++y) {
1386 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1387 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1388 const int rounding =
1389 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1390 (1 << (bd + round1_bits - 1));
1391 const int sum =
1392 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1393 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1394 y_q4 += y_step_q4;
1395 }
1396 ++src;
1397 ++dst;
1398 }
1399 }
1400
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params)1401 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1402 uint8_t *dst, ptrdiff_t dst_stride,
1403 const int16_t *filter_x, int x_step_q4,
1404 const int16_t *filter_y, int y_step_q4,
1405 int w, int h,
1406 const WienerConvolveParams *conv_params) {
1407 const InterpKernel *const filters_x = get_filter_base(filter_x);
1408 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1409
1410 const InterpKernel *const filters_y = get_filter_base(filter_y);
1411 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1412
1413 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1414 const int intermediate_height =
1415 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1416 memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1417
1418 assert(w <= MAX_SB_SIZE);
1419 assert(h <= MAX_SB_SIZE);
1420 assert(y_step_q4 <= 32);
1421 assert(x_step_q4 <= 32);
1422
1423 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1424 src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1425 x_step_q4, w, intermediate_height,
1426 conv_params->round_0);
1427 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1428 MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1429 y_step_q4, w, h, conv_params->round_1);
1430 }
1431 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1432
1433 #if CONFIG_AV1_HIGHBITDEPTH
1434 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1435 static void highbd_convolve_add_src_horiz_hip(
1436 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1437 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1438 int x_step_q4, int w, int h, int round0_bits, int bd) {
1439 const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1440 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1441 src -= SUBPEL_TAPS / 2 - 1;
1442 for (int y = 0; y < h; ++y) {
1443 int x_q4 = x0_q4;
1444 for (int x = 0; x < w; ++x) {
1445 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1446 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1447 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1448 (1 << (bd + FILTER_BITS - 1));
1449 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1450 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1451 extraprec_clamp_limit - 1);
1452 x_q4 += x_step_q4;
1453 }
1454 src += src_stride;
1455 dst += dst_stride;
1456 }
1457 }
1458
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1459 static void highbd_convolve_add_src_vert_hip(
1460 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1461 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1462 int y_step_q4, int w, int h, int round1_bits, int bd) {
1463 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1464 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1465 for (int x = 0; x < w; ++x) {
1466 int y_q4 = y0_q4;
1467 for (int y = 0; y < h; ++y) {
1468 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1469 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1470 const int rounding =
1471 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1472 (1 << (bd + round1_bits - 1));
1473 const int sum =
1474 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1475 dst[y * dst_stride] =
1476 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1477 y_q4 += y_step_q4;
1478 }
1479 ++src;
1480 ++dst;
1481 }
1482 }
1483
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params,int bd)1484 void av1_highbd_wiener_convolve_add_src_c(
1485 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1486 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1487 const int16_t *filter_y, int y_step_q4, int w, int h,
1488 const WienerConvolveParams *conv_params, int bd) {
1489 const InterpKernel *const filters_x = get_filter_base(filter_x);
1490 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1491
1492 const InterpKernel *const filters_y = get_filter_base(filter_y);
1493 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1494
1495 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1496 const int intermediate_height =
1497 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1498
1499 assert(w <= MAX_SB_SIZE);
1500 assert(h <= MAX_SB_SIZE);
1501 assert(y_step_q4 <= 32);
1502 assert(x_step_q4 <= 32);
1503 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1504
1505 highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1506 src_stride, temp, MAX_SB_SIZE, filters_x,
1507 x0_q4, x_step_q4, w, intermediate_height,
1508 conv_params->round_0, bd);
1509 highbd_convolve_add_src_vert_hip(
1510 temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1511 filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1512 }
1513 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1514 #endif // CONFIG_AV1_HIGHBITDEPTH
1515