1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 *
11 */
12
13 #include <math.h>
14 #include <stddef.h>
15
16 #include "config/aom_config.h"
17 #include "config/aom_scale_rtcd.h"
18
19 #include "aom/internal/aom_codec_internal.h"
20 #include "aom_mem/aom_mem.h"
21 #include "aom_dsp/aom_dsp_common.h"
22 #include "aom_mem/aom_mem.h"
23 #include "aom_ports/mem.h"
24 #include "aom_util/aom_pthread.h"
25
26 #include "av1/common/av1_common_int.h"
27 #include "av1/common/convolve.h"
28 #include "av1/common/enums.h"
29 #include "av1/common/resize.h"
30 #include "av1/common/restoration.h"
31 #include "av1/common/thread_common.h"
32
33 // The 's' values are calculated based on original 'r' and 'e' values in the
34 // spec using GenSgrprojVtable().
35 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36 const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
37 { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
38 { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
39 { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
40 { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
41 { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
42 { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
43 { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
44 { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
45 };
46
av1_get_upsampled_plane_size(const AV1_COMMON * cm,int is_uv,int * plane_w,int * plane_h)47 void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48 int *plane_h) {
49 int ss_x = is_uv && cm->seq_params->subsampling_x;
50 int ss_y = is_uv && cm->seq_params->subsampling_y;
51 *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52 *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53 }
54
// Counts the horizontal or vertical restoration units in a plane (pass a
// width or a height as plane_size, respectively).
//
// The plane size is divided by the unit size with round-to-nearest rather
// than round-up: this models the way the right-most or bottom-most unit can
// extend to up to 150% of its normal width or height.
//
// The result is never less than 1, which covers small frames that may be
// smaller than half of an LR unit in size.
int av1_lr_count_units(int unit_size, int plane_size) {
  const int rounded_units = (plane_size + (unit_size >> 1)) / unit_size;
  if (rounded_units > 1) return rounded_units;
  return 1;
}
66
// Sizes and (re)allocates the per-unit restoration info array for one plane.
//
// The plane dimensions come from av1_get_upsampled_plane_size() and the unit
// counts from av1_lr_count_units() using rsi->restoration_unit_size. The
// counts are stored in rsi->horz_units, rsi->vert_units and
// rsi->num_rest_units. Any array previously held in rsi->unit_info is freed
// and a new 16-byte aligned array with one entry per unit is allocated; the
// allocation goes through CHECK_MEM_ERROR with cm as its context.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  int plane_w;
  int plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  const int unit_size = rsi->restoration_unit_size;
  const int units_across = av1_lr_count_units(unit_size, plane_w);
  const int units_down = av1_lr_count_units(unit_size, plane_h);

  rsi->horz_units = units_across;
  rsi->vert_units = units_down;
  rsi->num_rest_units = units_across * units_down;

  // Release the old array before replacing it.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
85
// Frees the unit info array held by rst_info and leaves rst_info->unit_info
// set to NULL.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  void *const stale_units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(stale_units);
}
90
#if 0
// Disabled reference code, kept for documentation only (never compiled).
//
// Pair of values for each sgrproj parameter:
//   Index 0 corresponds to r[0], e[0]
//   Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable(void) {
  for (int set = 0; set < SGRPROJ_PARAMS; ++set) {
    const sgr_params_type *const params = &av1_sgr_params[set];
    for (int idx = 0; idx < 2; ++idx) {
      const int e = params->e[idx];
      const int r = params->r[idx];
      if (r == 0) {
        // Filter is disabled: mark the entry invalid.
        sgrproj_mtable[set][idx] = -1;
        continue;
      }
      // Filter is enabled: round((1 << SGRPROJ_MTABLE_BITS) / (n^2 * e)).
      const int n = (2 * r + 1) * (2 * r + 1);
      const int n2e = n * n * e;
      assert(n2e != 0);
      sgrproj_mtable[set][idx] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
115
// Precalculation hook for loop restoration. The only work it would do, the
// call to GenSgrprojVtable(), is compiled out, so the function currently does
// nothing.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
121
// Extends an 8-bit image outward by replicating its edge pixels.
//
// data points at the top-left pixel of a width x height image whose rows are
// stride bytes apart. border_horz pixels are written to the left and right of
// every row, then border_vert full (already widened) rows are written above
// and below the image. The caller must provide that much writable room around
// the image.
static void extend_frame_lowbd(uint8_t *data, int width, int height,
                               ptrdiff_t stride, int border_horz,
                               int border_vert) {
  // Left/right: repeat the first and last pixel of each row.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Top/bottom: repeat the first and last widened rows.
  uint8_t *const wide_origin = data - border_horz;
  const size_t wide_bytes = (size_t)(width + 2 * border_horz);
  for (int above = 1; above <= border_vert; ++above) {
    memcpy(wide_origin - above * stride, wide_origin, wide_bytes);
  }
  for (int below = 0; below < border_vert; ++below) {
    memcpy(wide_origin + (height + below) * stride,
           wide_origin + (height - 1) * stride, wide_bytes);
  }
}
141
142 #if CONFIG_AV1_HIGHBITDEPTH
// 16-bit counterpart of extend_frame_lowbd(): extends an image of uint16_t
// samples outward by replicating its edge samples.
//
// data points at the top-left sample of a width x height image whose rows are
// stride samples apart. border_horz samples are written to the left and right
// of every row, then border_vert full (already widened) rows are written
// above and below the image.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                ptrdiff_t stride, int border_horz,
                                int border_vert) {
  // Left/right: repeat the first and last sample of each row.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int col = -border_horz; col < 0; ++col) {
      line[col] = line[0];
    }
    for (int col = 0; col < border_horz; ++col) {
      line[width + col] = line[width - 1];
    }
  }

  // Top/bottom: repeat the first and last widened rows.
  uint16_t *const wide_origin = data - border_horz;
  const size_t wide_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  for (int above = 1; above <= border_vert; ++above) {
    memcpy(wide_origin - above * stride, wide_origin, wide_bytes);
  }
  for (int below = 0; below < border_vert; ++below) {
    memcpy(wide_origin + (height + below) * stride,
           wide_origin + (height - 1) * stride, wide_bytes);
  }
}
163
// Copies a width x height rectangle of uint16_t samples from src to dst.
// src_stride and dst_stride are the row pitches of the two buffers, in
// samples.
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
                                  int src_stride, uint16_t *dst,
                                  int dst_stride) {
  const size_t row_bytes = width * sizeof(*dst);
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, row_bytes);
  }
}
170 #endif
171
// Extends a frame buffer outward by border_horz columns and border_vert rows
// of replicated edge pixels.
//
// When the build has CONFIG_AV1_HIGHBITDEPTH enabled and highbd is nonzero,
// data is passed through CONVERT_TO_SHORTPTR and the 16-bit routine is used;
// in every other case data is treated as plain 8-bit pixels.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    uint16_t *const data16 = CONVERT_TO_SHORTPTR(data);
    extend_frame_highbd(data16, width, height, stride, border_horz,
                        border_vert);
    return;
  }
#else
  (void)highbd;  // Only meaningful in high-bit-depth builds.
#endif
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}
184
// Copies a width x height rectangle of 8-bit pixels from src to dst.
// src_stride and dst_stride are the row pitches of the two buffers, in bytes.
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
190
// Copies a width x height rectangle of pixels from src to dst, dispatching on
// bit depth.
//
// When the build has CONFIG_AV1_HIGHBITDEPTH enabled and highbd is nonzero,
// both pointers are passed through CONVERT_TO_SHORTPTR and copied as 16-bit
// samples; in every other case they are copied as 8-bit pixels.
static void copy_rest_unit(int width, int height, const uint8_t *src,
                           int src_stride, uint8_t *dst, int dst_stride,
                           int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (highbd) {
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
    return;
  }
#else
  (void)highbd;  // Only meaningful in high-bit-depth builds.
#endif
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
}
204
// Yields a uint8_t pointer that can be handed to memcpy(): when is_hbd is
// nonzero the pointer is first passed through CONVERT_TO_SHORTPTR (and cast
// back to uint8_t *); otherwise it is used unchanged.
#define REAL_PTR(is_hbd, ptr) \
  ((is_hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(ptr) : (ptr))
206
207 // With striped loop restoration, the filtering for each 64-pixel stripe gets
208 // most of its input from the output of CDEF (stored in data8), but we need to
209 // fill out a border of 3 pixels above/below the stripe according to the
210 // following rules:
211 //
212 // * At the top and bottom of the frame, we copy the outermost row of CDEF
213 // pixels three times. This extension is done by a call to av1_extend_frame()
214 // at the start of the loop restoration process, so the value of
215 // copy_above/copy_below doesn't strictly matter.
216 //
217 // * All other boundaries are stripe boundaries within the frame. In that case,
218 // we take 2 rows of deblocked pixels and extend them to 3 rows of context.
get_stripe_boundary_info(const RestorationTileLimits * limits,int plane_w,int plane_h,int ss_y,int * copy_above,int * copy_below)219 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220 int plane_w, int plane_h, int ss_y,
221 int *copy_above, int *copy_below) {
222 (void)plane_w;
223
224 *copy_above = 1;
225 *copy_below = 1;
226
227 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229
230 const int first_stripe_in_plane = (limits->v_start == 0);
231 const int this_stripe_height =
232 full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233 const int last_stripe_in_plane =
234 (limits->v_start + this_stripe_height >= plane_h);
235
236 if (first_stripe_in_plane) *copy_above = 0;
237 if (last_stripe_in_plane) *copy_below = 0;
238 }
239
240 // Overwrite the border pixels around a processing stripe so that the conditions
241 // listed above get_stripe_boundary_info() are preserved.
242 // We save the pixels which get overwritten into a temporary buffer, so that
243 // they can be restored by restore_processing_stripe_boundary() after we've
244 // processed the stripe.
245 //
246 // limits gives the rectangular limits of the remaining stripes for the current
247 // restoration unit. rsb is the stored stripe boundaries (taken from either
248 // deblock or CDEF output as necessary).
setup_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationStripeBoundaries * rsb,int rsb_row,int use_highbd,int h,uint8_t * data8,int data_stride,RestorationLineBuffers * rlbs,int copy_above,int copy_below,int opt)249 static void setup_processing_stripe_boundary(
250 const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251 int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252 RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253 // Offsets within the line buffers. The buffer logically starts at column
254 // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255 // has column x0 in the buffer.
256 const int buf_stride = rsb->stripe_boundary_stride;
257 const int buf_x0_off = limits->h_start;
258 const int line_width =
259 (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260 const int line_size = line_width << use_highbd;
261
262 const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263
264 // Replace RESTORATION_BORDER pixels above the top of the stripe
265 // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266 // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267 // duplicating the topmost of the 2 lines (see the AOMMAX call when
268 // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269 if (!opt) {
270 if (copy_above) {
271 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272
273 for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274 const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275 const int buf_off = buf_x0_off + buf_row * buf_stride;
276 const uint8_t *buf =
277 rsb->stripe_boundary_above + (buf_off << use_highbd);
278 uint8_t *dst8 = data8_tl + i * data_stride;
279 // Save old pixels, then replace with data from stripe_boundary_above
280 memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281 REAL_PTR(use_highbd, dst8), line_size);
282 memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283 }
284 }
285
286 // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287 // The second buffer row is repeated, so src_row gets the values 0, 1, 1
288 // for i = 0, 1, 2.
289 if (copy_below) {
290 const int stripe_end = limits->v_start + h;
291 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292
293 for (int i = 0; i < RESTORATION_BORDER; ++i) {
294 const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295 const int buf_off = buf_x0_off + buf_row * buf_stride;
296 const uint8_t *src =
297 rsb->stripe_boundary_below + (buf_off << use_highbd);
298
299 uint8_t *dst8 = data8_bl + i * data_stride;
300 // Save old pixels, then replace with data from stripe_boundary_below
301 memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302 memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303 }
304 }
305 } else {
306 if (copy_above) {
307 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308
309 // Only save and overwrite i=-RESTORATION_BORDER line.
310 uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311 // Save old pixels, then replace with data from stripe_boundary_above
312 memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313 memcpy(REAL_PTR(use_highbd, dst8),
314 REAL_PTR(use_highbd,
315 data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316 line_size);
317 }
318
319 if (copy_below) {
320 const int stripe_end = limits->v_start + h;
321 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322
323 // Only save and overwrite i=2 line.
324 uint8_t *dst8 = data8_bl + 2 * data_stride;
325 // Save old pixels, then replace with data from stripe_boundary_below
326 memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327 memcpy(REAL_PTR(use_highbd, dst8),
328 REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329 }
330 }
331 }
332
333 // Once a processing stripe is finished, this function sets the boundary
334 // pixels which were overwritten by setup_processing_stripe_boundary()
335 // back to their original values
restore_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationLineBuffers * rlbs,int use_highbd,int h,uint8_t * data8,int data_stride,int copy_above,int copy_below,int opt)336 static void restore_processing_stripe_boundary(
337 const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338 int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339 int copy_below, int opt) {
340 const int line_width =
341 (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342 const int line_size = line_width << use_highbd;
343
344 const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345
346 if (!opt) {
347 if (copy_above) {
348 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349 for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350 uint8_t *dst8 = data8_tl + i * data_stride;
351 memcpy(REAL_PTR(use_highbd, dst8),
352 rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353 }
354 }
355
356 if (copy_below) {
357 const int stripe_bottom = limits->v_start + h;
358 uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359
360 for (int i = 0; i < RESTORATION_BORDER; ++i) {
361 if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362
363 uint8_t *dst8 = data8_bl + i * data_stride;
364 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365 }
366 }
367 } else {
368 if (copy_above) {
369 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370
371 // Only restore i=-RESTORATION_BORDER line.
372 uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374 }
375
376 if (copy_below) {
377 const int stripe_bottom = limits->v_start + h;
378 uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379
380 // Only restore i=2 line.
381 if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382 uint8_t *dst8 = data8_bl + 2 * data_stride;
383 memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384 }
385 }
386 }
387 }
388
wiener_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)389 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390 int stripe_width, int stripe_height,
391 int procunit_width, const uint8_t *src,
392 int src_stride, uint8_t *dst, int dst_stride,
393 int32_t *tmpbuf, int bit_depth,
394 struct aom_internal_error_info *error_info) {
395 (void)tmpbuf;
396 (void)bit_depth;
397 (void)error_info;
398 assert(bit_depth == 8);
399 const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400
401 for (int j = 0; j < stripe_width; j += procunit_width) {
402 int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403 const uint8_t *src_p = src + j;
404 uint8_t *dst_p = dst + j;
405 av1_wiener_convolve_add_src(
406 src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407 rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408 }
409 }
410
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 (boxsum1) and r = 2 (boxsum2). Any other radius
   is rejected by the assert in boxsum().

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
boxsum1(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)418 static void boxsum1(int32_t *src, int width, int height, int src_stride,
419 int sqr, int32_t *dst, int dst_stride) {
420 int i, j, a, b, c;
421 assert(width > 2 * SGRPROJ_BORDER_HORZ);
422 assert(height > 2 * SGRPROJ_BORDER_VERT);
423
424 // Vertical sum over 3-pixel regions, from src into dst.
425 if (!sqr) {
426 for (j = 0; j < width; ++j) {
427 a = src[j];
428 b = src[src_stride + j];
429 c = src[2 * src_stride + j];
430
431 dst[j] = a + b;
432 for (i = 1; i < height - 2; ++i) {
433 // Loop invariant: At the start of each iteration,
434 // a = src[(i - 1) * src_stride + j]
435 // b = src[(i ) * src_stride + j]
436 // c = src[(i + 1) * src_stride + j]
437 dst[i * dst_stride + j] = a + b + c;
438 a = b;
439 b = c;
440 c = src[(i + 2) * src_stride + j];
441 }
442 dst[i * dst_stride + j] = a + b + c;
443 dst[(i + 1) * dst_stride + j] = b + c;
444 }
445 } else {
446 for (j = 0; j < width; ++j) {
447 a = src[j] * src[j];
448 b = src[src_stride + j] * src[src_stride + j];
449 c = src[2 * src_stride + j] * src[2 * src_stride + j];
450
451 dst[j] = a + b;
452 for (i = 1; i < height - 2; ++i) {
453 dst[i * dst_stride + j] = a + b + c;
454 a = b;
455 b = c;
456 c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457 }
458 dst[i * dst_stride + j] = a + b + c;
459 dst[(i + 1) * dst_stride + j] = b + c;
460 }
461 }
462
463 // Horizontal sum over 3-pixel regions of dst
464 for (i = 0; i < height; ++i) {
465 a = dst[i * dst_stride];
466 b = dst[i * dst_stride + 1];
467 c = dst[i * dst_stride + 2];
468
469 dst[i * dst_stride] = a + b;
470 for (j = 1; j < width - 2; ++j) {
471 // Loop invariant: At the start of each iteration,
472 // a = src[i * src_stride + (j - 1)]
473 // b = src[i * src_stride + (j )]
474 // c = src[i * src_stride + (j + 1)]
475 dst[i * dst_stride + j] = a + b + c;
476 a = b;
477 b = c;
478 c = dst[i * dst_stride + (j + 2)];
479 }
480 dst[i * dst_stride + j] = a + b + c;
481 dst[i * dst_stride + (j + 1)] = b + c;
482 }
483 }
484
boxsum2(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)485 static void boxsum2(int32_t *src, int width, int height, int src_stride,
486 int sqr, int32_t *dst, int dst_stride) {
487 int i, j, a, b, c, d, e;
488 assert(width > 2 * SGRPROJ_BORDER_HORZ);
489 assert(height > 2 * SGRPROJ_BORDER_VERT);
490
491 // Vertical sum over 5-pixel regions, from src into dst.
492 if (!sqr) {
493 for (j = 0; j < width; ++j) {
494 a = src[j];
495 b = src[src_stride + j];
496 c = src[2 * src_stride + j];
497 d = src[3 * src_stride + j];
498 e = src[4 * src_stride + j];
499
500 dst[j] = a + b + c;
501 dst[dst_stride + j] = a + b + c + d;
502 for (i = 2; i < height - 3; ++i) {
503 // Loop invariant: At the start of each iteration,
504 // a = src[(i - 2) * src_stride + j]
505 // b = src[(i - 1) * src_stride + j]
506 // c = src[(i ) * src_stride + j]
507 // d = src[(i + 1) * src_stride + j]
508 // e = src[(i + 2) * src_stride + j]
509 dst[i * dst_stride + j] = a + b + c + d + e;
510 a = b;
511 b = c;
512 c = d;
513 d = e;
514 e = src[(i + 3) * src_stride + j];
515 }
516 dst[i * dst_stride + j] = a + b + c + d + e;
517 dst[(i + 1) * dst_stride + j] = b + c + d + e;
518 dst[(i + 2) * dst_stride + j] = c + d + e;
519 }
520 } else {
521 for (j = 0; j < width; ++j) {
522 a = src[j] * src[j];
523 b = src[src_stride + j] * src[src_stride + j];
524 c = src[2 * src_stride + j] * src[2 * src_stride + j];
525 d = src[3 * src_stride + j] * src[3 * src_stride + j];
526 e = src[4 * src_stride + j] * src[4 * src_stride + j];
527
528 dst[j] = a + b + c;
529 dst[dst_stride + j] = a + b + c + d;
530 for (i = 2; i < height - 3; ++i) {
531 dst[i * dst_stride + j] = a + b + c + d + e;
532 a = b;
533 b = c;
534 c = d;
535 d = e;
536 e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537 }
538 dst[i * dst_stride + j] = a + b + c + d + e;
539 dst[(i + 1) * dst_stride + j] = b + c + d + e;
540 dst[(i + 2) * dst_stride + j] = c + d + e;
541 }
542 }
543
544 // Horizontal sum over 5-pixel regions of dst
545 for (i = 0; i < height; ++i) {
546 a = dst[i * dst_stride];
547 b = dst[i * dst_stride + 1];
548 c = dst[i * dst_stride + 2];
549 d = dst[i * dst_stride + 3];
550 e = dst[i * dst_stride + 4];
551
552 dst[i * dst_stride] = a + b + c;
553 dst[i * dst_stride + 1] = a + b + c + d;
554 for (j = 2; j < width - 3; ++j) {
555 // Loop invariant: At the start of each iteration,
556 // a = src[i * src_stride + (j - 2)]
557 // b = src[i * src_stride + (j - 1)]
558 // c = src[i * src_stride + (j )]
559 // d = src[i * src_stride + (j + 1)]
560 // e = src[i * src_stride + (j + 2)]
561 dst[i * dst_stride + j] = a + b + c + d + e;
562 a = b;
563 b = c;
564 c = d;
565 d = e;
566 e = dst[i * dst_stride + (j + 3)];
567 }
568 dst[i * dst_stride + j] = a + b + c + d + e;
569 dst[i * dst_stride + (j + 1)] = b + c + d + e;
570 dst[i * dst_stride + (j + 2)] = c + d + e;
571 }
572 }
573
// Dispatches to the box-sum routine for window radius r: boxsum1() for r == 1
// and boxsum2() for r == 2. Any other radius trips an assert in debug builds
// and otherwise leaves dst untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default: assert(0 && "Invalid value of r in self-guided filter"); break;
  }
}
583
av1_decode_xq(const int * xqd,int * xq,const sgr_params_type * params)584 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585 if (params->r[0] == 0) {
586 xq[0] = 0;
587 xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588 } else if (params->r[1] == 0) {
589 xq[0] = xqd[0];
590 xq[1] = 0;
591 } else {
592 xq[0] = xqd[0];
593 xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594 }
595 }
596
// Lookup table indexed by x in [0, 255]; the entries approximate
// 256 * x / (x + 1) and rise from 1 to 256.
const int32_t av1_x_by_xplus1[256] = {
  // Special case: index 0 maps to 1 (corresponding to a value of 1/256)
  // instead of 0. See the comments in calculate_intermediate_result() for
  // the reason.
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  // The final entry (index 255) is 256.
  256,
};
619
620 const int32_t av1_one_by_x[MAX_NELEM] = {
621 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
622 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
623 };
624
calculate_intermediate_result(int32_t * dgd,int width,int height,int dgd_stride,int bit_depth,int sgr_params_idx,int radius_idx,int pass,int32_t * A,int32_t * B)625 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626 int dgd_stride, int bit_depth,
627 int sgr_params_idx, int radius_idx,
628 int pass, int32_t *A, int32_t *B) {
629 const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630 const int r = params->r[radius_idx];
631 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632 const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633 // Adjusting the stride of A and B here appears to avoid bad cache effects,
634 // leading to a significant speed improvement.
635 // We also align the stride to a multiple of 16 bytes, for consistency
636 // with the SIMD version of this function.
637 int buf_stride = ((width_ext + 3) & ~3) + 16;
638 const int step = pass == 0 ? 1 : 2;
639 int i, j;
640
641 assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642 assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643 "Need SGRPROJ_BORDER_* >= r+1");
644
645 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646 width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648 width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651 // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652 // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653 for (i = -1; i < height + 1; i += step) {
654 for (j = -1; j < width + 1; ++j) {
655 const int k = i * buf_stride + j;
656 const int n = (2 * r + 1) * (2 * r + 1);
657
658 // a < 2^16 * n < 2^22 regardless of bit depth
659 uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660 // b < 2^8 * n < 2^14 regardless of bit depth
661 uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662
663 // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664 // and p itself satisfies p < 2^14 * n^2 < 2^26.
665 // This bound on p is due to:
666 // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667 //
668 // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669 // This is an artefact of rounding, and can only happen if all pixels
670 // are (almost) identical, so in this case we saturate to p=0.
671 uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672
673 const uint32_t s = params->s[radius_idx];
674
675 // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676 // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677 // (this holds even after accounting for the rounding in s)
678 const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679
680 // Note: We have to be quite careful about the value of A[k].
681 // This is used as a blend factor between individual pixel values and the
682 // local mean. So it logically has a range of [0, 256], including both
683 // endpoints.
684 //
685 // This is a pain for hardware, as we'd like something which can be stored
686 // in exactly 8 bits.
687 // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688 // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689 // slightly above 2^(8 + bit depth), due to rounding in the value of
690 // av1_one_by_x[25-1].
691 //
692 // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693 // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694 // overflow), without significantly affecting the final result: z == 0
695 // implies that the image is essentially "flat", so the local mean and
696 // individual pixel values are very similar.
697 //
698 // Note that saturating on the other side, ie. requring A[k] <= 255,
699 // would be a bad idea, as that corresponds to the case where the image
700 // is very variable, when we want to preserve the local pixel value as
701 // much as possible.
702 A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
703
704 // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705 // av1_one_by_x[n - 1] = round(2^12 / n)
706 // => the product here is < 2^(20 + bit_depth) <= 2^32,
707 // and B[k] is set to a value < 2^(8 + bit depth)
708 // This holds even with the rounding in av1_one_by_x and in the overall
709 // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710 B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711 (uint32_t)B[k] *
712 (uint32_t)av1_one_by_x[n - 1],
713 SGRPROJ_RECIP_BITS);
714 }
715 }
716 }
717
selfguided_restoration_fast_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)718 static void selfguided_restoration_fast_internal(
719 int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720 int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721 const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722 const int r = params->r[radius_idx];
723 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724 // Adjusting the stride of A and B here appears to avoid bad cache effects,
725 // leading to a significant speed improvement.
726 // We also align the stride to a multiple of 16 bytes, for consistency
727 // with the SIMD version of this function.
728 int buf_stride = ((width_ext + 3) & ~3) + 16;
729 int32_t A_[RESTORATION_PROC_UNIT_PELS];
730 int32_t B_[RESTORATION_PROC_UNIT_PELS];
731 int32_t *A = A_;
732 int32_t *B = B_;
733 int i, j;
734 calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735 sgr_params_idx, radius_idx, 1, A, B);
736 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738
739 // Use the A[] and B[] arrays to calculate the filtered image
740 (void)r;
741 assert(r == 2);
742 for (i = 0; i < height; ++i) {
743 if (!(i & 1)) { // even row
744 for (j = 0; j < width; ++j) {
745 const int k = i * buf_stride + j;
746 const int l = i * dgd_stride + j;
747 const int m = i * dst_stride + j;
748 const int nb = 5;
749 const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751 A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752 5;
753 const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755 B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756 5;
757 const int32_t v = a * dgd[l] + b;
758 dst[m] =
759 ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760 }
761 } else { // odd row
762 for (j = 0; j < width; ++j) {
763 const int k = i * buf_stride + j;
764 const int l = i * dgd_stride + j;
765 const int m = i * dst_stride + j;
766 const int nb = 4;
767 const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768 const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769 const int32_t v = a * dgd[l] + b;
770 dst[m] =
771 ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772 }
773 }
774 }
775 }
776
selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)777 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778 int dgd_stride, int32_t *dst,
779 int dst_stride, int bit_depth,
780 int sgr_params_idx,
781 int radius_idx) {
782 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783 // Adjusting the stride of A and B here appears to avoid bad cache effects,
784 // leading to a significant speed improvement.
785 // We also align the stride to a multiple of 16 bytes, for consistency
786 // with the SIMD version of this function.
787 int buf_stride = ((width_ext + 3) & ~3) + 16;
788 int32_t A_[RESTORATION_PROC_UNIT_PELS];
789 int32_t B_[RESTORATION_PROC_UNIT_PELS];
790 int32_t *A = A_;
791 int32_t *B = B_;
792 int i, j;
793 calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794 sgr_params_idx, radius_idx, 0, A, B);
795 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797
798 // Use the A[] and B[] arrays to calculate the filtered image
799 for (i = 0; i < height; ++i) {
800 for (j = 0; j < width; ++j) {
801 const int k = i * buf_stride + j;
802 const int l = i * dgd_stride + j;
803 const int m = i * dst_stride + j;
804 const int nb = 5;
805 const int32_t a =
806 (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807 4 +
808 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809 A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810 3;
811 const int32_t b =
812 (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813 4 +
814 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815 B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816 3;
817 const int32_t v = a * dgd[l] + b;
818 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819 }
820 }
821 }
822
av1_selfguided_restoration_c(const uint8_t * dgd8,int width,int height,int dgd_stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)823 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824 int dgd_stride, int32_t *flt0, int32_t *flt1,
825 int flt_stride, int sgr_params_idx,
826 int bit_depth, int highbd) {
827 int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828 const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829 int32_t *dgd32 =
830 dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831
832 if (highbd) {
833 const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834 for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835 for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836 dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837 }
838 }
839 } else {
840 for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841 for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842 dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843 }
844 }
845 }
846
847 const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848 // If params->r == 0 we skip the corresponding filter. We only allow one of
849 // the radii to be 0, as having both equal to 0 would be equivalent to
850 // skipping SGR entirely.
851 assert(!(params->r[0] == 0 && params->r[1] == 0));
852
853 if (params->r[0] > 0)
854 selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855 flt0, flt_stride, bit_depth,
856 sgr_params_idx, 0);
857 if (params->r[1] > 0)
858 selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859 flt_stride, bit_depth, sgr_params_idx, 1);
860 return 0;
861 }
862
av1_apply_selfguided_restoration_c(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)863 int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864 int height, int stride, int eps,
865 const int *xqd, uint8_t *dst8,
866 int dst_stride, int32_t *tmpbuf,
867 int bit_depth, int highbd) {
868 int32_t *flt0 = tmpbuf;
869 int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870 assert(width * height <= RESTORATION_UNITPELS_MAX);
871
872 const int ret = av1_selfguided_restoration_c(
873 dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874 if (ret != 0) return ret;
875 const sgr_params_type *const params = &av1_sgr_params[eps];
876 int xq[2];
877 av1_decode_xq(xqd, xq, params);
878 for (int i = 0; i < height; ++i) {
879 for (int j = 0; j < width; ++j) {
880 const int k = i * width + j;
881 uint8_t *dst8ij = dst8 + i * dst_stride + j;
882 const uint8_t *dat8ij = dat8 + i * stride + j;
883
884 const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885 const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886 int32_t v = u << SGRPROJ_PRJ_BITS;
887 // If params->r == 0 then we skipped the filtering in
888 // av1_selfguided_restoration_c, i.e. flt[k] == u
889 if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890 if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891 const int16_t w =
892 (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893
894 const uint16_t out = clip_pixel_highbd(w, bit_depth);
895 if (highbd)
896 *CONVERT_TO_SHORTPTR(dst8ij) = out;
897 else
898 *dst8ij = (uint8_t)out;
899 }
900 }
901 return 0;
902 }
903
sgrproj_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)904 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905 int stripe_width, int stripe_height,
906 int procunit_width, const uint8_t *src,
907 int src_stride, uint8_t *dst, int dst_stride,
908 int32_t *tmpbuf, int bit_depth,
909 struct aom_internal_error_info *error_info) {
910 (void)bit_depth;
911 assert(bit_depth == 8);
912
913 for (int j = 0; j < stripe_width; j += procunit_width) {
914 int w = AOMMIN(procunit_width, stripe_width - j);
915 if (av1_apply_selfguided_restoration(
916 src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917 rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918 0) != 0) {
919 aom_internal_error(
920 error_info, AOM_CODEC_MEM_ERROR,
921 "Error allocating buffer in av1_apply_selfguided_restoration");
922 }
923 }
924 }
925
926 #if CONFIG_AV1_HIGHBITDEPTH
wiener_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)927 static void wiener_filter_stripe_highbd(
928 const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929 int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930 int dst_stride, int32_t *tmpbuf, int bit_depth,
931 struct aom_internal_error_info *error_info) {
932 (void)tmpbuf;
933 (void)error_info;
934 const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935
936 for (int j = 0; j < stripe_width; j += procunit_width) {
937 int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938 const uint8_t *src8_p = src8 + j;
939 uint8_t *dst8_p = dst8 + j;
940 av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941 rui->wiener_info.hfilter, 16,
942 rui->wiener_info.vfilter, 16, w,
943 stripe_height, &conv_params, bit_depth);
944 }
945 }
946
sgrproj_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)947 static void sgrproj_filter_stripe_highbd(
948 const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949 int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950 int dst_stride, int32_t *tmpbuf, int bit_depth,
951 struct aom_internal_error_info *error_info) {
952 for (int j = 0; j < stripe_width; j += procunit_width) {
953 int w = AOMMIN(procunit_width, stripe_width - j);
954 if (av1_apply_selfguided_restoration(
955 src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956 rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957 1) != 0) {
958 aom_internal_error(
959 error_info, AOM_CODEC_MEM_ERROR,
960 "Error allocating buffer in av1_apply_selfguided_restoration");
961 }
962 }
963 }
964 #endif // CONFIG_AV1_HIGHBITDEPTH
965
966 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
967 int stripe_width, int stripe_height,
968 int procunit_width, const uint8_t *src,
969 int src_stride, uint8_t *dst, int dst_stride,
970 int32_t *tmpbuf, int bit_depth,
971 struct aom_internal_error_info *error_info);
972
973 #if CONFIG_AV1_HIGHBITDEPTH
974 #define NUM_STRIPE_FILTERS 4
975 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976 wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977 sgrproj_filter_stripe_highbd
978 };
979 #else
980 #define NUM_STRIPE_FILTERS 2
981 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982 wiener_filter_stripe, sgrproj_filter_stripe
983 };
984 #endif // CONFIG_AV1_HIGHBITDEPTH
985
986 // Filter one restoration unit
av1_loop_restoration_filter_unit(const RestorationTileLimits * limits,const RestorationUnitInfo * rui,const RestorationStripeBoundaries * rsb,RestorationLineBuffers * rlbs,int plane_w,int plane_h,int ss_x,int ss_y,int highbd,int bit_depth,uint8_t * data8,int stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int optimized_lr,struct aom_internal_error_info * error_info)987 void av1_loop_restoration_filter_unit(
988 const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989 const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990 int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991 uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992 int optimized_lr, struct aom_internal_error_info *error_info) {
993 RestorationType unit_rtype = rui->restoration_type;
994
995 int unit_h = limits->v_end - limits->v_start;
996 int unit_w = limits->h_end - limits->h_start;
997 uint8_t *data8_tl =
998 data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999 uint8_t *dst8_tl =
1000 dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001
1002 if (unit_rtype == RESTORE_NONE) {
1003 copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004 highbd);
1005 return;
1006 }
1007
1008 const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009 assert(filter_idx < NUM_STRIPE_FILTERS);
1010 const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011
1012 const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013
1014 // Filter the whole image one stripe at a time
1015 RestorationTileLimits remaining_stripes = *limits;
1016 int i = 0;
1017 while (i < unit_h) {
1018 int copy_above, copy_below;
1019 remaining_stripes.v_start = limits->v_start + i;
1020
1021 get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022 ©_above, ©_below);
1023
1024 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026
1027 // Work out where this stripe's boundaries are within
1028 // rsb->stripe_boundary_{above,below}
1029 const int frame_stripe =
1030 (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031 const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032
1033 // Calculate this stripe's height, based on two rules:
1034 // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035 // * We can't extend past the end of the current restoration unit
1036 const int nominal_stripe_height =
1037 full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038 const int h = AOMMIN(nominal_stripe_height,
1039 remaining_stripes.v_end - remaining_stripes.v_start);
1040
1041 setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042 h, data8, stride, rlbs, copy_above,
1043 copy_below, optimized_lr);
1044
1045 stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046 dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047 error_info);
1048
1049 restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050 data8, stride, copy_above, copy_below,
1051 optimized_lr);
1052
1053 i += h;
1054 }
1055 }
1056
// Restoration-unit visitor used for whole-frame filtering.
//
// `priv` must point at the FilterFrameCtxt of the plane being processed. The
// unit is filtered with av1_loop_restoration_filter_unit() using the plane
// geometry and buffers stored in that context.
static void filter_frame_on_unit(const RestorationTileLimits *limits,
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
                                 RestorationLineBuffers *rlbs,
                                 struct aom_internal_error_info *error_info) {
  FilterFrameCtxt *const plane_ctxt = (FilterFrameCtxt *)priv;
  const RestorationInfo *const plane_rsi = plane_ctxt->rsi;
  const RestorationUnitInfo *const unit = &plane_rsi->unit_info[rest_unit_idx];

  av1_loop_restoration_filter_unit(
      limits, unit, &plane_rsi->boundaries, rlbs, plane_ctxt->plane_w,
      plane_ctxt->plane_h, plane_ctxt->ss_x, plane_ctxt->ss_y,
      plane_ctxt->highbd, plane_ctxt->bit_depth, plane_ctxt->data8,
      plane_ctxt->data_stride, plane_ctxt->dst8, plane_ctxt->dst_stride, tmpbuf,
      plane_rsi->optimized_lr, error_info);
}
1070
// Prepares lr_ctxt for filtering `frame`.
//
// (Re)allocates cm->rst_frame as the destination buffer, raising
// AOM_CODEC_MEM_ERROR through aom_internal_error() on failure, and installs
// filter_frame_on_unit as the unit visitor. For every plane it records
// optimized_lr in the plane's RestorationInfo and links that info into the
// plane context. Planes whose frame restoration type is not RESTORE_NONE
// additionally get their borders extended and their geometry and buffer
// pointers stored in the plane context.
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                            YV12_BUFFER_CONFIG *frame,
                                            AV1_COMMON *cm, int optimized_lr,
                                            int num_planes) {
  const SequenceHeader *const seq_params = cm->seq_params;
  const int highbd = seq_params->use_highbitdepth;
  const int bit_depth = seq_params->bit_depth;

  lr_ctxt->dst = &cm->rst_frame;
  const int alloc_status = aom_realloc_frame_buffer(
      lr_ctxt->dst, frame->crop_widths[0], frame->crop_heights[0],
      seq_params->subsampling_x, seq_params->subsampling_y, highbd,
      AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL,
      NULL, false, 0);
  if (alloc_status != AOM_CODEC_OK) {
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate restoration dst buffer");
  }

  lr_ctxt->on_rest_unit = filter_frame_on_unit;
  lr_ctxt->frame = frame;

  for (int plane = 0; plane < num_planes; ++plane) {
    RestorationInfo *const rsi = &cm->rst_info[plane];
    FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];

    // These two fields are set even for planes that are not restored.
    rsi->optimized_lr = optimized_lr;
    plane_ctxt->rsi = rsi;
    if (rsi->frame_restoration_type == RESTORE_NONE) continue;

    const int is_uv = plane > 0;
    int plane_w, plane_h;
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
    assert(plane_w == frame->crop_widths[is_uv]);
    assert(plane_h == frame->crop_heights[is_uv]);

    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
                     frame->strides[is_uv], RESTORATION_BORDER,
                     RESTORATION_BORDER, highbd);

    plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
    plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
    plane_ctxt->plane_w = plane_w;
    plane_ctxt->plane_h = plane_h;
    plane_ctxt->highbd = highbd;
    plane_ctxt->bit_depth = bit_depth;
    plane_ctxt->data8 = frame->buffers[plane];
    plane_ctxt->data_stride = frame->strides[is_uv];
    plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
    plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
  }
}
1125
// Copies the restored planes from loop_rest_ctxt->dst back into
// loop_rest_ctxt->frame. Each copy covers the full plane_w x plane_h area
// stored in the plane context. Planes whose frame restoration type is
// RESTORE_NONE are left untouched. At most three planes are supported.
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                      AV1_COMMON *cm, int num_planes) {
  typedef void (*plane_copy_fn)(const YV12_BUFFER_CONFIG *src_ybc,
                                YV12_BUFFER_CONFIG *dst_ybc, int hstart,
                                int hend, int vstart, int vend);
  static const plane_copy_fn plane_copiers[3] = {
    aom_yv12_partial_coloc_copy_y,
    aom_yv12_partial_coloc_copy_u,
    aom_yv12_partial_coloc_copy_v,
  };
  assert(num_planes <= 3);

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      const FilterFrameCtxt *const plane_ctxt = &loop_rest_ctxt->ctxt[plane];
      plane_copiers[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
                           plane_ctxt->plane_w, 0, plane_ctxt->plane_h);
    }
  }
}
1142
1143 // Call on_rest_unit for each loop restoration unit in the plane.
foreach_rest_unit_in_plane(const struct AV1Common * cm,int plane,rest_unit_visitor_t on_rest_unit,void * priv,int32_t * tmpbuf,RestorationLineBuffers * rlbs)1144 static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145 rest_unit_visitor_t on_rest_unit,
1146 void *priv, int32_t *tmpbuf,
1147 RestorationLineBuffers *rlbs) {
1148 const RestorationInfo *rsi = &cm->rst_info[plane];
1149 const int hnum_rest_units = rsi->horz_units;
1150 const int vnum_rest_units = rsi->vert_units;
1151 const int unit_size = rsi->restoration_unit_size;
1152
1153 const int is_uv = plane > 0;
1154 const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155 const int ext_size = unit_size * 3 / 2;
1156 int plane_w, plane_h;
1157 av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158
1159 int y0 = 0, i = 0;
1160 while (y0 < plane_h) {
1161 int remaining_h = plane_h - y0;
1162 int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163
1164 RestorationTileLimits limits;
1165 limits.v_start = y0;
1166 limits.v_end = y0 + h;
1167 assert(limits.v_end <= plane_h);
1168 // Offset upwards to align with the restoration processing stripe
1169 const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170 limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171 if (limits.v_end < plane_h) limits.v_end -= voffset;
1172
1173 av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174 hnum_rest_units, vnum_rest_units, plane, priv,
1175 tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176 av1_lr_sync_write_dummy, NULL, cm->error);
1177
1178 y0 += h;
1179 ++i;
1180 }
1181 }
1182
// Visits every restoration unit of every plane whose frame restoration type
// is not RESTORE_NONE. The visitor is lr_ctxt->on_rest_unit, with the
// plane's FilterFrameCtxt as private data and cm's shared temporary buffer
// and line buffers.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
      foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, plane_ctxt,
                                 cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1196
// Applies loop restoration to a whole frame in three steps: initialise the
// context, filter every restoration unit into the destination buffer, then
// copy the restored planes back into `frame`. lr_ctxt must point at an
// AV1LrStruct. Must not be called when all segments are lossless.
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                       AV1_COMMON *cm, int optimized_lr,
                                       void *lr_ctxt) {
  assert(!cm->features.all_lossless);

  AV1LrStruct *const lr = (AV1LrStruct *)lr_ctxt;
  const int num_planes = av1_num_planes(cm);

  av1_loop_restoration_filter_frame_init(lr, frame, cm, optimized_lr,
                                         num_planes);
  foreach_rest_unit_in_planes(lr, cm, num_planes);
  av1_loop_restoration_copy_planes(lr, cm, num_planes);
}
1212
// Calls on_rest_unit for each restoration unit in one row of units.
//
// limits->v_start / v_end must already describe the row; h_start / h_end are
// filled in here for each unit. A unit is normally unit_size wide, but when
// fewer than 1.5 * unit_size columns remain the last unit takes all of them.
// on_sync_read is invoked before each unit and on_sync_write after it. With
// CONFIG_MULTITHREAD and more than one worker, the function returns early
// once lr_sync->lr_mt_exit is set.
void av1_foreach_rest_unit_in_row(
    RestorationTileLimits *limits, int plane_w,
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
    struct aom_internal_error_info *error_info) {
  const int ext_size = unit_size * 3 / 2;
  const int row_base_idx = row_number * hnum_rest_units;
  const int has_row_below = (row_number + 1) < vnum_rest_units;

  for (int x0 = 0, unit_col = 0, w; x0 < plane_w; x0 += w, ++unit_col) {
    const int remaining_w = plane_w - x0;
    w = (remaining_w < ext_size) ? remaining_w : unit_size;

    limits->h_start = x0;
    limits->h_end = x0 + w;
    assert(limits->h_end <= plane_w);

    // No sync for even numbered rows
    // For odd numbered rows, Loop Restoration of current block requires the LR
    // of top-right and bottom-right blocks to be completed

    // top-right sync
    on_sync_read(lr_sync, row_number, unit_col, plane);
    if (has_row_below) {
      // bottom-right sync
      on_sync_read(lr_sync, row_number + 2, unit_col, plane);
    }

#if CONFIG_MULTITHREAD
    if (lr_sync && lr_sync->num_workers > 1) {
      pthread_mutex_lock(lr_sync->job_mutex);
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
      pthread_mutex_unlock(lr_sync->job_mutex);
      // Exit in case any worker has encountered an error.
      if (lr_mt_exit) return;
    }
#endif

    on_rest_unit(limits, row_base_idx + unit_col, priv, tmpbuf, rlbs,
                 error_info);

    on_sync_write(lr_sync, row_number, unit_col, hnum_rest_units, plane);
  }
}
1260
// No-op read-sync callback. Every argument is ignored.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1267
// No-op write-sync callback. Every argument is ignored.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1276
// Computes the half-open range of restoration units,
// [*rcol0, *rcol1) x [*rrow0, *rrow1), associated with the superblock at
// (mi_row, mi_col) on the given plane.
//
// Returns 0 without writing the outputs when bsize is not the sequence's
// superblock size. Otherwise writes all four outputs and returns non-zero
// iff the range is non-empty in both directions.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const int is_uv = plane > 0;
  const RestorationInfo *const rsi = &cm->rst_info[plane];
  const int unit_size = rsi->restoration_unit_size;

  // Compute the mi-unit corners of the superblock
  const int mi_row_end = mi_row + mi_size_high[bsize];
  const int mi_col_end = mi_col + mi_size_wide[bsize];

  // The size of an MI-unit on this plane of the image
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int mi_size_x = MI_SIZE >> ss_x;
  const int mi_size_y = MI_SIZE >> ss_y;

  // Write m for the relative mi column or row, D for the superres denominator
  // and N for the superres numerator. If u is the upscaled pixel offset then
  // we can write the downscaled pixel offset in two ways as:
  //
  //   MI_SIZE * m = N / D u
  //
  // from which we get u = D * MI_SIZE * m / N
  const int superres = av1_superres_scaled(cm);
  const int num_x =
      superres ? mi_size_x * cm->superres_scale_denominator : mi_size_x;
  const int den_x = superres ? unit_size * SCALE_NUMERATOR : unit_size;
  const int num_y = mi_size_y;
  const int den_y = unit_size;

  // Adding (denominator - 1) before dividing rounds the quotient up.
  const int rnd_x = den_x - 1;
  const int rnd_y = den_y - 1;

  // rcol0/rrow0 should be the first column/row of restoration units that
  // doesn't start left/below of mi_col/mi_row. For this calculation, we need
  // to round up the division (if the sb starts at runit column 10.1, the first
  // matching runit has column index 11)
  const int first_col = (mi_col * num_x + rnd_x) / den_x;
  const int first_row = (mi_row * num_y + rnd_y) / den_y;

  // The end values are the equivalent calculation, but for the superblock
  // below-right. If we're at the bottom or right of the frame, this restoration
  // unit might not exist, in which case we'll clamp accordingly.
  const int end_col = (mi_col_end * num_x + rnd_x) / den_x;
  const int end_row = (mi_row_end * num_y + rnd_y) / den_y;

  *rcol0 = first_col;
  *rrow0 = first_row;
  *rcol1 = AOMMIN(end_col, rsi->horz_units);
  *rrow1 = AOMMIN(end_row, rsi->vert_units);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338
// Extend to left and right
//
// For each of the `height` rows starting at buf, the row's first sample is
// replicated into the `extend` samples before it, and its last sample
// (index width - 1) into the `extend` samples after it. `stride` is added to
// the byte pointer between rows. Samples are 16 bits wide when
// use_highbitdepth is non-zero and 8 bits wide otherwise.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;
  for (int remaining = height; remaining > 0; --remaining, row += stride) {
    if (!use_highbitdepth) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
      continue;
    }
    uint16_t *const row16 = (uint16_t *)row;
    aom_memset16(row16 - extend, row16[0], extend);
    aom_memset16(row16 + width, row16[width - 1], extend);
  }
}
1354
// Saves up to RESTORATION_CTX_VERT rows of `frame`, starting at `row`, into
// the boundary buffer slot for `stripe`. The "above" buffer is used when
// is_above is non-zero, the "below" buffer otherwise.
//
// With superres active the rows are upscaled while being saved; otherwise
// they are copied as they are. If only one row is available it is duplicated
// into the second slot. The saved rows are then extended horizontally by
// RESTORATION_EXTRA_HORZ samples on each side.
static void save_deblock_boundary_lines(
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
    int stripe, int use_highbd, int is_above,
    RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source rows. Byte strides double for 16-bit samples.
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *const src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *const src_rows = src_buf + row * (ptrdiff_t)src_stride;

  // Destination rows inside the selected boundary buffer.
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *const bdry_buf = is_above ? boundaries->stripe_boundary_above
                                     : boundaries->stripe_boundary_below;
  uint8_t *const bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  uint8_t *const bdry_rows =
      bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;

  // There is a rare case in which a processing stripe can end 1px above the
  // crop border. In this case, we do want to use deblocked pixels from below
  // the stripe (hence why we ended up in this function), but instead of
  // fetching 2 "below" rows we need to fetch one and duplicate it.
  // This is equivalent to clamping the sample locations against the crop border
  const int lines_to_save =
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
  assert(lines_to_save == 1 || lines_to_save == 2);

  const int superres = av1_superres_scaled(cm);
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  const int upscaled_width =
      superres ? ((cm->superres_upscaled_width + ss_x) >> ss_x)
               : frame->crop_widths[is_uv];
  const int line_bytes = upscaled_width << use_highbd;

  if (!superres) {
    for (int line = 0; line < lines_to_save; ++line) {
      memcpy(bdry_rows + line * bdry_stride, src_rows + line * src_stride,
             line_bytes);
    }
  } else if (use_highbd) {
    av1_upscale_normative_rows(
        cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
        CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
        plane, lines_to_save);
  } else {
    av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
                               boundaries->stripe_boundary_stride, plane,
                               lines_to_save);
  }

  // If we only saved one line, then copy it into the second line buffer
  if (lines_to_save == 1) {
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1409
// Fills every context line of the boundary buffer slot for `stripe` with a
// copy of the single frame row at `row`. The "above" buffer is used when
// is_above is non-zero, the "below" buffer otherwise. The saved lines are
// then extended horizontally by RESTORATION_EXTRA_HORZ samples on each side.
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                     const AV1_COMMON *cm, int plane, int row,
                                     int stripe, int use_highbd, int is_above,
                                     RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source row. Byte strides double for 16-bit samples.
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *const src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *const src_rows = src_buf + row * (ptrdiff_t)src_stride;

  // Destination rows inside the selected boundary buffer.
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *const bdry_buf = is_above ? boundaries->stripe_boundary_above
                                     : boundaries->stripe_boundary_below;
  uint8_t *const bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  uint8_t *const bdry_rows =
      bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;

  // At the point where this function is called, we've already applied
  // superres. So we don't need to extend the lines here, we can just
  // pull directly from the topmost row of the upscaled frame.
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  int upscaled_width = frame->crop_widths[is_uv];
  if (av1_superres_scaled(cm)) {
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  }
  const int line_bytes = upscaled_width << use_highbd;

  // Copy the line at 'src_rows' into both context lines
  uint8_t *line = bdry_rows;
  for (int n = 0; n < RESTORATION_CTX_VERT; ++n, line += bdry_stride) {
    memcpy(line, src_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1441
// Saves the boundary context rows for every processing stripe of one plane.
//
// When after_cdef is zero, only internal stripe edges are saved, using
// save_deblock_boundary_lines(). When after_cdef is non-zero, only the edges
// at the top and bottom of the frame are saved, using
// save_cdef_boundary_lines().
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
  RestorationStripeBoundaries *const boundaries =
      &cm->rst_info[plane].boundaries;

  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  for (int stripe_idx = 0;; ++stripe_idx) {
    // Stripe rows are [y0, y1), clamped to the plane.
    const int y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
    if (y0 >= plane_h) break;
    const int y1 =
        AOMMIN((stripe_idx + 1) * stripe_height - stripe_off, plane_h);

    // Extend using CDEF pixels at the top and bottom of the frame,
    // and deblocked pixels at internal stripe boundaries
    const int top_is_internal = (stripe_idx > 0);
    const int bottom_is_internal = (y1 < plane_height);

    if (after_cdef) {
      // Save CDEF context at frame boundaries
      if (!top_is_internal) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (!bottom_is_internal) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
    } else {
      // Save deblocked context at internal stripe boundaries
      if (top_is_internal) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe_idx, use_highbd, 1, boundaries);
      }
      if (bottom_is_internal) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                    use_highbd, 0, boundaries);
      }
    }
  }
}
1493
1494 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495 // lines to be used as boundary in the loop restoration process. The
1496 // lines are saved in rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm,int after_cdef)1497 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498 AV1_COMMON *cm, int after_cdef) {
1499 const int num_planes = av1_num_planes(cm);
1500 const int use_highbd = cm->seq_params->use_highbitdepth;
1501 for (int p = 0; p < num_planes; ++p) {
1502 save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503 }
1504 }
1505