xref: /aosp_15_r20/external/libaom/av1/common/restoration.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  *
11  */
12 
13 #include <math.h>
14 #include <stddef.h>
15 
16 #include "config/aom_config.h"
17 #include "config/aom_scale_rtcd.h"
18 
19 #include "aom/internal/aom_codec_internal.h"
20 #include "aom_mem/aom_mem.h"
21 #include "aom_dsp/aom_dsp_common.h"
22 #include "aom_mem/aom_mem.h"
23 #include "aom_ports/mem.h"
24 #include "aom_util/aom_pthread.h"
25 
26 #include "av1/common/av1_common_int.h"
27 #include "av1/common/convolve.h"
28 #include "av1/common/enums.h"
29 #include "av1/common/resize.h"
30 #include "av1/common/restoration.h"
31 #include "av1/common/thread_common.h"
32 
33 // The 's' values are calculated based on original 'r' and 'e' values in the
34 // spec using GenSgrprojVtable().
35 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
36 const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
37   { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
38   { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
39   { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
40   { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
41   { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
42   { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
43   { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
44   { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
45 };
46 
av1_get_upsampled_plane_size(const AV1_COMMON * cm,int is_uv,int * plane_w,int * plane_h)47 void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
48                                   int *plane_h) {
49   int ss_x = is_uv && cm->seq_params->subsampling_x;
50   int ss_y = is_uv && cm->seq_params->subsampling_y;
51   *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52   *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
53 }
54 
// Returns the number of restoration units along one axis of a plane: pass a
// width as plane_size to count horizontal units, or a height to count
// vertical units.
//
// The plane size is divided by the unit size, rounding to nearest rather than
// up. This models the way the right-most or bottom-most restoration unit may
// stretch to as much as 150% of its nominal width or height.
//
// The result is never less than 1, so frames smaller than half a restoration
// unit still get a single unit.
int av1_lr_count_units(int unit_size, int plane_size) {
  const int half_unit = unit_size >> 1;
  const int nearest_count = (plane_size + half_unit) / unit_size;
  return nearest_count > 1 ? nearest_count : 1;
}
66 
// Sizes and (re)allocates the per-unit restoration info array of one plane.
//
// The plane dimensions come from av1_get_upsampled_plane_size() and the unit
// counts from av1_lr_count_units() with rsi->restoration_unit_size. The
// counts are stored in rsi (horz_units, vert_units, num_rest_units), any
// previously held rsi->unit_info array is freed, and a new 16-byte aligned
// array with one entry per unit is requested through CHECK_MEM_ERROR.
// NOTE(review): allocation-failure handling is whatever CHECK_MEM_ERROR does
// with cm; its definition is not visible here.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  const int unit_size = rsi->restoration_unit_size;
  const int units_across = av1_lr_count_units(unit_size, plane_w);
  const int units_down = av1_lr_count_units(unit_size, plane_h);

  rsi->horz_units = units_across;
  rsi->vert_units = units_down;
  rsi->num_rest_units = units_across * units_down;

  // Drop the old array before replacing it with one of the new size.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
}
85 
// Releases rst_info->unit_info and leaves the field set to NULL so that a
// later free or reallocation does not see a stale pointer.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  RestorationUnitInfo *const old_units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(old_units);
}
90 
#if 0
// Disabled table generator, kept for reference only (never compiled).
//
// One pair of values per sgrproj parameter set:
//   index 0 corresponds to r[0], e[0]
//   index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

static void GenSgrprojVtable(void) {
  for (int set = 0; set < SGRPROJ_PARAMS; ++set) {
    const sgr_params_type *const params = &av1_sgr_params[set];
    for (int pass = 0; pass < 2; ++pass) {
      const int e = params->e[pass];
      const int r = params->r[pass];
      if (r == 0) {
        // Filter pass is disabled: mark the entry invalid.
        sgrproj_mtable[set][pass] = -1;
        continue;
      }
      // Filter pass is enabled: n is the number of pixels in the
      // (2r + 1) x (2r + 1) window.
      const int n = (2 * r + 1) * (2 * r + 1);
      const int n2e = n * n * e;
      assert(n2e != 0);
      // 2^SGRPROJ_MTABLE_BITS / n2e, rounded to nearest.
      sgrproj_mtable[set][pass] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
  }
}
#endif
115 
// Loop-restoration precalculation hook. It currently does nothing: the only
// work it would do, regenerating the sgrproj table, is compiled out.
void av1_loop_restoration_precal(void) {
#if 0
  // Disabled: see the (also disabled) GenSgrprojVtable() generator.
  GenSgrprojVtable();
#endif
}
121 
// Pads an 8-bit plane by replicating its edge pixels.
//
// data points at the top-left pixel of a width x height region whose rows are
// stride bytes apart. Each row is extended by border_horz pixels on both
// sides using its first/last pixel; the (already widened) first and last rows
// are then replicated border_vert times above and below the region. The
// caller must provide writable memory for the whole border.
static void extend_frame_lowbd(uint8_t *data, int width, int height,
                               ptrdiff_t stride, int border_horz,
                               int border_vert) {
  // Left and right borders, one row at a time.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Top and bottom borders: copy the widened first and last rows outwards.
  uint8_t *const wide_top = data - border_horz;
  const int wide_width = width + 2 * border_horz;
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(wide_top - k * stride, wide_top, wide_width);
    memcpy(wide_top + (height - 1 + k) * stride,
           wide_top + (height - 1) * stride, wide_width);
  }
}
141 
142 #if CONFIG_AV1_HIGHBITDEPTH
// 16-bit counterpart of extend_frame_lowbd(): pads a plane of uint16_t
// samples by replicating its edge pixels.
//
// data points at the top-left sample of a width x height region; stride is
// measured in samples. Rows are widened by border_horz samples on each side,
// then the widened first and last rows are replicated border_vert times
// above and below. The caller must provide writable memory for the border.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                ptrdiff_t stride, int border_horz,
                                int border_vert) {
  // Left and right borders, one row at a time.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int x = -border_horz; x < 0; ++x) line[x] = line[0];
    for (int x = 0; x < border_horz; ++x) line[width + x] = line[width - 1];
  }

  // Top and bottom borders: copy the widened first and last rows outwards.
  uint16_t *const wide_top = data - border_horz;
  const size_t wide_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  for (int k = 1; k <= border_vert; ++k) {
    memcpy(wide_top - k * stride, wide_top, wide_bytes);
    memcpy(wide_top + (height - 1 + k) * stride,
           wide_top + (height - 1) * stride, wide_bytes);
  }
}
163 
// Copies a width x height rectangle of 16-bit samples from src to dst.
// Both strides are measured in samples; the rectangles must not overlap.
static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
                                  int src_stride, uint16_t *dst,
                                  int dst_stride) {
  const size_t row_bytes = width * sizeof(*dst);
  for (int row = 0; row < height; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memcpy(to, from, row_bytes);
  }
}
170 #endif
171 
// Pads a plane by replicating its edge pixels border_horz pixels to the left
// and right and border_vert rows above and below.
//
// When high bit depth support is compiled in and highbd is non-zero, data is
// passed through CONVERT_TO_SHORTPTR() and handled as 16-bit samples;
// otherwise it is handled as 8-bit samples.
void av1_extend_frame(uint8_t *data, int width, int height, int stride,
                      int border_horz, int border_vert, int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (!highbd) {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
  } else {
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
  }
#else
  (void)highbd;
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
#endif
}
184 
// Copies a width x height rectangle of 8-bit pixels from src to dst.
// Both strides are measured in bytes; the rectangles must not overlap.
static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memcpy(to, from, width);
  }
}
190 
// Copies a width x height rectangle from src to dst without filtering.
//
// When high bit depth support is compiled in and highbd is non-zero, both
// pointers are passed through CONVERT_TO_SHORTPTR() and the data is copied as
// 16-bit samples; otherwise it is copied as 8-bit pixels.
static void copy_rest_unit(int width, int height, const uint8_t *src,
                           int src_stride, uint8_t *dst, int dst_stride,
                           int highbd) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (!highbd) {
    copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
  } else {
    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                          CONVERT_TO_SHORTPTR(dst), dst_stride);
  }
#else
  (void)highbd;
  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
#endif
}
204 
// Yields the uint8_t pointer to hand to memcpy() for pixel pointer d: when hbd
// is non-zero, d is first translated with CONVERT_TO_SHORTPTR() and the result
// viewed as bytes; otherwise d is used unchanged.
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
206 
207 // With striped loop restoration, the filtering for each 64-pixel stripe gets
208 // most of its input from the output of CDEF (stored in data8), but we need to
209 // fill out a border of 3 pixels above/below the stripe according to the
210 // following rules:
211 //
212 // * At the top and bottom of the frame, we copy the outermost row of CDEF
213 //   pixels three times. This extension is done by a call to av1_extend_frame()
214 //   at the start of the loop restoration process, so the value of
215 //   copy_above/copy_below doesn't strictly matter.
216 //
217 // * All other boundaries are stripe boundaries within the frame. In that case,
218 //   we take 2 rows of deblocked pixels and extend them to 3 rows of context.
get_stripe_boundary_info(const RestorationTileLimits * limits,int plane_w,int plane_h,int ss_y,int * copy_above,int * copy_below)219 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
220                                      int plane_w, int plane_h, int ss_y,
221                                      int *copy_above, int *copy_below) {
222   (void)plane_w;
223 
224   *copy_above = 1;
225   *copy_below = 1;
226 
227   const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
228   const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
229 
230   const int first_stripe_in_plane = (limits->v_start == 0);
231   const int this_stripe_height =
232       full_stripe_height - (first_stripe_in_plane ? runit_offset : 0);
233   const int last_stripe_in_plane =
234       (limits->v_start + this_stripe_height >= plane_h);
235 
236   if (first_stripe_in_plane) *copy_above = 0;
237   if (last_stripe_in_plane) *copy_below = 0;
238 }
239 
240 // Overwrite the border pixels around a processing stripe so that the conditions
241 // listed above get_stripe_boundary_info() are preserved.
242 // We save the pixels which get overwritten into a temporary buffer, so that
243 // they can be restored by restore_processing_stripe_boundary() after we've
244 // processed the stripe.
245 //
246 // limits gives the rectangular limits of the remaining stripes for the current
247 // restoration unit. rsb is the stored stripe boundaries (taken from either
248 // deblock or CDEF output as necessary).
setup_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationStripeBoundaries * rsb,int rsb_row,int use_highbd,int h,uint8_t * data8,int data_stride,RestorationLineBuffers * rlbs,int copy_above,int copy_below,int opt)249 static void setup_processing_stripe_boundary(
250     const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
251     int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
252     RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
253   // Offsets within the line buffers. The buffer logically starts at column
254   // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
255   // has column x0 in the buffer.
256   const int buf_stride = rsb->stripe_boundary_stride;
257   const int buf_x0_off = limits->h_start;
258   const int line_width =
259       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
260   const int line_size = line_width << use_highbd;
261 
262   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
263 
264   // Replace RESTORATION_BORDER pixels above the top of the stripe
265   // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
266   // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
267   // duplicating the topmost of the 2 lines (see the AOMMAX call when
268   // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
269   if (!opt) {
270     if (copy_above) {
271       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
272 
273       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
274         const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
275         const int buf_off = buf_x0_off + buf_row * buf_stride;
276         const uint8_t *buf =
277             rsb->stripe_boundary_above + (buf_off << use_highbd);
278         uint8_t *dst8 = data8_tl + i * data_stride;
279         // Save old pixels, then replace with data from stripe_boundary_above
280         memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
281                REAL_PTR(use_highbd, dst8), line_size);
282         memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
283       }
284     }
285 
286     // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
287     // The second buffer row is repeated, so src_row gets the values 0, 1, 1
288     // for i = 0, 1, 2.
289     if (copy_below) {
290       const int stripe_end = limits->v_start + h;
291       uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
292 
293       for (int i = 0; i < RESTORATION_BORDER; ++i) {
294         const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
295         const int buf_off = buf_x0_off + buf_row * buf_stride;
296         const uint8_t *src =
297             rsb->stripe_boundary_below + (buf_off << use_highbd);
298 
299         uint8_t *dst8 = data8_bl + i * data_stride;
300         // Save old pixels, then replace with data from stripe_boundary_below
301         memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
302         memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
303       }
304     }
305   } else {
306     if (copy_above) {
307       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
308 
309       // Only save and overwrite i=-RESTORATION_BORDER line.
310       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
311       // Save old pixels, then replace with data from stripe_boundary_above
312       memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
313       memcpy(REAL_PTR(use_highbd, dst8),
314              REAL_PTR(use_highbd,
315                       data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
316              line_size);
317     }
318 
319     if (copy_below) {
320       const int stripe_end = limits->v_start + h;
321       uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
322 
323       // Only save and overwrite i=2 line.
324       uint8_t *dst8 = data8_bl + 2 * data_stride;
325       // Save old pixels, then replace with data from stripe_boundary_below
326       memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
327       memcpy(REAL_PTR(use_highbd, dst8),
328              REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
329     }
330   }
331 }
332 
333 // Once a processing stripe is finished, this function sets the boundary
334 // pixels which were overwritten by setup_processing_stripe_boundary()
335 // back to their original values
restore_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationLineBuffers * rlbs,int use_highbd,int h,uint8_t * data8,int data_stride,int copy_above,int copy_below,int opt)336 static void restore_processing_stripe_boundary(
337     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
338     int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
339     int copy_below, int opt) {
340   const int line_width =
341       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
342   const int line_size = line_width << use_highbd;
343 
344   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
345 
346   if (!opt) {
347     if (copy_above) {
348       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
349       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
350         uint8_t *dst8 = data8_tl + i * data_stride;
351         memcpy(REAL_PTR(use_highbd, dst8),
352                rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
353       }
354     }
355 
356     if (copy_below) {
357       const int stripe_bottom = limits->v_start + h;
358       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
359 
360       for (int i = 0; i < RESTORATION_BORDER; ++i) {
361         if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
362 
363         uint8_t *dst8 = data8_bl + i * data_stride;
364         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
365       }
366     }
367   } else {
368     if (copy_above) {
369       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
370 
371       // Only restore i=-RESTORATION_BORDER line.
372       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
373       memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
374     }
375 
376     if (copy_below) {
377       const int stripe_bottom = limits->v_start + h;
378       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
379 
380       // Only restore i=2 line.
381       if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
382         uint8_t *dst8 = data8_bl + 2 * data_stride;
383         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
384       }
385     }
386   }
387 }
388 
wiener_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)389 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
390                                  int stripe_width, int stripe_height,
391                                  int procunit_width, const uint8_t *src,
392                                  int src_stride, uint8_t *dst, int dst_stride,
393                                  int32_t *tmpbuf, int bit_depth,
394                                  struct aom_internal_error_info *error_info) {
395   (void)tmpbuf;
396   (void)bit_depth;
397   (void)error_info;
398   assert(bit_depth == 8);
399   const WienerConvolveParams conv_params = get_conv_params_wiener(8);
400 
401   for (int j = 0; j < stripe_width; j += procunit_width) {
402     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
403     const uint8_t *src_p = src + j;
404     uint8_t *dst_p = dst + j;
405     av1_wiener_convolve_add_src(
406         src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
407         rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
408   }
409 }
410 
411 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
412    over the input. The window is of size (2r + 1)x(2r + 1), and we
413    specialize to r = 1, 2, 3. A default function is used for r > 3.
414 
415    Each loop follows the same format: We keep a window's worth of input
416    in individual variables and select data out of that as appropriate.
417 */
boxsum1(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)418 static void boxsum1(int32_t *src, int width, int height, int src_stride,
419                     int sqr, int32_t *dst, int dst_stride) {
420   int i, j, a, b, c;
421   assert(width > 2 * SGRPROJ_BORDER_HORZ);
422   assert(height > 2 * SGRPROJ_BORDER_VERT);
423 
424   // Vertical sum over 3-pixel regions, from src into dst.
425   if (!sqr) {
426     for (j = 0; j < width; ++j) {
427       a = src[j];
428       b = src[src_stride + j];
429       c = src[2 * src_stride + j];
430 
431       dst[j] = a + b;
432       for (i = 1; i < height - 2; ++i) {
433         // Loop invariant: At the start of each iteration,
434         // a = src[(i - 1) * src_stride + j]
435         // b = src[(i    ) * src_stride + j]
436         // c = src[(i + 1) * src_stride + j]
437         dst[i * dst_stride + j] = a + b + c;
438         a = b;
439         b = c;
440         c = src[(i + 2) * src_stride + j];
441       }
442       dst[i * dst_stride + j] = a + b + c;
443       dst[(i + 1) * dst_stride + j] = b + c;
444     }
445   } else {
446     for (j = 0; j < width; ++j) {
447       a = src[j] * src[j];
448       b = src[src_stride + j] * src[src_stride + j];
449       c = src[2 * src_stride + j] * src[2 * src_stride + j];
450 
451       dst[j] = a + b;
452       for (i = 1; i < height - 2; ++i) {
453         dst[i * dst_stride + j] = a + b + c;
454         a = b;
455         b = c;
456         c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
457       }
458       dst[i * dst_stride + j] = a + b + c;
459       dst[(i + 1) * dst_stride + j] = b + c;
460     }
461   }
462 
463   // Horizontal sum over 3-pixel regions of dst
464   for (i = 0; i < height; ++i) {
465     a = dst[i * dst_stride];
466     b = dst[i * dst_stride + 1];
467     c = dst[i * dst_stride + 2];
468 
469     dst[i * dst_stride] = a + b;
470     for (j = 1; j < width - 2; ++j) {
471       // Loop invariant: At the start of each iteration,
472       // a = src[i * src_stride + (j - 1)]
473       // b = src[i * src_stride + (j    )]
474       // c = src[i * src_stride + (j + 1)]
475       dst[i * dst_stride + j] = a + b + c;
476       a = b;
477       b = c;
478       c = dst[i * dst_stride + (j + 2)];
479     }
480     dst[i * dst_stride + j] = a + b + c;
481     dst[i * dst_stride + (j + 1)] = b + c;
482   }
483 }
484 
boxsum2(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)485 static void boxsum2(int32_t *src, int width, int height, int src_stride,
486                     int sqr, int32_t *dst, int dst_stride) {
487   int i, j, a, b, c, d, e;
488   assert(width > 2 * SGRPROJ_BORDER_HORZ);
489   assert(height > 2 * SGRPROJ_BORDER_VERT);
490 
491   // Vertical sum over 5-pixel regions, from src into dst.
492   if (!sqr) {
493     for (j = 0; j < width; ++j) {
494       a = src[j];
495       b = src[src_stride + j];
496       c = src[2 * src_stride + j];
497       d = src[3 * src_stride + j];
498       e = src[4 * src_stride + j];
499 
500       dst[j] = a + b + c;
501       dst[dst_stride + j] = a + b + c + d;
502       for (i = 2; i < height - 3; ++i) {
503         // Loop invariant: At the start of each iteration,
504         // a = src[(i - 2) * src_stride + j]
505         // b = src[(i - 1) * src_stride + j]
506         // c = src[(i    ) * src_stride + j]
507         // d = src[(i + 1) * src_stride + j]
508         // e = src[(i + 2) * src_stride + j]
509         dst[i * dst_stride + j] = a + b + c + d + e;
510         a = b;
511         b = c;
512         c = d;
513         d = e;
514         e = src[(i + 3) * src_stride + j];
515       }
516       dst[i * dst_stride + j] = a + b + c + d + e;
517       dst[(i + 1) * dst_stride + j] = b + c + d + e;
518       dst[(i + 2) * dst_stride + j] = c + d + e;
519     }
520   } else {
521     for (j = 0; j < width; ++j) {
522       a = src[j] * src[j];
523       b = src[src_stride + j] * src[src_stride + j];
524       c = src[2 * src_stride + j] * src[2 * src_stride + j];
525       d = src[3 * src_stride + j] * src[3 * src_stride + j];
526       e = src[4 * src_stride + j] * src[4 * src_stride + j];
527 
528       dst[j] = a + b + c;
529       dst[dst_stride + j] = a + b + c + d;
530       for (i = 2; i < height - 3; ++i) {
531         dst[i * dst_stride + j] = a + b + c + d + e;
532         a = b;
533         b = c;
534         c = d;
535         d = e;
536         e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
537       }
538       dst[i * dst_stride + j] = a + b + c + d + e;
539       dst[(i + 1) * dst_stride + j] = b + c + d + e;
540       dst[(i + 2) * dst_stride + j] = c + d + e;
541     }
542   }
543 
544   // Horizontal sum over 5-pixel regions of dst
545   for (i = 0; i < height; ++i) {
546     a = dst[i * dst_stride];
547     b = dst[i * dst_stride + 1];
548     c = dst[i * dst_stride + 2];
549     d = dst[i * dst_stride + 3];
550     e = dst[i * dst_stride + 4];
551 
552     dst[i * dst_stride] = a + b + c;
553     dst[i * dst_stride + 1] = a + b + c + d;
554     for (j = 2; j < width - 3; ++j) {
555       // Loop invariant: At the start of each iteration,
556       // a = src[i * src_stride + (j - 2)]
557       // b = src[i * src_stride + (j - 1)]
558       // c = src[i * src_stride + (j    )]
559       // d = src[i * src_stride + (j + 1)]
560       // e = src[i * src_stride + (j + 2)]
561       dst[i * dst_stride + j] = a + b + c + d + e;
562       a = b;
563       b = c;
564       c = d;
565       d = e;
566       e = dst[i * dst_stride + (j + 3)];
567     }
568     dst[i * dst_stride + j] = a + b + c + d + e;
569     dst[i * dst_stride + (j + 1)] = b + c + d + e;
570     dst[i * dst_stride + (j + 2)] = c + d + e;
571   }
572 }
573 
// Dispatches to the box-sum routine for window radius r: boxsum1() for r == 1
// and boxsum2() for r == 2. Any other radius trips an assertion and, when
// assertions are compiled out, leaves dst untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default: assert(0 && "Invalid value of r in self-guided filter"); break;
  }
}
583 
av1_decode_xq(const int * xqd,int * xq,const sgr_params_type * params)584 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
585   if (params->r[0] == 0) {
586     xq[0] = 0;
587     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
588   } else if (params->r[1] == 0) {
589     xq[0] = xqd[0];
590     xq[1] = 0;
591   } else {
592     xq[0] = xqd[0];
593     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
594   }
595 }
596 
// Lookup table indexed by z in [0, 255], used for the blend factor A[k] in
// calculate_intermediate_result(); values lie in [1, 256].
// Special case: 0 maps to 1 (corresponding to a value of 1/256) instead of 0.
// See the comments where A[k] is assigned in calculate_intermediate_result()
// for why.
const int32_t av1_x_by_xplus1[256] = {
  1,   128, 171, 192, 205, 213, 219, 224,
  228, 230, 233, 235, 236, 238, 239, 240,
  241, 242, 243, 243, 244, 244, 245, 245,
  246, 246, 247, 247, 247, 247, 248, 248,
  248, 248, 249, 249, 249, 249, 249, 250,
  250, 250, 250, 250, 250, 250, 251, 251,
  251, 251, 251, 251, 251, 251, 251, 251,
  252, 252, 252, 252, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252,
  252, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 256,
};
619 
620 const int32_t av1_one_by_x[MAX_NELEM] = {
621   4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
622   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
623 };
624 
calculate_intermediate_result(int32_t * dgd,int width,int height,int dgd_stride,int bit_depth,int sgr_params_idx,int radius_idx,int pass,int32_t * A,int32_t * B)625 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
626                                           int dgd_stride, int bit_depth,
627                                           int sgr_params_idx, int radius_idx,
628                                           int pass, int32_t *A, int32_t *B) {
629   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
630   const int r = params->r[radius_idx];
631   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
632   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
633   // Adjusting the stride of A and B here appears to avoid bad cache effects,
634   // leading to a significant speed improvement.
635   // We also align the stride to a multiple of 16 bytes, for consistency
636   // with the SIMD version of this function.
637   int buf_stride = ((width_ext + 3) & ~3) + 16;
638   const int step = pass == 0 ? 1 : 2;
639   int i, j;
640 
641   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
642   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
643          "Need SGRPROJ_BORDER_* >= r+1");
644 
645   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
646          width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
647   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
648          width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
649   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
650   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
651   // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
652   // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
653   for (i = -1; i < height + 1; i += step) {
654     for (j = -1; j < width + 1; ++j) {
655       const int k = i * buf_stride + j;
656       const int n = (2 * r + 1) * (2 * r + 1);
657 
658       // a < 2^16 * n < 2^22 regardless of bit depth
659       uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
660       // b < 2^8 * n < 2^14 regardless of bit depth
661       uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
662 
663       // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
664       // and p itself satisfies p < 2^14 * n^2 < 2^26.
665       // This bound on p is due to:
666       // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
667       //
668       // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
669       // This is an artefact of rounding, and can only happen if all pixels
670       // are (almost) identical, so in this case we saturate to p=0.
671       uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
672 
673       const uint32_t s = params->s[radius_idx];
674 
675       // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
676       // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
677       // (this holds even after accounting for the rounding in s)
678       const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
679 
680       // Note: We have to be quite careful about the value of A[k].
681       // This is used as a blend factor between individual pixel values and the
682       // local mean. So it logically has a range of [0, 256], including both
683       // endpoints.
684       //
685       // This is a pain for hardware, as we'd like something which can be stored
686       // in exactly 8 bits.
687       // Further, in the calculation of B[k] below, if z == 0 and r == 2,
688       // then A[k] "should be" 0. But then we can end up setting B[k] to a value
689       // slightly above 2^(8 + bit depth), due to rounding in the value of
690       // av1_one_by_x[25-1].
691       //
692       // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
693       // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
694       // overflow), without significantly affecting the final result: z == 0
695       // implies that the image is essentially "flat", so the local mean and
696       // individual pixel values are very similar.
697       //
      // Note that saturating on the other side, ie. requiring A[k] <= 255,
699       // would be a bad idea, as that corresponds to the case where the image
700       // is very variable, when we want to preserve the local pixel value as
701       // much as possible.
702       A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
703 
704       // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
705       // av1_one_by_x[n - 1] = round(2^12 / n)
706       // => the product here is < 2^(20 + bit_depth) <= 2^32,
707       // and B[k] is set to a value < 2^(8 + bit depth)
708       // This holds even with the rounding in av1_one_by_x and in the overall
709       // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
710       B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
711                                              (uint32_t)B[k] *
712                                              (uint32_t)av1_one_by_x[n - 1],
713                                          SGRPROJ_RECIP_BITS);
714     }
715   }
716 }
717 
selfguided_restoration_fast_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)718 static void selfguided_restoration_fast_internal(
719     int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
720     int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
721   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
722   const int r = params->r[radius_idx];
723   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
724   // Adjusting the stride of A and B here appears to avoid bad cache effects,
725   // leading to a significant speed improvement.
726   // We also align the stride to a multiple of 16 bytes, for consistency
727   // with the SIMD version of this function.
728   int buf_stride = ((width_ext + 3) & ~3) + 16;
729   int32_t A_[RESTORATION_PROC_UNIT_PELS];
730   int32_t B_[RESTORATION_PROC_UNIT_PELS];
731   int32_t *A = A_;
732   int32_t *B = B_;
733   int i, j;
734   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
735                                 sgr_params_idx, radius_idx, 1, A, B);
736   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
737   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
738 
739   // Use the A[] and B[] arrays to calculate the filtered image
740   (void)r;
741   assert(r == 2);
742   for (i = 0; i < height; ++i) {
743     if (!(i & 1)) {  // even row
744       for (j = 0; j < width; ++j) {
745         const int k = i * buf_stride + j;
746         const int l = i * dgd_stride + j;
747         const int m = i * dst_stride + j;
748         const int nb = 5;
749         const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
750                           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
751                            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
752                               5;
753         const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
754                           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
755                            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
756                               5;
757         const int32_t v = a * dgd[l] + b;
758         dst[m] =
759             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
760       }
761     } else {  // odd row
762       for (j = 0; j < width; ++j) {
763         const int k = i * buf_stride + j;
764         const int l = i * dgd_stride + j;
765         const int m = i * dst_stride + j;
766         const int nb = 4;
767         const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
768         const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
769         const int32_t v = a * dgd[l] + b;
770         dst[m] =
771             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
772       }
773     }
774   }
775 }
776 
selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)777 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
778                                             int dgd_stride, int32_t *dst,
779                                             int dst_stride, int bit_depth,
780                                             int sgr_params_idx,
781                                             int radius_idx) {
782   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
783   // Adjusting the stride of A and B here appears to avoid bad cache effects,
784   // leading to a significant speed improvement.
785   // We also align the stride to a multiple of 16 bytes, for consistency
786   // with the SIMD version of this function.
787   int buf_stride = ((width_ext + 3) & ~3) + 16;
788   int32_t A_[RESTORATION_PROC_UNIT_PELS];
789   int32_t B_[RESTORATION_PROC_UNIT_PELS];
790   int32_t *A = A_;
791   int32_t *B = B_;
792   int i, j;
793   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
794                                 sgr_params_idx, radius_idx, 0, A, B);
795   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
796   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797 
798   // Use the A[] and B[] arrays to calculate the filtered image
799   for (i = 0; i < height; ++i) {
800     for (j = 0; j < width; ++j) {
801       const int k = i * buf_stride + j;
802       const int l = i * dgd_stride + j;
803       const int m = i * dst_stride + j;
804       const int nb = 5;
805       const int32_t a =
806           (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
807               4 +
808           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
809            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
810               3;
811       const int32_t b =
812           (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
813               4 +
814           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
815            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
816               3;
817       const int32_t v = a * dgd[l] + b;
818       dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
819     }
820   }
821 }
822 
av1_selfguided_restoration_c(const uint8_t * dgd8,int width,int height,int dgd_stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)823 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
824                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
825                                  int flt_stride, int sgr_params_idx,
826                                  int bit_depth, int highbd) {
827   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
828   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
829   int32_t *dgd32 =
830       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
831 
832   if (highbd) {
833     const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
834     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
835       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
836         dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
837       }
838     }
839   } else {
840     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
841       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
842         dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
843       }
844     }
845   }
846 
847   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
848   // If params->r == 0 we skip the corresponding filter. We only allow one of
849   // the radii to be 0, as having both equal to 0 would be equivalent to
850   // skipping SGR entirely.
851   assert(!(params->r[0] == 0 && params->r[1] == 0));
852 
853   if (params->r[0] > 0)
854     selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
855                                          flt0, flt_stride, bit_depth,
856                                          sgr_params_idx, 0);
857   if (params->r[1] > 0)
858     selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
859                                     flt_stride, bit_depth, sgr_params_idx, 1);
860   return 0;
861 }
862 
av1_apply_selfguided_restoration_c(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)863 int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
864                                        int height, int stride, int eps,
865                                        const int *xqd, uint8_t *dst8,
866                                        int dst_stride, int32_t *tmpbuf,
867                                        int bit_depth, int highbd) {
868   int32_t *flt0 = tmpbuf;
869   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
870   assert(width * height <= RESTORATION_UNITPELS_MAX);
871 
872   const int ret = av1_selfguided_restoration_c(
873       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
874   if (ret != 0) return ret;
875   const sgr_params_type *const params = &av1_sgr_params[eps];
876   int xq[2];
877   av1_decode_xq(xqd, xq, params);
878   for (int i = 0; i < height; ++i) {
879     for (int j = 0; j < width; ++j) {
880       const int k = i * width + j;
881       uint8_t *dst8ij = dst8 + i * dst_stride + j;
882       const uint8_t *dat8ij = dat8 + i * stride + j;
883 
884       const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
885       const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
886       int32_t v = u << SGRPROJ_PRJ_BITS;
887       // If params->r == 0 then we skipped the filtering in
888       // av1_selfguided_restoration_c, i.e. flt[k] == u
889       if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
890       if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
891       const int16_t w =
892           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
893 
894       const uint16_t out = clip_pixel_highbd(w, bit_depth);
895       if (highbd)
896         *CONVERT_TO_SHORTPTR(dst8ij) = out;
897       else
898         *dst8ij = (uint8_t)out;
899     }
900   }
901   return 0;
902 }
903 
sgrproj_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)904 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
905                                   int stripe_width, int stripe_height,
906                                   int procunit_width, const uint8_t *src,
907                                   int src_stride, uint8_t *dst, int dst_stride,
908                                   int32_t *tmpbuf, int bit_depth,
909                                   struct aom_internal_error_info *error_info) {
910   (void)bit_depth;
911   assert(bit_depth == 8);
912 
913   for (int j = 0; j < stripe_width; j += procunit_width) {
914     int w = AOMMIN(procunit_width, stripe_width - j);
915     if (av1_apply_selfguided_restoration(
916             src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
917             rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
918             0) != 0) {
919       aom_internal_error(
920           error_info, AOM_CODEC_MEM_ERROR,
921           "Error allocating buffer in av1_apply_selfguided_restoration");
922     }
923   }
924 }
925 
926 #if CONFIG_AV1_HIGHBITDEPTH
wiener_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)927 static void wiener_filter_stripe_highbd(
928     const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
929     int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
930     int dst_stride, int32_t *tmpbuf, int bit_depth,
931     struct aom_internal_error_info *error_info) {
932   (void)tmpbuf;
933   (void)error_info;
934   const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
935 
936   for (int j = 0; j < stripe_width; j += procunit_width) {
937     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
938     const uint8_t *src8_p = src8 + j;
939     uint8_t *dst8_p = dst8 + j;
940     av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
941                                        rui->wiener_info.hfilter, 16,
942                                        rui->wiener_info.vfilter, 16, w,
943                                        stripe_height, &conv_params, bit_depth);
944   }
945 }
946 
sgrproj_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,struct aom_internal_error_info * error_info)947 static void sgrproj_filter_stripe_highbd(
948     const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
949     int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
950     int dst_stride, int32_t *tmpbuf, int bit_depth,
951     struct aom_internal_error_info *error_info) {
952   for (int j = 0; j < stripe_width; j += procunit_width) {
953     int w = AOMMIN(procunit_width, stripe_width - j);
954     if (av1_apply_selfguided_restoration(
955             src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
956             rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
957             1) != 0) {
958       aom_internal_error(
959           error_info, AOM_CODEC_MEM_ERROR,
960           "Error allocating buffer in av1_apply_selfguided_restoration");
961     }
962   }
963 }
964 #endif  // CONFIG_AV1_HIGHBITDEPTH
965 
966 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
967                                   int stripe_width, int stripe_height,
968                                   int procunit_width, const uint8_t *src,
969                                   int src_stride, uint8_t *dst, int dst_stride,
970                                   int32_t *tmpbuf, int bit_depth,
971                                   struct aom_internal_error_info *error_info);
972 
973 #if CONFIG_AV1_HIGHBITDEPTH
974 #define NUM_STRIPE_FILTERS 4
975 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
976   wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
977   sgrproj_filter_stripe_highbd
978 };
979 #else
980 #define NUM_STRIPE_FILTERS 2
981 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
982   wiener_filter_stripe, sgrproj_filter_stripe
983 };
984 #endif  // CONFIG_AV1_HIGHBITDEPTH
985 
986 // Filter one restoration unit
av1_loop_restoration_filter_unit(const RestorationTileLimits * limits,const RestorationUnitInfo * rui,const RestorationStripeBoundaries * rsb,RestorationLineBuffers * rlbs,int plane_w,int plane_h,int ss_x,int ss_y,int highbd,int bit_depth,uint8_t * data8,int stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int optimized_lr,struct aom_internal_error_info * error_info)987 void av1_loop_restoration_filter_unit(
988     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
989     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
990     int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
991     uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
992     int optimized_lr, struct aom_internal_error_info *error_info) {
993   RestorationType unit_rtype = rui->restoration_type;
994 
995   int unit_h = limits->v_end - limits->v_start;
996   int unit_w = limits->h_end - limits->h_start;
997   uint8_t *data8_tl =
998       data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start;
999   uint8_t *dst8_tl =
1000       dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start;
1001 
1002   if (unit_rtype == RESTORE_NONE) {
1003     copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
1004                    highbd);
1005     return;
1006   }
1007 
1008   const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1009   assert(filter_idx < NUM_STRIPE_FILTERS);
1010   const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1011 
1012   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1013 
1014   // Filter the whole image one stripe at a time
1015   RestorationTileLimits remaining_stripes = *limits;
1016   int i = 0;
1017   while (i < unit_h) {
1018     int copy_above, copy_below;
1019     remaining_stripes.v_start = limits->v_start + i;
1020 
1021     get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
1022                              &copy_above, &copy_below);
1023 
1024     const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1025     const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1026 
1027     // Work out where this stripe's boundaries are within
1028     // rsb->stripe_boundary_{above,below}
1029     const int frame_stripe =
1030         (remaining_stripes.v_start + runit_offset) / full_stripe_height;
1031     const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1032 
1033     // Calculate this stripe's height, based on two rules:
1034     // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
1035     // * We can't extend past the end of the current restoration unit
1036     const int nominal_stripe_height =
1037         full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
1038     const int h = AOMMIN(nominal_stripe_height,
1039                          remaining_stripes.v_end - remaining_stripes.v_start);
1040 
1041     setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1042                                      h, data8, stride, rlbs, copy_above,
1043                                      copy_below, optimized_lr);
1044 
1045     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1046                   dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
1047                   error_info);
1048 
1049     restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1050                                        data8, stride, copy_above, copy_below,
1051                                        optimized_lr);
1052 
1053     i += h;
1054   }
1055 }
1056 
// Per-unit visitor used when filtering a frame: unpacks the plane context
// passed through priv (a FilterFrameCtxt) and forwards everything to
// av1_loop_restoration_filter_unit() for restoration unit rest_unit_idx.
static void filter_frame_on_unit(const RestorationTileLimits *limits,
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
                                 RestorationLineBuffers *rlbs,
                                 struct aom_internal_error_info *error_info) {
  const FilterFrameCtxt *const plane_ctxt = (const FilterFrameCtxt *)priv;
  const RestorationInfo *const rsi = plane_ctxt->rsi;
  const RestorationUnitInfo *const unit = &rsi->unit_info[rest_unit_idx];

  av1_loop_restoration_filter_unit(
      limits, unit, &rsi->boundaries, rlbs, plane_ctxt->plane_w,
      plane_ctxt->plane_h, plane_ctxt->ss_x, plane_ctxt->ss_y,
      plane_ctxt->highbd, plane_ctxt->bit_depth, plane_ctxt->data8,
      plane_ctxt->data_stride, plane_ctxt->dst8, plane_ctxt->dst_stride,
      tmpbuf, rsi->optimized_lr, error_info);
}
1070 
// Prepares lr_ctxt for loop-restoration filtering of frame.
//
// (Re)allocates cm->rst_frame as the destination buffer (reporting
// AOM_CODEC_MEM_ERROR through cm->error on failure), records the frame and
// the per-unit callback, and stores optimized_lr and the RestorationInfo
// pointer for every plane. For each plane whose frame restoration type is
// not RESTORE_NONE, the source plane's borders are extended and the plane's
// FilterFrameCtxt is filled in.
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                            YV12_BUFFER_CONFIG *frame,
                                            AV1_COMMON *cm, int optimized_lr,
                                            int num_planes) {
  const SequenceHeader *const seq_params = cm->seq_params;
  const int highbd = seq_params->use_highbitdepth;
  const int bit_depth = seq_params->bit_depth;
  const int frame_width = frame->crop_widths[0];
  const int frame_height = frame->crop_heights[0];

  lr_ctxt->dst = &cm->rst_frame;
  const int alloc_status = aom_realloc_frame_buffer(
      lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
      seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
      cm->features.byte_alignment, NULL, NULL, NULL, false, 0);
  if (alloc_status != AOM_CODEC_OK) {
    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate restoration dst buffer");
  }

  lr_ctxt->on_rest_unit = filter_frame_on_unit;
  lr_ctxt->frame = frame;

  for (int plane = 0; plane < num_planes; ++plane) {
    RestorationInfo *const rsi = &cm->rst_info[plane];
    FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
    const RestorationType rtype = rsi->frame_restoration_type;

    // These two fields are set even for planes that are not filtered.
    rsi->optimized_lr = optimized_lr;
    plane_ctxt->rsi = rsi;

    if (rtype == RESTORE_NONE) continue;

    const int is_uv = plane > 0;
    int plane_w, plane_h;
    av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
    assert(plane_w == frame->crop_widths[is_uv]);
    assert(plane_h == frame->crop_heights[is_uv]);

    av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
                     frame->strides[is_uv], RESTORATION_BORDER,
                     RESTORATION_BORDER, highbd);

    // Subsampling only applies to the chroma planes.
    plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
    plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
    plane_ctxt->plane_w = plane_w;
    plane_ctxt->plane_h = plane_h;
    plane_ctxt->highbd = highbd;
    plane_ctxt->bit_depth = bit_depth;
    plane_ctxt->data8 = frame->buffers[plane];
    plane_ctxt->data_stride = frame->strides[is_uv];
    plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
    plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
  }
}
1125 
// Copies every filtered plane from the restoration destination buffer
// (loop_rest_ctxt->dst) into loop_rest_ctxt->frame. Planes whose frame
// restoration type is RESTORE_NONE are left alone. The copied region spans
// columns [0, plane_w) and rows [0, plane_h) of the plane.
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                      AV1_COMMON *cm, int num_planes) {
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
                           int vstart, int vend);
  // One copy routine per plane index (Y, U, V).
  static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
                                         aom_yv12_partial_coloc_copy_u,
                                         aom_yv12_partial_coloc_copy_v };
  assert(num_planes <= 3);

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      const FilterFrameCtxt *const plane_ctxt = &loop_rest_ctxt->ctxt[plane];
      copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
                       plane_ctxt->plane_w, 0, plane_ctxt->plane_h);
    }
  }
}
1142 
1143 // Call on_rest_unit for each loop restoration unit in the plane.
foreach_rest_unit_in_plane(const struct AV1Common * cm,int plane,rest_unit_visitor_t on_rest_unit,void * priv,int32_t * tmpbuf,RestorationLineBuffers * rlbs)1144 static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
1145                                        rest_unit_visitor_t on_rest_unit,
1146                                        void *priv, int32_t *tmpbuf,
1147                                        RestorationLineBuffers *rlbs) {
1148   const RestorationInfo *rsi = &cm->rst_info[plane];
1149   const int hnum_rest_units = rsi->horz_units;
1150   const int vnum_rest_units = rsi->vert_units;
1151   const int unit_size = rsi->restoration_unit_size;
1152 
1153   const int is_uv = plane > 0;
1154   const int ss_y = is_uv && cm->seq_params->subsampling_y;
1155   const int ext_size = unit_size * 3 / 2;
1156   int plane_w, plane_h;
1157   av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
1158 
1159   int y0 = 0, i = 0;
1160   while (y0 < plane_h) {
1161     int remaining_h = plane_h - y0;
1162     int h = (remaining_h < ext_size) ? remaining_h : unit_size;
1163 
1164     RestorationTileLimits limits;
1165     limits.v_start = y0;
1166     limits.v_end = y0 + h;
1167     assert(limits.v_end <= plane_h);
1168     // Offset upwards to align with the restoration processing stripe
1169     const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
1170     limits.v_start = AOMMAX(0, limits.v_start - voffset);
1171     if (limits.v_end < plane_h) limits.v_end -= voffset;
1172 
1173     av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
1174                                  hnum_rest_units, vnum_rest_units, plane, priv,
1175                                  tmpbuf, rlbs, av1_lr_sync_read_dummy,
1176                                  av1_lr_sync_write_dummy, NULL, cm->error);
1177 
1178     y0 += h;
1179     ++i;
1180   }
1181 }
1182 
// Runs the per-unit callback stored in lr_ctxt over every restoration unit
// of every plane whose frame restoration type is not RESTORE_NONE, using
// cm->rst_tmpbuf and cm->rlbs as scratch storage.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                 &plane_ctxts[plane], cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1196 
// Applies loop restoration to a whole frame in three steps: initialise the
// context passed in lr_ctxt (an AV1LrStruct), filter every restoration unit
// of every plane, then copy the filtered planes back into frame.
// Must not be called when all segments are lossless.
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                       AV1_COMMON *cm, int optimized_lr,
                                       void *lr_ctxt) {
  assert(!cm->features.all_lossless);

  AV1LrStruct *const loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
  const int num_planes = av1_num_planes(cm);

  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
                                         optimized_lr, num_planes);
  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
}
1212 
// Visits every restoration unit in one row of units, left to right.
//
// The row is split horizontally into units that are unit_size wide, except
// that the final unit takes all remaining columns once fewer than
// 1.5 * unit_size are left. For each unit, limits->h_start / h_end are
// updated, the sync-read hooks are invoked, on_rest_unit is called with the
// unit's index (row_number * hnum_rest_units + column), and finally the
// sync-write hook is invoked. With CONFIG_MULTITHREAD and more than one
// worker, the walk stops early once lr_sync->lr_mt_exit is set.
void av1_foreach_rest_unit_in_row(
    RestorationTileLimits *limits, int plane_w,
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
    int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
    int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
    struct aom_internal_error_info *error_info) {
  const int ext_size = unit_size * 3 / 2;
  const int has_row_below = (row_number + 1) < vnum_rest_units;

  int unit_w = 0;
  for (int x0 = 0, col = 0; x0 < plane_w; x0 += unit_w, ++col) {
    const int remaining_w = plane_w - x0;
    unit_w = (remaining_w < ext_size) ? remaining_w : unit_size;

    limits->h_start = x0;
    limits->h_end = x0 + unit_w;
    assert(limits->h_end <= plane_w);

    const int unit_idx = row_number * hnum_rest_units + col;

    // No sync is needed for even numbered rows. For odd numbered rows, loop
    // restoration of the current block requires the LR of the top-right and
    // bottom-right blocks to be completed.

    // top-right sync
    on_sync_read(lr_sync, row_number, col, plane);
    if (has_row_below) {
      // bottom-right sync
      on_sync_read(lr_sync, row_number + 2, col, plane);
    }

#if CONFIG_MULTITHREAD
    if (lr_sync && lr_sync->num_workers > 1) {
      pthread_mutex_lock(lr_sync->job_mutex);
      const bool lr_mt_exit = lr_sync->lr_mt_exit;
      pthread_mutex_unlock(lr_sync->job_mutex);
      // Exit in case any worker has encountered an error.
      if (lr_mt_exit) return;
    }
#endif

    on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);

    on_sync_write(lr_sync, row_number, col, hnum_rest_units, plane);
  }
}
1260 
// No-op sync-read hook: ignores all of its arguments and does nothing.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  (void)lr_sync, (void)r, (void)c, (void)plane;
}
1267 
// No-op sync-write hook: ignores all of its arguments and does nothing.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  (void)lr_sync, (void)r, (void)c, (void)sb_cols, (void)plane;
}
1276 
// Computes which restoration units of 'plane' are associated with the
// superblock whose top-left corner is at (mi_row, mi_col), in mi units.
//
// The result is written as half-open index ranges: columns
// [*rcol0, *rcol1) and rows [*rrow0, *rrow1). All four output pointers must
// be non-NULL.
//
// Returns 0 without writing any output when 'bsize' is not the sequence's
// superblock size. Otherwise returns nonzero iff both ranges are non-empty.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params->sb_size) return 0;

  assert(!cm->features.all_lossless);

  const RestorationInfo *const rst = &cm->rst_info[plane];
  const int unit_size = rst->restoration_unit_size;
  const int max_cols = rst->horz_units;
  const int max_rows = rst->vert_units;

  // Luma is never subsampled; chroma follows the sequence parameters.
  const int chroma = plane > 0;
  const int ss_x = chroma && cm->seq_params->subsampling_x;
  const int ss_y = chroma && cm->seq_params->subsampling_y;

  // A restoration unit index is (mi * num) / den, rounded up, where num is
  // the size of one mi unit on this plane and den is the restoration unit
  // size. With superres active the horizontal axis is measured in upscaled
  // pixels: writing m for the mi column, D for the superres denominator, N
  // for the superres numerator and u for the upscaled pixel offset,
  //
  //   MI_SIZE * m = (N / D) * u   =>   u = D * MI_SIZE * m / N
  //
  // so D is folded into the numerator and N into the denominator.
  int num_x = MI_SIZE >> ss_x;
  int den_x = unit_size;
  if (av1_superres_scaled(cm)) {
    num_x *= cm->superres_scale_denominator;
    den_x *= SCALE_NUMERATOR;
  }
  const int num_y = MI_SIZE >> ss_y;
  const int den_y = unit_size;

  // Adding (den - 1) before dividing turns the division into a ceiling.
  const int round_x = den_x - 1;
  const int round_y = den_y - 1;

  // mi-unit position just past the right/bottom edge of this superblock.
  const int mi_col_end = mi_col + mi_size_wide[bsize];
  const int mi_row_end = mi_row + mi_size_high[bsize];

  // First unit column/row that does not start before the superblock's own
  // left/top edge (a superblock starting at unit column 10.1 maps to 11).
  *rcol0 = (mi_col * num_x + round_x) / den_x;
  *rrow0 = (mi_row * num_y + round_y) / den_y;

  // Same computation for the neighbouring superblock to the right/below,
  // clamped to the number of units that actually exist in the plane.
  const int col_limit = (mi_col_end * num_x + round_x) / den_x;
  const int row_limit = (mi_row_end * num_y + round_y) / den_y;
  *rcol1 = AOMMIN(col_limit, max_cols);
  *rrow1 = AOMMIN(row_limit, max_rows);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1338 
// Pads 'height' rows horizontally by edge replication: the 'extend' samples
// immediately left of each row are filled with the row's first sample, and
// the 'extend' samples immediately right of it with the row's last sample.
//
// 'width' and 'extend' are counted in samples; 'stride' is applied to the
// byte pointer and is therefore in bytes. When 'use_highbitdepth' is nonzero
// each row is accessed as 16-bit samples, otherwise as 8-bit samples.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *row = buf;
  if (use_highbitdepth) {
    for (int y = 0; y < height; ++y, row += stride) {
      uint16_t *const px = (uint16_t *)row;
      aom_memset16(px - extend, px[0], extend);
      aom_memset16(px + width, px[width - 1], extend);
    }
  } else {
    for (int y = 0; y < height; ++y, row += stride) {
      memset(row - extend, row[0], extend);
      memset(row + width, row[width - 1], extend);
    }
  }
}
1354 
// Copies up to RESTORATION_CTX_VERT rows of 'frame', starting at 'row' of
// 'plane', into the boundary storage slot belonging to 'stripe'.
//
// 'is_above' selects boundaries->stripe_boundary_above (nonzero) or
// boundaries->stripe_boundary_below (zero) as the destination. When superres
// is active the rows are upscaled on the way in; otherwise they are copied
// verbatim. Both stored rows are then extended horizontally by
// RESTORATION_EXTRA_HORZ samples on each side.
static void save_deblock_boundary_lines(
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
    int stripe, int use_highbd, int is_above,
    RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source rows. Byte strides are the sample strides shifted by use_highbd.
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *const src_base = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *const src_rows = src_base + row * (ptrdiff_t)src_stride;

  // Destination rows: skip the left padding, then index the stripe's slot.
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *const bdry_base = is_above ? boundaries->stripe_boundary_above
                                      : boundaries->stripe_boundary_below;
  uint8_t *const bdry_rows = bdry_base +
                             (RESTORATION_EXTRA_HORZ << use_highbd) +
                             RESTORATION_CTX_VERT * stripe * bdry_stride;

  // In rare cases a processing stripe ends 1px above the crop border. The
  // deblocked pixels below the stripe are still wanted, but only one "below"
  // row exists, so it is fetched once and duplicated further down. This is
  // equivalent to clamping the sample locations against the crop border.
  const int rows_available = frame->crop_heights[is_uv] - row;
  const int lines_to_save = AOMMIN(RESTORATION_CTX_VERT, rows_available);
  assert(lines_to_save == 1 || lines_to_save == 2);

  const int superres = av1_superres_scaled(cm);
  int upscaled_width;
  if (superres) {
    const int ss_x = is_uv && cm->seq_params->subsampling_x;
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  } else {
    upscaled_width = frame->crop_widths[is_uv];
  }
  const int line_bytes = upscaled_width << use_highbd;

  if (superres) {
    // The upscaler is given the unshifted strides; for high bit depth the
    // pointers are converted with CONVERT_TO_BYTEPTR before the call.
    const int in_stride = frame->strides[is_uv];
    const int out_stride = boundaries->stripe_boundary_stride;
    if (use_highbd) {
      av1_upscale_normative_rows(cm, CONVERT_TO_BYTEPTR(src_rows), in_stride,
                                 CONVERT_TO_BYTEPTR(bdry_rows), out_stride,
                                 plane, lines_to_save);
    } else {
      av1_upscale_normative_rows(cm, src_rows, in_stride, bdry_rows,
                                 out_stride, plane, lines_to_save);
    }
  } else {
    for (int line = 0; line < lines_to_save; ++line) {
      uint8_t *const dst = bdry_rows + line * bdry_stride;
      const uint8_t *const src = src_rows + line * src_stride;
      memcpy(dst, src, line_bytes);
    }
  }

  // Only one row was available: replicate it into the second slot.
  if (lines_to_save == 1) {
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1409 
// Fills the boundary storage slot belonging to 'stripe' with
// RESTORATION_CTX_VERT copies of the single row 'row' of 'plane' in 'frame'.
//
// 'is_above' selects boundaries->stripe_boundary_above (nonzero) or
// boundaries->stripe_boundary_below (zero) as the destination. The stored
// rows are then extended horizontally by RESTORATION_EXTRA_HORZ samples on
// each side.
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                     const AV1_COMMON *cm, int plane, int row,
                                     int stripe, int use_highbd, int is_above,
                                     RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;

  // Source row. Byte strides are the sample strides shifted by use_highbd.
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *const src_base = REAL_PTR(use_highbd, frame->buffers[plane]);
  const uint8_t *const src_row = src_base + row * (ptrdiff_t)src_stride;

  // Destination rows: skip the left padding, then index the stripe's slot.
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *const bdry_base = is_above ? boundaries->stripe_boundary_above
                                      : boundaries->stripe_boundary_below;
  uint8_t *const bdry_rows = bdry_base +
                             (RESTORATION_EXTRA_HORZ << use_highbd) +
                             RESTORATION_CTX_VERT * stripe * bdry_stride;

  // By the time this function is called superres has already been applied,
  // so no upscaling is done here: the row is taken directly from the
  // upscaled frame, using the upscaled width when superres is active.
  const int src_width = frame->crop_widths[is_uv];
  const int ss_x = is_uv && cm->seq_params->subsampling_x;
  int upscaled_width = src_width;
  if (av1_superres_scaled(cm)) {
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  }
  const int line_bytes = upscaled_width << use_highbd;

  // The same source row is written into every context line.
  for (int line = 0; line < RESTORATION_CTX_VERT; ++line) {
    uint8_t *const dst = bdry_rows + line * bdry_stride;
    memcpy(dst, src_row, line_bytes);
  }

  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1441 
// Walks the processing stripes of one plane and saves the context rows
// above and below each stripe into cm->rst_info[plane].boundaries.
//
// The function is meant to be run in two passes, selected by 'after_cdef':
//  - after_cdef == 0: saves deblocked rows at internal stripe boundaries.
//  - after_cdef != 0: saves CDEF rows at the top and bottom frame edges.
static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
                                int plane, AV1_COMMON *cm, int after_cdef) {
  const int is_uv = plane > 0;
  const int ss_y = is_uv && cm->seq_params->subsampling_y;
  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;

  // Only the height is used below; the width is a required output argument.
  int plane_w, plane_h;
  av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);

  RestorationStripeBoundaries *const boundaries =
      &cm->rst_info[plane].boundaries;

  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  // 'stripe_top' tracks stripe_idx * stripe_height - stripe_off, i.e. the
  // unclamped first row of the current stripe.
  int stripe_top = -stripe_off;
  for (int stripe_idx = 0;; ++stripe_idx, stripe_top += stripe_height) {
    const int y0 = AOMMAX(0, stripe_top);
    if (y0 >= plane_h) break;

    const int y1 = AOMMIN(stripe_top + stripe_height, plane_h);

    // Internal stripe boundaries take deblocked pixels; the top and bottom
    // of the frame take CDEF pixels.
    const int use_deblock_above = (stripe_idx > 0);
    const int use_deblock_below = (y1 < plane_height);

    if (after_cdef) {
      // Save CDEF context at frame boundaries.
      if (!use_deblock_above) {
        save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
                                 1, boundaries);
      }
      if (!use_deblock_below) {
        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
                                 use_highbd, 0, boundaries);
      }
    } else {
      // Save deblocked context at internal stripe boundaries.
      if (use_deblock_above) {
        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
                                    stripe_idx, use_highbd, 1, boundaries);
      }
      if (use_deblock_below) {
        save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
                                    use_highbd, 0, boundaries);
      }
    }
  }
}
1493 
1494 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1495 // lines to be used as boundary in the loop restoration process. The
1496 // lines are saved in rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm,int after_cdef)1497 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1498                                               AV1_COMMON *cm, int after_cdef) {
1499   const int num_planes = av1_num_planes(cm);
1500   const int use_highbd = cm->seq_params->use_highbitdepth;
1501   for (int p = 0; p < num_planes; ++p) {
1502     save_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1503   }
1504 }
1505