1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/loop_restoration.h"
16
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22
23 #include "src/dsp/common.h"
24 #include "src/dsp/dsp.h"
25 #include "src/utils/common.h"
26 #include "src/utils/constants.h"
27
28 namespace libgav1 {
29 namespace dsp {
30
// Lookup table for the self-guided filter weight of Section 7.17.3, indexed
// by z (callers clamp z to [0, 255] before indexing).
// a2 has range [1, 256] and is defined as:
//   a2 = 256                                             if z >= 255
//   a2 = 1                                               if z == 0
//   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1)   otherwise
// Each entry stores ma = 256 - a2, which therefore lies in [0, 255].
// The array is declared with 16-byte alignment.
alignas(16) const uint8_t kSgrMaLookup[256] = {
    // z = 0..15
    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
    // z = 16..31
    15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
    // z = 32..47
    8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
    // z = 48..63
    5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
    // z = 64..79
    4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3,
    // z = 80..95
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // z = 96..111
    3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // z = 112..127
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // z = 128..143
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // z = 144..159
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // z = 160..175
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
    // z = 176..191
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // z = 192..207
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // z = 208..223
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // z = 224..239
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // z = 240..255
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
56
57 namespace {
58
59 template <int bitdepth, typename Pixel>
WienerHorizontal(const Pixel * source,const ptrdiff_t source_stride,const int width,const int height,const int16_t * const filter,const int number_zero_coefficients,int16_t ** wiener_buffer)60 inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
61 const int width, const int height,
62 const int16_t* const filter,
63 const int number_zero_coefficients,
64 int16_t** wiener_buffer) {
65 constexpr int kCenterTap = kWienerFilterTaps / 2;
66 constexpr int kRoundBitsHorizontal = (bitdepth == 12)
67 ? kInterRoundBitsHorizontal12bpp
68 : kInterRoundBitsHorizontal;
69 constexpr int offset =
70 1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
71 constexpr int limit = (offset << 2) - 1;
72 for (int y = 0; y < height; ++y) {
73 int x = 0;
74 do {
75 // sum fits into 16 bits only when bitdepth = 8.
76 int sum = 0;
77 for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
78 sum +=
79 filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
80 }
81 sum += filter[kCenterTap] * source[x + kCenterTap];
82 const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
83 (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
84 } while (++x != width);
85 source += source_stride;
86 *wiener_buffer += width;
87 }
88 }
89
90 template <int bitdepth, typename Pixel>
WienerVertical(const int16_t * wiener_buffer,const int width,const int height,const int16_t * const filter,const int number_zero_coefficients,void * const dest,const ptrdiff_t dest_stride)91 inline void WienerVertical(const int16_t* wiener_buffer, const int width,
92 const int height, const int16_t* const filter,
93 const int number_zero_coefficients, void* const dest,
94 const ptrdiff_t dest_stride) {
95 constexpr int kCenterTap = kWienerFilterTaps / 2;
96 constexpr int kRoundBitsVertical =
97 (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
98 auto* dst = static_cast<Pixel*>(dest);
99 int y = height;
100 do {
101 int x = 0;
102 do {
103 // sum needs 32 bits.
104 int sum = 0;
105 for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
106 sum += filter[k] *
107 (wiener_buffer[k * width + x] +
108 wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
109 }
110 sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
111 const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
112 dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
113 } while (++x != width);
114 wiener_buffer += width;
115 dst += dest_stride;
116 } while (--y != 0);
117 }
118
119 // Note: bit range for wiener filter.
120 // Wiener filter process first applies horizontal filtering to input pixels,
121 // followed by rounding with predefined bits (dependent on bitdepth).
122 // Then vertical filtering is applied, followed by rounding (dependent on
123 // bitdepth).
124 // The process is the same as convolution:
125 // <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
126 // --> <rounding 1>
127 // By design:
128 // (a). horizontal/vertical filtering adds 7 bits to input.
129 // (b). The output of first rounding fits into 16 bits.
130 // (c). The output of second rounding fits into 16 bits.
131 // If input bitdepth > 8, the accumulator of the horizontal filter is larger
132 // than 16 bit and smaller than 32 bits.
133 // The accumulator of the vertical filter is larger than 16 bits and smaller
134 // than 32 bits.
135 // Note: range of wiener filter coefficients.
136 // Wiener filter coefficients are symmetric, and their sum is 1 (128).
137 // The range of each coefficient:
138 // filter[0] = filter[6], 4 bits, min = -5, max = 10.
139 // filter[1] = filter[5], 5 bits, min = -23, max = 8.
140 // filter[2] = filter[4], 6 bits, min = -17, max = 46.
141 // filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
142 // The difference from libaom is that in libaom:
143 // filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
144 // Thus in libaom's computation, an offset of 128 is needed for filter[3].
145 template <int bitdepth, typename Pixel>
WienerFilter_C(const RestorationUnitInfo & LIBGAV1_RESTRICT restoration_info,const void * LIBGAV1_RESTRICT const source,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_border,const ptrdiff_t top_border_stride,const void * LIBGAV1_RESTRICT const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * LIBGAV1_RESTRICT const restoration_buffer,void * LIBGAV1_RESTRICT const dest)146 void WienerFilter_C(
147 const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
148 const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
149 const void* LIBGAV1_RESTRICT const top_border,
150 const ptrdiff_t top_border_stride,
151 const void* LIBGAV1_RESTRICT const bottom_border,
152 const ptrdiff_t bottom_border_stride, const int width, const int height,
153 RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
154 void* LIBGAV1_RESTRICT const dest) {
155 constexpr int kCenterTap = kWienerFilterTaps / 2;
156 const int16_t* const number_leading_zero_coefficients =
157 restoration_info.wiener_info.number_leading_zero_coefficients;
158 const int number_rows_to_skip = std::max(
159 static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
160 1);
161 int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
162
163 // horizontal filtering.
164 const int height_horizontal =
165 height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
166 const int height_extra = (height_horizontal - height) >> 1;
167 assert(height_extra <= 2);
168 const int16_t* const filter_horizontal =
169 restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
170 const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
171 const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
172 const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
173 auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
174
175 if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
176 WienerHorizontal<bitdepth, Pixel>(
177 top + (2 - height_extra) * top_border_stride, top_border_stride, width,
178 height_extra, filter_horizontal, 0, &wiener_buffer);
179 WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
180 filter_horizontal, 0, &wiener_buffer);
181 WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
182 height_extra, filter_horizontal, 0,
183 &wiener_buffer);
184 } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
185 WienerHorizontal<bitdepth, Pixel>(
186 top + (2 - height_extra) * top_border_stride, top_border_stride, width,
187 height_extra, filter_horizontal, 1, &wiener_buffer);
188 WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
189 filter_horizontal, 1, &wiener_buffer);
190 WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
191 height_extra, filter_horizontal, 1,
192 &wiener_buffer);
193 } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
194 WienerHorizontal<bitdepth, Pixel>(
195 top + (2 - height_extra) * top_border_stride, top_border_stride, width,
196 height_extra, filter_horizontal, 2, &wiener_buffer);
197 WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
198 filter_horizontal, 2, &wiener_buffer);
199 WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
200 height_extra, filter_horizontal, 2,
201 &wiener_buffer);
202 } else {
203 assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
204 WienerHorizontal<bitdepth, Pixel>(
205 top + (2 - height_extra) * top_border_stride, top_border_stride, width,
206 height_extra, filter_horizontal, 3, &wiener_buffer);
207 WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
208 filter_horizontal, 3, &wiener_buffer);
209 WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
210 height_extra, filter_horizontal, 3,
211 &wiener_buffer);
212 }
213
214 // vertical filtering.
215 const int16_t* const filter_vertical =
216 restoration_info.wiener_info.filter[WienerInfo::kVertical];
217 if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
218 // Because the top row of |source| is a duplicate of the second row, and the
219 // bottom row of |source| is a duplicate of its above row, we can duplicate
220 // the top and bottom row of |wiener_buffer| accordingly.
221 memcpy(wiener_buffer, wiener_buffer - width,
222 sizeof(*wiener_buffer) * width);
223 memcpy(wiener_buffer_org, wiener_buffer_org + width,
224 sizeof(*wiener_buffer) * width);
225 WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
226 filter_vertical, 0, dest, stride);
227 } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
228 WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
229 filter_vertical, 1, dest, stride);
230 } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
231 WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
232 filter_vertical, 2, dest, stride);
233 } else {
234 assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
235 WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
236 filter_vertical, 3, dest, stride);
237 }
238 }
239
240 //------------------------------------------------------------------------------
241 // SGR
242
243 // When |height| is 1, |src_stride| could be set to an arbitrary value.
244 template <typename Pixel, int size>
BoxSum(const Pixel * src,const ptrdiff_t src_stride,const int height,const int width,uint16_t * const * sums,uint32_t * const * square_sums)245 LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
246 const int height, const int width,
247 uint16_t* const* sums,
248 uint32_t* const* square_sums) {
249 int y = height;
250 do {
251 uint32_t sum = 0;
252 uint32_t square_sum = 0;
253 for (int dx = 0; dx < size; ++dx) {
254 const Pixel source = src[dx];
255 sum += source;
256 square_sum += source * source;
257 }
258 (*sums)[0] = sum;
259 (*square_sums)[0] = square_sum;
260 int x = 1;
261 do {
262 const Pixel source0 = src[x - 1];
263 const Pixel source1 = src[x - 1 + size];
264 sum -= source0;
265 sum += source1;
266 square_sum -= source0 * source0;
267 square_sum += source1 * source1;
268 (*sums)[x] = sum;
269 (*square_sums)[x] = square_sum;
270 } while (++x != width);
271 src += src_stride;
272 ++sums;
273 ++square_sums;
274 } while (--y != 0);
275 }
276
277 // When |height| is 1, |src_stride| could be set to an arbitrary value.
278 template <typename Pixel>
BoxSum(const Pixel * src,const ptrdiff_t src_stride,const int height,const int width,uint16_t * const * sum3,uint16_t * const * sum5,uint32_t * const * square_sum3,uint32_t * const * square_sum5)279 LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
280 const int height, const int width,
281 uint16_t* const* sum3, uint16_t* const* sum5,
282 uint32_t* const* square_sum3,
283 uint32_t* const* square_sum5) {
284 int y = height;
285 do {
286 uint32_t sum = 0;
287 uint32_t square_sum = 0;
288 for (int dx = 0; dx < 4; ++dx) {
289 const Pixel source = src[dx];
290 sum += source;
291 square_sum += source * source;
292 }
293 int x = 0;
294 do {
295 const Pixel source0 = src[x];
296 const Pixel source1 = src[x + 4];
297 sum -= source0;
298 square_sum -= source0 * source0;
299 (*sum3)[x] = sum;
300 (*square_sum3)[x] = square_sum;
301 sum += source1;
302 square_sum += source1 * source1;
303 (*sum5)[x] = sum + source0;
304 (*square_sum5)[x] = square_sum + source0 * source0;
305 } while (++x != width);
306 src += src_stride;
307 ++sum3;
308 ++sum5;
309 ++square_sum3;
310 ++square_sum5;
311 } while (--y != 0);
312 }
313
314 template <int bitdepth, int n>
CalculateIntermediate(const uint32_t s,uint32_t a,const uint32_t b,uint8_t * const ma_ptr,uint32_t * const b_ptr)315 inline void CalculateIntermediate(const uint32_t s, uint32_t a,
316 const uint32_t b, uint8_t* const ma_ptr,
317 uint32_t* const b_ptr) {
318 // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
319 // since max bitdepth = 12, max < 2^31.
320 // after shift, a < 2^16 * n < 2^22 regardless of bitdepth
321 a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
322 // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
323 // d < 2^8 * n < 2^14 regardless of bitdepth
324 const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
325 // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
326 // and p itself satisfies p < 2^14 * n^2 < 2^26.
327 // This bound on p is due to:
328 // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
329 // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
330 // This is an artifact of rounding, and can only happen if all pixels
331 // are (almost) identical, so in this case we saturate to p=0.
332 const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
333 // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
334 // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
335 // (this holds even after accounting for the rounding in s)
336 const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
337 // ma: range [0, 255].
338 const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
339 const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
340 // ma < 2^8, b < 2^(bitdepth) * n,
341 // one_over_n = round(2^12 / n)
342 // => the product here is < 2^(20 + bitdepth) <= 2^32,
343 // and b is set to a value < 2^(8 + bitdepth).
344 // This holds even with the rounding in one_over_n and in the overall result,
345 // as long as ma is strictly less than 2^8.
346 const uint32_t b2 = ma * b * one_over_n;
347 *ma_ptr = ma;
348 *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
349 }
350
// Returns the weighted sum of three consecutive elements of |src| with
// weights 3, 4, 3.
template <typename T>
inline uint32_t Sum343(const T* const src) {
  const auto outer = src[0] + src[2];
  const auto center = src[1];
  return 3 * outer + 4 * center;
}
355
// Returns four times the sum of three consecutive elements of |src|.
template <typename T>
inline uint32_t Sum444(const T* const src) {
  const auto total = src[0] + src[1] + src[2];
  return 4 * total;
}
360
// Returns the weighted sum of three consecutive elements of |src| with
// weights 5, 6, 5.
template <typename T>
inline uint32_t Sum565(const T* const src) {
  const auto outer = src[0] + src[2];
  const auto center = src[1];
  return 5 * outer + 6 * center;
}
365
366 template <int bitdepth>
BoxFilterPreProcess5(const uint16_t * const sum5[5],const uint32_t * const square_sum5[5],const int width,const uint32_t s,SgrBuffer * const sgr_buffer,uint16_t * const ma565,uint32_t * const b565)367 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
368 const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
369 const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
370 uint16_t* const ma565, uint32_t* const b565) {
371 int x = 0;
372 do {
373 uint32_t a = 0;
374 uint32_t b = 0;
375 for (int dy = 0; dy < 5; ++dy) {
376 a += square_sum5[dy][x];
377 b += sum5[dy][x];
378 }
379 CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
380 sgr_buffer->b + x);
381 } while (++x != width + 2);
382 x = 0;
383 do {
384 ma565[x] = Sum565(sgr_buffer->ma + x);
385 b565[x] = Sum565(sgr_buffer->b + x);
386 } while (++x != width);
387 }
388
389 template <int bitdepth>
BoxFilterPreProcess3(const uint16_t * const sum3[3],const uint32_t * const square_sum3[3],const int width,const uint32_t s,const bool calculate444,SgrBuffer * const sgr_buffer,uint16_t * const ma343,uint32_t * const b343,uint16_t * const ma444,uint32_t * const b444)390 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
391 const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
392 const int width, const uint32_t s, const bool calculate444,
393 SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
394 uint16_t* const ma444, uint32_t* const b444) {
395 int x = 0;
396 do {
397 uint32_t a = 0;
398 uint32_t b = 0;
399 for (int dy = 0; dy < 3; ++dy) {
400 a += square_sum3[dy][x];
401 b += sum3[dy][x];
402 }
403 CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
404 sgr_buffer->b + x);
405 } while (++x != width + 2);
406 x = 0;
407 do {
408 ma343[x] = Sum343(sgr_buffer->ma + x);
409 b343[x] = Sum343(sgr_buffer->b + x);
410 } while (++x != width);
411 if (calculate444) {
412 x = 0;
413 do {
414 ma444[x] = Sum444(sgr_buffer->ma + x);
415 b444[x] = Sum444(sgr_buffer->b + x);
416 } while (++x != width);
417 }
418 }
419
420 template <typename Pixel>
CalculateFilteredOutput(const Pixel src,const uint32_t ma,const uint32_t b,const int shift)421 inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
422 const uint32_t b, const int shift) {
423 const int32_t v = b - ma * src;
424 return RightShiftWithRounding(v,
425 kSgrProjSgrBits + shift - kSgrProjRestoreBits);
426 }
427
428 template <typename Pixel>
BoxFilterPass1Kernel(const Pixel src0,const Pixel src1,const uint16_t * const ma565[2],const uint32_t * const b565[2],const ptrdiff_t x,int p[2])429 inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
430 const uint16_t* const ma565[2],
431 const uint32_t* const b565[2],
432 const ptrdiff_t x, int p[2]) {
433 p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
434 b565[0][x] + b565[1][x], 5);
435 p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
436 }
437
438 template <typename Pixel>
BoxFilterPass2Kernel(const Pixel src,const uint16_t * const ma343[3],const uint16_t * const ma444,const uint32_t * const b343[3],const uint32_t * const b444,const ptrdiff_t x)439 inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
440 const uint16_t* const ma444,
441 const uint32_t* const b343[3],
442 const uint32_t* const b444, const ptrdiff_t x) {
443 const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
444 const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
445 return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
446 }
447
448 template <int bitdepth, typename Pixel>
SelfGuidedFinal(const int src,const int v)449 inline Pixel SelfGuidedFinal(const int src, const int v) {
450 // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
451 // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
452 // Then, range of s is bitdepth + 2. This is a rough estimation, taking the
453 // maximum value of each element.
454 const int s = src + RightShiftWithRounding(
455 v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
456 return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
457 }
458
459 template <int bitdepth, typename Pixel>
SelfGuidedDoubleMultiplier(const int src,const int filter0,const int filter1,const int16_t w0,const int16_t w2)460 inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
461 const int filter1, const int16_t w0,
462 const int16_t w2) {
463 const int v = w0 * filter0 + w2 * filter1;
464 return SelfGuidedFinal<bitdepth, Pixel>(src, v);
465 }
466
467 template <int bitdepth, typename Pixel>
SelfGuidedSingleMultiplier(const int src,const int filter,const int16_t w0)468 inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
469 const int16_t w0) {
470 const int v = w0 * filter;
471 return SelfGuidedFinal<bitdepth, Pixel>(src, v);
472 }
473
474 template <int bitdepth, typename Pixel>
BoxFilterPass1(const Pixel * const src,const ptrdiff_t stride,uint16_t * const sum5[5],uint32_t * const square_sum5[5],const int width,const uint32_t scale,const int16_t w0,SgrBuffer * const sgr_buffer,uint16_t * const ma565[2],uint32_t * const b565[2],Pixel * dst)475 inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
476 uint16_t* const sum5[5],
477 uint32_t* const square_sum5[5], const int width,
478 const uint32_t scale, const int16_t w0,
479 SgrBuffer* const sgr_buffer,
480 uint16_t* const ma565[2], uint32_t* const b565[2],
481 Pixel* dst) {
482 BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
483 ma565[1], b565[1]);
484 int x = 0;
485 do {
486 int p[2];
487 BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
488 dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
489 dst[stride + x] =
490 SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
491 } while (++x != width);
492 }
493
494 template <int bitdepth, typename Pixel>
BoxFilterPass2(const Pixel * const src,const Pixel * const src0,const int width,const uint16_t scale,const int16_t w0,uint16_t * const sum3[4],uint32_t * const square_sum3[4],SgrBuffer * const sgr_buffer,uint16_t * const ma343[4],uint16_t * const ma444[3],uint32_t * const b343[4],uint32_t * const b444[3],Pixel * dst)495 inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
496 const int width, const uint16_t scale,
497 const int16_t w0, uint16_t* const sum3[4],
498 uint32_t* const square_sum3[4],
499 SgrBuffer* const sgr_buffer,
500 uint16_t* const ma343[4], uint16_t* const ma444[3],
501 uint32_t* const b343[4], uint32_t* const b444[3],
502 Pixel* dst) {
503 BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
504 BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
505 sgr_buffer, ma343[2], b343[2], ma444[1],
506 b444[1]);
507 int x = 0;
508 do {
509 const int p =
510 BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
511 dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
512 } while (++x != width);
513 }
514
515 template <int bitdepth, typename Pixel>
BoxFilter(const Pixel * const src,const ptrdiff_t stride,uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],const int width,const uint16_t scales[2],const int16_t w0,const int16_t w2,SgrBuffer * const sgr_buffer,uint16_t * const ma343[4],uint16_t * const ma444[3],uint16_t * const ma565[2],uint32_t * const b343[4],uint32_t * const b444[3],uint32_t * const b565[2],Pixel * dst)516 inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
517 uint16_t* const sum3[4], uint16_t* const sum5[5],
518 uint32_t* const square_sum3[4],
519 uint32_t* const square_sum5[5], const int width,
520 const uint16_t scales[2], const int16_t w0,
521 const int16_t w2, SgrBuffer* const sgr_buffer,
522 uint16_t* const ma343[4], uint16_t* const ma444[3],
523 uint16_t* const ma565[2], uint32_t* const b343[4],
524 uint32_t* const b444[3], uint32_t* const b565[2],
525 Pixel* dst) {
526 BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
527 sgr_buffer, ma565[1], b565[1]);
528 BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
529 sgr_buffer, ma343[2], b343[2], ma444[1],
530 b444[1]);
531 BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
532 true, sgr_buffer, ma343[3], b343[3], ma444[2],
533 b444[2]);
534 int x = 0;
535 do {
536 int p[2][2];
537 BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
538 p[1][0] =
539 BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
540 p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
541 b343 + 1, b444[1], x);
542 dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
543 p[1][0], w0, w2);
544 dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
545 src[stride + x], p[0][1], p[1][1], w0, w2);
546 } while (++x != width);
547 }
548
// Runs the self-guided filter with both passes over a |width| x |height| unit
// and writes the restored pixels to |dst|.
//
// Pass 1 uses the 5-wide box sums (sum5 / square_sum5 -> ma565 / b565) with
// scales[0] and weight w0; pass 2 uses the 3-wide box sums (sum3 /
// square_sum3 -> ma343 / ma444 / b343 / b444) with scales[1] and weight w2.
// Rows are processed two at a time; an odd final row is handled separately at
// the end. Rows above the unit are read from |top_border| and rows below it
// from |bottom_border|, each with its own stride.
//
// The box sums are computed starting at column 0 of |src|, while the pixels
// that are actually filtered start at |src| + 3.
// NOTE(review): this suggests the caller passes pointers already shifted left
// by 3 pixels - confirm against the caller.
//
// All scratch rows live in |sgr_buffer|. The Circulate*PointersBy2 helpers are
// defined elsewhere; presumably they rotate the row-pointer arrays by two
// entries so the two oldest rows get overwritten next - confirm.
template <int bitdepth, typename Pixel>
inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
                             const Pixel* src, const ptrdiff_t stride,
                             const Pixel* const top_border,
                             const ptrdiff_t top_border_stride,
                             const Pixel* bottom_border,
                             const ptrdiff_t bottom_border_stride,
                             const int width, const int height,
                             SgrBuffer* const sgr_buffer, Pixel* dst) {
  // Row pitch of the ma/b scratch rows, and of the (8 elements wider) box sum
  // rows.
  const auto temp_stride = Align<ptrdiff_t>(width, 8);
  const ptrdiff_t sum_stride = temp_stride + 8;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  // w1 is only used to derive the pass 2 weight.
  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
  // Carve the row pointers out of the flat scratch arrays in |sgr_buffer|.
  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 3; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  b444[0] = sgr_buffer->b444;
  for (int i = 1; i <= 2; ++i) {
    ma444[i] = ma444[i - 1] + temp_stride;
    b444[i] = b444[i - 1] + temp_stride;
  }
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
  assert(scales[0] != 0);
  assert(scales[1] != 0);
  // Prime the box sums: two rows from |top_border| go to sum3[0..1] and
  // sum5[1..2].
  BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
                square_sum3, square_sum5 + 1);
  // Temporarily alias row 0 of the 5-row window to row 1, i.e. reuse the first
  // top border row's sums as the topmost row.
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  // First unit row goes to sum3[2] / sum5[3].
  BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
                square_sum5 + 3);
  // Second row goes to sum3[3] / sum5[4]; a one-row unit takes it from the
  // bottom border instead.
  const Pixel* const s = (height > 1) ? src + stride : bottom_border;
  BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
                square_sum5 + 4);
  // Intermediates that the first BoxFilter() call reads as its "previous"
  // rows: ma565[0], ma343[0..1] and ma444[0].
  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
                                 sgr_buffer, ma565[0], b565[0]);
  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
                                 sgr_buffer, ma343[0], b343[0], nullptr,
                                 nullptr);
  BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
                                 true, sgr_buffer, ma343[1], b343[1], ma444[0],
                                 b444[0]);
  // Undo the temporary aliasing so row 0 has its own storage again.
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

  // Main loop: every row pair except the last one, whose lookahead rows need
  // the bottom border.
  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate4PointersBy2<uint16_t>(sum3);
    Circulate4PointersBy2<uint32_t>(square_sum3);
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    // Box sums of the two rows below the current pair.
    BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
                  square_sum3 + 2, square_sum5 + 3);
    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
                               square_sum5, width, scales, w0, w2, sgr_buffer,
                               ma343, ma444, ma565, b343, b444, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    // The intermediates just computed become the "previous" rows of the next
    // iteration.
    Circulate4PointersBy2<uint16_t>(ma343);
    Circulate4PointersBy2<uint32_t>(b343);
    std::swap(ma444[0], ma444[2]);
    std::swap(b444[0], b444[2]);
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

  Circulate4PointersBy2<uint16_t>(sum3);
  Circulate4PointersBy2<uint32_t>(square_sum3);
  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  // Last full row pair; skipped only when height == 1.
  if ((height & 1) == 0 || height > 1) {
    const Pixel* sr;
    ptrdiff_t s_stride;
    if ((height & 1) == 0) {
      // Even height: both lookahead rows come from the bottom border.
      sr = bottom_border;
      s_stride = bottom_border_stride;
    } else {
      // Odd height: the first lookahead row is the last unit row. The stride
      // is chosen so that the second row BoxSum() reads is |bottom_border|.
      sr = src + 2 * stride;
      s_stride = bottom_border - (src + 2 * stride);
    }
    BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
                  square_sum3 + 2, square_sum5 + 3);
    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
                               square_sum5, width, scales, w0, w2, sgr_buffer,
                               ma343, ma444, ma565, b343, b444, b565, dst);
  }
  // Odd height: one row is left to filter.
  if ((height & 1) != 0) {
    src += 3;
    if (height > 1) {
      // Step past the pair handled above and rotate all scratch rows, exactly
      // as at the end of a main loop iteration.
      src += 2 * stride;
      dst += 2 * stride;
      Circulate4PointersBy2<uint16_t>(sum3);
      Circulate4PointersBy2<uint32_t>(square_sum3);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
      Circulate4PointersBy2<uint16_t>(ma343);
      Circulate4PointersBy2<uint32_t>(b343);
      std::swap(ma444[0], ma444[2]);
      std::swap(b444[0], b444[2]);
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
    }
    // Only the second bottom border row is still missing from the box sums.
    BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
                  width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
                  square_sum5 + 3);
    // Reuse that row's sums as the last row of the 5-row window.
    sum5[4] = sum5[3];
    square_sum5[4] = square_sum5[3];
    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
                                   sgr_buffer, ma565[1], b565[1]);
    BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
                                   sgr_buffer, ma343[2], b343[2], nullptr,
                                   nullptr);
    int x = 0;
    do {
      // Same per-pixel computation as the upper row in BoxFilter().
      const int p0 = CalculateFilteredOutput<Pixel>(
          src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
      const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
                                                 b444[0], x);
      dst[x] =
          SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
    } while (++x != width);
  }
}
692
693 template <int bitdepth, typename Pixel>
BoxFilterProcessPass1(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)694 inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
695 const Pixel* src, const ptrdiff_t stride,
696 const Pixel* const top_border,
697 const ptrdiff_t top_border_stride,
698 const Pixel* bottom_border,
699 const ptrdiff_t bottom_border_stride,
700 const int width, const int height,
701 SgrBuffer* const sgr_buffer, Pixel* dst) {
702 const auto temp_stride = Align<ptrdiff_t>(width, 8);
703 const ptrdiff_t sum_stride = temp_stride + 8;
704 const int sgr_proj_index = restoration_info.sgr_proj_info.index;
705 const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
706 const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
707 uint16_t *sum5[5], *ma565[2];
708 uint32_t *square_sum5[5], *b565[2];
709 sum5[0] = sgr_buffer->sum5;
710 square_sum5[0] = sgr_buffer->square_sum5;
711 for (int i = 1; i <= 4; ++i) {
712 sum5[i] = sum5[i - 1] + sum_stride;
713 square_sum5[i] = square_sum5[i - 1] + sum_stride;
714 }
715 ma565[0] = sgr_buffer->ma565;
716 ma565[1] = ma565[0] + temp_stride;
717 b565[0] = sgr_buffer->b565;
718 b565[1] = b565[0] + temp_stride;
719 assert(scale != 0);
720 BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
721 square_sum5 + 1);
722 sum5[0] = sum5[1];
723 square_sum5[0] = square_sum5[1];
724 BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
725 const Pixel* const s = (height > 1) ? src + stride : bottom_border;
726 BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
727 BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
728 ma565[0], b565[0]);
729 sum5[0] = sgr_buffer->sum5;
730 square_sum5[0] = sgr_buffer->square_sum5;
731
732 for (int y = (height >> 1) - 1; y > 0; --y) {
733 Circulate5PointersBy2<uint16_t>(sum5);
734 Circulate5PointersBy2<uint32_t>(square_sum5);
735 BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
736 square_sum5 + 3);
737 BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
738 scale, w0, sgr_buffer, ma565, b565, dst);
739 src += 2 * stride;
740 dst += 2 * stride;
741 std::swap(ma565[0], ma565[1]);
742 std::swap(b565[0], b565[1]);
743 }
744
745 Circulate5PointersBy2<uint16_t>(sum5);
746 Circulate5PointersBy2<uint32_t>(square_sum5);
747 if ((height & 1) == 0 || height > 1) {
748 const Pixel* sr;
749 ptrdiff_t s_stride;
750 if ((height & 1) == 0) {
751 sr = bottom_border;
752 s_stride = bottom_border_stride;
753 } else {
754 sr = src + 2 * stride;
755 s_stride = bottom_border - (src + 2 * stride);
756 }
757 BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
758 BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
759 scale, w0, sgr_buffer, ma565, b565, dst);
760 }
761 if ((height & 1) != 0) {
762 src += 3;
763 if (height > 1) {
764 src += 2 * stride;
765 dst += 2 * stride;
766 std::swap(ma565[0], ma565[1]);
767 std::swap(b565[0], b565[1]);
768 Circulate5PointersBy2<uint16_t>(sum5);
769 Circulate5PointersBy2<uint32_t>(square_sum5);
770 }
771 BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
772 1, width + 2, sum5 + 3, square_sum5 + 3);
773 sum5[4] = sum5[3];
774 square_sum5[4] = square_sum5[3];
775 BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
776 ma565[1], b565[1]);
777 int x = 0;
778 do {
779 const int p = CalculateFilteredOutput<Pixel>(
780 src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
781 dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
782 } while (++x != width);
783 }
784 }
785
786 template <int bitdepth, typename Pixel>
BoxFilterProcessPass2(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)787 inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
788 const Pixel* src, const ptrdiff_t stride,
789 const Pixel* const top_border,
790 const ptrdiff_t top_border_stride,
791 const Pixel* bottom_border,
792 const ptrdiff_t bottom_border_stride,
793 const int width, const int height,
794 SgrBuffer* const sgr_buffer, Pixel* dst) {
795 assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
796 const auto temp_stride = Align<ptrdiff_t>(width, 8);
797 const ptrdiff_t sum_stride = temp_stride + 8;
798 const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
799 const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
800 const int sgr_proj_index = restoration_info.sgr_proj_info.index;
801 const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
802 uint16_t *sum3[3], *ma343[3], *ma444[2];
803 uint32_t *square_sum3[3], *b343[3], *b444[2];
804 sum3[0] = sgr_buffer->sum3;
805 square_sum3[0] = sgr_buffer->square_sum3;
806 ma343[0] = sgr_buffer->ma343;
807 b343[0] = sgr_buffer->b343;
808 for (int i = 1; i <= 2; ++i) {
809 sum3[i] = sum3[i - 1] + sum_stride;
810 square_sum3[i] = square_sum3[i - 1] + sum_stride;
811 ma343[i] = ma343[i - 1] + temp_stride;
812 b343[i] = b343[i - 1] + temp_stride;
813 }
814 ma444[0] = sgr_buffer->ma444;
815 ma444[1] = ma444[0] + temp_stride;
816 b444[0] = sgr_buffer->b444;
817 b444[1] = b444[0] + temp_stride;
818 assert(scale != 0);
819 BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
820 square_sum3);
821 BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
822 BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
823 sgr_buffer, ma343[0], b343[0], nullptr,
824 nullptr);
825 Circulate3PointersBy1<uint16_t>(sum3);
826 Circulate3PointersBy1<uint32_t>(square_sum3);
827 const Pixel* s;
828 if (height > 1) {
829 s = src + stride;
830 } else {
831 s = bottom_border;
832 bottom_border += bottom_border_stride;
833 }
834 BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
835 BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
836 sgr_buffer, ma343[1], b343[1], ma444[0],
837 b444[0]);
838
839 for (int y = height - 2; y > 0; --y) {
840 Circulate3PointersBy1<uint16_t>(sum3);
841 Circulate3PointersBy1<uint32_t>(square_sum3);
842 BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
843 sum3, square_sum3, sgr_buffer, ma343, ma444,
844 b343, b444, dst);
845 src += stride;
846 dst += stride;
847 Circulate3PointersBy1<uint16_t>(ma343);
848 Circulate3PointersBy1<uint32_t>(b343);
849 std::swap(ma444[0], ma444[1]);
850 std::swap(b444[0], b444[1]);
851 }
852
853 src += 2;
854 int y = std::min(height, 2);
855 do {
856 Circulate3PointersBy1<uint16_t>(sum3);
857 Circulate3PointersBy1<uint32_t>(square_sum3);
858 BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
859 square_sum3, sgr_buffer, ma343, ma444, b343,
860 b444, dst);
861 src += stride;
862 dst += stride;
863 bottom_border += bottom_border_stride;
864 Circulate3PointersBy1<uint16_t>(ma343);
865 Circulate3PointersBy1<uint32_t>(b343);
866 std::swap(ma444[0], ma444[1]);
867 std::swap(b444[0], b444[1]);
868 } while (--y != 0);
869 }
870
871 template <int bitdepth, typename Pixel>
SelfGuidedFilter_C(const RestorationUnitInfo & LIBGAV1_RESTRICT restoration_info,const void * LIBGAV1_RESTRICT const source,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_border,const ptrdiff_t top_border_stride,const void * LIBGAV1_RESTRICT const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * LIBGAV1_RESTRICT const restoration_buffer,void * LIBGAV1_RESTRICT const dest)872 void SelfGuidedFilter_C(
873 const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
874 const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
875 const void* LIBGAV1_RESTRICT const top_border,
876 const ptrdiff_t top_border_stride,
877 const void* LIBGAV1_RESTRICT const bottom_border,
878 const ptrdiff_t bottom_border_stride, const int width, const int height,
879 RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
880 void* LIBGAV1_RESTRICT const dest) {
881 const int index = restoration_info.sgr_proj_info.index;
882 const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
883 const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
884 const auto* src = static_cast<const Pixel*>(source);
885 const auto* top = static_cast<const Pixel*>(top_border);
886 const auto* bottom = static_cast<const Pixel*>(bottom_border);
887 auto* dst = static_cast<Pixel*>(dest);
888 SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
889 if (radius_pass_1 == 0) {
890 // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
891 // following assertion.
892 assert(radius_pass_0 != 0);
893 BoxFilterProcessPass1<bitdepth, Pixel>(
894 restoration_info, src - 3, stride, top - 3, top_border_stride,
895 bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
896 } else if (radius_pass_0 == 0) {
897 BoxFilterProcessPass2<bitdepth, Pixel>(
898 restoration_info, src - 2, stride, top - 2, top_border_stride,
899 bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
900 } else {
901 BoxFilterProcess<bitdepth, Pixel>(
902 restoration_info, src - 3, stride, top - 3, top_border_stride,
903 bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
904 }
905 }
906
Init8bpp()907 void Init8bpp() {
908 Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
909 assert(dsp != nullptr);
910 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
911 dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
912 dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
913 #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
914 static_cast<void>(dsp);
915 #ifndef LIBGAV1_Dsp8bpp_WienerFilter
916 dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
917 #endif
918 #ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
919 dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
920 #endif
921 #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
922 }
923
924 #if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp()925 void Init10bpp() {
926 Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
927 assert(dsp != nullptr);
928 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
929 dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
930 dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
931 #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
932 static_cast<void>(dsp);
933 #ifndef LIBGAV1_Dsp10bpp_WienerFilter
934 dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
935 #endif
936 #ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
937 dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
938 #endif
939 #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
940 }
941 #endif // LIBGAV1_MAX_BITDEPTH >= 10
942
943 #if LIBGAV1_MAX_BITDEPTH == 12
Init12bpp()944 void Init12bpp() {
945 Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
946 assert(dsp != nullptr);
947 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
948 dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
949 dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
950 #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
951 static_cast<void>(dsp);
952 #ifndef LIBGAV1_Dsp12bpp_WienerFilter
953 dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
954 #endif
955 #ifndef LIBGAV1_Dsp12bpp_SelfGuidedFilter
956 dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
957 #endif
958 #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
959 }
960 #endif // LIBGAV1_MAX_BITDEPTH == 12
961
962 } // namespace
963
LoopRestorationInit_C()964 void LoopRestorationInit_C() {
965 Init8bpp();
966 #if LIBGAV1_MAX_BITDEPTH >= 10
967 Init10bpp();
968 #endif
969 #if LIBGAV1_MAX_BITDEPTH == 12
970 Init12bpp();
971 #endif
972 }
973
974 } // namespace dsp
975 } // namespace libgav1
976