xref: /aosp_15_r20/external/libgav1/src/post_filter/cdef.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include <cassert>
15 
16 #include "src/post_filter.h"
17 #include "src/utils/blocking_counter.h"
18 #include "src/utils/compiler_attributes.h"
19 #include "src/utils/constants.h"
20 
21 namespace libgav1 {
22 namespace {
23 
// Number of 4x4 units along one side of a 64x64 cdef unit.
constexpr int kStep64x64 = 64 / 4;
// Value stored in the per-block luma direction array (bit 3 set) to mark a
// block that is not filtered by cdef.
constexpr int kCdefSkip = 1 << 3;

// Translates a luma direction into the direction used for the chroma planes.
// Indexed as [subsampling_x][subsampling_y][luma direction].
constexpr uint8_t kCdefUvDirection[2][2][8] = {
    {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
    {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};

// Plane rows, relative to the top of a 64x64 unit row, that
// PostFilter::SetupCdefBorder() backs up. Indexed as [subsampling_y][i].
constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
32 
33 template <typename Pixel>
CopyRowForCdef(const Pixel * src,int block_width,int unit_width,bool is_frame_left,bool is_frame_right,uint16_t * const dst,const Pixel * left_border=nullptr)34 void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
35                     bool is_frame_left, bool is_frame_right,
36                     uint16_t* const dst, const Pixel* left_border = nullptr) {
37   if (sizeof(src[0]) == sizeof(dst[0])) {
38     if (is_frame_left) {
39       Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
40     } else if (left_border == nullptr) {
41       memcpy(dst - kCdefBorder, src - kCdefBorder,
42              kCdefBorder * sizeof(dst[0]));
43     } else {
44       memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
45     }
46     memcpy(dst, src, block_width * sizeof(dst[0]));
47     if (is_frame_right) {
48       Memset(dst + block_width, kCdefLargeValue,
49              unit_width + kCdefBorder - block_width);
50     } else {
51       memcpy(dst + block_width, src + block_width,
52              (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
53     }
54     return;
55   }
56   if (is_frame_left) {
57     for (int x = -kCdefBorder; x < 0; ++x) {
58       dst[x] = static_cast<uint16_t>(kCdefLargeValue);
59     }
60   } else if (left_border == nullptr) {
61     for (int x = -kCdefBorder; x < 0; ++x) {
62       dst[x] = src[x];
63     }
64   } else {
65     for (int x = -kCdefBorder; x < 0; ++x) {
66       dst[x] = left_border[x + kCdefBorder];
67     }
68   }
69   for (int x = 0; x < block_width; ++x) {
70     dst[x] = src[x];
71   }
72   for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
73     dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
74   }
75 }
76 
77 // GCC 13.x will report a false positive from the call to
78 // ApplyCdefForOneSuperBlockRowHelper() with a nullptr in
79 // ApplyCdefForOneSuperBlockRow(). The call to CopyPixels() in
80 // ApplyCdefForOneUnit() is only made when thread_pool_ != nullptr and
81 // border_columns[][] is a valid pointer.
82 #if defined(__GNUC__) && !defined(__clang__)
83 #pragma GCC diagnostic push
84 #pragma GCC diagnostic ignored "-Warray-bounds"
85 #pragma GCC diagnostic ignored "-Wstringop-overflow"
86 #endif
// Copies a rectangle of |width| x |height| pixels, each |pixel_size| bytes
// wide, from |src| to |dst|. |src_stride| and |dst_stride| are the byte
// distances between the starts of consecutive rows. |height| must be positive.
void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
                int dst_stride, int width, int height, size_t pixel_size) {
  assert(src != nullptr);
  assert(dst != nullptr);
  assert(height > 0);
  const size_t row_bytes = width * pixel_size;
  const uint8_t* src_row = src;
  uint8_t* dst_row = dst;
  int rows_left = height;
  do {
    memcpy(dst_row, src_row, row_bytes);
    src_row += src_stride;
    dst_row += dst_stride;
    --rows_left;
  } while (rows_left != 0);
}
101 #if defined(__GNUC__) && !defined(__clang__)
102 #pragma GCC diagnostic pop
103 #endif
104 
105 }  // namespace
106 
SetupCdefBorder(int row4x4)107 void PostFilter::SetupCdefBorder(int row4x4) {
108   assert(row4x4 >= 0);
109   assert(DoCdef());
110   int plane = kPlaneY;
111   do {
112     const ptrdiff_t src_stride = frame_buffer_.stride(plane);
113     const ptrdiff_t dst_stride = cdef_border_.stride(plane);
114     const int row_offset = DivideBy4(row4x4);
115     const int num_pixels = SubsampledValue(
116         MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
117     const int row_width = num_pixels << pixel_size_log2_;
118     const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
119                                              subsampling_y_[plane]);
120     for (int i = 0; i < 4; ++i) {
121       const int row = kCdefBorderRows[subsampling_y_[plane]][i];
122       const int absolute_row =
123           (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
124       if (absolute_row >= plane_height) break;
125       const uint8_t* src =
126           GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
127           row * src_stride;
128       uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
129       memcpy(dst, src, row_width);
130     }
131   } while (++plane < planes_);
132 }
133 
134 template <typename Pixel>
PrepareCdefBlock(int block_width4x4,int block_height4x4,int row4x4,int column4x4,uint16_t * cdef_source,ptrdiff_t cdef_stride,const bool y_plane,const uint8_t border_columns[kMaxPlanes][256],bool use_border_columns)135 void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
136                                   int row4x4, int column4x4,
137                                   uint16_t* cdef_source, ptrdiff_t cdef_stride,
138                                   const bool y_plane,
139                                   const uint8_t border_columns[kMaxPlanes][256],
140                                   bool use_border_columns) {
141   assert(y_plane || planes_ == kMaxPlanes);
142   const int max_planes = y_plane ? 1 : kMaxPlanes;
143   const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
144   const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
145   const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
146   const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
147   const int plane_width = SubsampledValue(frame_header_.width, subsampling_x);
148   const int plane_height = SubsampledValue(frame_header_.height, subsampling_y);
149   const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
150   const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
151   // unit_width, unit_height are the same as block_width, block_height unless
152   // it reaches the frame boundary, where block_width < 64 or
153   // block_height < 64. unit_width, unit_height guarantee we build blocks on
154   // a multiple of 8.
155   const int unit_width = Align(block_width, 8 >> subsampling_x);
156   const int unit_height = Align(block_height, 8 >> subsampling_y);
157   const bool is_frame_left = column4x4 == 0;
158   const bool is_frame_right = start_x + block_width >= plane_width;
159   const bool is_frame_top = row4x4 == 0;
160   const bool is_frame_bottom = start_y + block_height >= plane_height;
161   const int y_offset = is_frame_top ? 0 : kCdefBorder;
162   const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
163 
164   for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
165     uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
166                                            kCdefUnitSizeWithBorders *
167                                            kCdefUnitSizeWithBorders;
168     const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
169     const Pixel* src_buffer =
170         reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
171         (start_y - y_offset) * src_stride + start_x;
172     const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
173     const Pixel* cdef_border =
174         (thread_pool_ == nullptr)
175             ? nullptr
176             : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
177                   cdef_border_row_offset * cdef_border_stride + start_x;
178 
179     // All the copying code will use negative indices for populating the left
180     // border. So the starting point is set to kCdefBorder.
181     cdef_src += kCdefBorder;
182 
183     // Copy the top 2 rows as follows;
184     // If is_frame_top is true, both the rows are set to kCdefLargeValue.
185     // Otherwise:
186     //   If multi-threaded filtering is off, the rows are copied from
187     //   |src_buffer|.
188     //   Otherwise, the rows are copied from |cdef_border|.
189     if (is_frame_top) {
190       for (int y = 0; y < kCdefBorder; ++y) {
191         Memset(cdef_src - kCdefBorder, kCdefLargeValue,
192                unit_width + 2 * kCdefBorder);
193         cdef_src += cdef_stride;
194       }
195     } else {
196       const Pixel* top_border =
197           (thread_pool_ == nullptr) ? src_buffer : cdef_border;
198       const int top_border_stride =
199           (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
200       for (int y = 0; y < kCdefBorder; ++y) {
201         CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
202                        is_frame_right, cdef_src);
203         top_border += top_border_stride;
204         cdef_src += cdef_stride;
205         // We need to increment |src_buffer| and |cdef_border| in this loop to
206         // set them up for the subsequent loops below.
207         src_buffer += src_stride;
208         cdef_border += cdef_border_stride;
209       }
210     }
211 
212     // Copy the body as follows;
213     // If multi-threaded filtering is off or if is_frame_bottom is true, all the
214     // rows are copied from |src_buffer|.
215     // Otherwise, the first |block_height|-kCdefBorder rows are copied from
216     // |src_buffer| and the last kCdefBorder rows are coped from |cdef_border|.
217     int y = block_height;
218     const int y_threshold =
219         (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
220     const Pixel* left_border =
221         (thread_pool_ == nullptr || !use_border_columns)
222             ? nullptr
223             : reinterpret_cast<const Pixel*>(border_columns[plane]);
224     do {
225       CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
226                      is_frame_right, cdef_src, left_border);
227       cdef_src += cdef_stride;
228       src_buffer += src_stride;
229       if (left_border != nullptr) left_border += kCdefBorder;
230     } while (--y != y_threshold);
231 
232     if (y > 0) {
233       assert(y == kCdefBorder);
234       // |cdef_border| now points to the top 2 rows of the current block. For
235       // the next loop, we need it to point to the bottom 2 rows of the
236       // current block. So increment it by 2 rows.
237       cdef_border += MultiplyBy2(cdef_border_stride);
238       for (int i = 0; i < kCdefBorder; ++i) {
239         CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
240                        is_frame_right, cdef_src);
241         cdef_src += cdef_stride;
242         cdef_border += cdef_border_stride;
243       }
244     }
245 
246     // Copy the bottom 2 rows as follows;
247     // If is_frame_bottom is true, both the rows are set to kCdefLargeValue.
248     // Otherwise:
249     //   If multi-threaded filtering is off, the rows are copied from
250     //   |src_buffer|.
251     //   Otherwise, the rows are copied from |cdef_border|.
252     y = 0;
253     if (is_frame_bottom) {
254       do {
255         Memset(cdef_src - kCdefBorder, kCdefLargeValue,
256                unit_width + 2 * kCdefBorder);
257         cdef_src += cdef_stride;
258       } while (++y < kCdefBorder + unit_height - block_height);
259     } else {
260       const Pixel* bottom_border =
261           (thread_pool_ == nullptr) ? src_buffer : cdef_border;
262       const int bottom_border_stride =
263           (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
264       do {
265         CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
266                        is_frame_right, cdef_src);
267         bottom_border += bottom_border_stride;
268         cdef_src += cdef_stride;
269       } while (++y < kCdefBorder + unit_height - block_height);
270     }
271   }
272 }
273 
274 template <typename Pixel>
ApplyCdefForOneUnit(uint16_t * cdef_block,const int index,const int block_width4x4,const int block_height4x4,const int row4x4_start,const int column4x4_start,uint8_t border_columns[2][kMaxPlanes][256],bool use_border_columns[2][2])275 void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
276                                      const int block_width4x4,
277                                      const int block_height4x4,
278                                      const int row4x4_start,
279                                      const int column4x4_start,
280                                      uint8_t border_columns[2][kMaxPlanes][256],
281                                      bool use_border_columns[2][2]) {
282   // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
283   static constexpr int kStep = 8;
284   static constexpr int kStep4x4 = 2;
285 
286   int cdef_buffer_row_base_stride[kMaxPlanes];
287   uint8_t* cdef_buffer_row_base[kMaxPlanes];
288   int src_buffer_row_base_stride[kMaxPlanes];
289   const uint8_t* src_buffer_row_base[kMaxPlanes];
290   const uint16_t* cdef_src_row_base[kMaxPlanes];
291   int cdef_src_row_base_stride[kMaxPlanes];
292   int column_step[kMaxPlanes];
293   assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
294   int plane = kPlaneY;
295   do {
296     cdef_buffer_row_base[plane] =
297         GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
298     cdef_buffer_row_base_stride[plane] =
299         frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
300     src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
301                                                  row4x4_start, column4x4_start);
302     src_buffer_row_base_stride[plane] =
303         frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
304     cdef_src_row_base[plane] =
305         cdef_block +
306         static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
307             kCdefUnitSizeWithBorders +
308         kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
309     cdef_src_row_base_stride[plane] =
310         kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
311     column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
312   } while (++plane < planes_);
313 
314   // |border_columns| contains two buffers. In each call to this function, we
315   // will use one of them as the "destination" for the current call. And the
316   // other one as the "source" for the current call (which would have been the
317   // "destination" of the previous call). We will use the src_index to populate
318   // the borders which were backed up in the previous call. We will use the
319   // dst_index to populate the borders to be used in the next call.
320   const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
321   const int border_columns_dst_index = border_columns_src_index ^ 1;
322 
323   if (index == -1) {
324     if (thread_pool_ == nullptr) {
325       int plane = kPlaneY;
326       do {
327         CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
328                    cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
329                    MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
330                    MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
331                    sizeof(Pixel));
332       } while (++plane < planes_);
333     }
334     use_border_columns[border_columns_dst_index][0] = false;
335     use_border_columns[border_columns_dst_index][1] = false;
336     return;
337   }
338 
339   const bool is_frame_right =
340       MultiplyBy4(column4x4_start + block_width4x4) >= frame_header_.width;
341   if (!is_frame_right && thread_pool_ != nullptr) {
342     // Backup the last 2 columns for use in the next iteration.
343     use_border_columns[border_columns_dst_index][0] = true;
344     const uint8_t* src_line =
345         GetSourceBuffer(kPlaneY, row4x4_start,
346                         column4x4_start + block_width4x4) -
347         kCdefBorder * sizeof(Pixel);
348     assert(border_columns != nullptr);
349     CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
350                border_columns[border_columns_dst_index][kPlaneY],
351                kCdefBorder * sizeof(Pixel), kCdefBorder,
352                MultiplyBy4(block_height4x4), sizeof(Pixel));
353   }
354 
355   PrepareCdefBlock<Pixel>(
356       block_width4x4, block_height4x4, row4x4_start, column4x4_start,
357       cdef_block, kCdefUnitSizeWithBorders, true,
358       (border_columns != nullptr) ? border_columns[border_columns_src_index]
359                                   : nullptr,
360       use_border_columns[border_columns_src_index][0]);
361 
362   // Stored direction used during the u/v pass.  If bit 3 is set, then block is
363   // a skip.
364   uint8_t direction_y[8 * 8];
365   int y_index = 0;
366 
367   const uint8_t y_primary_strength =
368       frame_header_.cdef.y_primary_strength[index];
369   const uint8_t y_secondary_strength =
370       frame_header_.cdef.y_secondary_strength[index];
371   // y_strength_index is 0 for both primary and secondary strengths being
372   // non-zero, 1 for primary only, 2 for secondary only. This will be updated
373   // with y_primary_strength after variance is applied.
374   int y_strength_index = static_cast<int>(y_secondary_strength == 0);
375 
376   const bool compute_direction_and_variance =
377       (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
378   const uint8_t* skip_row =
379       &cdef_skip_[row4x4_start >> 1][column4x4_start >> 4];
380   const int skip_stride = cdef_skip_.columns();
381   int row4x4 = row4x4_start;
382   do {
383     uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
384     const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
385     const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
386     int column4x4 = column4x4_start;
387 
388     if (*skip_row == 0) {
389       for (int i = 0; i < DivideBy2(block_width4x4); ++i, ++y_index) {
390         direction_y[y_index] = kCdefSkip;
391       }
392       if (thread_pool_ == nullptr) {
393         CopyPixels(src_buffer_base, frame_buffer_.stride(kPlaneY),
394                    cdef_buffer_base, frame_buffer_.stride(kPlaneY), 64, kStep,
395                    sizeof(Pixel));
396       }
397     } else {
398       do {
399         const int block_width = kStep;
400         const int block_height = kStep;
401         const int cdef_stride = frame_buffer_.stride(kPlaneY);
402         uint8_t* const cdef_buffer = cdef_buffer_base;
403         const uint16_t* const cdef_src = cdef_src_base;
404         const int src_stride = frame_buffer_.stride(kPlaneY);
405         const uint8_t* const src_buffer = src_buffer_base;
406 
407         const uint8_t skip_shift = (column4x4 >> 1) & 0x7;
408         const bool skip = ((*skip_row >> skip_shift) & 1) == 0;
409         if (skip) {  // No cdef filtering.
410           direction_y[y_index] = kCdefSkip;
411           if (thread_pool_ == nullptr) {
412             CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
413                        block_width, block_height, sizeof(Pixel));
414           }
415         } else {
416           // Zero out residual skip flag.
417           direction_y[y_index] = 0;
418 
419           int variance = 0;
420           if (compute_direction_and_variance) {
421             if (thread_pool_ == nullptr ||
422                 row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
423               dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
424                                   &variance);
425             } else if (sizeof(Pixel) == 2) {
426               dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
427                                   &direction_y[y_index], &variance);
428             } else {
429               // If we are in the last row4x4 for this unit, then the last two
430               // input rows have to come from |cdef_border_|. Since we already
431               // have |cdef_src| populated correctly, use that as the input
432               // for the direction process.
433               uint8_t direction_src[8][8];
434               const uint16_t* cdef_src_line = cdef_src;
435               for (auto& direction_src_line : direction_src) {
436                 for (int i = 0; i < 8; ++i) {
437                   direction_src_line[i] = cdef_src_line[i];
438                 }
439                 cdef_src_line += kCdefUnitSizeWithBorders;
440               }
441               dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
442                                   &variance);
443             }
444           }
445           const int direction =
446               (y_primary_strength == 0) ? 0 : direction_y[y_index];
447           const int variance_strength =
448               ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
449                                      : 0;
450           const uint8_t primary_strength =
451               (variance != 0)
452                   ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
453                   : 0;
454           if ((primary_strength | y_secondary_strength) == 0) {
455             if (thread_pool_ == nullptr) {
456               CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
457                          block_width, block_height, sizeof(Pixel));
458             }
459           } else {
460             const int strength_index =
461                 y_strength_index |
462                 (static_cast<int>(primary_strength == 0) << 1);
463             dsp_.cdef_filters[1][strength_index](
464                 cdef_src, kCdefUnitSizeWithBorders, block_height,
465                 primary_strength, y_secondary_strength,
466                 frame_header_.cdef.damping, direction, cdef_buffer,
467                 cdef_stride);
468           }
469         }
470         cdef_buffer_base += column_step[kPlaneY];
471         src_buffer_base += column_step[kPlaneY];
472         cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
473 
474         column4x4 += kStep4x4;
475         y_index++;
476       } while (column4x4 < column4x4_start + block_width4x4);
477     }
478 
479     cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
480     src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
481     cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
482     skip_row += skip_stride;
483     row4x4 += kStep4x4;
484   } while (row4x4 < row4x4_start + block_height4x4);
485 
486   if (planes_ == kMaxPlanesMonochrome) {
487     return;
488   }
489 
490   const uint8_t uv_primary_strength =
491       frame_header_.cdef.uv_primary_strength[index];
492   const uint8_t uv_secondary_strength =
493       frame_header_.cdef.uv_secondary_strength[index];
494 
495   if ((uv_primary_strength | uv_secondary_strength) == 0) {
496     if (thread_pool_ == nullptr) {
497       for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
498         CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
499                    cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
500                    MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
501                    MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
502                    sizeof(Pixel));
503       }
504     }
505     use_border_columns[border_columns_dst_index][1] = false;
506     return;
507   }
508 
509   if (!is_frame_right && thread_pool_ != nullptr) {
510     use_border_columns[border_columns_dst_index][1] = true;
511     for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
512       // Backup the last 2 columns for use in the next iteration.
513       const uint8_t* src_line =
514           GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
515                           column4x4_start + block_width4x4) -
516           kCdefBorder * sizeof(Pixel);
517       CopyPixels(src_line, frame_buffer_.stride(plane),
518                  border_columns[border_columns_dst_index][plane],
519                  kCdefBorder * sizeof(Pixel), kCdefBorder,
520                  MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
521                  sizeof(Pixel));
522     }
523   }
524 
525   PrepareCdefBlock<Pixel>(
526       block_width4x4, block_height4x4, row4x4_start, column4x4_start,
527       cdef_block, kCdefUnitSizeWithBorders, false,
528       (border_columns != nullptr) ? border_columns[border_columns_src_index]
529                                   : nullptr,
530       use_border_columns[border_columns_src_index][1]);
531 
532   // uv_strength_index is 0 for both primary and secondary strengths being
533   // non-zero, 1 for primary only, 2 for secondary only.
534   const int uv_strength_index =
535       (static_cast<int>(uv_primary_strength == 0) << 1) |
536       static_cast<int>(uv_secondary_strength == 0);
537   for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
538     const int8_t subsampling_x = subsampling_x_[plane];
539     const int8_t subsampling_y = subsampling_y_[plane];
540     const int block_width = kStep >> subsampling_x;
541     const int block_height = kStep >> subsampling_y;
542     int row4x4 = row4x4_start;
543 
544     y_index = 0;
545     do {
546       uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
547       const uint8_t* src_buffer_base = src_buffer_row_base[plane];
548       const uint16_t* cdef_src_base = cdef_src_row_base[plane];
549       int column4x4 = column4x4_start;
550       do {
551         const int cdef_stride = frame_buffer_.stride(plane);
552         uint8_t* const cdef_buffer = cdef_buffer_base;
553         const int src_stride = frame_buffer_.stride(plane);
554         const uint8_t* const src_buffer = src_buffer_base;
555         const uint16_t* const cdef_src = cdef_src_base;
556         const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
557         int dual_cdef = 0;
558 
559         if (skip) {  // No cdef filtering.
560           if (thread_pool_ == nullptr) {
561             CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
562                        block_width, block_height, sizeof(Pixel));
563           }
564         } else {
565           // Make sure block pair is not out of bounds.
566           if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
567             // Enable dual processing if subsampling_x is 1.
568             dual_cdef = subsampling_x;
569           }
570 
571           int direction = (uv_primary_strength == 0)
572                               ? 0
573                               : kCdefUvDirection[subsampling_x][subsampling_y]
574                                                 [direction_y[y_index]];
575 
576           if (dual_cdef != 0) {
577             if (uv_primary_strength &&
578                 direction_y[y_index] != direction_y[y_index + 1]) {
579               // Disable dual processing if the second block of the pair does
580               // not have the same direction.
581               dual_cdef = 0;
582             }
583 
584             // Disable dual processing if the second block of the pair is a
585             // skip.
586             if (direction_y[y_index + 1] == kCdefSkip) {
587               dual_cdef = 0;
588             }
589           }
590 
591           // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
592           const int width_index = dual_cdef | (subsampling_x ^ 1);
593           dsp_.cdef_filters[width_index][uv_strength_index](
594               cdef_src, kCdefUnitSizeWithBorders, block_height,
595               uv_primary_strength, uv_secondary_strength,
596               frame_header_.cdef.damping - 1, direction, cdef_buffer,
597               cdef_stride);
598         }
599         // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
600         // so adjust the pointers and indexes for 2 blocks.
601         cdef_buffer_base += column_step[plane] << dual_cdef;
602         src_buffer_base += column_step[plane] << dual_cdef;
603         cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
604         column4x4 += kStep4x4 << dual_cdef;
605         y_index += 1 << dual_cdef;
606       } while (column4x4 < column4x4_start + block_width4x4);
607 
608       cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
609       src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
610       cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
611       row4x4 += kStep4x4;
612     } while (row4x4 < row4x4_start + block_height4x4);
613   }
614 }
615 
ApplyCdefForOneSuperBlockRowHelper(uint16_t * cdef_block,uint8_t border_columns[2][kMaxPlanes][256],int row4x4,int block_height4x4)616 void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
617     uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
618     int row4x4, int block_height4x4) {
619   bool use_border_columns[2][2] = {};
620   const bool non_zero_index = frame_header_.cdef.bits > 0;
621   const int8_t* cdef_index =
622       non_zero_index ? cdef_index_[DivideBy16(row4x4)] : nullptr;
623   int column4x4 = 0;
624   do {
625     const int index = non_zero_index ? *cdef_index++ : 0;
626     const int block_width4x4 =
627         std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
628 
629 #if LIBGAV1_MAX_BITDEPTH >= 10
630     if (bitdepth_ >= 10) {
631       ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
632                                     block_height4x4, row4x4, column4x4,
633                                     border_columns, use_border_columns);
634     } else  // NOLINT
635 #endif      // LIBGAV1_MAX_BITDEPTH >= 10
636     {
637       ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
638                                    block_height4x4, row4x4, column4x4,
639                                    border_columns, use_border_columns);
640     }
641     column4x4 += kStep64x64;
642   } while (column4x4 < frame_header_.columns4x4);
643 }
644 
ApplyCdefForOneSuperBlockRow(int row4x4_start,int sb4x4,bool is_last_row)645 void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
646                                               bool is_last_row) {
647   assert(row4x4_start >= 0);
648   assert(DoCdef());
649   int row4x4 = row4x4_start;
650   const int row4x4_limit = row4x4_start + sb4x4;
651   do {
652     if (row4x4 >= frame_header_.rows4x4) return;
653 
654     // Apply cdef for the last 8 rows of the previous superblock row.
655     // One exception: If the superblock size is 128x128 and is_last_row is true,
656     // then we simply apply cdef for the entire superblock row without any lag.
657     // In that case, apply cdef for the previous superblock row only during the
658     // first iteration (row4x4 == row4x4_start).
659     if (row4x4 > 0 && (!is_last_row || row4x4 == row4x4_start)) {
660       assert(row4x4 >= 16);
661       ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
662     }
663 
664     // Apply cdef for the current superblock row. If this is the last superblock
665     // row we apply cdef for all the rows, otherwise we leave out the last 8
666     // rows.
667     const int block_height4x4 =
668         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
669     const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
670     if (height4x4 > 0) {
671       ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
672                                          height4x4);
673     }
674     row4x4 += kStep64x64;
675   } while (row4x4 < row4x4_limit);
676 }
677 
ApplyCdefWorker(std::atomic<int> * row4x4_atomic)678 void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
679   int row4x4;
680   uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
681   // Each border_column buffer has to store 64 rows and 2 columns for each
682   // plane. For 10bit, that is 64*2*2 = 256 bytes.
683   alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
684   while ((row4x4 = row4x4_atomic->fetch_add(
685               kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
686     const int block_height4x4 =
687         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
688     ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
689                                        block_height4x4);
690   }
691 }
692 
693 }  // namespace libgav1
694