/* xref: /aosp_15_r20/external/libdav1d/src/arm/looprestoration.h (revision c09093415860a1c2373dacd84c4fde00c507cdfd) */
1 /*
2  * Copyright © 2018, VideoLAN and dav1d authors
3  * Copyright © 2018, Two Orioles, LLC
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "src/cpu.h"
29 #include "src/looprestoration.h"
30 
31 #if ARCH_AARCH64
32 void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
33                                     const pixel (*left)[4], const pixel *lpf,
34                                     const int w, int h,
35                                     const LooprestorationParams *const params,
36                                     const enum LrEdgeFlags edges
37                                     HIGHBD_DECL_SUFFIX);
38 void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
39                                     const pixel (*left)[4], const pixel *lpf,
40                                     const int w, int h,
41                                     const LooprestorationParams *const params,
42                                     const enum LrEdgeFlags edges
43                                     HIGHBD_DECL_SUFFIX);
44 #else
45 
46 // The 8bpc version calculates things slightly differently than the reference
47 // C version. That version calculates roughly this:
48 // int16_t sum = 0;
49 // for (int i = 0; i < 7; i++)
50 //     sum += src[idx] * fh[i];
51 // int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
52 // sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
53 // sum += 1 << (bitdepth + 6 - round_bits_h);
54 // Compared to the reference C version, this is the output of the first pass
55 // _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
56 // with round_offset precompensated.
57 // The 16bpc version calculates things pretty much the same way as the
58 // reference C version, but with the end result subtracted by
59 // 1 << (bitdepth + 6 - round_bits_h).
60 void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
61                                      const pixel *src, ptrdiff_t stride,
62                                      const int16_t fh[8], intptr_t w,
63                                      int h, enum LrEdgeFlags edges
64                                      HIGHBD_DECL_SUFFIX);
65 // This calculates things slightly differently than the reference C version.
66 // This version calculates roughly this:
67 // int32_t sum = 0;
68 // for (int i = 0; i < 7; i++)
69 //     sum += mid[idx] * fv[i];
70 // sum = (sum + rounding_off_v) >> round_bits_v;
71 // This function assumes that the width is a multiple of 8.
72 void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
73                                      const int16_t *mid, int w, int h,
74                                      const int16_t fv[8], enum LrEdgeFlags edges,
75                                      ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
76 
/*
 * Two-pass Wiener filter used when not building for AArch64.
 *
 * The horizontal pass writes 16-bit intermediates into the stack buffer `mid`;
 * the vertical pass then reads them and writes the final pixels to `dst`.
 * Layout of `mid` (rows of `mid_stride` elements):
 *   rows 0..1       two rows read from `lpf` (only filled with LR_HAVE_TOP)
 *   rows 2..h+1     the h rows of the block itself, read from `dst`
 *   rows h+2..h+3   two rows read from `lpf + 6 * stride` (only filled with
 *                   LR_HAVE_BOTTOM)
 *
 * params->filter[0] holds the horizontal taps and params->filter[1] the
 * vertical taps, as passed to the respective passes below.
 */
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
                               const pixel (*const left)[4], const pixel *lpf,
                               const int w, const int h,
                               const LooprestorationParams *const params,
                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    const int16_t (*const filter)[8] = params->filter;
    ALIGN_STK_16(int16_t, mid, 68 * 384,);
    // Row pitch of mid in elements: w rounded up to a multiple of 8, which is
    // what the vertical pass requires of its width.
    const int mid_stride = (w + 7) & ~7;
    int16_t *const mid_block = &mid[2 * mid_stride];

    // Horizontal filter: the block rows first, then the optional edge rows.
    BF(dav1d_wiener_filter_h, neon)(mid_block, left, dst, stride,
                                    filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
    if (edges & LR_HAVE_TOP) {
        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
                                        filter[0], w, 2, edges
                                        HIGHBD_TAIL_SUFFIX);
    }
    if (edges & LR_HAVE_BOTTOM) {
        BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
                                        lpf + 6 * PXSTRIDE(stride),
                                        stride, filter[0], w, 2, edges
                                        HIGHBD_TAIL_SUFFIX);
    }

    // Vertical filter; the mid stride is passed in bytes.
    BF(dav1d_wiener_filter_v, neon)(dst, stride, mid_block,
                                    w, h, filter[1], edges,
                                    mid_stride * sizeof(*mid)
                                    HIGHBD_TAIL_SUFFIX);
}
106 #endif
107 
108 #if ARCH_ARM
109 void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
110                                 const pixel (*left)[4],
111                                 const pixel *src, const ptrdiff_t stride,
112                                 const int w, const int h,
113                                 const enum LrEdgeFlags edges);
114 void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
115                            const int w, const int h,
116                            const enum LrEdgeFlags edges);
117 void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
118                              const int w, const int h, const int strength,
119                              const int bitdepth_max);
120 void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
121                                         const pixel *src, const ptrdiff_t stride,
122                                         const int32_t *a, const int16_t *b,
123                                         const int w, const int h);
124 
125 /* filter with a 3x3 box (radius=1) */
dav1d_sgr_filter1_neon(int16_t * tmp,const pixel * src,const ptrdiff_t stride,const pixel (* left)[4],const pixel * lpf,const int w,const int h,const int strength,const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)126 static void dav1d_sgr_filter1_neon(int16_t *tmp,
127                                    const pixel *src, const ptrdiff_t stride,
128                                    const pixel (*left)[4], const pixel *lpf,
129                                    const int w, const int h, const int strength,
130                                    const enum LrEdgeFlags edges
131                                    HIGHBD_DECL_SUFFIX)
132 {
133     ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
134     int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
135     ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
136     int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
137 
138     BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
139     if (edges & LR_HAVE_TOP)
140         BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
141                                    NULL, lpf, stride, w, 2, edges);
142 
143     if (edges & LR_HAVE_BOTTOM)
144         BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
145                                    NULL, lpf + 6 * PXSTRIDE(stride),
146                                    stride, w, 2, edges);
147 
148     dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
149     dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
150     BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
151 }
152 
153 void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
154                                 const pixel (*left)[4],
155                                 const pixel *src, const ptrdiff_t stride,
156                                 const int w, const int h,
157                                 const enum LrEdgeFlags edges);
158 void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
159                            const int w, const int h,
160                            const enum LrEdgeFlags edges);
161 void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
162                              const int w, const int h, const int strength,
163                              const int bitdepth_max);
164 void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
165                                         const pixel *src, const ptrdiff_t stride,
166                                         const int32_t *a, const int16_t *b,
167                                         const int w, const int h);
168 
169 /* filter with a 5x5 box (radius=2) */
dav1d_sgr_filter2_neon(int16_t * tmp,const pixel * src,const ptrdiff_t stride,const pixel (* left)[4],const pixel * lpf,const int w,const int h,const int strength,const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)170 static void dav1d_sgr_filter2_neon(int16_t *tmp,
171                                    const pixel *src, const ptrdiff_t stride,
172                                    const pixel (*left)[4], const pixel *lpf,
173                                    const int w, const int h, const int strength,
174                                    const enum LrEdgeFlags edges
175                                    HIGHBD_DECL_SUFFIX)
176 {
177     ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
178     int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
179     ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
180     int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
181 
182     BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
183     if (edges & LR_HAVE_TOP)
184         BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
185                                    NULL, lpf, stride, w, 2, edges);
186 
187     if (edges & LR_HAVE_BOTTOM)
188         BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
189                                    NULL, lpf + 6 * PXSTRIDE(stride),
190                                    stride, w, 2, edges);
191 
192     dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
193     dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
194     BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
195 }
196 
197 void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
198                                    const pixel *src, const ptrdiff_t src_stride,
199                                    const int16_t *t1, const int w, const int h,
200                                    const int wt HIGHBD_DECL_SUFFIX);
201 void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
202                                    const pixel *src, const ptrdiff_t src_stride,
203                                    const int16_t *t1, const int16_t *t2,
204                                    const int w, const int h,
205                                    const int16_t wt[2] HIGHBD_DECL_SUFFIX);
206 
/*
 * Self-guided filter using only the 5x5 box (32-bit ARM path).
 *
 * Computes the filtered intermediate into a stack buffer with strength
 * params->sgr.s0, then blends it into `dst` in place (dst is both source and
 * destination of the weighting step) with weight params->sgr.w0.
 */
static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, tmp, 64 * 384,);

    dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
                           w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
                                  tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
}
219 
/*
 * Self-guided filter using only the 3x3 box (32-bit ARM path).
 *
 * Computes the filtered intermediate into a stack buffer with strength
 * params->sgr.s1, then blends it into `dst` in place (dst is both source and
 * destination of the weighting step) with weight params->sgr.w1.
 */
static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, tmp, 64 * 384,);

    dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
                           w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
                                  tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
}
232 
/*
 * Self-guided filter combining the 5x5 and 3x3 boxes (32-bit ARM path).
 *
 * Both intermediates are computed from the unmodified `dst` (5x5 with
 * strength s0 into tmp1, 3x3 with strength s1 into tmp2) before the weighting
 * step blends them into `dst` in place with the weights { w0, w1 }.
 */
static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
    ALIGN_STK_16(int16_t, tmp2, 64 * 384,);

    dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
                           w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
    dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
                           w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);

    const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
    BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
                                  tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}
249 
250 #else
/*
 * Rotate the first n entries of two parallel pointer arrays left by one:
 * entry i takes the value of entry i + 1 and the old first entry moves to
 * index n - 1. Both arrays are rotated in lockstep.
 *
 * For n < 2 there is nothing to rotate and the arrays are left untouched
 * (this also keeps n <= 0 from indexing before the start of the arrays).
 */
static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    if (n < 2)
        return;

    int32_t *const head32 = sumsq_ptrs[0];
    int16_t *const head16 = sum_ptrs[0];
    const int last = n - 1;
    for (int i = 0; i < last; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
        sum_ptrs[i] = sum_ptrs[i + 1];
    }
    sumsq_ptrs[last] = head32;
    sum_ptrs[last] = head16;
}
/*
 * Rotate two parallel 5-entry pointer arrays left by two positions:
 * entries 2..4 move to 0..2 and the old entries 0..1 end up at 3..4.
 */
static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
    // Save the two leading entries before they are overwritten.
    int32_t *const sq0 = sumsq_ptrs[0], *const sq1 = sumsq_ptrs[1];
    int16_t *const sm0 = sum_ptrs[0], *const sm1 = sum_ptrs[1];

    for (int i = 0; i < 3; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
        sum_ptrs[i] = sum_ptrs[i + 2];
    }

    sumsq_ptrs[3] = sq0;
    sumsq_ptrs[4] = sq1;
    sum_ptrs[3] = sm0;
    sum_ptrs[4] = sm1;
}
277 
// Rotate a 3-entry A/B row-pointer pair left by one (see rotate()).
static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 3);
}
281 
// Rotate a 2-entry A/B row-pointer pair left by one, i.e. swap the two rows
// (see rotate()).
static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 2);
}
285 
// Rotate a 4-entry A/B row-pointer pair left by one (see rotate()).
static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 4);
}
289 
290 void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
291                                     const pixel (*left)[4],
292                                     const pixel *src, const int w,
293                                     const enum LrEdgeFlags edges);
294 void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
295                                     const pixel (*left)[4],
296                                     const pixel *src, const int w,
297                                     const enum LrEdgeFlags edges);
298 void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
299                                      int32_t *sumsq5, int16_t *sum5,
300                                      const pixel (*left)[4],
301                                      const pixel *src, const int w,
302                                      const enum LrEdgeFlags edges);
303 
304 void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
305                               int32_t *AA, int16_t *BB,
306                               const int w, const int s,
307                               const int bitdepth_max);
308 void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
309                               int32_t *AA, int16_t *BB,
310                               const int w, const int s,
311                               const int bitdepth_max);
312 
313 void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
314                                           int32_t **A_ptrs, int16_t **B_ptrs,
315                                           const int w, const int w1
316                                           HIGHBD_DECL_SUFFIX);
317 void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
318                                           int32_t **A_ptrs, int16_t **B_ptrs,
319                                           const int w, const int h,
320                                           const int w1 HIGHBD_DECL_SUFFIX);
321 
322 void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
323                                               const ptrdiff_t src_stride,
324                                               int32_t **A_ptrs,
325                                               int16_t **B_ptrs,
326                                               const int w, const int h);
327 void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
328                                               const ptrdiff_t src_stride,
329                                               int32_t **A_ptrs, int16_t **B_ptrs,
330                                               const int w, const int h);
331 void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
332                                    const pixel *src, const ptrdiff_t src_stride,
333                                    const int16_t *t1, const int16_t *t2,
334                                    const int w, const int h,
335                                    const int16_t wt[2] HIGHBD_DECL_SUFFIX);
336 
/*
 * Vertical step of the 3x3 box filter (box3_v + calc_ab1) for one output row.
 *
 * Writes the result for the current three-row window to sumsq_out/sum_out and
 * then advances the window by rotating the three row pointers by one.
 */
static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate(sumsq, sum, 3);
}
344 
/*
 * Vertical step of the 5x5 box filter (box5_v + calc_ab2) for one output row.
 *
 * Writes the result for the current five-row window to sumsq_out/sum_out and
 * then advances the window by rotating the five row pointers by two.
 */
static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate5_x2(sumsq, sum);
}
352 
/*
 * Combined horizontal + vertical step of the 3x3 box filter for one new row.
 *
 * The horizontal sums of `src` are written into the newest window slot
 * (sumsq[2]/sum[2]); sgr_box3_vert_neon() then produces the AA/BB row for the
 * window and rotates the row pointers.
 */
static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
                             int32_t *AA, int16_t *BB,
                             const pixel (*left)[4],
                             const pixel *src, const int w,
                             const int s,
                             const enum LrEdgeFlags edges,
                             const int bitdepth_max) {
    BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
    sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
}
363 
364 
/*
 * Emit one output row of the 3x3 filter.
 *
 * Runs the weighted finishing step on the row at *dst, advances *dst by one
 * row and rotates the three A/B row pointers by one.
 */
static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs, const int w,
                             const int w1 HIGHBD_DECL_SUFFIX) {
    pixel *const row = *dst;

    BF(dav1d_sgr_finish_weighted1, neon)(row, A_ptrs, B_ptrs,
                                         w, w1 HIGHBD_TAIL_SUFFIX);
    *dst = row + PXSTRIDE(stride);
    rotate_ab_3(A_ptrs, B_ptrs);
}
373 
/*
 * Emit output rows of the 5x5 filter.
 *
 * Runs the weighted finishing step starting at *dst (h is forwarded to it
 * unchanged), then always advances *dst by two rows and swaps the two A/B row
 * pointers.
 */
static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs,
                             const int w, const int h, const int w1
                             HIGHBD_DECL_SUFFIX) {
    pixel *const row = *dst;

    BF(dav1d_sgr_finish_weighted2, neon)(row, stride, A_ptrs, B_ptrs,
                                         w, h, w1 HIGHBD_TAIL_SUFFIX);
    *dst = row + 2 * PXSTRIDE(stride);
    rotate_ab_2(A_ptrs, B_ptrs);
}
383 
/*
 * Emit output rows of the mixed (5x5 + 3x3) filter.
 *
 * The 5x5 and 3x3 intermediates for the rows at *dst are computed into two
 * small stack buffers (two rows of FILTER_OUT_STRIDE elements each) and then
 * blended into *dst in place with the weights { w0, w1 }. Afterwards *dst is
 * advanced by h rows, the 5x5 A/B pointers are rotated as a 2-entry ring and
 * the 3x3 A/B pointers as a 4-entry ring.
 */
static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
                                int32_t **A5_ptrs, int16_t **B5_ptrs,
                                int32_t **A3_ptrs, int16_t **B3_ptrs,
                                const int w, const int h,
                                const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
#define FILTER_OUT_STRIDE 384
    ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);
    pixel *const rows = *dst;

    BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, rows, stride,
                                             A5_ptrs, B5_ptrs, w, h);
    BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, rows, stride,
                                             A3_ptrs, B3_ptrs, w, h);

    const int16_t wt[2] = { w0, w1 };
    BF(dav1d_sgr_weighted2, neon)(rows, stride, rows, stride,
                                  tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);

    *dst = rows + h*PXSTRIDE(stride);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    rotate_ab_4(A3_ptrs, B3_ptrs);
}
404 
405 
/*
 * Self-guided filter using only the 3x3 box (AArch64 path), applied in place
 * to the block at dst (the source pointer starts out equal to dst).
 *
 * The filter is streamed row by row through two small rings of row buffers:
 *   - sumsq_ptrs/sum_ptrs: a three-row window of horizontal box sums. Each
 *     sgr_box3_hv_neon()/sgr_box3_vert_neon() call turns the current window
 *     into one A/B row and rotates the window by one.
 *   - A_ptrs/B_ptrs: a three-row ring of A/B rows. Each sgr_finish1_neon()
 *     call writes one row of dst, advances dst and rotates this ring.
 *
 * Phases:
 *   1. Prologue. With LR_HAVE_TOP the window is primed with two rows from
 *      lpf; otherwise all three window slots alias the first block row until
 *      distinct rows become available. Two A/B rows are produced here. If the
 *      block runs out of rows (h reaches 0) the code jumps to vert_1 or
 *      vert_2.
 *   2. Main loop. One new source row in, one dst row out, per iteration.
 *   3. Epilogue. With LR_HAVE_BOTTOM two more rows are read from
 *      lpf + 6 * stride; otherwise (vert_2/output_1) the newest window row is
 *      duplicated in place of the missing rows. Either way two further dst
 *      rows are written.
 *
 * BUF_STRIDE is deliberately left defined after this function; it is a
 * file-level macro from here on.
 */
static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
#define BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
    // *_rows are the fixed row buffers, *_ptrs the rotating window over them.
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    int16_t *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
    int32_t *A_ptrs[3];
    int16_t *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    // src runs ahead of dst: rows are read before the output is written.
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        // Prime the window with the two rows taken from lpf.
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        // First block row -> first A/B row.
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        // Second block row -> second A/B row.
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;
    } else {
        // No rows above: every window slot aliases the first block row.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        // Bring the second row buffer into the window for the next row.
        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // And the third one; from here on the window holds distinct buffers.
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    // Steady state: one source row consumed and one dst row written per pass.
    do {
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two rows from the bottom lpf area finish the last two dst rows.
    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Two dst rows still to write and no new input: duplicate the newest
    // window row instead of reading another one.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    // Last dst row, again with the newest window row duplicated.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Only one block row was read: produce the second A/B row from a
    // duplicated window row, then write the single dst row via output_1.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_3(A_ptrs, B_ptrs);
    goto output_1;
}
553 
// Self-guided restoration filter, 5x5 box only: C driver around the NEON
// row kernels.
//
// Source rows are read from dst itself (src starts out equal to dst). Each
// input row is passed through dav1d_sgr_box5_row_h, which fills one row of
// sums and one row of squared sums. sgr_box5_vert_neon consumes a window of
// five such rows (sumsq_ptrs/sum_ptrs) and fills one A/B row, and
// sgr_finish2_neon is given the A/B rows plus &dst to produce output.
// NOTE(review): the helpers are defined outside this block; what is said
// about them here is inferred from their names, the existing comments and
// the call pattern.
//
//   dst     first row of the block; read as source and passed to the
//           finish step by address
//   stride  row pitch, always used through PXSTRIDE(stride)
//   left    passed to the row kernel together with each source row and
//           advanced by one entry per row (NULL is passed for lpf rows)
//   lpf     with LR_HAVE_TOP, the two rows at lpf and lpf + PXSTRIDE(stride)
//           are used above the block; with LR_HAVE_BOTTOM, the two rows
//           starting at lpf + 6 * PXSTRIDE(stride) are used below it
//   w       forwarded unchanged to every kernel
//   h       number of source rows; decremented as rows are consumed
//   params  sgr.s0 goes to the vertical pass, sgr.w0 to the finish step
//   edges   tested for LR_HAVE_TOP / LR_HAVE_BOTTOM and forwarded to the
//           row kernel
static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Storage for five rows of box sums. *_rows[i] are the fixed row starts
    // (BUF_STRIDE apart); *_ptrs[] is the window handed to the vertical
    // pass, whose entries may alias each other for edge padding.
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    int16_t *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    // Two rows of A/B values; the vertical pass always writes to index 1.
    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A_ptrs[2];
    int16_t *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // Initial window: rows[0] (first lpf row) occupies slots 0 and 1,
        // rows[1] (second lpf row) slot 2, and rows[2]/rows[3] (the first
        // two source rows) slots 3 and 4.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // Single source row: slot 4 was never filled, vert_1 pads it.
        if (--h <= 0)
            goto vert_1;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        // Produces the first A/B row only; no output is written here.
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        // No rows above: every window slot starts out aliasing rows[0],
        // which receives the first source row.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        // The second source row goes to rows[1], in the last window slot.
        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // Produces the first A/B row only; no output is written here.
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // Source rows three and four go to rows[2] and rows[3].
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // First call to the finish step, asking for two rows.
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    // Steady state: each iteration reads two source rows into window slots
    // 3 and 4, then runs one vertical pass and one two-row finish step.
    do {
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // Source ran out after the first row of this pair.
        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    // All source rows consumed. Without rows below the block, pad by
    // duplication; otherwise feed the two rows found at lpf_bottom.
    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                   NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                   NULL, lpf_bottom, w, edges);

output_2:
    // Last vertical pass followed by a final two-row finish step.
    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Reached when a source row has just been written through slot 3 and
    // no further source row follows it.
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Reached when the block has exactly one source row.
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A_ptrs, B_ptrs);

    goto output_1;
}
767 
// Self-guided restoration filter, combined 3x3 + 5x5 variant: C driver
// around the NEON row kernels.
//
// Source rows are read from dst itself (src starts out equal to dst). Each
// input row is passed through dav1d_sgr_box35_row_h, which fills one row of
// the 3-box sums and one row of the 5-box sums in a single call. The 3-box
// vertical pass (sgr_box3_vert_neon, params->sgr.s1) is run once per input
// row; the 5-box vertical pass (sgr_box5_vert_neon, params->sgr.s0) once per
// pair of rows. sgr_finish_mix_neon receives both sets of A/B rows together
// with params->sgr.w0 and params->sgr.w1.
// NOTE(review): the helpers are defined outside this block; what is said
// about them here is inferred from their names, the existing comments and
// the call pattern.
//
//   dst     first row of the block; read as source and passed to the
//           finish step by address
//   stride  row pitch, always used through PXSTRIDE(stride)
//   left    passed to the row kernel together with each source row and
//           advanced by one entry per row (NULL is passed for lpf rows)
//   lpf     with LR_HAVE_TOP, the two rows at lpf and lpf + PXSTRIDE(stride)
//           are used above the block; with LR_HAVE_BOTTOM, the two rows
//           starting at lpf + 6 * PXSTRIDE(stride) are used below it
//   w       forwarded unchanged to every kernel
//   h       number of source rows; decremented as rows are consumed
//   params  supplies sgr.s0, sgr.s1, sgr.w0 and sgr.w1
//   edges   tested for LR_HAVE_TOP / LR_HAVE_BOTTOM and forwarded to the
//           row kernel
static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Five rows of 5-box sums and three rows of 3-box sums. *_rows[i] are
    // the fixed row starts (BUF_STRIDE apart); *_ptrs[] are the windows
    // handed to the vertical passes, whose entries may alias each other for
    // edge padding.
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
    int16_t *sum5_ptrs[5], *sum5_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
    int16_t *sum3_ptrs[3], *sum3_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
    }

    // Two A/B rows for the 5-box path (vertical pass writes index 1) and
    // four for the 3-box path (vertical pass writes index 3).
    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A5_ptrs[2];
    int16_t *B5_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
    ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
    int32_t *A3_ptrs[4];
    int16_t *B3_ptrs[4];
    for (int i = 0; i < 4; i++) {
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // Initial 5-box window: rows[0] (first lpf row) occupies slots 0
        // and 1, rows[1] (second lpf row) slot 2, rows[2]/rows[3] (first two
        // source rows) slots 3 and 4.
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[1];
        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[1];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        // Initial 3-box window: the two lpf rows and the first source row,
        // each in its own buffer row.
        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[1];
        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[1];
        sum3_ptrs[2] = sum3_rows[2];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        NULL, lpf, w, edges);

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        // Single source row: vert_1 pads the remaining window slots.
        if (--h <= 0)
            goto vert_1;

        // From here on the 3-box destination is taken from the window
        // (sumsq3_ptrs[2]) rather than from a fixed row; presumably
        // sgr_box3_vert_neon has already rotated that window.
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        // Produces the first 5-box A/B row; no output is written here.
        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    } else {
        // No rows above: every slot of both windows starts out aliasing
        // rows[0], which receives the first source row.
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[0];
        sumsq5_ptrs[3] = sumsq5_rows[0];
        sumsq5_ptrs[4] = sumsq5_rows[0];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[0];
        sum5_ptrs[3] = sum5_rows[0];
        sum5_ptrs[4] = sum5_rows[0];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[0];
        sumsq3_ptrs[2] = sumsq3_rows[0];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[0];
        sum3_ptrs[2] = sum3_rows[0];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_1;

        // The second source row goes to rows[1] of both buffers, in the
        // last slot of each window.
        sumsq5_ptrs[4] = sumsq5_rows[1];
        sum5_ptrs[4] = sum5_rows[1];

        sumsq3_ptrs[2] = sumsq3_rows[1];
        sum3_ptrs[2] = sum3_rows[1];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // Produces the first 5-box A/B row; no output is written here.
        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        // Source rows three and four go to 5-box rows[2] and rows[3]; the
        // third row also goes to 3-box rows[2].
        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[2] = sum3_rows[2];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // First call to the finish step, asking for two rows.
        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    }

    // Steady state: each iteration reads two source rows. The 3-box
    // vertical pass runs after each row, the 5-box vertical pass and the
    // two-row finish step once per iteration.
    do {
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[3], sum5_ptrs[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        // Source ran out after the first row of this pair.
        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[4], sum5_ptrs[4],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    // All source rows consumed. Without rows below the block, pad by
    // duplication; otherwise feed the two rows found at lpf_bottom.
    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[3], sum5_ptrs[3],
                                    NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[4], sum5_ptrs[4],
                                    NULL, lpf_bottom, w, edges);

output_2:
    // Last vertical passes followed by a final two-row finish step.
    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    // The 3-box window gets its last row duplicated once for each of the
    // two remaining 3-box vertical passes (this one and the one at
    // output_2).
    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    goto output_2;

odd:
    // Reached when a source row has just been processed (including its
    // 3-box vertical pass) and no further source row follows it.
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);
    // Output only one row
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 1, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Reached when the block has exactly one source row.
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    goto output_1;
}
1093 
1094 #endif
1095 
1096 
loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext * const c,int bpc)1097 static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
1098     const unsigned flags = dav1d_get_cpu_flags();
1099 
1100     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
1101 
1102 #if ARCH_AARCH64
1103     c->wiener[0] = BF(dav1d_wiener_filter7, neon);
1104     c->wiener[1] = BF(dav1d_wiener_filter5, neon);
1105 #else
1106     c->wiener[0] = c->wiener[1] = wiener_filter_neon;
1107 #endif
1108     if (BITDEPTH == 8 || bpc == 10) {
1109         c->sgr[0] = sgr_filter_5x5_neon;
1110         c->sgr[1] = sgr_filter_3x3_neon;
1111         c->sgr[2] = sgr_filter_mix_neon;
1112     }
1113 }
1114