1 /*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Two Orioles, LLC
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "src/cpu.h"
29 #include "src/looprestoration.h"
30
31 #if ARCH_AARCH64
32 void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
33 const pixel (*left)[4], const pixel *lpf,
34 const int w, int h,
35 const LooprestorationParams *const params,
36 const enum LrEdgeFlags edges
37 HIGHBD_DECL_SUFFIX);
38 void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
39 const pixel (*left)[4], const pixel *lpf,
40 const int w, int h,
41 const LooprestorationParams *const params,
42 const enum LrEdgeFlags edges
43 HIGHBD_DECL_SUFFIX);
44 #else
45
46 // The 8bpc version calculates things slightly differently than the reference
47 // C version. That version calculates roughly this:
48 // int16_t sum = 0;
49 // for (int i = 0; i < 7; i++)
50 // sum += src[idx] * fh[i];
51 // int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
52 // sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
53 // sum += 1 << (bitdepth + 6 - round_bits_h);
54 // Compared to the reference C version, this is the output of the first pass
55 // _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
56 // with round_offset precompensated.
57 // The 16bpc version calculates things pretty much the same way as the
58 // reference C version, but with the end result subtracted by
59 // 1 << (bitdepth + 6 - round_bits_h).
60 void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
61 const pixel *src, ptrdiff_t stride,
62 const int16_t fh[8], intptr_t w,
63 int h, enum LrEdgeFlags edges
64 HIGHBD_DECL_SUFFIX);
65 // This calculates things slightly differently than the reference C version.
66 // This version calculates roughly this:
67 // int32_t sum = 0;
68 // for (int i = 0; i < 7; i++)
69 // sum += mid[idx] * fv[i];
70 // sum = (sum + rounding_off_v) >> round_bits_v;
71 // This function assumes that the width is a multiple of 8.
72 void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
73 const int16_t *mid, int w, int h,
74 const int16_t fv[8], enum LrEdgeFlags edges,
75 ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
76
// Full Wiener filter for one loop-restoration unit on ARM32, composed from
// the separate horizontal- and vertical-pass NEON kernels declared above.
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
                               const pixel (*const left)[4], const pixel *lpf,
                               const int w, const int h,
                               const LooprestorationParams *const params,
                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    const int16_t (*const filter)[8] = params->filter;
    // Intermediate horizontal-pass output: 68 rows (up to 64 unit rows plus
    // 2 rows of edge context above and below) at a fixed 384-entry stride.
    ALIGN_STK_16(int16_t, mid, 68 * 384,);
    // Round the row stride up to a multiple of 8; the vertical NEON kernel
    // assumes the width is a multiple of 8 (see comment above its prototype).
    int mid_stride = (w + 7) & ~7;

    // Horizontal filter: the unit's own rows go to mid row 2 onwards,
    // leaving rows 0-1 free for the top-edge context.
    BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
                                    filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
    // Two rows of above-edge context from the loop-filter line buffer.
    if (edges & LR_HAVE_TOP)
        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
                                        filter[0], w, 2, edges
                                        HIGHBD_TAIL_SUFFIX);
    // Two rows of below-edge context; they are stored 6 rows into the
    // lpf buffer.
    if (edges & LR_HAVE_BOTTOM)
        BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
                                        lpf + 6 * PXSTRIDE(stride),
                                        stride, filter[0], w, 2, edges
                                        HIGHBD_TAIL_SUFFIX);

    // Vertical filter over the intermediate buffer, writing final pixels.
    BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
                                    w, h, filter[1], edges,
                                    mid_stride * sizeof(*mid)
                                    HIGHBD_TAIL_SUFFIX);
}
106 #endif
107
108 #if ARCH_ARM
109 void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
110 const pixel (*left)[4],
111 const pixel *src, const ptrdiff_t stride,
112 const int w, const int h,
113 const enum LrEdgeFlags edges);
114 void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
115 const int w, const int h,
116 const enum LrEdgeFlags edges);
117 void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
118 const int w, const int h, const int strength,
119 const int bitdepth_max);
120 void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
121 const pixel *src, const ptrdiff_t stride,
122 const int32_t *a, const int16_t *b,
123 const int w, const int h);
124
/* filter with a 3x3 box (radius=1) */
// Produces the radius-1 self-guided filter output for the whole unit into
// tmp (int16 per pixel), to be blended into dst by a weighted-sum kernel.
static void dav1d_sgr_filter1_neon(int16_t *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const pixel (*left)[4], const pixel *lpf,
                                   const int w, const int h, const int strength,
                                   const enum LrEdgeFlags edges
                                   HIGHBD_DECL_SUFFIX)
{
    // Box-sum buffers: (384 + 16)-entry row stride, 68 rows; sumsq/sum are
    // offset 2 rows into the buffers so the LR_HAVE_TOP rows can be written
    // at negative row indices below. a and b alias sumsq and sum: the
    // vertical-sum and calc_ab passes reuse the same storage.
    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;

    // Horizontal 3-wide box sums for the unit's own rows.
    BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
    // Two rows of above-edge context from the loop-filter line buffer.
    if (edges & LR_HAVE_TOP)
        BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
                                   NULL, lpf, stride, w, 2, edges);

    // Two rows of below-edge context; stored 6 rows into the lpf buffer.
    if (edges & LR_HAVE_BOTTOM)
        BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
                                   NULL, lpf + 6 * PXSTRIDE(stride),
                                   stride, w, 2, edges);

    // Vertical sums, a/b coefficient computation, then the final
    // neighbourhood filter producing tmp.
    dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
    dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
    BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
}
152
153 void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
154 const pixel (*left)[4],
155 const pixel *src, const ptrdiff_t stride,
156 const int w, const int h,
157 const enum LrEdgeFlags edges);
158 void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
159 const int w, const int h,
160 const enum LrEdgeFlags edges);
161 void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
162 const int w, const int h, const int strength,
163 const int bitdepth_max);
164 void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
165 const pixel *src, const ptrdiff_t stride,
166 const int32_t *a, const int16_t *b,
167 const int w, const int h);
168
/* filter with a 5x5 box (radius=2) */
// Radius-2 counterpart of dav1d_sgr_filter1_neon: same buffer layout and
// pass structure, but using the box5 / calc_ab2 / finish_filter2 kernels.
static void dav1d_sgr_filter2_neon(int16_t *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const pixel (*left)[4], const pixel *lpf,
                                   const int w, const int h, const int strength,
                                   const enum LrEdgeFlags edges
                                   HIGHBD_DECL_SUFFIX)
{
    // Box-sum buffers offset by 2 rows for top-edge context; a/b alias
    // sumsq/sum (the later passes operate on the same storage).
    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;

    // Horizontal 5-wide box sums for the unit's own rows.
    BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
    // Two rows of above-edge context at negative row offsets.
    if (edges & LR_HAVE_TOP)
        BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
                                   NULL, lpf, stride, w, 2, edges);

    // Two rows of below-edge context; stored 6 rows into the lpf buffer.
    if (edges & LR_HAVE_BOTTOM)
        BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
                                   NULL, lpf + 6 * PXSTRIDE(stride),
                                   stride, w, 2, edges);

    // Vertical sums, a/b computation, then the final filter into tmp.
    dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
    dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
    BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
}
196
197 void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
198 const pixel *src, const ptrdiff_t src_stride,
199 const int16_t *t1, const int w, const int h,
200 const int wt HIGHBD_DECL_SUFFIX);
201 void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
202 const pixel *src, const ptrdiff_t src_stride,
203 const int16_t *t1, const int16_t *t2,
204 const int w, const int h,
205 const int16_t wt[2] HIGHBD_DECL_SUFFIX);
206
// Self-guided restoration, 5x5 (radius=2) box only: run the radius-2
// filter into a scratch buffer, then blend it into dst with weight w0.
static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // One int16 intermediate value per pixel, up to 64 rows of 384.
    ALIGN_STK_16(int16_t, filtered, 64 * 384,);
    dav1d_sgr_filter2_neon(filtered, dst, stride, left, lpf,
                           w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, filtered,
                                  w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
}
219
// Self-guided restoration, 3x3 (radius=1) box only: run the radius-1
// filter into a scratch buffer, then blend it into dst with weight w1.
static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // One int16 intermediate value per pixel, up to 64 rows of 384.
    ALIGN_STK_16(int16_t, filtered, 64 * 384,);
    dav1d_sgr_filter1_neon(filtered, dst, stride, left, lpf,
                           w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, filtered,
                                  w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
}
232
// Self-guided restoration, mixed mode: compute both the radius-2 and the
// radius-1 filter outputs over the same source rows, then blend the pair
// into dst with the (w0, w1) weights in a single weighted-sum pass.
static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
                                const pixel (*const left)[4], const pixel *lpf,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, filt5, 64 * 384,);
    ALIGN_STK_16(int16_t, filt3, 64 * 384,);
    dav1d_sgr_filter2_neon(filt5, dst, stride, left, lpf,
                           w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
    dav1d_sgr_filter1_neon(filt3, dst, stride, left, lpf,
                           w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
    const int16_t weights[2] = { params->sgr.w0, params->sgr.w1 };
    BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
                                  filt5, filt3, w, h,
                                  weights HIGHBD_TAIL_SUFFIX);
}
249
250 #else
// Rotate two parallel n-entry pointer rings by one: the front entry of
// each array moves to the back, the rest shift down one slot.
static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    int32_t *const first_sumsq = sumsq_ptrs[0];
    int16_t *const first_sum = sum_ptrs[0];
    for (int j = 1; j < n; j++) {
        sumsq_ptrs[j - 1] = sumsq_ptrs[j];
        sum_ptrs[j - 1] = sum_ptrs[j];
    }
    sumsq_ptrs[n - 1] = first_sumsq;
    sum_ptrs[n - 1] = first_sum;
}
// Rotate two parallel 5-entry pointer rings by two: entries [2..4] move
// to the front and the former [0..1] become the new [3..4].
static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
    int32_t *const sq0 = sumsq_ptrs[0], *const sq1 = sumsq_ptrs[1];
    int16_t *const sm0 = sum_ptrs[0], *const sm1 = sum_ptrs[1];
    for (int j = 2; j < 5; j++) {
        sumsq_ptrs[j - 2] = sumsq_ptrs[j];
        sum_ptrs[j - 2] = sum_ptrs[j];
    }
    sumsq_ptrs[3] = sq0;
    sumsq_ptrs[4] = sq1;
    sum_ptrs[3] = sm0;
    sum_ptrs[4] = sm1;
}
277
// Rotate a 3-entry A/B pointer ring by one position.
static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 3);
}
281
// Rotate a 2-entry A/B pointer ring by one position (i.e. swap).
static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 2);
}
285
// Rotate a 4-entry A/B pointer ring by one position.
static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 4);
}
289
290 void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
291 const pixel (*left)[4],
292 const pixel *src, const int w,
293 const enum LrEdgeFlags edges);
294 void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
295 const pixel (*left)[4],
296 const pixel *src, const int w,
297 const enum LrEdgeFlags edges);
298 void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
299 int32_t *sumsq5, int16_t *sum5,
300 const pixel (*left)[4],
301 const pixel *src, const int w,
302 const enum LrEdgeFlags edges);
303
304 void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
305 int32_t *AA, int16_t *BB,
306 const int w, const int s,
307 const int bitdepth_max);
308 void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
309 int32_t *AA, int16_t *BB,
310 const int w, const int s,
311 const int bitdepth_max);
312
313 void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
314 int32_t **A_ptrs, int16_t **B_ptrs,
315 const int w, const int w1
316 HIGHBD_DECL_SUFFIX);
317 void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
318 int32_t **A_ptrs, int16_t **B_ptrs,
319 const int w, const int h,
320 const int w1 HIGHBD_DECL_SUFFIX);
321
322 void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
323 const ptrdiff_t src_stride,
324 int32_t **A_ptrs,
325 int16_t **B_ptrs,
326 const int w, const int h);
327 void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
328 const ptrdiff_t src_stride,
329 int32_t **A_ptrs, int16_t **B_ptrs,
330 const int w, const int h);
331 void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
332 const pixel *src, const ptrdiff_t src_stride,
333 const int16_t *t1, const int16_t *t2,
334 const int w, const int h,
335 const int16_t wt[2] HIGHBD_DECL_SUFFIX);
336
// One vertical step of the streaming radius-1 filter: run the combined
// vertical-sum + calc_ab1 asm over the 3-row ring, then rotate the ring so
// the oldest row becomes the next row to be filled.
static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    // box3_v + calc_ab1
    dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate(sumsq, sum, 3);
}
344
// One vertical step of the streaming radius-2 filter: run the combined
// vertical-sum + calc_ab2 asm over the 5-row ring, then rotate the ring by
// two (the 5x5 path consumes two input rows per output step).
static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    // box5_v + calc_ab2
    dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate5_x2(sumsq, sum);
}
352
// Horizontal box3 sums for one new source row (written into ring slot [2]),
// immediately followed by the vertical step over the ring.
static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
                             int32_t *AA, int16_t *BB,
                             const pixel (*left)[4],
                             const pixel *src, const int w,
                             const int s,
                             const enum LrEdgeFlags edges,
                             const int bitdepth_max) {
    BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
    sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
}
363
364
// Emit one finished radius-1 output row: weighted blend into *dst, advance
// the destination pointer by one row, and rotate the 3-entry A/B rings.
static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs, const int w,
                             const int w1 HIGHBD_DECL_SUFFIX) {
    BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs,
                                         w, w1 HIGHBD_TAIL_SUFFIX);
    *dst += PXSTRIDE(stride);
    rotate_ab_3(A_ptrs, B_ptrs);
}
373
// Emit h (1 or 2 at the call sites) finished radius-2 output rows, then
// advance the destination by two rows and rotate the 2-entry A/B rings.
// (With h == 1 the callers return immediately afterwards, so the
// unconditional two-row advance is harmless.)
static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs,
                             const int w, const int h, const int w1
                             HIGHBD_DECL_SUFFIX) {
    BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
                                         w, h, w1 HIGHBD_TAIL_SUFFIX);
    *dst += 2*PXSTRIDE(stride);
    rotate_ab_2(A_ptrs, B_ptrs);
}
383
// Emit h (1 or 2) finished mixed-mode rows: produce the 5x5 and 3x3
// filtered rows into scratch buffers, blend both into *dst with the
// (w0, w1) weights, advance dst by h rows, and rotate both A/B rings.
static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
                                int32_t **A5_ptrs, int16_t **B5_ptrs,
                                int32_t **A3_ptrs, int16_t **B3_ptrs,
                                const int w, const int h,
                                const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
#define FILTER_OUT_STRIDE 384
    // Two rows of intermediate filter output per scratch buffer.
    ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);

    BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
                                             A5_ptrs, B5_ptrs, w, h);
    BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
                                             A3_ptrs, B3_ptrs, w, h);
    const int16_t wt[2] = { w0, w1 };
    BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride,
                                  tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
    *dst += h*PXSTRIDE(stride);
    // The 5x5 side produces one a/b row per two input rows (2-entry ring);
    // the 3x3 side produces one per input row (4-entry ring).
    rotate_ab_2(A5_ptrs, B5_ptrs);
    rotate_ab_4(A3_ptrs, B3_ptrs);
}
404
405
// Streaming 3x3 (radius=1) self-guided filter: instead of buffering the
// whole unit, process it row by row with a 3-entry ring of horizontal box
// sums (sumsq/sum) and a 3-entry ring of a/b output rows (A/B). The goto
// labels flush the pipeline when the input runs out early (vert_1/vert_2:
// one/two a/b stages still pending) by duplicating the last summed row as
// bottom padding.
static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
#define BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    int16_t *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
    int32_t *A_ptrs[3];
    int16_t *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    // Below-edge context rows are stored 6 rows into the lpf buffer.
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // Two real edge rows available: fill the ring with distinct rows.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        // Horizontal sums for the two above-edge rows...
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        // ...then the first two image rows, each producing one a/b row.
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;
    } else {
        // No top edge: duplicate the first image row as top padding by
        // pointing all three ring entries at the same storage.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        // Second image row gets its own storage.
        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // From the third row on, all three ring rows are distinct.
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    // Steady state: one new input row in, one finished output row out.
    do {
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two real below-edge rows: feed them through and emit the final rows.
    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // No bottom edge and two output rows pending: duplicate the last
    // summed row as padding, emit one row, then fall through for the last.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    // Emit the final pending output row, again padding with the last row.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Only one output row pending: run one padded vertical step to prime
    // the a/b ring, then emit via output_1.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_3(A_ptrs, B_ptrs);
    goto output_1;
}
553
// Streaming 5x5 (radius=2) self-guided filter: consumes two input rows per
// main-loop iteration, keeping five rows of horizontal box sums and two
// rows of a/b output in ring buffers. The goto labels flush the pipeline
// when the input runs out (vert_1/vert_2: early exit with one/two stages
// pending; odd: an odd number of rows remains) by duplicating the last
// summed row as bottom padding.
static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    int16_t *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A_ptrs[2];
    int16_t *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    // Below-edge context rows are stored 6 rows into the lpf buffer.
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // The topmost edge row is duplicated once (entries [0] and [1]).
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        // Horizontal sums for the two above-edge rows...
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        // ...and the first image row.
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        // Second image row, then the first vertical step / a/b row.
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        // No top edge: duplicate the first image row as top padding by
        // pointing all five ring entries at the same storage.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        // Second image row gets its own storage.
        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // Third and fourth image rows into fresh storage.
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // First full vertical step plus two finished output rows.
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    // Steady state: two new input rows in, two finished output rows out.
    do {
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two real below-edge rows from the loop-filter line buffer.
    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                   NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                   NULL, lpf_bottom, w, edges);

output_2:
    // Final vertical step and the last two output rows.
    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Odd number of remaining rows: pad with one copy of the last row,
    // emit two rows, then finish with a single padded row below.
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Only one input row made it in: pad, prime the a/b ring with one
    // vertical step, then emit the single pending row via output_1.
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A_ptrs, B_ptrs);

    goto output_1;
}
767
// Mixed self-guided restoration filter: runs the 5x5 and 3x3 SGR passes in
// lockstep over the same input and blends their outputs with the per-frame
// weights w0/w1. Horizontal box sums, vertical accumulation and the final
// weighted write-back are all done by NEON helpers; this function only
// manages the row ring buffers and the edge/tail special cases.
static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Ring buffers of per-row box sums for the 5x5 pass: sumsq = sums of
    // squared pixels, sum = plain sums. *_rows[] are the fixed storage rows;
    // *_ptrs[] is the rotating view consumed by the vertical helpers (entries
    // may alias the same storage row to implement edge padding).
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
    int16_t *sum5_ptrs[5], *sum5_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
    }
    // Same layout for the 3x3 pass, which only needs a 3-row window.
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
    int16_t *sum3_ptrs[3], *sum3_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
    }

    // Intermediate filter coefficients (A = weights, B = offsets) produced by
    // the vertical passes. The 5x5 pass emits one coefficient row per two
    // input rows, so two rows of history suffice; the 3x3 pass emits one per
    // input row and the final blend consumes a 4-row window.
    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A5_ptrs[2];
    int16_t *B5_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
    ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
    int32_t *A3_ptrs[4];
    int16_t *B3_ptrs[4];
    for (int i = 0; i < 4; i++) {
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    // Rows below the filtered unit live 6 strides into the lpf buffer
    // (dav1d's loop-filter line buffer layout — above rows first, then the
    // below rows; see the callers in lr_apply for the exact convention).
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // Prime the window with the two lpf rows above the unit; the topmost
        // lpf row is duplicated once for the 5-row window ([0] == [1]).
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[1];
        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[1];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[1];
        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[1];
        sum3_ptrs[2] = sum3_rows[2];

        // Horizontal sums for the two above-edge rows (no left column: NULL).
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        NULL, lpf, w, edges);

        // First row of the unit proper, with its left-edge pixels.
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // 3x3 vertical pass for the first row; the 5x5 pass waits until two
        // rows are available since it only outputs on every second row.
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_1;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    } else {
        // No rows above: pad by pointing every window entry at the first row.
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[0];
        sumsq5_ptrs[3] = sumsq5_rows[0];
        sumsq5_ptrs[4] = sumsq5_rows[0];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[0];
        sum5_ptrs[3] = sum5_rows[0];
        sum5_ptrs[4] = sum5_rows[0];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[0];
        sumsq3_ptrs[2] = sumsq3_rows[0];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[0];
        sum3_ptrs[2] = sum3_rows[0];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_1;

        // Un-alias the leading window slots one row at a time as real rows
        // become available.
        sumsq5_ptrs[4] = sumsq5_rows[1];
        sum5_ptrs[4] = sum5_rows[1];

        sumsq3_ptrs[2] = sumsq3_rows[1];
        sum3_ptrs[2] = sum3_rows[1];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[2] = sum3_rows[2];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        // First full pair of rows: run both vertical passes and emit two
        // blended output rows.
        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    }

    // Steady state: consume two input rows and emit two output rows per
    // iteration (the 3x3 pass runs once per row, the 5x5 once per pair).
    do {
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[3], sum5_ptrs[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[4], sum5_ptrs[4],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Feed the two below-edge rows from the loop-filter buffer (no left
    // column: NULL), then fall through to emit the final two rows.
    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[3], sum5_ptrs[3],
                                    NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[4], sum5_ptrs[4],
                                    NULL, lpf_bottom, w, edges);

output_2:
    // Emit the final pair of output rows from the current windows.
    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Input ended on an even row with no bottom edge available.
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    goto output_2;

odd:
    // Input ended on an odd row: pad once, emit a pair, then one extra row.
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);
    // Output only one row
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 1, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Only a single input row was available in total.
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    goto output_1;
}
1093
1094 #endif
1095
1096
loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext * const c,int bpc)1097 static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
1098 const unsigned flags = dav1d_get_cpu_flags();
1099
1100 if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
1101
1102 #if ARCH_AARCH64
1103 c->wiener[0] = BF(dav1d_wiener_filter7, neon);
1104 c->wiener[1] = BF(dav1d_wiener_filter5, neon);
1105 #else
1106 c->wiener[0] = c->wiener[1] = wiener_filter_neon;
1107 #endif
1108 if (BITDEPTH == 8 || bpc == 10) {
1109 c->sgr[0] = sgr_filter_5x5_neon;
1110 c->sgr[1] = sgr_filter_3x3_neon;
1111 c->sgr[2] = sgr_filter_mix_neon;
1112 }
1113 }
1114