xref: /aosp_15_r20/external/libdav1d/src/arm/ipred.h (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1 /*
2  * Copyright © 2018, VideoLAN and dav1d authors
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *    list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  *    this list of conditions and the following disclaimer in the documentation
13  *    and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "src/cpu.h"
28 #include "src/ipred.h"
29 
30 decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
31 decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
32 decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
33 decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
34 decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
35 decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
36 decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
37 decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
38 decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
39 decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
40 decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
41 
42 decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
43 decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
44 decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
45 decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
46 
47 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
48 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
49 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
50 
51 decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
52 
53 #if ARCH_AARCH64
54 void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz,
55                                             const pixel *const in,
56                                             const int end HIGHBD_DECL_SUFFIX);
57 void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
58                                           const pixel *const in,
59                                           const int end, const int strength);
60 void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
61                                      const int n);
62 void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
63                                     const pixel *const top, const int width,
64                                     const int height, const int dx,
65                                     const int max_base_x);
66 void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride,
67                                     const pixel *const top, const int width,
68                                     const int height, const int dx,
69                                     const int max_base_x);
70 
/*
 * Zone-1 directional intra prediction.
 *
 * Prepares the top edge in a local buffer (upsampled, filtered or copied
 * verbatim), pads it past its last valid pixel, and hands it to one of the
 * two NEON fill routines.
 *
 * `angle` is a packed value: bits 0-8 hold the prediction angle, bit 9 the
 * "smooth" flag and bits 10 and up the intra-edge-filter enable.
 * `max_width` and `max_height` are not used by this function.
 */
static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
                          const pixel *const topleft_in,
                          const int width, const int height, int angle,
                          const int max_width, const int max_height
                          HIGHBD_DECL_SUFFIX)
{
    /* Unpack the flags before masking the angle down to its low 9 bits. */
    const int edge_filter_on = angle >> 10;
    const int smooth = (angle >> 9) & 0x1;
    angle &= 511;

    const int edge_len = width + height;
    const int src_end = width + imin(width, height);
    int step_x = dav1d_dr_intra_derivative[angle >> 1];
    pixel top_out[64 + 64 + (64+15)*2 + 16];
    int last_x;

    int upsample_above = 0;
    if (edge_filter_on)
        upsample_above = get_upsample(edge_len, 90 - angle, smooth);

    /* The filter strength is only looked up when the edge is not upsampled. */
    int strength = 0;
    if (!upsample_above && edge_filter_on)
        strength = get_filter_strength(edge_len, 90 - angle, smooth);

    if (upsample_above) {
        BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, edge_len,
                                               topleft_in, src_end
                                               HIGHBD_TAIL_SUFFIX);
        last_x = 2 * edge_len - 2;
        step_x <<= 1;
    } else if (strength) {
        BF(dav1d_ipred_z1_filter_edge, neon)(top_out, edge_len,
                                             topleft_in, src_end,
                                             strength);
        last_x = edge_len - 1;
    } else {
        /* No processing: use the pixels right of the top-left corner as is. */
        last_x = src_end - 1;
        memcpy(top_out, &topleft_in[1], (last_x + 1) * sizeof(pixel));
    }

    /* Replicate the last valid pixel beyond the end of the edge. */
    const int pad_pixels = width + 15; // max(dx >> 6) == 15
    BF(dav1d_ipred_pixel_set, neon)(&top_out[last_x + 1], top_out[last_x],
                                    pad_pixels * (1 + upsample_above));

    if (!upsample_above)
        BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height,
                                       step_x, last_x);
    else
        BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
                                       step_x, last_x);
}
117 
118 void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src,
119                                    const int n);
120 
121 void BF(dav1d_ipred_z2_upsample_edge, neon)(pixel *out, const int sz,
122                                             const pixel *const in
123                                             HIGHBD_DECL_SUFFIX);
124 
125 void BF(dav1d_ipred_z2_fill1, neon)(pixel *dst, ptrdiff_t stride,
126                                     const pixel *const top,
127                                     const pixel *const left,
128                                     const int width, const int height,
129                                     const int dx, const int dy);
130 void BF(dav1d_ipred_z2_fill2, neon)(pixel *dst, ptrdiff_t stride,
131                                     const pixel *const top,
132                                     const pixel *const left,
133                                     const int width, const int height,
134                                     const int dx, const int dy);
135 void BF(dav1d_ipred_z2_fill3, neon)(pixel *dst, ptrdiff_t stride,
136                                     const pixel *const top,
137                                     const pixel *const left,
138                                     const int width, const int height,
139                                     const int dx, const int dy);
140 
/*
 * Zone-2 directional intra prediction (90 < angle < 180).
 *
 * Builds processed copies of both the top and the left edge in one stack
 * buffer, stores the top-left pixel at index 0 of each, and dispatches to
 * one of three NEON fill routines depending on which edge, if any, was
 * upsampled.
 *
 * `angle` is a packed value: bits 0-8 hold the prediction angle, bit 9 the
 * "smooth" flag and bits 10 and up the intra-edge-filter enable.
 */
static void ipred_z2_neon(pixel *dst, const ptrdiff_t stride,
                          const pixel *const topleft_in,
                          const int width, const int height, int angle,
                          const int max_width, const int max_height
                          HIGHBD_DECL_SUFFIX)
{
    /* Unpack the flags before masking the angle down to its low 9 bits. */
    const int edge_filter_on = angle >> 10;
    const int smooth = (angle >> 9) & 0x1;
    angle &= 511;
    assert(angle > 90 && angle < 180);

    const int edge_len = width + height;
    int step_y = dav1d_dr_intra_derivative[(angle - 90) >> 1];
    int step_x = dav1d_dr_intra_derivative[(180 - angle) >> 1];

    int upsample_left = 0, upsample_above = 0;
    if (edge_filter_on) {
        upsample_left  = get_upsample(edge_len, 180 - angle, smooth);
        upsample_above = get_upsample(edge_len, angle - 90, smooth);
    }

    /*
     * One buffer, three equally sized regions: flipped | top | left.
     * The asm can underread below the start of top[] and left[]; to avoid
     * surprising behaviour, make sure this is within the allocated stack
     * space.
     */
    pixel buf[3*(64+1)];
    pixel *const flipped = &buf[0*(64+1)];
    pixel *const top     = &buf[1*(64+1)];
    pixel *const left    = &buf[2*(64+1)];

    /* ---- Top edge ---- */
    int top_strength = 0;
    if (!upsample_above && edge_filter_on)
        top_strength = get_filter_strength(edge_len, angle - 90, smooth);

    if (upsample_above) {
        BF(dav1d_ipred_z2_upsample_edge, neon)(top, width, topleft_in
                                               HIGHBD_TAIL_SUFFIX);
        step_x <<= 1;
    } else if (top_strength) {
        BF(dav1d_ipred_z1_filter_edge, neon)(&top[1], imin(max_width, width),
                                             topleft_in, width,
                                             top_strength);
        /* Pixels at or beyond max_width are copied through unfiltered. */
        if (max_width < width)
            memcpy(&top[1 + max_width], &topleft_in[1 + max_width],
                   (width - max_width) * sizeof(pixel));
    } else {
        pixel_copy(&top[1], &topleft_in[1], width);
    }

    /* ---- Left edge ---- */
    int left_strength = 0;
    if (!upsample_left && edge_filter_on)
        left_strength = get_filter_strength(edge_len, 180 - angle, smooth);

    /* Upsampling and filtering both work on a flipped copy of the left
     * edge with the top-left pixel placed in front of it. */
    if (upsample_left || left_strength) {
        flipped[0] = topleft_in[0];
        BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
                                      height);
    }

    if (upsample_left) {
        BF(dav1d_ipred_z2_upsample_edge, neon)(left, height, flipped
                                               HIGHBD_TAIL_SUFFIX);
        step_y <<= 1;
    } else if (left_strength) {
        BF(dav1d_ipred_z1_filter_edge, neon)(&left[1], imin(max_height, height),
                                             flipped, height,
                                             left_strength);
        /* Pixels at or beyond max_height are copied through unfiltered. */
        if (max_height < height)
            memcpy(&left[1 + max_height], &flipped[1 + max_height],
                   (height - max_height) * sizeof(pixel));
    } else {
        BF(dav1d_ipred_reverse, neon)(&left[1], &topleft_in[0],
                                      height);
    }

    /* Both edges start with the top-left corner pixel. */
    top[0] = left[0] = *topleft_in;

    assert(!(upsample_above && upsample_left));
    if (upsample_above) {
        BF(dav1d_ipred_z2_fill2, neon)(dst, stride, top, left, width, height,
                                       step_x, step_y);
    } else if (upsample_left) {
        BF(dav1d_ipred_z2_fill3, neon)(dst, stride, top, left, width, height,
                                       step_x, step_y);
    } else {
        BF(dav1d_ipred_z2_fill1, neon)(dst, stride, top, left, width, height,
                                       step_x, step_y);
    }
}
223 
224 void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride,
225                                     const pixel *const left, const int width,
226                                     const int height, const int dy,
227                                     const int max_base_y);
228 void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride,
229                                     const pixel *const left, const int width,
230                                     const int height, const int dy,
231                                     const int max_base_y);
232 
/*
 * Zone-3 directional intra prediction (angle > 180).
 *
 * Prepares the left edge in a local buffer (upsampled, filtered or merely
 * reversed), pads it past its last valid pixel, and hands it to one of the
 * two NEON fill routines.
 *
 * `angle` is a packed value: bits 0-8 hold the prediction angle, bit 9 the
 * "smooth" flag and bits 10 and up the intra-edge-filter enable.
 * `max_width` and `max_height` are not used by this function.
 */
static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
                          const pixel *const topleft_in,
                          const int width, const int height, int angle,
                          const int max_width, const int max_height
                          HIGHBD_DECL_SUFFIX)
{
    /* Unpack the flags before masking the angle down to its low 9 bits. */
    const int edge_filter_on = angle >> 10;
    const int smooth = (angle >> 9) & 0x1;
    angle &= 511;
    assert(angle > 180);

    const int edge_len = width + height;
    const int src_end = height + imin(width, height);
    int step_y = dav1d_dr_intra_derivative[(270 - angle) >> 1];
    pixel flipped[64 + 64 + 16];
    pixel left_out[64 + 64 + (64+15)*2];
    int last_y;

    int upsample_left = 0;
    if (edge_filter_on)
        upsample_left = get_upsample(edge_len, angle - 180, smooth);

    /* The filter strength is only looked up when the edge is not upsampled. */
    int strength = 0;
    if (!upsample_left && edge_filter_on)
        strength = get_filter_strength(edge_len, angle - 180, smooth);

    /* Upsampling and filtering both work on a flipped copy of the left
     * edge with the top-left pixel placed in front of it. */
    if (upsample_left || strength) {
        flipped[0] = topleft_in[0];
        BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
                                      height + imax(width, height));
    }

    if (upsample_left) {
        BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, edge_len,
                                               flipped, src_end
                                               HIGHBD_TAIL_SUFFIX);
        last_y = 2 * edge_len - 2;
        step_y <<= 1;
    } else if (strength) {
        BF(dav1d_ipred_z1_filter_edge, neon)(left_out, edge_len,
                                             flipped, src_end,
                                             strength);
        last_y = edge_len - 1;
    } else {
        /* No processing: reverse the left edge straight into left_out. */
        BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0], src_end);
        last_y = src_end - 1;
    }

    // The tbx based implementation needs left[] to have 64 bytes initialized,
    // the other implementation can read height + max(dy >> 6) past the end.
    const int pad_pixels = imax(64 - last_y - 1, height + 15);

    /* Replicate the last valid pixel beyond the end of the edge. */
    BF(dav1d_ipred_pixel_set, neon)(&left_out[last_y + 1], left_out[last_y],
                                    pad_pixels * (1 + upsample_left));

    if (!upsample_left)
        BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height,
                                       step_y, last_y);
    else
        BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
                                       step_y, last_y);
}
292 #endif
293 
intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext * const c)294 static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
295     const unsigned flags = dav1d_get_cpu_flags();
296 
297     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
298 
299     c->intra_pred[DC_PRED]       = BF(dav1d_ipred_dc, neon);
300     c->intra_pred[DC_128_PRED]   = BF(dav1d_ipred_dc_128, neon);
301     c->intra_pred[TOP_DC_PRED]   = BF(dav1d_ipred_dc_top, neon);
302     c->intra_pred[LEFT_DC_PRED]  = BF(dav1d_ipred_dc_left, neon);
303     c->intra_pred[HOR_PRED]      = BF(dav1d_ipred_h, neon);
304     c->intra_pred[VERT_PRED]     = BF(dav1d_ipred_v, neon);
305     c->intra_pred[PAETH_PRED]    = BF(dav1d_ipred_paeth, neon);
306     c->intra_pred[SMOOTH_PRED]   = BF(dav1d_ipred_smooth, neon);
307     c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
308     c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
309 #if ARCH_AARCH64
310     c->intra_pred[Z1_PRED]       = ipred_z1_neon;
311     c->intra_pred[Z2_PRED]       = ipred_z2_neon;
312     c->intra_pred[Z3_PRED]       = ipred_z3_neon;
313 #endif
314     c->intra_pred[FILTER_PRED]   = BF(dav1d_ipred_filter, neon);
315 
316     c->cfl_pred[DC_PRED]         = BF(dav1d_ipred_cfl, neon);
317     c->cfl_pred[DC_128_PRED]     = BF(dav1d_ipred_cfl_128, neon);
318     c->cfl_pred[TOP_DC_PRED]     = BF(dav1d_ipred_cfl_top, neon);
319     c->cfl_pred[LEFT_DC_PRED]    = BF(dav1d_ipred_cfl_left, neon);
320 
321     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
322     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
323     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
324 
325     c->pal_pred                  = BF(dav1d_pal_pred, neon);
326 }
327