xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/convolve.h (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 #ifndef VPX_VPX_DSP_X86_CONVOLVE_H_
11 #define VPX_VPX_DSP_X86_CONVOLVE_H_
12 
13 #include <assert.h>
14 
15 #include "./vpx_config.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_ports/compiler_attributes.h"
18 
// TODO([email protected]): Refactor the code here. Currently this is pretty
// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
// HIGH_FUN_CONV_2D to avoid a seg fault, because the C function assumes the
// filter is always 8-tap.
// Function type of the 8-bit 1-D filter kernels. The parameter order matches
// the calls issued by FUN_CONV_1D: source pointer, source stride, destination
// pointer, destination stride, row count and the filter taps.
typedef void filter8_1dfunction(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                uint32_t height, const int16_t *filter);
26 
27 // TODO([email protected]): Remove the is_avg argument to the MACROS once we
28 // have 4-tap vert avg filter.
// Defines vpx_convolve8_<name>_<opt>(), a 1-D convolution entry point that
// picks a kernel from the non-zero taps of filter[offset]:
//   taps 0, 1, 6 or 7 non-zero      -> vpx_filter_block1d*_<dir>8_<avg><opt>
//   otherwise tap 2 or 5 non-zero   -> vpx_filter_block1d*_<dir>4_<avg><opt>
//   otherwise                       -> vpx_filter_block1d*_<dir>2_<avg><opt>
// The block is processed in 16-wide columns; a remaining width of exactly 8
// or 4 is handled by the 8- or 4-wide kernel. Any other remainder is not
// processed.
//
// Macro arguments:
//   name       pasted into the generated function name.
//   offset     index into the filter table; evaluated inside the function.
//   step_q4    expression asserted to equal 16.
//   dir, avg   pasted into the kernel names.
//   src_start  expression passed as the kernel source pointer. It is evaluated
//              inside the generated function on every kernel call, so it sees
//              the current value of src. num_taps is declared in each branch
//              but not otherwise used by the body, presumably so that
//              src_start can refer to it.
//   opt        suffix pasted onto the function and kernel names.
//   is_avg     when non-zero, the 4-tap branch sets num_taps to 8 instead
//              of 4.
//
// The generated function asserts filter[offset][3] != 128 and step_q4 == 16;
// x0_q4, x_step_q4, y0_q4 and y_step_q4 are cast to void so that whichever of
// them is not referenced through offset/step_q4 does not trigger an
// unused-parameter warning.
#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
  void vpx_convolve8_##name##_##opt(                                         \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
    const int16_t *filter_row = filter[offset];                              \
    (void)x0_q4;                                                             \
    (void)x_step_q4;                                                         \
    (void)y0_q4;                                                             \
    (void)y_step_q4;                                                         \
    assert(filter_row[3] != 128);                                            \
    assert((step_q4) == 16);                                                 \
    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
      const int num_taps = 8;                                                \
      while (w >= 16) {                                                      \
        vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                 dst_stride, h, filter_row); \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      if (w == 8) {                                                          \
        vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter_row);  \
      } else if (w == 4) {                                                   \
        vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter_row);  \
      }                                                                      \
      (void)num_taps;                                                        \
    } else if (filter_row[2] | filter_row[5]) {                              \
      const int num_taps = (is_avg) ? 8 : 4;                                 \
      while (w >= 16) {                                                      \
        vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                 dst_stride, h, filter_row); \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      if (w == 8) {                                                          \
        vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter_row);  \
      } else if (w == 4) {                                                   \
        vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter_row);  \
      }                                                                      \
      (void)num_taps;                                                        \
    } else {                                                                 \
      const int num_taps = 2;                                                \
      while (w >= 16) {                                                      \
        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                                 dst_stride, h, filter_row); \
        src += 16;                                                           \
        dst += 16;                                                           \
        w -= 16;                                                             \
      }                                                                      \
      if (w == 8) {                                                          \
        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter_row);  \
      } else if (w == 4) {                                                   \
        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
                                                dst_stride, h, filter_row);  \
      }                                                                      \
      (void)num_taps;                                                        \
    }                                                                        \
  }
94 
// Defines vpx_convolve8_<avg><opt>(), a separable 2-D convolution built from
// vpx_convolve8_horiz_<opt>() followed by vpx_convolve8_<avg>vert_<opt>().
//
// The horizontal pass writes h + num_taps - 1 rows into a scratch buffer with
// a stride of 64, starting num_taps / 2 - 1 rows above src. The vertical pass
// then starts from the scratch row that corresponds to src. num_taps is chosen
// from filter[x0_q4] only:
//   8                  if any of taps 0, 1, 6, 7 is non-zero;
//   (is_avg) ? 8 : 4   otherwise, if tap 2 or 5 is non-zero;
//   2                  otherwise.
// The scratch buffer holds 71 rows, which covers the largest asserted height
// (64) with 8 taps (64 + 7 rows).
//
// The function asserts that neither selected filter row has tap 3 equal to
// 128, that w and h are at most 64, and that both steps equal 16.
//
// NOTE(review): the intermediate window is sized from filter_x alone;
// presumably filter_y is expected to need no more rows than filter_x does --
// confirm against the filter tables used by the callers.
#define FUN_CONV_2D(avg, opt, is_avg)                                          \
  void vpx_convolve8_##avg##opt(                                               \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {                 \
    const int16_t *filter_x = filter[x0_q4];                                   \
    const int16_t *filter_y = filter[y0_q4];                                   \
    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);           \
    int num_taps;                                                              \
    (void)filter_y;                                                            \
    assert(filter_x[3] != 128);                                                \
    assert(filter_y[3] != 128);                                                \
    assert(w <= 64);                                                           \
    assert(h <= 64);                                                           \
    assert(x_step_q4 == 16);                                                   \
    assert(y_step_q4 == 16);                                                   \
    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) {               \
      num_taps = 8;                                                            \
    } else if (filter_x[2] | filter_x[5]) {                                    \
      num_taps = (is_avg) ? 8 : 4;                                             \
    } else {                                                                   \
      num_taps = 2;                                                            \
    }                                                                          \
    vpx_convolve8_horiz_##opt(                                                 \
        src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, filter, \
        x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1);              \
    vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, dst, \
                                    dst_stride, filter, x0_q4, x_step_q4,      \
                                    y0_q4, y_step_q4, w, h);                   \
  }
135 
136 #if CONFIG_VP9_HIGHBITDEPTH
137 
// Function type of the high-bitdepth 1-D filter kernels. The parameter order
// matches the calls issued by HIGH_FUN_CONV_1D: source pointer, source stride,
// destination pointer, destination stride, row count, filter taps and bd.
// The by-value src_pitch carries no top-level const, matching
// filter8_1dfunction; such a qualifier does not change the function type.
typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                       ptrdiff_t src_pitch,
                                       uint16_t *output_ptr,
                                       ptrdiff_t out_pitch,
                                       unsigned int output_height,
                                       const int16_t *filter, int bd);
144 
// Defines vpx_highbd_convolve8_<name>_<opt>(), the high-bitdepth counterpart of
// the 1-D convolution entry point.
//
// When step_q4 == 16 and tap 3 of filter_kernel[offset] is not 128, a kernel
// is picked from the non-zero taps of that row:
//   taps 0, 1, 6 or 7 non-zero      -> ..._filter_block1d*_<dir>8_<avg><opt>
//   otherwise tap 2 or 5 non-zero   -> ..._filter_block1d*_<dir>4_<avg><opt>
//   otherwise                       -> ..._filter_block1d*_<dir>2_<avg><opt>
// and the block is consumed in columns of 16, then 8, then 4, advancing src,
// dst and w after each call. Whatever width is left afterwards (all of it if
// the fast path was skipped) is handed to vpx_highbd_convolve8_<name>_c().
// Note that the C fallback receives src, not src_start.
//
// Macro arguments:
//   name       pasted into the generated and fallback function names.
//   offset     index into the filter table; evaluated inside the function.
//   step_q4    expression compared against 16 to enable the fast path.
//   dir, avg   pasted into the kernel names.
//   src_start  expression passed as the kernel source pointer. It is evaluated
//              inside the generated function on every kernel call, so it sees
//              the current value of src. num_taps is declared in each branch
//              but not otherwise used by the body, presumably so that
//              src_start can refer to it.
//   opt        suffix pasted onto the function and kernel names.
//   is_avg     when non-zero, the 4-tap branch sets num_taps to 8 instead
//              of 4.
#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt,     \
                         is_avg)                                              \
  void vpx_highbd_convolve8_##name##_##opt(                                   \
      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
    const int16_t *filter_row = filter_kernel[offset];                        \
    if ((step_q4) == 16 && filter_row[3] != 128) {                            \
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
        const int num_taps = 8;                                               \
        while (w >= 16) {                                                     \
          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 16;                                                          \
          dst += 16;                                                          \
          w -= 16;                                                            \
        }                                                                     \
        while (w >= 8) {                                                      \
          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 8;                                                           \
          dst += 8;                                                           \
          w -= 8;                                                             \
        }                                                                     \
        while (w >= 4) {                                                      \
          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 4;                                                           \
          dst += 4;                                                           \
          w -= 4;                                                             \
        }                                                                     \
        (void)num_taps;                                                       \
      } else if (filter_row[2] | filter_row[5]) {                             \
        const int num_taps = (is_avg) ? 8 : 4;                                \
        while (w >= 16) {                                                     \
          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 16;                                                          \
          dst += 16;                                                          \
          w -= 16;                                                            \
        }                                                                     \
        while (w >= 8) {                                                      \
          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 8;                                                           \
          dst += 8;                                                           \
          w -= 8;                                                             \
        }                                                                     \
        while (w >= 4) {                                                      \
          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 4;                                                           \
          dst += 4;                                                           \
          w -= 4;                                                             \
        }                                                                     \
        (void)num_taps;                                                       \
      } else {                                                                \
        const int num_taps = 2;                                               \
        while (w >= 16) {                                                     \
          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 16;                                                          \
          dst += 16;                                                          \
          w -= 16;                                                            \
        }                                                                     \
        while (w >= 8) {                                                      \
          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 8;                                                           \
          dst += 8;                                                           \
          w -= 8;                                                             \
        }                                                                     \
        while (w >= 4) {                                                      \
          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
          src += 4;                                                           \
          dst += 4;                                                           \
          w -= 4;                                                             \
        }                                                                     \
        (void)num_taps;                                                       \
      }                                                                       \
    }                                                                         \
    if (w) {                                                                  \
      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
                                      y_step_q4, w, h, bd);                   \
    }                                                                         \
  }
233 
// Defines vpx_highbd_convolve8_<avg><opt>(), the high-bitdepth separable 2-D
// convolution.
//
// When both x_step_q4 and y_step_q4 equal 16, the work is split into
// vpx_highbd_convolve8_horiz_<opt>() followed by
// vpx_highbd_convolve8_<avg>vert_<opt>(). The horizontal pass writes
// h + num_taps - 1 rows into a scratch buffer with a stride of 64, starting
// num_taps / 2 - 1 rows above src; the vertical pass starts from the scratch
// row that corresponds to src. num_taps is chosen from filter[x0_q4] only:
//   8                  if any of taps 0, 1, 6, 7 is non-zero, or tap 3 is 128;
//   (is_avg) ? 8 : 4   otherwise, if tap 2 or 5 is non-zero;
//   2                  otherwise.
// The scratch buffer holds 71 rows, which covers the largest asserted height
// (64) with 8 taps (64 + 7 rows). For any other step the call is forwarded
// unchanged to vpx_highbd_convolve8_<avg>c().
//
// NOTE(review): the intermediate window is sized from filter_x alone and
// filter[y0_q4] is not inspected here; presumably the vertical pass never
// needs more rows than filter_x implies -- confirm against the callers and
// the vertical pass's own fallback path.
#define HIGH_FUN_CONV_2D(avg, opt, is_avg)                                     \
  void vpx_highbd_convolve8_##avg##opt(                                        \
      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,                \
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {         \
    const int16_t *filter_x = filter[x0_q4];                                   \
    assert(w <= 64);                                                           \
    assert(h <= 64);                                                           \
    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \
      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);        \
      int num_taps;                                                            \
      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) ||           \
          filter_x[3] == 128) {                                                \
        num_taps = 8;                                                          \
      } else if (filter_x[2] | filter_x[5]) {                                  \
        num_taps = (is_avg) ? 8 : 4;                                           \
      } else {                                                                 \
        num_taps = 2;                                                          \
      }                                                                        \
      vpx_highbd_convolve8_horiz_##opt(                                        \
          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,       \
          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1,     \
          bd);                                                                 \
      vpx_highbd_convolve8_##avg##vert_##opt(                                  \
          fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter,       \
          x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);                       \
    } else {                                                                   \
      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter,  \
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,  \
                                    bd);                                       \
    }                                                                          \
  }
277 
278 #endif  // CONFIG_VP9_HIGHBITDEPTH
279 #endif  // VPX_VPX_DSP_X86_CONVOLVE_H_
280