/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
#define AOM_AV1_COMMON_X86_CFL_SIMD_H_

#include "av1/common/blockd.h"

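// The subsample kernels below fill the CfL prediction buffer with luma
// samples promoted to Q3 precision: 4:2:0 sums a 2x2 neighborhood and
// shifts left by 1, 4:2:2 sums a horizontal pair and shifts left by 2, and
// 4:4:4 shifts a single sample left by 3, so every path yields 8x the
// (averaged) luma value. A minimal scalar sketch of the 4:2:0 low-bit-depth
// case, mirroring the reference loop in av1/common/cfl.c; the example_ name
// and the local stride constant (assumed to match CFL_BUF_LINE) are
// illustrative only, not part of this header.
static inline void example_subsample_lbd_420(const uint8_t *input,
                                             int input_stride,
                                             uint16_t *output_q3, int width,
                                             int height) {
  const int buf_line = 32;  // assumed CFL_BUF_LINE (Q3 buffer row stride)
  for (int j = 0; j < height; j += 2) {
    for (int i = 0; i < width; i += 2) {
      const int bot = i + input_stride;
      // The 2x2 sum is 4x the mean (Q2); << 1 promotes it to Q3.
      output_q3[i >> 1] =
          (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;
    }
    input += input_stride << 1;
    output_q3 += buf_line;
  }
}
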
// SSSE3 version is optimal for width == 4; we reuse it in AVX2
void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);

// SSSE3 version is optimal for width == 8; we reuse it in AVX2
void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);

// SSSE3 version is optimal for width == 16; we reuse it in AVX2
void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);

// SSSE3 version is optimal for width == 4; we reuse it in AVX2
void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);

// SSSE3 version is optimal for width == 8; we reuse it in AVX2
void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);

// SSSE3 version is optimal for width == 16; we reuse it in AVX2
void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);

// SSSE3 version is optimal for width == 4; we reuse it in AVX2
void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);

// SSSE3 version is optimal for width == 8; we reuse it in AVX2
void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);

// SSSE3 version is optimal for width == 16; we reuse it in AVX2
void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);

#if CONFIG_AV1_HIGHBITDEPTH
// SSSE3 version is optimal for width == 4; we reuse it in AVX2
void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);

// SSSE3 version is optimal for width == 8; we reuse it in AVX2
void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);

// SSSE3 version is faster for width == 16; we reuse it in AVX2
void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);

// SSSE3 version is optimal for width == 4; we reuse it in AVX2
void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);

// SSSE3 version is optimal for width == 8; we reuse it in AVX2
void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);

// SSSE3 version is faster for width == 16; we reuse it in AVX2
void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);

// SSSE3 version is optimal for width == 4; we reuse it in AVX2
void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);

// SSSE3 version is optimal for width == 8; we reuse it in AVX2
void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
                                     uint16_t *output_q3);
void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);

// SSSE3 version is faster for width == 16; we reuse it in AVX2
void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type,
                                      int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type,
                                       int input_stride, uint16_t *output_q3);
#endif  // CONFIG_AV1_HIGHBITDEPTH

// SSE2 version is optimal for width == 4; we reuse it in AVX2
void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);

// SSE2 version is optimal for width == 8; we reuse it in AVX2
void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
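
// The subtract-average kernels turn the Q3 buffer into zero-mean AC
// contributions: the block average is computed once and subtracted from
// every sample. A scalar sketch under the same caveats as above (the
// example_ name is illustrative; the rounded mean matches the reference
// cfl_subtract_average_c for the power-of-two block sizes used here).
static inline void example_subtract_average(const uint16_t *src, int16_t *dst,
                                            int width, int height) {
  const int buf_line = 32;  // assumed CFL_BUF_LINE
  const int num_pel = width * height;
  int sum_q3 = 0;
  const uint16_t *row = src;
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) sum_q3 += row[i];
    row += buf_line;
  }
  const int avg_q3 = (sum_q3 + (num_pel >> 1)) / num_pel;  // rounded mean
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) dst[i] = (int16_t)(src[i] - avg_q3);
    src += buf_line;
    dst += buf_line;
  }
}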

void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                               int dst_stride, int alpha_q3);
void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                               int dst_stride, int alpha_q3);
void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                                int dst_stride, int alpha_q3);

void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                               int dst_stride, int alpha_q3);
void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                               int dst_stride, int alpha_q3);
void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                                int dst_stride, int alpha_q3);
void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                                int dst_stride, int alpha_q3);

void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                                int dst_stride, int alpha_q3);
void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                                int dst_stride, int alpha_q3);
void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                                 int dst_stride, int alpha_q3);
void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                                 int dst_stride, int alpha_q3);
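
// The predict kernels scale each AC sample by alpha_q3 (the Q3 scaling
// factor signaled per block), round the Q6 product back to Q0, and add the
// result to the DC prediction already stored in dst. A scalar sketch of the
// low-bit-depth case (illustrative example_ name; the rounding mirrors the
// signed round-half-away behavior of the reference implementation).
static inline void example_predict_lbd(const int16_t *pred_buf_q3,
                                       uint8_t *dst, int dst_stride,
                                       int alpha_q3, int width, int height) {
  const int buf_line = 32;  // assumed CFL_BUF_LINE
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) {
      const int scaled_q6 = alpha_q3 * pred_buf_q3[i];  // Q3 * Q3 = Q6
      const int scaled_q0 = (scaled_q6 < 0) ? -((-scaled_q6 + 32) >> 6)
                                            : ((scaled_q6 + 32) >> 6);
      const int px = dst[i] + scaled_q0;  // dst holds the DC prediction
      dst[i] = (uint8_t)(px < 0 ? 0 : (px > 255 ? 255 : px));
    }
    dst += dst_stride;
    pred_buf_q3 += buf_line;
  }
}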

#if CONFIG_AV1_HIGHBITDEPTH
void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                               int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                               int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                int dst_stride, int alpha_q3, int bd);

void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                               int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                               int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                int dst_stride, int alpha_q3, int bd);

void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                 int dst_stride, int alpha_q3, int bd);
void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                 int dst_stride, int alpha_q3, int bd);
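
// The high-bit-depth predict kernels differ from the 8-bit ones only in the
// output type and the final clamp, which saturates to the bd-bit range
// [0, (1 << bd) - 1]. A sketch of just that step (illustrative example_
// name):
static inline uint16_t example_clip_pixel_hbd(int val, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
}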
#endif  // CONFIG_AV1_HIGHBITDEPTH
#endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_