/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#if CONFIG_SVT_AV1
#include "third_party/SVT-AV1/convolve_avx2.h"
#endif

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"

static inline void av1_convolve_y_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
  // right shift is F-1 because we are already dividing
  // filter coefficients by 2
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
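  // Worked example: FILTER_BITS is 7 in AV1 and the filter taps sum to 128,
  // so with pre-halved coefficients the weights sum to 64, the effective
  // shift is 6, and the rounding bias is (1 << 6) >> 1 = 32
  // (round-to-nearest).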

  __m256i coeffs[6], s[12];
  __m128i d[10];

  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }
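  // Note: the 4- and 8-tap cases share prepare_coeffs_lowbd(); for a 4-tap
  // filter the outermost coefficient pair is zero, which is presumably why
  // the 4-tap kernels below index the packed table at coeffs + 1.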

  // vert_filt as 4 tap
  if (vert_tap == 4) {
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
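      // Each s[k] interleaves two adjacent rows byte-wise so the lowbd
      // kernel can multiply a pixel pair from consecutive rows against one
      // packed coefficient pair with _mm256_maddubs_epi16.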

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
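        // Slide the interleaved-row window down two rows so that only the
        // two newest row pairs have to be loaded on the next iteration.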
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
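    // The 12-tap path keeps full precision: pixels are widened to 16 bits,
    // products accumulate in 32 bits, and rounding uses the full FILTER_BITS
    // since the coefficients are not pre-halved here.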

    for (int j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;

      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
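      // s[0..4] / s[6..10] hold 16-bit pixels from adjacent rows interleaved
      // pairwise for _mm256_madd_epi16 inside convolve_12taps; s[5] and
      // s[11] are filled in per iteration below.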

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
            *(int *)&dst[i * dst_stride + j + dst_stride] =
                _mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_y,
                            const int32_t subpel_y_qn) {
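  // With CONFIG_SVT_AV1, the imported SVT-AV1 kernels handle the 4-, 6- and
  // 8-tap filters; only the 12-tap filter takes the general path above.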
#if CONFIG_SVT_AV1
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 12) {
    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn);
  } else {
    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_y, subpel_y_qn);
  }
#else
  av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_y, subpel_y_qn);
#endif
}

static inline void av1_convolve_x_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m256i round_0_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
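  // Two-stage rounding: the first shift is (round_0 - 1) because the
  // coefficients are pre-halved, the second is bits = FILTER_BITS - round_0,
  // so the combined shift is FILTER_BITS - 1, matching the vertical path.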
  int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

  __m256i coeffs[6], filt[4];
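  // filt_global_avx2 (from convolve_avx2.h) supplies the byte-shuffle masks
  // that gather sliding pixel pairs for _mm256_maddubs_epi16; masks 2 and 3
  // are loaded later, only by the 6- and 8-tap paths that need them.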
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));

  if (horiz_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
  else if (horiz_tap == 12) {
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
  }

  // horz_filt as 4 tap
  if (horiz_tap == 4) {
    const int fo_horiz = 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
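          // 216 == 0xD8 reorders the 64-bit lanes as 0,2,1,3, gathering the
          // two useful quadwords (pixels 0..7 and 8..15) into the low 128
          // bits before the store.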
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 6) {
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 12) {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    const __m256i v_zero = _mm256_setzero_si256();
    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    round_const = _mm256_set1_epi32((1 << bits) >> 1);
    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
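    // The 12-tap coefficients are not pre-halved, so the first-stage shift
    // is the full round_0 and all intermediate math is 32-bit.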
    __m256i s[6];

    if (w <= 4) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);
        // row0 0..7 row1 0..7
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
        // row0 8..F row1 8..F
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

        // 00 01 02 03 10 11 12 13
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
                                      round_shift);
        // 8 bit conversion and saturation to uint8
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
        if (w > 2) {
          // 00 01 02 03
          *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
          // 10 11 12 13
          *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
        } else {
          // 00 01
          *(uint16_t *)&dst[i * dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_0);
          // 10 11
          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; i++) {
        for (int j = 0; j < w; j += 8) {
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // row0 0..7 4..B
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          // row0 8..F C..13
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

          res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
          *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
        }
      }
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  }
}

void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_x,
                            const int32_t subpel_x_qn,
                            ConvolveParams *conv_params) {
#if CONFIG_SVT_AV1
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn);

  if (horz_tap == 12) {
    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params);
  } else {
    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_x, subpel_x_qn,
                                       conv_params);
  }
#else
  av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, subpel_x_qn, conv_params);
#endif
}
