/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#if CONFIG_SVT_AV1
#include "third_party/SVT-AV1/convolve_avx2.h"
#endif

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"

static inline void av1_convolve_y_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
  // right shift is F-1 because we are already dividing
  // filter coefficients by 2
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
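  // The constant above is half of the divisor, so results are rounded
  // half-up before the shift. A minimal scalar sketch of the same rounding
  // (round_sum is illustrative, not part of libaom):
  //   int round_sum(int sum) {
  //     return (sum + ((1 << (FILTER_BITS - 1)) >> 1)) >> (FILTER_BITS - 1);
  //   }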

  __m256i coeffs[6], s[12];
  __m128i d[10];

  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }
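  // Note: the lowbd variants pack pairs of pre-halved 8-bit coefficients for
  // the byte multiply-add path, while the 12-tap variant keeps full-precision
  // 16-bit pairs; that is why the 12-tap branch below accumulates in 32 bits
  // and rounds by FILTER_BITS instead of FILTER_BITS - 1.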

  // vert_filt as 4 tap
  if (vert_tap == 4) {
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
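      // Each s[] register interleaves two adjacent rows byte-wise per
      // 128-bit lane, so a single multiply-accumulate step applies one
      // vertical tap pair for output row i (lower lane) and row i + 1
      // (upper lane) at once.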

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
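        // Slide the two-row window down: drop the oldest row pairs and keep
        // the two pairs built this iteration for the next one.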
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
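    // The 12-tap coefficients are full precision, so round by FILTER_BITS
    // rather than FILTER_BITS - 1, with 32-bit intermediates.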

    for (int j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;

      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
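      // The bytes were widened to 16 bits and then interleaved 16-bit-wise,
      // so each s[] register carries (row, row + 1) sample pairs that
      // convolve_12taps can multiply-accumulate into 32-bit lanes; s[5] and
      // s[11] are filled per iteration in the loop below.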

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
            *(int *)&dst[i * dst_stride + j + dst_stride] =
                _mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_y,
                            const int32_t subpel_y_qn) {
#if CONFIG_SVT_AV1
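  // The SVT-AV1 specialized kernels handle every filter length except the
  // 12-tap one, which falls back to the general kernel above.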
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 12) {
    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn);
  } else {
    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_y, subpel_y_qn);
  }
#else
  av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_y, subpel_y_qn);
#endif
}

static inline void av1_convolve_x_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m256i round_0_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
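  // Two rounding stages: shift by round_0 - 1 (the coefficients are
  // pre-halved), then by bits = FILTER_BITS - round_0, FILTER_BITS - 1 in
  // total. A scalar sketch of the same arithmetic (round_two_stage is
  // illustrative, not part of libaom):
  //   int round_two_stage(int sum, int round_0) {
  //     sum = (sum + ((1 << (round_0 - 1)) >> 1)) >> (round_0 - 1);
  //     return (sum + ((1 << (FILTER_BITS - round_0)) >> 1)) >>
  //            (FILTER_BITS - round_0);
  //   }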
  int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

  __m256i coeffs[6], filt[4];
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
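  // filt_global_avx2 holds the byte-shuffle masks the convolve_lowbd_x*
  // helpers use to gather each output pixel's overlapping window of input
  // samples from a single loaded row.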

  if (horiz_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
  else if (horiz_tap == 12) {
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
  }

  // horz_filt as 4 tap
  if (horiz_tap == 4) {
    const int fo_horiz = 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
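          // 216 == 0xD8 reorders the 64-bit lanes as 0, 2, 1, 3, gathering
          // the packed low halves of both 128-bit lanes into the low 128
          // bits before the 16-byte store.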
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 6) {
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 12) {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    const __m256i v_zero = _mm256_setzero_si256();
    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    round_const = _mm256_set1_epi32((1 << bits) >> 1);
    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    __m256i s[6];
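    // As in the vertical 12-tap path, the coefficients are full precision,
    // so the first-stage shift is the full round_0 and the math is 32-bit.
    // s[0..5] will hold six interleaved two-sample windows matching the six
    // 16-bit coefficient pairs consumed by convolve_12taps.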

    if (w <= 4) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);
        // row0 0..7 row1 0..7
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
        // row0 8..F row1 8..F
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

        // 00 01 02 03 10 11 12 13
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
                                      round_shift);
        // 8 bit conversion and saturation to uint8
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
        if (w > 2) {
          // 00 01 02 03
          *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
          // 10 11 12 13
          *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
        } else {
          // 00 01
          *(uint16_t *)&dst[i * dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_0);
          // 10 11
          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; i++) {
        for (int j = 0; j < w; j += 8) {
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // row0 0..7 4..B
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          // row0 8..F C..13
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

          res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
          *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
        }
      }
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  }
}

void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_x,
                            const int32_t subpel_x_qn,
                            ConvolveParams *conv_params) {
#if CONFIG_SVT_AV1
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn);

  if (horz_tap == 12) {
    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params);
  } else {
    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_x, subpel_x_qn,
                                       conv_params);
  }
#else
  av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, subpel_x_qn, conv_params);
#endif
}