xref: /aosp_15_r20/external/libaom/av1/common/x86/convolve_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>
13 
14 #include "config/av1_rtcd.h"
15 
16 #include "aom_dsp/aom_dsp_common.h"
17 #include "aom_dsp/aom_filter.h"
18 #include "aom_dsp/x86/convolve_common_intrin.h"
19 #include "aom_dsp/x86/synonyms.h"
20 #include "av1/common/convolve.h"
21 
prepare_coeffs(const InterpFilterParams * const filter_params,const int subpel_q4,__m128i * const coeffs)22 static inline void prepare_coeffs(const InterpFilterParams *const filter_params,
23                                   const int subpel_q4,
24                                   __m128i *const coeffs /* [4] */) {
25   const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
26       filter_params, subpel_q4 & SUBPEL_MASK);
27   const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
28   // coeffs 0 1 0 1 2 3 2 3
29   const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
30   // coeffs 4 5 4 5 6 7 6 7
31   const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
32 
33   coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0);  // coeffs 0 1 0 1 0 1 0 1
34   coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
35   coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1);  // coeffs 4 5 4 5 4 5 4 5
36   coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1);  // coeffs 6 7 6 7 6 7 6 7
37 }
38 
// Multiply-accumulates four registers of 16-bit samples against four
// registers of 16-bit coefficients. For every 32-bit output lane the result
// is the sum over k = 0..3 of the two adjacent 16x16 products taken from
// s[k] and coeffs[k] (lane additions wrap at 32 bits).
static inline __m128i convolve(const __m128i *const s,
                               const __m128i *const coeffs) {
  __m128i acc = _mm_madd_epi16(s[0], coeffs[0]);
  for (int k = 1; k < 4; ++k) {
    acc = _mm_add_epi32(acc, _mm_madd_epi16(s[k], coeffs[k]));
  }
  return acc;
}
48 
convolve_lo_x(const __m128i * const s,const __m128i * const coeffs)49 static inline __m128i convolve_lo_x(const __m128i *const s,
50                                     const __m128i *const coeffs) {
51   __m128i ss[4];
52   ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
53   ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
54   ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
55   ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
56   return convolve(ss, coeffs);
57 }
58 
convolve_lo_y(const __m128i * const s,const __m128i * const coeffs)59 static inline __m128i convolve_lo_y(const __m128i *const s,
60                                     const __m128i *const coeffs) {
61   __m128i ss[4];
62   ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
63   ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
64   ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
65   ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
66   return convolve(ss, coeffs);
67 }
68 
convolve_hi_y(const __m128i * const s,const __m128i * const coeffs)69 static inline __m128i convolve_hi_y(const __m128i *const s,
70                                     const __m128i *const coeffs) {
71   __m128i ss[4];
72   ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
73   ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
74   ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
75   ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
76   return convolve(ss, coeffs);
77 }
78 
convolve_y_sr_12tap_sse2(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,int subpel_y_qn)79 static void convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
80                                      uint8_t *dst, int dst_stride, int w, int h,
81                                      const InterpFilterParams *filter_params_y,
82                                      int subpel_y_qn) {
83   const int fo_vert = filter_params_y->taps / 2 - 1;
84   const uint8_t *src_ptr = src - fo_vert * src_stride;
85   const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
86   const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
87   __m128i coeffs[6];
88 
89   prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
90 
91   int j = 0;
92   do {
93     __m128i s[12], src10, res_lo, res_hi;
94     __m128i res_lo_round, res_hi_round, res16, res;
95     const uint8_t *data = &src_ptr[j];
96 
97     src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
98     s[0] =
99         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
100                           _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
101     s[1] =
102         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
103                           _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
104     s[2] =
105         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
106                           _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
107     s[3] =
108         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
109                           _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
110     s[4] =
111         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
112                           _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
113     s[5] =
114         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
115                           _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
116     s[6] =
117         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
118                           _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
119     s[7] =
120         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
121                           _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
122     s[8] =
123         _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
124                           _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
125     s[9] = _mm_unpacklo_epi8(
126         _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);
127 
128     int i = 0;
129     do {
130       data = &src_ptr[i * src_stride + j];
131       s[10] = _mm_unpacklo_epi8(
132           src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
133       src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
134       s[11] = _mm_unpacklo_epi8(
135           _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);
136 
137       res_lo = convolve_lo_y_12tap(s, coeffs);  // Filter low index pixels
138       res_hi = convolve_hi_y_12tap(s, coeffs);  // Filter high index pixels
139 
140       res_lo_round =
141           _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
142       res_hi_round =
143           _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
144 
145       res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
146       res = _mm_packus_epi16(res16, res16);
147 
148       _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
149       i++;
150 
151       res_lo = convolve_lo_y_12tap(s + 1, coeffs);  // Filter low index pixels
152       res_hi = convolve_hi_y_12tap(s + 1, coeffs);  // Filter high index pixels
153 
154       res_lo_round =
155           _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
156       res_hi_round =
157           _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
158 
159       res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
160       res = _mm_packus_epi16(res16, res16);
161 
162       _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
163       i++;
164 
165       s[0] = s[2];
166       s[1] = s[3];
167       s[2] = s[4];
168       s[3] = s[5];
169       s[4] = s[6];
170       s[5] = s[7];
171       s[6] = s[8];
172       s[7] = s[9];
173       s[8] = s[10];
174       s[9] = s[11];
175     } while (i < h);
176     j += 8;
177   } while (j < w);
178 }
179 
av1_convolve_y_sr_sse2(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)180 void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
181                             int dst_stride, int w, int h,
182                             const InterpFilterParams *filter_params_y,
183                             const int subpel_y_qn) {
184   if (filter_params_y->taps > 8) {
185     if (w < 8) {
186       av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
187                           filter_params_y, subpel_y_qn);
188     } else {
189       convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
190                                filter_params_y, subpel_y_qn);
191     }
192   } else {
193     const int fo_vert = filter_params_y->taps / 2 - 1;
194     const uint8_t *src_ptr = src - fo_vert * src_stride;
195     const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
196     const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
197     __m128i coeffs[4];
198 
199     prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
200 
201     if (w <= 4) {
202       __m128i s[8], src6, res, res_round, res16;
203       int res_int;
204       s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
205                                xx_loadl_32(src_ptr + 1 * src_stride));
206       s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
207                                xx_loadl_32(src_ptr + 2 * src_stride));
208       s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
209                                xx_loadl_32(src_ptr + 3 * src_stride));
210       s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
211                                xx_loadl_32(src_ptr + 4 * src_stride));
212       s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
213                                xx_loadl_32(src_ptr + 5 * src_stride));
214       src6 = xx_loadl_32(src_ptr + 6 * src_stride);
215       s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);
216 
217       do {
218         s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
219         src6 = xx_loadl_32(src_ptr + 8 * src_stride);
220         s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);
221 
222         res = convolve_lo_y(s + 0, coeffs);
223         res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
224         res16 = _mm_packs_epi32(res_round, res_round);
225         res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
226 
227         if (w == 2)
228           *(uint16_t *)dst = (uint16_t)res_int;
229         else
230           *(int *)dst = res_int;
231 
232         src_ptr += src_stride;
233         dst += dst_stride;
234 
235         res = convolve_lo_y(s + 1, coeffs);
236         res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
237         res16 = _mm_packs_epi32(res_round, res_round);
238         res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
239 
240         if (w == 2)
241           *(uint16_t *)dst = (uint16_t)res_int;
242         else
243           *(int *)dst = res_int;
244 
245         src_ptr += src_stride;
246         dst += dst_stride;
247 
248         s[0] = s[2];
249         s[1] = s[3];
250         s[2] = s[4];
251         s[3] = s[5];
252         s[4] = s[6];
253         s[5] = s[7];
254         h -= 2;
255       } while (h);
256     } else {
257       assert(!(w % 8));
258       int j = 0;
259       do {
260         __m128i s[8], src6, res_lo, res_hi;
261         __m128i res_lo_round, res_hi_round, res16, res;
262         const uint8_t *data = &src_ptr[j];
263 
264         src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
265         s[0] = _mm_unpacklo_epi8(
266             _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
267             _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
268         s[1] = _mm_unpacklo_epi8(
269             _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
270             _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
271         s[2] = _mm_unpacklo_epi8(
272             _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
273             _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
274         s[3] = _mm_unpacklo_epi8(
275             _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
276             _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
277         s[4] = _mm_unpacklo_epi8(
278             _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
279             _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
280         s[5] = _mm_unpacklo_epi8(
281             _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
282 
283         int i = 0;
284         do {
285           data = &src_ptr[i * src_stride + j];
286           s[6] = _mm_unpacklo_epi8(
287               src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
288           src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
289           s[7] = _mm_unpacklo_epi8(
290               _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
291 
292           res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
293           res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
294 
295           res_lo_round =
296               _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
297           res_hi_round =
298               _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
299 
300           res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
301           res = _mm_packus_epi16(res16, res16);
302 
303           _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
304           i++;
305 
306           res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
307           res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
308 
309           res_lo_round =
310               _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
311           res_hi_round =
312               _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
313 
314           res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
315           res = _mm_packus_epi16(res16, res16);
316 
317           _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
318           i++;
319 
320           s[0] = s[2];
321           s[1] = s[3];
322           s[2] = s[4];
323           s[3] = s[5];
324           s[4] = s[6];
325           s[5] = s[7];
326         } while (i < h);
327         j += 8;
328       } while (j < w);
329     }
330   }
331 }
332 
// Horizontal convolution using the 12-tap helpers (prepare_coeffs_12tap,
// convolve_lo_x_12tap).
//
// Each inner-loop pass loads 16 source bytes starting fo_horiz pixels to the
// left of the current output position and produces 4 output pixels. The
// filter sum is rounded in two stages: first by conv_params->round_0, then by
// FILTER_BITS - conv_params->round_0. The result is packed with saturation to
// unsigned bytes and 4 bytes are copied to dst.
static void convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     int subpel_x_qn,
                                     ConvolveParams *conv_params) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *const src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i zero = _mm_setzero_si128();
  // Stage 1: rounding offset and shift for round_0.
  const __m128i round_0_const =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  // Stage 2: rounding offset and shift for the remaining bits.
  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m128i coeffs[6];

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);

  int row = 0;
  do {
    const uint8_t *const src_row = src_ptr + row * src_stride;
    uint8_t *const dst_row = dst + row * dst_stride;

    int col = 0;
    do {
      const __m128i data = _mm_loadu_si128((const __m128i *)(src_row + col));
      __m128i s[4];

      // Pair up the source register with byte-shifted copies of itself.
      s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));

      const __m128i sum = convolve_lo_x_12tap(s, coeffs, zero);

      const __m128i stage1 =
          _mm_sra_epi32(_mm_add_epi32(sum, round_0_const), round_0_shift);
      const __m128i stage2 =
          _mm_sra_epi32(_mm_add_epi32(stage1, round_const), round_shift);

      const __m128i packed16 = _mm_packs_epi32(stage2, zero);
      const __m128i packed8 = _mm_packus_epi16(packed16, zero);

      // Copy the low 4 bytes (4 pixels) out without a casted store.
      const int val = _mm_cvtsi128_si32(packed8);
      memcpy(dst_row + col, &val, sizeof(val));
      col += 4;
    } while (col < w);
    ++row;
  } while (row < h);
}
387 
// Horizontal (x-direction) single-reference convolution of 8-bit pixels,
// SSE2.
//
// Kernels with more than 8 taps are forwarded to convolve_x_sr_12tap_sse2()
// when w >= 4 and to the C implementation av1_convolve_x_sr_c() otherwise.
// All remaining kernels are filtered here using the eight coefficients that
// prepare_coeffs() loads.
//
// The filter sum is rounded in two stages: first by conv_params->round_0,
// then by FILTER_BITS - conv_params->round_0 (asserted to be non-negative).
// The result is packed with saturation to unsigned bytes.
//
// For w <= 4 one row is produced per iteration and the loop runs until h
// reaches zero, so h must be positive. The wide path asserts that w is a
// multiple of 8.
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    if (w < 4) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, subpel_x_qn, conv_params);
    } else {
      convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params);
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    // Leftmost source pixel touched by the filter for output column 0.
    const uint8_t *src_ptr = src - fo_horiz;
    const int bits = FILTER_BITS - conv_params->round_0;
    const __m128i round_0_const =
        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
    const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
    const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    const __m128i round_shift = _mm_cvtsi32_si128(bits);
    __m128i coeffs[4];

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);

    if (w <= 4) {
      do {
        const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
        __m128i s[4];

        // Interleave the row with a one-byte-shifted copy of itself so that
        // each 16-bit pair fed to the multiply-add holds two neighbouring
        // pixels for one output position.
        s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
        s[1] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
        s[2] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
        s[3] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
        const __m128i res_lo = convolve_lo_x(s, coeffs);
        __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);

        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        // dst is a byte buffer with no alignment guarantee, so store through
        // memcpy instead of a casted pointer (avoids aliasing/alignment UB).
        const int r = _mm_cvtsi128_si32(res);
        if (w == 2) {
          const uint16_t r_u16 = (uint16_t)r;
          memcpy(dst, &r_u16, sizeof(r_u16));
        } else {
          memcpy(dst, &r, sizeof(r));
        }

        src_ptr += src_stride;
        dst += dst_stride;
      } while (--h);
    } else {
      assert(!(w % 8));
      int i = 0;
      do {
        int j = 0;
        do {
          const __m128i data =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          __m128i s[4];

          // Filter even-index pixels
          s[0] = data;
          s[1] = _mm_srli_si128(data, 2);
          s[2] = _mm_srli_si128(data, 4);
          s[3] = _mm_srli_si128(data, 6);
          const __m128i res_even = convolve_lo_x(s, coeffs);

          // Filter odd-index pixels
          s[0] = _mm_srli_si128(data, 1);
          s[1] = _mm_srli_si128(data, 3);
          s[2] = _mm_srli_si128(data, 5);
          s[3] = _mm_srli_si128(data, 7);
          const __m128i res_odd = convolve_lo_x(s, coeffs);

          // Rearrange pixels back into the order 0 ... 7
          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
          __m128i res_lo_round = _mm_sra_epi32(
              _mm_add_epi32(res_lo, round_0_const), round_0_shift);
          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                       round_shift);
          __m128i res_hi_round = _mm_sra_epi32(
              _mm_add_epi32(res_hi, round_0_const), round_0_shift);
          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                       round_shift);

          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          const __m128i res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          j += 8;
        } while (j < w);
      } while (++i < h);
    }
  }
}
494