1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <emmintrin.h>
13
14 #include "config/av1_rtcd.h"
15
16 #include "aom_dsp/aom_dsp_common.h"
17 #include "aom_dsp/aom_filter.h"
18 #include "aom_dsp/x86/convolve_common_intrin.h"
19 #include "aom_dsp/x86/synonyms.h"
20 #include "av1/common/convolve.h"
21
prepare_coeffs(const InterpFilterParams * const filter_params,const int subpel_q4,__m128i * const coeffs)22 static inline void prepare_coeffs(const InterpFilterParams *const filter_params,
23 const int subpel_q4,
24 __m128i *const coeffs /* [4] */) {
25 const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
26 filter_params, subpel_q4 & SUBPEL_MASK);
27 const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
28 // coeffs 0 1 0 1 2 3 2 3
29 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
30 // coeffs 4 5 4 5 6 7 6 7
31 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
32
33 coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1
34 coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
35 coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
36 coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7
37 }
38
// 8-tap dot product: each s[k] holds 16-bit pixel pairs interleaved to match
// the duplicated tap pair in coeffs[k]; madd multiplies and sums each pair,
// and the four partial sums are accumulated into 32-bit lanes.
static inline __m128i convolve(const __m128i *const s,
                               const __m128i *const coeffs) {
  __m128i sum = _mm_madd_epi16(s[0], coeffs[0]);
  sum = _mm_add_epi32(sum, _mm_madd_epi16(s[1], coeffs[1]));
  sum = _mm_add_epi32(sum, _mm_madd_epi16(s[2], coeffs[2]));
  sum = _mm_add_epi32(sum, _mm_madd_epi16(s[3], coeffs[3]));
  return sum;
}
48
convolve_lo_x(const __m128i * const s,const __m128i * const coeffs)49 static inline __m128i convolve_lo_x(const __m128i *const s,
50 const __m128i *const coeffs) {
51 __m128i ss[4];
52 ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
53 ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
54 ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
55 ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
56 return convolve(ss, coeffs);
57 }
58
convolve_lo_y(const __m128i * const s,const __m128i * const coeffs)59 static inline __m128i convolve_lo_y(const __m128i *const s,
60 const __m128i *const coeffs) {
61 __m128i ss[4];
62 ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
63 ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
64 ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
65 ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
66 return convolve(ss, coeffs);
67 }
68
convolve_hi_y(const __m128i * const s,const __m128i * const coeffs)69 static inline __m128i convolve_hi_y(const __m128i *const s,
70 const __m128i *const coeffs) {
71 __m128i ss[4];
72 ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
73 ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
74 ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
75 ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
76 return convolve(ss, coeffs);
77 }
78
// Vertical-only 12-tap convolution, single-reference path (result is rounded
// straight to 8-bit output pixels). Works on 8-pixel-wide columns and emits
// two output rows per inner iteration; caller routes w < 8 to the C fallback,
// and h is assumed even — TODO confirm against callers.
static void convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_y,
                                     int subpel_y_qn) {
  const int fo_vert = filter_params_y->taps / 2 - 1;
  // Step back (taps/2 - 1) rows so the filter window is centered on row 0.
  const uint8_t *src_ptr = src - fo_vert * src_stride;
  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
  __m128i coeffs[6];  // 12 taps -> 6 registers of duplicated tap pairs.

  prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);

  int j = 0;
  do {
    // s[k] holds source rows (k, k+1) of the current 8-wide column,
    // byte-interleaved for madd. src10 caches the bottom-most row loaded so
    // far so each source row is read from memory only once.
    __m128i s[12], src10, res_lo, res_hi;
    __m128i res_lo_round, res_hi_round, res16, res;
    const uint8_t *data = &src_ptr[j];

    // Prime the sliding window with the first 11 rows (s[0]..s[9] plus the
    // cached row 10 in src10).
    src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
    s[0] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
    s[1] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
    s[2] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
    s[3] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
    s[4] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
    s[5] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
    s[6] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
    s[7] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
    s[8] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
    s[9] = _mm_unpacklo_epi8(
        _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);

    int i = 0;
    do {
      data = &src_ptr[i * src_stride + j];
      // Extend the window with rows 11 and 12 of the current position; the
      // cached src10 becomes the older half of s[10].
      s[10] = _mm_unpacklo_epi8(
          src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
      src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
      s[11] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);

      // Output row i uses window s[0..10].
      res_lo = convolve_lo_y_12tap(s, coeffs);  // Filter low index pixels
      res_hi = convolve_hi_y_12tap(s, coeffs);  // Filter high index pixels

      // Round to nearest and shift out FILTER_BITS of precision.
      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      res_hi_round =
          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

      // Narrow with signed then unsigned saturation down to 8-bit pixels.
      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
      res = _mm_packus_epi16(res16, res16);

      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
      i++;

      // Output row i+1 uses the window shifted down one row: s[1..11].
      res_lo = convolve_lo_y_12tap(s + 1, coeffs);  // Filter low index pixels
      res_hi = convolve_hi_y_12tap(s + 1, coeffs);  // Filter high index pixels

      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      res_hi_round =
          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
      res = _mm_packus_epi16(res16, res16);

      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
      i++;

      // Slide the window down two rows for the next iteration.
      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
      s[3] = s[5];
      s[4] = s[6];
      s[5] = s[7];
      s[6] = s[8];
      s[7] = s[9];
      s[8] = s[10];
      s[9] = s[11];
    } while (i < h);
    j += 8;
  } while (j < w);
}
179
// Vertical-only single-reference convolution entry point. Dispatches:
// > 8 taps -> 12-tap SIMD path (or C fallback when w < 8, which the 8-wide
// SIMD column loop cannot handle); otherwise an 8-tap path with a w <= 4
// sub-case and an 8-wide column sub-case. Both SIMD paths emit two output
// rows per inner iteration, so h is assumed even — TODO confirm callers.
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  if (filter_params_y->taps > 8) {
    if (w < 8) {
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_y, subpel_y_qn);
    } else {
      convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                               filter_params_y, subpel_y_qn);
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    // Step the source pointer up so the filter window is centered on row 0.
    const uint8_t *src_ptr = src - fo_vert * src_stride;
    const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
    const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
    __m128i coeffs[4];  // 8 taps -> 4 registers of duplicated tap pairs.

    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);

    if (w <= 4) {
      // Narrow blocks: 32-bit loads; s[k] holds rows (k, k+1) interleaved,
      // src6 caches the bottom-most loaded row.
      __m128i s[8], src6, res, res_round, res16;
      int res_int;
      s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride),
                               xx_loadl_32(src_ptr + 1 * src_stride));
      s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride),
                               xx_loadl_32(src_ptr + 2 * src_stride));
      s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride),
                               xx_loadl_32(src_ptr + 3 * src_stride));
      s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride),
                               xx_loadl_32(src_ptr + 4 * src_stride));
      s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride),
                               xx_loadl_32(src_ptr + 5 * src_stride));
      src6 = xx_loadl_32(src_ptr + 6 * src_stride);
      s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6);

      do {
        // Extend the window with rows 7 and 8.
        s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride));
        src6 = xx_loadl_32(src_ptr + 8 * src_stride);
        s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6);

        // First output row: window s[0..6].
        res = convolve_lo_y(s + 0, coeffs);
        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
        res16 = _mm_packs_epi32(res_round, res_round);
        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

        // Store 2 or 4 bytes depending on block width.
        if (w == 2)
          *(uint16_t *)dst = (uint16_t)res_int;
        else
          *(int *)dst = res_int;

        src_ptr += src_stride;
        dst += dst_stride;

        // Second output row: window shifted down one row, s[1..7].
        res = convolve_lo_y(s + 1, coeffs);
        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
        res16 = _mm_packs_epi32(res_round, res_round);
        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

        if (w == 2)
          *(uint16_t *)dst = (uint16_t)res_int;
        else
          *(int *)dst = res_int;

        src_ptr += src_stride;
        dst += dst_stride;

        // Slide the row window down two rows.
        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
        s[3] = s[5];
        s[4] = s[6];
        s[5] = s[7];
        h -= 2;
      } while (h);
    } else {
      assert(!(w % 8));
      int j = 0;
      do {  // Loop over 8-pixel-wide columns.
        __m128i s[8], src6, res_lo, res_hi;
        __m128i res_lo_round, res_hi_round, res16, res;
        const uint8_t *data = &src_ptr[j];

        // Prime the 7-row window (s[0]..s[5] plus cached row 6 in src6).
        src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
        s[0] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
        s[1] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
        s[2] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
        s[3] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
        s[4] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
        s[5] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);

        int i = 0;
        do {
          data = &src_ptr[i * src_stride + j];
          // Extend the window with rows 7 and 8 of the current position.
          s[6] = _mm_unpacklo_epi8(
              src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
          src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
          s[7] = _mm_unpacklo_epi8(
              _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);

          // Output row i: window s[0..6].
          res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
          res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels

          // Round to nearest, drop FILTER_BITS of precision.
          res_lo_round =
              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
          res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          // Saturating narrow 32 -> 16 -> unsigned 8-bit pixels.
          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          i++;

          // Output row i+1: window shifted down one row, s[1..7].
          res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
          res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels

          res_lo_round =
              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
          res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          i++;

          // Slide the row window down two rows.
          s[0] = s[2];
          s[1] = s[3];
          s[2] = s[4];
          s[3] = s[5];
          s[4] = s[6];
          s[5] = s[7];
        } while (i < h);
        j += 8;
      } while (j < w);
    }
  }
}
332
// Horizontal 12-tap convolution, single-reference path. Produces 4 output
// pixels per inner iteration; caller routes w < 4 to the C fallback. Applies
// the two-stage rounding (round_0, then the remaining FILTER_BITS - round_0)
// that collapses to a final 8-bit result in the single-reference case.
static void convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     int subpel_x_qn,
                                     ConvolveParams *conv_params) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  // Step left so the filter window is centered on output pixel 0.
  const uint8_t *src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_0_const =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeffs[6];  // 12 taps -> 6 registers of duplicated tap pairs.

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);

  int i = 0;
  do {
    int j = 0;
    do {
      const __m128i data =
          _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
      __m128i s[4];

      // Interleave shifted copies of the source bytes at 16-bit granularity
      // so each s[k] pairs up the byte offsets consumed by tap pairs
      // (2k, 2k+1); convolve_lo_x_12tap presumably zero-extends and madds —
      // helper not visible here, verify against its definition.
      s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));

      const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero);

      // Two-stage round-to-nearest: first round_0 bits, then the rest.
      __m128i res32_round =
          _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift);
      res32_round =
          _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift);

      // Saturating narrow 32 -> 16 -> unsigned 8-bit pixels.
      const __m128i res16 = _mm_packs_epi32(res32_round, zero);
      const __m128i res = _mm_packus_epi16(res16, zero);

      // memcpy avoids unaligned/aliasing issues on the 4-byte store.
      const int val = _mm_cvtsi128_si32(res);
      memcpy((dst + i * dst_stride + j), &val, sizeof(val));
      j += 4;
    } while (j < w);
  } while (++i < h);
}
387
// Horizontal-only single-reference convolution entry point. Dispatches:
// > 8 taps -> 12-tap SIMD path (C fallback when w < 4); otherwise an 8-tap
// path with a w <= 4 sub-case (4 pixels per row) and an 8-wide column
// sub-case that filters even- and odd-indexed pixels separately.
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    if (w < 4) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, subpel_x_qn, conv_params);
    } else {
      convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params);
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    // Step left so the filter window is centered on output pixel 0.
    const uint8_t *src_ptr = src - fo_horiz;
    const int bits = FILTER_BITS - conv_params->round_0;
    const __m128i round_0_const =
        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
    const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
    const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    const __m128i round_shift = _mm_cvtsi32_si128(bits);
    __m128i coeffs[4];  // 8 taps -> 4 registers of duplicated tap pairs.

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);

    if (w <= 4) {
      do {
        const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
        __m128i s[4];

        // Interleave byte-shifted copies so each s[k] pairs the two source
        // bytes consumed by tap pair (2k, 2k+1) for 4 output pixels.
        s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
        s[1] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
        s[2] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
        s[3] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
        const __m128i res_lo = convolve_lo_x(s, coeffs);
        // Two-stage round-to-nearest: round_0 bits, then the remainder.
        __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);

        // Saturating narrow 32 -> 16 -> unsigned 8-bit pixels.
        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        // Store 2 or 4 bytes depending on block width.
        int r = _mm_cvtsi128_si32(res);
        if (w == 2)
          *(uint16_t *)dst = (uint16_t)r;
        else
          *(int *)dst = r;

        src_ptr += src_stride;
        dst += dst_stride;
      } while (--h);
    } else {
      assert(!(w % 8));
      int i = 0;
      do {
        int j = 0;
        do {  // 8 output pixels per iteration across the row.
          const __m128i data =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          __m128i s[4];

          // Filter even-index pixels
          s[0] = data;
          s[1] = _mm_srli_si128(data, 2);
          s[2] = _mm_srli_si128(data, 4);
          s[3] = _mm_srli_si128(data, 6);
          const __m128i res_even = convolve_lo_x(s, coeffs);

          // Filter odd-index pixels
          s[0] = _mm_srli_si128(data, 1);
          s[1] = _mm_srli_si128(data, 3);
          s[2] = _mm_srli_si128(data, 5);
          s[3] = _mm_srli_si128(data, 7);
          const __m128i res_odd = convolve_lo_x(s, coeffs);

          // Rearrange pixels back into the order 0 ... 7
          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
          // Two-stage round-to-nearest on both halves.
          __m128i res_lo_round = _mm_sra_epi32(
              _mm_add_epi32(res_lo, round_0_const), round_0_shift);
          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                       round_shift);
          __m128i res_hi_round = _mm_sra_epi32(
              _mm_add_epi32(res_hi, round_0_const), round_0_shift);
          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                       round_shift);

          // Saturating narrow 32 -> 16 -> unsigned 8-bit pixels.
          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          const __m128i res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          j += 8;
        } while (j < w);
      } while (++i < h);
    }
  }
}
494