xref: /aosp_15_r20/external/libaom/aom_dsp/x86/highbd_convolve_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <emmintrin.h>
12 
13 #include "config/aom_dsp_rtcd.h"
14 #include "aom_dsp/x86/convolve.h"
15 
16 // -----------------------------------------------------------------------------
17 
// Vertical 4-tap filter over a 4-sample-wide column of 16-bit pixels.
//
// Only taps 2..5 of the 8-entry kernel at `filter` are applied; taps 0, 1, 6
// and 7 are not read into the computation. Output row r is built from source
// rows r + 2 .. r + 5 counted from `src_ptr`. Each sum is rounded (+64),
// shifted right by 7 and clamped to [0, (1 << bd) - 1].
//
//   src_ptr, src_pitch  source samples and row stride, in uint16_t units
//   dst_ptr, dst_pitch  destination samples and row stride, in uint16_t units
//   height              number of output rows (odd values are handled by a
//                       single-row tail after the two-row main loop)
//   filter              8 x int16_t kernel
//   bd                  bit depth used for the upper clamp
static void aom_highbd_filter_block1d4_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_lo = _mm_unpacklo_epi32(taps, taps);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_hi = _mm_unpackhi_epi32(taps, taps);

  const __m128i taps23 = _mm_unpackhi_epi64(taps_lo, taps_lo);  // 2 3 (x4)
  const __m128i taps45 = _mm_unpacklo_epi64(taps_hi, taps_hi);  // 4 5 (x4)

  // The main loop consumes and produces two rows per iteration. Multiply
  // rather than shift so a negative pitch stays well defined.
  const ptrdiff_t src_stride = src_pitch * 2;
  const ptrdiff_t dst_stride = dst_pitch * 2;

  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // Interleave vertically adjacent rows so _mm_madd_epi16 can apply a pair of
  // taps to each column in one step.
  __m128i rows23 = _mm_unpacklo_epi16(row2, row3);
  __m128i rows34 = _mm_unpacklo_epi16(row3, row4);

  uint32_t rows_left = height;

  for (; rows_left > 1; rows_left -= 2) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i rows45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows56 = _mm_unpacklo_epi16(row5, row6);

    // multiply 2 adjacent elements with the filter and add the result
    __m128i sum0 = _mm_add_epi32(_mm_madd_epi16(rows23, taps23),
                                 _mm_madd_epi16(rows45, taps45));
    __m128i sum1 = _mm_add_epi32(_mm_madd_epi16(rows34, taps23),
                                 _mm_madd_epi16(rows56, taps45));

    // round, then shift each 32-bit sum right by 7 bits
    sum0 = _mm_srai_epi32(_mm_add_epi32(sum0, rounding), 7);
    sum1 = _mm_srai_epi32(_mm_add_epi32(sum1, rounding), 7);

    // narrow to 16 bits (results land in the low 64 bits) and clamp
    __m128i out0 = _mm_packs_epi32(sum0, zero);
    __m128i out1 = _mm_packs_epi32(sum1, zero);
    out0 = _mm_min_epi16(_mm_max_epi16(out0, zero), pixel_max);
    out1 = _mm_min_epi16(_mm_max_epi16(out1, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out0);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), out1);

    src_ptr += src_stride;
    dst_ptr += dst_stride;

    // carry the rows that the next pair of outputs reuses
    rows23 = rows45;
    rows34 = rows56;
    row4 = row6;
  }

  if (rows_left != 0) {
    // Odd height: one output row remains. It needs only one more source row
    // (row 5 relative to the current src_ptr).
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i rows45 = _mm_unpacklo_epi16(row4, row5);

    __m128i sum = _mm_add_epi32(_mm_madd_epi16(rows23, taps23),
                                _mm_madd_epi16(rows45, taps45));
    sum = _mm_srai_epi32(_mm_add_epi32(sum, rounding), 7);

    __m128i out = _mm_packs_epi32(sum, zero);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out);
  }
}
102 
// Horizontal 4-tap filter producing 4 output samples per row.
//
// Taps 2..5 of the 8-entry kernel at `filter` are applied to the samples
// src_ptr[x - 1] .. src_ptr[x + 2] for each output x in 0..3. Each sum is
// rounded (+64), shifted right by 7 and clamped to [0, (1 << bd) - 1].
// Every row performs one 16-byte unaligned load starting at src_ptr - 1.
//
//   src_ptr, src_pitch  source samples and row stride, in uint16_t units
//   dst_ptr, dst_pitch  destination samples and row stride, in uint16_t units
//   height              number of rows to process
//   filter              8 x int16_t kernel
//   bd                  bit depth used for the upper clamp
static void aom_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_lo = _mm_unpacklo_epi32(taps, taps);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_hi = _mm_unpackhi_epi32(taps, taps);

  const __m128i taps23 = _mm_unpackhi_epi64(taps_lo, taps_lo);  // 2 3 (x4)
  const __m128i taps45 = _mm_unpacklo_epi64(taps_hi, taps_hi);  // 4 5 (x4)

  // The first sample that contributes to output 0 sits one to the left.
  const uint16_t *in = src_ptr - 1;
  uint16_t *out = dst_ptr;

  for (uint32_t row = 0; row < height; ++row) {
    const __m128i px = _mm_loadu_si128((const __m128i *)in);

    // Copies of the row shifted left by 1, 2 and 3 samples.
    const __m128i px_1 = _mm_srli_si128(px, 2);
    const __m128i px_2 = _mm_srli_si128(px, 4);
    const __m128i px_3 = _mm_srli_si128(px, 6);

    // Each 32-bit lane k holds the sample pair (k, k + 1) resp. (k + 2, k + 3).
    const __m128i pairs_a = _mm_unpacklo_epi32(px, px_1);
    const __m128i pairs_b = _mm_unpacklo_epi32(px_2, px_3);

    __m128i acc = _mm_add_epi32(_mm_madd_epi16(pairs_a, taps23),
                                _mm_madd_epi16(pairs_b, taps45));

    // round, then shift each 32-bit sum right by 7 bits
    acc = _mm_add_epi32(acc, rounding);
    acc = _mm_srai_epi32(acc, 7);

    // narrow to 16 bits and clamp to the valid pixel range
    acc = _mm_packs_epi32(acc, zero);
    acc = _mm_max_epi16(acc, zero);
    acc = _mm_min_epi16(acc, pixel_max);

    _mm_storel_epi64((__m128i *)out, acc);

    in += src_pitch;
    out += dst_pitch;
  }
}
153 
// Vertical 4-tap filter over an 8-sample-wide column of 16-bit pixels.
//
// Only taps 2..5 of the 8-entry kernel at `filter` are applied. Output row r
// is built from source rows r + 2 .. r + 5 counted from `src_ptr`. Each sum
// is rounded (+64), shifted right by 7 and clamped to [0, (1 << bd) - 1].
// Loads and stores are unaligned, so neither pointer needs 16-byte alignment.
//
//   src_ptr, src_pitch  source samples and row stride, in uint16_t units
//   dst_ptr, dst_pitch  destination samples and row stride, in uint16_t units
//   height              number of output rows (odd values are handled by a
//                       single-row tail after the two-row main loop)
//   filter              8 x int16_t kernel
//   bd                  bit depth used for the upper clamp
static void aom_highbd_filter_block1d8_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_lo = _mm_unpacklo_epi32(taps, taps);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_hi = _mm_unpackhi_epi32(taps, taps);

  const __m128i taps23 = _mm_unpackhi_epi64(taps_lo, taps_lo);  // 2 3 (x4)
  const __m128i taps45 = _mm_unpacklo_epi64(taps_hi, taps_hi);  // 4 5 (x4)

  // The main loop consumes and produces two rows per iteration. Multiply
  // rather than shift so a negative pitch stays well defined.
  const ptrdiff_t src_stride = src_pitch * 2;
  const ptrdiff_t dst_stride = dst_pitch * 2;

  const __m128i row2 =
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

  // Interleave vertically adjacent rows; _lo covers columns 0-3 and _hi
  // covers columns 4-7.
  __m128i rows23_lo = _mm_unpacklo_epi16(row2, row3);
  __m128i rows23_hi = _mm_unpackhi_epi16(row2, row3);
  __m128i rows34_lo = _mm_unpacklo_epi16(row3, row4);
  __m128i rows34_hi = _mm_unpackhi_epi16(row3, row4);

  uint32_t rows_left = height;

  for (; rows_left > 1; rows_left -= 2) {
    const __m128i row5 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    const __m128i rows45_lo = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows45_hi = _mm_unpackhi_epi16(row4, row5);
    const __m128i rows56_lo = _mm_unpacklo_epi16(row5, row6);
    const __m128i rows56_hi = _mm_unpackhi_epi16(row5, row6);

    // multiply 2 adjacent elements with the filter and add the result
    __m128i sum0_lo = _mm_add_epi32(_mm_madd_epi16(rows23_lo, taps23),
                                    _mm_madd_epi16(rows45_lo, taps45));
    __m128i sum0_hi = _mm_add_epi32(_mm_madd_epi16(rows23_hi, taps23),
                                    _mm_madd_epi16(rows45_hi, taps45));
    __m128i sum1_lo = _mm_add_epi32(_mm_madd_epi16(rows34_lo, taps23),
                                    _mm_madd_epi16(rows56_lo, taps45));
    __m128i sum1_hi = _mm_add_epi32(_mm_madd_epi16(rows34_hi, taps23),
                                    _mm_madd_epi16(rows56_hi, taps45));

    // round, then shift each 32-bit sum right by 7 bits
    sum0_lo = _mm_srai_epi32(_mm_add_epi32(sum0_lo, rounding), 7);
    sum0_hi = _mm_srai_epi32(_mm_add_epi32(sum0_hi, rounding), 7);
    sum1_lo = _mm_srai_epi32(_mm_add_epi32(sum1_lo, rounding), 7);
    sum1_hi = _mm_srai_epi32(_mm_add_epi32(sum1_hi, rounding), 7);

    // narrow to 16 bits (columns 0-7 in order) and clamp
    __m128i out0 = _mm_packs_epi32(sum0_lo, sum0_hi);
    __m128i out1 = _mm_packs_epi32(sum1_lo, sum1_hi);
    out0 = _mm_min_epi16(_mm_max_epi16(out0, zero), pixel_max);
    out1 = _mm_min_epi16(_mm_max_epi16(out1, zero), pixel_max);

    // Unaligned stores: dst_ptr is not known to be 16-byte aligned here.
    _mm_storeu_si128((__m128i *)dst_ptr, out0);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), out1);

    src_ptr += src_stride;
    dst_ptr += dst_stride;

    // carry the rows that the next pair of outputs reuses
    rows23_lo = rows45_lo;
    rows23_hi = rows45_hi;
    rows34_lo = rows56_lo;
    rows34_hi = rows56_hi;
    row4 = row6;
  }

  if (rows_left != 0) {
    // Odd height: one output row remains. It needs only one more source row
    // (row 5 relative to the current src_ptr).
    const __m128i row5 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i rows45_lo = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows45_hi = _mm_unpackhi_epi16(row4, row5);

    __m128i sum_lo = _mm_add_epi32(_mm_madd_epi16(rows23_lo, taps23),
                                   _mm_madd_epi16(rows45_lo, taps45));
    __m128i sum_hi = _mm_add_epi32(_mm_madd_epi16(rows23_hi, taps23),
                                   _mm_madd_epi16(rows45_hi, taps45));
    sum_lo = _mm_srai_epi32(_mm_add_epi32(sum_lo, rounding), 7);
    sum_hi = _mm_srai_epi32(_mm_add_epi32(sum_hi, rounding), 7);

    __m128i out = _mm_packs_epi32(sum_lo, sum_hi);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    _mm_storeu_si128((__m128i *)dst_ptr, out);
  }
}
261 
// Horizontal 4-tap filter producing 8 output samples per row.
//
// Taps 2..5 of the 8-entry kernel at `filter` are applied to the samples
// src_ptr[x - 1] .. src_ptr[x + 2] for each output x in 0..7. Each sum is
// rounded (+64), shifted right by 7 and clamped to [0, (1 << bd) - 1].
// Every row performs two 16-byte unaligned loads, at src_ptr - 1 and
// src_ptr + 3. The store is unaligned as well, so dst_ptr needs no
// particular alignment.
//
//   src_ptr, src_pitch  source samples and row stride, in uint16_t units
//   dst_ptr, dst_pitch  destination samples and row stride, in uint16_t units
//   height              number of rows to process
//   filter              8 x int16_t kernel
//   bd                  bit depth used for the upper clamp
static void aom_highbd_filter_block1d8_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_lo = _mm_unpacklo_epi32(taps, taps);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_hi = _mm_unpackhi_epi32(taps, taps);

  const __m128i taps23 = _mm_unpackhi_epi64(taps_lo, taps_lo);  // 2 3 (x4)
  const __m128i taps45 = _mm_unpacklo_epi64(taps_hi, taps_hi);  // 4 5 (x4)

  // The first sample that contributes to output 0 sits one to the left.
  const uint16_t *in = src_ptr - 1;
  uint16_t *out = dst_ptr;

  for (uint32_t row = 0; row < height; ++row) {
    // a holds samples in[0..7]; b holds in[4..11].
    const __m128i a = _mm_loadu_si128((const __m128i *)in);
    const __m128i b = _mm_loadu_si128((const __m128i *)(in + 4));

    // Even outputs (0, 2, 4, 6): pairs start at even sample offsets.
    const __m128i a_sh2 = _mm_srli_si128(a, 4);
    const __m128i b_sh2 = _mm_srli_si128(b, 4);
    const __m128i even_hi_pairs = _mm_unpacklo_epi64(a_sh2, b_sh2);
    const __m128i even_sum =
        _mm_add_epi32(_mm_madd_epi16(a, taps23),
                      _mm_madd_epi16(even_hi_pairs, taps45));

    // Odd outputs (1, 3, 5, 7): same scheme, shifted by one sample.
    const __m128i a_sh1 = _mm_srli_si128(a, 2);
    const __m128i a_sh3 = _mm_srli_si128(a, 6);
    const __m128i b_sh1 = _mm_srli_si128(b, 2);
    const __m128i b_sh3 = _mm_srli_si128(b, 6);
    const __m128i odd_lo_pairs = _mm_unpacklo_epi64(a_sh1, b_sh1);
    const __m128i odd_hi_pairs = _mm_unpacklo_epi64(a_sh3, b_sh3);
    const __m128i odd_sum =
        _mm_add_epi32(_mm_madd_epi16(odd_lo_pairs, taps23),
                      _mm_madd_epi16(odd_hi_pairs, taps45));

    // Re-interleave even/odd sums into natural order: 0-3 and 4-7.
    __m128i res_lo = _mm_unpacklo_epi32(even_sum, odd_sum);
    __m128i res_hi = _mm_unpackhi_epi32(even_sum, odd_sum);

    // round, then shift each 32-bit sum right by 7 bits
    res_lo = _mm_srai_epi32(_mm_add_epi32(res_lo, rounding), 7);
    res_hi = _mm_srai_epi32(_mm_add_epi32(res_hi, rounding), 7);

    // narrow to 16 bits and clamp to the valid pixel range
    __m128i res = _mm_packs_epi32(res_lo, res_hi);
    res = _mm_max_epi16(res, zero);
    res = _mm_min_epi16(res, pixel_max);

    // Unaligned store: dst_ptr is not known to be 16-byte aligned here.
    _mm_storeu_si128((__m128i *)out, res);

    in += src_pitch;
    out += dst_pitch;
  }
}
328 
// Vertical 4-tap filter for a 16-sample-wide column, built from two calls to
// the 8-wide kernel: columns 0-7 first, then columns 8-15. All other
// arguments are forwarded unchanged.
static void aom_highbd_filter_block1d16_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int half = 0; half < 2; ++half) {
    const int x = half * 8;
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + x, src_pitch, dst_ptr + x,
                                       dst_pitch, height, filter, bd);
  }
}
337 
// Horizontal 4-tap filter for 16 output samples per row, built from two calls
// to the 8-wide kernel: columns 0-7 first, then columns 8-15. All other
// arguments are forwarded unchanged.
static void aom_highbd_filter_block1d16_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int half = 0; half < 2; ++half) {
    const int x = half * 8;
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + x, src_pitch, dst_ptr + x,
                                       dst_pitch, height, filter, bd);
  }
}
346 
347 // From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
348 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
349 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
350 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
351 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
352 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
353 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
354 
355 // From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
356 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
357 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
358 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
359 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
360 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
361 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
362 
363 // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
364 //                                      ptrdiff_t src_stride,
365 //                                      uint8_t *dst,
366 //                                      ptrdiff_t dst_stride,
367 //                                      const int16_t *filter_x,
368 //                                      int x_step_q4,
369 //                                      const int16_t *filter_y,
370 //                                      int y_step_q4,
371 //                                      int w, int h, int bd);
372 // void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
373 //                                     ptrdiff_t src_stride,
374 //                                     uint8_t *dst,
375 //                                     ptrdiff_t dst_stride,
376 //                                     const int16_t *filter_x,
377 //                                     int x_step_q4,
378 //                                     const int16_t *filter_y,
379 //                                     int y_step_q4,
380 //                                     int w, int h, int bd);
381 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
382 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
383