/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>  // for the assert() calls below
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"
// -----------------------------------------------------------------------------
// Copy and average

void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;

      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;

      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;

      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}

void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 16) {  // w = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (w > 8) {  // w = 16
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));

      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));

      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));

      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}
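
// Note: _mm256_avg_epu16() / _mm_avg_epu16() compute the rounded average
// (a + b + 1) >> 1 in each unsigned 16-bit lane, which is the same rounding
// the C reference convolve-avg path applies when merging the prediction into
// dst.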

// -----------------------------------------------------------------------------
// Horizontal and vertical filtering

static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };

#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
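
// The 8-tap kernels used by vpx are scaled so that their taps sum to
// 1 << FILTER_BITS = 128, so a filtered sample is recovered from the tap sum
// as (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS, i.e. (sum + 64) >> 7.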

// -----------------------------------------------------------------------------
// Horizontal Filtering

static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
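
// For reference: with 16 input pixels s0..s15 in *s, the low 128-bit lane of
// p[0] holds the word pairs (s0,s1) (s1,s2) (s2,s3) (s3,s4), ready for
// _mm256_madd_epi16() against a broadcast filter-tap pair; p[1]..p[3] hold
// the same pattern shifted by 2, 4 and 6 samples.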

// Note:
//  Shared by 8x2 and 16x1 blocks
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}

static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}

static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&s0, &s1, x);
}

static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i s0, s1;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&s0, &s1, x);
}

// Note:
//  Shared by horizontal and vertical filtering
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}
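
// pack_filters() broadcasts one tap pair per output register: every 32-bit
// element of f[0] holds (filter[0], filter[1]), f[1] holds
// (filter[2], filter[3]), and so on, matching the pair layout produced by
// pack_pixels().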

static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
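
// Since min(a0, a1) + max(a0, a1) == a0 + a1, the min/max sequence above
// computes the plain four-pair sum. The ordering mirrors the 8-bit AVX2
// filters, where the equivalent adds saturate at 16 bits and accumulating the
// smaller partial sum first avoids premature saturation; with 32-bit lanes
// here it is simply a safe ordering.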

static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y);
  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  a = _mm256_min_epi16(a, *mask);
  _mm256_storeu_si256((__m256i *)dst, a);
}
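
// In the stores above, packus saturates any negative filter sum to zero, and
// the signed 16-bit min then clamps against the (1 << bd) - 1 ceiling. Using
// a signed compare is safe because, for the tap values vpx uses, the rounded
// filter output always fits in 15 bits.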

static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap horizontal filtering

static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}
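
// The shuffle control 0x09080706 selects bytes 6..9 of each lane, i.e. 16-bit
// words 3 and 4, so every 32-bit element of f[0] holds the pair
// (filter[3], filter[4]): the only non-zero taps of a bilinear kernel stored
// in 8-tap form.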

// Used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(). The difference is
// whether s0/s1 hold the first and second rows, or the first 16 samples and
// the same window shifted by 8 samples.
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}

static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&r0, &r1, sig);
}

static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}

// Shared by the 8-wide and 16-wide 2-tap horizontal paths and, via
// filter_16x2_2t_pixels(), the 16-wide vertical path.
static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
                                       __m256i *y0, __m256i *y1) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  x1 = _mm256_add_epi32(x1, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
                                        __m256i *y0) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
  x0 = _mm256_add_epi32(x0, rounding);
  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
}

static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// Vertical Filtering

static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
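
// The vertical 8-tap path keeps a sliding window of row pairs: pack_8x9_init()
// loads rows 0..6 and interleaves consecutive rows into word pairs, with the
// pair sequence for output row r in the low 128-bit lane and the sequence for
// output row r + 1 in the high lane, so one filter_8x9_pixels() call produces
// two output rows. sig[8] caches the newest row for the next pack step.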

static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}

static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig, f, y0);
  filter_8x1_pixels(&sig[4], f, y1);
}

static INLINE void update_pixels(__m256i *sig) {
  int i;
  for (i = 0; i < 3; ++i) {
    sig[i] = sig[i + 1];
    sig[i + 4] = sig[i + 5];
  }
}
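
// After two output rows are produced, update_pixels() slides the window: each
// row-pair register moves down one tap-pair slot (sig[i] = sig[i + 1]), so the
// next iteration only needs the two fresh rows packed by pack_8x9_pixels().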

static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}

static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}

static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}

static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  __m256i p = _mm256_min_epi16(*y0, *mask);
  _mm256_storeu_si256((__m256i *)dst, p);
  p = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static void update_16x9_pixels(__m256i *sig) {
  update_pixels(&sig[0]);
  update_pixels(&sig[8]);
}

static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}

// -----------------------------------------------------------------------------
// 2-tap vertical filtering

static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}

static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // load the next row
  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}
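
// The 2-tap vertical filter needs only two live rows: sig[2] caches the
// current row, and each step interleaves it with the next row into lo/hi word
// pairs before the cached row is replaced.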

static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}

static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m128i p = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(h, p);
}

static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}

static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // load the next row
  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], u);
  sig[1] = _mm_unpackhi_epi16(sig[2], u);
  sig[2] = u;
}

static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
                                      __m128i *y0, __m128i *y1) {
  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m128i x0 = _mm_madd_epi16(sig[0], *f);
  __m128i x1 = _mm_madd_epi16(sig[1], *f);
  x0 = _mm_add_epi32(x0, rounding);
  x1 = _mm_add_epi32(x1, rounding);
  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
}

static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  res = _mm_min_epi16(res, *mask);
  _mm_storeu_si128((__m128i *)dst, res);
}

static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

// Calculation with averaging into the destination pixels

static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
                                        uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y0);
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}

static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
  const __m256i pix =
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}

static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm256_storeu_si256((__m256i *)dst, a);
}

static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
  __m256i p = _mm256_min_epi16(*y0, *mask);
  p = _mm256_avg_epu16(p, pix0);
  _mm256_storeu_si256((__m256i *)dst, p);

  p = _mm256_min_epi16(*y1, *mask);
  p = _mm256_avg_epu16(p, pix1);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}

static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
                                               const __m128i *y1,
                                               const __m128i *mask,
                                               uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, *mask);
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}
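
// Note the order of operations in the *_avg stores: the filtered result is
// clamped to the bd ceiling first and then averaged with the existing dst
// pixels, i.e. the average is taken between two valid-range predictions.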

static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

static void vpx_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We extract the middle four elements of the kernel into two registers in
  // the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum. Calling add on the two
  // halves gives us the output. Since AVX2 gives us 256-bit registers, we can
  // process two rows at a time.

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg;
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
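  // Shuffle selector 0x55 replicates 32-bit element 1 of the source, i.e. the
  // tap pair (k[2], k[3]), across the register; 0xaa replicates element 2,
  // i.e. (k[4], k[5]).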

  for (h = height; h >= 2; h -= 2) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed
  if (h > 0) {
    // Load the source
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
  }
}
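
// mm256_loadu2_si128(), mm256_loadu2_epi64(), mm256_storeu2_epi64(),
// mm256_store2_si128(), mm256_madd_add_epi32() and mm256_round_epi32() are
// not AVX2 intrinsics but small helpers from vpx_dsp/x86/convolve_avx2.h,
// included at the top of this file.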
1012 
vpx_highbd_filter_block1d8_h4_avx2(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst_ptr,ptrdiff_t dst_stride,uint32_t height,const int16_t * kernel,int bd)1013 static void vpx_highbd_filter_block1d8_h4_avx2(
1014     const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1015     ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1016   // We will extract the middle four elements of the kernel into two registers
1017   // in the form
1018   // ... k[3] k[2] k[3] k[2]
1019   // ... k[5] k[4] k[5] k[4]
1020   // Then we shuffle the source into
1021   // ... s[1] s[0] s[0] s[-1]
1022   // ... s[3] s[2] s[2] s[1]
1023   // Calling multiply and add gives us half of the sum of the first half.
1024   // Calling add gives us first half of the output. Repat again to get the whole
1025   // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows
1026   // at a time.
1027 
1028   __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
1029   __m256i res_reg, res_first, res_last;
1030   __m256i idx_shift_0 =
1031       _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
1032                        3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
1033   __m256i idx_shift_2 =
1034       _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
1035                        5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
1036 
1037   __m128i kernel_reg_128;  // Kernel
1038   __m256i kernel_reg, kernel_reg_23,
1039       kernel_reg_45;  // Segments of the kernel used
1040   const __m256i reg_round =
1041       _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
1042   const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
1043   const ptrdiff_t unrolled_src_stride = src_stride << 1;
1044   const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
1045   int h;
1046 
1047   // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
1048   src_ptr -= 1;
1049 
1050   // Load Kernel
1051   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
1052   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
1053   kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
1054   kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
1055 
1056   for (h = height; h >= 2; h -= 2) {
1057     // Load the source
1058     src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
1059     src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1060     src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1061 
1062     // Result for first half
1063     res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1064                                      &kernel_reg_23, &kernel_reg_45);
1065 
1066     // Do again to get the second half of dst
1067     // Load the source
1068     src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
1069     src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1070     src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1071 
1072     // Result for second half
1073     res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1074                                     &kernel_reg_23, &kernel_reg_45);
1075 
1076     // Round each result
1077     res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
1078     res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
1079 
1080     // Finally combine to get the final dst
1081     res_reg = _mm256_packus_epi32(res_first, res_last);
1082     res_reg = _mm256_min_epi16(res_reg, reg_max);
1083     mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
1084                        &res_reg);
1085 
1086     src_ptr += unrolled_src_stride;
1087     dst_ptr += unrolled_dst_stride;
1088   }
1089 
1090   // Repeat for the last row if needed
1091   if (h > 0) {
1092     src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
1093     src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
1094     src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
1095 
1096     res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
1097                                    &kernel_reg_23, &kernel_reg_45);
1098 
1099     res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
1100 
1101     res_reg = _mm256_packus_epi32(res_reg, res_reg);
1102     res_reg = _mm256_min_epi16(res_reg, reg_max);
1103 
1104     mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
1105   }
1106 }
1107 
vpx_highbd_filter_block1d16_h4_avx2(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst_ptr,ptrdiff_t dst_stride,uint32_t height,const int16_t * kernel,int bd)1108 static void vpx_highbd_filter_block1d16_h4_avx2(
1109     const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
1110     ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
1111   vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
1112                                      height, kernel, bd);
1113   vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
1114                                      dst_stride, height, kernel, bd);
1115 }
1116 
vpx_highbd_filter_block1d8_v8_avg_avx2(const uint16_t * src_ptr,ptrdiff_t src_pitch,uint16_t * dst_ptr,ptrdiff_t dst_pitch,uint32_t height,const int16_t * filter,int bd)1117 static void vpx_highbd_filter_block1d8_v8_avg_avx2(
1118     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1119     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1120   __m256i signal[9], res0, res1;
1121   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1122 
1123   __m256i ff[4];
1124   pack_filters(filter, ff);
1125 
1126   pack_8x9_init(src_ptr, src_pitch, signal);
1127 
1128   do {
1129     pack_8x9_pixels(src_ptr, src_pitch, signal);
1130 
1131     filter_8x9_pixels(signal, ff, &res0, &res1);
1132     store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1133     update_pixels(signal);
1134 
1135     src_ptr += src_pitch << 1;
1136     dst_ptr += dst_pitch << 1;
1137     height -= 2;
1138   } while (height > 0);
1139 }
1140 
vpx_highbd_filter_block1d16_v8_avg_avx2(const uint16_t * src_ptr,ptrdiff_t src_pitch,uint16_t * dst_ptr,ptrdiff_t dst_pitch,uint32_t height,const int16_t * filter,int bd)1141 static void vpx_highbd_filter_block1d16_v8_avg_avx2(
1142     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1143     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1144   __m256i signal[17], res0, res1;
1145   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1146 
1147   __m256i ff[4];
1148   pack_filters(filter, ff);
1149 
1150   pack_16x9_init(src_ptr, src_pitch, signal);
1151 
1152   do {
1153     pack_16x9_pixels(src_ptr, src_pitch, signal);
1154     filter_16x9_pixels(signal, ff, &res0, &res1);
1155     store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1156     update_16x9_pixels(signal);
1157 
1158     src_ptr += src_pitch << 1;
1159     dst_ptr += dst_pitch << 1;
1160     height -= 2;
1161   } while (height > 0);
1162 }
1163 
vpx_highbd_filter_block1d8_h2_avg_avx2(const uint16_t * src_ptr,ptrdiff_t src_pitch,uint16_t * dst_ptr,ptrdiff_t dst_pitch,uint32_t height,const int16_t * filter,int bd)1164 static void vpx_highbd_filter_block1d8_h2_avg_avx2(
1165     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1166     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1167   __m256i signal[2], res0, res1;
1168   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1169 
1170   __m256i ff;
1171   pack_2t_filter(filter, &ff);
1172 
1173   src_ptr -= 3;
1174   do {
1175     pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
1176     filter_16_2t_pixels(signal, &ff, &res0, &res1);
1177     store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
1178     height -= 2;
1179     src_ptr += src_pitch << 1;
1180     dst_ptr += dst_pitch << 1;
1181   } while (height > 1);
1182 
1183   if (height > 0) {
1184     pack_8x1_2t_pixels(src_ptr, signal);
1185     filter_8x1_2t_pixels(signal, &ff, &res0);
1186     store_8x1_avg_pixels(&res0, &max, dst_ptr);
1187   }
1188 }
1189 
vpx_highbd_filter_block1d16_h2_avg_avx2(const uint16_t * src_ptr,ptrdiff_t src_pitch,uint16_t * dst_ptr,ptrdiff_t dst_pitch,uint32_t height,const int16_t * filter,int bd)1190 static void vpx_highbd_filter_block1d16_h2_avg_avx2(
1191     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1192     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1193   __m256i signal[2], res0, res1;
1194   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1195 
1196   __m256i ff;
1197   pack_2t_filter(filter, &ff);
1198 
1199   src_ptr -= 3;
1200   do {
1201     pack_16x1_2t_pixels(src_ptr, signal);
1202     filter_16_2t_pixels(signal, &ff, &res0, &res1);
1203     store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1204     height -= 1;
1205     src_ptr += src_pitch;
1206     dst_ptr += dst_pitch;
1207   } while (height > 0);
1208 }
1209 
vpx_highbd_filter_block1d16_v2_avg_avx2(const uint16_t * src_ptr,ptrdiff_t src_pitch,uint16_t * dst_ptr,ptrdiff_t dst_pitch,uint32_t height,const int16_t * filter,int bd)1210 static void vpx_highbd_filter_block1d16_v2_avg_avx2(
1211     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
1212     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
1213   __m256i signal[3], res0, res1;
1214   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
1215   __m256i ff;
1216 
1217   pack_2t_filter(filter, &ff);
1218   pack_16x2_init(src_ptr, signal);
1219 
1220   do {
1221     pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
1222     filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
1223     store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
1224 
1225     src_ptr += src_pitch;
1226     dst_ptr += dst_pitch;
1227     height -= 1;
1228   } while (height > 0);
1229 }
1230 
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}

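// 4-tap vertical filter, 4 pixels wide. Only taps 2..5 of the 8-tap kernel
// array are nonzero for a 4-tap filter (hence kernel_reg_23/kernel_reg_45
// below), and the two 128-bit lanes of each ymm register carry consecutive
// rows, so each loop iteration emits two output rows.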
static void vpx_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get partial
  // output. Then we can call add with another row to get the output.

  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source.
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001, src_reg_1223;

  // Result after multiply and add
  __m256i res_reg;

  __m128i kernel_reg_128;                            // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used

  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);

    // Output
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the 32-bit intermediates
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    // Save the result
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    // Advance source and destination by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001 = src_reg_1223;
    src_reg_1 = src_reg_3;
  }
}

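// 8-pixel-wide 4-tap vertical filter. Same two-rows-per-iteration scheme as
// the 4-wide version above, but a row now fills a full 128-bit lane, so the
// low and high 16-bit interleaves are filtered separately and repacked.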
static void vpx_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get partial
  // output. Then we can call add with another row to get the output.

  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is the first half, hi the second.
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;

  __m128i kernel_reg_128;                            // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel

  // Result after multiply and add
  __m256i res_reg, res_reg_lo, res_reg_hi;

  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);

    // Output from first half
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
                                      &kernel_reg_23, &kernel_reg_45);

    // Output from second half
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
                                      &kernel_reg_23, &kernel_reg_45);

    // Round the 32-bit intermediates
    res_reg_lo =
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_hi =
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);

    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    // Save the result
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);

    // Advance source and destination by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001_lo = src_reg_1223_lo;
    src_reg_m1001_hi = src_reg_1223_hi;
    src_reg_1 = src_reg_3;
  }
}

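// 16-wide 4-tap vertical filter: two independent 8-wide passes over the left
// and right halves of the block.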
static void vpx_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
                                     height, kernel, bd);
  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
                                     dst_stride, height, kernel, bd);
}

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;

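// There is no 4-wide AVX2 implementation of the 8-tap or 2-tap filters, so
// alias those entries to the SSE2 assembly versions declared above.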
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

// Use the [vh]8 versions because there are no averaging [vh]4
// implementations.
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
  vpx_highbd_filter_block1d16_v8_avg_avx2
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
  vpx_highbd_filter_block1d16_h8_avg_avx2
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
  vpx_highbd_filter_block1d8_v8_avg_avx2
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
  vpx_highbd_filter_block1d8_h8_avg_avx2
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_avx2
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_avx2

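// HIGH_FUN_CONV_1D and HIGH_FUN_CONV_2D come from vpx_dsp/x86/convolve.h.
// With these arguments they define the public entry points
// vpx_highbd_convolve8_horiz_avx2, vpx_highbd_convolve8_vert_avx2 and
// vpx_highbd_convolve8_avx2, which dispatch to the block1d kernels above
// based on block width and on which taps of the kernel are nonzero.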
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
HIGH_FUN_CONV_2D(, avx2, 0)

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;

#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

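// Averaging counterparts, expanding to vpx_highbd_convolve8_avg_horiz_avx2,
// vpx_highbd_convolve8_avg_vert_avx2 and vpx_highbd_convolve8_avg_avx2.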
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
HIGH_FUN_CONV_2D(avg_, avx2, 1)

#undef HIGHBD_FUNC