1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"
15
16 // -----------------------------------------------------------------------------
17 // Copy and average
18
// Straight block copy.  The filter arguments are part of the common
// convolve prototype but are ignored here.
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64: four 256-bit transfers per row.
    do {
      const __m256i c0 = _mm256_loadu_si256((const __m256i *)(src + 0));
      const __m256i c1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i c2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i c3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      _mm256_storeu_si256((__m256i *)(dst + 0), c0);
      _mm256_storeu_si256((__m256i *)(dst + 16), c1);
      _mm256_storeu_si256((__m256i *)(dst + 32), c2);
      _mm256_storeu_si256((__m256i *)(dst + 48), c3);
      src += src_stride;
      dst += dst_stride;
    } while (--h > 0);
  } else if (w > 16) {  // w = 32: two 256-bit transfers per row.
    do {
      const __m256i c0 = _mm256_loadu_si256((const __m256i *)(src + 0));
      const __m256i c1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      _mm256_storeu_si256((__m256i *)(dst + 0), c0);
      _mm256_storeu_si256((__m256i *)(dst + 16), c1);
      src += src_stride;
      dst += dst_stride;
    } while (--h > 0);
  } else if (w > 8) {  // w = 16: copy rows in pairs.
    do {
      const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i r1 =
          _mm256_loadu_si256((const __m256i *)(src + src_stride));
      _mm256_storeu_si256((__m256i *)dst, r0);
      _mm256_storeu_si256((__m256i *)(dst + dst_stride), r1);
      src += src_stride * 2;
      dst += dst_stride * 2;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8: 128-bit transfers, rows in pairs.
    do {
      const __m128i r0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i r1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      _mm_storeu_si128((__m128i *)dst, r0);
      _mm_storeu_si128((__m128i *)(dst + dst_stride), r1);
      src += src_stride * 2;
      dst += dst_stride * 2;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4: 64-bit transfers, rows in pairs.
    do {
      const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
      const __m128i r1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      _mm_storel_epi64((__m128i *)dst, r0);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), r1);
      src += src_stride * 2;
      dst += dst_stride * 2;
      h -= 2;
    } while (h > 0);
  }
}
100
// Rounding average of the source block with the existing destination
// pixels (compound prediction).  Filter arguments are unused.
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h, int bd) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;
  (void)bd;

  assert(w % 4 == 0);
  if (w > 32) {  // w = 64
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      const __m256i d0 = _mm256_loadu_si256((const __m256i *)(dst + 0));
      const __m256i d1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      const __m256i d2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      const __m256i d3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)(dst + 0), _mm256_avg_epu16(s0, d0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(s1, d1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(s2, d2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(s3, d3));
      src += src_stride;
      dst += dst_stride;
    } while (--h > 0);
  } else if (w > 16) {  // w = 32
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i d0 = _mm256_loadu_si256((const __m256i *)(dst + 0));
      const __m256i d1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)(dst + 0), _mm256_avg_epu16(s0, d0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(s1, d1));
      src += src_stride;
      dst += dst_stride;
    } while (--h > 0);
  } else if (w > 8) {  // w = 16: rows processed in pairs.
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i s1 =
          _mm256_loadu_si256((const __m256i *)(src + src_stride));
      const __m256i d0 = _mm256_loadu_si256((const __m256i *)dst);
      const __m256i d1 =
          _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(s0, d0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(s1, d1));
      src += src_stride * 2;
      dst += dst_stride * 2;
      h -= 2;
    } while (h > 0);
  } else if (w > 4) {  // w = 8
    do {
      const __m128i s0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      const __m128i d0 = _mm_loadu_si128((const __m128i *)dst);
      const __m128i d1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(s0, d0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(s1, d1));
      src += src_stride * 2;
      dst += dst_stride * 2;
      h -= 2;
    } while (h > 0);
  } else {  // w = 4
    do {
      const __m128i s0 = _mm_loadl_epi64((const __m128i *)src);
      const __m128i s1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      const __m128i d0 = _mm_loadl_epi64((const __m128i *)dst);
      const __m128i d1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(s0, d0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(s1, d1));
      src += src_stride * 2;
      dst += dst_stride * 2;
      h -= 2;
    } while (h > 0);
  }
}
191
192 // -----------------------------------------------------------------------------
193 // Horizontal and vertical filtering
194
// Byte-shuffle patterns for the horizontal filters.  Each 32-byte pattern
// is the same 16-byte selection repeated for both 128-bit lanes; it picks
// overlapping pairs of 16-bit samples so _mm256_madd_epi16 can
// multiply-accumulate adjacent pixels against a broadcast tap pair.
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

// 32-bit permute indices: select dwords 2..5 into both lanes, i.e. the
// input row advanced by four 16-bit samples.
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };

// The convolution keeps 7 fractional bits; CONV8_ROUNDING_NUM is the
// round-to-nearest bias added before the final right shift.
#define CONV8_ROUNDING_BITS (7)
#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
213
214 // -----------------------------------------------------------------------------
215 // Horizontal Filtering
216
// Repack one register of 16-bit pixels into the four sample-pair layouts
// consumed by filter_8x1_pixels().  'c' is the input advanced by four
// samples (dwords 2..5 broadcast to both lanes), which supplies the
// remaining phase shifts.
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
228
229 // Note:
230 // Shared by 8x2 and 16x1 block
// Pack two source registers (two 8-pixel rows, or the two halves of one
// 16-pixel row) into the eight signal registers used by the two
// filter_8x1_pixels() calls.  The lane permutes pair the corresponding
// phase shifts of s0 and s1; x[4]/x[5] reuse x[2]/x[3] because those
// phases are shared between the two halves.
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}
245
// Pack a single 8-pixel row (used for the odd trailing row of a block).
// The 0x30 lane selects recombine the four phase registers so one
// filter_8x1_pixels() call covers all eight outputs.
static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}
256
pack_8x2_pixels(const uint16_t * src,ptrdiff_t stride,__m256i * x)257 static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
258 __m256i *x) {
259 __m256i s0, s1;
260 s0 = _mm256_loadu_si256((const __m256i *)src);
261 s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
262 pack_16_pixels(&s0, &s1, x);
263 }
264
pack_16x1_pixels(const uint16_t * src,__m256i * x)265 static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
266 __m256i s0, s1;
267 s0 = _mm256_loadu_si256((const __m256i *)src);
268 s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
269 pack_16_pixels(&s0, &s1, x);
270 }
271
272 // Note:
273 // Shared by horizontal and vertical filtering
pack_filters(const int16_t * filter,__m256i * f)274 static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
275 const __m128i h = _mm_loadu_si128((const __m128i *)filter);
276 const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
277 const __m256i p0 = _mm256_set1_epi32(0x03020100);
278 const __m256i p1 = _mm256_set1_epi32(0x07060504);
279 const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
280 const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
281 f[0] = _mm256_shuffle_epi8(hh, p0);
282 f[1] = _mm256_shuffle_epi8(hh, p1);
283 f[2] = _mm256_shuffle_epi8(hh, p2);
284 f[3] = _mm256_shuffle_epi8(hh, p3);
285 }
286
// Multiply-accumulate the four packed sample-pair registers against the
// four packed tap-pair registers, producing eight rounded 32-bit results.
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  // The two middle products are added smaller-first.
  // NOTE(review): presumably this ordering keeps the intermediate sum
  // inside int32 range for the largest taps -- confirm before reordering.
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  // Round to nearest and drop the 7 fractional filter bits.
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
313
store_8x1_pixels(const __m256i * y,const __m256i * mask,uint16_t * dst)314 static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
315 uint16_t *dst) {
316 const __m128i a0 = _mm256_castsi256_si128(*y);
317 const __m128i a1 = _mm256_extractf128_si256(*y, 1);
318 __m128i res = _mm_packus_epi32(a0, a1);
319 res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
320 _mm_storeu_si128((__m128i *)dst, res);
321 }
322
store_8x2_pixels(const __m256i * y0,const __m256i * y1,const __m256i * mask,uint16_t * dst,ptrdiff_t pitch)323 static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
324 const __m256i *mask, uint16_t *dst,
325 ptrdiff_t pitch) {
326 __m256i a = _mm256_packus_epi32(*y0, *y1);
327 a = _mm256_min_epi16(a, *mask);
328 _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
329 _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
330 }
331
store_16x1_pixels(const __m256i * y0,const __m256i * y1,const __m256i * mask,uint16_t * dst)332 static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
333 const __m256i *mask, uint16_t *dst) {
334 __m256i a = _mm256_packus_epi32(*y0, *y1);
335 a = _mm256_min_epi16(a, *mask);
336 _mm256_storeu_si256((__m256i *)dst, a);
337 }
338
// 8-tap horizontal filter, 8-pixel-wide blocks.  Two rows per iteration,
// with a single-row tail pass when height is odd.
static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Clamp limit for the working bit depth (e.g. 1023 for 10-bit).
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Step back so the 8-tap window begins 3 samples before each output.
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    // Odd height: one remaining row.
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
365
// 8-tap horizontal filter, 16-pixel-wide blocks, one row per iteration.
static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Clamp limit for the working bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Step back so the 8-tap window begins 3 samples before each output.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
386
387 // -----------------------------------------------------------------------------
388 // 2-tap horizontal filtering
389
pack_2t_filter(const int16_t * filter,__m256i * f)390 static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
391 const __m128i h = _mm_loadu_si128((const __m128i *)filter);
392 const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
393 const __m256i p = _mm256_set1_epi32(0x09080706);
394 f[0] = _mm256_shuffle_epi8(hh, p);
395 }
396
397 // can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
398 // the difference is s0/s1 specifies first and second rows or,
399 // first 16 samples and 8-sample shifted 16 samples
// Pack two source registers for the 2-tap filter: pattern 2 pairs each
// sample with its right neighbour, and the dword permute supplies the
// copy advanced by four samples for the upper outputs.
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}
413
pack_8x2_2t_pixels(const uint16_t * src,const ptrdiff_t pitch,__m256i * sig)414 static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
415 const ptrdiff_t pitch, __m256i *sig) {
416 const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
417 const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
418 pack_16_2t_pixels(&r0, &r1, sig);
419 }
420
pack_16x1_2t_pixels(const uint16_t * src,__m256i * sig)421 static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
422 __m256i *sig /*sig[2]*/) {
423 const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
424 const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
425 pack_16_2t_pixels(&r0, &r1, sig);
426 }
427
// Pack a single 8-pixel row for the 2-tap filter (odd trailing row):
// both the direct and the 4-sample-advanced selections are combined into
// one register.
static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}
438
439 // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
filter_16_2t_pixels(const __m256i * sig,const __m256i * f,__m256i * y0,__m256i * y1)440 static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
441 __m256i *y0, __m256i *y1) {
442 const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
443 __m256i x0 = _mm256_madd_epi16(sig[0], *f);
444 __m256i x1 = _mm256_madd_epi16(sig[1], *f);
445 x0 = _mm256_add_epi32(x0, rounding);
446 x1 = _mm256_add_epi32(x1, rounding);
447 *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
448 *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
449 }
450
filter_8x1_2t_pixels(const __m256i * sig,const __m256i * f,__m256i * y0)451 static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
452 __m256i *y0) {
453 const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
454 __m256i x0 = _mm256_madd_epi16(sig[0], *f);
455 x0 = _mm256_add_epi32(x0, rounding);
456 *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
457 }
458
// 2-tap (bilinear) horizontal filter, 8-wide blocks.  Only taps 3 and 4
// of the kernel are applied (see pack_2t_filter); src_ptr is stepped back
// by 3 so the 2-tap packing patterns land on those taps' samples.
static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  // Clamp limit for the working bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  if (height > 0) {
    // Odd height: one remaining row.
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
484
// 2-tap (bilinear) horizontal filter, 16-wide blocks, one row per pass.
static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  // Clamp limit for the working bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Step back so the packing patterns select the samples for taps 3/4.
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
504
505 // -----------------------------------------------------------------------------
506 // Vertical Filtering
507
// Prime the vertical-filter window: load rows 0..6 (8 pixels each) and
// pair consecutive rows in one register (row i in the low lane, row i+1
// in the high lane) before interleaving them for madd.  sig[8] caches the
// last loaded row so pack_8x9_pixels() can extend the window.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // sN now holds rows (N, N+1): row N low lane, row N+1 high lane.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
538
// Extend the window with rows base+7 and base+8: sig[3]/sig[7] interleave
// rows (6,7) in the low lane and (7,8) in the high lane.  sig[8] is
// updated to the newest row for the next call.
static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}
553
filter_8x9_pixels(const __m256i * sig,const __m256i * f,__m256i * y0,__m256i * y1)554 static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
555 __m256i *y0, __m256i *y1) {
556 filter_8x1_pixels(sig, f, y0);
557 filter_8x1_pixels(&sig[4], f, y1);
558 }
559
update_pixels(__m256i * sig)560 static INLINE void update_pixels(__m256i *sig) {
561 int i;
562 for (i = 0; i < 3; ++i) {
563 sig[i] = sig[i + 1];
564 sig[i + 4] = sig[i + 5];
565 }
566 }
567
// 8-tap vertical filter, 8-wide blocks.  Keeps a rolling 9-row window in
// 'signal' and emits two rows per iteration (the h -= 2 loop relies on an
// even height).
static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  // Clamp limit for the working bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Pre-pack the first seven rows; each loop pass adds two more.
  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
591
// 16-wide version of pack_8x9_init(): load rows 0..6 and interleave
// vertically adjacent row pairs.  sig[0..7] cover the low 8 columns,
// sig[8..15] the high 8 columns, and sig[16] caches the last loaded row.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}
641
// Extend the 16-wide window with rows base+7 and base+8, filling the
// remaining low-column (sig[3]/sig[7]) and high-column (sig[11]/sig[15])
// interleaves.  sig[16] keeps the newest row for the next call.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}
664
// Run the 8-tap filter over all four column groups of the 16-wide window,
// then pack the 32-bit results to 16-bit and rearrange the lanes so *y0
// and *y1 each hold one complete 16-pixel output row (already packed;
// store_16x2_pixels() only clamps and stores).
static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}
680
store_16x2_pixels(const __m256i * y0,const __m256i * y1,const __m256i * mask,uint16_t * dst,ptrdiff_t pitch)681 static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
682 const __m256i *mask, uint16_t *dst,
683 ptrdiff_t pitch) {
684 __m256i p = _mm256_min_epi16(*y0, *mask);
685 _mm256_storeu_si256((__m256i *)dst, p);
686 p = _mm256_min_epi16(*y1, *mask);
687 _mm256_storeu_si256((__m256i *)(dst + pitch), p);
688 }
689
// Advance the low-column and high-column signal windows by one row pair.
static void update_16x9_pixels(__m256i *sig) {
  update_pixels(sig);
  update_pixels(sig + 8);
}
694
// 8-tap vertical filter, 16-wide blocks.  Rolling 9-row window, two
// output rows per iteration (the h -= 2 loop relies on an even height).
static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  // Clamp limit for the working bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Pre-pack the first seven rows; each loop pass adds two more.
  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
717
718 // -----------------------------------------------------------------------------
719 // 2-tap vertical filtering
720
// Prime the 2-tap vertical window with the first source row.
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}
724
pack_16x2_2t_pixels(const uint16_t * src,ptrdiff_t pitch,__m256i * sig)725 static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
726 __m256i *sig) {
727 // load the next row
728 const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
729 sig[0] = _mm256_unpacklo_epi16(sig[2], u);
730 sig[1] = _mm256_unpackhi_epi16(sig[2], u);
731 sig[2] = u;
732 }
733
// Thin alias: the vertical 2-tap math is identical to the horizontal one.
static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}
738
// 2-tap (bilinear) vertical filter, 16-wide blocks: averages each row
// with the next using taps 3/4 of the kernel, one output row per pass.
static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  // Clamp limit for the working bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
759
pack_8x1_2t_filter(const int16_t * filter,__m128i * f)760 static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
761 const __m128i h = _mm_loadu_si128((const __m128i *)filter);
762 const __m128i p = _mm_set1_epi32(0x09080706);
763 f[0] = _mm_shuffle_epi8(h, p);
764 }
765
// Prime the 8-wide 2-tap vertical window with the first source row.
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}
769
pack_8x2_2t_pixels_ver(const uint16_t * src,ptrdiff_t pitch,__m128i * sig)770 static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
771 __m128i *sig) {
772 // load the next row
773 const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
774 sig[0] = _mm_unpacklo_epi16(sig[2], u);
775 sig[1] = _mm_unpackhi_epi16(sig[2], u);
776 sig[2] = u;
777 }
778
filter_8_2t_pixels(const __m128i * sig,const __m128i * f,__m128i * y0,__m128i * y1)779 static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
780 __m128i *y0, __m128i *y1) {
781 const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
782 __m128i x0 = _mm_madd_epi16(sig[0], *f);
783 __m128i x1 = _mm_madd_epi16(sig[1], *f);
784 x0 = _mm_add_epi32(x0, rounding);
785 x1 = _mm_add_epi32(x1, rounding);
786 *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
787 *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
788 }
789
// Pack two vectors of 32-bit results to 16 bits with unsigned saturation,
// clamp to the bit-depth mask, and store 8 output pixels.
static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  res = _mm_min_epi16(res, *mask);
  _mm_storeu_si128((__m128i *)dst, res);
}
796
// Vertical 2-tap (bilinear) filter for 8-wide high bit-depth blocks.
static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i sig[3];
  __m128i lo, hi;
  const __m128i clamp = _mm_set1_epi16((1 << bd) - 1);
  __m128i taps;

  pack_8x1_2t_filter(filter, &taps);
  pack_8x2_init(src_ptr, sig);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, sig);
    filter_8_2t_pixels(sig, &taps, &lo, &hi);
    store_8x1_2t_pixels_ver(&lo, &hi, &clamp, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
817
// Calculation with averaging: the filtered result is averaged with the
// pixels already present in the destination buffer.
819
// Pack the 8 x 32-bit results in *y0 (split across the two 128-bit halves)
// down to 16 bits, clamp to the bit-depth mask, average with the pixels
// already in dst (rounding up, via _mm_avg_epu16), and store 8 pixels.
static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
                                        uint16_t *dst) {
  const __m128i a0 = _mm256_castsi256_si128(*y0);
  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
  __m128i res = _mm_packus_epi32(a0, a1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}
830
// Pack two row results to 16 bits, clamp, average with the two existing dst
// rows, and store 8 pixels each to dst and dst + pitch. Both dst rows are
// loaded before any store is issued.
static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                        const __m256i *mask, uint16_t *dst,
                                        ptrdiff_t pitch) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
  const __m256i pix =
      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
}
844
// Pack 16 results to 16 bits, clamp to the bit-depth mask, average with the
// existing dst row, and store 16 pixels.
static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst) {
  __m256i a = _mm256_packus_epi32(*y0, *y1);
  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
  a = _mm256_min_epi16(a, *mask);
  a = _mm256_avg_epu16(a, pix);
  _mm256_storeu_si256((__m256i *)dst, a);
}
853
// Clamp two already-packed 16-pixel rows, average each with the corresponding
// existing dst row, and store to dst and dst + pitch. Both dst rows are
// loaded up front, before the first store.
static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
                                         const __m256i *mask, uint16_t *dst,
                                         ptrdiff_t pitch) {
  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
  __m256i p = _mm256_min_epi16(*y0, *mask);
  p = _mm256_avg_epu16(p, pix0);
  _mm256_storeu_si256((__m256i *)dst, p);

  p = _mm256_min_epi16(*y1, *mask);
  p = _mm256_avg_epu16(p, pix1);
  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
}
867
// Averaging variant of the 8-pixel store used by the vertical 2-tap path:
// pack to 16 bits, clamp, average with the existing dst pixels, store.
static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
                                               const __m128i *y1,
                                               const __m128i *mask,
                                               uint16_t *dst) {
  __m128i res = _mm_packus_epi32(*y0, *y1);
  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
  res = _mm_min_epi16(res, *mask);
  res = _mm_avg_epu16(res, pix);
  _mm_storeu_si128((__m128i *)dst, res);
}
878
// Horizontal 8-tap filter with dst averaging for 8-wide blocks. Rows are
// processed two at a time, with a separate pass for a trailing odd row.
static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i sig[8];
  __m256i row0, row1;
  const __m256i clamp = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];

  pack_filters(filter, taps);

  // Step back to cover the left taps of the 8-tap kernel.
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, sig);
    filter_8x1_pixels(sig, taps, &row0);
    filter_8x1_pixels(&sig[4], taps, &row1);
    store_8x2_avg_pixels(&row0, &row1, &clamp, dst_ptr, dst_pitch);
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 1);

  // Handle the final row when height is odd.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, sig);
    filter_8x1_pixels(sig, taps, &row0);
    store_8x1_avg_pixels(&row0, &clamp, dst_ptr);
  }
}
905
// Horizontal 8-tap filter with dst averaging for 16-wide blocks, one row per
// iteration (the 16 outputs are computed as two 8-wide halves).
static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i sig[8];
  __m256i row_lo, row_hi;
  const __m256i clamp = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];

  pack_filters(filter, taps);

  // Step back to cover the left taps of the 8-tap kernel.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, sig);
    filter_8x1_pixels(sig, taps, &row_lo);
    filter_8x1_pixels(&sig[4], taps, &row_hi);
    store_16x1_avg_pixels(&row_lo, &row_hi, &clamp, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
926
// Horizontal 4-tap filter for 4-wide high bit-depth blocks.
static void vpx_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We extract the middle four elements of the kernel into two registers in
  // the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum. Calling add on the two
  // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we
  // can do this two rows at a time.

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg;
  // Byte-shuffle masks that build the interleaved pixel pairs shown above;
  // the 16-byte pattern repeats in both 128-bit lanes (one row per lane).
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);  // Clamp limit
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  // 0x55 replicates dword 1 (k[3] k[2]); 0xaa replicates dword 2 (k[5] k[4]).
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  for (h = height; h >= 2; h -= 2) {
    // Load the source: one row in each 128-bit lane
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed
  if (h > 0) {
    // Load the source.
    // NOTE(review): only the low lane's 4 pixels are stored below, so the
    // high-lane load from src_ptr + 4 appears to be filler — confirm the
    // extra read cannot run past the buffer for the narrowest callers.
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Get the output
    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the result
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
  }
}
1012
// Horizontal 4-tap filter for 8-wide high bit-depth blocks.
static void vpx_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will extract the middle four elements of the kernel into two registers
  // in the form
  // ... k[3] k[2] k[3] k[2]
  // ... k[5] k[4] k[5] k[4]
  // Then we shuffle the source into
  // ... s[1] s[0] s[0] s[-1]
  // ... s[3] s[2] s[2] s[1]
  // Calling multiply and add gives us half of the sum of the first half.
  // Calling add gives us first half of the output. Repeat again to get the
  // whole output. Since avx2 allows us to use 256-bit buffer, we can do this
  // two rows at a time.

  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
  __m256i res_reg, res_first, res_last;
  // Byte-shuffle masks that build the interleaved pixel pairs shown above;
  // the 16-byte pattern repeats in both 128-bit lanes (one row per lane).
  __m256i idx_shift_0 =
      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
  __m256i idx_shift_2 =
      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23,
      kernel_reg_45;  // Segments of the kernel used
  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);  // Clamp limit
  const ptrdiff_t unrolled_src_stride = src_stride << 1;
  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  // 0x55 replicates dword 1 (k[3] k[2]); 0xaa replicates dword 2 (k[5] k[4]).
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  for (h = height; h >= 2; h -= 2) {
    // Load the source (first 4 output pixels of each of the two rows)
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Result for first half
    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                     &kernel_reg_23, &kernel_reg_45);

    // Do again to get the second half of dst
    // Load the source (last 4 output pixels of each row, offset by 4)
    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    // Result for second half
    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                    &kernel_reg_23, &kernel_reg_45);

    // Round each result
    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);

    // Finally combine to get the final dst
    res_reg = _mm256_packus_epi32(res_first, res_last);
    res_reg = _mm256_min_epi16(res_reg, reg_max);
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);

    src_ptr += unrolled_src_stride;
    dst_ptr += unrolled_dst_stride;
  }

  // Repeat for the last row if needed; one row split across the two lanes
  // (left half from src_ptr, right half from src_ptr + 4).
  if (h > 0) {
    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);

    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                   &kernel_reg_23, &kernel_reg_45);

    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
  }
}
1107
// Horizontal 4-tap filter for 16-wide blocks: run the 8-wide kernel over the
// left half (offset 0) and the right half (offset 8) independently.
static void vpx_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  int half;
  for (half = 0; half < 2; ++half) {
    const ptrdiff_t offset = half << 3;
    vpx_highbd_filter_block1d8_h4_avx2(src_ptr + offset, src_stride,
                                       dst_ptr + offset, dst_stride, height,
                                       kernel, bd);
  }
}
1116
// Vertical 8-tap filter with dst averaging for 8-wide blocks; two output rows
// per iteration, with the 9-row source window carried between iterations.
static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i sig[9];
  __m256i row0, row1;
  const __m256i clamp = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];

  pack_filters(filter, taps);
  pack_8x9_init(src_ptr, src_pitch, sig);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, sig);

    filter_8x9_pixels(sig, taps, &row0, &row1);
    store_8x2_avg_pixels(&row0, &row1, &clamp, dst_ptr, dst_pitch);
    update_pixels(sig);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
1140
// Vertical 8-tap filter with dst averaging for 16-wide blocks; two output
// rows per iteration, with the source window carried between iterations.
static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i sig[17];
  __m256i row0, row1;
  const __m256i clamp = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];

  pack_filters(filter, taps);
  pack_16x9_init(src_ptr, src_pitch, sig);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, sig);
    filter_16x9_pixels(sig, taps, &row0, &row1);
    store_16x2_avg_pixels(&row0, &row1, &clamp, dst_ptr, dst_pitch);
    update_16x9_pixels(sig);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
1163
// Horizontal 2-tap filter with dst averaging for 8-wide blocks; two rows per
// iteration plus a tail pass for an odd final row.
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i sig[2];
  __m256i row0, row1;
  const __m256i clamp = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps;

  pack_2t_filter(filter, &taps);

  // The active taps sit in the middle of the 8-tap kernel array, so back up
  // the source pointer to align them.
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, sig);
    filter_16_2t_pixels(sig, &taps, &row0, &row1);
    store_8x2_avg_pixels(&row0, &row1, &clamp, dst_ptr, dst_pitch);
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 1);

  // Handle the final row when height is odd.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, sig);
    filter_8x1_2t_pixels(sig, &taps, &row0);
    store_8x1_avg_pixels(&row0, &clamp, dst_ptr);
  }
}
1189
// Horizontal 2-tap filter with dst averaging for 16-wide blocks, one row per
// iteration.
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i sig[2];
  __m256i lo, hi;
  const __m256i clamp = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps;

  pack_2t_filter(filter, &taps);

  // The active taps sit in the middle of the 8-tap kernel array, so back up
  // the source pointer to align them.
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, sig);
    filter_16_2t_pixels(sig, &taps, &lo, &hi);
    store_16x1_avg_pixels(&lo, &hi, &clamp, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
1209
// Vertical 2-tap filter with dst averaging for 16-wide blocks, one row per
// iteration.
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i sig[3];
  __m256i lo, hi;
  const __m256i clamp = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps;

  pack_2t_filter(filter, &taps);
  pack_16x2_init(src_ptr, sig);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, sig);
    filter_16x2_2t_pixels(sig, &taps, &lo, &hi);
    store_16x1_avg_pixels(&lo, &hi, &clamp, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
1230
// Vertical 2-tap filter with dst averaging for 8-wide blocks, one row per
// iteration.
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i sig[3];
  __m128i lo, hi;
  const __m128i clamp = _mm_set1_epi16((1 << bd) - 1);
  __m128i taps;

  pack_8x1_2t_filter(filter, &taps);
  pack_8x2_init(src_ptr, sig);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, sig);
    filter_8_2t_pixels(sig, &taps, &lo, &hi);
    store_8x1_2t_avg_pixels_ver(&lo, &hi, &clamp, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
1251
// Vertical 4-tap filter for 4-wide high bit-depth blocks.
static void vpx_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel partial output. Then
  // we can call add with another row to get the output.

  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001, src_reg_1223;

  // Result after multiply and add
  __m256i res_reg;

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used

  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);  // Clamp limit
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  // 0x55 replicates dword 1 (k[3] k[2]); 0xaa replicates dword 2 (k[5] k[4]).
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  // NOTE(review): this is a 16-byte load, while the per-iteration row loads
  // below use 8-byte _mm_loadl_epi64 — confirm the extra read past the 4
  // pixels of a row is safe for all callers.
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows
  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);

  // Two output rows per iteration; the interleaved window and the newest row
  // are carried over at the bottom of the loop. There is no tail pass, so an
  // odd final row is not produced — assumes height is even; TODO confirm
  // against callers.
  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);

    // Output
    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
                                   &kernel_reg_23, &kernel_reg_45);

    // Round the words
    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);

    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg, res_reg);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    // Save the result
    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                        &res_reg);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001 = src_reg_1223;
    src_reg_1 = src_reg_3;
  }
}
1336
// Vertical 4-tap filter for 8-wide high bit-depth blocks.
static void vpx_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load two rows of pixels and rearrange them into the form
  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel partial output. Then
  // we can call add with another row to get the output.

  // Register for source s[-1:3, :]
  __m256i src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;

  __m128i kernel_reg_128;  // Kernel
  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel

  // Result after multiply and add
  __m256i res_reg, res_reg_lo, res_reg_hi;

  const __m256i reg_round =
      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);  // Clamp limit
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load Kernel
  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
  // 0x55 replicates dword 1 (k[3] k[2]); 0xaa replicates dword 2 (k[5] k[4]).
  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);

  // Row -1 to row 0
  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
                                   (const __m128i *)(src_ptr + src_stride));

  // Row 0 to row 1
  src_reg_1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);

  // First three rows (low and high 4-pixel halves interleaved separately)
  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);

  // Two output rows per iteration; the interleaved window and newest row are
  // carried over at the bottom of the loop. No tail pass: an odd final row is
  // not produced — assumes height is even; TODO confirm against callers.
  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));

    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
                                         _mm256_castsi256_si128(src_reg_2), 1);

    src_reg_3 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));

    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
                                         _mm256_castsi256_si128(src_reg_3), 1);

    // Last three rows
    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);

    // Output from first half
    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
                                      &kernel_reg_23, &kernel_reg_45);

    // Output from second half
    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
                                      &kernel_reg_23, &kernel_reg_45);

    // Round the words
    res_reg_lo =
        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
    res_reg_hi =
        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);

    // Combine to get the result
    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
    res_reg = _mm256_min_epi16(res_reg, reg_max);

    // Save the result
    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
                       &res_reg);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    src_reg_m1001_lo = src_reg_1223_lo;
    src_reg_m1001_hi = src_reg_1223_hi;
    src_reg_1 = src_reg_3;
  }
}
1431
// Vertical 4-tap filter for 16-wide blocks: run the 8-wide kernel over the
// left half (offset 0) and the right half (offset 8) independently.
static void vpx_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  int half;
  for (half = 0; half < 2; ++half) {
    const ptrdiff_t offset = half << 3;
    vpx_highbd_filter_block1d8_v4_avx2(src_ptr + offset, src_stride,
                                       dst_ptr + offset, dst_stride, height,
                                       kernel, bd);
  }
}
1440
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;

// There are no AVX2 implementations of the 4-wide 8-tap/2-tap cases; alias
// them to the SSE2 assembly versions declared above.
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
  vpx_highbd_filter_block1d16_v8_avg_avx2
#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
  vpx_highbd_filter_block1d16_h8_avg_avx2
#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
  vpx_highbd_filter_block1d8_v8_avg_avx2
#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
  vpx_highbd_filter_block1d8_h8_avg_avx2
#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_avx2
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_avx2

// Instantiate the public non-averaging convolve entry points from the
// HIGH_FUN_CONV_* macros (see vpx_dsp/x86/convolve.h), wired to the
// filter_block1d* kernels named above.
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
HIGH_FUN_CONV_2D(, avx2, 0)

// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;

// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;

// Alias the 4-wide averaging cases to the SSE2 assembly versions as well.
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

// Instantiate the public averaging convolve entry points.
HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
HIGH_FUN_CONV_2D(avg_, avx2, 1)

#undef HIGHBD_FUNC