/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <immintrin.h>
#include <string.h>

#include "config/av1_rtcd.h"

#include "av1/common/resize.h"

#include "aom_dsp/x86/synonyms.h"

#define ROW_OFFSET 5
#define CAST_HI(x) _mm256_castsi128_si256(x)
#define CAST_LOW(x) _mm256_castsi256_si128(x)
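
// Filters one 16-column stripe vertically and emits two output rows.
// On entry, s[0]-s[2] hold byte-interleaved pairs of consecutive source rows
// for columns 0-7 and s[5]-s[7] the same for columns 8-15 (low lane for the
// first output row, high lane for the output row two source rows below).
// The macro loads the next two source rows to complete s[3] and s[8],
// convolves, rounds, packs and clips, leaving the first output row in the
// low 128 bits of res_8bit0 and the second in the high 128 bits.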
#define PROCESS_RESIZE_Y_WD16                                               \
  const int idx1 = AOMMIN(height - 1, i + 5);                               \
  const int idx2 = AOMMIN(height - 1, i + 6);                               \
  l6 = l10;                                                                 \
  l7 = l11;                                                                 \
  l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride));                  \
  l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride));                  \
                                                                            \
  /* g0... g15 | i0... i15 */                                               \
  const __m256i s68 =                                                       \
      _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20);            \
  /* h0... h15 | j0... j15 */                                               \
  const __m256i s79 =                                                       \
      _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20);            \
                                                                            \
  /* g0h0... g7h7 | i0j0... i7j7 */                                         \
  s[3] = _mm256_unpacklo_epi8(s68, s79);                                    \
  /* g8h8... g15h15 | i8j8... i15j15 */                                     \
  s[8] = _mm256_unpackhi_epi8(s68, s79);                                    \
                                                                            \
  __m256i res_out[2] = { 0 };                                               \
  resize_convolve(s, coeffs_y, res_out);                                    \
                                                                            \
  /* r00... r07 */                                                          \
  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits);   \
  /* r20... r27 */                                                          \
  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits);   \
                                                                            \
  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits);        \
  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits);        \
                                                                            \
  __m256i res_out_b[2] = { 0 };                                             \
  resize_convolve(s + 5, coeffs_y, res_out_b);                              \
                                                                            \
  /* r08... r015 */                                                         \
  __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
  /* r28... r215 */                                                         \
  __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \
  res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits);        \
  res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits);        \
                                                                            \
  /* r00... r03 r20... r23 | r04... r07 r24... r27 */                       \
  __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2);    \
  /* r08... r011 r28... r211 | r012... r015 r212... r215 */                 \
  __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2);    \
  /* r00... r07 | r20... r27 */                                             \
  res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8);                    \
  /* r08... r015 | r28... r215 */                                           \
  res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8);                    \
  /* r00... r015 | r20... r215 */                                           \
  res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1);                    \
  res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel);                       \
  res_8bit0 = _mm256_max_epu8(res_8bit0, zero);

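// 8-column variant of PROCESS_RESIZE_Y_WD16. Only s[0]-s[3] are used, and the
// 8-pixel results for the two output rows end up in the lower halves of the
// two 128-bit lanes of res_a_round_1.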
#define PROCESS_RESIZE_Y_WD8                                              \
  const int idx1 = AOMMIN(height - 1, i + 5);                             \
  const int idx2 = AOMMIN(height - 1, i + 6);                             \
  l6 = l10;                                                               \
  l7 = l11;                                                               \
  l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride));                \
  l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride));                \
                                                                          \
  /* g0h0... g7h7 */                                                      \
  s67 = _mm_unpacklo_epi8(l6, l7);                                        \
  /* i0j0...i7j7 */                                                       \
  __m128i s89 = _mm_unpacklo_epi8(l8, l9);                                \
                                                                          \
  /* g0h0...g7h7 | i0j0...i7j7 */                                         \
  s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20);     \
                                                                          \
  __m256i res_out[2] = { 0 };                                             \
  resize_convolve(s, coeffs_y, res_out);                                  \
                                                                          \
  /* r00... r07 */                                                        \
  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
  /* r20...r27 */                                                         \
  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits);      \
  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits);      \
                                                                          \
  /* r00...r03 r20...r23 | r04...r07 r24...r27 */                         \
  res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2);      \
  /* r00...r07 | r20...r27 */                                             \
  res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8);          \
  res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1);      \
  res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel);             \
  res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);

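// Horizontally filters 32 source pixels of two rows per invocation and writes
// 16 output pixels per row to intbuf. At the left edge of a row the
// wd32_start_pad_mask shuffle replicates the first pixel for the filter's
// left taps; at the right edge the 8-byte tail load is moved back by
// ROW_OFFSET bytes so it stays inside the frame and is then shifted and
// padded with wd32_end_pad_mask so that the last pixel is replicated.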
#define PROCESS_RESIZE_X_WD32                                                  \
  /* a0 a1 ..... a30 a31 */                                                    \
  __m256i row0 = _mm256_loadu_si256(                                           \
      (__m256i *)&input[i * in_stride + j - filter_offset]);                   \
  /* b0 b1 ..... b30 b31 */                                                    \
  __m256i row1 = _mm256_loadu_si256(                                           \
      (__m256i *)&input[(i + 1) * in_stride + j - filter_offset]);             \
  /* a0 .... a15 || b0.... b15 */                                              \
  __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);                    \
  /* a16 .... a31 || b16 .... b31 */                                           \
  __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);                    \
  filter_offset = 3;                                                           \
                                                                               \
  /* Pad start pixels to the left, while processing the first pixels in the    \
   * row. */                                                                   \
  if (j == 0) {                                                                \
    /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */                         \
    row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);                       \
    /* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */                   \
    row1 = _mm256_alignr_epi8(r1, r0, 13);                                     \
    r0 = row0;                                                                 \
    r1 = row1;                                                                 \
  }                                                                            \
  const int is_last_cols32 = (j + 32 == filtered_length);                      \
  /* Avoid loading extra pixels at frame boundary.*/                           \
  if (is_last_cols32) row_offset = ROW_OFFSET;                                 \
  /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/                                \
  __m128i row0_0 = _mm_loadl_epi64(                                            \
      (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \
  /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */                               \
  __m128i row1_0 =                                                             \
      _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j -         \
                                        filter_offset - row_offset]);          \
  __m256i r2 = _mm256_permute2x128_si256(                                      \
      _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20);   \
                                                                               \
  /* Pad end pixels to the right, while processing the last pixels in the      \
   * row. */                                                                   \
  if (is_last_cols32) {                                                        \
    r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET),                \
                             wd32_end_pad_mask);                               \
  }                                                                            \
                                                                               \
  /* Gather source pixels for the first 8 output pixels of both rows. */       \
  /* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */                \
  s0[0] = _mm256_alignr_epi8(r1, r0, 0);                                       \
  /* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */                \
  s0[1] = _mm256_alignr_epi8(r1, r0, 2);                                       \
  /* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */                \
  s0[2] = _mm256_alignr_epi8(r1, r0, 4);                                       \
  /* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */                \
  s0[3] = _mm256_alignr_epi8(r1, r0, 6);                                       \
                                                                               \
  /* Gather source pixels for the next 8 output pixels of both rows. */        \
  /* a13 a14 a15 a16  ..... a28 | b13 b14 b15 b16 ..... b28 */                 \
  s1[0] = _mm256_alignr_epi8(r2, r1, 0);                                       \
  /* a15 a16 a17 a18  ..... a30 | b15 b16 b17 b18 ..... b30 */                 \
  s1[1] = _mm256_alignr_epi8(r2, r1, 2);                                       \
  /* a17 a18 a19 a20  ..... a32 | b17 b18 b19 b20 ..... b32 */                 \
  s1[2] = _mm256_alignr_epi8(r2, r1, 4);                                       \
  /* a19 a20 a21 a22  ..... a34 | b19 b20 b21 b22 ..... b34 */                 \
  s1[3] = _mm256_alignr_epi8(r2, r1, 6);                                       \
                                                                               \
  /* res_out_0 stores the results of the first 16 source pixels (output        \
   * pixels 0-7) of both rows, whereas res_out_1 stores those of the last 16    \
   * source pixels (output pixels 8-15). */                                    \
  __m256i res_out_0[2], res_out_1[2];                                          \
  res_out_1[0] = res_out_1[1] = zero;                                          \
  res_out_0[0] = res_out_0[1] = zero;                                          \
  resize_convolve(s0, coeffs_x, res_out_0);                                    \
  resize_convolve(s1, coeffs_x, res_out_1);                                    \
                                                                               \
  /* Result of 32 pixels of row0 (a0 to a32) */                                \
  res_out_0[0] = _mm256_sra_epi32(                                             \
      _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);     \
  res_out_1[0] = _mm256_sra_epi32(                                             \
      _mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits);     \
  /* r00-r03 r08-r011 | r04-r07 r012-r015 */                                   \
  __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]);        \
                                                                               \
  /* Result of 32 pixels of row1 (b0 to b32) */                                \
  res_out_0[1] = _mm256_sra_epi32(                                             \
      _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);     \
  res_out_1[1] = _mm256_sra_epi32(                                             \
      _mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits);     \
  /* r10-r13 r18-r111 | r14-r17 r112-r115 */                                   \
  __m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]);        \
                                                                               \
  /* Convert the result from 16bit to 8bit */                                  \
  /* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115   \
   */                                                                          \
  __m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1);           \
  __m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel);            \
  res_out_row01 = _mm256_max_epu8(res_out_row01, zero);                        \
  __m128i low_128 = CAST_LOW(res_out_row01);                                   \
  __m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1);               \
                                                                               \
  _mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2],                 \
                   _mm_unpacklo_epi32(low_128, high_128));                     \
  _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2],           \
                   _mm_unpackhi_epi32(low_128, high_128));

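// Applies the four coefficient pairs in coeffs[] to the byte-interleaved
// source pairs in s[0]-s[3]. Each _mm256_maddubs_epi16() multiplies one pair
// of taps with one pair of interleaved pixels; the partial sums are widened
// to 32 bits before the final addition since the accumulated value can exceed
// the signed 16-bit range. res_out[0] receives the sums of the low 128-bit
// lanes and res_out[1] those of the high lanes.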
static inline void resize_convolve(const __m256i *const s,
                                   const __m256i *const coeffs,
                                   __m256i *res_out) {
  const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]);

  const __m256i dst_0 = _mm256_add_epi16(res_0, res_1);
  const __m256i dst_1 = _mm256_add_epi16(res_2, res_3);
  // The sum of the convolution can exceed the signed 16-bit range, so the
  // final addition is done in 32 bits.
  const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0));
  const __m256i dst_01 =
      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1));
  const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1));
  const __m256i dst_11 =
      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1));

  res_out[0] = _mm256_add_epi32(dst_00, dst_10);
  res_out[1] = _mm256_add_epi32(dst_01, dst_11);
}

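// Builds the four 8-bit coefficient-pair registers used by resize_convolve().
// The down-by-2 filter is symmetric and of even length, so the filter array
// stores only its four unique taps f0 f1 f2 f3; the full 8-tap kernel is
// f3 f2 f1 f0 f0 f1 f2 f3. coeffs[0..3] hold the byte pairs (f3,f2), (f1,f0),
// (f0,f1) and (f2,f3) so that each _mm256_maddubs_epi16() applies two taps.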
static inline void prepare_filter_coeffs(const int16_t *filter,
                                         __m256i *const coeffs /* [4] */) {
  // f0 f1 f2 f3 x x x x
  const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
  // f0 f1 f2 f3 f0 f1 f2 f3
  const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44);
  // f0 f1 f2 f3 f1 f0 f3 f2
  const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1);

  const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1);

  // f0 f1 f0 f1 ..
  coeffs[2] = _mm256_broadcastw_epi16(filter_8bit);
  // f2 f3 f2 f3 ..
  coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2));
  // f3 f2 f3 f2 ..
  coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6));
  // f1 f0 f1 f0 ..
  coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
}

bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
                              int height, int height2, int stride,
                              int start_col) {
  assert(start_col <= stride);
  // For the GM tool, the input layer height or width is assured to be an even
  // number. Hence the function 'down2_symodd()' is not invoked and SIMD
  // optimization of the same is not implemented.
  // When the input height is less than 8 and even, the potential input
  // heights are limited to 2, 4, or 6. These scenarios require separate
  // handling due to padding requirements. Invoking the C function here
  // eliminates the need for conditional statements within the subsequent SIMD
  // code to manage these cases.
  if (height & 1 || height < 8) {
    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
                                 stride, start_col);
  }

  __m256i s[10], coeffs_y[4];
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const uint8_t max_pixel = 255;
  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
  const __m256i zero = _mm256_setzero_si256();

  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);

  const int num_col16 = stride / 16;
  int remain_col = stride % 16;
  // The core vertical SIMD processes 4 input rows simultaneously to generate
  // output corresponding to 2 rows. To streamline the core loop and eliminate
  // the need for conditional checks, the remaining rows (4 or 6) are processed
  // separately.
  const int remain_row = (height % 4 == 0) ? 4 : 6;

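  // Each iteration of the loop below filters one 16-column stripe. Source
  // rows are tagged a, b, c, ... in the comments; the top three filter taps
  // are fed with a copy of the first row, and two output rows are produced
  // for every four input rows consumed.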
  for (int j = start_col; j < stride - remain_col; j += 16) {
    const uint8_t *data = &intbuf[j];
    const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride));
    // Pad the top 3 rows by replicating the topmost available row.
    const __m128i l0 = l3;
    const __m128i l1 = l3;
    const __m128i l2 = l3;
    const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride));

    __m128i l6, l7, l8, l9;
    __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride));
    __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride));
    __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride));

    // a0...a15 | c0...c15
    const __m256i s02 =
        _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20);
    // b0...b15 | d0...d15
    const __m256i s13 =
        _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20);
    // c0...c15 | e0...e15
    const __m256i s24 =
        _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20);
    // d0...d15 | f0...f15
    const __m256i s35 =
        _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20);
    // e0...e15 | g0...g15
    const __m256i s46 =
        _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20);
    // f0...f15 | h0...h15
    const __m256i s57 =
        _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20);

    // a0b0...a7b7 | c0d0...c7d7
    s[0] = _mm256_unpacklo_epi8(s02, s13);
    // c0d0...c7d7 | e0f0...e7f7
    s[1] = _mm256_unpacklo_epi8(s24, s35);
    // e0f0...e7f7 | g0h0...g7h7
    s[2] = _mm256_unpacklo_epi8(s46, s57);

    // a8b8...a15b15 | c8d8...c15d15
    s[5] = _mm256_unpackhi_epi8(s02, s13);
    // c8d8...c15d15 | e8f8...e15f15
    s[6] = _mm256_unpackhi_epi8(s24, s35);
    // e8f8...e15f15 | g8h8...g15h15
    s[7] = _mm256_unpackhi_epi8(s46, s57);

    // Height to be processed by the core loop.
    const int process_ht = height - remain_row;
    for (int i = 0; i < process_ht; i += 4) {
      PROCESS_RESIZE_Y_WD16

      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
                       CAST_LOW(res_8bit0));

      _mm_storeu_si128(
          (__m128i *)&output[(i / 2) * out_stride + j + out_stride],
          _mm256_extracti128_si256(res_8bit0, 1));

      // Load the data required for processing the next 4 input rows.
      const int idx7 = AOMMIN(height - 1, i + 7);
      const int idx8 = AOMMIN(height - 1, i + 8);
      l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride));
      l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride));

      const __m256i s810 =
          _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
      const __m256i s911 =
          _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
      // i0j0... i7j7 | k0l0... k7l7
      s[4] = _mm256_unpacklo_epi8(s810, s911);
      // i8j8... i15j15 | k8l8... k15l15
      s[9] = _mm256_unpackhi_epi8(s810, s911);

      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];

      s[5] = s[7];
      s[6] = s[8];
      s[7] = s[9];
    }

    // Process the remaining last 4 or 6 rows here.
    int i = process_ht;
    while (i < height - 1) {
      PROCESS_RESIZE_Y_WD16

      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
                       CAST_LOW(res_8bit0));
      i += 2;

      const int is_store_valid = (i < height - 1);
      if (is_store_valid)
        _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
                         _mm256_extracti128_si256(res_8bit0, 1));
      i += 2;

      // Check if there is any remaining height to process. If so, perform the
      // necessary data loading for processing the next row.
      if (i < height - 1) {
        l10 = l11 = l9;
        const __m256i s810 =
            _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
        const __m256i s911 =
            _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
        // i0j0... i7j7 | k0l0... k7l7
        s[4] = _mm256_unpacklo_epi8(s810, s911);
        // i8j8... i15j15 | k8l8... k15l15
        s[9] = _mm256_unpackhi_epi8(s810, s911);

        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];

        s[5] = s[7];
        s[6] = s[8];
        s[7] = s[9];
      }
    }
  }

  if (remain_col > 7) {
    const int processed_wd = num_col16 * 16;
    remain_col = stride % 8;

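    // Filter one remaining stripe of 8 columns with the WD8 macro. Any
    // columns still left over (stride % 8) are handed to the C fallback at
    // the end of the function.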
    const uint8_t *data = &intbuf[processed_wd];

    const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
    // Pad the top 3 rows by replicating the topmost available row.
    const __m128i l0 = l3;
    const __m128i l1 = l3;
    const __m128i l2 = l3;
    const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));

    __m128i l6, l7, l8, l9;
    __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
    __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride));
    __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride));

    // a0b0...a7b7
    const __m128i s01 = _mm_unpacklo_epi8(l0, l1);
    // c0d0...c7d7
    const __m128i s23 = _mm_unpacklo_epi8(l2, l3);
    // e0f0...e7f7
    const __m128i s45 = _mm_unpacklo_epi8(l4, l5);
    // g0h0...g7h7
    __m128i s67 = _mm_unpacklo_epi8(l10, l11);

    // a0b0...a7b7 | c0d0...c7d7
    s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20);
    // c0d0...c7d7 | e0f0...e7f7
    s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20);
    // e0f0...e7f7 | g0h0...g7h7
    s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20);

    // Height to be processed by the core loop.
    const int process_ht = height - remain_row;
    for (int i = 0; i < process_ht; i += 4) {
      PROCESS_RESIZE_Y_WD8

      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
                       CAST_LOW(res_a_round_1));

      _mm_storel_epi64(
          (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride],
          _mm256_extracti128_si256(res_a_round_1, 1));

      const int idx7 = AOMMIN(height - 1, i + 7);
      const int idx8 = AOMMIN(height - 1, i + 8);
      l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride));
      l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride));

      // k0l0... k7l7
      const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
      // i0j0... i7j7 | k0l0... k7l7
      s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);

      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
    }

    // Process the remaining last 4 or 6 rows here.
    int i = process_ht;
    while (i < height - 1) {
      PROCESS_RESIZE_Y_WD8

      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
                       CAST_LOW(res_a_round_1));

      i += 2;

      const int is_store_valid = (i < height - 1);
      if (is_store_valid)
        _mm_storel_epi64(
            (__m128i *)&output[(i / 2) * out_stride + processed_wd],
            _mm256_extracti128_si256(res_a_round_1, 1));
      i += 2;

      // Check whether any rows remain to be processed. If so, load the data
      // required for the next iteration.
      if (i < height - 1) {
        l10 = l11 = l9;
        // k0l0... k7l7
        const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
        // i0j0... i7j7 | k0l0... k7l7
        s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);

        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
      }
    }
  }

  if (remain_col)
    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
                                 stride, stride - remain_col);

  return true;
}

// Masks used for width 32 and 8 pixels, with left and right padding
// requirements
static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2,  3,  4,
                                                    5, 6, 7, 8, 9, 10, 11, 12,
                                                    0, 0, 0, 0, 1, 2,  3,  4,
                                                    5, 6, 7, 8, 9, 10, 11, 12 };

static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2,
                                                     2, 2, 2, 2, 2, 2, 2, 2,
                                                     0, 1, 2, 2, 2, 2, 2, 2,
                                                     2, 2, 2, 2, 2, 2, 2, 2 };

static const uint8_t wd8_right_padding_mask[32] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10
};
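
// Shuffling a row a0 a1 a2 ... with wd32_left_padding_mask yields
// a0 a0 a0 a0 a1 ... a12, replicating the first pixel for the three left taps
// of the filter. The right-padding masks are applied after the tail load has
// been shifted down by ROW_OFFSET bytes, so byte 2 (byte 10 for the 8-wide
// case) holds the last valid pixel and is replicated to the end of each lane.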

void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
                              uint8_t *intbuf, int height, int filtered_length,
                              int width2) {
  assert(height % 2 == 0);
  // Invoke SSE2 for width less than 32.
  if (filtered_length < 32) {
    av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length,
                             width2);
    return;
  }

  const int filt_length = sizeof(av1_down2_symeven_half_filter);
  assert(filt_length % 2 == 0);
  (void)filt_length;

  __m256i s0[4], s1[4], coeffs_x[4];

  const int bits = FILTER_BITS;
  const int dst_stride = width2;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);

  const uint8_t max_pixel = 255;
  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
  const __m256i zero = _mm256_setzero_si256();

  const __m256i wd32_start_pad_mask =
      _mm256_loadu_si256((__m256i *)wd32_left_padding_mask);
  const __m256i wd32_end_pad_mask =
      _mm256_loadu_si256((__m256i *)wd32_right_padding_mask);
  const __m256i wd8_end_pad_mask =
      _mm256_loadu_si256((__m256i *)wd8_right_padding_mask);
  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);

  // The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously
  // to generate output corresponding to 2 rows. To streamline the core loop and
  // eliminate the need for conditional checks, the remaining columns (16 or 8)
  // are processed separately.
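  // When the width is an exact multiple of 32, the 32-pixel core loop covers
  // every column (the last block pads via ROW_OFFSET); otherwise the tail is
  // handled by 16-pixel, 8-pixel and scalar stages.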
  if (filtered_length % 32 == 0) {
    for (int i = 0; i < height; i += 2) {
      int filter_offset = 0;
      int row_offset = 0;
      for (int j = 0; j < filtered_length; j += 32) {
        PROCESS_RESIZE_X_WD32
      }
    }
  } else {
    for (int i = 0; i < height; i += 2) {
      int filter_offset = 0;
      int remain_col = filtered_length;
      int row_offset = 0;
      // To avoid pixel over-read at the frame boundary, the core loop
      // processes 32 pixels only when the pixels required for the load are
      // available. The remaining pixels are processed separately.
      for (int j = 0; j <= filtered_length - 32; j += 32) {
        if (remain_col == 34 || remain_col == 36) {
          break;
        }
        PROCESS_RESIZE_X_WD32
        remain_col -= 32;
      }

      int wd_processed = filtered_length - remain_col;
      // To avoid pixel over-read at the frame boundary, 16 pixels are
      // processed only when the pixels required for the load are available.
      // The remaining pixels are processed separately.
      if (remain_col > 15 && remain_col != 18 && remain_col != 20) {
        remain_col = filtered_length - wd_processed - 16;
        const int in_idx = i * in_stride + wd_processed;
        const int out_idx = (i * dst_stride) + wd_processed / 2;
        // a0 a1 --- a15
        __m128i row0 =
            _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]);
        // b0 b1 --- b15
        __m128i row1 = _mm_loadu_si128(
            (__m128i *)&input[in_idx + in_stride - filter_offset]);
        // a0 a1 --- a15 || b0 b1 --- b15
        __m256i r0 =
            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
        if (filter_offset == 0) {
          r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);
        }
        filter_offset = 3;
        const int is_last_cols16 = wd_processed + 16 == filtered_length;
        if (is_last_cols16) row_offset = ROW_OFFSET;
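        // At the right frame edge the tail load starts ROW_OFFSET bytes
        // earlier so it stays inside the row; the shift is undone below
        // before the padding shuffle.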

        // a16 a17 --- a23
        row0 = _mm_loadl_epi64(
            (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]);
        // b16 b17 --- b23
        row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride -
                                                 row_offset - filter_offset]);

        // a16-a23 x x x x| b16-b23 x x x x
        __m256i r1 =
            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);

        // Pad end pixels to the right, while processing the last pixels in the
        // row.
        if (is_last_cols16) {
          r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET),
                                   wd32_end_pad_mask);
        }

        // a0 a1 --- a15 || b0 b1 --- b15
        s0[0] = r0;
        // a2 a3 --- a17 || b2 b3 --- b17
        s0[1] = _mm256_alignr_epi8(r1, r0, 2);
        // a4 a5 --- a19 || b4 b5 --- b19
        s0[2] = _mm256_alignr_epi8(r1, r0, 4);
        // a6 a7 --- a21 || b6 b7 --- b21
        s0[3] = _mm256_alignr_epi8(r1, r0, 6);

        // result for 16 pixels (a0 to a15) of row0 and row1
        __m256i res_out_0[2];
        res_out_0[0] = res_out_0[1] = zero;
        resize_convolve(s0, coeffs_x, res_out_0);

        // r00-r07
        res_out_0[0] = _mm256_sra_epi32(
            _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
        // r10-r17
        res_out_0[1] = _mm256_sra_epi32(
            _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);
        // r00-r03 r10-r13 r04-r07 r14-r17
        __m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]);
        // r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17
        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
        // r00-r03 r10-r13 r04-r07 r14-r17
        __m128i low_result =
            CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8));
        // r00-r03 r04-r07 r10-r13 r14-r17
        low_result = _mm_shuffle_epi32(low_result, 0xd8);

        _mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result);
        _mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride],
                         _mm_unpackhi_epi64(low_result, low_result));
      }

      // To avoid pixel over-read at the frame boundary, 8 pixels are
      // processed only when the pixels required for the load are available.
      // The remaining pixels are processed by the C function.
      wd_processed = filtered_length - remain_col;
      if (remain_col > 7 && remain_col != 10 && remain_col != 12) {
        remain_col = filtered_length - wd_processed - 8;
        const int in_idx = i * in_stride + wd_processed - filter_offset;
        const int out_idx = (i * dst_stride) + wd_processed / 2;
        const int is_last_cols_8 = wd_processed + 8 == filtered_length;
        if (is_last_cols_8) row_offset = ROW_OFFSET;
        // a0 a1 --- a15
        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]);
        // b0 b1 --- b15
        __m128i row1 =
            _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]);
        // a0 a1 --- a15 || b0 b1 --- b15
        __m256i r0 =
            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);

        // Pad end pixels to the right, while processing the last pixels in the
        // row.
        if (is_last_cols_8)
          r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET),
                                   wd8_end_pad_mask);

        // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
        s0[0] = r0;
        // a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9
        s0[1] = _mm256_bsrli_epi128(r0, 2);
        // a4 a5 a6 a7 a8 a9 a10 a10 |  b4 b5 b6 b7 b8 b9 b10 b10
        s0[2] = _mm256_bsrli_epi128(r0, 4);
        // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
        s0[3] = _mm256_bsrli_epi128(r0, 6);

        __m256i res_out_0[2];
        res_out_0[0] = res_out_0[1] = zero;
        resize_convolve(s0, coeffs_x, res_out_0);

        // r00 - r03 | r10 - r13
        __m256i res_out =
            _mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20);
        // r00 - r03 | r10 - r13
        res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits),
                                   round_shift_bits);
        // r00-r03 r00-r03 r10-r13 r10-r13
        __m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out);
        // r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13
        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);

        xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01));
        xx_storel_32(intbuf + out_idx + dst_stride,
                     _mm256_extracti128_si256(res_out_row01, 1));
      }

      wd_processed = filtered_length - remain_col;
      if (remain_col) {
        const int in_idx = (in_stride * i);
        const int out_idx = (wd_processed / 2) + width2 * i;

        down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
                      wd_processed);
        down2_symeven(input + in_idx + in_stride, filtered_length,
                      intbuf + out_idx + width2, wd_processed);
      }
    }
  }
}
745