/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <immintrin.h>
#include <string.h>

#include "config/av1_rtcd.h"

#include "av1/common/resize.h"

#include "aom_dsp/x86/synonyms.h"

#define ROW_OFFSET 5
#define CAST_HI(x) _mm256_castsi128_si256(x)
#define CAST_LOW(x) _mm256_castsi256_si128(x)
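// Vertical filtering of one 16-column strip: consumes the sliding window of
// interleaved row pairs held in s[0..3] (columns 0..7) and s[5..8]
// (columns 8..15), loads input rows i + 5 and i + 6, and produces two output
// rows in res_8bit0 (low 128 bits: first output row, high 128 bits: second
// output row).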
#define PROCESS_RESIZE_Y_WD16 \
  const int idx1 = AOMMIN(height - 1, i + 5); \
  const int idx2 = AOMMIN(height - 1, i + 6); \
  l6 = l10; \
  l7 = l11; \
  l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \
  l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \
  \
  /* g0... g15 | i0... i15 */ \
  const __m256i s68 = \
      _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \
  /* h0... h15 | j0... j15 */ \
  const __m256i s79 = \
      _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \
  \
  /* g0h0... g7h7 | i0j0... i7j7 */ \
  s[3] = _mm256_unpacklo_epi8(s68, s79); \
  /* g8h8... g15h15 | i8j8... i15j15 */ \
  s[8] = _mm256_unpackhi_epi8(s68, s79); \
  \
  __m256i res_out[2] = { 0 }; \
  resize_convolve(s, coeffs_y, res_out); \
  \
  /* r00... r07 */ \
  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
  /* r20... r27 */ \
  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
  \
  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \
  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
  \
  __m256i res_out_b[2] = { 0 }; \
  resize_convolve(s + 5, coeffs_y, res_out_b); \
  \
  /* r08... r015 */ \
  __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
  /* r28... r215 */ \
  __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \
  res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \
  res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \
  \
  /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \
  __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \
  /* r08... r011 r28... r211 | r012... r015 r212... r215 */ \
  __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \
  /* r00... r07 | r20... r27 */ \
  res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \
  /* r08... r015 | r28... r215 */ \
  res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \
  /* r00... r015 | r20... r215 */ \
  res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \
  res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \
  res_8bit0 = _mm256_max_epu8(res_8bit0, zero);
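// 8-column variant of PROCESS_RESIZE_Y_WD16: the sliding window lives in
// s[0..3] only, and the two output rows are produced in res_a_round_1 (low
// 128 bits hold the first output row in their lower 8 bytes, high 128 bits
// hold the second output row).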
#define PROCESS_RESIZE_Y_WD8 \
  const int idx1 = AOMMIN(height - 1, i + 5); \
  const int idx2 = AOMMIN(height - 1, i + 6); \
  l6 = l10; \
  l7 = l11; \
  l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \
  l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \
  \
  /* g0h0... g7h7 */ \
  s67 = _mm_unpacklo_epi8(l6, l7); \
  /* i0j0... i7j7 */ \
  __m128i s89 = _mm_unpacklo_epi8(l8, l9); \
  \
  /* g0h0... g7h7 | i0j0... i7j7 */ \
  s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \
  \
  __m256i res_out[2] = { 0 }; \
  resize_convolve(s, coeffs_y, res_out); \
  \
  /* r00... r07 */ \
  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
  /* r20... r27 */ \
  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \
  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
  \
  /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \
  res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \
  /* r00... r07 | r20... r27 */ \
  res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \
  res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \
  res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \
  res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
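// Horizontal filtering of a 32-pixel block of two adjacent input rows (row i
// in the low 128-bit lanes, row i + 1 in the high lanes). Produces 16 output
// pixels per row and stores them to intbuf at column j / 2, handling left
// padding when j == 0 and right padding in the last block of the row.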
#define PROCESS_RESIZE_X_WD32 \
  /* a0 a1 ..... a30 a31 */ \
  __m256i row0 = _mm256_loadu_si256( \
      (__m256i *)&input[i * in_stride + j - filter_offset]); \
  /* b0 b1 ..... b30 b31 */ \
  __m256i row1 = _mm256_loadu_si256( \
      (__m256i *)&input[(i + 1) * in_stride + j - filter_offset]); \
  /* a0 .... a15 || b0 .... b15 */ \
  __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); \
  /* a16 .... a31 || b16 .... b31 */ \
  __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); \
  filter_offset = 3; \
  \
  /* Pad start pixels to the left, while processing the first pixels in the \
   * row. */ \
  if (j == 0) { \
    /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \
    row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \
    /* a13 a14 a15 a16 ..... a28 || b13 b14 b15 b16 ..... b28 */ \
    row1 = _mm256_alignr_epi8(r1, r0, 13); \
    r0 = row0; \
    r1 = row1; \
  } \
  const int is_last_cols32 = (j + 32 == filtered_length); \
  /* Avoid loading extra pixels at frame boundary. */ \
  if (is_last_cols32) row_offset = ROW_OFFSET; \
  /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 .... */ \
  __m128i row0_0 = _mm_loadl_epi64( \
      (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \
  /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \
  __m128i row1_0 = \
      _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j - \
                                        filter_offset - row_offset]); \
  __m256i r2 = _mm256_permute2x128_si256( \
      _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \
  \
  /* Pad end pixels to the right, while processing the last pixels in the \
   * row. */ \
  if (is_last_cols32) { \
    r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET), \
                             wd32_end_pad_mask); \
  } \
  \
  /* Process the first 16 pixels of both rows. */ \
  /* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */ \
  s0[0] = _mm256_alignr_epi8(r1, r0, 0); \
  /* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */ \
  s0[1] = _mm256_alignr_epi8(r1, r0, 2); \
  /* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */ \
  s0[2] = _mm256_alignr_epi8(r1, r0, 4); \
  /* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */ \
  s0[3] = _mm256_alignr_epi8(r1, r0, 6); \
  \
  /* Process the next 16 pixels of both rows. */ \
  /* a13 a14 a15 a16 ..... a28 | b13 b14 b15 b16 ..... b28 */ \
  s1[0] = _mm256_alignr_epi8(r2, r1, 0); \
  /* a15 a16 a17 a18 ..... a30 | b15 b16 b17 b18 ..... b30 */ \
  s1[1] = _mm256_alignr_epi8(r2, r1, 2); \
  /* a17 a18 a19 a20 ..... a32 | b17 b18 b19 b20 ..... b32 */ \
  s1[2] = _mm256_alignr_epi8(r2, r1, 4); \
  /* a19 a20 a21 a22 ..... a34 | b19 b20 b21 b22 ..... b34 */ \
  s1[3] = _mm256_alignr_epi8(r2, r1, 6); \
  \
  /* res_out_0 stores the result of the first 16 pixels of the two rows, \
   * whereas res_out_1 stores the result of the last 16 pixels. */ \
  __m256i res_out_0[2], res_out_1[2]; \
  res_out_1[0] = res_out_1[1] = zero; \
  res_out_0[0] = res_out_0[1] = zero; \
  resize_convolve(s0, coeffs_x, res_out_0); \
  resize_convolve(s1, coeffs_x, res_out_1); \
  \
  /* Result of 32 pixels of row0 (a0 to a32) */ \
  res_out_0[0] = _mm256_sra_epi32( \
      _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); \
  res_out_1[0] = _mm256_sra_epi32( \
      _mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits); \
  /* r00-r03 r08-r011 | r04-r07 r012-r015 */ \
  __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \
  \
  /* Result of 32 pixels of row1 (b0 to b32) */ \
  res_out_0[1] = _mm256_sra_epi32( \
      _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \
  res_out_1[1] = _mm256_sra_epi32( \
      _mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits); \
  /* r10-r13 r18-r111 | r14-r17 r112-r115 */ \
  __m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]); \
  \
  /* Convert the result from 16bit to 8bit */ \
  /* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115 \
   */ \
  __m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1); \
  __m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel); \
  res_out_row01 = _mm256_max_epu8(res_out_row01, zero); \
  __m128i low_128 = CAST_LOW(res_out_row01); \
  __m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1); \
  \
  _mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2], \
                   _mm_unpacklo_epi32(low_128, high_128)); \
  _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \
                   _mm_unpackhi_epi32(low_128, high_128));
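// Core 8-tap symmetric convolution on interleaved data. Each s[k] holds
// 8-bit (pixel, pixel) pairs and coeffs[k] the matching 8-bit (tap, tap)
// pairs, so _mm256_maddubs_epi16() produces one 16-bit partial sum per pair.
// The partial sums are widened to 32 bits before the final addition:
// res_out[0] receives the 8 results of the low 128-bit lane, res_out[1] the
// 8 results of the high lane.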
static inline void resize_convolve(const __m256i *const s,
                                   const __m256i *const coeffs,
                                   __m256i *res_out) {
  const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]);

  const __m256i dst_0 = _mm256_add_epi16(res_0, res_1);
  const __m256i dst_1 = _mm256_add_epi16(res_2, res_3);
  // The sum of the convolve operation can exceed the signed 16-bit range, so
  // the final addition is done in 32 bits.
  const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0));
  const __m256i dst_01 =
      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1));
  const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1));
  const __m256i dst_11 =
      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1));

  res_out[0] = _mm256_add_epi32(dst_00, dst_10);
  res_out[1] = _mm256_add_epi32(dst_01, dst_11);
}
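// Packs the 4-tap half filter f0 f1 f2 f3 (applied symmetrically as
// f3 f2 f1 f0 f0 f1 f2 f3) into four 8-bit coefficient-pair registers that
// match the pixel pairing used by resize_convolve(): coeffs[0] = f3 f2,
// coeffs[1] = f1 f0, coeffs[2] = f0 f1 and coeffs[3] = f2 f3.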
static inline void prepare_filter_coeffs(const int16_t *filter,
                                         __m256i *const coeffs /* [4] */) {
  // f0 f1 f2 f3 x x x x
  const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
  // f0 f1 f2 f3 f0 f1 f2 f3
  const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44);
  // f0 f1 f2 f3 f1 f0 f3 f2
  const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1);

  const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1);

  // f0 f1 f0 f1 ..
  coeffs[2] = _mm256_broadcastw_epi16(filter_8bit);
  // f2 f3 f2 f3 ..
  coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2));
  // f3 f2 f3 f2 ..
  coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6));
  // f1 f0 f1 f0 ..
  coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
}
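// Vertical 2:1 downscaling of intbuf into output. As a rough scalar sketch
// (see down2_symeven() in av1/common/resize.c), for each column x and each
// even input row i the output is approximately
//   out[i / 2][x] = clip_pixel(ROUND_POWER_OF_TWO(
//       f0 * (p[i][x] + p[i + 1][x]) + f1 * (p[i - 1][x] + p[i + 2][x]) +
//       f2 * (p[i - 2][x] + p[i + 3][x]) + f3 * (p[i - 3][x] + p[i + 4][x]),
//       FILTER_BITS))
// where p[r][x] is the input pixel at row r, f0..f3 are the taps of
// av1_down2_symeven_half_filter, and rows are clamped to [0, height - 1] at
// the frame edges. The SIMD below mirrors this for 16 (or 8) columns at a
// time.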
bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
                              int height, int height2, int stride,
                              int start_col) {
  assert(start_col <= stride);
  // For the GM tool, the input layer height or width is assured to be an even
  // number. Hence the function 'down2_symodd()' is not invoked and SIMD
  // optimization of the same is not implemented.
  // When the input height is less than 8 and even, the possible input heights
  // are limited to 2, 4, or 6. These cases require separate handling due to
  // padding requirements. Invoking the C function here eliminates the need
  // for conditional statements within the subsequent SIMD code to manage
  // these cases.
  if (height & 1 || height < 8) {
    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
                                 stride, start_col);
  }

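  // Sliding window of interleaved row pairs: s[0..4] cover columns 0..7 of
  // the 16-column strip and s[5..9] cover columns 8..15. coeffs_y holds the
  // packed 8-bit coefficient pairs.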
  __m256i s[10], coeffs_y[4];
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const uint8_t max_pixel = 255;
  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
  const __m256i zero = _mm256_setzero_si256();

  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);

  const int num_col16 = stride / 16;
  int remain_col = stride % 16;
  // The core vertical SIMD processes 4 input rows simultaneously to generate
  // output corresponding to 2 rows. To streamline the core loop and eliminate
  // the need for conditional checks, the remaining rows (4 or 6) are processed
  // separately.
  const int remain_row = (height % 4 == 0) ? 4 : 6;

  for (int j = start_col; j < stride - remain_col; j += 16) {
    const uint8_t *data = &intbuf[j];
    const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride));
    // Pad the top 3 rows with the topmost available row.
    const __m128i l0 = l3;
    const __m128i l1 = l3;
    const __m128i l2 = l3;
    const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride));

    __m128i l6, l7, l8, l9;
    __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride));
    __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride));
    __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride));

    // a0...a15 | c0...c15
    const __m256i s02 =
        _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20);
    // b0...b15 | d0...d15
    const __m256i s13 =
        _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20);
    // c0...c15 | e0...e15
    const __m256i s24 =
        _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20);
    // d0...d15 | f0...f15
    const __m256i s35 =
        _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20);
    // e0...e15 | g0...g15
    const __m256i s46 =
        _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20);
    // f0...f15 | h0...h15
    const __m256i s57 =
        _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20);

    // a0b0...a7b7 | c0d0...c7d7
    s[0] = _mm256_unpacklo_epi8(s02, s13);
    // c0d0...c7d7 | e0f0...e7f7
    s[1] = _mm256_unpacklo_epi8(s24, s35);
    // e0f0...e7f7 | g0h0...g7h7
    s[2] = _mm256_unpacklo_epi8(s46, s57);

    // a8b8...a15b15 | c8d8...c15d15
    s[5] = _mm256_unpackhi_epi8(s02, s13);
    // c8d8...c15d15 | e8f8...e15f15
    s[6] = _mm256_unpackhi_epi8(s24, s35);
    // e8f8...e15f15 | g8h8...g15h15
    s[7] = _mm256_unpackhi_epi8(s46, s57);

    // height to be processed here
    const int process_ht = height - remain_row;
    for (int i = 0; i < process_ht; i += 4) {
      PROCESS_RESIZE_Y_WD16

      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
                       CAST_LOW(res_8bit0));

      _mm_storeu_si128(
          (__m128i *)&output[(i / 2) * out_stride + j + out_stride],
          _mm256_extracti128_si256(res_8bit0, 1));

      // Load the required data for processing of next 4 input rows.
      const int idx7 = AOMMIN(height - 1, i + 7);
      const int idx8 = AOMMIN(height - 1, i + 8);
      l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride));
      l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride));

      const __m256i s810 =
          _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
      const __m256i s911 =
          _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
      // i0j0... i7j7 | k0l0... k7l7
      s[4] = _mm256_unpacklo_epi8(s810, s911);
      // i8j8... i15j15 | k8l8... k15l15
      s[9] = _mm256_unpackhi_epi8(s810, s911);

      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];

      s[5] = s[7];
      s[6] = s[8];
      s[7] = s[9];
    }

    // Process the remaining last 4 or 6 rows here.
    int i = process_ht;
    while (i < height - 1) {
      PROCESS_RESIZE_Y_WD16

      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
                       CAST_LOW(res_8bit0));
      i += 2;

      const int is_store_valid = (i < height - 1);
      if (is_store_valid)
        _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
                         _mm256_extracti128_si256(res_8bit0, 1));
      i += 2;

      // Check if there is any remaining height to process. If so, perform the
      // necessary data loading for processing the next row.
      if (i < height - 1) {
        l10 = l11 = l9;
        const __m256i s810 =
            _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
        const __m256i s911 =
            _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
        // i0j0... i7j7 | k0l0... k7l7
        s[4] = _mm256_unpacklo_epi8(s810, s911);
        // i8j8... i15j15 | k8l8... k15l15
        s[9] = _mm256_unpackhi_epi8(s810, s911);

        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];

        s[5] = s[7];
        s[6] = s[8];
        s[7] = s[9];
      }
    }
  }

  if (remain_col > 7) {
    const int processed_wd = num_col16 * 16;
    remain_col = stride % 8;

    const uint8_t *data = &intbuf[processed_wd];

    const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
    // Pad the top 3 rows with the topmost available row.
    const __m128i l0 = l3;
    const __m128i l1 = l3;
    const __m128i l2 = l3;
    const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));

    __m128i l6, l7, l8, l9;
    __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
    __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride));
    __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride));

    // a0b0...a7b7
    const __m128i s01 = _mm_unpacklo_epi8(l0, l1);
    // c0d0...c7d7
    const __m128i s23 = _mm_unpacklo_epi8(l2, l3);
    // e0f0...e7f7
    const __m128i s45 = _mm_unpacklo_epi8(l4, l5);
    // g0h0...g7h7
    __m128i s67 = _mm_unpacklo_epi8(l10, l11);

    // a0b0...a7b7 | c0d0...c7d7
    s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20);
    // c0d0...c7d7 | e0f0...e7f7
    s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20);
    // e0f0...e7f7 | g0h0...g7h7
    s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20);

    // height to be processed here
    const int process_ht = height - remain_row;
    for (int i = 0; i < process_ht; i += 4) {
      PROCESS_RESIZE_Y_WD8

      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
                       CAST_LOW(res_a_round_1));

      _mm_storel_epi64(
          (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride],
          _mm256_extracti128_si256(res_a_round_1, 1));

      const int idx7 = AOMMIN(height - 1, i + 7);
      const int idx8 = AOMMIN(height - 1, i + 8);
      l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride));
      l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride));

      // k0l0... k7l7
      const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
      // i0j0... i7j7 | k0l0... k7l7
      s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);

      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
    }

    // Process the remaining last 4 or 6 rows here.
    int i = process_ht;
    while (i < height - 1) {
      PROCESS_RESIZE_Y_WD8

      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
                       CAST_LOW(res_a_round_1));

      i += 2;

      const int is_store_valid = (i < height - 1);
      if (is_store_valid)
        _mm_storel_epi64(
            (__m128i *)&output[(i / 2) * out_stride + processed_wd],
            _mm256_extracti128_si256(res_a_round_1, 1));
      i += 2;

      // Check whether any rows remain to be processed. If so, load the data
      // required for the next iteration.
      if (i < height - 1) {
        l10 = l11 = l9;
        // k0l0... k7l7
        const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
        // i0j0... i7j7 | k0l0... k7l7
        s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);

        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
      }
    }
  }

  if (remain_col)
    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
                                 stride, stride - remain_col);

  return true;
}
// Shuffle masks used by the width-32 and width-8 horizontal paths to
// replicate the first / last pixel of a row into the left / right padding
// positions.
static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2, 3, 4,
                                                    5, 6, 7, 8, 9, 10, 11, 12,
                                                    0, 0, 0, 0, 1, 2, 3, 4,
                                                    5, 6, 7, 8, 9, 10, 11, 12 };

static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2,
                                                     2, 2, 2, 2, 2, 2, 2, 2,
                                                     0, 1, 2, 2, 2, 2, 2, 2,
                                                     2, 2, 2, 2, 2, 2, 2, 2 };

static const uint8_t wd8_right_padding_mask[32] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10
};

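// Horizontal 2:1 downscaling of each row of input into intbuf, two rows per
// iteration. Wide blocks are handled by the AVX2 paths below; any leftover
// columns at the right edge fall back to the scalar down2_symeven().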
void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
                              uint8_t *intbuf, int height, int filtered_length,
                              int width2) {
  assert(height % 2 == 0);
  // Invoke SSE2 for width less than 32.
  if (filtered_length < 32) {
    av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length,
                             width2);
    return;
  }

  const int filt_length = sizeof(av1_down2_symeven_half_filter);
  assert(filt_length % 2 == 0);
  (void)filt_length;

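  // s0 holds the shifted source registers for the first 16 pixels of a
  // 32-pixel block and s1 those for the next 16 pixels. coeffs_x holds the
  // packed 8-bit coefficient pairs.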
  __m256i s0[4], s1[4], coeffs_x[4];

  const int bits = FILTER_BITS;
  const int dst_stride = width2;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);

  const uint8_t max_pixel = 255;
  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
  const __m256i zero = _mm256_setzero_si256();

  const __m256i wd32_start_pad_mask =
      _mm256_loadu_si256((__m256i *)wd32_left_padding_mask);
  const __m256i wd32_end_pad_mask =
      _mm256_loadu_si256((__m256i *)wd32_right_padding_mask);
  const __m256i wd8_end_pad_mask =
      _mm256_loadu_si256((__m256i *)wd8_right_padding_mask);
  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);

  // The core horizontal SIMD processes 32 input pixels of 2 rows
  // simultaneously to generate output corresponding to 2 rows. To streamline
  // the core loop and eliminate the need for conditional checks, the
  // remaining columns (16 or 8) are processed separately.
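  // When the width is a multiple of 32 only the core 32-pixel loop is needed;
  // otherwise the leftover columns are handled by the 16-pixel, 8-pixel and
  // scalar paths in the else branch.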
  if (filtered_length % 32 == 0) {
    for (int i = 0; i < height; i += 2) {
      int filter_offset = 0;
      int row_offset = 0;
      for (int j = 0; j < filtered_length; j += 32) {
        PROCESS_RESIZE_X_WD32
      }
    }
  } else {
    for (int i = 0; i < height; i += 2) {
      int filter_offset = 0;
      int remain_col = filtered_length;
      int row_offset = 0;
      // To avoid pixel over-read at the frame boundary, the core loop
      // processes 32 pixels only when enough pixels are available for its
      // loads. The remaining pixels are processed separately.
      for (int j = 0; j <= filtered_length - 32; j += 32) {
        if (remain_col == 34 || remain_col == 36) {
          break;
        }
        PROCESS_RESIZE_X_WD32
        remain_col -= 32;
      }

      int wd_processed = filtered_length - remain_col;
      // To avoid pixel over-read at the frame boundary, 16 pixels are
      // processed only when enough pixels are available for the load. The
      // remaining pixels are processed separately.
      if (remain_col > 15 && remain_col != 18 && remain_col != 20) {
        remain_col = filtered_length - wd_processed - 16;
        const int in_idx = i * in_stride + wd_processed;
        const int out_idx = (i * dst_stride) + wd_processed / 2;
        // a0 a1 --- a15
        __m128i row0 =
            _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]);
        // b0 b1 --- b15
        __m128i row1 = _mm_loadu_si128(
            (__m128i *)&input[in_idx + in_stride - filter_offset]);
        // a0 a1 --- a15 || b0 b1 --- b15
        __m256i r0 =
            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
        if (filter_offset == 0) {
          r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);
        }
        filter_offset = 3;
        const int is_last_cols16 = wd_processed + 16 == filtered_length;
        if (is_last_cols16) row_offset = ROW_OFFSET;

        // a16 a17 --- a23
        row0 = _mm_loadl_epi64(
            (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]);
        // b16 b17 --- b23
        row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride -
                                                 row_offset - filter_offset]);

        // a16-a23 x x x x | b16-b23 x x x x
        __m256i r1 =
            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);

        // Pad end pixels to the right, while processing the last pixels in
        // the row.
        if (is_last_cols16) {
          r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET),
                                   wd32_end_pad_mask);
        }

        // a0 a1 --- a15 || b0 b1 --- b15
        s0[0] = r0;
        // a2 a3 --- a17 || b2 b3 --- b17
        s0[1] = _mm256_alignr_epi8(r1, r0, 2);
        // a4 a5 --- a19 || b4 b5 --- b19
        s0[2] = _mm256_alignr_epi8(r1, r0, 4);
        // a6 a7 --- a21 || b6 b7 --- b21
        s0[3] = _mm256_alignr_epi8(r1, r0, 6);

        // result for 16 pixels (a0 to a15) of row0 and row1
        __m256i res_out_0[2];
        res_out_0[0] = res_out_0[1] = zero;
        resize_convolve(s0, coeffs_x, res_out_0);

        // r00-r07
        res_out_0[0] = _mm256_sra_epi32(
            _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
        // r10-r17
        res_out_0[1] = _mm256_sra_epi32(
            _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);
        // r00-r03 r10-r13 r04-r07 r14-r17
        __m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]);
        // r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17
        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
        // r00-r03 r10-r13 r04-r07 r14-r17
        __m128i low_result =
            CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8));
        // r00-r03 r04-r07 r10-r13 r14-r17
        low_result = _mm_shuffle_epi32(low_result, 0xd8);

        _mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result);
        _mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride],
                         _mm_unpackhi_epi64(low_result, low_result));
      }

      // To avoid pixel over-read at the frame boundary, 8 pixels are
      // processed only when enough pixels are available for the load. The
      // remaining pixels are processed by the C function.
      wd_processed = filtered_length - remain_col;
      if (remain_col > 7 && remain_col != 10 && remain_col != 12) {
        remain_col = filtered_length - wd_processed - 8;
        const int in_idx = i * in_stride + wd_processed - filter_offset;
        const int out_idx = (i * dst_stride) + wd_processed / 2;
        const int is_last_cols_8 = wd_processed + 8 == filtered_length;
        if (is_last_cols_8) row_offset = ROW_OFFSET;
        // a0 a1 --- a15
        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]);
        // b0 b1 --- b15
        __m128i row1 =
            _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]);
        // a0 a1 --- a15 || b0 b1 --- b15
        __m256i r0 =
            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);

        // Pad end pixels to the right, while processing the last pixels in
        // the row.
        if (is_last_cols_8)
          r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET),
                                   wd8_end_pad_mask);

        // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
        s0[0] = r0;
        // a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9
        s0[1] = _mm256_bsrli_epi128(r0, 2);
        // a4 a5 a6 a7 a8 a9 a10 a10 | b4 b5 b6 b7 b8 b9 b10 b10
        s0[2] = _mm256_bsrli_epi128(r0, 4);
        // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
        s0[3] = _mm256_bsrli_epi128(r0, 6);

        __m256i res_out_0[2];
        res_out_0[0] = res_out_0[1] = zero;
        resize_convolve(s0, coeffs_x, res_out_0);

        // r00 - r03 | r10 - r13
        __m256i res_out =
            _mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20);
        // r00 - r03 | r10 - r13
        res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits),
                                   round_shift_bits);
        // r00-r03 r00-r03 | r10-r13 r10-r13
        __m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out);
        // r00-r03 r00-r03 r00-r03 r00-r03 | r10-r13 r10-r13 r10-r13 r10-r13
        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);

        xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01));
        xx_storel_32(intbuf + out_idx + dst_stride,
                     _mm256_extracti128_si256(res_out_row01, 1));
      }

      wd_processed = filtered_length - remain_col;
      if (remain_col) {
        const int in_idx = (in_stride * i);
        const int out_idx = (wd_processed / 2) + width2 * i;

        down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
                      wd_processed);
        down2_symeven(input + in_idx + in_stride, filtered_length,
                      intbuf + out_idx + width2, wd_processed);
      }
    }
  }
}