/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"

/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
};
/* clang-format on */
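
// Each 32-byte row of the table above holds one bilinear weight pair
// (w0, w1) with w0 + w1 = 16, replicated so that _mm256_maddubs_epi16 can
// form w0*a + w1*b for interleaved pixel pairs. A scalar model of one
// filtered pixel, matching FILTER_SRC below (illustrative sketch only,
// kept out of the build):
#if 0
static uint8_t bilinear_scalar(uint8_t a, uint8_t b, int w0, int w1) {
  return (uint8_t)((w0 * a + w1 * b + 8) >> 4);  // maddubs, + pw8, >> 4
}
#endif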

#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */                     \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
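
// Scalar model of CALC_SUM_SSE_INSIDE_LOOP, accumulating the signed sum of
// differences and the sum of squared differences per pixel (illustrative
// sketch only, kept out of the build):
#if 0
static void calc_sum_sse_scalar(const uint8_t *s, const uint8_t *d, int n,
                                int *sum, unsigned int *sse) {
  for (int k = 0; k < n; k++) {
    const int diff = s[k] - d[k];         // source - dest on 16-bit lanes
    *sum += diff;                         // gathered in sum_reg
    *sse += (unsigned int)(diff * diff);  // madd_epi16 pairs into 32 bits
  }
}
#endif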

// final calculation of sum and sse
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
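
// The cmpgt-with-zero mask above performs 16->32-bit sign extension: the
// mask word is 0xFFFF exactly when the sum lane is negative, so unpacking
// a lane with its mask reproduces two's-complement widening. Scalar model
// (illustrative sketch only, kept out of the build):
#if 0
static int32_t widen_i16(int16_t x) {
  const uint16_t mask = (x < 0) ? 0xFFFF : 0;  // _mm256_cmpgt_epi16 result
  return (int32_t)(((uint32_t)mask << 16) | (uint16_t)x);
}
#endif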

// Functions related to sub pixel variance width 16
#define LOAD_SRC_DST_INSERT(src_stride, dst_stride)              \
  /* load source and destination of 2 rows and insert */         \
  src_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);        \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

#define AVG_NEXT_SRC_INSERT(src_reg, size_stride)                              \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1);              \
  /* average between current and next stride source */                         \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride)                            \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1);      \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define LOAD_SRC_NEXT_BYTE_INSERT                                    \
  /* load source and another source from the next row */            \
  src_reg = _mm256_inserti128_si256(                                 \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))),     \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);            \
  /* load source and next row source from 1 byte onwards */          \
  src_next_reg = _mm256_inserti128_si256(                            \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
      _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);

#define LOAD_DST_INSERT                                          \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

#define LOAD_SRC_MERGE_128BIT(filter)                        \
  __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));     \
  __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
  __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);  \
  __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);  \
  __m128i filter_128bit = _mm256_castsi256_si128(filter);    \
  __m128i pw8_128bit = _mm256_castsi256_si128(pw8);

#define FILTER_SRC_128BIT(filter)             \
  /* filter the source */                     \
  src_lo = _mm_maddubs_epi16(src_lo, filter); \
  src_hi = _mm_maddubs_epi16(src_hi, filter); \
                                              \
  /* add 8 to source */                       \
  src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
  src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
                                              \
  /* divide source by 16 */                   \
  src_lo = _mm_srai_epi16(src_lo, 4);         \
  src_hi = _mm_srai_epi16(src_hi, 4);
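
// The *_INSERT helpers pack two 16-pixel rows into one 256-bit register,
// current row in lane 0 and the row below in lane 1, so the 16xH loops can
// process two rows per iteration. A standalone model of the load pattern
// (illustrative sketch only, kept out of the build):
#if 0
static __m256i load_two_rows(const uint8_t *p, int stride) {
  const __m128i row0 = _mm_loadu_si128((const __m128i *)p);
  const __m128i row1 = _mm_loadu_si128((const __m128i *)(p + stride));
  return _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 1);
}
#endif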

// TODO([email protected]): These variance functions are macro-fied so we
// don't have to manually optimize the individual for-loops. We could save some
// binary size by optimizing the loops more carefully without duplicating the
// code with a macro.
#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height)                           \
  static inline int aom_sub_pixel_variance32x##height##_imp_avx2(             \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
    __m256i zero_reg;                                                         \
    int i, sum;                                                               \
    sum_reg = _mm256_setzero_si256();                                         \
    sse_reg = _mm256_setzero_si256();                                         \
    zero_reg = _mm256_setzero_si256();                                        \
                                                                              \
    /* x_offset = 0 and y_offset = 0 */                                       \
    if (x_offset == 0) {                                                      \
      if (y_offset == 0) {                                                    \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 0 and y_offset = 4 */                                   \
      } else if (y_offset == 4) {                                             \
        __m256i src_next_reg;                                                 \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, src_stride)                                   \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 0 and y_offset = bilin interpolation */                 \
      } else {                                                                \
        __m256i filter, pw8, src_next_reg;                                    \
                                                                              \
        y_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, src_stride)                                 \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
      }                                                                       \
      /* x_offset = 4  and y_offset = 0 */                                    \
    } else if (x_offset == 4) {                                               \
      if (y_offset == 0) {                                                    \
        __m256i src_next_reg;                                                 \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, 1)                                            \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 4  and y_offset = 4 */                                  \
      } else if (y_offset == 4) {                                             \
        __m256i src_next_reg, src_avg;                                        \
        /* load source and another source starting from the next byte */      \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        AVG_NEXT_SRC(src_reg, 1)                                              \
        for (i = 0; i < height; i++) {                                        \
          /* save current source average */                                   \
          src_avg = src_reg;                                                  \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, 1)                                            \
          /* average the previous and current source averages */              \
          src_avg = _mm256_avg_epu8(src_avg, src_reg);                        \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_avg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 4  and y_offset = bilin interpolation */                \
      } else {                                                                \
        __m256i filter, pw8, src_next_reg, src_avg;                           \
        y_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        /* load source and another source starting from the next byte */      \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        AVG_NEXT_SRC(src_reg, 1)                                              \
        for (i = 0; i < height; i++) {                                        \
          /* save current source average */                                   \
          src_avg = src_reg;                                                  \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, 1)                                            \
          MERGE_WITH_SRC(src_avg, src_reg)                                    \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride;                                                  \
        }                                                                     \
      }                                                                       \
      /* x_offset = bilin interpolation and y_offset = 0 */                   \
    } else {                                                                  \
      if (y_offset == 0) {                                                    \
        __m256i filter, pw8, src_next_reg;                                    \
        x_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, 1)                                          \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = bilin interpolation and y_offset = 4 */                 \
      } else if (y_offset == 4) {                                             \
        __m256i filter, pw8, src_next_reg, src_pack;                          \
        x_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        MERGE_NEXT_SRC(src_reg, 1)                                            \
        FILTER_SRC(filter)                                                    \
        /* convert 16-bit values back to 8-bit in each low and high lane */   \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
        for (i = 0; i < height; i++) {                                        \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, 1)                                          \
          FILTER_SRC(filter)                                                  \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
          /* average the previous pack with the current */                    \
          src_pack = _mm256_avg_epu8(src_pack, src_reg);                      \
          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src_pack = src_reg;                                                 \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
         */                                                                   \
      } else {                                                                \
        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
        x_offset <<= 5;                                                       \
        xfilter = _mm256_load_si256(                                          \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        y_offset <<= 5;                                                       \
        yfilter = _mm256_load_si256(                                          \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        /* load source and another source starting from the next byte */      \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        MERGE_NEXT_SRC(src_reg, 1)                                            \
                                                                              \
        FILTER_SRC(xfilter)                                                   \
        /* convert 16-bit values back to 8-bit in each low and high lane */   \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
        for (i = 0; i < height; i++) {                                        \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, 1)                                          \
          FILTER_SRC(xfilter)                                                 \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
          /* merge the previous pack with the current pack */                 \
          MERGE_WITH_SRC(src_pack, src_reg)                                   \
          /* filter the source */                                             \
          FILTER_SRC(yfilter)                                                 \
          src_pack = src_reg;                                                 \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride;                                                  \
        }                                                                     \
      }                                                                       \
    }                                                                         \
    CALC_SUM_AND_SSE                                                          \
    _mm256_zeroupper();                                                       \
    return sum;                                                               \
  }                                                                           \
  unsigned int aom_sub_pixel_variance32x##height##_avx2(                      \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
    const int sum = aom_sub_pixel_variance32x##height##_imp_avx2(             \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sse);           \
    return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height));   \
  }

MAKE_SUB_PIXEL_VAR_32XH(64, 6)
MAKE_SUB_PIXEL_VAR_32XH(32, 5)
MAKE_SUB_PIXEL_VAR_32XH(16, 4)
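
// The wrapper computes variance = sse - se*se / N over the N = 32 << log2height
// pixels; since se*se >= 0, the division is the shift by (5 + log2height).
// Scalar model of the whole computation (illustrative sketch only, kept out
// of the build):
#if 0
static unsigned int variance_scalar(const uint8_t *s, int ss, const uint8_t *d,
                                    int ds, int w, int h) {
  int64_t se = 0, sse = 0;
  for (int r = 0; r < h; r++, s += ss, d += ds) {
    for (int c = 0; c < w; c++) {
      const int diff = s[c] - d[c];
      se += diff;
      sse += diff * diff;
    }
  }
  return (unsigned int)(sse - (se * se) / (w * h));
}
#endif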

#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2)                \
  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                    \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {        \
    unsigned int sse = 0;                                                 \
    int se = 0;                                                           \
    for (int i = 0; i < (w / wf); ++i) {                                  \
      const uint8_t *src_ptr = src;                                       \
      const uint8_t *dst_ptr = dst;                                       \
      for (int j = 0; j < (h / hf); ++j) {                                \
        unsigned int sse2;                                                \
        const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2(     \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
            &sse2);                                                       \
        dst_ptr += hf * dst_stride;                                       \
        src_ptr += hf * src_stride;                                       \
        se += se2;                                                        \
        sse += sse2;                                                      \
      }                                                                   \
      src += wf;                                                          \
      dst += wf;                                                          \
    }                                                                     \
    *sse_ptr = sse;                                                       \
    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
  }

// Note: hf = AOMMIN(h, 64) caps the tile height to avoid overflow in the
// helper's accumulators.
AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7)
AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6)
AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)
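
// Why a cap of 64 rows is enough (an assumption inferred from the helper's
// 16-bit sum_reg lanes, not stated in the source): CALC_SUM_SSE_INSIDE_LOOP
// adds two pixel differences into each 16-bit lane per row, so the worst
// case over a 64-row tile is 64 * 2 * 255 = 32640, just inside int16_t
// range. Illustrative bound check only, kept out of the build:
#if 0
static int sum_lane_fits_int16(void) {
  const int max_lane_sum = 64 * 2 * 255;  // rows * diffs per lane * max diff
  return max_lane_sum <= 32767;           // 32640: no 16-bit lane overflow
}
#endif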

#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height)                           \
  unsigned int aom_sub_pixel_variance16x##height##_avx2(                      \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
    __m256i zero_reg;                                                         \
    int i, sum;                                                               \
    sum_reg = _mm256_setzero_si256();                                         \
    sse_reg = _mm256_setzero_si256();                                         \
    zero_reg = _mm256_setzero_si256();                                        \
                                                                              \
    /* x_offset = 0 and y_offset = 0 */                                       \
    if (x_offset == 0) {                                                      \
      if (y_offset == 0) {                                                    \
        for (i = 0; i < height; i += 2) {                                     \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += (src_stride << 1);                                           \
          dst += (dst_stride << 1);                                           \
        }                                                                     \
        /* x_offset = 0 and y_offset = 4 */                                   \
      } else if (y_offset == 4) {                                             \
        __m256i src_next_reg;                                                 \
        for (i = 0; i < height; i += 2) {                                     \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
          AVG_NEXT_SRC_INSERT(src_reg, src_stride)                            \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += (src_stride << 1);                                           \
          dst += (dst_stride << 1);                                           \
        }                                                                     \
        /* x_offset = 0 and y_offset = bilin interpolation */                 \
      } else {                                                                \
        __m256i filter, pw8, src_next_reg;                                    \
        y_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        for (i = 0; i < height; i += 2) {                                     \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
          MERGE_NEXT_SRC_INSERT(src_reg, src_stride)                          \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += (src_stride << 1);                                           \
          dst += (dst_stride << 1);                                           \
        }                                                                     \
      }                                                                       \
      /* x_offset = 4  and y_offset = 0 */                                    \
    } else if (x_offset == 4) {                                               \
      if (y_offset == 0) {                                                    \
        __m256i src_next_reg;                                                 \
        for (i = 0; i < height; i += 2) {                                     \
          LOAD_SRC_NEXT_BYTE_INSERT                                           \
          LOAD_DST_INSERT                                                     \
          /* average between current and next stride source */                \
          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);                   \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += (src_stride << 1);                                           \
          dst += (dst_stride << 1);                                           \
        }                                                                     \
        /* x_offset = 4  and y_offset = 4 */                                  \
      } else if (y_offset == 4) {                                             \
        __m256i src_next_reg, src_avg, src_temp;                              \
        /* load and insert source and next row source */                      \
        LOAD_SRC_NEXT_BYTE_INSERT                                             \
        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                     \
        src += src_stride << 1;                                               \
        for (i = 0; i < height - 2; i += 2) {                                 \
          LOAD_SRC_NEXT_BYTE_INSERT                                           \
          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);              \
          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);  \
          src_temp = _mm256_avg_epu8(src_avg, src_temp);                      \
          LOAD_DST_INSERT                                                     \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_temp, zero_reg)                                  \
          /* save current source average */                                   \
          src_avg = src_next_reg;                                             \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride << 1;                                             \
          src += src_stride << 1;                                             \
        }                                                                     \
        /* last 2 rows processing happens here */                             \
        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                \
        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));            \
        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                       \
        src_next_reg = _mm256_permute2x128_si256(                             \
            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                \
        LOAD_DST_INSERT                                                       \
        src_avg = _mm256_avg_epu8(src_avg, src_next_reg);                     \
        MERGE_WITH_SRC(src_avg, zero_reg)                                     \
        CALC_SUM_SSE_INSIDE_LOOP                                              \
      } else {                                                                \
        /* x_offset = 4  and y_offset = bilin interpolation */                \
        __m256i filter, pw8, src_next_reg, src_avg, src_temp;                 \
        y_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        /* load and insert source and next row source */                      \
        LOAD_SRC_NEXT_BYTE_INSERT                                             \
        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                     \
        src += src_stride << 1;                                               \
        for (i = 0; i < height - 2; i += 2) {                                 \
          LOAD_SRC_NEXT_BYTE_INSERT                                           \
          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);              \
          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);  \
          LOAD_DST_INSERT                                                     \
          MERGE_WITH_SRC(src_avg, src_temp)                                   \
          /* save current source average */                                   \
          src_avg = src_next_reg;                                             \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride << 1;                                             \
          src += src_stride << 1;                                             \
        }                                                                     \
        /* last 2 rows processing happens here */                             \
        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                \
        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));            \
        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                       \
        src_next_reg = _mm256_permute2x128_si256(                             \
            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                \
        LOAD_DST_INSERT                                                       \
        MERGE_WITH_SRC(src_avg, src_next_reg)                                 \
        FILTER_SRC(filter)                                                    \
        CALC_SUM_SSE_INSIDE_LOOP                                              \
      }                                                                       \
      /* x_offset = bilin interpolation and y_offset = 0 */                   \
    } else {                                                                  \
      if (y_offset == 0) {                                                    \
        __m256i filter, pw8, src_next_reg;                                    \
        x_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        for (i = 0; i < height; i += 2) {                                     \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
          MERGE_NEXT_SRC_INSERT(src_reg, 1)                                   \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += (src_stride << 1);                                           \
          dst += (dst_stride << 1);                                           \
        }                                                                     \
        /* x_offset = bilin interpolation and y_offset = 4 */                 \
      } else if (y_offset == 4) {                                             \
        __m256i filter, pw8, src_next_reg, src_pack;                          \
        x_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        /* load and insert source and next row source */                      \
        LOAD_SRC_NEXT_BYTE_INSERT                                             \
        MERGE_WITH_SRC(src_reg, src_next_reg)                                 \
        FILTER_SRC(filter)                                                    \
        /* convert 16-bit values back to 8-bit in each low and high lane */   \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
        src += src_stride << 1;                                               \
        for (i = 0; i < height - 2; i += 2) {                                 \
          LOAD_SRC_NEXT_BYTE_INSERT                                           \
          LOAD_DST_INSERT                                                     \
          MERGE_WITH_SRC(src_reg, src_next_reg)                               \
          FILTER_SRC(filter)                                                  \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);  \
          /* average the previous pack with the current */                    \
          src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                 \
          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src_pack = src_reg;                                                 \
          src += src_stride << 1;                                             \
          dst += dst_stride << 1;                                             \
        }                                                                     \
        /* last 2 rows processing happens here */                             \
        LOAD_SRC_MERGE_128BIT(filter)                                         \
        LOAD_DST_INSERT                                                       \
        FILTER_SRC_128BIT(filter_128bit)                                      \
        src_reg_0 = _mm_packus_epi16(src_lo, src_hi);                         \
        src_next_reg = _mm256_permute2x128_si256(                             \
            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);               \
        /* average the previous pack with the current */                      \
        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                   \
        MERGE_WITH_SRC(src_pack, zero_reg)                                    \
        CALC_SUM_SSE_INSIDE_LOOP                                              \
      } else {                                                                \
        /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
         */                                                                   \
        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
        x_offset <<= 5;                                                       \
        xfilter = _mm256_load_si256(                                          \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        y_offset <<= 5;                                                       \
        yfilter = _mm256_load_si256(                                          \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        /* load and insert source and next row source */                      \
        LOAD_SRC_NEXT_BYTE_INSERT                                             \
        MERGE_WITH_SRC(src_reg, src_next_reg)                                 \
        FILTER_SRC(xfilter)                                                   \
        /* convert 16-bit values back to 8-bit in each low and high lane */   \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
        src += src_stride << 1;                                               \
        for (i = 0; i < height - 2; i += 2) {                                 \
          LOAD_SRC_NEXT_BYTE_INSERT                                           \
          LOAD_DST_INSERT                                                     \
          MERGE_WITH_SRC(src_reg, src_next_reg)                               \
          FILTER_SRC(xfilter)                                                 \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);  \
          /* merge the previous pack with the current */                      \
          MERGE_WITH_SRC(src_pack, src_next_reg)                              \
          /* filter the source */                                             \
          FILTER_SRC(yfilter)                                                 \
          src_pack = src_reg;                                                 \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride << 1;                                             \
          dst += dst_stride << 1;                                             \
        }                                                                     \
        /* last 2 rows processing happens here */                             \
        LOAD_SRC_MERGE_128BIT(xfilter)                                        \
        LOAD_DST_INSERT                                                       \
        FILTER_SRC_128BIT(filter_128bit)                                      \
        src_reg_0 = _mm_packus_epi16(src_lo, src_hi);                         \
        src_next_reg = _mm256_permute2x128_si256(                             \
            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);               \
        MERGE_WITH_SRC(src_pack, src_next_reg)                                \
        FILTER_SRC(yfilter)                                                   \
        CALC_SUM_SSE_INSIDE_LOOP                                              \
      }                                                                       \
    }                                                                         \
    CALC_SUM_AND_SSE                                                          \
    _mm256_zeroupper();                                                       \
    return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height));   \
  }

MAKE_SUB_PIXEL_VAR_16XH(32, 5)
MAKE_SUB_PIXEL_VAR_16XH(16, 4)
MAKE_SUB_PIXEL_VAR_16XH(8, 3)
#if !CONFIG_REALTIME_ONLY
MAKE_SUB_PIXEL_VAR_16XH(64, 6)
MAKE_SUB_PIXEL_VAR_16XH(4, 2)
#endif
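
// In the 16xH paths that average or filter vertically, rows live two per
// register, so the "row below" view is built without reloading memory:
// _mm256_permute2x128_si256(prev, next, 0x21) concatenates the high lane of
// prev with the low lane of next, i.e. rows (r+1, r+2) when prev holds
// (r, r+1) and next holds (r+2, r+3). Illustrative sketch only, kept out of
// the build:
#if 0
static __m256i rows_below(__m256i prev, __m256i next) {
  return _mm256_permute2x128_si256(prev, next, 0x21);  // {prev.hi, next.lo}
}
#endif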
649 
650 #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height)                       \
651   static int sub_pixel_avg_variance32x##height##_imp_avx2(                    \
652       const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
653       const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
654       unsigned int *sse) {                                                    \
655     __m256i sec_reg;                                                          \
656     __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
657     __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
658     __m256i zero_reg;                                                         \
659     int i, sum;                                                               \
660     sum_reg = _mm256_setzero_si256();                                         \
661     sse_reg = _mm256_setzero_si256();                                         \
662     zero_reg = _mm256_setzero_si256();                                        \
663                                                                               \
664     /* x_offset = 0 and y_offset = 0 */                                       \
665     if (x_offset == 0) {                                                      \
666       if (y_offset == 0) {                                                    \
667         for (i = 0; i < height; i++) {                                        \
668           LOAD_SRC_DST                                                        \
669           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
670           src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
671           sec += sec_stride;                                                  \
672           /* expend each byte to 2 bytes */                                   \
673           MERGE_WITH_SRC(src_reg, zero_reg)                                   \
674           CALC_SUM_SSE_INSIDE_LOOP                                            \
675           src += src_stride;                                                  \
676           dst += dst_stride;                                                  \
677         }                                                                     \
678       } else if (y_offset == 4) {                                             \
679         __m256i src_next_reg;                                                 \
680         for (i = 0; i < height; i++) {                                        \
681           LOAD_SRC_DST                                                        \
682           AVG_NEXT_SRC(src_reg, src_stride)                                   \
683           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
684           src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
685           sec += sec_stride;                                                  \
686           /* expend each byte to 2 bytes */                                   \
687           MERGE_WITH_SRC(src_reg, zero_reg)                                   \
688           CALC_SUM_SSE_INSIDE_LOOP                                            \
689           src += src_stride;                                                  \
690           dst += dst_stride;                                                  \
691         }                                                                     \
692         /* x_offset = 0 and y_offset = bilin interpolation */                 \
693       } else {                                                                \
694         __m256i filter, pw8, src_next_reg;                                    \
695                                                                               \
696         y_offset <<= 5;                                                       \
697         filter = _mm256_load_si256(                                           \
698             (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
699         pw8 = _mm256_set1_epi16(8);                                           \
700         for (i = 0; i < height; i++) {                                        \
701           LOAD_SRC_DST                                                        \
702           MERGE_NEXT_SRC(src_reg, src_stride)                                 \
703           FILTER_SRC(filter)                                                  \
704           src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
705           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
706           src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
707           sec += sec_stride;                                                  \
708           MERGE_WITH_SRC(src_reg, zero_reg)                                   \
709           CALC_SUM_SSE_INSIDE_LOOP                                            \
710           src += src_stride;                                                  \
711           dst += dst_stride;                                                  \
712         }                                                                     \
713       }                                                                       \
714       /* x_offset = 4  and y_offset = 0 */                                    \
715     } else if (x_offset == 4) {                                               \
716       if (y_offset == 0) {                                                    \
717         __m256i src_next_reg;                                                 \
718         for (i = 0; i < height; i++) {                                        \
719           LOAD_SRC_DST                                                        \
720           AVG_NEXT_SRC(src_reg, 1)                                            \
721           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
722           src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
723           sec += sec_stride;                                                  \
724           /* expand each byte to 2 bytes */                                   \
725           MERGE_WITH_SRC(src_reg, zero_reg)                                   \
726           CALC_SUM_SSE_INSIDE_LOOP                                            \
727           src += src_stride;                                                  \
728           dst += dst_stride;                                                  \
729         }                                                                     \
730         /* x_offset = 4  and y_offset = 4 */                                  \
731       } else if (y_offset == 4) {                                             \
732         __m256i src_next_reg, src_avg;                                        \
733         /* load source and another source starting from the next */           \
734         /* following byte */                                                  \
735         src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
736         AVG_NEXT_SRC(src_reg, 1)                                              \
737         for (i = 0; i < height; i++) {                                        \
738           /* save current source average */                                   \
739           src_avg = src_reg;                                                  \
740           src += src_stride;                                                  \
741           LOAD_SRC_DST                                                        \
742           AVG_NEXT_SRC(src_reg, 1)                                            \
743           /* average between previous average to current average */           \
744           src_avg = _mm256_avg_epu8(src_avg, src_reg);                        \
745           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
746           src_avg = _mm256_avg_epu8(src_avg, sec_reg);                        \
747           sec += sec_stride;                                                  \
748           /* zero-extend each byte to 16 bits */                              \
749           MERGE_WITH_SRC(src_avg, zero_reg)                                   \
750           CALC_SUM_SSE_INSIDE_LOOP                                            \
751           dst += dst_stride;                                                  \
752         }                                                                     \
753         /* x_offset = 4  and y_offset = bilin interpolation */                \
754       } else {                                                                \
755         __m256i filter, pw8, src_next_reg, src_avg;                           \
756         y_offset <<= 5;                                                       \
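        /* each filter entry in bilinear_filters_avx2 spans 32 bytes, so     \
           scale the 3-bit offset into a byte index */                       \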
757         filter = _mm256_load_si256(                                           \
758             (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
759         pw8 = _mm256_set1_epi16(8);                                           \
760         /* load the source and a second source starting one byte */          \
761         /* ahead (i.e. the next horizontal sample) */                         \
762         src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
763         AVG_NEXT_SRC(src_reg, 1)                                              \
764         for (i = 0; i < height; i++) {                                        \
765           /* save current source average */                                   \
766           src_avg = src_reg;                                                  \
767           src += src_stride;                                                  \
768           LOAD_SRC_DST                                                        \
769           AVG_NEXT_SRC(src_reg, 1)                                            \
770           MERGE_WITH_SRC(src_avg, src_reg)                                    \
771           FILTER_SRC(filter)                                                  \
772           src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
773           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
774           src_avg = _mm256_avg_epu8(src_avg, sec_reg);                        \
775           /* zero-extend each byte to 16 bits */                              \
776           MERGE_WITH_SRC(src_avg, zero_reg)                                   \
777           sec += sec_stride;                                                  \
778           CALC_SUM_SSE_INSIDE_LOOP                                            \
779           dst += dst_stride;                                                  \
780         }                                                                     \
781       }                                                                       \
782       /* x_offset = bilin interpolation and y_offset = 0 */                   \
783     } else {                                                                  \
784       if (y_offset == 0) {                                                    \
785         __m256i filter, pw8, src_next_reg;                                    \
786         x_offset <<= 5;                                                       \
787         filter = _mm256_load_si256(                                           \
788             (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
789         pw8 = _mm256_set1_epi16(8);                                           \
790         for (i = 0; i < height; i++) {                                        \
791           LOAD_SRC_DST                                                        \
792           MERGE_NEXT_SRC(src_reg, 1)                                          \
793           FILTER_SRC(filter)                                                  \
794           src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
795           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
796           src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
797           MERGE_WITH_SRC(src_reg, zero_reg)                                   \
798           sec += sec_stride;                                                  \
799           CALC_SUM_SSE_INSIDE_LOOP                                            \
800           src += src_stride;                                                  \
801           dst += dst_stride;                                                  \
802         }                                                                     \
803         /* x_offset = bilin interpolation and y_offset = 4 */                 \
804       } else if (y_offset == 4) {                                             \
805         __m256i filter, pw8, src_next_reg, src_pack;                          \
806         x_offset <<= 5;                                                       \
807         filter = _mm256_load_si256(                                           \
808             (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
809         pw8 = _mm256_set1_epi16(8);                                           \
810         src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
811         MERGE_NEXT_SRC(src_reg, 1)                                            \
812         FILTER_SRC(filter)                                                    \
813         /* pack the 16-bit results to 8 bits within each 128-bit lane */      \
814         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
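        /* src_pack pipelines the horizontally filtered previous row, so     \
           each iteration below filters only one new row */                  \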
815         for (i = 0; i < height; i++) {                                        \
816           src += src_stride;                                                  \
817           LOAD_SRC_DST                                                        \
818           MERGE_NEXT_SRC(src_reg, 1)                                          \
819           FILTER_SRC(filter)                                                  \
820           src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
821           /* average the previous packed row with the current one */          \
822           src_pack = _mm256_avg_epu8(src_pack, src_reg);                      \
823           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
824           src_pack = _mm256_avg_epu8(src_pack, sec_reg);                      \
825           sec += sec_stride;                                                  \
826           MERGE_WITH_SRC(src_pack, zero_reg)                                  \
827           src_pack = src_reg;                                                 \
828           CALC_SUM_SSE_INSIDE_LOOP                                            \
829           dst += dst_stride;                                                  \
830         }                                                                     \
831         /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
832          */                                                                   \
833       } else {                                                                \
834         __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
835         x_offset <<= 5;                                                       \
836         xfilter = _mm256_load_si256(                                          \
837             (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
838         y_offset <<= 5;                                                       \
839         yfilter = _mm256_load_si256(                                          \
840             (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
841         pw8 = _mm256_set1_epi16(8);                                           \
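        /* fully sub-pel case: filter horizontally with xfilter, pack back   \
           to 8 bits, then filter adjacent packed rows vertically with       \
           yfilter */                                                        \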
842         /* load the source and a second source starting one byte */          \
843         /* ahead (i.e. the next horizontal sample) */                         \
844         src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
845         MERGE_NEXT_SRC(src_reg, 1)                                            \
846                                                                               \
847         FILTER_SRC(xfilter)                                                   \
848         /* pack the 16-bit results to 8 bits within each 128-bit lane */      \
849         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
850         for (i = 0; i < height; i++) {                                        \
851           src += src_stride;                                                  \
852           LOAD_SRC_DST                                                        \
853           MERGE_NEXT_SRC(src_reg, 1)                                          \
854           FILTER_SRC(xfilter)                                                 \
855           src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
856           /* interleave the previous packed row with the current one */       \
857           MERGE_WITH_SRC(src_pack, src_reg)                                   \
858           /* filter the source */                                             \
859           FILTER_SRC(yfilter)                                                 \
860           src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);             \
861           sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
862           src_pack = _mm256_avg_epu8(src_pack, sec_reg);                      \
863           MERGE_WITH_SRC(src_pack, zero_reg)                                  \
864           src_pack = src_reg;                                                 \
865           sec += sec_stride;                                                  \
866           CALC_SUM_SSE_INSIDE_LOOP                                            \
867           dst += dst_stride;                                                  \
868         }                                                                     \
869       }                                                                       \
870     }                                                                         \
871     CALC_SUM_AND_SSE                                                          \
872     _mm256_zeroupper();                                                       \
873     return sum;                                                               \
874   }                                                                           \
875   unsigned int aom_sub_pixel_avg_variance32x##height##_avx2(                  \
876       const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
877       const uint8_t *dst, int dst_stride, unsigned int *sse,                  \
878       const uint8_t *sec_ptr) {                                               \
879     const int sum = sub_pixel_avg_variance32x##height##_imp_avx2(             \
880         src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32,    \
881         sse);                                                                 \
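    /* variance = sse - sum^2 / N, N = 32 * height = 1 << (5 + log2height) */ \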
882     return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height));   \
883   }
884 
885 MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6)
886 MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5)
887 MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
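
/*
 * For reference, a minimal scalar sketch of what one 32xH helper call above
 * computes, assuming eighth-pel offsets in [0, 7] and a second predictor
 * whose stride equals the block width. bil() and ref_avg_var_32xh() are
 * illustrative names, not library API, and the sketch is kept out of the
 * build; unlike the dispatching SIMD paths it always reads one extra row
 * and column.
 */
#if 0
#include <stdint.h>

static uint8_t bil(uint8_t a, uint8_t b, int k) {
  /* bilinear taps (16 - 2k, 2k) with the same +8, >>4 rounding as
     FILTER_SRC; at k == 4 this reduces to (a + b + 1) >> 1, i.e. avg_epu8 */
  return (uint8_t)((a * (16 - 2 * k) + b * 2 * k + 8) >> 4);
}

static int ref_avg_var_32xh(const uint8_t *src, int src_stride, int x_offset,
                            int y_offset, const uint8_t *dst, int dst_stride,
                            const uint8_t *sec, int sec_stride, int height,
                            unsigned int *sse) {
  int sum = 0;
  uint32_t sse32 = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < 32; j++) {
      /* horizontal tap on this row and the next, then the vertical tap;
         like the AVX2 code, intermediates are rounded back to 8 bits */
      const uint8_t h0 = bil(src[j], src[j + 1], x_offset);
      const uint8_t h1 =
          bil(src[src_stride + j], src[src_stride + j + 1], x_offset);
      uint8_t p = bil(h0, h1, y_offset);
      /* comp-avg with the second predictor */
      p = (uint8_t)((p + sec[j] + 1) >> 1);
      const int d = p - dst[j];
      sum += d;
      sse32 += (uint32_t)(d * d);
    }
    src += src_stride;
    dst += dst_stride;
    sec += sec_stride;
  }
  *sse = sse32;
  return sum;
}
#endif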
888 
889 #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2)            \
890   unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
891       const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
892       const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
893       const uint8_t *sec) {                                               \
894     unsigned int sse = 0;                                                 \
895     int se = 0;                                                           \
896     for (int i = 0; i < (w / wf); ++i) {                                  \
897       const uint8_t *src_ptr = src;                                       \
898       const uint8_t *dst_ptr = dst;                                       \
899       const uint8_t *sec_ptr = sec;                                       \
900       for (int j = 0; j < (h / hf); ++j) {                                \
901         unsigned int sse2;                                                \
902         const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2(     \
903             src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
904             sec_ptr, w, &sse2);                                           \
905         dst_ptr += hf * dst_stride;                                       \
906         src_ptr += hf * src_stride;                                       \
907         sec_ptr += hf * w;                                                \
908         se += se2;                                                        \
909         sse += sse2;                                                      \
910       }                                                                   \
911       src += wf;                                                          \
912       dst += wf;                                                          \
913       sec += wf;                                                          \
914     }                                                                     \
915     *sse_ptr = sse;                                                       \
916     return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
917   }
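
/*
 * Worked example of the tiling above, using only the macro arguments:
 * aom_sub_pixel_avg_variance128x128_avx2 makes (128/32) * (128/64) = 8
 * helper calls on 32x64 tiles, accumulates se and sse across the tiles,
 * and returns sse - se^2 / 2^(wlog2 + hlog2) = sse - se^2 / 16384.
 *
 * A hypothetical call (buffer contents omitted; the sub-pel paths may read
 * one extra row and column past the block, hence the padded source):
 */
#if 0
  DECLARE_ALIGNED(32, uint8_t, src_buf[65 * 65]);
  DECLARE_ALIGNED(32, uint8_t, ref_buf[64 * 64]);
  DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]);
  unsigned int sse;
  const unsigned int var = aom_sub_pixel_avg_variance64x64_avx2(
      src_buf, 65, 5, 2, ref_buf, 64, &sse, second_pred);
#endif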
918 
919 // Note: hf = AOMMIN(h, 64); capping the tile height keeps the helper's accumulators from overflowing.
920 AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7)
921 AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6)
922 AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7)
923 AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6)
924 AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5)
925