/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"

/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
/* clang-format on */
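
// Layout note (editor's summary): the table above holds the eight 2-tap
// bilinear filter pairs {16 - 2 * offset, 2 * offset} for offset = 0..7, each
// pair broadcast across a full 32-byte AVX2 register so it can be fed
// directly to _mm256_maddubs_epi16. The byte offset of filter `offset` is
// therefore offset << 5, which is what the `x_offset <<= 5` / `y_offset <<= 5`
// adjustments below compute. Offsets 0 (copy) and 4 (half-pel average) are
// handled by cheaper dedicated paths and never index the table.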

#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
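
// Editor's sketch of the arithmetic above: with the source bytes interleaved
// as (a, b) pairs (see MERGE_WITH_SRC / MERGE_NEXT_SRC below) and a filter
// pair (f0, f1), each output sample is the rounded bilinear interpolation
//
//   out = (a * f0 + b * f1 + 8) >> 4
//
// i.e. a multiply-add of unsigned bytes against the filter taps, plus the
// rounding constant pw8, followed by an arithmetic shift by 4 (the taps sum
// to 16).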

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
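
// Editor's note: sum_reg accumulates the signed 16-bit (src - dst)
// differences lane-wise, while sse_reg accumulates their squares, widened to
// 32 bits by _mm256_madd_epi16. Keeping the running sum in 16-bit lanes is
// safe here because each lane sees at most 64 differences bounded by +/-255
// (64 * 255 = 16320 < 32767); the kernels below are only instantiated for
// heights up to 64.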

// final calculation to sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
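
// Editor's note: _mm256_cmpgt_epi16(zero, sum) yields 0xFFFF for negative
// lanes and 0 otherwise, so interleaving sum_reg with res_cmp sign-extends
// the 16-bit partial sums to 32 bits before the horizontal reduction. The
// byte shifts then fold each 128-bit lane down to a single 32-bit value, and
// the final cast/extract pairs add the low and high lanes together.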

// Functions related to sub pixel variance width 16
#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
  /* load source and destination of 2 rows and insert */ \
  src_reg = _mm256_inserti128_si256( \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
  dst_reg = _mm256_inserti128_si256( \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \
  src_next_reg = _mm256_inserti128_si256( \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \
  src_next_reg = _mm256_inserti128_si256( \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define LOAD_SRC_NEXT_BYTE_INSERT \
  /* load source and another source from next row */ \
  src_reg = _mm256_inserti128_si256( \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
  /* load source and next row source from 1 byte onwards */ \
  src_next_reg = _mm256_inserti128_si256( \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
      _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);

#define LOAD_DST_INSERT \
  dst_reg = _mm256_inserti128_si256( \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

#define LOAD_SRC_MERGE_128BIT(filter) \
  __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
  __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
  __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \
  __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \
  __m128i filter_128bit = _mm256_castsi256_si128(filter); \
  __m128i pw8_128bit = _mm256_castsi256_si128(pw8);

#define FILTER_SRC_128BIT(filter) \
  /* filter the source */ \
  src_lo = _mm_maddubs_epi16(src_lo, filter); \
  src_hi = _mm_maddubs_epi16(src_hi, filter); \
  \
  /* add 8 to source */ \
  src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
  src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
  \
  /* divide source by 16 */ \
  src_lo = _mm_srai_epi16(src_lo, 4); \
  src_hi = _mm_srai_epi16(src_hi, 4);
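
// Editor's note: the *_INSERT helpers pack two consecutive 16-byte rows into
// the low and high 128-bit lanes of one AVX2 register, so the 16-wide kernels
// below process two rows per iteration. The 128-bit LOAD_SRC_MERGE_128BIT /
// FILTER_SRC_128BIT pair is used to finish the trailing rows of those
// kernels, where only one additional source row still needs to be loaded and
// filtered.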

// TODO([email protected]): These variance functions are macro-fied so we
// don't have to manually optimize the individual for-loops. We could save some
// binary size by optimizing the loops more carefully without duplicating the
// code with a macro.
#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) \
  static inline int aom_sub_pixel_variance32x##height##_imp_avx2( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse) { \
    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
    __m256i zero_reg; \
    int i, sum; \
    sum_reg = _mm256_setzero_si256(); \
    sse_reg = _mm256_setzero_si256(); \
    zero_reg = _mm256_setzero_si256(); \
    \
    /* x_offset = 0 and y_offset = 0 */ \
    if (x_offset == 0) { \
      if (y_offset == 0) { \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
        /* x_offset = 0 and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i src_next_reg; \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, src_stride) \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
        /* x_offset = 0 and y_offset = bilin interpolation */ \
      } else { \
        __m256i filter, pw8, src_next_reg; \
        \
        y_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, src_stride) \
          FILTER_SRC(filter) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
      } \
      /* x_offset = 4 and y_offset = 0 */ \
    } else if (x_offset == 4) { \
      if (y_offset == 0) { \
        __m256i src_next_reg; \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, 1) \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
        /* x_offset = 4 and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i src_next_reg, src_avg; \
        /* load source and another source starting from the next */ \
        /* following byte */ \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        AVG_NEXT_SRC(src_reg, 1) \
        for (i = 0; i < height; i++) { \
          src_avg = src_reg; \
          src += src_stride; \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, 1) \
          /* average between previous average and current average */ \
          src_avg = _mm256_avg_epu8(src_avg, src_reg); \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_avg, zero_reg) \
          /* save current source average */ \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride; \
        } \
        /* x_offset = 4 and y_offset = bilin interpolation */ \
      } else { \
        __m256i filter, pw8, src_next_reg, src_avg; \
        y_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        /* load source and another source starting from the next */ \
        /* following byte */ \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        AVG_NEXT_SRC(src_reg, 1) \
        for (i = 0; i < height; i++) { \
          /* save current source average */ \
          src_avg = src_reg; \
          src += src_stride; \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, 1) \
          MERGE_WITH_SRC(src_avg, src_reg) \
          FILTER_SRC(filter) \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride; \
        } \
      } \
      /* x_offset = bilin interpolation and y_offset = 0 */ \
    } else { \
      if (y_offset == 0) { \
        __m256i filter, pw8, src_next_reg; \
        x_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, 1) \
          FILTER_SRC(filter) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
        /* x_offset = bilin interpolation and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i filter, pw8, src_next_reg, src_pack; \
        x_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        MERGE_NEXT_SRC(src_reg, 1) \
        FILTER_SRC(filter) \
        /* convert each 16 bit to 8 bit in each low and high lane source */ \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
        for (i = 0; i < height; i++) { \
          src += src_stride; \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, 1) \
          FILTER_SRC(filter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          /* average between previous pack and the current */ \
          src_pack = _mm256_avg_epu8(src_pack, src_reg); \
          MERGE_WITH_SRC(src_pack, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src_pack = src_reg; \
          dst += dst_stride; \
        } \
        /* x_offset = bilin interpolation and y_offset = bilin interpolation \
         */ \
      } else { \
        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
        x_offset <<= 5; \
        xfilter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        y_offset <<= 5; \
        yfilter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        /* load source and another source starting from the next */ \
        /* following byte */ \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        MERGE_NEXT_SRC(src_reg, 1) \
        \
        FILTER_SRC(xfilter) \
        /* convert each 16 bit to 8 bit in each low and high lane source */ \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
        for (i = 0; i < height; i++) { \
          src += src_stride; \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, 1) \
          FILTER_SRC(xfilter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          /* merge previous pack with current pack source */ \
          MERGE_WITH_SRC(src_pack, src_reg) \
          /* filter the source */ \
          FILTER_SRC(yfilter) \
          src_pack = src_reg; \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride; \
        } \
      } \
    } \
    CALC_SUM_AND_SSE \
    _mm256_zeroupper(); \
    return sum; \
  } \
  unsigned int aom_sub_pixel_variance32x##height##_avx2( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse) { \
    const int sum = aom_sub_pixel_variance32x##height##_imp_avx2( \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sse); \
    return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
  }

MAKE_SUB_PIXEL_VAR_32XH(64, 6)
MAKE_SUB_PIXEL_VAR_32XH(32, 5)
MAKE_SUB_PIXEL_VAR_32XH(16, 4)
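
// Editor's note: the 32xH wrappers above apply the usual variance identity,
// variance = SSE - sum^2 / N, with N = 32 * height = 2^(5 + log2height), so
// the division becomes a right shift by (5 + log2height). The widening to
// int64_t keeps sum * sum from overflowing for the larger blocks.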

#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
  unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
    unsigned int sse = 0; \
    int se = 0; \
    for (int i = 0; i < (w / wf); ++i) { \
      const uint8_t *src_ptr = src; \
      const uint8_t *dst_ptr = dst; \
      for (int j = 0; j < (h / hf); ++j) { \
        unsigned int sse2; \
        const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2( \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
            &sse2); \
        dst_ptr += hf * dst_stride; \
        src_ptr += hf * src_stride; \
        se += se2; \
        sse += sse2; \
      } \
      src += wf; \
      dst += wf; \
    } \
    *sse_ptr = sse; \
    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
  }

// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height.
AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7)
AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6)
AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)
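
// Editor's note: larger blocks are tiled into a (w / wf) x (h / hf) grid of
// 32-wide sub-blocks, accumulating the per-tile sum and SSE and applying the
// variance formula once at the end. The height cap noted above keeps the
// 16-bit row-sum accumulation inside the 32xH helper within range.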

#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) \
  unsigned int aom_sub_pixel_variance16x##height##_avx2( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse) { \
    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
    __m256i zero_reg; \
    int i, sum; \
    sum_reg = _mm256_setzero_si256(); \
    sse_reg = _mm256_setzero_si256(); \
    zero_reg = _mm256_setzero_si256(); \
    \
    /* x_offset = 0 and y_offset = 0 */ \
    if (x_offset == 0) { \
      if (y_offset == 0) { \
        for (i = 0; i < height; i += 2) { \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += (src_stride << 1); \
          dst += (dst_stride << 1); \
        } \
        /* x_offset = 0 and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i src_next_reg; \
        for (i = 0; i < height; i += 2) { \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
          AVG_NEXT_SRC_INSERT(src_reg, src_stride) \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += (src_stride << 1); \
          dst += (dst_stride << 1); \
        } \
        /* x_offset = 0 and y_offset = bilin interpolation */ \
      } else { \
        __m256i filter, pw8, src_next_reg; \
        y_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        for (i = 0; i < height; i += 2) { \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
          MERGE_NEXT_SRC_INSERT(src_reg, src_stride) \
          FILTER_SRC(filter) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += (src_stride << 1); \
          dst += (dst_stride << 1); \
        } \
      } \
      /* x_offset = 4 and y_offset = 0 */ \
    } else if (x_offset == 4) { \
      if (y_offset == 0) { \
        __m256i src_next_reg; \
        for (i = 0; i < height; i += 2) { \
          LOAD_SRC_NEXT_BYTE_INSERT \
          LOAD_DST_INSERT \
          /* average between current and next stride source */ \
          src_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += (src_stride << 1); \
          dst += (dst_stride << 1); \
        } \
        /* x_offset = 4 and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i src_next_reg, src_avg, src_temp; \
        /* load and insert source and next row source */ \
        LOAD_SRC_NEXT_BYTE_INSERT \
        src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
        src += src_stride << 1; \
        for (i = 0; i < height - 2; i += 2) { \
          LOAD_SRC_NEXT_BYTE_INSERT \
          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
          src_temp = _mm256_avg_epu8(src_avg, src_temp); \
          LOAD_DST_INSERT \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_temp, zero_reg) \
          /* save current source average */ \
          src_avg = src_next_reg; \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride << 1; \
          src += src_stride << 1; \
        } \
        /* last 2 rows processing happens here */ \
        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
        src_next_reg = _mm256_permute2x128_si256( \
            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
        LOAD_DST_INSERT \
        src_avg = _mm256_avg_epu8(src_avg, src_next_reg); \
        MERGE_WITH_SRC(src_avg, zero_reg) \
        CALC_SUM_SSE_INSIDE_LOOP \
      } else { \
        /* x_offset = 4 and y_offset = bilin interpolation */ \
        __m256i filter, pw8, src_next_reg, src_avg, src_temp; \
        y_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        /* load and insert source and next row source */ \
        LOAD_SRC_NEXT_BYTE_INSERT \
        src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
        src += src_stride << 1; \
        for (i = 0; i < height - 2; i += 2) { \
          LOAD_SRC_NEXT_BYTE_INSERT \
          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
          LOAD_DST_INSERT \
          MERGE_WITH_SRC(src_avg, src_temp) \
          /* save current source average */ \
          src_avg = src_next_reg; \
          FILTER_SRC(filter) \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride << 1; \
          src += src_stride << 1; \
        } \
        /* last 2 rows processing happens here */ \
        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
        src_next_reg = _mm256_permute2x128_si256( \
            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
        LOAD_DST_INSERT \
        MERGE_WITH_SRC(src_avg, src_next_reg) \
        FILTER_SRC(filter) \
        CALC_SUM_SSE_INSIDE_LOOP \
      } \
      /* x_offset = bilin interpolation and y_offset = 0 */ \
    } else { \
      if (y_offset == 0) { \
        __m256i filter, pw8, src_next_reg; \
        x_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        for (i = 0; i < height; i += 2) { \
          LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
          MERGE_NEXT_SRC_INSERT(src_reg, 1) \
          FILTER_SRC(filter) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += (src_stride << 1); \
          dst += (dst_stride << 1); \
        } \
        /* x_offset = bilin interpolation and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i filter, pw8, src_next_reg, src_pack; \
        x_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        /* load and insert source and next row source */ \
        LOAD_SRC_NEXT_BYTE_INSERT \
        MERGE_WITH_SRC(src_reg, src_next_reg) \
        FILTER_SRC(filter) \
        /* convert each 16 bit to 8 bit in each low and high lane source */ \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
        src += src_stride << 1; \
        for (i = 0; i < height - 2; i += 2) { \
          LOAD_SRC_NEXT_BYTE_INSERT \
          LOAD_DST_INSERT \
          MERGE_WITH_SRC(src_reg, src_next_reg) \
          FILTER_SRC(filter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
          /* average between previous pack and the current */ \
          src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
          MERGE_WITH_SRC(src_pack, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src_pack = src_reg; \
          src += src_stride << 1; \
          dst += dst_stride << 1; \
        } \
        /* last 2 rows processing happens here */ \
        LOAD_SRC_MERGE_128BIT(filter) \
        LOAD_DST_INSERT \
        FILTER_SRC_128BIT(filter_128bit) \
        src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
        src_next_reg = _mm256_permute2x128_si256( \
            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
        /* average between previous pack and the current */ \
        src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
        MERGE_WITH_SRC(src_pack, zero_reg) \
        CALC_SUM_SSE_INSIDE_LOOP \
      } else { \
        /* x_offset = bilin interpolation and y_offset = bilin interpolation \
         */ \
        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
        x_offset <<= 5; \
        xfilter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        y_offset <<= 5; \
        yfilter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        /* load and insert source and next row source */ \
        LOAD_SRC_NEXT_BYTE_INSERT \
        MERGE_WITH_SRC(src_reg, src_next_reg) \
        FILTER_SRC(xfilter) \
        /* convert each 16 bit to 8 bit in each low and high lane source */ \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
        src += src_stride << 1; \
        for (i = 0; i < height - 2; i += 2) { \
          LOAD_SRC_NEXT_BYTE_INSERT \
          LOAD_DST_INSERT \
          MERGE_WITH_SRC(src_reg, src_next_reg) \
          FILTER_SRC(xfilter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
          /* merge previous pack with the current */ \
          MERGE_WITH_SRC(src_pack, src_next_reg) \
          /* filter the source */ \
          FILTER_SRC(yfilter) \
          src_pack = src_reg; \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride << 1; \
          dst += dst_stride << 1; \
        } \
        /* last 2 rows processing happens here */ \
        LOAD_SRC_MERGE_128BIT(xfilter) \
        LOAD_DST_INSERT \
        FILTER_SRC_128BIT(filter_128bit) \
        src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
        src_next_reg = _mm256_permute2x128_si256( \
            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
        MERGE_WITH_SRC(src_pack, src_next_reg) \
        FILTER_SRC(yfilter) \
        CALC_SUM_SSE_INSIDE_LOOP \
      } \
    } \
    CALC_SUM_AND_SSE \
    _mm256_zeroupper(); \
    return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height)); \
  }

MAKE_SUB_PIXEL_VAR_16XH(32, 5)
MAKE_SUB_PIXEL_VAR_16XH(16, 4)
MAKE_SUB_PIXEL_VAR_16XH(8, 3)
#if !CONFIG_REALTIME_ONLY
MAKE_SUB_PIXEL_VAR_16XH(64, 6)
MAKE_SUB_PIXEL_VAR_16XH(4, 2)
#endif
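
// Editor's note: the 16-wide kernels differ from the 32-wide ones mainly in
// that each 256-bit register carries two image rows (one per 128-bit lane),
// so the loops step by two rows and the cross-lane permutes above stitch the
// "next row" of one pair to the "current row" of the following pair. The
// divisor in the return is 16 * height = 2^(4 + log2height).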

#define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \
  static int sub_pixel_avg_variance32x##height##_imp_avx2( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
      unsigned int *sse) { \
    __m256i sec_reg; \
    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
    __m256i zero_reg; \
    int i, sum; \
    sum_reg = _mm256_setzero_si256(); \
    sse_reg = _mm256_setzero_si256(); \
    zero_reg = _mm256_setzero_si256(); \
    \
    /* x_offset = 0 and y_offset = 0 */ \
    if (x_offset == 0) { \
      if (y_offset == 0) { \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
          sec += sec_stride; \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
      } else if (y_offset == 4) { \
        __m256i src_next_reg; \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, src_stride) \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
          sec += sec_stride; \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
        /* x_offset = 0 and y_offset = bilin interpolation */ \
      } else { \
        __m256i filter, pw8, src_next_reg; \
        \
        y_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, src_stride) \
          FILTER_SRC(filter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
          sec += sec_stride; \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
      } \
      /* x_offset = 4 and y_offset = 0 */ \
    } else if (x_offset == 4) { \
      if (y_offset == 0) { \
        __m256i src_next_reg; \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, 1) \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
          sec += sec_stride; \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
        /* x_offset = 4 and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i src_next_reg, src_avg; \
        /* load source and another source starting from the next */ \
        /* following byte */ \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        AVG_NEXT_SRC(src_reg, 1) \
        for (i = 0; i < height; i++) { \
          /* save current source average */ \
          src_avg = src_reg; \
          src += src_stride; \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, 1) \
          /* average between previous average and current average */ \
          src_avg = _mm256_avg_epu8(src_avg, src_reg); \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
          sec += sec_stride; \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_avg, zero_reg) \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride; \
        } \
        /* x_offset = 4 and y_offset = bilin interpolation */ \
      } else { \
        __m256i filter, pw8, src_next_reg, src_avg; \
        y_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        /* load source and another source starting from the next */ \
        /* following byte */ \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        AVG_NEXT_SRC(src_reg, 1) \
        for (i = 0; i < height; i++) { \
          /* save current source average */ \
          src_avg = src_reg; \
          src += src_stride; \
          LOAD_SRC_DST \
          AVG_NEXT_SRC(src_reg, 1) \
          MERGE_WITH_SRC(src_avg, src_reg) \
          FILTER_SRC(filter) \
          src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
          /* expand each byte to 2 bytes */ \
          MERGE_WITH_SRC(src_avg, zero_reg) \
          sec += sec_stride; \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride; \
        } \
      } \
      /* x_offset = bilin interpolation and y_offset = 0 */ \
    } else { \
      if (y_offset == 0) { \
        __m256i filter, pw8, src_next_reg; \
        x_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        for (i = 0; i < height; i++) { \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, 1) \
          FILTER_SRC(filter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
          MERGE_WITH_SRC(src_reg, zero_reg) \
          sec += sec_stride; \
          CALC_SUM_SSE_INSIDE_LOOP \
          src += src_stride; \
          dst += dst_stride; \
        } \
        /* x_offset = bilin interpolation and y_offset = 4 */ \
      } else if (y_offset == 4) { \
        __m256i filter, pw8, src_next_reg, src_pack; \
        x_offset <<= 5; \
        filter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        MERGE_NEXT_SRC(src_reg, 1) \
        FILTER_SRC(filter) \
        /* convert each 16 bit to 8 bit in each low and high lane source */ \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
        for (i = 0; i < height; i++) { \
          src += src_stride; \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, 1) \
          FILTER_SRC(filter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          /* average between previous pack and the current */ \
          src_pack = _mm256_avg_epu8(src_pack, src_reg); \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
          sec += sec_stride; \
          MERGE_WITH_SRC(src_pack, zero_reg) \
          src_pack = src_reg; \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride; \
        } \
        /* x_offset = bilin interpolation and y_offset = bilin interpolation \
         */ \
      } else { \
        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
        x_offset <<= 5; \
        xfilter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
        y_offset <<= 5; \
        yfilter = _mm256_load_si256( \
            (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
        pw8 = _mm256_set1_epi16(8); \
        /* load source and another source starting from the next */ \
        /* following byte */ \
        src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
        MERGE_NEXT_SRC(src_reg, 1) \
        \
        FILTER_SRC(xfilter) \
        /* convert each 16 bit to 8 bit in each low and high lane source */ \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
        for (i = 0; i < height; i++) { \
          src += src_stride; \
          LOAD_SRC_DST \
          MERGE_NEXT_SRC(src_reg, 1) \
          FILTER_SRC(xfilter) \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          /* merge previous pack with current pack source */ \
          MERGE_WITH_SRC(src_pack, src_reg) \
          /* filter the source */ \
          FILTER_SRC(yfilter) \
          src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
          sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
          src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
          MERGE_WITH_SRC(src_pack, zero_reg) \
          src_pack = src_reg; \
          sec += sec_stride; \
          CALC_SUM_SSE_INSIDE_LOOP \
          dst += dst_stride; \
        } \
      } \
    } \
    CALC_SUM_AND_SSE \
    _mm256_zeroupper(); \
    return sum; \
  } \
  unsigned int aom_sub_pixel_avg_variance32x##height##_avx2( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse, \
      const uint8_t *sec_ptr) { \
    const int sum = sub_pixel_avg_variance32x##height##_imp_avx2( \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \
        sse); \
    return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
  }

MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6)
MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5)
MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
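
// Editor's note: the *_avg_ variants follow the same sub-pixel paths as the
// plain variance kernels but average the filtered prediction with the second
// predictor `sec` (via _mm256_avg_epu8) before computing the differences, so
// they measure the variance of the compound (averaged) prediction against
// dst.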

#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
      const uint8_t *sec) { \
    unsigned int sse = 0; \
    int se = 0; \
    for (int i = 0; i < (w / wf); ++i) { \
      const uint8_t *src_ptr = src; \
      const uint8_t *dst_ptr = dst; \
      const uint8_t *sec_ptr = sec; \
      for (int j = 0; j < (h / hf); ++j) { \
        unsigned int sse2; \
        const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
            sec_ptr, w, &sse2); \
        dst_ptr += hf * dst_stride; \
        src_ptr += hf * src_stride; \
        sec_ptr += hf * w; \
        se += se2; \
        sse += sse2; \
      } \
      src += wf; \
      dst += wf; \
      sec += wf; \
    } \
    *sse_ptr = sse; \
    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
  }

// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height.
AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7)
AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6)
AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7)
AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6)
AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5)
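
// Editor's note: in the tiled avg wrapper the second predictor is a
// contiguous w x h block, so the helper is called with sec_stride = w and
// sec_ptr steps down by hf rows (hf * w bytes) per tile, while sec moves
// across by wf bytes per column of tiles.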