1*fb1b10abSAndroid Build Coastguard Worker /*
2*fb1b10abSAndroid Build Coastguard Worker * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker *
4*fb1b10abSAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker * that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker * tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker * in the file PATENTS. All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker * be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #include <immintrin.h> // AVX2
12*fb1b10abSAndroid Build Coastguard Worker
13*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
14*fb1b10abSAndroid Build Coastguard Worker
15*fb1b10abSAndroid Build Coastguard Worker /* clang-format off */
16*fb1b10abSAndroid Build Coastguard Worker DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
17*fb1b10abSAndroid Build Coastguard Worker 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
18*fb1b10abSAndroid Build Coastguard Worker 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
19*fb1b10abSAndroid Build Coastguard Worker 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
20*fb1b10abSAndroid Build Coastguard Worker 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
21*fb1b10abSAndroid Build Coastguard Worker 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
22*fb1b10abSAndroid Build Coastguard Worker 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
23*fb1b10abSAndroid Build Coastguard Worker 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
24*fb1b10abSAndroid Build Coastguard Worker 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
25*fb1b10abSAndroid Build Coastguard Worker 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
26*fb1b10abSAndroid Build Coastguard Worker 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
27*fb1b10abSAndroid Build Coastguard Worker 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
28*fb1b10abSAndroid Build Coastguard Worker 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
29*fb1b10abSAndroid Build Coastguard Worker 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
30*fb1b10abSAndroid Build Coastguard Worker 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
31*fb1b10abSAndroid Build Coastguard Worker 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
32*fb1b10abSAndroid Build Coastguard Worker 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
33*fb1b10abSAndroid Build Coastguard Worker };
34*fb1b10abSAndroid Build Coastguard Worker
35*fb1b10abSAndroid Build Coastguard Worker DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = {
36*fb1b10abSAndroid Build Coastguard Worker 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1,
37*fb1b10abSAndroid Build Coastguard Worker 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1
38*fb1b10abSAndroid Build Coastguard Worker };
39*fb1b10abSAndroid Build Coastguard Worker /* clang-format on */
40*fb1b10abSAndroid Build Coastguard Worker
variance_kernel_avx2(const __m256i src,const __m256i ref,__m256i * const sse,__m256i * const sum)41*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
42*fb1b10abSAndroid Build Coastguard Worker __m256i *const sse,
43*fb1b10abSAndroid Build Coastguard Worker __m256i *const sum) {
44*fb1b10abSAndroid Build Coastguard Worker const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2);
45*fb1b10abSAndroid Build Coastguard Worker
46*fb1b10abSAndroid Build Coastguard Worker // unpack into pairs of source and reference values
47*fb1b10abSAndroid Build Coastguard Worker const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
48*fb1b10abSAndroid Build Coastguard Worker const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
49*fb1b10abSAndroid Build Coastguard Worker
50*fb1b10abSAndroid Build Coastguard Worker // subtract adjacent elements using src*1 + ref*-1
51*fb1b10abSAndroid Build Coastguard Worker const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
52*fb1b10abSAndroid Build Coastguard Worker const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
53*fb1b10abSAndroid Build Coastguard Worker const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
54*fb1b10abSAndroid Build Coastguard Worker const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
55*fb1b10abSAndroid Build Coastguard Worker
56*fb1b10abSAndroid Build Coastguard Worker // add to the running totals
57*fb1b10abSAndroid Build Coastguard Worker *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
58*fb1b10abSAndroid Build Coastguard Worker *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
59*fb1b10abSAndroid Build Coastguard Worker }
60*fb1b10abSAndroid Build Coastguard Worker
variance_final_from_32bit_sum_avx2(__m256i vsse,__m128i vsum,unsigned int * const sse,int * const sum)61*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse,
62*fb1b10abSAndroid Build Coastguard Worker __m128i vsum,
63*fb1b10abSAndroid Build Coastguard Worker unsigned int *const sse,
64*fb1b10abSAndroid Build Coastguard Worker int *const sum) {
65*fb1b10abSAndroid Build Coastguard Worker // extract the low lane and add it to the high lane
66*fb1b10abSAndroid Build Coastguard Worker const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse),
67*fb1b10abSAndroid Build Coastguard Worker _mm256_extractf128_si256(vsse, 1));
68*fb1b10abSAndroid Build Coastguard Worker
69*fb1b10abSAndroid Build Coastguard Worker // unpack sse and sum registers and add
70*fb1b10abSAndroid Build Coastguard Worker const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
71*fb1b10abSAndroid Build Coastguard Worker const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
72*fb1b10abSAndroid Build Coastguard Worker const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
73*fb1b10abSAndroid Build Coastguard Worker
74*fb1b10abSAndroid Build Coastguard Worker // perform the final summation and extract the results
75*fb1b10abSAndroid Build Coastguard Worker const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
76*fb1b10abSAndroid Build Coastguard Worker *((int *)sse) = _mm_cvtsi128_si32(res);
77*fb1b10abSAndroid Build Coastguard Worker *((int *)sum) = _mm_extract_epi32(res, 1);
78*fb1b10abSAndroid Build Coastguard Worker }
79*fb1b10abSAndroid Build Coastguard Worker
variance_final_from_16bit_sum_avx2(__m256i vsse,__m256i vsum,unsigned int * const sse,int * const sum)80*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse,
81*fb1b10abSAndroid Build Coastguard Worker __m256i vsum,
82*fb1b10abSAndroid Build Coastguard Worker unsigned int *const sse,
83*fb1b10abSAndroid Build Coastguard Worker int *const sum) {
84*fb1b10abSAndroid Build Coastguard Worker // extract the low lane and add it to the high lane
85*fb1b10abSAndroid Build Coastguard Worker const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),
86*fb1b10abSAndroid Build Coastguard Worker _mm256_extractf128_si256(vsum, 1));
87*fb1b10abSAndroid Build Coastguard Worker const __m128i sum_reg_64 =
88*fb1b10abSAndroid Build Coastguard Worker _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8));
89*fb1b10abSAndroid Build Coastguard Worker const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64);
90*fb1b10abSAndroid Build Coastguard Worker
91*fb1b10abSAndroid Build Coastguard Worker variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum);
92*fb1b10abSAndroid Build Coastguard Worker }
93*fb1b10abSAndroid Build Coastguard Worker
sum_to_32bit_avx2(const __m256i sum)94*fb1b10abSAndroid Build Coastguard Worker static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
95*fb1b10abSAndroid Build Coastguard Worker const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
96*fb1b10abSAndroid Build Coastguard Worker const __m256i sum_hi =
97*fb1b10abSAndroid Build Coastguard Worker _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
98*fb1b10abSAndroid Build Coastguard Worker return _mm256_add_epi32(sum_lo, sum_hi);
99*fb1b10abSAndroid Build Coastguard Worker }
100*fb1b10abSAndroid Build Coastguard Worker
variance8_kernel_avx2(const uint8_t * const src,const int src_stride,const uint8_t * const ref,const int ref_stride,__m256i * const sse,__m256i * const sum)101*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance8_kernel_avx2(
102*fb1b10abSAndroid Build Coastguard Worker const uint8_t *const src, const int src_stride, const uint8_t *const ref,
103*fb1b10abSAndroid Build Coastguard Worker const int ref_stride, __m256i *const sse, __m256i *const sum) {
104*fb1b10abSAndroid Build Coastguard Worker __m128i src0, src1, ref0, ref1;
105*fb1b10abSAndroid Build Coastguard Worker __m256i ss, rr, diff;
106*fb1b10abSAndroid Build Coastguard Worker
107*fb1b10abSAndroid Build Coastguard Worker // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00
108*fb1b10abSAndroid Build Coastguard Worker src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
109*fb1b10abSAndroid Build Coastguard Worker
110*fb1b10abSAndroid Build Coastguard Worker // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10
111*fb1b10abSAndroid Build Coastguard Worker src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
112*fb1b10abSAndroid Build Coastguard Worker
113*fb1b10abSAndroid Build Coastguard Worker // s17 s16...s11 s10 s07 s06...s01 s00 (8bit)
114*fb1b10abSAndroid Build Coastguard Worker src0 = _mm_unpacklo_epi64(src0, src1);
115*fb1b10abSAndroid Build Coastguard Worker
116*fb1b10abSAndroid Build Coastguard Worker // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit)
117*fb1b10abSAndroid Build Coastguard Worker ss = _mm256_cvtepu8_epi16(src0);
118*fb1b10abSAndroid Build Coastguard Worker
119*fb1b10abSAndroid Build Coastguard Worker // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00
120*fb1b10abSAndroid Build Coastguard Worker ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
121*fb1b10abSAndroid Build Coastguard Worker
122*fb1b10abSAndroid Build Coastguard Worker // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10
123*fb1b10abSAndroid Build Coastguard Worker ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
124*fb1b10abSAndroid Build Coastguard Worker
125*fb1b10abSAndroid Build Coastguard Worker // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit)
126*fb1b10abSAndroid Build Coastguard Worker ref0 = _mm_unpacklo_epi64(ref0, ref1);
127*fb1b10abSAndroid Build Coastguard Worker
128*fb1b10abSAndroid Build Coastguard Worker // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit)
129*fb1b10abSAndroid Build Coastguard Worker rr = _mm256_cvtepu8_epi16(ref0);
130*fb1b10abSAndroid Build Coastguard Worker
131*fb1b10abSAndroid Build Coastguard Worker diff = _mm256_sub_epi16(ss, rr);
132*fb1b10abSAndroid Build Coastguard Worker *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff));
133*fb1b10abSAndroid Build Coastguard Worker *sum = _mm256_add_epi16(*sum, diff);
134*fb1b10abSAndroid Build Coastguard Worker }
135*fb1b10abSAndroid Build Coastguard Worker
variance16_kernel_avx2(const uint8_t * const src,const int src_stride,const uint8_t * const ref,const int ref_stride,__m256i * const sse,__m256i * const sum)136*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance16_kernel_avx2(
137*fb1b10abSAndroid Build Coastguard Worker const uint8_t *const src, const int src_stride, const uint8_t *const ref,
138*fb1b10abSAndroid Build Coastguard Worker const int ref_stride, __m256i *const sse, __m256i *const sum) {
139*fb1b10abSAndroid Build Coastguard Worker const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
140*fb1b10abSAndroid Build Coastguard Worker const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
141*fb1b10abSAndroid Build Coastguard Worker const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
142*fb1b10abSAndroid Build Coastguard Worker const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
143*fb1b10abSAndroid Build Coastguard Worker const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
144*fb1b10abSAndroid Build Coastguard Worker const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
145*fb1b10abSAndroid Build Coastguard Worker variance_kernel_avx2(s, r, sse, sum);
146*fb1b10abSAndroid Build Coastguard Worker }
147*fb1b10abSAndroid Build Coastguard Worker
variance32_kernel_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sse,__m256i * const sum)148*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance32_kernel_avx2(const uint8_t *const src,
149*fb1b10abSAndroid Build Coastguard Worker const uint8_t *const ref,
150*fb1b10abSAndroid Build Coastguard Worker __m256i *const sse,
151*fb1b10abSAndroid Build Coastguard Worker __m256i *const sum) {
152*fb1b10abSAndroid Build Coastguard Worker const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
153*fb1b10abSAndroid Build Coastguard Worker const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
154*fb1b10abSAndroid Build Coastguard Worker variance_kernel_avx2(s, r, sse, sum);
155*fb1b10abSAndroid Build Coastguard Worker }
156*fb1b10abSAndroid Build Coastguard Worker
variance8_avx2(const uint8_t * src,const int src_stride,const uint8_t * ref,const int ref_stride,const int h,__m256i * const vsse,__m256i * const vsum)157*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance8_avx2(const uint8_t *src, const int src_stride,
158*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref, const int ref_stride,
159*fb1b10abSAndroid Build Coastguard Worker const int h, __m256i *const vsse,
160*fb1b10abSAndroid Build Coastguard Worker __m256i *const vsum) {
161*fb1b10abSAndroid Build Coastguard Worker int i;
162*fb1b10abSAndroid Build Coastguard Worker *vsum = _mm256_setzero_si256();
163*fb1b10abSAndroid Build Coastguard Worker *vsse = _mm256_setzero_si256();
164*fb1b10abSAndroid Build Coastguard Worker
165*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < h; i += 2) {
166*fb1b10abSAndroid Build Coastguard Worker variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
167*fb1b10abSAndroid Build Coastguard Worker src += 2 * src_stride;
168*fb1b10abSAndroid Build Coastguard Worker ref += 2 * ref_stride;
169*fb1b10abSAndroid Build Coastguard Worker }
170*fb1b10abSAndroid Build Coastguard Worker }
171*fb1b10abSAndroid Build Coastguard Worker
variance16_avx2(const uint8_t * src,const int src_stride,const uint8_t * ref,const int ref_stride,const int h,__m256i * const vsse,__m256i * const vsum)172*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
173*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref, const int ref_stride,
174*fb1b10abSAndroid Build Coastguard Worker const int h, __m256i *const vsse,
175*fb1b10abSAndroid Build Coastguard Worker __m256i *const vsum) {
176*fb1b10abSAndroid Build Coastguard Worker int i;
177*fb1b10abSAndroid Build Coastguard Worker *vsum = _mm256_setzero_si256();
178*fb1b10abSAndroid Build Coastguard Worker *vsse = _mm256_setzero_si256();
179*fb1b10abSAndroid Build Coastguard Worker
180*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < h; i += 2) {
181*fb1b10abSAndroid Build Coastguard Worker variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
182*fb1b10abSAndroid Build Coastguard Worker src += 2 * src_stride;
183*fb1b10abSAndroid Build Coastguard Worker ref += 2 * ref_stride;
184*fb1b10abSAndroid Build Coastguard Worker }
185*fb1b10abSAndroid Build Coastguard Worker }
186*fb1b10abSAndroid Build Coastguard Worker
variance32_avx2(const uint8_t * src,const int src_stride,const uint8_t * ref,const int ref_stride,const int h,__m256i * const vsse,__m256i * const vsum)187*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
188*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref, const int ref_stride,
189*fb1b10abSAndroid Build Coastguard Worker const int h, __m256i *const vsse,
190*fb1b10abSAndroid Build Coastguard Worker __m256i *const vsum) {
191*fb1b10abSAndroid Build Coastguard Worker int i;
192*fb1b10abSAndroid Build Coastguard Worker *vsum = _mm256_setzero_si256();
193*fb1b10abSAndroid Build Coastguard Worker *vsse = _mm256_setzero_si256();
194*fb1b10abSAndroid Build Coastguard Worker
195*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < h; i++) {
196*fb1b10abSAndroid Build Coastguard Worker variance32_kernel_avx2(src, ref, vsse, vsum);
197*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
198*fb1b10abSAndroid Build Coastguard Worker ref += ref_stride;
199*fb1b10abSAndroid Build Coastguard Worker }
200*fb1b10abSAndroid Build Coastguard Worker }
201*fb1b10abSAndroid Build Coastguard Worker
variance64_avx2(const uint8_t * src,const int src_stride,const uint8_t * ref,const int ref_stride,const int h,__m256i * const vsse,__m256i * const vsum)202*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
203*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref, const int ref_stride,
204*fb1b10abSAndroid Build Coastguard Worker const int h, __m256i *const vsse,
205*fb1b10abSAndroid Build Coastguard Worker __m256i *const vsum) {
206*fb1b10abSAndroid Build Coastguard Worker int i;
207*fb1b10abSAndroid Build Coastguard Worker *vsum = _mm256_setzero_si256();
208*fb1b10abSAndroid Build Coastguard Worker
209*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < h; i++) {
210*fb1b10abSAndroid Build Coastguard Worker variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
211*fb1b10abSAndroid Build Coastguard Worker variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
212*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
213*fb1b10abSAndroid Build Coastguard Worker ref += ref_stride;
214*fb1b10abSAndroid Build Coastguard Worker }
215*fb1b10abSAndroid Build Coastguard Worker }
216*fb1b10abSAndroid Build Coastguard Worker
// Computes, for a 16x16 block, the sum of squared differences and the sum of
// differences between src_ptr and ref_ptr.
//   src_stride, ref_stride: byte distance between consecutive rows.
//   sse: out, sum of squared differences.
//   sum: out, sum of (src - ref) differences.
void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  __m256i sse_acc;
  __m256i sum_acc;

  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &sse_acc,
                  &sum_acc);
  variance_final_from_16bit_sum_avx2(sse_acc, sum_acc, sse, sum);
}
224*fb1b10abSAndroid Build Coastguard Worker
// Applies a two-tap byte filter in place to the enclosing scope's exp_src_lo
// and exp_src_hi vectors, then rounds and scales: (taps + pw8) >> 4.
// Requires exp_src_lo, exp_src_hi and pw8 to be declared by the user of the
// macro; pw8 is expected to hold the rounding term 8 in every 16-bit lane.
// Expands to several statements and supplies its own trailing semicolon.
#define FILTER_SRC(filter)                               \
  /* low half: filter, add 8, divide by 16 */            \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
                                                         \
  /* high half: filter, add 8, divide by 16 */           \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
237*fb1b10abSAndroid Build Coastguard Worker
// Per-row accumulation step shared by the spv32_* helpers.
// Uses names from the enclosing scope: dst_reg (32 destination bytes),
// zero_reg, exp_src_lo / exp_src_hi (16-bit source values, overwritten),
// exp_dst_lo / exp_dst_hi (scratch), and the accumulator pointers sum_reg
// (16-bit lanes) and sse_reg (32-bit lanes).
// Expands to several statements and supplies its own trailing semicolon.
#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* low half: widen dst to 16 bits, then src - dst */    \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  /* high half: widen dst to 16 bits, then src - dst */   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo);      \
  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi);      \
  /* square the differences, pairwise into 32 bits */     \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo);      \
  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi);
253*fb1b10abSAndroid Build Coastguard Worker
// Final horizontal reduction of the vector accumulators to scalars.
// Uses names from the enclosing scope: sum_reg (16-bit lanes) and sse_reg
// (32-bit lanes) as vector values, zero_reg, the scratch vectors res_cmp,
// sse_reg_hi, sum_reg_lo and sum_reg_hi, the output pointer sse and the
// integer variable sum. Byte shifts act within each 128-bit lane, so the two
// lane totals are added together as scalars at the end.
// Expands to several statements and supplies its own trailing semicolon.
#define CALC_SUM_AND_SSE                                                   \
  /* sign mask of each 16-bit sum, used to sign-extend it to 32 bits */    \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
                                                                           \
  /* sse: fold 4 -> 2 -> 1 dwords per lane, then add the two lanes */      \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
                                                                           \
  /* sum: widen to 32 bits, fold per lane, then add the two lanes */       \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
274*fb1b10abSAndroid Build Coastguard Worker
spv32_x0_y0(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg)275*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride,
276*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
277*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
278*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
279*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg) {
280*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
281*fb1b10abSAndroid Build Coastguard Worker __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
282*fb1b10abSAndroid Build Coastguard Worker int i;
283*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
284*fb1b10abSAndroid Build Coastguard Worker const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
285*fb1b10abSAndroid Build Coastguard Worker const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src);
286*fb1b10abSAndroid Build Coastguard Worker if (do_sec) {
287*fb1b10abSAndroid Build Coastguard Worker const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
288*fb1b10abSAndroid Build Coastguard Worker const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg);
289*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
290*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
291*fb1b10abSAndroid Build Coastguard Worker second_pred += second_stride;
292*fb1b10abSAndroid Build Coastguard Worker } else {
293*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
294*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
295*fb1b10abSAndroid Build Coastguard Worker }
296*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_SSE_INSIDE_LOOP
297*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
298*fb1b10abSAndroid Build Coastguard Worker dst += dst_stride;
299*fb1b10abSAndroid Build Coastguard Worker }
300*fb1b10abSAndroid Build Coastguard Worker }
301*fb1b10abSAndroid Build Coastguard Worker
302*fb1b10abSAndroid Build Coastguard Worker // (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction.
spv32_half_zero(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg,int sstep)303*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_half_zero(const uint8_t *src, int src_stride,
304*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
305*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred,
306*fb1b10abSAndroid Build Coastguard Worker int second_stride, int do_sec, int height,
307*fb1b10abSAndroid Build Coastguard Worker __m256i *sum_reg, __m256i *sse_reg,
308*fb1b10abSAndroid Build Coastguard Worker int sstep) {
309*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
310*fb1b10abSAndroid Build Coastguard Worker __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
311*fb1b10abSAndroid Build Coastguard Worker int i;
312*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
313*fb1b10abSAndroid Build Coastguard Worker const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
314*fb1b10abSAndroid Build Coastguard Worker const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
315*fb1b10abSAndroid Build Coastguard Worker const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
316*fb1b10abSAndroid Build Coastguard Worker const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
317*fb1b10abSAndroid Build Coastguard Worker if (do_sec) {
318*fb1b10abSAndroid Build Coastguard Worker const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
319*fb1b10abSAndroid Build Coastguard Worker const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg);
320*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
321*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
322*fb1b10abSAndroid Build Coastguard Worker second_pred += second_stride;
323*fb1b10abSAndroid Build Coastguard Worker } else {
324*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
325*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
326*fb1b10abSAndroid Build Coastguard Worker }
327*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_SSE_INSIDE_LOOP
328*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
329*fb1b10abSAndroid Build Coastguard Worker dst += dst_stride;
330*fb1b10abSAndroid Build Coastguard Worker }
331*fb1b10abSAndroid Build Coastguard Worker }
332*fb1b10abSAndroid Build Coastguard Worker
spv32_x0_y4(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg)333*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride,
334*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
335*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
336*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
337*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg) {
338*fb1b10abSAndroid Build Coastguard Worker spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
339*fb1b10abSAndroid Build Coastguard Worker do_sec, height, sum_reg, sse_reg, src_stride);
340*fb1b10abSAndroid Build Coastguard Worker }
341*fb1b10abSAndroid Build Coastguard Worker
spv32_x4_y0(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg)342*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride,
343*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
344*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
345*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
346*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg) {
347*fb1b10abSAndroid Build Coastguard Worker spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
348*fb1b10abSAndroid Build Coastguard Worker do_sec, height, sum_reg, sse_reg, 1);
349*fb1b10abSAndroid Build Coastguard Worker }
350*fb1b10abSAndroid Build Coastguard Worker
spv32_x4_y4(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg)351*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride,
352*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
353*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
354*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
355*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg) {
356*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
357*fb1b10abSAndroid Build Coastguard Worker const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
358*fb1b10abSAndroid Build Coastguard Worker const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
359*fb1b10abSAndroid Build Coastguard Worker __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
360*fb1b10abSAndroid Build Coastguard Worker __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
361*fb1b10abSAndroid Build Coastguard Worker int i;
362*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
363*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
364*fb1b10abSAndroid Build Coastguard Worker const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
365*fb1b10abSAndroid Build Coastguard Worker const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src));
366*fb1b10abSAndroid Build Coastguard Worker const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
367*fb1b10abSAndroid Build Coastguard Worker const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
368*fb1b10abSAndroid Build Coastguard Worker const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg);
369*fb1b10abSAndroid Build Coastguard Worker prev_src_avg = src_avg;
370*fb1b10abSAndroid Build Coastguard Worker
371*fb1b10abSAndroid Build Coastguard Worker if (do_sec) {
372*fb1b10abSAndroid Build Coastguard Worker const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
373*fb1b10abSAndroid Build Coastguard Worker const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg);
374*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
375*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
376*fb1b10abSAndroid Build Coastguard Worker second_pred += second_stride;
377*fb1b10abSAndroid Build Coastguard Worker } else {
378*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg);
379*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg);
380*fb1b10abSAndroid Build Coastguard Worker }
381*fb1b10abSAndroid Build Coastguard Worker // save current source average
382*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_SSE_INSIDE_LOOP
383*fb1b10abSAndroid Build Coastguard Worker dst += dst_stride;
384*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
385*fb1b10abSAndroid Build Coastguard Worker }
386*fb1b10abSAndroid Build Coastguard Worker }
387*fb1b10abSAndroid Build Coastguard Worker
388*fb1b10abSAndroid Build Coastguard Worker // (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction.
spv32_bilin_zero(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg,int offset,int sstep)389*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride,
390*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
391*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred,
392*fb1b10abSAndroid Build Coastguard Worker int second_stride, int do_sec, int height,
393*fb1b10abSAndroid Build Coastguard Worker __m256i *sum_reg, __m256i *sse_reg,
394*fb1b10abSAndroid Build Coastguard Worker int offset, int sstep) {
395*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
396*fb1b10abSAndroid Build Coastguard Worker const __m256i pw8 = _mm256_set1_epi16(8);
397*fb1b10abSAndroid Build Coastguard Worker const __m256i filter = _mm256_load_si256(
398*fb1b10abSAndroid Build Coastguard Worker (__m256i const *)(bilinear_filters_avx2 + (offset << 5)));
399*fb1b10abSAndroid Build Coastguard Worker __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
400*fb1b10abSAndroid Build Coastguard Worker int i;
401*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
402*fb1b10abSAndroid Build Coastguard Worker const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
403*fb1b10abSAndroid Build Coastguard Worker const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
404*fb1b10abSAndroid Build Coastguard Worker const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
405*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
406*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
407*fb1b10abSAndroid Build Coastguard Worker
408*fb1b10abSAndroid Build Coastguard Worker FILTER_SRC(filter)
409*fb1b10abSAndroid Build Coastguard Worker if (do_sec) {
410*fb1b10abSAndroid Build Coastguard Worker const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
411*fb1b10abSAndroid Build Coastguard Worker const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
412*fb1b10abSAndroid Build Coastguard Worker const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
413*fb1b10abSAndroid Build Coastguard Worker second_pred += second_stride;
414*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
415*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
416*fb1b10abSAndroid Build Coastguard Worker }
417*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_SSE_INSIDE_LOOP
418*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
419*fb1b10abSAndroid Build Coastguard Worker dst += dst_stride;
420*fb1b10abSAndroid Build Coastguard Worker }
421*fb1b10abSAndroid Build Coastguard Worker }
422*fb1b10abSAndroid Build Coastguard Worker
spv32_x0_yb(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg,int y_offset)423*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride,
424*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
425*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
426*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
427*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg, int y_offset) {
428*fb1b10abSAndroid Build Coastguard Worker spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
429*fb1b10abSAndroid Build Coastguard Worker do_sec, height, sum_reg, sse_reg, y_offset, src_stride);
430*fb1b10abSAndroid Build Coastguard Worker }
431*fb1b10abSAndroid Build Coastguard Worker
spv32_xb_y0(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg,int x_offset)432*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride,
433*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
434*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
435*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
436*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg, int x_offset) {
437*fb1b10abSAndroid Build Coastguard Worker spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
438*fb1b10abSAndroid Build Coastguard Worker do_sec, height, sum_reg, sse_reg, x_offset, 1);
439*fb1b10abSAndroid Build Coastguard Worker }
440*fb1b10abSAndroid Build Coastguard Worker
spv32_x4_yb(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg,int y_offset)441*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride,
442*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
443*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
444*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
445*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg, int y_offset) {
446*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
447*fb1b10abSAndroid Build Coastguard Worker const __m256i pw8 = _mm256_set1_epi16(8);
448*fb1b10abSAndroid Build Coastguard Worker const __m256i filter = _mm256_load_si256(
449*fb1b10abSAndroid Build Coastguard Worker (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
450*fb1b10abSAndroid Build Coastguard Worker const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
451*fb1b10abSAndroid Build Coastguard Worker const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
452*fb1b10abSAndroid Build Coastguard Worker __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
453*fb1b10abSAndroid Build Coastguard Worker __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
454*fb1b10abSAndroid Build Coastguard Worker int i;
455*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
456*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
457*fb1b10abSAndroid Build Coastguard Worker const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
458*fb1b10abSAndroid Build Coastguard Worker const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
459*fb1b10abSAndroid Build Coastguard Worker const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
460*fb1b10abSAndroid Build Coastguard Worker const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
461*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg);
462*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg);
463*fb1b10abSAndroid Build Coastguard Worker prev_src_avg = src_avg;
464*fb1b10abSAndroid Build Coastguard Worker
465*fb1b10abSAndroid Build Coastguard Worker FILTER_SRC(filter)
466*fb1b10abSAndroid Build Coastguard Worker if (do_sec) {
467*fb1b10abSAndroid Build Coastguard Worker const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
468*fb1b10abSAndroid Build Coastguard Worker const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
469*fb1b10abSAndroid Build Coastguard Worker const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg);
470*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
471*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
472*fb1b10abSAndroid Build Coastguard Worker second_pred += second_stride;
473*fb1b10abSAndroid Build Coastguard Worker }
474*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_SSE_INSIDE_LOOP
475*fb1b10abSAndroid Build Coastguard Worker dst += dst_stride;
476*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
477*fb1b10abSAndroid Build Coastguard Worker }
478*fb1b10abSAndroid Build Coastguard Worker }
479*fb1b10abSAndroid Build Coastguard Worker
spv32_xb_y4(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg,int x_offset)480*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride,
481*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
482*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
483*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
484*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg, int x_offset) {
485*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
486*fb1b10abSAndroid Build Coastguard Worker const __m256i pw8 = _mm256_set1_epi16(8);
487*fb1b10abSAndroid Build Coastguard Worker const __m256i filter = _mm256_load_si256(
488*fb1b10abSAndroid Build Coastguard Worker (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
489*fb1b10abSAndroid Build Coastguard Worker const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
490*fb1b10abSAndroid Build Coastguard Worker const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
491*fb1b10abSAndroid Build Coastguard Worker __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
492*fb1b10abSAndroid Build Coastguard Worker __m256i src_reg, src_pack;
493*fb1b10abSAndroid Build Coastguard Worker int i;
494*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
495*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
496*fb1b10abSAndroid Build Coastguard Worker FILTER_SRC(filter)
497*fb1b10abSAndroid Build Coastguard Worker // convert each 16 bit to 8 bit to each low and high lane source
498*fb1b10abSAndroid Build Coastguard Worker src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
499*fb1b10abSAndroid Build Coastguard Worker
500*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
501*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
502*fb1b10abSAndroid Build Coastguard Worker const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
503*fb1b10abSAndroid Build Coastguard Worker const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
504*fb1b10abSAndroid Build Coastguard Worker const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
505*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
506*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
507*fb1b10abSAndroid Build Coastguard Worker
508*fb1b10abSAndroid Build Coastguard Worker FILTER_SRC(filter)
509*fb1b10abSAndroid Build Coastguard Worker
510*fb1b10abSAndroid Build Coastguard Worker src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
511*fb1b10abSAndroid Build Coastguard Worker // average between previous pack to the current
512*fb1b10abSAndroid Build Coastguard Worker src_pack = _mm256_avg_epu8(src_pack, src_reg);
513*fb1b10abSAndroid Build Coastguard Worker
514*fb1b10abSAndroid Build Coastguard Worker if (do_sec) {
515*fb1b10abSAndroid Build Coastguard Worker const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
516*fb1b10abSAndroid Build Coastguard Worker const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg);
517*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg);
518*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg);
519*fb1b10abSAndroid Build Coastguard Worker second_pred += second_stride;
520*fb1b10abSAndroid Build Coastguard Worker } else {
521*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
522*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
523*fb1b10abSAndroid Build Coastguard Worker }
524*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_SSE_INSIDE_LOOP
525*fb1b10abSAndroid Build Coastguard Worker src_pack = src_reg;
526*fb1b10abSAndroid Build Coastguard Worker dst += dst_stride;
527*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
528*fb1b10abSAndroid Build Coastguard Worker }
529*fb1b10abSAndroid Build Coastguard Worker }
530*fb1b10abSAndroid Build Coastguard Worker
spv32_xb_yb(const uint8_t * src,int src_stride,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,__m256i * sum_reg,__m256i * sse_reg,int x_offset,int y_offset)531*fb1b10abSAndroid Build Coastguard Worker static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride,
532*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
533*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
534*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, __m256i *sum_reg,
535*fb1b10abSAndroid Build Coastguard Worker __m256i *sse_reg, int x_offset, int y_offset) {
536*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
537*fb1b10abSAndroid Build Coastguard Worker const __m256i pw8 = _mm256_set1_epi16(8);
538*fb1b10abSAndroid Build Coastguard Worker const __m256i xfilter = _mm256_load_si256(
539*fb1b10abSAndroid Build Coastguard Worker (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
540*fb1b10abSAndroid Build Coastguard Worker const __m256i yfilter = _mm256_load_si256(
541*fb1b10abSAndroid Build Coastguard Worker (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
542*fb1b10abSAndroid Build Coastguard Worker const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
543*fb1b10abSAndroid Build Coastguard Worker const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
544*fb1b10abSAndroid Build Coastguard Worker __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
545*fb1b10abSAndroid Build Coastguard Worker __m256i prev_src_pack, src_pack;
546*fb1b10abSAndroid Build Coastguard Worker int i;
547*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
548*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
549*fb1b10abSAndroid Build Coastguard Worker FILTER_SRC(xfilter)
550*fb1b10abSAndroid Build Coastguard Worker // convert each 16 bit to 8 bit to each low and high lane source
551*fb1b10abSAndroid Build Coastguard Worker prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
552*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
553*fb1b10abSAndroid Build Coastguard Worker
554*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < height; i++) {
555*fb1b10abSAndroid Build Coastguard Worker const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
556*fb1b10abSAndroid Build Coastguard Worker const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
557*fb1b10abSAndroid Build Coastguard Worker const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
558*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
559*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
560*fb1b10abSAndroid Build Coastguard Worker
561*fb1b10abSAndroid Build Coastguard Worker FILTER_SRC(xfilter)
562*fb1b10abSAndroid Build Coastguard Worker src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
563*fb1b10abSAndroid Build Coastguard Worker
564*fb1b10abSAndroid Build Coastguard Worker // merge previous pack to current pack source
565*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack);
566*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack);
567*fb1b10abSAndroid Build Coastguard Worker
568*fb1b10abSAndroid Build Coastguard Worker FILTER_SRC(yfilter)
569*fb1b10abSAndroid Build Coastguard Worker if (do_sec) {
570*fb1b10abSAndroid Build Coastguard Worker const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
571*fb1b10abSAndroid Build Coastguard Worker const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
572*fb1b10abSAndroid Build Coastguard Worker const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
573*fb1b10abSAndroid Build Coastguard Worker exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
574*fb1b10abSAndroid Build Coastguard Worker exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
575*fb1b10abSAndroid Build Coastguard Worker second_pred += second_stride;
576*fb1b10abSAndroid Build Coastguard Worker }
577*fb1b10abSAndroid Build Coastguard Worker
578*fb1b10abSAndroid Build Coastguard Worker prev_src_pack = src_pack;
579*fb1b10abSAndroid Build Coastguard Worker
580*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_SSE_INSIDE_LOOP
581*fb1b10abSAndroid Build Coastguard Worker dst += dst_stride;
582*fb1b10abSAndroid Build Coastguard Worker src += src_stride;
583*fb1b10abSAndroid Build Coastguard Worker }
584*fb1b10abSAndroid Build Coastguard Worker }
585*fb1b10abSAndroid Build Coastguard Worker
sub_pix_var32xh(const uint8_t * src,int src_stride,int x_offset,int y_offset,const uint8_t * dst,int dst_stride,const uint8_t * second_pred,int second_stride,int do_sec,int height,unsigned int * sse)586*fb1b10abSAndroid Build Coastguard Worker static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
587*fb1b10abSAndroid Build Coastguard Worker int x_offset, int y_offset,
588*fb1b10abSAndroid Build Coastguard Worker const uint8_t *dst, int dst_stride,
589*fb1b10abSAndroid Build Coastguard Worker const uint8_t *second_pred, int second_stride,
590*fb1b10abSAndroid Build Coastguard Worker int do_sec, int height, unsigned int *sse) {
591*fb1b10abSAndroid Build Coastguard Worker const __m256i zero_reg = _mm256_setzero_si256();
592*fb1b10abSAndroid Build Coastguard Worker __m256i sum_reg = _mm256_setzero_si256();
593*fb1b10abSAndroid Build Coastguard Worker __m256i sse_reg = _mm256_setzero_si256();
594*fb1b10abSAndroid Build Coastguard Worker __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
595*fb1b10abSAndroid Build Coastguard Worker int sum;
596*fb1b10abSAndroid Build Coastguard Worker // x_offset = 0 and y_offset = 0
597*fb1b10abSAndroid Build Coastguard Worker if (x_offset == 0) {
598*fb1b10abSAndroid Build Coastguard Worker if (y_offset == 0) {
599*fb1b10abSAndroid Build Coastguard Worker spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
600*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg);
601*fb1b10abSAndroid Build Coastguard Worker // x_offset = 0 and y_offset = 4
602*fb1b10abSAndroid Build Coastguard Worker } else if (y_offset == 4) {
603*fb1b10abSAndroid Build Coastguard Worker spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
604*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg);
605*fb1b10abSAndroid Build Coastguard Worker // x_offset = 0 and y_offset = bilin interpolation
606*fb1b10abSAndroid Build Coastguard Worker } else {
607*fb1b10abSAndroid Build Coastguard Worker spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
608*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg, y_offset);
609*fb1b10abSAndroid Build Coastguard Worker }
610*fb1b10abSAndroid Build Coastguard Worker // x_offset = 4 and y_offset = 0
611*fb1b10abSAndroid Build Coastguard Worker } else if (x_offset == 4) {
612*fb1b10abSAndroid Build Coastguard Worker if (y_offset == 0) {
613*fb1b10abSAndroid Build Coastguard Worker spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
614*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg);
615*fb1b10abSAndroid Build Coastguard Worker // x_offset = 4 and y_offset = 4
616*fb1b10abSAndroid Build Coastguard Worker } else if (y_offset == 4) {
617*fb1b10abSAndroid Build Coastguard Worker spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
618*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg);
619*fb1b10abSAndroid Build Coastguard Worker // x_offset = 4 and y_offset = bilin interpolation
620*fb1b10abSAndroid Build Coastguard Worker } else {
621*fb1b10abSAndroid Build Coastguard Worker spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
622*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg, y_offset);
623*fb1b10abSAndroid Build Coastguard Worker }
624*fb1b10abSAndroid Build Coastguard Worker // x_offset = bilin interpolation and y_offset = 0
625*fb1b10abSAndroid Build Coastguard Worker } else {
626*fb1b10abSAndroid Build Coastguard Worker if (y_offset == 0) {
627*fb1b10abSAndroid Build Coastguard Worker spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
628*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg, x_offset);
629*fb1b10abSAndroid Build Coastguard Worker // x_offset = bilin interpolation and y_offset = 4
630*fb1b10abSAndroid Build Coastguard Worker } else if (y_offset == 4) {
631*fb1b10abSAndroid Build Coastguard Worker spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
632*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg, x_offset);
633*fb1b10abSAndroid Build Coastguard Worker // x_offset = bilin interpolation and y_offset = bilin interpolation
634*fb1b10abSAndroid Build Coastguard Worker } else {
635*fb1b10abSAndroid Build Coastguard Worker spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
636*fb1b10abSAndroid Build Coastguard Worker do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset);
637*fb1b10abSAndroid Build Coastguard Worker }
638*fb1b10abSAndroid Build Coastguard Worker }
639*fb1b10abSAndroid Build Coastguard Worker CALC_SUM_AND_SSE
640*fb1b10abSAndroid Build Coastguard Worker return sum;
641*fb1b10abSAndroid Build Coastguard Worker }
642*fb1b10abSAndroid Build Coastguard Worker
// 32-wide sub-pixel variance without a second predictor: calls
// sub_pix_var32xh with do_sec == 0, a NULL second_pred and a zero
// second_stride, and returns its result unchanged.
static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                       int x_offset, int y_offset,
                                       const uint8_t *dst, int dst_stride,
                                       int height, unsigned int *sse) {
  const uint8_t *const no_second_pred = NULL;
  const int no_second_stride = 0;
  const int do_sec = 0;

  return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
                         no_second_pred, no_second_stride, do_sec, height,
                         sse);
}
650*fb1b10abSAndroid Build Coastguard Worker
// 32-wide sub-pixel variance with a second predictor: calls sub_pix_var32xh
// with do_sec == 1 so the kernels average the interpolated source with
// second_pred, and returns its result unchanged.
static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
                                           int x_offset, int y_offset,
                                           const uint8_t *dst, int dst_stride,
                                           const uint8_t *second_pred,
                                           int second_stride, int height,
                                           unsigned int *sse) {
  const int do_sec = 1;

  return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
                         second_pred, second_stride, do_sec, height, sse);
}
660*fb1b10abSAndroid Build Coastguard Worker
// Pointer to a routine that receives a source buffer and a reference buffer,
// each with its stride, plus two non-const pointers, sse and sum (presumably
// outputs; confirm against the functions assigned to this type).
typedef void (*get_var_avx2)(const uint8_t *src_ptr,
                             int src_stride,
                             const uint8_t *ref_ptr,
                             int ref_stride,
                             unsigned int *sse,
                             int *sum);
664*fb1b10abSAndroid Build Coastguard Worker
// 8x4 variance. variance8_avx2 is run over 4 rows and
// variance_final_from_16bit_sum_avx2 reduces its vector results; it is
// expected to store the SSE in *sse and the difference sum in `sum`.
// Returns *sse - sum^2 / 32.
unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  int sq_sum_over_n;
  __m256i vsum, vsse;

  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
  // Shift by 5 divides by 32, the pixel count of an 8x4 block.
  sq_sum_over_n = (sum * sum) >> 5;
  return *sse - sq_sum_over_n;
}
674*fb1b10abSAndroid Build Coastguard Worker
// 8x8 variance. variance8_avx2 is run over 8 rows and
// variance_final_from_16bit_sum_avx2 reduces its vector results; it is
// expected to store the SSE in *sse and the difference sum in `sum`.
// Returns *sse - sum^2 / 64.
unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  int sq_sum_over_n;
  __m256i vsum, vsse;

  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
  // Shift by 6 divides by 64, the pixel count of an 8x8 block.
  sq_sum_over_n = (sum * sum) >> 6;
  return *sse - sq_sum_over_n;
}
684*fb1b10abSAndroid Build Coastguard Worker
// 8x16 variance. variance8_avx2 is run over 16 rows and
// variance_final_from_16bit_sum_avx2 reduces its vector results; it is
// expected to store the SSE in *sse and the difference sum in `sum`.
// Returns *sse - sum^2 / 128.
unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  int sq_sum_over_n;
  __m256i vsum, vsse;

  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
  // Shift by 7 divides by 128, the pixel count of an 8x16 block.
  sq_sum_over_n = (sum * sum) >> 7;
  return *sse - sq_sum_over_n;
}
694*fb1b10abSAndroid Build Coastguard Worker
// 16x8 variance. variance16_avx2 is run over 8 rows and
// variance_final_from_16bit_sum_avx2 reduces its vector results; it is
// expected to store the SSE in *sse and the difference sum in `sum`.
// Returns *sse - sum^2 / 128, with the square formed in 64 bits.
unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  __m256i vsum, vsse;
  int sum;
  uint32_t sq_sum_over_n;

  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
  // Shift by 7 divides by 128, the pixel count of a 16x8 block.
  sq_sum_over_n = (uint32_t)(((int64_t)sum * sum) >> 7);
  return *sse - sq_sum_over_n;
}
704*fb1b10abSAndroid Build Coastguard Worker
// 16x16 variance. variance16_avx2 is run over 16 rows and
// variance_final_from_16bit_sum_avx2 reduces its vector results; it is
// expected to store the SSE in *sse and the difference sum in `sum`.
// Returns *sse - sum^2 / 256, with the square formed in 64 bits.
unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m256i vsum, vsse;
  int sum;
  uint32_t sq_sum_over_n;

  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
  // Shift by 8 divides by 256, the pixel count of a 16x16 block.
  sq_sum_over_n = (uint32_t)(((int64_t)sum * sum) >> 8);
  return *sse - sq_sum_over_n;
}
714*fb1b10abSAndroid Build Coastguard Worker
// 16x32 variance. variance16_avx2 is run over 32 rows and
// variance_final_from_16bit_sum_avx2 reduces its vector results; it is
// expected to store the SSE in *sse and the difference sum in `sum`.
// Returns *sse - sum^2 / 512, with the square formed in 64 bits.
unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m256i vsum, vsse;
  int sum;
  uint32_t sq_sum_over_n;

  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
  // Shift by 9 divides by 512, the pixel count of a 16x32 block.
  sq_sum_over_n = (uint32_t)(((int64_t)sum * sum) >> 9);
  return *sse - sq_sum_over_n;
}
724*fb1b10abSAndroid Build Coastguard Worker
// 32x16 variance entry point.
//
// variance32_avx2() is called with 16 as its size argument and fills the
// vector accumulators; variance_final_from_16bit_sum_avx2() reduces them into
// *sse and a scalar sum. The return value is *sse - ((sum * sum) >> 9); the
// shift by 9 divides by 512, i.e. 32 * 16 as in the function name.
unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m256i sse_acc, sum_acc;
  int total;
  int64_t total_sq;

  variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &sse_acc,
                  &sum_acc);
  variance_final_from_16bit_sum_avx2(sse_acc, sum_acc, sse, &total);

  // Square in 64 bits so the product cannot overflow a 32-bit int.
  total_sq = (int64_t)total * total;
  return *sse - (uint32_t)(total_sq >> 9);
}
734*fb1b10abSAndroid Build Coastguard Worker
// 32x32 variance entry point.
//
// variance32_avx2() fills the vector accumulators. The 16-bit lane sums are
// then reduced by hand to four 32-bit lanes before being passed to
// variance_final_from_32bit_sum_avx2(), which produces *sse and a scalar sum.
// The return value is *sse - ((sum * sum) >> 10); the shift by 10 divides by
// 1024, i.e. 32 * 32 as in the function name.
unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m256i sse_acc, sum16;
  __m128i part_a, part_b, sum16_128, sum32_128;
  int total;
  int64_t total_sq;

  variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &sse_acc,
                  &sum16);

  // Fold the upper 128-bit half onto the lower half, still as 16-bit lanes.
  part_a = _mm256_castsi256_si128(sum16);
  part_b = _mm256_extractf128_si256(sum16, 1);
  sum16_128 = _mm_add_epi16(part_a, part_b);

  // Sign-extend the low four and the high four 16-bit lanes to 32 bits and
  // add them, leaving four 32-bit partial sums.
  part_a = _mm_cvtepi16_epi32(sum16_128);
  part_b = _mm_cvtepi16_epi32(_mm_srli_si128(sum16_128, 8));
  sum32_128 = _mm_add_epi32(part_a, part_b);

  variance_final_from_32bit_sum_avx2(sse_acc, sum32_128, sse, &total);

  // Square in 64 bits so the product cannot overflow a 32-bit int.
  total_sq = (int64_t)total * total;
  return *sse - (uint32_t)(total_sq >> 10);
}
749*fb1b10abSAndroid Build Coastguard Worker
vpx_variance32x64_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,unsigned int * sse)750*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride,
751*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref_ptr, int ref_stride,
752*fb1b10abSAndroid Build Coastguard Worker unsigned int *sse) {
753*fb1b10abSAndroid Build Coastguard Worker int sum;
754*fb1b10abSAndroid Build Coastguard Worker __m256i vsse, vsum;
755*fb1b10abSAndroid Build Coastguard Worker __m128i vsum_128;
756*fb1b10abSAndroid Build Coastguard Worker variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum);
757*fb1b10abSAndroid Build Coastguard Worker vsum = sum_to_32bit_avx2(vsum);
758*fb1b10abSAndroid Build Coastguard Worker vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
759*fb1b10abSAndroid Build Coastguard Worker _mm256_extractf128_si256(vsum, 1));
760*fb1b10abSAndroid Build Coastguard Worker variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
761*fb1b10abSAndroid Build Coastguard Worker return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
762*fb1b10abSAndroid Build Coastguard Worker }
763*fb1b10abSAndroid Build Coastguard Worker
vpx_variance64x32_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,unsigned int * sse)764*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride,
765*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref_ptr, int ref_stride,
766*fb1b10abSAndroid Build Coastguard Worker unsigned int *sse) {
767*fb1b10abSAndroid Build Coastguard Worker __m256i vsse = _mm256_setzero_si256();
768*fb1b10abSAndroid Build Coastguard Worker __m256i vsum = _mm256_setzero_si256();
769*fb1b10abSAndroid Build Coastguard Worker __m128i vsum_128;
770*fb1b10abSAndroid Build Coastguard Worker int sum;
771*fb1b10abSAndroid Build Coastguard Worker variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
772*fb1b10abSAndroid Build Coastguard Worker vsum = sum_to_32bit_avx2(vsum);
773*fb1b10abSAndroid Build Coastguard Worker vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
774*fb1b10abSAndroid Build Coastguard Worker _mm256_extractf128_si256(vsum, 1));
775*fb1b10abSAndroid Build Coastguard Worker variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
776*fb1b10abSAndroid Build Coastguard Worker return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
777*fb1b10abSAndroid Build Coastguard Worker }
778*fb1b10abSAndroid Build Coastguard Worker
vpx_variance64x64_avx2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,unsigned int * sse)779*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride,
780*fb1b10abSAndroid Build Coastguard Worker const uint8_t *ref_ptr, int ref_stride,
781*fb1b10abSAndroid Build Coastguard Worker unsigned int *sse) {
782*fb1b10abSAndroid Build Coastguard Worker __m256i vsse = _mm256_setzero_si256();
783*fb1b10abSAndroid Build Coastguard Worker __m256i vsum = _mm256_setzero_si256();
784*fb1b10abSAndroid Build Coastguard Worker __m128i vsum_128;
785*fb1b10abSAndroid Build Coastguard Worker int sum;
786*fb1b10abSAndroid Build Coastguard Worker int i = 0;
787*fb1b10abSAndroid Build Coastguard Worker
788*fb1b10abSAndroid Build Coastguard Worker for (i = 0; i < 2; i++) {
789*fb1b10abSAndroid Build Coastguard Worker __m256i vsum16;
790*fb1b10abSAndroid Build Coastguard Worker variance64_avx2(src_ptr + 32 * i * src_stride, src_stride,
791*fb1b10abSAndroid Build Coastguard Worker ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
792*fb1b10abSAndroid Build Coastguard Worker &vsum16);
793*fb1b10abSAndroid Build Coastguard Worker vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));
794*fb1b10abSAndroid Build Coastguard Worker }
795*fb1b10abSAndroid Build Coastguard Worker vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
796*fb1b10abSAndroid Build Coastguard Worker _mm256_extractf128_si256(vsum, 1));
797*fb1b10abSAndroid Build Coastguard Worker variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
798*fb1b10abSAndroid Build Coastguard Worker return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
799*fb1b10abSAndroid Build Coastguard Worker }
800*fb1b10abSAndroid Build Coastguard Worker
// 16x8 MSE entry point.
//
// Runs the same helper pair as the 16-wide variance functions
// (variance16_avx2() with 8 as its size argument, then
// variance_final_from_16bit_sum_avx2()), but returns the value stored in *sse
// unchanged. The scalar sum the reduction also produces is not used.
unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
  __m256i sse_acc, sum_acc;
  int unused_sum;

  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &sse_acc,
                  &sum_acc);
  variance_final_from_16bit_sum_avx2(sse_acc, sum_acc, sse, &unused_sum);

  return *sse;
}
810*fb1b10abSAndroid Build Coastguard Worker
// 16x16 MSE entry point.
//
// Runs the same helper pair as the 16-wide variance functions
// (variance16_avx2() with 16 as its size argument, then
// variance_final_from_16bit_sum_avx2()), but returns the value stored in *sse
// unchanged. The scalar sum the reduction also produces is not used.
unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               unsigned int *sse) {
  __m256i sse_acc, sum_acc;
  int unused_sum;

  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &sse_acc,
                  &sum_acc);
  variance_final_from_16bit_sum_avx2(sse_acc, sum_acc, sse, &unused_sum);

  return *sse;
}
820*fb1b10abSAndroid Build Coastguard Worker
// 64x64 sub-pixel variance entry point.
//
// The work is split into two calls to sub_pixel_variance32xh_avx2(): one at
// src_ptr/ref_ptr and one 32 bytes further along both pointers, each with 64
// as the size argument. Each call returns a signed sum and writes its own sse
// value; the two sums and the two sse values are added, the combined sse is
// stored in *sse, and the function returns *sse - ((se * se) >> 12). The
// shift by 12 divides by 4096, i.e. 64 * 64 as in the function name.
unsigned int vpx_sub_pixel_variance64x64_avx2(
    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
  unsigned int sse_first, sse_second;
  int se_first, se_second, se_total;
  int64_t se_sq;

  se_first = sub_pixel_variance32xh_avx2(src_ptr, src_stride, x_offset,
                                         y_offset, ref_ptr, ref_stride, 64,
                                         &sse_first);
  se_second = sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset,
                                          y_offset, ref_ptr + 32, ref_stride,
                                          64, &sse_second);

  se_total = se_first + se_second;
  *sse = sse_first + sse_second;

  // Square in 64 bits so the product cannot overflow a 32-bit int.
  se_sq = (int64_t)se_total * se_total;
  return *sse - (uint32_t)(se_sq >> 12);
}
835*fb1b10abSAndroid Build Coastguard Worker
// 32x32 sub-pixel variance entry point.
//
// A single call to sub_pixel_variance32xh_avx2() with 32 as the size argument
// writes *sse and returns a signed sum. The function returns
// *sse - ((se * se) >> 10); the shift by 10 divides by 1024, i.e. 32 * 32 as
// in the function name.
unsigned int vpx_sub_pixel_variance32x32_avx2(
    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
  int se_total;
  int64_t se_sq;

  se_total = sub_pixel_variance32xh_avx2(src_ptr, src_stride, x_offset,
                                         y_offset, ref_ptr, ref_stride, 32,
                                         sse);

  // Square in 64 bits so the product cannot overflow a 32-bit int.
  se_sq = (int64_t)se_total * se_total;
  return *sse - (uint32_t)(se_sq >> 10);
}
843*fb1b10abSAndroid Build Coastguard Worker
// 64x64 sub-pixel averaging variance entry point.
//
// The work is split into two calls to sub_pixel_avg_variance32xh_avx2(): one
// at src_ptr/ref_ptr/second_pred and one 32 bytes further along all three
// pointers. NOTE(review): the two trailing 64 arguments are presumably the
// second_pred stride and the height -- confirm against the helper's
// signature. Each call returns a signed sum and writes its own sse value; the
// sums and sse values are added, the combined sse is stored in *sse, and the
// function returns *sse - ((se * se) >> 12). The shift by 12 divides by 4096,
// i.e. 64 * 64 as in the function name.
unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
    const uint8_t *second_pred) {
  unsigned int sse_first, sse_second;
  int se_first, se_second, se_total;
  int64_t se_sq;

  se_first = sub_pixel_avg_variance32xh_avx2(
      src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride,
      second_pred, 64, 64, &sse_first);
  se_second = sub_pixel_avg_variance32xh_avx2(
      src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride,
      second_pred + 32, 64, 64, &sse_second);

  se_total = se_first + se_second;
  *sse = sse_first + sse_second;

  // Square in 64 bits so the product cannot overflow a 32-bit int.
  se_sq = (int64_t)se_total * se_total;
  return *sse - (uint32_t)(se_sq >> 12);
}
862*fb1b10abSAndroid Build Coastguard Worker
// 32x32 sub-pixel averaging variance entry point.
//
// A single call to sub_pixel_avg_variance32xh_avx2() writes *sse and returns
// a signed sum. NOTE(review): the two trailing 32 arguments are presumably the
// second_pred stride and the height -- confirm against the helper's
// signature. The function returns *sse - ((se * se) >> 10); the shift by 10
// divides by 1024, i.e. 32 * 32 as in the function name.
unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
    const uint8_t *second_pred) {
  int se_total;
  int64_t se_sq;

  // Process 32 elements in parallel.
  se_total = sub_pixel_avg_variance32xh_avx2(
      src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride,
      second_pred, 32, 32, sse);

  // Square in 64 bits so the product cannot overflow a 32-bit int.
  se_sq = (int64_t)se_total * se_total;
  return *sse - (uint32_t)(se_sq >> 10);
}
873