xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/variance_sse2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
12*fb1b10abSAndroid Build Coastguard Worker #include <emmintrin.h>  // SSE2
13*fb1b10abSAndroid Build Coastguard Worker 
14*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
16*fb1b10abSAndroid Build Coastguard Worker #include "vpx_ports/mem.h"
17*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/x86/mem_sse2.h"
18*fb1b10abSAndroid Build Coastguard Worker 
// Horizontally adds the four 32-bit lanes of |val| and returns the total as
// an unsigned int. The additions are performed with 32-bit wraparound.
static INLINE unsigned int add32x4_sse2(__m128i val) {
  // Fold the upper 64 bits onto the lower 64 bits: lanes {0,1} += lanes {2,3}.
  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
  // Fold lane 1 onto lane 0 so that lane 0 holds the sum of all four lanes.
  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
  return (unsigned int)_mm_cvtsi128_si32(val);
}
24*fb1b10abSAndroid Build Coastguard Worker 
// Returns the sum of the squares of the 256 int16_t values starting at
// |src_ptr| (32 iterations of 8 values each). The loads are unaligned, so
// |src_ptr| needs no particular alignment.
unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr);
    // madd squares each 16-bit value and adds adjacent pairs into 32-bit
    // lanes, which are then accumulated.
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src_ptr += 8;
  }

  return add32x4_sse2(vsum);
}
37*fb1b10abSAndroid Build Coastguard Worker 
load4x2_sse2(const uint8_t * const p,const int stride)38*fb1b10abSAndroid Build Coastguard Worker static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
39*fb1b10abSAndroid Build Coastguard Worker   const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
40*fb1b10abSAndroid Build Coastguard Worker   const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
41*fb1b10abSAndroid Build Coastguard Worker   const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
42*fb1b10abSAndroid Build Coastguard Worker   return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
43*fb1b10abSAndroid Build Coastguard Worker }
44*fb1b10abSAndroid Build Coastguard Worker 
// Accumulates one vector of eight 16-bit differences into the running totals.
//   src_ptr, ref_ptr: eight 16-bit samples each (vectors, despite the names).
//   sse: four 32-bit lanes; each gains the sum of two adjacent squared diffs.
//   sum: eight 16-bit lanes; each gains the corresponding diff (wrapping add).
static INLINE void variance_kernel_sse2(const __m128i src_ptr,
                                        const __m128i ref_ptr,
                                        __m128i *const sse,
                                        __m128i *const sum) {
  const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr);
  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
  *sum = _mm_add_epi16(*sum, diff);
}
53*fb1b10abSAndroid Build Coastguard Worker 
54*fb1b10abSAndroid Build Coastguard Worker // Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
55*fb1b10abSAndroid Build Coastguard Worker // Slightly faster than variance_final_256_pel_sse2()
variance_final_128_pel_sse2(__m128i vsse,__m128i vsum,unsigned int * const sse,int * const sum)56*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
57*fb1b10abSAndroid Build Coastguard Worker                                                unsigned int *const sse,
58*fb1b10abSAndroid Build Coastguard Worker                                                int *const sum) {
59*fb1b10abSAndroid Build Coastguard Worker   *sse = add32x4_sse2(vsse);
60*fb1b10abSAndroid Build Coastguard Worker 
61*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
62*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
63*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
64*fb1b10abSAndroid Build Coastguard Worker   *sum = (int16_t)_mm_extract_epi16(vsum, 0);
65*fb1b10abSAndroid Build Coastguard Worker }
66*fb1b10abSAndroid Build Coastguard Worker 
67*fb1b10abSAndroid Build Coastguard Worker // Can handle 256 pixels' diff sum (such as 16x16)
variance_final_256_pel_sse2(__m128i vsse,__m128i vsum,unsigned int * const sse,int * const sum)68*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
69*fb1b10abSAndroid Build Coastguard Worker                                                unsigned int *const sse,
70*fb1b10abSAndroid Build Coastguard Worker                                                int *const sum) {
71*fb1b10abSAndroid Build Coastguard Worker   *sse = add32x4_sse2(vsse);
72*fb1b10abSAndroid Build Coastguard Worker 
73*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
74*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
75*fb1b10abSAndroid Build Coastguard Worker   *sum = (int16_t)_mm_extract_epi16(vsum, 0);
76*fb1b10abSAndroid Build Coastguard Worker   *sum += (int16_t)_mm_extract_epi16(vsum, 1);
77*fb1b10abSAndroid Build Coastguard Worker }
78*fb1b10abSAndroid Build Coastguard Worker 
79*fb1b10abSAndroid Build Coastguard Worker // Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
variance_final_512_pel_sse2(__m128i vsse,__m128i vsum,unsigned int * const sse,int * const sum)80*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
81*fb1b10abSAndroid Build Coastguard Worker                                                unsigned int *const sse,
82*fb1b10abSAndroid Build Coastguard Worker                                                int *const sum) {
83*fb1b10abSAndroid Build Coastguard Worker   *sse = add32x4_sse2(vsse);
84*fb1b10abSAndroid Build Coastguard Worker 
85*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
86*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_unpacklo_epi16(vsum, vsum);
87*fb1b10abSAndroid Build Coastguard Worker   vsum = _mm_srai_epi32(vsum, 16);
88*fb1b10abSAndroid Build Coastguard Worker   *sum = (int)add32x4_sse2(vsum);
89*fb1b10abSAndroid Build Coastguard Worker }
90*fb1b10abSAndroid Build Coastguard Worker 
// Widens the eight signed 16-bit lanes of |sum| to 32 bits and adds the upper
// four to the lower four. Result lane k (k = 0..3) holds
// (int32_t)sum[k] + (int32_t)sum[k + 4].
static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
  // Unpacking a vector with itself places each 16-bit value in the upper half
  // of a 32-bit lane; the arithmetic shift then sign-extends it.
  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
  return _mm_add_epi32(sum_lo, sum_hi);
}
96*fb1b10abSAndroid Build Coastguard Worker 
97*fb1b10abSAndroid Build Coastguard Worker // Can handle 1024 pixels' diff sum (such as 32x32)
sum_final_sse2(const __m128i sum)98*fb1b10abSAndroid Build Coastguard Worker static INLINE int sum_final_sse2(const __m128i sum) {
99*fb1b10abSAndroid Build Coastguard Worker   const __m128i t = sum_to_32bit_sse2(sum);
100*fb1b10abSAndroid Build Coastguard Worker   return (int)add32x4_sse2(t);
101*fb1b10abSAndroid Build Coastguard Worker }
102*fb1b10abSAndroid Build Coastguard Worker 
variance4_sse2(const uint8_t * src_ptr,const int src_stride,const uint8_t * ref_ptr,const int ref_stride,const int h,__m128i * const sse,__m128i * const sum)103*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
104*fb1b10abSAndroid Build Coastguard Worker                                   const uint8_t *ref_ptr, const int ref_stride,
105*fb1b10abSAndroid Build Coastguard Worker                                   const int h, __m128i *const sse,
106*fb1b10abSAndroid Build Coastguard Worker                                   __m128i *const sum) {
107*fb1b10abSAndroid Build Coastguard Worker   int i;
108*fb1b10abSAndroid Build Coastguard Worker 
109*fb1b10abSAndroid Build Coastguard Worker   assert(h <= 256);  // May overflow for larger height.
110*fb1b10abSAndroid Build Coastguard Worker   *sse = _mm_setzero_si128();
111*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_setzero_si128();
112*fb1b10abSAndroid Build Coastguard Worker 
113*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < h; i += 2) {
114*fb1b10abSAndroid Build Coastguard Worker     const __m128i s = load4x2_sse2(src_ptr, src_stride);
115*fb1b10abSAndroid Build Coastguard Worker     const __m128i r = load4x2_sse2(ref_ptr, ref_stride);
116*fb1b10abSAndroid Build Coastguard Worker 
117*fb1b10abSAndroid Build Coastguard Worker     variance_kernel_sse2(s, r, sse, sum);
118*fb1b10abSAndroid Build Coastguard Worker     src_ptr += 2 * src_stride;
119*fb1b10abSAndroid Build Coastguard Worker     ref_ptr += 2 * ref_stride;
120*fb1b10abSAndroid Build Coastguard Worker   }
121*fb1b10abSAndroid Build Coastguard Worker }
122*fb1b10abSAndroid Build Coastguard Worker 
variance8_sse2(const uint8_t * src_ptr,const int src_stride,const uint8_t * ref_ptr,const int ref_stride,const int h,__m128i * const sse,__m128i * const sum)123*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride,
124*fb1b10abSAndroid Build Coastguard Worker                                   const uint8_t *ref_ptr, const int ref_stride,
125*fb1b10abSAndroid Build Coastguard Worker                                   const int h, __m128i *const sse,
126*fb1b10abSAndroid Build Coastguard Worker                                   __m128i *const sum) {
127*fb1b10abSAndroid Build Coastguard Worker   const __m128i zero = _mm_setzero_si128();
128*fb1b10abSAndroid Build Coastguard Worker   int i;
129*fb1b10abSAndroid Build Coastguard Worker 
130*fb1b10abSAndroid Build Coastguard Worker   assert(h <= 128);  // May overflow for larger height.
131*fb1b10abSAndroid Build Coastguard Worker   *sse = _mm_setzero_si128();
132*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_setzero_si128();
133*fb1b10abSAndroid Build Coastguard Worker 
134*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < h; i++) {
135*fb1b10abSAndroid Build Coastguard Worker     const __m128i s =
136*fb1b10abSAndroid Build Coastguard Worker         _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero);
137*fb1b10abSAndroid Build Coastguard Worker     const __m128i r =
138*fb1b10abSAndroid Build Coastguard Worker         _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero);
139*fb1b10abSAndroid Build Coastguard Worker 
140*fb1b10abSAndroid Build Coastguard Worker     variance_kernel_sse2(s, r, sse, sum);
141*fb1b10abSAndroid Build Coastguard Worker     src_ptr += src_stride;
142*fb1b10abSAndroid Build Coastguard Worker     ref_ptr += ref_stride;
143*fb1b10abSAndroid Build Coastguard Worker   }
144*fb1b10abSAndroid Build Coastguard Worker }
145*fb1b10abSAndroid Build Coastguard Worker 
variance16_kernel_sse2(const uint8_t * const src_ptr,const uint8_t * const ref_ptr,__m128i * const sse,__m128i * const sum)146*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr,
147*fb1b10abSAndroid Build Coastguard Worker                                           const uint8_t *const ref_ptr,
148*fb1b10abSAndroid Build Coastguard Worker                                           __m128i *const sse,
149*fb1b10abSAndroid Build Coastguard Worker                                           __m128i *const sum) {
150*fb1b10abSAndroid Build Coastguard Worker   const __m128i zero = _mm_setzero_si128();
151*fb1b10abSAndroid Build Coastguard Worker   const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr);
152*fb1b10abSAndroid Build Coastguard Worker   const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr);
153*fb1b10abSAndroid Build Coastguard Worker   const __m128i src0 = _mm_unpacklo_epi8(s, zero);
154*fb1b10abSAndroid Build Coastguard Worker   const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
155*fb1b10abSAndroid Build Coastguard Worker   const __m128i src1 = _mm_unpackhi_epi8(s, zero);
156*fb1b10abSAndroid Build Coastguard Worker   const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
157*fb1b10abSAndroid Build Coastguard Worker 
158*fb1b10abSAndroid Build Coastguard Worker   variance_kernel_sse2(src0, ref0, sse, sum);
159*fb1b10abSAndroid Build Coastguard Worker   variance_kernel_sse2(src1, ref1, sse, sum);
160*fb1b10abSAndroid Build Coastguard Worker }
161*fb1b10abSAndroid Build Coastguard Worker 
variance16_sse2(const uint8_t * src_ptr,const int src_stride,const uint8_t * ref_ptr,const int ref_stride,const int h,__m128i * const sse,__m128i * const sum)162*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride,
163*fb1b10abSAndroid Build Coastguard Worker                                    const uint8_t *ref_ptr, const int ref_stride,
164*fb1b10abSAndroid Build Coastguard Worker                                    const int h, __m128i *const sse,
165*fb1b10abSAndroid Build Coastguard Worker                                    __m128i *const sum) {
166*fb1b10abSAndroid Build Coastguard Worker   int i;
167*fb1b10abSAndroid Build Coastguard Worker 
168*fb1b10abSAndroid Build Coastguard Worker   assert(h <= 64);  // May overflow for larger height.
169*fb1b10abSAndroid Build Coastguard Worker   *sse = _mm_setzero_si128();
170*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_setzero_si128();
171*fb1b10abSAndroid Build Coastguard Worker 
172*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < h; ++i) {
173*fb1b10abSAndroid Build Coastguard Worker     variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum);
174*fb1b10abSAndroid Build Coastguard Worker     src_ptr += src_stride;
175*fb1b10abSAndroid Build Coastguard Worker     ref_ptr += ref_stride;
176*fb1b10abSAndroid Build Coastguard Worker   }
177*fb1b10abSAndroid Build Coastguard Worker }
178*fb1b10abSAndroid Build Coastguard Worker 
variance32_sse2(const uint8_t * src_ptr,const int src_stride,const uint8_t * ref_ptr,const int ref_stride,const int h,__m128i * const sse,__m128i * const sum)179*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride,
180*fb1b10abSAndroid Build Coastguard Worker                                    const uint8_t *ref_ptr, const int ref_stride,
181*fb1b10abSAndroid Build Coastguard Worker                                    const int h, __m128i *const sse,
182*fb1b10abSAndroid Build Coastguard Worker                                    __m128i *const sum) {
183*fb1b10abSAndroid Build Coastguard Worker   int i;
184*fb1b10abSAndroid Build Coastguard Worker 
185*fb1b10abSAndroid Build Coastguard Worker   assert(h <= 32);  // May overflow for larger height.
186*fb1b10abSAndroid Build Coastguard Worker   // Don't initialize sse here since it's an accumulation.
187*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_setzero_si128();
188*fb1b10abSAndroid Build Coastguard Worker 
189*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < h; ++i) {
190*fb1b10abSAndroid Build Coastguard Worker     variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
191*fb1b10abSAndroid Build Coastguard Worker     variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
192*fb1b10abSAndroid Build Coastguard Worker     src_ptr += src_stride;
193*fb1b10abSAndroid Build Coastguard Worker     ref_ptr += ref_stride;
194*fb1b10abSAndroid Build Coastguard Worker   }
195*fb1b10abSAndroid Build Coastguard Worker }
196*fb1b10abSAndroid Build Coastguard Worker 
variance64_sse2(const uint8_t * src_ptr,const int src_stride,const uint8_t * ref_ptr,const int ref_stride,const int h,__m128i * const sse,__m128i * const sum)197*fb1b10abSAndroid Build Coastguard Worker static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride,
198*fb1b10abSAndroid Build Coastguard Worker                                    const uint8_t *ref_ptr, const int ref_stride,
199*fb1b10abSAndroid Build Coastguard Worker                                    const int h, __m128i *const sse,
200*fb1b10abSAndroid Build Coastguard Worker                                    __m128i *const sum) {
201*fb1b10abSAndroid Build Coastguard Worker   int i;
202*fb1b10abSAndroid Build Coastguard Worker 
203*fb1b10abSAndroid Build Coastguard Worker   assert(h <= 16);  // May overflow for larger height.
204*fb1b10abSAndroid Build Coastguard Worker   // Don't initialize sse here since it's an accumulation.
205*fb1b10abSAndroid Build Coastguard Worker   *sum = _mm_setzero_si128();
206*fb1b10abSAndroid Build Coastguard Worker 
207*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < h; ++i) {
208*fb1b10abSAndroid Build Coastguard Worker     variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
209*fb1b10abSAndroid Build Coastguard Worker     variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
210*fb1b10abSAndroid Build Coastguard Worker     variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum);
211*fb1b10abSAndroid Build Coastguard Worker     variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum);
212*fb1b10abSAndroid Build Coastguard Worker     src_ptr += src_stride;
213*fb1b10abSAndroid Build Coastguard Worker     ref_ptr += ref_stride;
214*fb1b10abSAndroid Build Coastguard Worker   }
215*fb1b10abSAndroid Build Coastguard Worker }
216*fb1b10abSAndroid Build Coastguard Worker 
// Computes, for an 8x8 block, the sum of squared differences between source
// and reference (written to |*sse|) and the sum of differences (written to
// |*sum|).
void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum) {
  __m128i vsse, vsum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, sum);
}
224*fb1b10abSAndroid Build Coastguard Worker 
// Computes, for a 16x16 block, the sum of squared differences between source
// and reference (written to |*sse|) and the sum of differences (written to
// |*sum|).
void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  __m128i vsse, vsum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_256_pel_sse2(vsse, vsum, sse, sum);
}
232*fb1b10abSAndroid Build Coastguard Worker 
// 4x4 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 16, where sum is the sum of differences and 16 is the
// pixel count (the division is done as a right shift by 4).
unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 4);
}
242*fb1b10abSAndroid Build Coastguard Worker 
// 4x8 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 32, where sum is the sum of differences and 32 is the
// pixel count (the division is done as a right shift by 5).
unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 5);
}
252*fb1b10abSAndroid Build Coastguard Worker 
// 8x4 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 32, where sum is the sum of differences and 32 is the
// pixel count (the division is done as a right shift by 5).
unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 5);
}
262*fb1b10abSAndroid Build Coastguard Worker 
// 8x8 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 64, where sum is the sum of differences and 64 is the
// pixel count (the division is done as a right shift by 6).
unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 6);
}
272*fb1b10abSAndroid Build Coastguard Worker 
// 8x16 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 128, where sum is the sum of differences and 128 is the
// pixel count (the division is done as a right shift by 7). With 8-bit pixels
// |sum| is at most 128 * 255 = 32640, so sum * sum still fits in a 32-bit int.
unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 7);
}
282*fb1b10abSAndroid Build Coastguard Worker 
// 16x8 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 128, where sum is the sum of differences and 128 is the
// pixel count (the division is done as a right shift by 7). With 8-bit pixels
// |sum| is at most 128 * 255 = 32640, so sum * sum still fits in a 32-bit int.
unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 7);
}
292*fb1b10abSAndroid Build Coastguard Worker 
// 16x16 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 256, where sum is the sum of differences and 256 is the
// pixel count (the division is done as a right shift by 8). The square is
// computed in 64 bits because sum * sum can exceed the range of a 32-bit int.
unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_256_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}
302*fb1b10abSAndroid Build Coastguard Worker 
// 16x32 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 512, where sum is the sum of differences and 512 is the
// pixel count (the division is done as a right shift by 9). The square is
// computed in 64 bits because sum * sum can exceed the range of a 32-bit int.
unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
  variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}
312*fb1b10abSAndroid Build Coastguard Worker 
// 32x16 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 512, where sum is the sum of differences and 512 is the
// pixel count (the division is done as a right shift by 9). The square is
// computed in 64 bits because sum * sum can exceed the range of a 32-bit int.
unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  // variance32_sse2() accumulates into vsse without resetting it, so it must
  // start at zero.
  __m128i vsse = _mm_setzero_si128();
  __m128i vsum;
  int sum;
  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}
323*fb1b10abSAndroid Build Coastguard Worker 
// 32x32 block: writes the sum of squared differences to |*sse| and returns
// *sse - (sum * sum) / 1024, where sum is the sum of differences and 1024 is
// the pixel count (the division is done as a right shift by 10). The square
// is computed in 64 bits because sum * sum can exceed the range of a 32-bit
// int.
unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  // variance32_sse2() accumulates into vsse without resetting it, so it must
  // start at zero.
  __m128i vsse = _mm_setzero_si128();
  __m128i vsum;
  int sum;
  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
  *sse = add32x4_sse2(vsse);
  // Widen every 16-bit partial sum to 32 bits before the horizontal add.
  sum = sum_final_sse2(vsum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
335*fb1b10abSAndroid Build Coastguard Worker 
vpx_variance32x64_sse2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,unsigned int * sse)336*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
337*fb1b10abSAndroid Build Coastguard Worker                                     const uint8_t *ref_ptr, int ref_stride,
338*fb1b10abSAndroid Build Coastguard Worker                                     unsigned int *sse) {
339*fb1b10abSAndroid Build Coastguard Worker   __m128i vsse = _mm_setzero_si128();
340*fb1b10abSAndroid Build Coastguard Worker   __m128i vsum = _mm_setzero_si128();
341*fb1b10abSAndroid Build Coastguard Worker   int sum;
342*fb1b10abSAndroid Build Coastguard Worker   int i = 0;
343*fb1b10abSAndroid Build Coastguard Worker 
344*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < 2; i++) {
345*fb1b10abSAndroid Build Coastguard Worker     __m128i vsum16;
346*fb1b10abSAndroid Build Coastguard Worker     variance32_sse2(src_ptr + 32 * i * src_stride, src_stride,
347*fb1b10abSAndroid Build Coastguard Worker                     ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
348*fb1b10abSAndroid Build Coastguard Worker                     &vsum16);
349*fb1b10abSAndroid Build Coastguard Worker     vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
350*fb1b10abSAndroid Build Coastguard Worker   }
351*fb1b10abSAndroid Build Coastguard Worker   *sse = add32x4_sse2(vsse);
352*fb1b10abSAndroid Build Coastguard Worker   sum = (int)add32x4_sse2(vsum);
353*fb1b10abSAndroid Build Coastguard Worker   return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
354*fb1b10abSAndroid Build Coastguard Worker }
355*fb1b10abSAndroid Build Coastguard Worker 
vpx_variance64x32_sse2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,unsigned int * sse)356*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
357*fb1b10abSAndroid Build Coastguard Worker                                     const uint8_t *ref_ptr, int ref_stride,
358*fb1b10abSAndroid Build Coastguard Worker                                     unsigned int *sse) {
359*fb1b10abSAndroid Build Coastguard Worker   __m128i vsse = _mm_setzero_si128();
360*fb1b10abSAndroid Build Coastguard Worker   __m128i vsum = _mm_setzero_si128();
361*fb1b10abSAndroid Build Coastguard Worker   int sum;
362*fb1b10abSAndroid Build Coastguard Worker   int i = 0;
363*fb1b10abSAndroid Build Coastguard Worker 
364*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < 2; i++) {
365*fb1b10abSAndroid Build Coastguard Worker     __m128i vsum16;
366*fb1b10abSAndroid Build Coastguard Worker     variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
367*fb1b10abSAndroid Build Coastguard Worker                     ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
368*fb1b10abSAndroid Build Coastguard Worker                     &vsum16);
369*fb1b10abSAndroid Build Coastguard Worker     vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
370*fb1b10abSAndroid Build Coastguard Worker   }
371*fb1b10abSAndroid Build Coastguard Worker   *sse = add32x4_sse2(vsse);
372*fb1b10abSAndroid Build Coastguard Worker   sum = (int)add32x4_sse2(vsum);
373*fb1b10abSAndroid Build Coastguard Worker   return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
374*fb1b10abSAndroid Build Coastguard Worker }
375*fb1b10abSAndroid Build Coastguard Worker 
vpx_variance64x64_sse2(const uint8_t * src_ptr,int src_stride,const uint8_t * ref_ptr,int ref_stride,unsigned int * sse)376*fb1b10abSAndroid Build Coastguard Worker unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
377*fb1b10abSAndroid Build Coastguard Worker                                     const uint8_t *ref_ptr, int ref_stride,
378*fb1b10abSAndroid Build Coastguard Worker                                     unsigned int *sse) {
379*fb1b10abSAndroid Build Coastguard Worker   __m128i vsse = _mm_setzero_si128();
380*fb1b10abSAndroid Build Coastguard Worker   __m128i vsum = _mm_setzero_si128();
381*fb1b10abSAndroid Build Coastguard Worker   int sum;
382*fb1b10abSAndroid Build Coastguard Worker   int i = 0;
383*fb1b10abSAndroid Build Coastguard Worker 
384*fb1b10abSAndroid Build Coastguard Worker   for (i = 0; i < 4; i++) {
385*fb1b10abSAndroid Build Coastguard Worker     __m128i vsum16;
386*fb1b10abSAndroid Build Coastguard Worker     variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
387*fb1b10abSAndroid Build Coastguard Worker                     ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
388*fb1b10abSAndroid Build Coastguard Worker                     &vsum16);
389*fb1b10abSAndroid Build Coastguard Worker     vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
390*fb1b10abSAndroid Build Coastguard Worker   }
391*fb1b10abSAndroid Build Coastguard Worker   *sse = add32x4_sse2(vsse);
392*fb1b10abSAndroid Build Coastguard Worker   sum = (int)add32x4_sse2(vsum);
393*fb1b10abSAndroid Build Coastguard Worker   return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
394*fb1b10abSAndroid Build Coastguard Worker }
395*fb1b10abSAndroid Build Coastguard Worker 
// 8x8 "MSE": runs the 8x8 variance kernel only for the value it stores
// through its sse pointer. That value is copied to *sse and returned; the
// kernel's own return value is discarded.
unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int *sse) {
  unsigned int block_sse;
  (void)vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride,
                             &block_sse);
  *sse = block_sse;
  return block_sse;
}
402*fb1b10abSAndroid Build Coastguard Worker 
// 8x16 "MSE": runs the 8x16 variance kernel only for the value it stores
// through its sse pointer. That value is copied to *sse and returned; the
// kernel's own return value is discarded.
unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
  unsigned int block_sse;
  (void)vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride,
                              &block_sse);
  *sse = block_sse;
  return block_sse;
}
409*fb1b10abSAndroid Build Coastguard Worker 
// 16x8 "MSE": runs the 16x8 variance kernel only for the value it stores
// through its sse pointer. That value is copied to *sse and returned; the
// kernel's own return value is discarded.
unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
  unsigned int block_sse;
  (void)vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride,
                              &block_sse);
  *sse = block_sse;
  return block_sse;
}
416*fb1b10abSAndroid Build Coastguard Worker 
// 16x16 "MSE": runs the 16x16 variance kernel only for the value it stores
// through its sse pointer. That value is copied to *sse and returned; the
// kernel's own return value is discarded.
unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               unsigned int *sse) {
  unsigned int block_sse;
  (void)vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride,
                               &block_sse);
  *sse = block_sse;
  return block_sse;
}
423*fb1b10abSAndroid Build Coastguard Worker 
// Prototypes for the sub-pixel variance kernels defined in
// subpel_variance.asm. One kernel is declared per fixed column width (4, 8
// and 16) and instruction set (sse2, ssse3); the height is a run-time
// argument. The two trailing pointer parameters are unused place holders for
// PIC enabled builds.
#define SUBPEL_XH_PROTO(w, opt)                                               \
  int vpx_sub_pixel_variance##w##xh_##opt(                                    \
      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,             \
      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \
      unsigned int *sse, void *unused0, void *unused)
// Declares the 4-, 8- and 16-wide kernels for one instruction set.
#define SUBPEL_XH_PROTOS(opt) \
  SUBPEL_XH_PROTO(4, opt);    \
  SUBPEL_XH_PROTO(8, opt);    \
  SUBPEL_XH_PROTO(16, opt)

SUBPEL_XH_PROTOS(sse2);
SUBPEL_XH_PROTOS(ssse3);
#undef SUBPEL_XH_PROTOS
#undef SUBPEL_XH_PROTO
440*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sub_pixel_variance<w>x<h>_<opt>() on top of the <wf>-wide
// kernel vpx_sub_pixel_variance<wf>xh_<opt>().
//   w, h            - block width and height in pixels.
//   wf              - width covered by one kernel call; the block is
//                     processed as side-by-side columns of this width.
//   wlog2, hlog2    - shift amounts whose sum is applied to the squared
//                     kernel sum (log2 of w and h at every call site).
//   opt             - instruction-set suffix of both the kernel and the
//                     generated function.
//   cast_prod, cast - casts applied to the squared sum and to its first
//                     operand (presumably sized per block so the product
//                     cannot overflow -- confirm when adding a size).
// The generated function adds up the kernel return values and the SSE
// values of all columns, stores the SSE total in *sse and returns
// sse - ((sum * sum) >> (wlog2 + hlog2)).
// Column offsets are derived from wf rather than being hard-coded, so the
// macro is valid for any w that is a multiple of wf.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
    unsigned int sse_total;                                               \
    int col;                                                              \
    int se_total = vpx_sub_pixel_variance##wf##xh_##opt(                  \
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
        &sse_total, NULL, NULL);                                          \
    for (col = (wf); col < (w); col += (wf)) {                            \
      unsigned int col_sse;                                               \
      const int col_se = vpx_sub_pixel_variance##wf##xh_##opt(            \
          src_ptr + col, src_stride, x_offset, y_offset, ref_ptr + col,   \
          ref_stride, h, &col_sse, NULL, NULL);                           \
      se_total += col_se;                                                 \
      sse_total += col_sse;                                               \
    }                                                                     \
    *sse = sse_total;                                                     \
    return sse_total -                                                    \
           (unsigned int)(cast_prod(cast se_total * se_total) >>          \
                          (wlog2 + hlog2));                               \
  }
473*fb1b10abSAndroid Build Coastguard Worker 
474*fb1b10abSAndroid Build Coastguard Worker #define FNS(opt1, opt2)                             \
475*fb1b10abSAndroid Build Coastguard Worker   FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
476*fb1b10abSAndroid Build Coastguard Worker   FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
477*fb1b10abSAndroid Build Coastguard Worker   FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
478*fb1b10abSAndroid Build Coastguard Worker   FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
479*fb1b10abSAndroid Build Coastguard Worker   FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
480*fb1b10abSAndroid Build Coastguard Worker   FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
481*fb1b10abSAndroid Build Coastguard Worker   FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
482*fb1b10abSAndroid Build Coastguard Worker   FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t))   \
483*fb1b10abSAndroid Build Coastguard Worker   FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t))    \
484*fb1b10abSAndroid Build Coastguard Worker   FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t))     \
485*fb1b10abSAndroid Build Coastguard Worker   FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t))     \
486*fb1b10abSAndroid Build Coastguard Worker   FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t))     \
487*fb1b10abSAndroid Build Coastguard Worker   FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
488*fb1b10abSAndroid Build Coastguard Worker 
489*fb1b10abSAndroid Build Coastguard Worker FNS(sse2, sse2)
490*fb1b10abSAndroid Build Coastguard Worker FNS(ssse3, ssse3)
491*fb1b10abSAndroid Build Coastguard Worker 
492*fb1b10abSAndroid Build Coastguard Worker #undef FNS
493*fb1b10abSAndroid Build Coastguard Worker #undef FN
494*fb1b10abSAndroid Build Coastguard Worker 
// Prototypes for the sub-pixel "avg" variance kernels, which additionally
// take a second predictor and its stride. One kernel is declared per fixed
// column width (4, 8 and 16) and instruction set (sse2, ssse3); the height is
// a run-time argument. The two trailing pointer parameters are unused place
// holders for PIC enabled builds.
#define SUBPEL_AVG_XH_PROTO(w, opt)                                    \
  int vpx_sub_pixel_avg_variance##w##xh_##opt(                         \
      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,      \
      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride,      \
      const uint8_t *second_pred, ptrdiff_t second_stride, int height, \
      unsigned int *sse, void *unused0, void *unused)
// Declares the 4-, 8- and 16-wide kernels for one instruction set.
#define SUBPEL_AVG_XH_PROTOS(opt) \
  SUBPEL_AVG_XH_PROTO(4, opt);    \
  SUBPEL_AVG_XH_PROTO(8, opt);    \
  SUBPEL_AVG_XH_PROTO(16, opt)

SUBPEL_AVG_XH_PROTOS(sse2);
SUBPEL_AVG_XH_PROTOS(ssse3);
#undef SUBPEL_AVG_XH_PROTOS
#undef SUBPEL_AVG_XH_PROTO
511*fb1b10abSAndroid Build Coastguard Worker 
// Generates vpx_sub_pixel_avg_variance<w>x<h>_<opt>() on top of the
// <wf>-wide kernel vpx_sub_pixel_avg_variance<wf>xh_<opt>().
//   w, h            - block width and height in pixels; w is also passed to
//                     the kernel as the stride of second_pred.
//   wf              - width covered by one kernel call; the block is
//                     processed as side-by-side columns of this width.
//   wlog2, hlog2    - shift amounts whose sum is applied to the squared
//                     kernel sum (log2 of w and h at every call site).
//   opt             - instruction-set suffix of both the kernel and the
//                     generated function.
//   cast_prod, cast - casts applied to the squared sum and to its first
//                     operand (presumably sized per block so the product
//                     cannot overflow -- confirm when adding a size).
// The generated function adds up the kernel return values and the SSE
// values of all columns, stores the SSE total in *sse and returns
// sse - ((sum * sum) >> (wlog2 + hlog2)).
// Column offsets (applied to src_ptr, ref_ptr and second_pred alike) are
// derived from wf rather than being hard-coded, so the macro is valid for any
// w that is a multiple of wf.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,          \
      const uint8_t *second_pred) {                                       \
    unsigned int sse_total;                                               \
    int col;                                                              \
    int se_total = vpx_sub_pixel_avg_variance##wf##xh_##opt(              \
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride,     \
        second_pred, w, h, &sse_total, NULL, NULL);                       \
    for (col = (wf); col < (w); col += (wf)) {                            \
      unsigned int col_sse;                                               \
      const int col_se = vpx_sub_pixel_avg_variance##wf##xh_##opt(        \
          src_ptr + col, src_stride, x_offset, y_offset, ref_ptr + col,   \
          ref_stride, second_pred + col, w, h, &col_sse, NULL, NULL);     \
      se_total += col_se;                                                 \
      sse_total += col_sse;                                               \
    }                                                                     \
    *sse = sse_total;                                                     \
    return sse_total -                                                    \
           (unsigned int)(cast_prod(cast se_total * se_total) >>          \
                          (wlog2 + hlog2));                               \
  }
545*fb1b10abSAndroid Build Coastguard Worker 
546*fb1b10abSAndroid Build Coastguard Worker #define FNS(opt1, opt2)                             \
547*fb1b10abSAndroid Build Coastguard Worker   FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
548*fb1b10abSAndroid Build Coastguard Worker   FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
549*fb1b10abSAndroid Build Coastguard Worker   FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
550*fb1b10abSAndroid Build Coastguard Worker   FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
551*fb1b10abSAndroid Build Coastguard Worker   FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
552*fb1b10abSAndroid Build Coastguard Worker   FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
553*fb1b10abSAndroid Build Coastguard Worker   FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
554*fb1b10abSAndroid Build Coastguard Worker   FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t))  \
555*fb1b10abSAndroid Build Coastguard Worker   FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t))   \
556*fb1b10abSAndroid Build Coastguard Worker   FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t))    \
557*fb1b10abSAndroid Build Coastguard Worker   FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t))    \
558*fb1b10abSAndroid Build Coastguard Worker   FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t))    \
559*fb1b10abSAndroid Build Coastguard Worker   FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
560*fb1b10abSAndroid Build Coastguard Worker 
561*fb1b10abSAndroid Build Coastguard Worker FNS(sse2, sse)
562*fb1b10abSAndroid Build Coastguard Worker FNS(ssse3, ssse3)
563*fb1b10abSAndroid Build Coastguard Worker 
564*fb1b10abSAndroid Build Coastguard Worker #undef FNS
565*fb1b10abSAndroid Build Coastguard Worker #undef FN
566