/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "oapv_sad_sse.h"

#if X86_SSE

/* SSD ***********************************************************************/
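/* SSE_SSD_16B_8PEL processes one row of 8 signed 16-bit samples: it takes the
 * per-sample differences between src1 and src2, widens them to 32 bits,
 * squares them, applies a logical right shift, and accumulates the results
 * into the four 32-bit lanes of the accumulator s00a (lane k collects samples
 * k and k + 4). A scalar sketch of what one invocation adds, with acc[]
 * standing for the lanes of s00a:
 *
 *     for (int i = 0; i < 8; i++) {
 *         s32 d = src1[i] - src2[i];
 *         acc[i & 3] += ((u32)(d * d)) >> shift;
 *     }
 */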
#define SSE_SSD_16B_8PEL(src1, src2, shift, s00, s01, s02, s00a) \
    s00 = _mm_loadu_si128((__m128i*)(src1)); \
    s01 = _mm_loadu_si128((__m128i*)(src2)); \
    s02 = _mm_sub_epi16(s00, s01); \
    \
    s00 = _mm_cvtepi16_epi32(s02); \
    s00 = _mm_mullo_epi32(s00, s00); \
    \
    s01 = _mm_srli_si128(s02, 8); \
    s01 = _mm_cvtepi16_epi32(s01); \
    s01 = _mm_mullo_epi32(s01, s01); \
    \
    s00 = _mm_srli_epi32(s00, shift); \
    s01 = _mm_srli_epi32(s01, shift); \
    s00a = _mm_add_epi32(s00a, s00); \
    s00a = _mm_add_epi32(s00a, s01);

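/* Sum of squared differences between two 8x8 blocks of 16-bit samples.
 * s_src1 and s_src2 are strides in samples. The kernel is specialized for
 * 8x8 blocks, so w, h and bit_depth are unused; they are only present so the
 * function matches the oapv_fn_ssd_t signature used in the table below. */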
static s64 ssd_16b_sse_8x8(int w, int h, void * src1, void * src2, int s_src1, int s_src2, int bit_depth)
{
    s64   ssd;
    s16 * s1;
    s16 * s2;
    const int shift = 0;
    __m128i s00, s01, s02, s00a;

    s1 = (s16 *)src1;
    s2 = (s16 *)src2;

    s00a = _mm_setzero_si128();

    SSE_SSD_16B_8PEL(s1, s2, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1, s2 + s_src2, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*2, s2 + s_src2*2, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*3, s2 + s_src2*3, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*4, s2 + s_src2*4, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*5, s2 + s_src2*5, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*6, s2 + s_src2*6, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*7, s2 + s_src2*7, shift, s00, s01, s02, s00a);

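    /* horizontal sum of the four 32-bit accumulator lanes */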
    ssd = _mm_extract_epi32(s00a, 0);
    ssd += _mm_extract_epi32(s00a, 1);
    ssd += _mm_extract_epi32(s00a, 2);
    ssd += _mm_extract_epi32(s00a, 3);

    return ssd;
}

const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_sse[2] =
{
    ssd_16b_sse_8x8,
    NULL
};

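/* DC-removed Hadamard SAD (SATD) of one 8x8 block of samples: an 8x8
 * Hadamard transform is applied (horizontal pass in 16-bit, vertical pass
 * widened to 32-bit), the DC coefficient is zeroed, the absolute values of
 * the remaining coefficients are summed, and the total is normalized with
 * rounding by (sad + 2) >> 2. s_org is the stride of org in samples. */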
int oapv_dc_removed_had8x8_sse(pel* org, int s_org)
{
    int sad = 0;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */
    __m128i src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    __m128i src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
    __m128i pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
    __m128i pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
    __m128i out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
    __m128i out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;

    src0_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src1_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src2_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src3_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src4_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src5_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src6_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src7_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;

    /**************** 8x8 horizontal transform *******************************/
    /***********************    8x8 16 bit Transpose  ************************/
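    /* Transpose technique: three rounds of interleaving (16-bit, 32-bit and
     * 64-bit unpacks) so that out0..out7 end up holding the columns of the
     * original 8x8 block. */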
    out3_8x16b = _mm_unpacklo_epi16(src0_8x16b, src1_8x16b);
    pred0_8x16b = _mm_unpacklo_epi16(src2_8x16b, src3_8x16b);
    out2_8x16b = _mm_unpacklo_epi16(src4_8x16b, src5_8x16b);
    pred3_8x16b = _mm_unpacklo_epi16(src6_8x16b, src7_8x16b);
    out7_8x16b = _mm_unpackhi_epi16(src0_8x16b, src1_8x16b);
    src2_8x16b = _mm_unpackhi_epi16(src2_8x16b, src3_8x16b);
    pred7_8x16b = _mm_unpackhi_epi16(src4_8x16b, src5_8x16b);
    src6_8x16b = _mm_unpackhi_epi16(src6_8x16b, src7_8x16b);

    out1_8x16b = _mm_unpacklo_epi32(out3_8x16b, pred0_8x16b);
    out3_8x16b = _mm_unpackhi_epi32(out3_8x16b, pred0_8x16b);
    pred1_8x16b = _mm_unpacklo_epi32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = _mm_unpackhi_epi32(out2_8x16b, pred3_8x16b);
    out5_8x16b = _mm_unpacklo_epi32(out7_8x16b, src2_8x16b);
    out7_8x16b = _mm_unpackhi_epi32(out7_8x16b, src2_8x16b);
    pred5_8x16b = _mm_unpacklo_epi32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = _mm_unpackhi_epi32(pred7_8x16b, src6_8x16b);

    out0_8x16b = _mm_unpacklo_epi64(out1_8x16b, pred1_8x16b);
    out1_8x16b = _mm_unpackhi_epi64(out1_8x16b, pred1_8x16b);
    out2_8x16b = _mm_unpacklo_epi64(out3_8x16b, pred3_8x16b);
    out3_8x16b = _mm_unpackhi_epi64(out3_8x16b, pred3_8x16b);
    out4_8x16b = _mm_unpacklo_epi64(out5_8x16b, pred5_8x16b);
    out5_8x16b = _mm_unpackhi_epi64(out5_8x16b, pred5_8x16b);
    out6_8x16b = _mm_unpacklo_epi64(out7_8x16b, pred7_8x16b);
    out7_8x16b = _mm_unpackhi_epi64(out7_8x16b, pred7_8x16b);
    /**********************   8x8 16 bit Transpose End   *********************/

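    /* 8-point Hadamard butterfly applied register-wise. Since the block was
     * just transposed, each register holds one column, so adding/subtracting
     * whole registers performs the horizontal (row-wise) transform with one
     * original row per 16-bit lane; the second transpose below restores row
     * order. */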
    /* r0 + r1 */
    pred0_8x16b = _mm_add_epi16(out0_8x16b, out1_8x16b);
    /* r2 + r3 */
    pred2_8x16b = _mm_add_epi16(out2_8x16b, out3_8x16b);
    /* r4 + r5 */
    pred4_8x16b = _mm_add_epi16(out4_8x16b, out5_8x16b);
    /* r6 + r7 */
    pred6_8x16b = _mm_add_epi16(out6_8x16b, out7_8x16b);

    /* r0 + r1 + r2 + r3 */
    pred1_8x16b = _mm_add_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 + r6 + r7 */
    pred5_8x16b = _mm_add_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
    src0_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
    src4_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /* r0 + r1 - r2 - r3 */
    pred1_8x16b = _mm_sub_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 - r6 - r7 */
    pred5_8x16b = _mm_sub_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
    src2_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
    src6_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 */
    pred0_8x16b = _mm_sub_epi16(out0_8x16b, out1_8x16b);
    /* r2 - r3 */
    pred2_8x16b = _mm_sub_epi16(out2_8x16b, out3_8x16b);
    /* r4 - r5 */
    pred4_8x16b = _mm_sub_epi16(out4_8x16b, out5_8x16b);
    /* r6 - r7 */
    pred6_8x16b = _mm_sub_epi16(out6_8x16b, out7_8x16b);

    /* r0 - r1 + r2 - r3 */
    pred1_8x16b = _mm_add_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 + r6 - r7 */
    pred5_8x16b = _mm_add_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
    src1_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
    src5_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 - r2 + r3 */
    pred1_8x16b = _mm_sub_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 - r6 + r7 */
    pred5_8x16b = _mm_sub_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
    src3_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
    src7_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /***********************    8x8 16 bit Transpose  ************************/
    out3_8x16b = _mm_unpacklo_epi16(src0_8x16b, src1_8x16b);
    pred0_8x16b = _mm_unpacklo_epi16(src2_8x16b, src3_8x16b);
    out2_8x16b = _mm_unpacklo_epi16(src4_8x16b, src5_8x16b);
    pred3_8x16b = _mm_unpacklo_epi16(src6_8x16b, src7_8x16b);
    out7_8x16b = _mm_unpackhi_epi16(src0_8x16b, src1_8x16b);
    src2_8x16b = _mm_unpackhi_epi16(src2_8x16b, src3_8x16b);
    pred7_8x16b = _mm_unpackhi_epi16(src4_8x16b, src5_8x16b);
    src6_8x16b = _mm_unpackhi_epi16(src6_8x16b, src7_8x16b);

    out1_8x16b = _mm_unpacklo_epi32(out3_8x16b, pred0_8x16b);
    out3_8x16b = _mm_unpackhi_epi32(out3_8x16b, pred0_8x16b);
    pred1_8x16b = _mm_unpacklo_epi32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = _mm_unpackhi_epi32(out2_8x16b, pred3_8x16b);
    out5_8x16b = _mm_unpacklo_epi32(out7_8x16b, src2_8x16b);
    out7_8x16b = _mm_unpackhi_epi32(out7_8x16b, src2_8x16b);
    pred5_8x16b = _mm_unpacklo_epi32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = _mm_unpackhi_epi32(pred7_8x16b, src6_8x16b);

    src0_8x16b = _mm_unpacklo_epi64(out1_8x16b, pred1_8x16b);
    src1_8x16b = _mm_unpackhi_epi64(out1_8x16b, pred1_8x16b);
    src2_8x16b = _mm_unpacklo_epi64(out3_8x16b, pred3_8x16b);
    src3_8x16b = _mm_unpackhi_epi64(out3_8x16b, pred3_8x16b);
    src4_8x16b = _mm_unpacklo_epi64(out5_8x16b, pred5_8x16b);
    src5_8x16b = _mm_unpackhi_epi64(out5_8x16b, pred5_8x16b);
    src6_8x16b = _mm_unpacklo_epi64(out7_8x16b, pred7_8x16b);
    src7_8x16b = _mm_unpackhi_epi64(out7_8x16b, pred7_8x16b);
    /**********************   8x8 16 bit Transpose End   *********************/
    /**************** 8x8 horizontal transform *******************************/

    {
        __m128i out0a_8x16b, out1a_8x16b, out2a_8x16b, out3a_8x16b;
        __m128i out4a_8x16b, out5a_8x16b, out6a_8x16b, out7a_8x16b;
        __m128i tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
        __m128i tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;

        /************************* 8x8 Vertical Transform*************************/
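        /* The vertical pass is done in 32-bit precision to avoid overflow:
         * each row register is split into its low and high 4 coefficients,
         * sign-extended to 32 bits, and the same 8-point butterfly is applied
         * down the rows, first for the left 4 columns ("First 4 pixels") and
         * then for the right 4 ("Next 4 pixels"). */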
        tmp0_8x16b = _mm_srli_si128(src0_8x16b, 8);
        tmp1_8x16b = _mm_srli_si128(src1_8x16b, 8);
        tmp2_8x16b = _mm_srli_si128(src2_8x16b, 8);
        tmp3_8x16b = _mm_srli_si128(src3_8x16b, 8);
        tmp4_8x16b = _mm_srli_si128(src4_8x16b, 8);
        tmp5_8x16b = _mm_srli_si128(src5_8x16b, 8);
        tmp6_8x16b = _mm_srli_si128(src6_8x16b, 8);
        tmp7_8x16b = _mm_srli_si128(src7_8x16b, 8);

        /*************************First 4 pixels ********************************/
        src0_8x16b = _mm_cvtepi16_epi32(src0_8x16b);
        src1_8x16b = _mm_cvtepi16_epi32(src1_8x16b);
        src2_8x16b = _mm_cvtepi16_epi32(src2_8x16b);
        src3_8x16b = _mm_cvtepi16_epi32(src3_8x16b);
        src4_8x16b = _mm_cvtepi16_epi32(src4_8x16b);
        src5_8x16b = _mm_cvtepi16_epi32(src5_8x16b);
        src6_8x16b = _mm_cvtepi16_epi32(src6_8x16b);
        src7_8x16b = _mm_cvtepi16_epi32(src7_8x16b);

        /* r0 + r1 */
        pred0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = _mm_sub_epi32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = _mm_sub_epi32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = _mm_sub_epi32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = _mm_sub_epi32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
        /*************************First 4 pixels ********************************/

        /**************************Next 4 pixels *******************************/
        src0_8x16b = _mm_cvtepi16_epi32(tmp0_8x16b);
        src1_8x16b = _mm_cvtepi16_epi32(tmp1_8x16b);
        src2_8x16b = _mm_cvtepi16_epi32(tmp2_8x16b);
        src3_8x16b = _mm_cvtepi16_epi32(tmp3_8x16b);
        src4_8x16b = _mm_cvtepi16_epi32(tmp4_8x16b);
        src5_8x16b = _mm_cvtepi16_epi32(tmp5_8x16b);
        src6_8x16b = _mm_cvtepi16_epi32(tmp6_8x16b);
        src7_8x16b = _mm_cvtepi16_epi32(tmp7_8x16b);

        /* r0 + r1 */
        pred0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = _mm_sub_epi32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = _mm_sub_epi32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = _mm_sub_epi32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = _mm_sub_epi32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
        /**************************Next 4 pixels *******************************/
        /************************* 8x8 Vertical Transform*************************/

        /****************************SATD calculation ****************************/
        src0_8x16b = _mm_abs_epi32(out0_8x16b);
        src1_8x16b = _mm_abs_epi32(out1_8x16b);
        src2_8x16b = _mm_abs_epi32(out2_8x16b);
        src3_8x16b = _mm_abs_epi32(out3_8x16b);
        src4_8x16b = _mm_abs_epi32(out4_8x16b);
        src5_8x16b = _mm_abs_epi32(out5_8x16b);
        src6_8x16b = _mm_abs_epi32(out6_8x16b);
        src7_8x16b = _mm_abs_epi32(out7_8x16b);

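        /* Zero the DC coefficient (transform position (0,0), lane 0 of
         * src0_8x16b) so it is excluded from the sum; this is the
         * "dc_removed" part of the function name. An equivalent intrinsic
         * form would be _mm_insert_epi32(src0_8x16b, 0, 0). */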
        s32* p = (s32*)&src0_8x16b;
        p[0] = 0;

        src0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        src2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        src6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src2_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src6_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src4_8x16b);

        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);

        sad += _mm_cvtsi128_si32(src0_8x16b);

        src0_8x16b = _mm_abs_epi32(out0a_8x16b);
        src1_8x16b = _mm_abs_epi32(out1a_8x16b);
        src2_8x16b = _mm_abs_epi32(out2a_8x16b);
        src3_8x16b = _mm_abs_epi32(out3a_8x16b);
        src4_8x16b = _mm_abs_epi32(out4a_8x16b);
        src5_8x16b = _mm_abs_epi32(out5a_8x16b);
        src6_8x16b = _mm_abs_epi32(out6a_8x16b);
        src7_8x16b = _mm_abs_epi32(out7a_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        src2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        src6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src2_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src6_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src4_8x16b);

        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);

        sad += _mm_cvtsi128_si32(src0_8x16b);

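        /* rounded division by 4, normalizing the scale of the un-normalized
         * Hadamard butterflies above */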
        sad = (sad + 2) >> 2;

        return sad;
    }
}
#endif /* X86_SSE */