/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "oapv_sad_sse.h"

#if X86_SSE

/* SSD ***********************************************************************/
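/* SSE_SSD_16B_8PEL accumulates, into the four 32-bit lanes of s00a, the
 * squared differences of eight 16-bit samples loaded from src1 and src2:
 * the low and high halves of the 16-bit difference vector are widened to
 * 32 bits, squared, right-shifted by 'shift', and added to the running
 * sums. (Descriptive comment added for readability.) */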
#define SSE_SSD_16B_8PEL(src1, src2, shift, s00, s01, s02, s00a) \
    s00 = _mm_loadu_si128((__m128i*)(src1)); \
    s01 = _mm_loadu_si128((__m128i*)(src2)); \
    s02 = _mm_sub_epi16(s00, s01); \
    \
    s00 = _mm_cvtepi16_epi32(s02); \
    s00 = _mm_mullo_epi32(s00, s00); \
    \
    s01 = _mm_srli_si128(s02, 8); \
    s01 = _mm_cvtepi16_epi32(s01); \
    s01 = _mm_mullo_epi32(s01, s01); \
    \
    s00 = _mm_srli_epi32(s00, shift); \
    s01 = _mm_srli_epi32(s01, shift); \
    s00a = _mm_add_epi32(s00a, s00); \
    s00a = _mm_add_epi32(s00a, s01);

static s64 ssd_16b_sse_8x8(int w, int h, void * src1, void * src2, int s_src1, int s_src2, int bit_depth)
{
    /* w, h and bit_depth are unused; this kernel handles a fixed 8x8 block */
    s64 ssd;
    s16 * s1;
    s16 * s2;
    const int shift = 0;
    __m128i s00, s01, s02, s00a;

    s1 = (s16 *)src1;
    s2 = (s16 *)src2;

    s00a = _mm_setzero_si128();

    /* accumulate the squared differences of one 8-sample row per macro call */
    SSE_SSD_16B_8PEL(s1, s2, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1, s2 + s_src2, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*2, s2 + s_src2*2, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*3, s2 + s_src2*3, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*4, s2 + s_src2*4, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*5, s2 + s_src2*5, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*6, s2 + s_src2*6, shift, s00, s01, s02, s00a);
    SSE_SSD_16B_8PEL(s1 + s_src1*7, s2 + s_src2*7, shift, s00, s01, s02, s00a);

    /* horizontal sum of the four 32-bit partial sums */
    ssd = _mm_extract_epi32(s00a, 0);
    ssd += _mm_extract_epi32(s00a, 1);
    ssd += _mm_extract_epi32(s00a, 2);
    ssd += _mm_extract_epi32(s00a, 3);

    return ssd;
}
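
/* A minimal scalar sketch of what ssd_16b_sse_8x8() computes: the sum of
 * squared 16-bit sample differences over a fixed 8x8 block (shift is 0, and
 * w, h, bit_depth are unused, as in the SIMD routine above). Kept under
 * "#if 0" for illustration only; the function name is hypothetical and it is
 * not part of the build. */
#if 0
static s64 ssd_16b_ref_8x8(int w, int h, void * src1, void * src2, int s_src1, int s_src2, int bit_depth)
{
    s16 * s1 = (s16 *)src1;
    s16 * s2 = (s16 *)src2;
    s64 ssd = 0;
    int i, j;

    for(i = 0; i < 8; i++) {
        for(j = 0; j < 8; j++) {
            s64 diff = (s64)s1[j] - (s64)s2[j];
            ssd += diff * diff; /* same quantity the SIMD lanes accumulate */
        }
        s1 += s_src1;
        s2 += s_src2;
    }
    return ssd;
}
#endif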

const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_sse[2] =
{
    ssd_16b_sse_8x8,
    NULL
};

int oapv_dc_removed_had8x8_sse(pel* org, int s_org)
{
    int sad = 0;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register */
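    /* Overall flow (descriptive overview added for readability): load the 8
     * rows of the 8x8 block, apply the 8-point Hadamard butterfly
     * horizontally (via a 16-bit transpose before and after), then vertically
     * in two 32-bit halves, sum the absolute transformed coefficients with
     * the DC coefficient removed, and normalize the result. */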
    __m128i src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    __m128i src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
    __m128i pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
    __m128i pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
    __m128i out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
    __m128i out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;

    src0_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src1_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src2_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src3_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src4_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src5_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src6_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;
    src7_8x16b = _mm_loadu_si128((__m128i*) org);
    org = org + s_org;

    /**************** 8x8 horizontal transform *******************************/
    /*********************** 8x8 16 bit Transpose ************************/
    out3_8x16b = _mm_unpacklo_epi16(src0_8x16b, src1_8x16b);
    pred0_8x16b = _mm_unpacklo_epi16(src2_8x16b, src3_8x16b);
    out2_8x16b = _mm_unpacklo_epi16(src4_8x16b, src5_8x16b);
    pred3_8x16b = _mm_unpacklo_epi16(src6_8x16b, src7_8x16b);
    out7_8x16b = _mm_unpackhi_epi16(src0_8x16b, src1_8x16b);
    src2_8x16b = _mm_unpackhi_epi16(src2_8x16b, src3_8x16b);
    pred7_8x16b = _mm_unpackhi_epi16(src4_8x16b, src5_8x16b);
    src6_8x16b = _mm_unpackhi_epi16(src6_8x16b, src7_8x16b);

    out1_8x16b = _mm_unpacklo_epi32(out3_8x16b, pred0_8x16b);
    out3_8x16b = _mm_unpackhi_epi32(out3_8x16b, pred0_8x16b);
    pred1_8x16b = _mm_unpacklo_epi32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = _mm_unpackhi_epi32(out2_8x16b, pred3_8x16b);
    out5_8x16b = _mm_unpacklo_epi32(out7_8x16b, src2_8x16b);
    out7_8x16b = _mm_unpackhi_epi32(out7_8x16b, src2_8x16b);
    pred5_8x16b = _mm_unpacklo_epi32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = _mm_unpackhi_epi32(pred7_8x16b, src6_8x16b);

    out0_8x16b = _mm_unpacklo_epi64(out1_8x16b, pred1_8x16b);
    out1_8x16b = _mm_unpackhi_epi64(out1_8x16b, pred1_8x16b);
    out2_8x16b = _mm_unpacklo_epi64(out3_8x16b, pred3_8x16b);
    out3_8x16b = _mm_unpackhi_epi64(out3_8x16b, pred3_8x16b);
    out4_8x16b = _mm_unpacklo_epi64(out5_8x16b, pred5_8x16b);
    out5_8x16b = _mm_unpackhi_epi64(out5_8x16b, pred5_8x16b);
    out6_8x16b = _mm_unpacklo_epi64(out7_8x16b, pred7_8x16b);
    out7_8x16b = _mm_unpackhi_epi64(out7_8x16b, pred7_8x16b);
    /********************** 8x8 16 bit Transpose End *********************/

    /* r0 + r1 */
    pred0_8x16b = _mm_add_epi16(out0_8x16b, out1_8x16b);
    /* r2 + r3 */
    pred2_8x16b = _mm_add_epi16(out2_8x16b, out3_8x16b);
    /* r4 + r5 */
    pred4_8x16b = _mm_add_epi16(out4_8x16b, out5_8x16b);
    /* r6 + r7 */
    pred6_8x16b = _mm_add_epi16(out6_8x16b, out7_8x16b);

    /* r0 + r1 + r2 + r3 */
    pred1_8x16b = _mm_add_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 + r6 + r7 */
    pred5_8x16b = _mm_add_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
    src0_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
    src4_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /* r0 + r1 - r2 - r3 */
    pred1_8x16b = _mm_sub_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 - r6 - r7 */
    pred5_8x16b = _mm_sub_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
    src2_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
    src6_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 */
    pred0_8x16b = _mm_sub_epi16(out0_8x16b, out1_8x16b);
    /* r2 - r3 */
    pred2_8x16b = _mm_sub_epi16(out2_8x16b, out3_8x16b);
    /* r4 - r5 */
    pred4_8x16b = _mm_sub_epi16(out4_8x16b, out5_8x16b);
    /* r6 - r7 */
    pred6_8x16b = _mm_sub_epi16(out6_8x16b, out7_8x16b);

    /* r0 - r1 + r2 - r3 */
    pred1_8x16b = _mm_add_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 + r6 - r7 */
    pred5_8x16b = _mm_add_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
    src1_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
    src5_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 - r2 + r3 */
    pred1_8x16b = _mm_sub_epi16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 - r6 + r7 */
    pred5_8x16b = _mm_sub_epi16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
    src3_8x16b = _mm_add_epi16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
    src7_8x16b = _mm_sub_epi16(pred1_8x16b, pred5_8x16b);

    /*********************** 8x8 16 bit Transpose ************************/
    out3_8x16b = _mm_unpacklo_epi16(src0_8x16b, src1_8x16b);
    pred0_8x16b = _mm_unpacklo_epi16(src2_8x16b, src3_8x16b);
    out2_8x16b = _mm_unpacklo_epi16(src4_8x16b, src5_8x16b);
    pred3_8x16b = _mm_unpacklo_epi16(src6_8x16b, src7_8x16b);
    out7_8x16b = _mm_unpackhi_epi16(src0_8x16b, src1_8x16b);
    src2_8x16b = _mm_unpackhi_epi16(src2_8x16b, src3_8x16b);
    pred7_8x16b = _mm_unpackhi_epi16(src4_8x16b, src5_8x16b);
    src6_8x16b = _mm_unpackhi_epi16(src6_8x16b, src7_8x16b);

    out1_8x16b = _mm_unpacklo_epi32(out3_8x16b, pred0_8x16b);
    out3_8x16b = _mm_unpackhi_epi32(out3_8x16b, pred0_8x16b);
    pred1_8x16b = _mm_unpacklo_epi32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = _mm_unpackhi_epi32(out2_8x16b, pred3_8x16b);
    out5_8x16b = _mm_unpacklo_epi32(out7_8x16b, src2_8x16b);
    out7_8x16b = _mm_unpackhi_epi32(out7_8x16b, src2_8x16b);
    pred5_8x16b = _mm_unpacklo_epi32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = _mm_unpackhi_epi32(pred7_8x16b, src6_8x16b);

    src0_8x16b = _mm_unpacklo_epi64(out1_8x16b, pred1_8x16b);
    src1_8x16b = _mm_unpackhi_epi64(out1_8x16b, pred1_8x16b);
    src2_8x16b = _mm_unpacklo_epi64(out3_8x16b, pred3_8x16b);
    src3_8x16b = _mm_unpackhi_epi64(out3_8x16b, pred3_8x16b);
    src4_8x16b = _mm_unpacklo_epi64(out5_8x16b, pred5_8x16b);
    src5_8x16b = _mm_unpackhi_epi64(out5_8x16b, pred5_8x16b);
    src6_8x16b = _mm_unpacklo_epi64(out7_8x16b, pred7_8x16b);
    src7_8x16b = _mm_unpackhi_epi64(out7_8x16b, pred7_8x16b);
    /********************** 8x8 16 bit Transpose End *********************/
    /**************** 8x8 horizontal transform *******************************/

    {
        __m128i out0a_8x16b, out1a_8x16b, out2a_8x16b, out3a_8x16b;
        __m128i out4a_8x16b, out5a_8x16b, out6a_8x16b, out7a_8x16b;
        __m128i tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
        __m128i tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;

        /************************* 8x8 Vertical Transform*************************/
        tmp0_8x16b = _mm_srli_si128(src0_8x16b, 8);
        tmp1_8x16b = _mm_srli_si128(src1_8x16b, 8);
        tmp2_8x16b = _mm_srli_si128(src2_8x16b, 8);
        tmp3_8x16b = _mm_srli_si128(src3_8x16b, 8);
        tmp4_8x16b = _mm_srli_si128(src4_8x16b, 8);
        tmp5_8x16b = _mm_srli_si128(src5_8x16b, 8);
        tmp6_8x16b = _mm_srli_si128(src6_8x16b, 8);
        tmp7_8x16b = _mm_srli_si128(src7_8x16b, 8);

        /*************************First 4 pixels ********************************/
        src0_8x16b = _mm_cvtepi16_epi32(src0_8x16b);
        src1_8x16b = _mm_cvtepi16_epi32(src1_8x16b);
        src2_8x16b = _mm_cvtepi16_epi32(src2_8x16b);
        src3_8x16b = _mm_cvtepi16_epi32(src3_8x16b);
        src4_8x16b = _mm_cvtepi16_epi32(src4_8x16b);
        src5_8x16b = _mm_cvtepi16_epi32(src5_8x16b);
        src6_8x16b = _mm_cvtepi16_epi32(src6_8x16b);
        src7_8x16b = _mm_cvtepi16_epi32(src7_8x16b);

        /* r0 + r1 */
        pred0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = _mm_sub_epi32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = _mm_sub_epi32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = _mm_sub_epi32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = _mm_sub_epi32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
        /*************************First 4 pixels ********************************/

        /**************************Next 4 pixels *******************************/
        src0_8x16b = _mm_cvtepi16_epi32(tmp0_8x16b);
        src1_8x16b = _mm_cvtepi16_epi32(tmp1_8x16b);
        src2_8x16b = _mm_cvtepi16_epi32(tmp2_8x16b);
        src3_8x16b = _mm_cvtepi16_epi32(tmp3_8x16b);
        src4_8x16b = _mm_cvtepi16_epi32(tmp4_8x16b);
        src5_8x16b = _mm_cvtepi16_epi32(tmp5_8x16b);
        src6_8x16b = _mm_cvtepi16_epi32(tmp6_8x16b);
        src7_8x16b = _mm_cvtepi16_epi32(tmp7_8x16b);

        /* r0 + r1 */
        pred0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = _mm_sub_epi32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = _mm_sub_epi32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = _mm_sub_epi32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = _mm_sub_epi32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = _mm_add_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = _mm_add_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = _mm_sub_epi32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = _mm_sub_epi32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3a_8x16b = _mm_add_epi32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7a_8x16b = _mm_sub_epi32(pred1_8x16b, pred5_8x16b);
        /**************************Next 4 pixels *******************************/
        /************************* 8x8 Vertical Transform*************************/

        /****************************SATD calculation ****************************/
        src0_8x16b = _mm_abs_epi32(out0_8x16b);
        src1_8x16b = _mm_abs_epi32(out1_8x16b);
        src2_8x16b = _mm_abs_epi32(out2_8x16b);
        src3_8x16b = _mm_abs_epi32(out3_8x16b);
        src4_8x16b = _mm_abs_epi32(out4_8x16b);
        src5_8x16b = _mm_abs_epi32(out5_8x16b);
        src6_8x16b = _mm_abs_epi32(out6_8x16b);
        src7_8x16b = _mm_abs_epi32(out7_8x16b);

        /* remove the DC term: lane 0 of the first transformed row holds the
           all-plus (DC) coefficient, so it is excluded from the sum */
        s32* p = (s32*)&src0_8x16b;
        p[0] = 0;

        src0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        src2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        src6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src2_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src6_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src4_8x16b);

        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);

        sad += _mm_cvtsi128_si32(src0_8x16b);

        src0_8x16b = _mm_abs_epi32(out0a_8x16b);
        src1_8x16b = _mm_abs_epi32(out1a_8x16b);
        src2_8x16b = _mm_abs_epi32(out2a_8x16b);
        src3_8x16b = _mm_abs_epi32(out3a_8x16b);
        src4_8x16b = _mm_abs_epi32(out4a_8x16b);
        src5_8x16b = _mm_abs_epi32(out5a_8x16b);
        src6_8x16b = _mm_abs_epi32(out6a_8x16b);
        src7_8x16b = _mm_abs_epi32(out7a_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src1_8x16b);
        src2_8x16b = _mm_add_epi32(src2_8x16b, src3_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src5_8x16b);
        src6_8x16b = _mm_add_epi32(src6_8x16b, src7_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src2_8x16b);
        src4_8x16b = _mm_add_epi32(src4_8x16b, src6_8x16b);

        src0_8x16b = _mm_add_epi32(src0_8x16b, src4_8x16b);

        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);
        src0_8x16b = _mm_hadd_epi32(src0_8x16b, src0_8x16b);

        sad += _mm_cvtsi128_si32(src0_8x16b);

        /* normalize the Hadamard sum (divide by 4 with rounding) */
        sad = (sad + 2) >> 2;

        return sad;
    }
}
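
/* A minimal scalar sketch of what oapv_dc_removed_had8x8_sse() computes: a
 * 2-D 8-point Hadamard transform of the 8x8 block, the sum of the absolute
 * transformed coefficients with the DC coefficient excluded, and the final
 * (sad + 2) >> 2 normalization. Kept under "#if 0" for illustration only;
 * the function name is hypothetical and it is not part of the build. */
#if 0
static int dc_removed_had8x8_ref(pel* org, int s_org)
{
    int i, j, sad = 0;
    int m[8][8], t[8][8];

    /* load the 8x8 block */
    for(i = 0; i < 8; i++) {
        for(j = 0; j < 8; j++) {
            m[i][j] = org[i * s_org + j];
        }
    }

    /* horizontal 8-point Hadamard butterfly on each row */
    for(i = 0; i < 8; i++) {
        int a0 = m[i][0] + m[i][1], a1 = m[i][0] - m[i][1];
        int a2 = m[i][2] + m[i][3], a3 = m[i][2] - m[i][3];
        int a4 = m[i][4] + m[i][5], a5 = m[i][4] - m[i][5];
        int a6 = m[i][6] + m[i][7], a7 = m[i][6] - m[i][7];
        int b0 = a0 + a2, b1 = a1 + a3, b2 = a0 - a2, b3 = a1 - a3;
        int b4 = a4 + a6, b5 = a5 + a7, b6 = a4 - a6, b7 = a5 - a7;
        t[i][0] = b0 + b4; t[i][1] = b1 + b5; t[i][2] = b2 + b6; t[i][3] = b3 + b7;
        t[i][4] = b0 - b4; t[i][5] = b1 - b5; t[i][6] = b2 - b6; t[i][7] = b3 - b7;
    }

    /* vertical 8-point Hadamard butterfly on each column */
    for(j = 0; j < 8; j++) {
        int a0 = t[0][j] + t[1][j], a1 = t[0][j] - t[1][j];
        int a2 = t[2][j] + t[3][j], a3 = t[2][j] - t[3][j];
        int a4 = t[4][j] + t[5][j], a5 = t[4][j] - t[5][j];
        int a6 = t[6][j] + t[7][j], a7 = t[6][j] - t[7][j];
        int b0 = a0 + a2, b1 = a1 + a3, b2 = a0 - a2, b3 = a1 - a3;
        int b4 = a4 + a6, b5 = a5 + a7, b6 = a4 - a6, b7 = a5 - a7;
        m[0][j] = b0 + b4; m[1][j] = b1 + b5; m[2][j] = b2 + b6; m[3][j] = b3 + b7;
        m[4][j] = b0 - b4; m[5][j] = b1 - b5; m[6][j] = b2 - b6; m[7][j] = b3 - b7;
    }

    /* sum of absolute coefficients, skipping the DC coefficient m[0][0] */
    for(i = 0; i < 8; i++) {
        for(j = 0; j < 8; j++) {
            if(i != 0 || j != 0) {
                sad += m[i][j] < 0 ? -m[i][j] : m[i][j];
            }
        }
    }

    return (sad + 2) >> 2;
}
#endif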
#endif /* X86_SSE */