/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
******************************************************************************
* @file isvce_rc_utils_sse42.c
*
* @brief
*  This file contains the x86 SIMD version of the function that computes
*  the gradient per pixel (gpp) value used in Init Qp
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - isvce_get_gpp_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/

#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "isvc_structs.h"
#include "isvce_rc_utils_private_defs.h"

/**
*******************************************************************************
*
* @brief
*  computes the gradient per pixel (gpp) value for a given frame
*
* @par Description:
*  accumulates, for every pixel, the absolute difference with its immediate
*  right and immediate bottom neighbours, and normalises the sum to a
*  per-pixel value
*
* @param[in] ps_input_buf
*  pointer to yuv buffer properties
*
* @returns
*  calculated gpp value
*
* @remarks
*  none
*
*******************************************************************************
*/

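/* Scalar equivalent of the SIMD computation below, per plane: for every */
/* pixel other than those in the last row and last column,               */
/*                                                                       */
/*     gpp += abs(cur - right) + abs(cur - bottom);                      */
/*                                                                       */
/* and the accumulated total is then normalised by the plane area        */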
DOUBLE isvce_get_gpp_sse42(yuv_buf_props_t *ps_input_buf)
{
    UWORD8 *pu1_input_buf;
    UWORD16 mask_ffff, mask_00ff;
    UWORD32 i, j, k;
    UWORD32 u4_width, u4_height, i4_input_stride;
    DOUBLE d_gpp_y, d_gpp_u, d_gpp_v, d_gpp;

    __m128i u1_src_r0, u1_src_r1, u1_src_r2, u1_src_r3, u1_src_r4;
    __m128i u1_src_right_r0, u1_src_right_r1, u1_src_right_r2, u1_src_right_r3;
    __m128i u2_sad_cur_bot_r01, u2_sad_cur_bot_r12, u2_sad_cur_bot_r23, u2_sad_cur_bot_r34;
    __m128i u2_sad_cur_right_r0, u2_sad_cur_right_r1, u2_sad_cur_right_r2, u2_sad_cur_right_r3;
    __m128i u2_sad_hadd, u1_shuffle_chroma, u2_mask_and_pixY, u2_mask_and_pixUV;

    d_gpp_y = 0;
    d_gpp_u = 0;
    d_gpp_v = 0;
    d_gpp = 0;
    mask_ffff = 0xffff;
    mask_00ff = 0x00ff;
    pu1_input_buf = (UWORD8 *) ps_input_buf->as_component_bufs[0].pv_data;
    i4_input_stride = ps_input_buf->as_component_bufs[0].i4_data_stride;
    u4_width = ps_input_buf->u4_width;
    u4_height = ps_input_buf->u4_height;

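    /* Shuffle control that deinterleaves chroma: even source bytes (Cb) are */
    /* gathered into the low 8 bytes and odd source bytes (Cr) into the high */
    /* 8 bytes, so a single _mm_sad_epu8 yields the Cb SAD in its low 64-bit */
    /* lane and the Cr SAD in its high 64-bit lane                           */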
    u1_shuffle_chroma = _mm_setr_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x01, 0x03,
                                      0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f);
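    /* AND masks that zero out the bytes with no right neighbour: byte 15 */
    /* for luma, and bytes 7 and 15 (the last Cb and Cr samples after the */
    /* deinterleaving shuffle) for chroma                                 */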
    u2_mask_and_pixY = _mm_setr_epi16(mask_ffff, mask_ffff, mask_ffff, mask_ffff, mask_ffff,
                                      mask_ffff, mask_ffff, mask_00ff);
    u2_mask_and_pixUV = _mm_setr_epi16(mask_ffff, mask_ffff, mask_ffff, mask_00ff, mask_ffff,
                                       mask_ffff, mask_ffff, mask_00ff);

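    /* width must be a multiple of 16 so that every row splits into whole */
    /* 16-byte vectors                                                    */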
    ASSERT((u4_width % 16) == 0);

    /***********************************************************/
    /* For Luma -                                              */
    /* This code block calculates the gpp value for luma by    */
    /* adding the absolute difference between each pixel and   */
    /* its immediate right pixel to the absolute difference    */
    /* between that pixel and its immediate bottom pixel,      */
    /* accumulating over every pixel in the frame.             */
    /***********************************************************/
    for(i = 0; i < u4_height - 4; i += 4)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
            u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
            u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 1));
            u1_src_right_r1 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j + 1));
            u1_src_right_r2 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j + 1));
            u1_src_right_r3 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j + 1));

            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
            u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
            u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
            u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
            u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
            u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
            u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
            u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

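            /* _mm_sad_epu8 leaves 16-bit partial sums in lanes 0 and 4;  */
            /* three horizontal adds fold all lanes so the total lands in */
            /* lane 0                                                     */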
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last pixel of a row has no right neighbour, so the   */
        /* final 16 columns (j == u4_width - 16 here) are handled   */
        /* separately: the last byte is zeroed by ANDing with       */
        /* u2_mask_and_pixY, so only the remaining 15 pixels        */
        /* contribute to the gpp sum                                */
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
        u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
        u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
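        /* a byte-wise right shift pairs each pixel with its right neighbour */
        /* within the same register, so no load past the end of the row is   */
        /* needed                                                            */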
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 1);
        u1_src_right_r1 = _mm_srli_si128(u1_src_r1, 1);
        u1_src_right_r2 = _mm_srli_si128(u1_src_r2, 1);
        u1_src_right_r3 = _mm_srli_si128(u1_src_r3, 1);

        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixY);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixY);
        u1_src_r2 = _mm_and_si128(u1_src_r2, u2_mask_and_pixY);
        u1_src_r3 = _mm_and_si128(u1_src_r3, u2_mask_and_pixY);
        u1_src_r4 = _mm_and_si128(u1_src_r4, u2_mask_and_pixY);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
        u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
        u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
        u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
        u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
        u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
        u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
        u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);

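        /* step down four rows for the next iteration */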
        pu1_input_buf += (i4_input_stride << 2);
    }

    /* Loop over the remaining rows one at a time (the last row is used */
    /* only as a bottom neighbour)                                      */
    for(k = i; k < u4_height - 1; k++)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 1));

            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last pixel of a row has no right neighbour, so the   */
        /* final 16 columns are handled separately: the last byte   */
        /* is zeroed by ANDing with u2_mask_and_pixY, so only the   */
        /* remaining 15 pixels contribute to the gpp sum            */
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 1);

        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixY);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixY);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);

        pu1_input_buf += (i4_input_stride);
    }

    pu1_input_buf = (UWORD8 *) ps_input_buf->as_component_bufs[1].pv_data;
    i4_input_stride = ps_input_buf->as_component_bufs[1].i4_data_stride;

    /**************************************************************/
    /* For Chroma -                                               */
    /* Cb and Cr samples are stored interleaved, so this code     */
    /* block first deinterleaves them from the loaded registers   */
    /* (the right neighbour of a chroma sample therefore lies 2   */
    /* bytes away), then calculates the gpp value for Cb and Cr   */
    /* separately by adding the absolute difference between each  */
    /* sample and its immediate right sample to the absolute      */
    /* difference between that sample and its immediate bottom    */
    /* sample, accumulating over every sample in the frame.       */
    /**************************************************************/
    for(i = 0; i < (u4_height / 2) - 4; i += 4)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
            u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
            u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 2));
            u1_src_right_r1 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j + 2));
            u1_src_right_r2 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j + 2));
            u1_src_right_r3 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j + 2));

            /* separating u and v */
            u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
            u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
            u1_src_r2 = _mm_shuffle_epi8(u1_src_r2, u1_shuffle_chroma);
            u1_src_r3 = _mm_shuffle_epi8(u1_src_r3, u1_shuffle_chroma);
            u1_src_r4 = _mm_shuffle_epi8(u1_src_r4, u1_shuffle_chroma);
            u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);
            u1_src_right_r1 = _mm_shuffle_epi8(u1_src_right_r1, u1_shuffle_chroma);
            u1_src_right_r2 = _mm_shuffle_epi8(u1_src_right_r2, u1_shuffle_chroma);
            u1_src_right_r3 = _mm_shuffle_epi8(u1_src_right_r3, u1_shuffle_chroma);

            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
            u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
            u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
            u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
            u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
            u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
            u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
            u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

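            /* after the deinterleaving shuffle, the Cb SAD sits in lane 0 */
            /* and the Cr SAD in lane 4; two horizontal adds bring them to */
            /* lanes 0 and 1                                               */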
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
            d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last Cb and Cr samples of a row have no right        */
        /* neighbour, so the final 16 bytes are handled separately: */
        /* those samples are zeroed by ANDing with                  */
        /* u2_mask_and_pixUV, so only the remaining samples         */
        /* contribute to the gpp sums                               */
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
        u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
        u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
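        /* shifting by 2 bytes pairs each chroma sample with the next sample */
        /* of the same component in the interleaved buffer                   */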
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 2);
        u1_src_right_r1 = _mm_srli_si128(u1_src_r1, 2);
        u1_src_right_r2 = _mm_srli_si128(u1_src_r2, 2);
        u1_src_right_r3 = _mm_srli_si128(u1_src_r3, 2);

        /* separating u and v */
        u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
        u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
        u1_src_r2 = _mm_shuffle_epi8(u1_src_r2, u1_shuffle_chroma);
        u1_src_r3 = _mm_shuffle_epi8(u1_src_r3, u1_shuffle_chroma);
        u1_src_r4 = _mm_shuffle_epi8(u1_src_r4, u1_shuffle_chroma);
        u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);
        u1_src_right_r1 = _mm_shuffle_epi8(u1_src_right_r1, u1_shuffle_chroma);
        u1_src_right_r2 = _mm_shuffle_epi8(u1_src_right_r2, u1_shuffle_chroma);
        u1_src_right_r3 = _mm_shuffle_epi8(u1_src_right_r3, u1_shuffle_chroma);

        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixUV);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixUV);
        u1_src_r2 = _mm_and_si128(u1_src_r2, u2_mask_and_pixUV);
        u1_src_r3 = _mm_and_si128(u1_src_r3, u2_mask_and_pixUV);
        u1_src_r4 = _mm_and_si128(u1_src_r4, u2_mask_and_pixUV);
        u1_src_right_r0 = _mm_and_si128(u1_src_right_r0, u2_mask_and_pixUV);
        u1_src_right_r1 = _mm_and_si128(u1_src_right_r1, u2_mask_and_pixUV);
        u1_src_right_r2 = _mm_and_si128(u1_src_right_r2, u2_mask_and_pixUV);
        u1_src_right_r3 = _mm_and_si128(u1_src_right_r3, u2_mask_and_pixUV);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
        u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
        u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
        u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
        u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
        u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
        u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
        u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
        d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);

        pu1_input_buf += (i4_input_stride * 4);
    }

    /* Loop over the remaining rows one at a time (the last row is used */
    /* only as a bottom neighbour)                                      */
    for(k = i; k < (u4_height / 2) - 1; k++)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 2));

            /* separating u and v */
            u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
            u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
            u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);

            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
            d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last Cb and Cr samples of a row have no right        */
        /* neighbour, so the final 16 bytes are handled separately: */
        /* those samples are zeroed by ANDing with                  */
        /* u2_mask_and_pixUV, so only the remaining samples         */
        /* contribute to the gpp sums                               */
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 2);

        /* separating u and v */
        u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
        u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
        u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);

        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixUV);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixUV);
        u1_src_right_r0 = _mm_and_si128(u1_src_right_r0, u2_mask_and_pixUV);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
        d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);

        pu1_input_buf += i4_input_stride;
    }

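    /* convert the accumulated gradient sums into per-pixel values; each */
    /* chroma plane is half the luma width and half the luma height      */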
    d_gpp_y /= (u4_width * u4_height);
    d_gpp_u /= ((u4_width / 2) * (u4_height / 2));
    d_gpp_v /= ((u4_width / 2) * (u4_height / 2));

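    /* weighted average of the per-plane gpp values: luma is scaled by */
    /* WT_LUMA_GPP and the sum is normalised by WT_TOTAL_GPP           */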
    d_gpp = (DOUBLE) ((WT_LUMA_GPP * d_gpp_y) + d_gpp_u + d_gpp_v) / WT_TOTAL_GPP;

    return d_gpp;
}