/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
******************************************************************************
* @file isvce_rc_utils_sse42.c
*
* @brief
*  This file contains the x86 SSE4.2 implementation of the function that
*  computes the gradient per pixel (GPP) value used in Init Qp
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - isvce_get_gpp_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/

#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "isvc_structs.h"
#include "isvce_rc_utils_private_defs.h"

/**
*******************************************************************************
*
* @brief
*   get gpp function
*
* @par Description:
*   computes gradient per pixel value for a given frame
*
* @param[in] ps_input_buf
*  pointer to yuv buffer properties
*
* @returns
*  calculated gpp value
*
* @remarks
*  none
*
*******************************************************************************
*/

DOUBLE isvce_get_gpp_sse42(yuv_buf_props_t *ps_input_buf)
{
    UWORD8 *pu1_input_buf;
    UWORD16 mask_ffff, mask_00ff;
    UWORD32 i, j, k;
    UWORD32 u4_width, u4_height, i4_input_stride;
    DOUBLE d_gpp_y, d_gpp_u, d_gpp_v, d_gpp;

    __m128i u1_src_r0, u1_src_r1, u1_src_r2, u1_src_r3, u1_src_r4;
    __m128i u1_src_right_r0, u1_src_right_r1, u1_src_right_r2, u1_src_right_r3;
    __m128i u2_sad_cur_bot_r01, u2_sad_cur_bot_r12, u2_sad_cur_bot_r23, u2_sad_cur_bot_r34;
    __m128i u2_sad_cur_right_r0, u2_sad_cur_right_r1, u2_sad_cur_right_r2, u2_sad_cur_right_r3;
    __m128i u2_sad_hadd, u1_shuffle_chroma, u2_mask_and_pixY, u2_mask_and_pixUV;

    d_gpp_y = 0;
    d_gpp_u = 0;
    d_gpp_v = 0;
    d_gpp = 0;
    mask_ffff = 0xffff;
    mask_00ff = 0x00ff;
    pu1_input_buf = (UWORD8 *) ps_input_buf->as_component_bufs[0].pv_data;
    i4_input_stride = ps_input_buf->as_component_bufs[0].i4_data_stride;
    u4_width = ps_input_buf->u4_width;
    u4_height = ps_input_buf->u4_height;

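    /* u1_shuffle_chroma gathers the even (Cb) bytes into the low half of a   */
    /* register and the odd (Cr) bytes into the high half. The AND masks zero */
    /* the bytes without a valid right neighbour in the last block of a row:  */
    /* byte 15 for luma, bytes 7 and 15 (last Cb and Cr after the shuffle)    */
    /* for chroma                                                             */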
    u1_shuffle_chroma = _mm_setr_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x01, 0x03,
                                      0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f);
    u2_mask_and_pixY = _mm_setr_epi16(mask_ffff, mask_ffff, mask_ffff, mask_ffff, mask_ffff,
                                      mask_ffff, mask_ffff, mask_00ff);
    u2_mask_and_pixUV = _mm_setr_epi16(mask_ffff, mask_ffff, mask_ffff, mask_00ff, mask_ffff,
                                       mask_ffff, mask_ffff, mask_00ff);

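    /* The SIMD kernels below consume 16 pixels per iteration, so the frame */
    /* width is assumed to be a multiple of 16                              */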
    ASSERT((u4_width % 16) == 0);

    /***********************************************************/
    /* For Luma -                                              */
    /* This code block computes the gpp value for luma by      */
    /* adding the absolute difference between the current      */
    /* pixel and its immediate right pixel to the absolute     */
    /* difference between the current pixel and its            */
    /* immediate bottom pixel, accumulating over every         */
    /* pixel in the frame.                                     */
    /***********************************************************/
    for(i = 0; i < u4_height - 4; i += 4)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
            u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
            u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 1));
            u1_src_right_r1 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j + 1));
            u1_src_right_r2 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j + 1));
            u1_src_right_r3 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j + 1));

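            /* _mm_sad_epu8 yields two 16-bit partial sums of absolute        */
            /* differences, one per 8-byte half: vertical gradients (current  */
            /* row vs. the row below) and horizontal gradients (current pixel */
            /* vs. its right neighbour)                                       */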
            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
            u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
            u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
            u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
            u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
            u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
            u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
            u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

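            /* Fold the partial sums into lane 0 with horizontal adds and */
            /* accumulate the result into the luma gpp total              */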
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The inner loop stops one 16-pixel block early because    */
        /* the last pixel of a row has no right neighbour. The      */
        /* remaining 15 pixels are processed here, with the last    */
        /* byte masked off by ANDing with u2_mask_and_pixY          */
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
        u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
        u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 1);
        u1_src_right_r1 = _mm_srli_si128(u1_src_r1, 1);
        u1_src_right_r2 = _mm_srli_si128(u1_src_r2, 1);
        u1_src_right_r3 = _mm_srli_si128(u1_src_r3, 1);

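        /* Zero the last byte of each source row: that pixel is the last one */
        /* in the row and has no right neighbour (the byte shift above has   */
        /* already placed a zero in the matching byte of the right-neighbour */
        /* registers)                                                         */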
        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixY);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixY);
        u1_src_r2 = _mm_and_si128(u1_src_r2, u2_mask_and_pixY);
        u1_src_r3 = _mm_and_si128(u1_src_r3, u2_mask_and_pixY);
        u1_src_r4 = _mm_and_si128(u1_src_r4, u2_mask_and_pixY);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
        u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
        u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
        u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
        u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
        u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
        u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
        u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);

        pu1_input_buf += (i4_input_stride << 2);
    }

    /* Loop for the remaining height; the last row is skipped as it has no bottom neighbour */
    for(k = i; k < u4_height - 1; k++)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 1));

            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The inner loop stops one 16-pixel block early because    */
        /* the last pixel of a row has no right neighbour. The      */
        /* remaining 15 pixels are processed here, with the last    */
        /* byte masked off by ANDing with u2_mask_and_pixY          */
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 1);

        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixY);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixY);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_y += _mm_extract_epi16(u2_sad_hadd, 0);

        pu1_input_buf += (i4_input_stride);
    }

    pu1_input_buf = (UWORD8 *) ps_input_buf->as_component_bufs[1].pv_data;
    i4_input_stride = ps_input_buf->as_component_bufs[1].i4_data_stride;

    /**************************************************************/
    /* For Chroma -                                               */
    /* This code block first deinterleaves the Cb and Cr values   */
    /* from the loaded registers, then calculates the gpp value   */
    /* for Cb and Cr separately by adding the absolute difference */
    /* between the current pixel and its immediate right pixel    */
    /* to the absolute difference between the current pixel and   */
    /* its immediate bottom pixel, accumulating over every pixel  */
    /* in the frame.                                              */
    /**************************************************************/
    for(i = 0; i < (u4_height / 2) - 4; i += 4)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
            u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
            u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
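            /* The right-neighbour loads below are offset by 2 bytes since Cb */
            /* and Cr samples are interleaved: the adjacent sample of the     */
            /* same chroma component is 2 bytes away                          */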
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 2));
            u1_src_right_r1 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j + 2));
            u1_src_right_r2 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j + 2));
            u1_src_right_r3 =
                _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j + 2));

            /* separating u and v */
            u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
            u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
            u1_src_r2 = _mm_shuffle_epi8(u1_src_r2, u1_shuffle_chroma);
            u1_src_r3 = _mm_shuffle_epi8(u1_src_r3, u1_shuffle_chroma);
            u1_src_r4 = _mm_shuffle_epi8(u1_src_r4, u1_shuffle_chroma);
            u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);
            u1_src_right_r1 = _mm_shuffle_epi8(u1_src_right_r1, u1_shuffle_chroma);
            u1_src_right_r2 = _mm_shuffle_epi8(u1_src_right_r2, u1_shuffle_chroma);
            u1_src_right_r3 = _mm_shuffle_epi8(u1_src_right_r3, u1_shuffle_chroma);

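            /* With Cb in the low 8 bytes and Cr in the high 8 bytes, each    */
            /* SAD below yields a Cb partial sum in its low 64-bit half and a */
            /* Cr partial sum in its high half                                */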
            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
            u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
            u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
            u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
            u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
            u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
            u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
            u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
            u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

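            /* Two horizontal adds leave the Cb sum in 16-bit lane 0 and the */
            /* Cr sum in lane 1, which are extracted and accumulated         */
            /* separately                                                    */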
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
            d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The inner loop stops one 16-byte block early because     */
        /* the last chroma pair of a row has no right neighbour.    */
        /* The remaining pixels are processed here, with the last   */
        /* Cb and Cr bytes masked off via the u2_mask_and_pixUV mask*/
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_r2 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 2) + j));
        u1_src_r3 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 3) + j));
        u1_src_r4 = _mm_loadu_si128((__m128i *) (pu1_input_buf + (i4_input_stride * 4) + j));
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 2);
        u1_src_right_r1 = _mm_srli_si128(u1_src_r1, 2);
        u1_src_right_r2 = _mm_srli_si128(u1_src_r2, 2);
        u1_src_right_r3 = _mm_srli_si128(u1_src_r3, 2);

        /* separating u and v */
        u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
        u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
        u1_src_r2 = _mm_shuffle_epi8(u1_src_r2, u1_shuffle_chroma);
        u1_src_r3 = _mm_shuffle_epi8(u1_src_r3, u1_shuffle_chroma);
        u1_src_r4 = _mm_shuffle_epi8(u1_src_r4, u1_shuffle_chroma);
        u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);
        u1_src_right_r1 = _mm_shuffle_epi8(u1_src_right_r1, u1_shuffle_chroma);
        u1_src_right_r2 = _mm_shuffle_epi8(u1_src_right_r2, u1_shuffle_chroma);
        u1_src_right_r3 = _mm_shuffle_epi8(u1_src_right_r3, u1_shuffle_chroma);

        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixUV);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixUV);
        u1_src_r2 = _mm_and_si128(u1_src_r2, u2_mask_and_pixUV);
        u1_src_r3 = _mm_and_si128(u1_src_r3, u2_mask_and_pixUV);
        u1_src_r4 = _mm_and_si128(u1_src_r4, u2_mask_and_pixUV);
        u1_src_right_r0 = _mm_and_si128(u1_src_right_r0, u2_mask_and_pixUV);
        u1_src_right_r1 = _mm_and_si128(u1_src_right_r1, u2_mask_and_pixUV);
        u1_src_right_r2 = _mm_and_si128(u1_src_right_r2, u2_mask_and_pixUV);
        u1_src_right_r3 = _mm_and_si128(u1_src_right_r3, u2_mask_and_pixUV);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_bot_r12 = _mm_sad_epu8(u1_src_r1, u1_src_r2);
        u2_sad_cur_bot_r23 = _mm_sad_epu8(u1_src_r2, u1_src_r3);
        u2_sad_cur_bot_r34 = _mm_sad_epu8(u1_src_r3, u1_src_r4);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);
        u2_sad_cur_right_r1 = _mm_sad_epu8(u1_src_r1, u1_src_right_r1);
        u2_sad_cur_right_r2 = _mm_sad_epu8(u1_src_r2, u1_src_right_r2);
        u2_sad_cur_right_r3 = _mm_sad_epu8(u1_src_r3, u1_src_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r12);
        u2_sad_cur_bot_r23 = _mm_adds_epu16(u2_sad_cur_bot_r23, u2_sad_cur_bot_r34);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r1);
        u2_sad_cur_right_r2 = _mm_adds_epu16(u2_sad_cur_right_r2, u2_sad_cur_right_r3);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r23);
        u2_sad_cur_right_r0 = _mm_adds_epu16(u2_sad_cur_right_r0, u2_sad_cur_right_r2);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
        d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);

        pu1_input_buf += (i4_input_stride * 4);
    }

    /* Loop for the remaining height; the last chroma row is skipped as it has no bottom neighbour */
    for(k = i; k < (u4_height / 2) - 1; k++)
    {
        for(j = 0; j < u4_width - 16; j += 16)
        {
            u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
            u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
            u1_src_right_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j + 2));

            /* separating u and v */
            u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
            u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
            u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);

            u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
            u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

            u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

            u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
            u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

            d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
            d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The inner loop stops one 16-byte block early because     */
        /* the last chroma pair of a row has no right neighbour.    */
        /* The remaining pixels are processed here, with the last   */
        /* Cb and Cr bytes masked off via the u2_mask_and_pixUV mask*/
        /************************************************************/
        u1_src_r0 = _mm_loadu_si128((__m128i *) (pu1_input_buf + j));
        u1_src_r1 = _mm_loadu_si128((__m128i *) (pu1_input_buf + i4_input_stride + j));
        u1_src_right_r0 = _mm_srli_si128(u1_src_r0, 2);

        /* separating u and v */
        u1_src_r0 = _mm_shuffle_epi8(u1_src_r0, u1_shuffle_chroma);
        u1_src_r1 = _mm_shuffle_epi8(u1_src_r1, u1_shuffle_chroma);
        u1_src_right_r0 = _mm_shuffle_epi8(u1_src_right_r0, u1_shuffle_chroma);

        u1_src_r0 = _mm_and_si128(u1_src_r0, u2_mask_and_pixUV);
        u1_src_r1 = _mm_and_si128(u1_src_r1, u2_mask_and_pixUV);
        u1_src_right_r0 = _mm_and_si128(u1_src_right_r0, u2_mask_and_pixUV);

        u2_sad_cur_bot_r01 = _mm_sad_epu8(u1_src_r0, u1_src_r1);
        u2_sad_cur_right_r0 = _mm_sad_epu8(u1_src_r0, u1_src_right_r0);

        u2_sad_cur_bot_r01 = _mm_adds_epu16(u2_sad_cur_bot_r01, u2_sad_cur_right_r0);

        u2_sad_hadd = _mm_hadd_epi16(u2_sad_cur_bot_r01, u2_sad_cur_bot_r01);
        u2_sad_hadd = _mm_hadd_epi16(u2_sad_hadd, u2_sad_hadd);

        d_gpp_u += _mm_extract_epi16(u2_sad_hadd, 0);
        d_gpp_v += _mm_extract_epi16(u2_sad_hadd, 1);

        pu1_input_buf += i4_input_stride;
    }

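    /* Normalize each plane's accumulated gradient by its pixel count, then  */
    /* combine luma and chroma using the WT_LUMA_GPP and WT_TOTAL_GPP weights */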
    d_gpp_y /= (u4_width * u4_height);
    d_gpp_u /= ((u4_width / 2) * (u4_height / 2));
    d_gpp_v /= ((u4_width / 2) * (u4_height / 2));

    d_gpp = (DOUBLE) ((WT_LUMA_GPP * d_gpp_y) + d_gpp_u + d_gpp_v) / WT_TOTAL_GPP;

    return d_gpp;
}