/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
******************************************************************************
* @file isvce_rc_utils_neon.c
*
* @brief
*  This file contains the neon SIMD version of the function which computes
*  the gradient per pixel value used in Init Qp
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - isvce_get_gpp_neon()
*
* @remarks
*  None
*
*******************************************************************************
*/

#include <arm_neon.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "isvc_structs.h"
#include "isvce_rc_utils_private_defs.h"

/**
*******************************************************************************
*
* @brief
*   get gpp function
*
* @par Description:
*   computes gradient per pixel value for a given frame
*
* @param[in] ps_input_buf
*  pointer to yuv buffer properties
*
* @returns
*  calculated gpp value
*
* @remarks
*  none
*
*******************************************************************************
*/

DOUBLE isvce_get_gpp_neon(yuv_buf_props_t *ps_input_buf)
{
    UWORD8 *pu1_input_buf;
    UWORD32 i, j, k;
    UWORD32 u4_width, u4_height, i4_input_stride;
    DOUBLE d_gpp_y, d_gpp_u, d_gpp_v, d_gpp;

    uint8x8_t reg_8x8_src_r0, reg_8x8_src_r1, reg_8x8_src_r2, reg_8x8_src_r3, reg_8x8_src_r4,
        reg_8x8_src_r5, reg_8x8_src_r6, reg_8x8_src_r7, reg_8x8_src_r8;
    uint8x8_t reg_8x8_src_right_r0, reg_8x8_src_right_r1, reg_8x8_src_right_r2,
        reg_8x8_src_right_r3, reg_8x8_src_right_r4, reg_8x8_src_right_r5, reg_8x8_src_right_r6,
        reg_8x8_src_right_r7;
    uint16x8_t reg_16x8_abs_diff_y, reg_16x8_abs_diff_uv;
    uint64x2_t reg_64x2_gpp_y, reg_64x2_gpp_uv;

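    /* reg_8x8_shuffle deinterleaves CbCr: even lanes (Cb) are gathered into  */
    /* the lower half and odd lanes (Cr) into the upper half of the vector.   */
    /* The AND masks zero the lanes whose "right neighbour" wraps around in   */
    /* the width-remainder paths: lane 7 for luma, lanes 3 and 7 for chroma.  */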
    uint8x8_t reg_8x8_shuffle = {0, 2, 4, 6, 1, 3, 5, 7};
    uint16x8_t reg_16x8_and_mask_y = {0xffff, 0xffff, 0xffff, 0xffff,
                                      0xffff, 0xffff, 0xffff, 0x0000};
    uint16x8_t reg_16x8_and_mask_uv = {0xffff, 0xffff, 0xffff, 0x0000,
                                       0xffff, 0xffff, 0xffff, 0x0000};
    uint32x4_t reg_32x4_abs_diff_hadd_y = vdupq_n_u32(0);
    uint32x4_t reg_32x4_abs_diff_hadd_uv = vdupq_n_u32(0);

    d_gpp_y = 0;
    d_gpp_u = 0;
    d_gpp_v = 0;
    d_gpp = 0;
    pu1_input_buf = (UWORD8 *) ps_input_buf->as_component_bufs[0].pv_data;
    i4_input_stride = ps_input_buf->as_component_bufs[0].i4_data_stride;
    u4_width = ps_input_buf->u4_width;
    u4_height = ps_input_buf->u4_height;

    ASSERT((u4_width % 8) == 0);

    /***********************************************************/
    /* For Luma -                                              */
    /* This code block calculates the gpp value for luma by    */
    /* adding the absolute difference between the current      */
    /* pixel and its immediate right pixel to the absolute     */
    /* difference between the current pixel and its immediate  */
    /* bottom pixel, accumulating this over every pixel in the */
    /* frame.                                                  */
    /***********************************************************/
    /* -8 in the checks below since the right column and bottom row are used for gradients, */
    /* and the last row and column are ignored for gradient computation. */
    /* Note that the input is not required to be padded. */
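    /* Per pixel, the accumulated quantity is                  */
    /*     |p(x, y) - p(x + 1, y)| + |p(x, y) - p(x, y + 1)|   */
    /* i.e. the horizontal plus vertical gradient magnitude.   */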
    for(i = 0; i < u4_height - 8; i += 8)
    {
        for(j = 0; j < u4_width - 8; j += 8)
        {
            reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
            reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
            reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
            reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
            reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);
            reg_8x8_src_r5 = vld1_u8(pu1_input_buf + (i4_input_stride * 5) + j);
            reg_8x8_src_r6 = vld1_u8(pu1_input_buf + (i4_input_stride * 6) + j);
            reg_8x8_src_r7 = vld1_u8(pu1_input_buf + (i4_input_stride * 7) + j);
            reg_8x8_src_r8 = vld1_u8(pu1_input_buf + (i4_input_stride * 8) + j);

            reg_8x8_src_right_r0 = vld1_u8(pu1_input_buf + j + 1);
            reg_8x8_src_right_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j + 1);
            reg_8x8_src_right_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j + 1);
            reg_8x8_src_right_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j + 1);
            reg_8x8_src_right_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j + 1);
            reg_8x8_src_right_r5 = vld1_u8(pu1_input_buf + (i4_input_stride * 5) + j + 1);
            reg_8x8_src_right_r6 = vld1_u8(pu1_input_buf + (i4_input_stride * 6) + j + 1);
            reg_8x8_src_right_r7 = vld1_u8(pu1_input_buf + (i4_input_stride * 7) + j + 1);

            reg_16x8_abs_diff_y = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_r2);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_r3);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_r4);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r4, reg_8x8_src_r5);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r5, reg_8x8_src_r6);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r6, reg_8x8_src_r7);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r7, reg_8x8_src_r8);

            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r0, reg_8x8_src_right_r0);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_right_r1);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_right_r2);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_right_r3);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r4, reg_8x8_src_right_r4);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r5, reg_8x8_src_right_r5);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r6, reg_8x8_src_right_r6);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r7, reg_8x8_src_right_r7);

            reg_32x4_abs_diff_hadd_y = vpadalq_u16(reg_32x4_abs_diff_hadd_y, reg_16x8_abs_diff_y);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last column is excluded from the gradient, so only   */
        /* the remaining 7 pixels are processed here; the unused    */
        /* lane is zeroed by ANDing with reg_16x8_and_mask_y.       */
        /************************************************************/
        ASSERT((u4_width - j) == 8);
        reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
        reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
        reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
        reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
        reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);
        reg_8x8_src_r5 = vld1_u8(pu1_input_buf + (i4_input_stride * 5) + j);
        reg_8x8_src_r6 = vld1_u8(pu1_input_buf + (i4_input_stride * 6) + j);
        reg_8x8_src_r7 = vld1_u8(pu1_input_buf + (i4_input_stride * 7) + j);
        reg_8x8_src_r8 = vld1_u8(pu1_input_buf + (i4_input_stride * 8) + j);

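        /* The "right neighbour" vectors are built with vext rather than a    */
        /* load at j + 1 so that no byte beyond u4_width is read; lane 7 then */
        /* holds a wrapped-around value and is masked out below.              */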
        reg_8x8_src_right_r0 = vext_u8(reg_8x8_src_r0, reg_8x8_src_r0, 1);
        reg_8x8_src_right_r1 = vext_u8(reg_8x8_src_r1, reg_8x8_src_r1, 1);
        reg_8x8_src_right_r2 = vext_u8(reg_8x8_src_r2, reg_8x8_src_r2, 1);
        reg_8x8_src_right_r3 = vext_u8(reg_8x8_src_r3, reg_8x8_src_r3, 1);
        reg_8x8_src_right_r4 = vext_u8(reg_8x8_src_r4, reg_8x8_src_r4, 1);
        reg_8x8_src_right_r5 = vext_u8(reg_8x8_src_r5, reg_8x8_src_r5, 1);
        reg_8x8_src_right_r6 = vext_u8(reg_8x8_src_r6, reg_8x8_src_r6, 1);
        reg_8x8_src_right_r7 = vext_u8(reg_8x8_src_r7, reg_8x8_src_r7, 1);

        reg_16x8_abs_diff_y = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_r2);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_r3);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_r4);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r4, reg_8x8_src_r5);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r5, reg_8x8_src_r6);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r6, reg_8x8_src_r7);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r7, reg_8x8_src_r8);

        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r0, reg_8x8_src_right_r0);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_right_r1);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_right_r2);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_right_r3);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r4, reg_8x8_src_right_r4);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r5, reg_8x8_src_right_r5);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r6, reg_8x8_src_right_r6);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r7, reg_8x8_src_right_r7);

        reg_16x8_abs_diff_y = vandq_u16(reg_16x8_abs_diff_y, reg_16x8_and_mask_y);

        reg_32x4_abs_diff_hadd_y = vpadalq_u16(reg_32x4_abs_diff_hadd_y, reg_16x8_abs_diff_y);

        pu1_input_buf += (i4_input_stride * 8);
    }

    /* Loop for remaining height less than 8 */
    /*    4 <= remaining_height < 8          */
    for(k = i; k < u4_height - 4; k += 4, i += 4)
    {
        for(j = 0; j < u4_width - 8; j += 8)
        {
            reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
            reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
            reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
            reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
            reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);
            reg_8x8_src_right_r0 = vld1_u8(pu1_input_buf + j + 1);
            reg_8x8_src_right_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j + 1);
            reg_8x8_src_right_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j + 1);
            reg_8x8_src_right_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j + 1);

            reg_16x8_abs_diff_y = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_r2);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_r3);
            reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_r4);

            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r0, reg_8x8_src_right_r0);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_right_r1);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_right_r2);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_right_r3);

            reg_32x4_abs_diff_hadd_y = vpadalq_u16(reg_32x4_abs_diff_hadd_y, reg_16x8_abs_diff_y);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last column is excluded from the gradient, so only   */
        /* the remaining 7 pixels are processed here; the unused    */
        /* lane is zeroed by ANDing with reg_16x8_and_mask_y.       */
        /************************************************************/
        ASSERT((u4_width - j) == 8);
        reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
        reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
        reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
        reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
        reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);

        reg_8x8_src_right_r0 = vext_u8(reg_8x8_src_r0, reg_8x8_src_r0, 1);
        reg_8x8_src_right_r1 = vext_u8(reg_8x8_src_r1, reg_8x8_src_r1, 1);
        reg_8x8_src_right_r2 = vext_u8(reg_8x8_src_r2, reg_8x8_src_r2, 1);
        reg_8x8_src_right_r3 = vext_u8(reg_8x8_src_r3, reg_8x8_src_r3, 1);

        reg_16x8_abs_diff_y = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_r2);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_r3);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_r4);

        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r0, reg_8x8_src_right_r0);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r1, reg_8x8_src_right_r1);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r2, reg_8x8_src_right_r2);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r3, reg_8x8_src_right_r3);

        reg_16x8_abs_diff_y = vandq_u16(reg_16x8_abs_diff_y, reg_16x8_and_mask_y);

        reg_32x4_abs_diff_hadd_y = vpadalq_u16(reg_32x4_abs_diff_hadd_y, reg_16x8_abs_diff_y);

        pu1_input_buf += (i4_input_stride * 4);
    }

    /* Loop for remaining height less than 4 */
    /*    0 <= remaining_height < 4          */
    for(k = i; k < u4_height - 1; k++)
    {
        for(j = 0; j < u4_width - 8; j += 8)
        {
            reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
            reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
            reg_8x8_src_right_r0 = vld1_u8(pu1_input_buf + j + 1);

            reg_16x8_abs_diff_y = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
            reg_16x8_abs_diff_y =
                vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r0, reg_8x8_src_right_r0);

            reg_32x4_abs_diff_hadd_y = vpadalq_u16(reg_32x4_abs_diff_hadd_y, reg_16x8_abs_diff_y);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last column is excluded from the gradient, so only   */
        /* the remaining 7 pixels are processed here; the unused    */
        /* lane is zeroed by ANDing with reg_16x8_and_mask_y.       */
        /************************************************************/
        reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
        reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
        reg_8x8_src_right_r0 = vext_u8(reg_8x8_src_r0, reg_8x8_src_r0, 1);

        reg_16x8_abs_diff_y = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
        reg_16x8_abs_diff_y = vabal_u8(reg_16x8_abs_diff_y, reg_8x8_src_r0, reg_8x8_src_right_r0);

        reg_16x8_abs_diff_y = vandq_u16(reg_16x8_abs_diff_y, reg_16x8_and_mask_y);

        reg_32x4_abs_diff_hadd_y = vpadalq_u16(reg_32x4_abs_diff_hadd_y, reg_16x8_abs_diff_y);

        pu1_input_buf += i4_input_stride;
    }

    /* Pairwise add reg_32x4_abs_diff_hadd_y to get final gpp value */
    reg_64x2_gpp_y = vpaddlq_u32(reg_32x4_abs_diff_hadd_y);
    d_gpp_y = vgetq_lane_u64(reg_64x2_gpp_y, 0);
    d_gpp_y += vgetq_lane_u64(reg_64x2_gpp_y, 1);

    pu1_input_buf = (UWORD8 *) ps_input_buf->as_component_bufs[1].pv_data;
    i4_input_stride = ps_input_buf->as_component_bufs[1].i4_data_stride;

    /***************************************************************/
    /* For Chroma -                                                */
    /* This code block first deinterleaves the Cb and Cr values,   */
    /* then calculates the gpp value for Cb and Cr separately by   */
    /* adding the absolute difference between the current pixel    */
    /* and its immediate right pixel to the absolute difference    */
    /* between the current pixel and its immediate bottom pixel,   */
    /* accumulating this over every pixel in the frame.            */
    /***************************************************************/
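    /* The plane is interleaved CbCr, so the horizontal neighbour of a chroma */
    /* sample is 2 bytes away; after the table-lookup shuffle the accumulator */
    /* holds Cb sums in its lower half and Cr sums in its upper half.         */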
    for(i = 0; i < (u4_height >> 1) - 8; i += 8)
    {
        for(j = 0; j < u4_width - 8; j += 8)
        {
            reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
            reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
            reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
            reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
            reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);
            reg_8x8_src_r5 = vld1_u8(pu1_input_buf + (i4_input_stride * 5) + j);
            reg_8x8_src_r6 = vld1_u8(pu1_input_buf + (i4_input_stride * 6) + j);
            reg_8x8_src_r7 = vld1_u8(pu1_input_buf + (i4_input_stride * 7) + j);
            reg_8x8_src_r8 = vld1_u8(pu1_input_buf + (i4_input_stride * 8) + j);

            reg_8x8_src_right_r0 = vld1_u8(pu1_input_buf + j + 2);
            reg_8x8_src_right_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j + 2);
            reg_8x8_src_right_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j + 2);
            reg_8x8_src_right_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j + 2);
            reg_8x8_src_right_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j + 2);
            reg_8x8_src_right_r5 = vld1_u8(pu1_input_buf + (i4_input_stride * 5) + j + 2);
            reg_8x8_src_right_r6 = vld1_u8(pu1_input_buf + (i4_input_stride * 6) + j + 2);
            reg_8x8_src_right_r7 = vld1_u8(pu1_input_buf + (i4_input_stride * 7) + j + 2);

            /* separating u and v */
            reg_8x8_src_r0 = vtbl1_u8(reg_8x8_src_r0, reg_8x8_shuffle);
            reg_8x8_src_r1 = vtbl1_u8(reg_8x8_src_r1, reg_8x8_shuffle);
            reg_8x8_src_r2 = vtbl1_u8(reg_8x8_src_r2, reg_8x8_shuffle);
            reg_8x8_src_r3 = vtbl1_u8(reg_8x8_src_r3, reg_8x8_shuffle);
            reg_8x8_src_r4 = vtbl1_u8(reg_8x8_src_r4, reg_8x8_shuffle);
            reg_8x8_src_r5 = vtbl1_u8(reg_8x8_src_r5, reg_8x8_shuffle);
            reg_8x8_src_r6 = vtbl1_u8(reg_8x8_src_r6, reg_8x8_shuffle);
            reg_8x8_src_r7 = vtbl1_u8(reg_8x8_src_r7, reg_8x8_shuffle);
            reg_8x8_src_r8 = vtbl1_u8(reg_8x8_src_r8, reg_8x8_shuffle);
            reg_8x8_src_right_r0 = vtbl1_u8(reg_8x8_src_right_r0, reg_8x8_shuffle);
            reg_8x8_src_right_r1 = vtbl1_u8(reg_8x8_src_right_r1, reg_8x8_shuffle);
            reg_8x8_src_right_r2 = vtbl1_u8(reg_8x8_src_right_r2, reg_8x8_shuffle);
            reg_8x8_src_right_r3 = vtbl1_u8(reg_8x8_src_right_r3, reg_8x8_shuffle);
            reg_8x8_src_right_r4 = vtbl1_u8(reg_8x8_src_right_r4, reg_8x8_shuffle);
            reg_8x8_src_right_r5 = vtbl1_u8(reg_8x8_src_right_r5, reg_8x8_shuffle);
            reg_8x8_src_right_r6 = vtbl1_u8(reg_8x8_src_right_r6, reg_8x8_shuffle);
            reg_8x8_src_right_r7 = vtbl1_u8(reg_8x8_src_right_r7, reg_8x8_shuffle);

            reg_16x8_abs_diff_uv = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_r2);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_r3);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_r4);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r4, reg_8x8_src_r5);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r5, reg_8x8_src_r6);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r6, reg_8x8_src_r7);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r7, reg_8x8_src_r8);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r0, reg_8x8_src_right_r0);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_right_r1);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_right_r2);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_right_r3);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r4, reg_8x8_src_right_r4);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r5, reg_8x8_src_right_r5);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r6, reg_8x8_src_right_r6);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r7, reg_8x8_src_right_r7);

            reg_32x4_abs_diff_hadd_uv =
                vpadalq_u16(reg_32x4_abs_diff_hadd_uv, reg_16x8_abs_diff_uv);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last Cb/Cr sample is excluded from the gradient, so  */
        /* only the remaining 6 chroma samples are processed here;  */
        /* the unused lanes are zeroed by ANDing with               */
        /* reg_16x8_and_mask_uv.                                    */
        /************************************************************/
        ASSERT((u4_width - j) == 8);
        reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
        reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
        reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
        reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
        reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);
        reg_8x8_src_r5 = vld1_u8(pu1_input_buf + (i4_input_stride * 5) + j);
        reg_8x8_src_r6 = vld1_u8(pu1_input_buf + (i4_input_stride * 6) + j);
        reg_8x8_src_r7 = vld1_u8(pu1_input_buf + (i4_input_stride * 7) + j);
        reg_8x8_src_r8 = vld1_u8(pu1_input_buf + (i4_input_stride * 8) + j);
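        /* The "right neighbour" vectors are built with vext by 2 (CbCr is    */
        /* interleaved) rather than a load at j + 2 so that no byte beyond    */
        /* u4_width is read; the wrapped lanes are masked out below.          */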
        reg_8x8_src_right_r0 = vext_u8(reg_8x8_src_r0, reg_8x8_src_r0, 2);
        reg_8x8_src_right_r1 = vext_u8(reg_8x8_src_r1, reg_8x8_src_r1, 2);
        reg_8x8_src_right_r2 = vext_u8(reg_8x8_src_r2, reg_8x8_src_r2, 2);
        reg_8x8_src_right_r3 = vext_u8(reg_8x8_src_r3, reg_8x8_src_r3, 2);
        reg_8x8_src_right_r4 = vext_u8(reg_8x8_src_r4, reg_8x8_src_r4, 2);
        reg_8x8_src_right_r5 = vext_u8(reg_8x8_src_r5, reg_8x8_src_r5, 2);
        reg_8x8_src_right_r6 = vext_u8(reg_8x8_src_r6, reg_8x8_src_r6, 2);
        reg_8x8_src_right_r7 = vext_u8(reg_8x8_src_r7, reg_8x8_src_r7, 2);

        /* separating u and v */
        reg_8x8_src_r0 = vtbl1_u8(reg_8x8_src_r0, reg_8x8_shuffle);
        reg_8x8_src_r1 = vtbl1_u8(reg_8x8_src_r1, reg_8x8_shuffle);
        reg_8x8_src_r2 = vtbl1_u8(reg_8x8_src_r2, reg_8x8_shuffle);
        reg_8x8_src_r3 = vtbl1_u8(reg_8x8_src_r3, reg_8x8_shuffle);
        reg_8x8_src_r4 = vtbl1_u8(reg_8x8_src_r4, reg_8x8_shuffle);
        reg_8x8_src_r5 = vtbl1_u8(reg_8x8_src_r5, reg_8x8_shuffle);
        reg_8x8_src_r6 = vtbl1_u8(reg_8x8_src_r6, reg_8x8_shuffle);
        reg_8x8_src_r7 = vtbl1_u8(reg_8x8_src_r7, reg_8x8_shuffle);
        reg_8x8_src_r8 = vtbl1_u8(reg_8x8_src_r8, reg_8x8_shuffle);
        reg_8x8_src_right_r0 = vtbl1_u8(reg_8x8_src_right_r0, reg_8x8_shuffle);
        reg_8x8_src_right_r1 = vtbl1_u8(reg_8x8_src_right_r1, reg_8x8_shuffle);
        reg_8x8_src_right_r2 = vtbl1_u8(reg_8x8_src_right_r2, reg_8x8_shuffle);
        reg_8x8_src_right_r3 = vtbl1_u8(reg_8x8_src_right_r3, reg_8x8_shuffle);
        reg_8x8_src_right_r4 = vtbl1_u8(reg_8x8_src_right_r4, reg_8x8_shuffle);
        reg_8x8_src_right_r5 = vtbl1_u8(reg_8x8_src_right_r5, reg_8x8_shuffle);
        reg_8x8_src_right_r6 = vtbl1_u8(reg_8x8_src_right_r6, reg_8x8_shuffle);
        reg_8x8_src_right_r7 = vtbl1_u8(reg_8x8_src_right_r7, reg_8x8_shuffle);

        reg_16x8_abs_diff_uv = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_r2);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_r3);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_r4);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r4, reg_8x8_src_r5);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r5, reg_8x8_src_r6);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r6, reg_8x8_src_r7);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r7, reg_8x8_src_r8);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r0, reg_8x8_src_right_r0);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_right_r1);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_right_r2);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_right_r3);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r4, reg_8x8_src_right_r4);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r5, reg_8x8_src_right_r5);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r6, reg_8x8_src_right_r6);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r7, reg_8x8_src_right_r7);

        reg_16x8_abs_diff_uv = vandq_u16(reg_16x8_abs_diff_uv, reg_16x8_and_mask_uv);

        reg_32x4_abs_diff_hadd_uv = vpadalq_u16(reg_32x4_abs_diff_hadd_uv, reg_16x8_abs_diff_uv);

        pu1_input_buf += (i4_input_stride * 8);
    }

    /* Loop for remaining height less than 8 */
    /*    4 <= remaining_height < 8          */
    for(k = i; k < (u4_height >> 1) - 4; k += 4, i += 4)
    {
        for(j = 0; j < u4_width - 8; j += 8)
        {
            reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
            reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
            reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
            reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
            reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);
            reg_8x8_src_right_r0 = vld1_u8(pu1_input_buf + j + 2);
            reg_8x8_src_right_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j + 2);
            reg_8x8_src_right_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j + 2);
            reg_8x8_src_right_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j + 2);

            /* separating u and v */
            reg_8x8_src_r0 = vtbl1_u8(reg_8x8_src_r0, reg_8x8_shuffle);
            reg_8x8_src_r1 = vtbl1_u8(reg_8x8_src_r1, reg_8x8_shuffle);
            reg_8x8_src_r2 = vtbl1_u8(reg_8x8_src_r2, reg_8x8_shuffle);
            reg_8x8_src_r3 = vtbl1_u8(reg_8x8_src_r3, reg_8x8_shuffle);
            reg_8x8_src_r4 = vtbl1_u8(reg_8x8_src_r4, reg_8x8_shuffle);
            reg_8x8_src_right_r0 = vtbl1_u8(reg_8x8_src_right_r0, reg_8x8_shuffle);
            reg_8x8_src_right_r1 = vtbl1_u8(reg_8x8_src_right_r1, reg_8x8_shuffle);
            reg_8x8_src_right_r2 = vtbl1_u8(reg_8x8_src_right_r2, reg_8x8_shuffle);
            reg_8x8_src_right_r3 = vtbl1_u8(reg_8x8_src_right_r3, reg_8x8_shuffle);

            reg_16x8_abs_diff_uv = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_r2);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_r3);
            reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_r4);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r0, reg_8x8_src_right_r0);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_right_r1);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_right_r2);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_right_r3);

            reg_32x4_abs_diff_hadd_uv =
                vpadalq_u16(reg_32x4_abs_diff_hadd_uv, reg_16x8_abs_diff_uv);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last Cb/Cr sample is excluded from the gradient, so  */
        /* only the remaining 6 chroma samples are processed here;  */
        /* the unused lanes are zeroed by ANDing with               */
        /* reg_16x8_and_mask_uv.                                    */
        /************************************************************/
        reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
        reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
        reg_8x8_src_r2 = vld1_u8(pu1_input_buf + (i4_input_stride * 2) + j);
        reg_8x8_src_r3 = vld1_u8(pu1_input_buf + (i4_input_stride * 3) + j);
        reg_8x8_src_r4 = vld1_u8(pu1_input_buf + (i4_input_stride * 4) + j);
        reg_8x8_src_right_r0 = vext_u8(reg_8x8_src_r0, reg_8x8_src_r0, 2);
        reg_8x8_src_right_r1 = vext_u8(reg_8x8_src_r1, reg_8x8_src_r1, 2);
        reg_8x8_src_right_r2 = vext_u8(reg_8x8_src_r2, reg_8x8_src_r2, 2);
        reg_8x8_src_right_r3 = vext_u8(reg_8x8_src_r3, reg_8x8_src_r3, 2);

        /* separating u and v */
        reg_8x8_src_r0 = vtbl1_u8(reg_8x8_src_r0, reg_8x8_shuffle);
        reg_8x8_src_r1 = vtbl1_u8(reg_8x8_src_r1, reg_8x8_shuffle);
        reg_8x8_src_r2 = vtbl1_u8(reg_8x8_src_r2, reg_8x8_shuffle);
        reg_8x8_src_r3 = vtbl1_u8(reg_8x8_src_r3, reg_8x8_shuffle);
        reg_8x8_src_r4 = vtbl1_u8(reg_8x8_src_r4, reg_8x8_shuffle);
        reg_8x8_src_right_r0 = vtbl1_u8(reg_8x8_src_right_r0, reg_8x8_shuffle);
        reg_8x8_src_right_r1 = vtbl1_u8(reg_8x8_src_right_r1, reg_8x8_shuffle);
        reg_8x8_src_right_r2 = vtbl1_u8(reg_8x8_src_right_r2, reg_8x8_shuffle);
        reg_8x8_src_right_r3 = vtbl1_u8(reg_8x8_src_right_r3, reg_8x8_shuffle);

        reg_16x8_abs_diff_uv = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_r2);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_r3);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_r4);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r0, reg_8x8_src_right_r0);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r1, reg_8x8_src_right_r1);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r2, reg_8x8_src_right_r2);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r3, reg_8x8_src_right_r3);

        reg_16x8_abs_diff_uv = vandq_u16(reg_16x8_abs_diff_uv, reg_16x8_and_mask_uv);

        reg_32x4_abs_diff_hadd_uv = vpadalq_u16(reg_32x4_abs_diff_hadd_uv, reg_16x8_abs_diff_uv);

        pu1_input_buf += (i4_input_stride * 4);
    }

    /* Loop for remaining height less than 4 */
    /*    0 <= remaining_height < 4          */
    for(k = i; k < (u4_height >> 1) - 1; k++)
    {
        for(j = 0; j < u4_width - 8; j += 8)
        {
            reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
            reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
            reg_8x8_src_right_r0 = vld1_u8(pu1_input_buf + j + 2);

            /* separating u and v */
            reg_8x8_src_r0 = vtbl1_u8(reg_8x8_src_r0, reg_8x8_shuffle);
            reg_8x8_src_r1 = vtbl1_u8(reg_8x8_src_r1, reg_8x8_shuffle);
            reg_8x8_src_right_r0 = vtbl1_u8(reg_8x8_src_right_r0, reg_8x8_shuffle);

            reg_16x8_abs_diff_uv = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
            reg_16x8_abs_diff_uv =
                vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r0, reg_8x8_src_right_r0);

            reg_32x4_abs_diff_hadd_uv =
                vpadalq_u16(reg_32x4_abs_diff_hadd_uv, reg_16x8_abs_diff_uv);
        }

        /************************************************************/
        /* Remaining width -                                        */
        /* The last Cb/Cr sample is excluded from the gradient, so  */
        /* only the remaining 6 chroma samples are processed here;  */
        /* the unused lanes are zeroed by ANDing with               */
        /* reg_16x8_and_mask_uv.                                    */
        /************************************************************/
        reg_8x8_src_r0 = vld1_u8(pu1_input_buf + j);
        reg_8x8_src_r1 = vld1_u8(pu1_input_buf + i4_input_stride + j);
        reg_8x8_src_right_r0 = vext_u8(reg_8x8_src_r0, reg_8x8_src_r0, 2);

        /* separating u and v */
        reg_8x8_src_r0 = vtbl1_u8(reg_8x8_src_r0, reg_8x8_shuffle);
        reg_8x8_src_r1 = vtbl1_u8(reg_8x8_src_r1, reg_8x8_shuffle);
        reg_8x8_src_right_r0 = vtbl1_u8(reg_8x8_src_right_r0, reg_8x8_shuffle);

        reg_16x8_abs_diff_uv = vabdl_u8(reg_8x8_src_r0, reg_8x8_src_r1);
        reg_16x8_abs_diff_uv = vabal_u8(reg_16x8_abs_diff_uv, reg_8x8_src_r0, reg_8x8_src_right_r0);

        reg_16x8_abs_diff_uv = vandq_u16(reg_16x8_abs_diff_uv, reg_16x8_and_mask_uv);

        reg_32x4_abs_diff_hadd_uv = vpadalq_u16(reg_32x4_abs_diff_hadd_uv, reg_16x8_abs_diff_uv);

        pu1_input_buf += i4_input_stride;
    }

    /* Pairwise add reg_32x4_abs_diff_hadd_uv to get the final gpp_u and gpp_v values */
    reg_64x2_gpp_uv = vpaddlq_u32(reg_32x4_abs_diff_hadd_uv);
    d_gpp_u = vgetq_lane_u64(reg_64x2_gpp_uv, 0);
    d_gpp_v = vgetq_lane_u64(reg_64x2_gpp_uv, 1);

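    /* Normalise each plane's accumulated gradient by its pixel count, then   */
    /* combine them with the luma term weighted by WT_LUMA_GPP and the total  */
    /* scaled down by WT_TOTAL_GPP.                                           */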
    d_gpp_y /= (u4_width * u4_height);
    d_gpp_u /= ((u4_width / 2) * (u4_height / 2));
    d_gpp_v /= ((u4_width / 2) * (u4_height / 2));

    d_gpp = (DOUBLE) ((WT_LUMA_GPP * d_gpp_y) + d_gpp_u + d_gpp_v) / WT_TOTAL_GPP;

    return d_gpp;
}
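
/* For reference, a minimal scalar sketch of the luma part of the computation
 * above (hypothetical helper, not part of this file; assumes the same
 * yuv_buf_props_t layout). Chroma is handled the same way on the
 * deinterleaved Cb and Cr samples, with the result normalised per plane.
 *
 *     static DOUBLE isvce_get_gpp_y_scalar_sketch(yuv_buf_props_t *ps_buf)
 *     {
 *         UWORD8 *pu1_y = (UWORD8 *) ps_buf->as_component_bufs[0].pv_data;
 *         WORD32 i4_stride = ps_buf->as_component_bufs[0].i4_data_stride;
 *         UWORD32 u4_wd = ps_buf->u4_width;
 *         UWORD32 u4_ht = ps_buf->u4_height;
 *         DOUBLE d_gpp_y = 0;
 *         UWORD32 i, j;
 *
 *         for(i = 0; i < u4_ht - 1; i++)
 *         {
 *             for(j = 0; j < u4_wd - 1; j++)
 *             {
 *                 WORD32 i4_cur = pu1_y[i * i4_stride + j];
 *                 WORD32 i4_right = pu1_y[i * i4_stride + j + 1];
 *                 WORD32 i4_bottom = pu1_y[(i + 1) * i4_stride + j];
 *
 *                 d_gpp_y += (i4_cur > i4_right) ? (i4_cur - i4_right) : (i4_right - i4_cur);
 *                 d_gpp_y += (i4_cur > i4_bottom) ? (i4_cur - i4_bottom) : (i4_bottom - i4_cur);
 *             }
 *         }
 *
 *         return d_gpp_y / (u4_wd * u4_ht);
 *     }
 */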