/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
******************************************************************************
* @file isvce_downscaler_neon.c
*
* @brief
*  This file contains the ARM NEON SIMD version of the function which does
*  horizontal scaling and transpose
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - isvce_horizontal_downscale_and_transpose_neon()
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "isvc_defs.h"
#include "isvce_defs.h"
#include "isvc_structs.h"
#include "isvce_downscaler_private_defs.h"

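/*
 * Overview: the scaler walks the output width with a Q-format position
 * accumulator (DOWNSCALER_Q fractional bits). For each output column the
 * integer part of the position selects the source column and the phase
 * selects one of the 8-tap coefficient sets in pai1_filters. Every selected
 * source column is filtered down a strip of rows and the results are written
 * transposed, presumably so that a second pass over the transposed output
 * can apply the same horizontal filter in the other direction.
 */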
void isvce_horizontal_downscale_and_transpose_neon(
    downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
    FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
{
    WORD32 i, j;
    UWORD8 u1_phase;
    UWORD8 *pu1_src_j, *pu1_dst_j;
    UWORD8 *pu1_in_pixel;
    UWORD8 *pu1_out_pixel;
    WORD8 *pi1_filter_grid;
    UWORD16 u2_full_pixel_inc;
    UWORD32 u4_num_iterations_vertical_by_16, u4_num_iterations_vertical_by_8;
    UWORD32 u4_rem_vert_loop_by_8, u4_rem_vert_loop_by_4;
    UWORD32 u4_rem_vert_loop;
    UWORD32 u4_height_finished;

    uint8x8_t reg_8x8_src_r0, reg_8x8_src_r1, reg_8x8_src_r2, reg_8x8_src_r3, reg_8x8_src_r4,
        reg_8x8_src_r5, reg_8x8_src_r6, reg_8x8_src_r7;

    uint16x8_t reg_16x8_src_r0, reg_16x8_src_r1, reg_16x8_src_r2, reg_16x8_src_r3, reg_16x8_src_r4,
        reg_16x8_src_r5, reg_16x8_src_r6, reg_16x8_src_r7;

    int16x8_t reg_16x8_mul_r0, reg_16x8_mul_r1, reg_16x8_mul_r2, reg_16x8_mul_r3, reg_16x8_mul_r4,
        reg_16x8_mul_r5, reg_16x8_mul_r6, reg_16x8_mul_r7;

    int32x4_t reg_32x4_sum_r0, reg_32x4_sum_r1, reg_32x4_sum_r2, reg_32x4_sum_r3, reg_32x4_sum_r4,
        reg_32x4_sum_r5, reg_32x4_sum_r6, reg_32x4_sum_r7;

    int32x4_t reg_32x4_sum_r01, reg_32x4_sum_r23, reg_32x4_sum_r45, reg_32x4_sum_r67,
        reg_32x4_sum_r89, reg_32x4_sum_r1011, reg_32x4_sum_r1213, reg_32x4_sum_r1415;

    uint8x8_t reg_8x8_src_r8, reg_8x8_src_r9, reg_8x8_src_r10, reg_8x8_src_r11, reg_8x8_src_r12,
        reg_8x8_src_r13, reg_8x8_src_r14, reg_8x8_src_r15;

    uint16x8_t reg_16x8_src_r8, reg_16x8_src_r9, reg_16x8_src_r10, reg_16x8_src_r11,
        reg_16x8_src_r12, reg_16x8_src_r13, reg_16x8_src_r14, reg_16x8_src_r15;

    int16x8_t reg_16x8_mul_r8, reg_16x8_mul_r9, reg_16x8_mul_r10, reg_16x8_mul_r11,
        reg_16x8_mul_r12, reg_16x8_mul_r13, reg_16x8_mul_r14, reg_16x8_mul_r15;

    int32x4_t reg_32x4_sum_r8, reg_32x4_sum_r9, reg_32x4_sum_r10, reg_32x4_sum_r11,
        reg_32x4_sum_r12, reg_32x4_sum_r13, reg_32x4_sum_r14, reg_32x4_sum_r15;

    uint8x16_t reg_8x16_src_r0, reg_8x16_src_r1, reg_8x16_src_r2, reg_8x16_src_r3, reg_8x16_src_r4,
        reg_8x16_src_r5, reg_8x16_src_r6, reg_8x16_src_r7;

    uint16x8_t reg_16x8_src_cb_r0, reg_16x8_src_cb_r1, reg_16x8_src_cb_r2, reg_16x8_src_cb_r3,
        reg_16x8_src_cb_r4, reg_16x8_src_cb_r5, reg_16x8_src_cb_r6, reg_16x8_src_cb_r7;

    uint16x8_t reg_16x8_src_cr_r0, reg_16x8_src_cr_r1, reg_16x8_src_cr_r2, reg_16x8_src_cr_r3,
        reg_16x8_src_cr_r4, reg_16x8_src_cr_r5, reg_16x8_src_cr_r6, reg_16x8_src_cr_r7;

    int16x8_t reg_16x8_mul_cb_r0, reg_16x8_mul_cb_r1, reg_16x8_mul_cb_r2, reg_16x8_mul_cb_r3,
        reg_16x8_mul_cb_r4, reg_16x8_mul_cb_r5, reg_16x8_mul_cb_r6, reg_16x8_mul_cb_r7;

    int16x8_t reg_16x8_mul_cr_r0, reg_16x8_mul_cr_r1, reg_16x8_mul_cr_r2, reg_16x8_mul_cr_r3,
        reg_16x8_mul_cr_r4, reg_16x8_mul_cr_r5, reg_16x8_mul_cr_r6, reg_16x8_mul_cr_r7;

    int32x4_t reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1, reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3,
        reg_32x4_sum_cb_r4, reg_32x4_sum_cb_r5, reg_32x4_sum_cb_r6, reg_32x4_sum_cb_r7;

    int32x4_t reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1, reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3,
        reg_32x4_sum_cr_r4, reg_32x4_sum_cr_r5, reg_32x4_sum_cr_r6, reg_32x4_sum_cr_r7;

    int32x4_t reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23, reg_32x4_sum_cb_r45, reg_32x4_sum_cb_r67;
    uint16x4_t reg_16x4_sum_cb_r01_23, reg_16x4_sum_cb_r45_67;
    uint16x8_t reg_16x8_sum_cb_r0_r7;
    uint8x8_t reg_8x8_sum_cb_r0_r7;

    int32x4_t reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23, reg_32x4_sum_cr_r45, reg_32x4_sum_cr_r67;
    uint16x4_t reg_16x4_sum_cr_r01_23, reg_16x4_sum_cr_r45_67;
    uint16x8_t reg_16x8_sum_cr_r0_r7;
    uint8x8_t reg_8x8_sum_cr_r0_r7;
    uint16x8_t reg_16x8_sum_cb_cr_r0_r3;
    uint8x8_t reg_8x8_sum_cb_cr_r0_r3;

    int32x4_t reg_32x4_sum_cb_cr_r0;
    uint16x4_t reg_16x4_sum_cb_cr_r0;

    int32x4_t reg_32x4_zero = vdupq_n_s32(0);

    uint16x4_t reg_16x4_sum_r01_23, reg_16x4_sum_r45_67;
    uint16x4_t reg_16x4_sum_r8_r11, reg_16x4_sum_r12_r15;
    uint16x8_t reg_16x8_sum_r0_r7, reg_16x8_sum_r8_r15;
    uint8x8_t reg_8x8_sum_r0_r7, reg_8x8_sum_r8_r15;
    uint8x16_t reg_8x16_sum_r0_r15;
    int8x8_t reg_8x8_filt_coeff_grid;
    int16x8_t reg_16x8_filt_coeff_grid;
    int32x4x2_t reg_32x4x2_sum_r01, reg_32x4x2_sum_r23, reg_32x4x2_sum_r45, reg_32x4x2_sum_r67;
    int32x4x2_t reg_32x4x2_sum_r89, reg_32x4x2_sum_r1011, reg_32x4x2_sum_r1213,
        reg_32x4x2_sum_r1415;
    uint8x16x2_t reg_8x16x2_src_r0, reg_8x16x2_src_r1, reg_8x16x2_src_r2, reg_8x16x2_src_r3;

    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;

    UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
    UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment;
    UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD32 u4_in_stride = ps_src->i4_data_stride;
    UWORD8 *pu1_dst = (UWORD8 *) ps_dst->pv_data;
    UWORD32 u4_out_stride = ps_dst->i4_data_stride;
    UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;

    /* Offset the input so that the input pixel to be processed
    coincides with the centre of the filter (4th coefficient) */
    pu1_src += (1 + u1_is_chroma);

    ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments);

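    /*
     * The assert above pins the vertical increment to 1.0 in Q format, i.e.
     * this routine only scales horizontally and consumes rows one-to-one.
     * Luma and interleaved chroma take separate paths below because chroma
     * rows carry Cb/Cr pairs that must be de-interleaved before filtering.
     */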
    if(!u1_is_chroma)
    {
        u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4;
        u4_rem_vert_loop = u4_blk_ht % 16;

        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
        {
            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 4);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);

                pi1_filter_grid = pai1_filters[u1_phase];
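                /*
                 * The phase (presumably the fractional Q-format bits of the
                 * position) picks one 8-tap coefficient set; the integer part
                 * of the position below gives the source column for this
                 * output column.
                 */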

                /* Doing the Calculation for current Loop Count  */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                /*******************************************************/
                /* This loop moves vertically down the source image,   */
                /* but the output pixels are stored horizontally,      */
                /* i.e. in a transposed manner                         */
                /*******************************************************/

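                /*
                 * 16 consecutive source rows are filtered per iteration: each
                 * row contributes an 8-tap dot product (widen to 16 bit,
                 * multiply by the coefficients, then reduce), yielding 16
                 * output pixels that form one contiguous run of the
                 * transposed destination row.
                 */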
                /* r0-r7 */
                reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                reg_8x8_src_r1 = vld1_u8(pu1_in_pixel + u4_in_stride);
                reg_8x8_src_r2 = vld1_u8(pu1_in_pixel + 2 * u4_in_stride);
                reg_8x8_src_r3 = vld1_u8(pu1_in_pixel + 3 * u4_in_stride);
                reg_8x8_src_r4 = vld1_u8(pu1_in_pixel + 4 * u4_in_stride);
                reg_8x8_src_r5 = vld1_u8(pu1_in_pixel + 5 * u4_in_stride);
                reg_8x8_src_r6 = vld1_u8(pu1_in_pixel + 6 * u4_in_stride);
                reg_8x8_src_r7 = vld1_u8(pu1_in_pixel + 7 * u4_in_stride);

                /* r0-r7 */
                reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);
                reg_16x8_src_r1 = vmovl_u8(reg_8x8_src_r1);
                reg_16x8_src_r2 = vmovl_u8(reg_8x8_src_r2);
                reg_16x8_src_r3 = vmovl_u8(reg_8x8_src_r3);
                reg_16x8_src_r4 = vmovl_u8(reg_8x8_src_r4);
                reg_16x8_src_r5 = vmovl_u8(reg_8x8_src_r5);
                reg_16x8_src_r6 = vmovl_u8(reg_8x8_src_r6);
                reg_16x8_src_r7 = vmovl_u8(reg_8x8_src_r7);

                /* r8-r15 */
                reg_8x8_src_r8 = vld1_u8(pu1_in_pixel + 8 * u4_in_stride);
                reg_8x8_src_r9 = vld1_u8(pu1_in_pixel + 9 * u4_in_stride);
                reg_8x8_src_r10 = vld1_u8(pu1_in_pixel + 10 * u4_in_stride);
                reg_8x8_src_r11 = vld1_u8(pu1_in_pixel + 11 * u4_in_stride);
                reg_8x8_src_r12 = vld1_u8(pu1_in_pixel + 12 * u4_in_stride);
                reg_8x8_src_r13 = vld1_u8(pu1_in_pixel + 13 * u4_in_stride);
                reg_8x8_src_r14 = vld1_u8(pu1_in_pixel + 14 * u4_in_stride);
                reg_8x8_src_r15 = vld1_u8(pu1_in_pixel + 15 * u4_in_stride);

                reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                /* r0-r7 */
                reg_16x8_mul_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r7), reg_16x8_filt_coeff_grid);

                /* r8-r15 */
                reg_16x8_src_r8 = vmovl_u8(reg_8x8_src_r8);
                reg_16x8_src_r9 = vmovl_u8(reg_8x8_src_r9);
                reg_16x8_src_r10 = vmovl_u8(reg_8x8_src_r10);
                reg_16x8_src_r11 = vmovl_u8(reg_8x8_src_r11);
                reg_16x8_src_r12 = vmovl_u8(reg_8x8_src_r12);
                reg_16x8_src_r13 = vmovl_u8(reg_8x8_src_r13);
                reg_16x8_src_r14 = vmovl_u8(reg_8x8_src_r14);
                reg_16x8_src_r15 = vmovl_u8(reg_8x8_src_r15);

                /* r0-r7 */
                reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);
                reg_32x4_sum_r1 = vpaddlq_s16(reg_16x8_mul_r1);
                reg_32x4_sum_r2 = vpaddlq_s16(reg_16x8_mul_r2);
                reg_32x4_sum_r3 = vpaddlq_s16(reg_16x8_mul_r3);
                reg_32x4_sum_r4 = vpaddlq_s16(reg_16x8_mul_r4);
                reg_32x4_sum_r5 = vpaddlq_s16(reg_16x8_mul_r5);
                reg_32x4_sum_r6 = vpaddlq_s16(reg_16x8_mul_r6);
                reg_32x4_sum_r7 = vpaddlq_s16(reg_16x8_mul_r7);

                /* r8-r15 */
                reg_16x8_mul_r8 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r8), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r9 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r9), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r10 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r10), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r11 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r11), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r12 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r12), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r13 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r13), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r14 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r14), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r15 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r15), reg_16x8_filt_coeff_grid);

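                /*
                 * Reduction: vpaddlq_s16 left four partial sums per row; the
                 * vuzpq_s32/vaddq_s32 pairs below fold two rows at a time
                 * until each 32-bit lane holds the full 8-tap sum of one row.
                 */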
                /* r0-r7 */
                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_sum_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_r2, reg_32x4_sum_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r4, reg_32x4_sum_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_r6, reg_32x4_sum_r7);

                reg_32x4_sum_r01 = vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_r23 = vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_r45 = vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_r67 = vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                /* r8-r15 */
                reg_32x4_sum_r8 = vpaddlq_s16(reg_16x8_mul_r8);
                reg_32x4_sum_r9 = vpaddlq_s16(reg_16x8_mul_r9);
                reg_32x4_sum_r10 = vpaddlq_s16(reg_16x8_mul_r10);
                reg_32x4_sum_r11 = vpaddlq_s16(reg_16x8_mul_r11);
                reg_32x4_sum_r12 = vpaddlq_s16(reg_16x8_mul_r12);
                reg_32x4_sum_r13 = vpaddlq_s16(reg_16x8_mul_r13);
                reg_32x4_sum_r14 = vpaddlq_s16(reg_16x8_mul_r14);
                reg_32x4_sum_r15 = vpaddlq_s16(reg_16x8_mul_r15);

                /* r0-r7 */
                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_sum_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r45, reg_32x4_sum_r67);
                reg_32x4_sum_r01 = vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_r45 = vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                /* r8-r15 */
                reg_32x4x2_sum_r89 = vuzpq_s32(reg_32x4_sum_r8, reg_32x4_sum_r9);
                reg_32x4x2_sum_r1011 = vuzpq_s32(reg_32x4_sum_r10, reg_32x4_sum_r11);
                reg_32x4x2_sum_r1213 = vuzpq_s32(reg_32x4_sum_r12, reg_32x4_sum_r13);
                reg_32x4x2_sum_r1415 = vuzpq_s32(reg_32x4_sum_r14, reg_32x4_sum_r15);

                reg_32x4_sum_r89 = vaddq_s32(reg_32x4x2_sum_r89.val[0], reg_32x4x2_sum_r89.val[1]);
                reg_32x4_sum_r1011 =
                    vaddq_s32(reg_32x4x2_sum_r1011.val[0], reg_32x4x2_sum_r1011.val[1]);
                reg_32x4_sum_r1213 =
                    vaddq_s32(reg_32x4x2_sum_r1213.val[0], reg_32x4x2_sum_r1213.val[1]);
                reg_32x4_sum_r1415 =
                    vaddq_s32(reg_32x4x2_sum_r1415.val[0], reg_32x4x2_sum_r1415.val[1]);

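                /*
                 * vqrshrun_n_s32(.., 7) rounds, divides by 128 and saturates
                 * to unsigned 16 bit; the shift of 7 matches filter taps that
                 * presumably sum to 128 (Q7 coefficients). vqmovn_u16 then
                 * narrows the results to 8-bit pixels.
                 */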
                /* r0-r7 */
                reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);
                reg_16x4_sum_r45_67 = vqrshrun_n_s32(reg_32x4_sum_r45, 7);

                /* r8-r15 */
                reg_32x4x2_sum_r89 = vuzpq_s32(reg_32x4_sum_r89, reg_32x4_sum_r1011);
                reg_32x4x2_sum_r1213 = vuzpq_s32(reg_32x4_sum_r1213, reg_32x4_sum_r1415);
                reg_32x4_sum_r89 = vaddq_s32(reg_32x4x2_sum_r89.val[0], reg_32x4x2_sum_r89.val[1]);
                reg_32x4_sum_r1213 =
                    vaddq_s32(reg_32x4x2_sum_r1213.val[0], reg_32x4x2_sum_r1213.val[1]);

                /* r0-r7 */
                reg_16x8_sum_r0_r7 = vcombine_u16(reg_16x4_sum_r01_23, reg_16x4_sum_r45_67);
                reg_8x8_sum_r0_r7 = vqmovn_u16(reg_16x8_sum_r0_r7);

                reg_16x4_sum_r8_r11 = vqrshrun_n_s32(reg_32x4_sum_r89, 7);
                reg_16x4_sum_r12_r15 = vqrshrun_n_s32(reg_32x4_sum_r1213, 7);

                reg_16x8_sum_r8_r15 = vcombine_u16(reg_16x4_sum_r8_r11, reg_16x4_sum_r12_r15);
                reg_8x8_sum_r8_r15 = vqmovn_u16(reg_16x8_sum_r8_r15);

                reg_8x16_sum_r0_r15 = vcombine_u8(reg_8x8_sum_r0_r7, reg_8x8_sum_r8_r15);

                /* r0-r15 */
                vst1q_u8(pu1_out_pixel, reg_8x16_sum_r0_r15);

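                /*
                 * Since the vertical increment is 1.0 in Q format (see the
                 * ASSERT above), this advances the input by exactly 16 source
                 * rows while the output moves 16 pixels along the transposed
                 * row.
                 */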
                pu1_out_pixel += 16;
                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q;

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* Loop for the remaining height less than 16 */
        if(u4_rem_vert_loop)
        {
            u4_rem_vert_loop_by_8 = u4_rem_vert_loop >> 3;
            u4_rem_vert_loop = u4_rem_vert_loop % 8;

            u4_height_finished = (u4_num_iterations_vertical_by_16 << 4);

            pu1_src_j = pu1_src + (u4_height_finished * u4_in_stride);
            pu1_dst_j = pu1_dst + u4_height_finished;

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            /* 8 <= remaining height < 16 */
            if(u4_rem_vert_loop_by_8)
            {
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop_by_8; j > 0; j--)
                    {
                        /*******************************************************/
                        /* This loop moves vertically down the source image,   */
                        /* but the output pixels are stored horizontally,      */
                        /* i.e. in a transposed manner                         */
                        /*******************************************************/

                        reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                        reg_8x8_src_r1 = vld1_u8(pu1_in_pixel + u4_in_stride);
                        reg_8x8_src_r2 = vld1_u8(pu1_in_pixel + 2 * u4_in_stride);
                        reg_8x8_src_r3 = vld1_u8(pu1_in_pixel + 3 * u4_in_stride);
                        reg_8x8_src_r4 = vld1_u8(pu1_in_pixel + 4 * u4_in_stride);
                        reg_8x8_src_r5 = vld1_u8(pu1_in_pixel + 5 * u4_in_stride);
                        reg_8x8_src_r6 = vld1_u8(pu1_in_pixel + 6 * u4_in_stride);
                        reg_8x8_src_r7 = vld1_u8(pu1_in_pixel + 7 * u4_in_stride);

                        reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);
                        reg_16x8_src_r1 = vmovl_u8(reg_8x8_src_r1);
                        reg_16x8_src_r2 = vmovl_u8(reg_8x8_src_r2);
                        reg_16x8_src_r3 = vmovl_u8(reg_8x8_src_r3);
                        reg_16x8_src_r4 = vmovl_u8(reg_8x8_src_r4);
                        reg_16x8_src_r5 = vmovl_u8(reg_8x8_src_r5);
                        reg_16x8_src_r6 = vmovl_u8(reg_8x8_src_r6);
                        reg_16x8_src_r7 = vmovl_u8(reg_8x8_src_r7);
                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r1),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r2),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r3),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r4 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r4),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r5 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r5),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r6 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r6),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r7 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r7),
                                                    reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);
                        reg_32x4_sum_r1 = vpaddlq_s16(reg_16x8_mul_r1);
                        reg_32x4_sum_r2 = vpaddlq_s16(reg_16x8_mul_r2);
                        reg_32x4_sum_r3 = vpaddlq_s16(reg_16x8_mul_r3);
                        reg_32x4_sum_r4 = vpaddlq_s16(reg_16x8_mul_r4);
                        reg_32x4_sum_r5 = vpaddlq_s16(reg_16x8_mul_r5);
                        reg_32x4_sum_r6 = vpaddlq_s16(reg_16x8_mul_r6);
                        reg_32x4_sum_r7 = vpaddlq_s16(reg_16x8_mul_r7);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_sum_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_r2, reg_32x4_sum_r3);
                        reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r4, reg_32x4_sum_r5);
                        reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_r6, reg_32x4_sum_r7);

                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                        reg_32x4_sum_r45 =
                            vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                        reg_32x4_sum_r67 =
                            vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_sum_r23);
                        reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r45, reg_32x4_sum_r67);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_r45 =
                            vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                        reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);
                        reg_16x4_sum_r45_67 = vqrshrun_n_s32(reg_32x4_sum_r45, 7);

                        reg_16x8_sum_r0_r7 = vcombine_u16(reg_16x4_sum_r01_23, reg_16x4_sum_r45_67);
                        reg_8x8_sum_r0_r7 = vqmovn_u16(reg_16x8_sum_r0_r7);

                        vst1_u8(pu1_out_pixel, reg_8x8_sum_r0_r7);

                        pu1_out_pixel += 8;
                        pu1_in_pixel +=
                            (u4_src_vert_increments * (u4_in_stride << 3)) >> DOWNSCALER_Q;
                    }
                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }

            /* 1 <= remaining height < 8 */
            if(u4_rem_vert_loop)
            {
                u4_height_finished =
                    ((u4_num_iterations_vertical_by_16 << 4) + (u4_rem_vert_loop_by_8 << 3));
                pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
                pu1_dst_j = pu1_dst + u4_height_finished;

                u4_center_pixel_pos = u4_center_pixel_pos_src;

                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop; j > 0; j--)
                    {
                        /*******************************************************/
                        /* This loop moves vertically down the source image,   */
                        /* but the output pixels are stored horizontally,      */
                        /* i.e. in a transposed manner                         */
                        /*******************************************************/

                        reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                        reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0),
                                                    reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);

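                        /*
                         * Only one row is processed here, so the second half
                         * of the uzp/add reduction is padded with zeros; the
                         * final 8-tap sum ends up in lane 0 and a single
                         * output byte is stored per iteration.
                         */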
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_zero);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_zero);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);

                        vst1_lane_u8(pu1_out_pixel, vreinterpret_u8_u16(reg_16x4_sum_r01_23), 0);
                        pu1_out_pixel += 1;
                        pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;
                    }
                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }
        }
    }
    /* for chroma */
    else
    {
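        /*
         * Chroma rows are interleaved Cb/Cr (presumably a semi-planar
         * source), so 16 bytes are loaded per row, de-interleaved with
         * vuzpq_u8, and both planes are filtered with the same 8-tap
         * coefficients. Their outputs land on two adjacent rows of the
         * transposed destination.
         */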
        u4_num_iterations_vertical_by_8 = u4_blk_ht >> 3;
        u4_rem_vert_loop = u4_blk_ht % 8;

        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_8; j++)
        {
            pu1_src_j = pu1_src + ((j << 3) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 3);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_grid = pai1_filters[u1_phase];

                /* Doing the Calculation for current Loop Count  */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                /*******************************************************/
                /* This loop moves vertically down the source image,   */
                /* but the output pixels are stored horizontally,      */
                /* i.e. in a transposed manner                         */
                /*******************************************************/

                reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);
                reg_8x16_src_r1 = vld1q_u8(pu1_in_pixel + u4_in_stride);
                reg_8x16_src_r2 = vld1q_u8(pu1_in_pixel + 2 * u4_in_stride);
                reg_8x16_src_r3 = vld1q_u8(pu1_in_pixel + 3 * u4_in_stride);
                reg_8x16_src_r4 = vld1q_u8(pu1_in_pixel + 4 * u4_in_stride);
                reg_8x16_src_r5 = vld1q_u8(pu1_in_pixel + 5 * u4_in_stride);
                reg_8x16_src_r6 = vld1q_u8(pu1_in_pixel + 6 * u4_in_stride);
                reg_8x16_src_r7 = vld1q_u8(pu1_in_pixel + 7 * u4_in_stride);

                reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r1);
                reg_8x16x2_src_r1 = vuzpq_u8(reg_8x16_src_r2, reg_8x16_src_r3);
                reg_8x16x2_src_r2 = vuzpq_u8(reg_8x16_src_r4, reg_8x16_src_r5);
                reg_8x16x2_src_r3 = vuzpq_u8(reg_8x16_src_r6, reg_8x16_src_r7);

                reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                reg_16x8_src_cb_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[0]));
                reg_16x8_src_cb_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[0]));
                reg_16x8_src_cb_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[0]));
                reg_16x8_src_cb_r4 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r2.val[0]));
                reg_16x8_src_cb_r5 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r2.val[0]));
                reg_16x8_src_cb_r6 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r3.val[0]));
                reg_16x8_src_cb_r7 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r3.val[0]));

                reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));
                reg_16x8_src_cr_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[1]));
                reg_16x8_src_cr_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[1]));
                reg_16x8_src_cr_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[1]));
                reg_16x8_src_cr_r4 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r2.val[1]));
                reg_16x8_src_cr_r5 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r2.val[1]));
                reg_16x8_src_cr_r6 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r3.val[1]));
                reg_16x8_src_cr_r7 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r3.val[1]));

                reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                reg_16x8_mul_cb_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r7), reg_16x8_filt_coeff_grid);

                reg_16x8_mul_cr_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r7), reg_16x8_filt_coeff_grid);

                reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                reg_32x4_sum_cb_r1 = vpaddlq_s16(reg_16x8_mul_cb_r1);
                reg_32x4_sum_cb_r2 = vpaddlq_s16(reg_16x8_mul_cb_r2);
                reg_32x4_sum_cb_r3 = vpaddlq_s16(reg_16x8_mul_cb_r3);
                reg_32x4_sum_cb_r4 = vpaddlq_s16(reg_16x8_mul_cb_r4);
                reg_32x4_sum_cb_r5 = vpaddlq_s16(reg_16x8_mul_cb_r5);
                reg_32x4_sum_cb_r6 = vpaddlq_s16(reg_16x8_mul_cb_r6);
                reg_32x4_sum_cb_r7 = vpaddlq_s16(reg_16x8_mul_cb_r7);

                reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);
                reg_32x4_sum_cr_r1 = vpaddlq_s16(reg_16x8_mul_cr_r1);
                reg_32x4_sum_cr_r2 = vpaddlq_s16(reg_16x8_mul_cr_r2);
                reg_32x4_sum_cr_r3 = vpaddlq_s16(reg_16x8_mul_cr_r3);
                reg_32x4_sum_cr_r4 = vpaddlq_s16(reg_16x8_mul_cr_r4);
                reg_32x4_sum_cr_r5 = vpaddlq_s16(reg_16x8_mul_cr_r5);
                reg_32x4_sum_cr_r6 = vpaddlq_s16(reg_16x8_mul_cr_r6);
                reg_32x4_sum_cr_r7 = vpaddlq_s16(reg_16x8_mul_cr_r7);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cb_r4, reg_32x4_sum_cb_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_cb_r6, reg_32x4_sum_cb_r7);

                reg_32x4_sum_cb_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cb_r23 =
                    vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_cb_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_cb_r67 =
                    vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cb_r45, reg_32x4_sum_cb_r67);
                reg_32x4_sum_cb_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cb_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cr_r4, reg_32x4_sum_cr_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_cr_r6, reg_32x4_sum_cr_r7);

                reg_32x4_sum_cr_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cr_r23 =
                    vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_cr_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_cr_r67 =
                    vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cr_r45, reg_32x4_sum_cr_r67);
                reg_32x4_sum_cr_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cr_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                reg_16x4_sum_cb_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cb_r01, 7);
                reg_16x4_sum_cb_r45_67 = vqrshrun_n_s32(reg_32x4_sum_cb_r45, 7);

                reg_16x4_sum_cr_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cr_r01, 7);
                reg_16x4_sum_cr_r45_67 = vqrshrun_n_s32(reg_32x4_sum_cr_r45, 7);

                reg_16x8_sum_cb_r0_r7 =
                    vcombine_u16(reg_16x4_sum_cb_r01_23, reg_16x4_sum_cb_r45_67);
                reg_16x8_sum_cr_r0_r7 =
                    vcombine_u16(reg_16x4_sum_cr_r01_23, reg_16x4_sum_cr_r45_67);

                reg_8x8_sum_cb_r0_r7 = vqmovn_u16(reg_16x8_sum_cb_r0_r7);
                reg_8x8_sum_cr_r0_r7 = vqmovn_u16(reg_16x8_sum_cr_r0_r7);

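                /*
                 * Cb results go to the current transposed row and Cr to the
                 * next one (the output row index i is scaled by u1_is_chroma
                 * above, which leaves room for both components).
                 */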
                vst1_u8(pu1_out_pixel, reg_8x8_sum_cb_r0_r7);
                vst1_u8(pu1_out_pixel + u4_out_stride, reg_8x8_sum_cr_r0_r7);

                pu1_out_pixel += 8;

                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 3)) >> DOWNSCALER_Q;

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* Loop for the remaining height less than 8 */
        if(u4_rem_vert_loop)
        {
            u4_rem_vert_loop_by_4 = u4_rem_vert_loop >> 2;
            u4_rem_vert_loop = u4_rem_vert_loop % 4;
            u4_height_finished = (u4_num_iterations_vertical_by_8 << 3);
            pu1_src_j = pu1_src + (u4_height_finished * u4_in_stride);
            pu1_dst_j = pu1_dst + u4_height_finished;

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            /* 4 <= remaining height < 8 */
            if(u4_rem_vert_loop_by_4)
            {
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop_by_4; j > 0; j--)
                    {
                        /*******************************************************/
                        /* This loop moves vertically down the source image,   */
                        /* but the output pixels are stored horizontally,      */
                        /* i.e. in a transposed manner                         */
                        /*******************************************************/

                        reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);
                        reg_8x16_src_r1 = vld1q_u8(pu1_in_pixel + u4_in_stride);
                        reg_8x16_src_r2 = vld1q_u8(pu1_in_pixel + 2 * u4_in_stride);
                        reg_8x16_src_r3 = vld1q_u8(pu1_in_pixel + 3 * u4_in_stride);

                        reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r1);
                        reg_8x16x2_src_r1 = vuzpq_u8(reg_8x16_src_r2, reg_8x16_src_r3);

                        reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cb_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cb_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[0]));
                        reg_16x8_src_cb_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[0]));

                        reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));
                        reg_16x8_src_cr_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[1]));
                        reg_16x8_src_cr_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[1]));
                        reg_16x8_src_cr_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[1]));

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_cb_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r1),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r2),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r3),
                                                       reg_16x8_filt_coeff_grid);

                        reg_16x8_mul_cr_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r1),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r2),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r3),
                                                       reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                        reg_32x4_sum_cb_r1 = vpaddlq_s16(reg_16x8_mul_cb_r1);
                        reg_32x4_sum_cb_r2 = vpaddlq_s16(reg_16x8_mul_cb_r2);
                        reg_32x4_sum_cb_r3 = vpaddlq_s16(reg_16x8_mul_cb_r3);

                        reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);
                        reg_32x4_sum_cr_r1 = vpaddlq_s16(reg_16x8_mul_cr_r1);
                        reg_32x4_sum_cr_r2 = vpaddlq_s16(reg_16x8_mul_cr_r2);
                        reg_32x4_sum_cr_r3 = vpaddlq_s16(reg_16x8_mul_cr_r3);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3);
                        reg_32x4_sum_cb_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_cb_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23);
                        reg_32x4_sum_cb_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3);
                        reg_32x4_sum_cr_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_cr_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23);
                        reg_32x4_sum_cr_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_cb_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cb_r01, 7);
                        reg_16x4_sum_cr_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cr_r01, 7);

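                        /*
                         * Four Cb and four Cr results are packed into one
                         * 8-byte vector; lane 0 (the Cb quad) and lane 1
                         * (the Cr quad) of the 32-bit view are stored to the
                         * two output rows.
                         */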
                        reg_16x8_sum_cb_cr_r0_r3 =
                            vcombine_u16(reg_16x4_sum_cb_r01_23, reg_16x4_sum_cr_r01_23);
                        reg_8x8_sum_cb_cr_r0_r3 = vmovn_u16(reg_16x8_sum_cb_cr_r0_r3);
                        vst1_lane_u32((uint32_t *) (pu1_out_pixel),
                                      vreinterpret_u32_u8(reg_8x8_sum_cb_cr_r0_r3), 0);
                        vst1_lane_u32((uint32_t *) (pu1_out_pixel + u4_out_stride),
                                      vreinterpret_u32_u8(reg_8x8_sum_cb_cr_r0_r3), 1);

                        pu1_out_pixel += 4;

                        pu1_in_pixel +=
                            (u4_src_vert_increments * (u4_in_stride << 2)) >> DOWNSCALER_Q;
                    }
                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }

            /* 1 <= remaining height < 4 */
            if(u4_rem_vert_loop)
            {
                u4_height_finished =
                    ((u4_num_iterations_vertical_by_8 << 3) + (u4_rem_vert_loop_by_4 << 2));
                pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
                pu1_dst_j = pu1_dst + u4_height_finished;

                u4_center_pixel_pos = u4_center_pixel_pos_src;
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop; j > 0; j--)
                    {
                        /*******************************************************/
                        /* This loop moves vertically down the source image,   */
                        /* but the output pixels are stored horizontally,      */
                        /* i.e. in a transposed manner                         */
                        /*******************************************************/

                        reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);

                        reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r0);
                        reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_cb_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0),
                                                       reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                        reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);

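                        /*
                         * Cb and Cr partial sums are reduced together: after
                         * the two uzp/add steps the 32-bit results sit in
                         * lanes 0 (Cb) and 1 (Cr); once narrowed, byte lanes
                         * 0 and 2 are stored to the two output rows.
                         */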
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cr_r0);
                        reg_32x4_sum_cb_cr_r0 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_cr_r0, reg_32x4_zero);
                        reg_32x4_sum_cb_cr_r0 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_cb_cr_r0 = vqrshrun_n_s32(reg_32x4_sum_cb_cr_r0, 7);
                        vst1_lane_u8((pu1_out_pixel), vreinterpret_u8_u16(reg_16x4_sum_cb_cr_r0),
                                     0);
                        vst1_lane_u8((pu1_out_pixel + u4_out_stride),
                                     vreinterpret_u8_u16(reg_16x4_sum_cb_cr_r0), 2);

                        pu1_out_pixel += 1;

                        pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride)) >> DOWNSCALER_Q;
                    }

                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }
        }
    }
}