/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 ******************************************************************************
 * @file isvce_downscaler_neon.c
 *
 * @brief
 *  This file contains the ARM NEON SIMD version of the function which does
 *  horizontal scaling and transpose
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - isvce_horizontal_downscale_and_transpose_neon()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "isvc_defs.h"
#include "isvce_defs.h"
#include "isvc_structs.h"
#include "isvce_downscaler_private_defs.h"

void isvce_horizontal_downscale_and_transpose_neon(
    downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
    FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
{
    WORD32 i, j;
    UWORD8 u1_phase;
    UWORD8 *pu1_src_j, *pu1_dst_j;
    UWORD8 *pu1_in_pixel;
    UWORD8 *pu1_out_pixel;
    WORD8 *pi1_filter_grid;
    UWORD16 u2_full_pixel_inc;
    UWORD32 u4_num_iterations_vertical_by_16, u4_num_iterations_vertical_by_8;
    UWORD32 u4_rem_vert_loop_by_8, u4_rem_vert_loop_by_4;
    UWORD32 u4_rem_vert_loop;
    UWORD32 u4_height_finished;

    uint8x8_t reg_8x8_src_r0, reg_8x8_src_r1, reg_8x8_src_r2, reg_8x8_src_r3, reg_8x8_src_r4,
        reg_8x8_src_r5, reg_8x8_src_r6, reg_8x8_src_r7;

    uint16x8_t reg_16x8_src_r0, reg_16x8_src_r1, reg_16x8_src_r2, reg_16x8_src_r3, reg_16x8_src_r4,
        reg_16x8_src_r5, reg_16x8_src_r6, reg_16x8_src_r7;

    int16x8_t reg_16x8_mul_r0, reg_16x8_mul_r1, reg_16x8_mul_r2, reg_16x8_mul_r3, reg_16x8_mul_r4,
        reg_16x8_mul_r5, reg_16x8_mul_r6, reg_16x8_mul_r7;

    int32x4_t reg_32x4_sum_r0, reg_32x4_sum_r1, reg_32x4_sum_r2, reg_32x4_sum_r3, reg_32x4_sum_r4,
        reg_32x4_sum_r5, reg_32x4_sum_r6, reg_32x4_sum_r7;

    int32x4_t reg_32x4_sum_r01, reg_32x4_sum_r23, reg_32x4_sum_r45, reg_32x4_sum_r67,
        reg_32x4_sum_r89, reg_32x4_sum_r1011, reg_32x4_sum_r1213, reg_32x4_sum_r1415;

    uint8x8_t reg_8x8_src_r8, reg_8x8_src_r9, reg_8x8_src_r10, reg_8x8_src_r11, reg_8x8_src_r12,
        reg_8x8_src_r13, reg_8x8_src_r14, reg_8x8_src_r15;

    uint16x8_t reg_16x8_src_r8, reg_16x8_src_r9, reg_16x8_src_r10, reg_16x8_src_r11,
        reg_16x8_src_r12, reg_16x8_src_r13, reg_16x8_src_r14, reg_16x8_src_r15;

    int16x8_t reg_16x8_mul_r8, reg_16x8_mul_r9, reg_16x8_mul_r10, reg_16x8_mul_r11,
        reg_16x8_mul_r12, reg_16x8_mul_r13, reg_16x8_mul_r14, reg_16x8_mul_r15;

    int32x4_t reg_32x4_sum_r8, reg_32x4_sum_r9, reg_32x4_sum_r10, reg_32x4_sum_r11,
        reg_32x4_sum_r12, reg_32x4_sum_r13, reg_32x4_sum_r14, reg_32x4_sum_r15;

    uint8x16_t reg_8x16_src_r0, reg_8x16_src_r1, reg_8x16_src_r2, reg_8x16_src_r3, reg_8x16_src_r4,
        reg_8x16_src_r5, reg_8x16_src_r6, reg_8x16_src_r7;

    uint16x8_t reg_16x8_src_cb_r0, reg_16x8_src_cb_r1, reg_16x8_src_cb_r2, reg_16x8_src_cb_r3,
        reg_16x8_src_cb_r4, reg_16x8_src_cb_r5, reg_16x8_src_cb_r6, reg_16x8_src_cb_r7;

    uint16x8_t reg_16x8_src_cr_r0, reg_16x8_src_cr_r1, reg_16x8_src_cr_r2, reg_16x8_src_cr_r3,
        reg_16x8_src_cr_r4, reg_16x8_src_cr_r5, reg_16x8_src_cr_r6, reg_16x8_src_cr_r7;

    int16x8_t reg_16x8_mul_cb_r0, reg_16x8_mul_cb_r1, reg_16x8_mul_cb_r2, reg_16x8_mul_cb_r3,
        reg_16x8_mul_cb_r4, reg_16x8_mul_cb_r5, reg_16x8_mul_cb_r6, reg_16x8_mul_cb_r7;

    int16x8_t reg_16x8_mul_cr_r0, reg_16x8_mul_cr_r1, reg_16x8_mul_cr_r2, reg_16x8_mul_cr_r3,
        reg_16x8_mul_cr_r4, reg_16x8_mul_cr_r5, reg_16x8_mul_cr_r6, reg_16x8_mul_cr_r7;

    int32x4_t reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1, reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3,
        reg_32x4_sum_cb_r4, reg_32x4_sum_cb_r5, reg_32x4_sum_cb_r6, reg_32x4_sum_cb_r7;

    int32x4_t reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1, reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3,
        reg_32x4_sum_cr_r4, reg_32x4_sum_cr_r5, reg_32x4_sum_cr_r6, reg_32x4_sum_cr_r7;

    int32x4_t reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23, reg_32x4_sum_cb_r45, reg_32x4_sum_cb_r67;
    uint16x4_t reg_16x4_sum_cb_r01_23, reg_16x4_sum_cb_r45_67;
    uint16x8_t reg_16x8_sum_cb_r0_r7;
    uint8x8_t reg_8x8_sum_cb_r0_r7;

    int32x4_t reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23, reg_32x4_sum_cr_r45, reg_32x4_sum_cr_r67;
    uint16x4_t reg_16x4_sum_cr_r01_23, reg_16x4_sum_cr_r45_67;
    uint16x8_t reg_16x8_sum_cr_r0_r7;
    uint8x8_t reg_8x8_sum_cr_r0_r7;
    uint16x8_t reg_16x8_sum_cb_cr_r0_r3;
    uint8x8_t reg_8x8_sum_cb_cr_r0_r3;

    int32x4_t reg_32x4_sum_cb_cr_r0;
    uint16x4_t reg_16x4_sum_cb_cr_r0;

    int32x4_t reg_32x4_zero = vdupq_n_s32(0);

    uint16x4_t reg_16x4_sum_r01_23, reg_16x4_sum_r45_67;
    uint16x4_t reg_16x4_sum_r8_r11, reg_16x4_sum_r12_r15;
    uint16x8_t reg_16x8_sum_r0_r7, reg_16x8_sum_r8_r15;
    uint8x8_t reg_8x8_sum_r0_r7, reg_8x8_sum_r8_r15;
    uint8x16_t reg_8x16_sum_r0_r15;
    int8x8_t reg_8x8_filt_coeff_grid;
    int16x8_t reg_16x8_filt_coeff_grid;
    int32x4x2_t reg_32x4x2_sum_r01, reg_32x4x2_sum_r23, reg_32x4x2_sum_r45, reg_32x4x2_sum_r67;
    int32x4x2_t reg_32x4x2_sum_r89, reg_32x4x2_sum_r1011, reg_32x4x2_sum_r1213,
        reg_32x4x2_sum_r1415;
    uint8x16x2_t reg_8x16x2_src_r0, reg_8x16x2_src_r1, reg_8x16x2_src_r2, reg_8x16x2_src_r3;

    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;

    UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
    UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment;
    UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD32 u4_in_stride = ps_src->i4_data_stride;
    UWORD8 *pu1_dst = (UWORD8 *) ps_dst->pv_data;
    UWORD32 u4_out_stride = ps_dst->i4_data_stride;
    UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;

    /* Offset the input so that the input pixel to be processed
       coincides with the centre of the filter (4th coefficient) */
    pu1_src += (1 + u1_is_chroma);

    ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments);

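    /*************************************************************************/
    /* For every output column an 8-tap filter is selected by the phase     */
    /* (fractional part) of the centre position, which advances in          */
    /* DOWNSCALER_Q fixed point by u4_src_horz_increments per column.       */
    /* Each filtered sample corresponds to the scalar reference             */
    /*                                                                      */
    /*     sum = 0;                                                         */
    /*     for(k = 0; k < 8; k++)                                           */
    /*         sum += pi1_filter_grid[k] * pu1_in_pixel[k];                 */
    /*     out = (sum + 64) >> 7, clipped to the unsigned 8-bit range       */
    /*                                                                      */
    /* (for chroma, Cb and Cr are de-interleaved first and filtered         */
    /* independently). Several vertically adjacent rows are filtered per    */
    /* iteration and the results are written contiguously, producing a      */
    /* transposed output.                                                   */
    /*************************************************************************/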
    if(!u1_is_chroma)
    {
        u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4;
        u4_rem_vert_loop = u4_blk_ht % 16;

        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
        {
            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 4);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);

                pi1_filter_grid = pai1_filters[u1_phase];

                /* Doing the Calculation for current Loop Count */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                /******************************************************/
                /* This loop is going vertically in bottom direction  */
                /* but the output pixels are stored in horizontal     */
                /* direction in transpose manner                      */
                /******************************************************/

                /* r0-r7 */
                reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                reg_8x8_src_r1 = vld1_u8(pu1_in_pixel + u4_in_stride);
                reg_8x8_src_r2 = vld1_u8(pu1_in_pixel + 2 * u4_in_stride);
                reg_8x8_src_r3 = vld1_u8(pu1_in_pixel + 3 * u4_in_stride);
                reg_8x8_src_r4 = vld1_u8(pu1_in_pixel + 4 * u4_in_stride);
                reg_8x8_src_r5 = vld1_u8(pu1_in_pixel + 5 * u4_in_stride);
                reg_8x8_src_r6 = vld1_u8(pu1_in_pixel + 6 * u4_in_stride);
                reg_8x8_src_r7 = vld1_u8(pu1_in_pixel + 7 * u4_in_stride);

                /* r0-r7 */
                reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);
                reg_16x8_src_r1 = vmovl_u8(reg_8x8_src_r1);
                reg_16x8_src_r2 = vmovl_u8(reg_8x8_src_r2);
                reg_16x8_src_r3 = vmovl_u8(reg_8x8_src_r3);
                reg_16x8_src_r4 = vmovl_u8(reg_8x8_src_r4);
                reg_16x8_src_r5 = vmovl_u8(reg_8x8_src_r5);
                reg_16x8_src_r6 = vmovl_u8(reg_8x8_src_r6);
                reg_16x8_src_r7 = vmovl_u8(reg_8x8_src_r7);

                /* r8-r15 */
                reg_8x8_src_r8 = vld1_u8(pu1_in_pixel + 8 * u4_in_stride);
                reg_8x8_src_r9 = vld1_u8(pu1_in_pixel + 9 * u4_in_stride);
                reg_8x8_src_r10 = vld1_u8(pu1_in_pixel + 10 * u4_in_stride);
                reg_8x8_src_r11 = vld1_u8(pu1_in_pixel + 11 * u4_in_stride);
                reg_8x8_src_r12 = vld1_u8(pu1_in_pixel + 12 * u4_in_stride);
                reg_8x8_src_r13 = vld1_u8(pu1_in_pixel + 13 * u4_in_stride);
                reg_8x8_src_r14 = vld1_u8(pu1_in_pixel + 14 * u4_in_stride);
                reg_8x8_src_r15 = vld1_u8(pu1_in_pixel + 15 * u4_in_stride);

                reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                /* r0-r7 */
                reg_16x8_mul_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r7), reg_16x8_filt_coeff_grid);

                /* r8-r15 */
                reg_16x8_src_r8 = vmovl_u8(reg_8x8_src_r8);
                reg_16x8_src_r9 = vmovl_u8(reg_8x8_src_r9);
                reg_16x8_src_r10 = vmovl_u8(reg_8x8_src_r10);
                reg_16x8_src_r11 = vmovl_u8(reg_8x8_src_r11);
                reg_16x8_src_r12 = vmovl_u8(reg_8x8_src_r12);
                reg_16x8_src_r13 = vmovl_u8(reg_8x8_src_r13);
                reg_16x8_src_r14 = vmovl_u8(reg_8x8_src_r14);
                reg_16x8_src_r15 = vmovl_u8(reg_8x8_src_r15);

                /* r0-r7 */
                reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);
                reg_32x4_sum_r1 = vpaddlq_s16(reg_16x8_mul_r1);
                reg_32x4_sum_r2 = vpaddlq_s16(reg_16x8_mul_r2);
                reg_32x4_sum_r3 = vpaddlq_s16(reg_16x8_mul_r3);
                reg_32x4_sum_r4 = vpaddlq_s16(reg_16x8_mul_r4);
                reg_32x4_sum_r5 = vpaddlq_s16(reg_16x8_mul_r5);
                reg_32x4_sum_r6 = vpaddlq_s16(reg_16x8_mul_r6);
                reg_32x4_sum_r7 = vpaddlq_s16(reg_16x8_mul_r7);

                /* r8-r15 */
                reg_16x8_mul_r8 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r8), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r9 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r9), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r10 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r10), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r11 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r11), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r12 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r12), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r13 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r13), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r14 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r14), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r15 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r15), reg_16x8_filt_coeff_grid);

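                /******************************************************/
                /* Each vpaddlq_s16 leaves four pairwise sums per     */
                /* row; the vuzpq_s32 + vaddq_s32 stages fold them    */
                /* until one vector holds the complete 8-tap sums of  */
                /* four rows, after which vqrshrun_n_s32(., 7) does   */
                /* the rounding shift with unsigned saturation and    */
                /* vqmovn_u16 narrows the results to bytes            */
                /******************************************************/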
                /* r0-r7 */
                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_sum_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_r2, reg_32x4_sum_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r4, reg_32x4_sum_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_r6, reg_32x4_sum_r7);

                reg_32x4_sum_r01 = vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_r23 = vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_r45 = vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_r67 = vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                /* r8-r15 */
                reg_32x4_sum_r8 = vpaddlq_s16(reg_16x8_mul_r8);
                reg_32x4_sum_r9 = vpaddlq_s16(reg_16x8_mul_r9);
                reg_32x4_sum_r10 = vpaddlq_s16(reg_16x8_mul_r10);
                reg_32x4_sum_r11 = vpaddlq_s16(reg_16x8_mul_r11);
                reg_32x4_sum_r12 = vpaddlq_s16(reg_16x8_mul_r12);
                reg_32x4_sum_r13 = vpaddlq_s16(reg_16x8_mul_r13);
                reg_32x4_sum_r14 = vpaddlq_s16(reg_16x8_mul_r14);
                reg_32x4_sum_r15 = vpaddlq_s16(reg_16x8_mul_r15);

                /* r0-r7 */
                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_sum_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r45, reg_32x4_sum_r67);
                reg_32x4_sum_r01 = vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_r45 = vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                /* r8-r15 */
                reg_32x4x2_sum_r89 = vuzpq_s32(reg_32x4_sum_r8, reg_32x4_sum_r9);
                reg_32x4x2_sum_r1011 = vuzpq_s32(reg_32x4_sum_r10, reg_32x4_sum_r11);
                reg_32x4x2_sum_r1213 = vuzpq_s32(reg_32x4_sum_r12, reg_32x4_sum_r13);
                reg_32x4x2_sum_r1415 = vuzpq_s32(reg_32x4_sum_r14, reg_32x4_sum_r15);

                reg_32x4_sum_r89 = vaddq_s32(reg_32x4x2_sum_r89.val[0], reg_32x4x2_sum_r89.val[1]);
                reg_32x4_sum_r1011 =
                    vaddq_s32(reg_32x4x2_sum_r1011.val[0], reg_32x4x2_sum_r1011.val[1]);
                reg_32x4_sum_r1213 =
                    vaddq_s32(reg_32x4x2_sum_r1213.val[0], reg_32x4x2_sum_r1213.val[1]);
                reg_32x4_sum_r1415 =
                    vaddq_s32(reg_32x4x2_sum_r1415.val[0], reg_32x4x2_sum_r1415.val[1]);

                /* r0-r7 */
                reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);
                reg_16x4_sum_r45_67 = vqrshrun_n_s32(reg_32x4_sum_r45, 7);

                /* r8-r15 */
                reg_32x4x2_sum_r89 = vuzpq_s32(reg_32x4_sum_r89, reg_32x4_sum_r1011);
                reg_32x4x2_sum_r1213 = vuzpq_s32(reg_32x4_sum_r1213, reg_32x4_sum_r1415);
                reg_32x4_sum_r89 = vaddq_s32(reg_32x4x2_sum_r89.val[0], reg_32x4x2_sum_r89.val[1]);
                reg_32x4_sum_r1213 =
                    vaddq_s32(reg_32x4x2_sum_r1213.val[0], reg_32x4x2_sum_r1213.val[1]);

                /* r0-r7 */
                reg_16x8_sum_r0_r7 = vcombine_u16(reg_16x4_sum_r01_23, reg_16x4_sum_r45_67);
                reg_8x8_sum_r0_r7 = vqmovn_u16(reg_16x8_sum_r0_r7);

                reg_16x4_sum_r8_r11 = vqrshrun_n_s32(reg_32x4_sum_r89, 7);
                reg_16x4_sum_r12_r15 = vqrshrun_n_s32(reg_32x4_sum_r1213, 7);

                reg_16x8_sum_r8_r15 = vcombine_u16(reg_16x4_sum_r8_r11, reg_16x4_sum_r12_r15);
                reg_8x8_sum_r8_r15 = vqmovn_u16(reg_16x8_sum_r8_r15);

                reg_8x16_sum_r0_r15 = vcombine_u8(reg_8x8_sum_r0_r7, reg_8x8_sum_r8_r15);

                /* r0-r15 */
                vst1q_u8(pu1_out_pixel, reg_8x16_sum_r0_r15);

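                /* Advance the output past the 16 pixels just written and   */
                /* the input by 16 source rows (u4_src_vert_increments is   */
                /* unity in DOWNSCALER_Q format, as asserted above)         */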
                pu1_out_pixel += 16;
                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q;

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* Loop for the remaining height less than 16 */
        if(u4_rem_vert_loop)
        {
            u4_rem_vert_loop_by_8 = u4_rem_vert_loop >> 3;
            u4_rem_vert_loop = u4_rem_vert_loop % 8;

            u4_height_finished = (u4_num_iterations_vertical_by_16 << 4);

            pu1_src_j = pu1_src + (u4_height_finished * u4_in_stride);
            pu1_dst_j = pu1_dst + u4_height_finished;

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            /* 8 <= remaining height < 16 */
            if(u4_rem_vert_loop_by_8)
            {
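                /* u4_rem_vert_loop_by_8 is either 0 or 1 here, so the vertical */
                /* loop below runs at most once, covering one block of 8 rows   */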
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop_by_8; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop is going vertically in bottom direction  */
                        /* but the output pixels are stored in horizontal     */
                        /* direction in transpose manner                      */
                        /******************************************************/

                        reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                        reg_8x8_src_r1 = vld1_u8(pu1_in_pixel + u4_in_stride);
                        reg_8x8_src_r2 = vld1_u8(pu1_in_pixel + 2 * u4_in_stride);
                        reg_8x8_src_r3 = vld1_u8(pu1_in_pixel + 3 * u4_in_stride);
                        reg_8x8_src_r4 = vld1_u8(pu1_in_pixel + 4 * u4_in_stride);
                        reg_8x8_src_r5 = vld1_u8(pu1_in_pixel + 5 * u4_in_stride);
                        reg_8x8_src_r6 = vld1_u8(pu1_in_pixel + 6 * u4_in_stride);
                        reg_8x8_src_r7 = vld1_u8(pu1_in_pixel + 7 * u4_in_stride);

                        reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);
                        reg_16x8_src_r1 = vmovl_u8(reg_8x8_src_r1);
                        reg_16x8_src_r2 = vmovl_u8(reg_8x8_src_r2);
                        reg_16x8_src_r3 = vmovl_u8(reg_8x8_src_r3);
                        reg_16x8_src_r4 = vmovl_u8(reg_8x8_src_r4);
                        reg_16x8_src_r5 = vmovl_u8(reg_8x8_src_r5);
                        reg_16x8_src_r6 = vmovl_u8(reg_8x8_src_r6);
                        reg_16x8_src_r7 = vmovl_u8(reg_8x8_src_r7);
                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r1),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r2),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r3),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r4 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r4),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r5 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r5),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r6 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r6),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r7 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r7),
                                                    reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);
                        reg_32x4_sum_r1 = vpaddlq_s16(reg_16x8_mul_r1);
                        reg_32x4_sum_r2 = vpaddlq_s16(reg_16x8_mul_r2);
                        reg_32x4_sum_r3 = vpaddlq_s16(reg_16x8_mul_r3);
                        reg_32x4_sum_r4 = vpaddlq_s16(reg_16x8_mul_r4);
                        reg_32x4_sum_r5 = vpaddlq_s16(reg_16x8_mul_r5);
                        reg_32x4_sum_r6 = vpaddlq_s16(reg_16x8_mul_r6);
                        reg_32x4_sum_r7 = vpaddlq_s16(reg_16x8_mul_r7);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_sum_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_r2, reg_32x4_sum_r3);
                        reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r4, reg_32x4_sum_r5);
                        reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_r6, reg_32x4_sum_r7);

                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                        reg_32x4_sum_r45 =
                            vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                        reg_32x4_sum_r67 =
                            vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_sum_r23);
                        reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r45, reg_32x4_sum_r67);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_r45 =
                            vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                        reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);
                        reg_16x4_sum_r45_67 = vqrshrun_n_s32(reg_32x4_sum_r45, 7);

                        reg_16x8_sum_r0_r7 = vcombine_u16(reg_16x4_sum_r01_23, reg_16x4_sum_r45_67);
                        reg_8x8_sum_r0_r7 = vqmovn_u16(reg_16x8_sum_r0_r7);

                        vst1_u8(pu1_out_pixel, reg_8x8_sum_r0_r7);

                        pu1_out_pixel += 8;
                        pu1_in_pixel +=
                            (u4_src_vert_increments * (u4_in_stride << 3)) >> DOWNSCALER_Q;
                    }
                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }

            /* 1 <= remaining height < 8 */
            if(u4_rem_vert_loop)
            {
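                /* One source row is processed per iteration of the inner    */
                /* loop below; each iteration yields a single filtered byte, */
                /* stored with vst1_lane_u8 and advancing the transposed     */
                /* output row by one pixel                                    */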
                u4_height_finished =
                    ((u4_num_iterations_vertical_by_16 << 4) + (u4_rem_vert_loop_by_8 << 3));
                pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
                pu1_dst_j = pu1_dst + u4_height_finished;

                u4_center_pixel_pos = u4_center_pixel_pos_src;

                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop is going vertically in bottom direction  */
                        /* but the output pixels are stored in horizontal     */
                        /* direction in transpose manner                      */
                        /******************************************************/

                        reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                        reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0),
                                                    reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_zero);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_zero);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);

                        vst1_lane_u8(pu1_out_pixel, vreinterpret_u8_u16(reg_16x4_sum_r01_23), 0);
                        pu1_out_pixel += 1;
                        pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;
                    }
                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }
        }
    }
    /* for chroma */
    else
    {
        u4_num_iterations_vertical_by_8 = u4_blk_ht >> 3;
        u4_rem_vert_loop = u4_blk_ht % 8;

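        /*********************************************************************/
        /* The chroma source is interleaved CbCr: 16 bytes per row are      */
        /* loaded and de-interleaved with vuzpq_u8, Cb and Cr are filtered  */
        /* with the same 8-tap grid, and the transposed results go to two   */
        /* adjacent output rows (Cb at pu1_out_pixel, Cr at                 */
        /* pu1_out_pixel + u4_out_stride)                                   */
        /*********************************************************************/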
        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_8; j++)
        {
            pu1_src_j = pu1_src + ((j << 3) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 3);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_grid = pai1_filters[u1_phase];

                /* Doing the Calculation for current Loop Count */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                /******************************************************/
                /* This loop is going vertically in bottom direction  */
                /* but the output pixels are stored in horizontal     */
                /* direction in transpose manner                      */
                /******************************************************/

                reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);
                reg_8x16_src_r1 = vld1q_u8(pu1_in_pixel + u4_in_stride);
                reg_8x16_src_r2 = vld1q_u8(pu1_in_pixel + 2 * u4_in_stride);
                reg_8x16_src_r3 = vld1q_u8(pu1_in_pixel + 3 * u4_in_stride);
                reg_8x16_src_r4 = vld1q_u8(pu1_in_pixel + 4 * u4_in_stride);
                reg_8x16_src_r5 = vld1q_u8(pu1_in_pixel + 5 * u4_in_stride);
                reg_8x16_src_r6 = vld1q_u8(pu1_in_pixel + 6 * u4_in_stride);
                reg_8x16_src_r7 = vld1q_u8(pu1_in_pixel + 7 * u4_in_stride);

                reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r1);
                reg_8x16x2_src_r1 = vuzpq_u8(reg_8x16_src_r2, reg_8x16_src_r3);
                reg_8x16x2_src_r2 = vuzpq_u8(reg_8x16_src_r4, reg_8x16_src_r5);
                reg_8x16x2_src_r3 = vuzpq_u8(reg_8x16_src_r6, reg_8x16_src_r7);

                reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                reg_16x8_src_cb_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[0]));
                reg_16x8_src_cb_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[0]));
                reg_16x8_src_cb_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[0]));
                reg_16x8_src_cb_r4 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r2.val[0]));
                reg_16x8_src_cb_r5 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r2.val[0]));
                reg_16x8_src_cb_r6 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r3.val[0]));
                reg_16x8_src_cb_r7 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r3.val[0]));

                reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));
                reg_16x8_src_cr_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[1]));
                reg_16x8_src_cr_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[1]));
                reg_16x8_src_cr_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[1]));
                reg_16x8_src_cr_r4 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r2.val[1]));
                reg_16x8_src_cr_r5 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r2.val[1]));
                reg_16x8_src_cr_r6 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r3.val[1]));
                reg_16x8_src_cr_r7 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r3.val[1]));

                reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                reg_16x8_mul_cb_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r7), reg_16x8_filt_coeff_grid);

                reg_16x8_mul_cr_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r7), reg_16x8_filt_coeff_grid);

                reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                reg_32x4_sum_cb_r1 = vpaddlq_s16(reg_16x8_mul_cb_r1);
                reg_32x4_sum_cb_r2 = vpaddlq_s16(reg_16x8_mul_cb_r2);
                reg_32x4_sum_cb_r3 = vpaddlq_s16(reg_16x8_mul_cb_r3);
                reg_32x4_sum_cb_r4 = vpaddlq_s16(reg_16x8_mul_cb_r4);
                reg_32x4_sum_cb_r5 = vpaddlq_s16(reg_16x8_mul_cb_r5);
                reg_32x4_sum_cb_r6 = vpaddlq_s16(reg_16x8_mul_cb_r6);
                reg_32x4_sum_cb_r7 = vpaddlq_s16(reg_16x8_mul_cb_r7);

                reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);
                reg_32x4_sum_cr_r1 = vpaddlq_s16(reg_16x8_mul_cr_r1);
                reg_32x4_sum_cr_r2 = vpaddlq_s16(reg_16x8_mul_cr_r2);
                reg_32x4_sum_cr_r3 = vpaddlq_s16(reg_16x8_mul_cr_r3);
                reg_32x4_sum_cr_r4 = vpaddlq_s16(reg_16x8_mul_cr_r4);
                reg_32x4_sum_cr_r5 = vpaddlq_s16(reg_16x8_mul_cr_r5);
                reg_32x4_sum_cr_r6 = vpaddlq_s16(reg_16x8_mul_cr_r6);
                reg_32x4_sum_cr_r7 = vpaddlq_s16(reg_16x8_mul_cr_r7);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cb_r4, reg_32x4_sum_cb_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_cb_r6, reg_32x4_sum_cb_r7);

                reg_32x4_sum_cb_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cb_r23 =
                    vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_cb_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_cb_r67 =
                    vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cb_r45, reg_32x4_sum_cb_r67);
                reg_32x4_sum_cb_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cb_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cr_r4, reg_32x4_sum_cr_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_cr_r6, reg_32x4_sum_cr_r7);

                reg_32x4_sum_cr_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cr_r23 =
                    vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_cr_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_cr_r67 =
                    vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cr_r45, reg_32x4_sum_cr_r67);
                reg_32x4_sum_cr_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cr_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                reg_16x4_sum_cb_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cb_r01, 7);
                reg_16x4_sum_cb_r45_67 = vqrshrun_n_s32(reg_32x4_sum_cb_r45, 7);

                reg_16x4_sum_cr_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cr_r01, 7);
                reg_16x4_sum_cr_r45_67 = vqrshrun_n_s32(reg_32x4_sum_cr_r45, 7);

                reg_16x8_sum_cb_r0_r7 =
                    vcombine_u16(reg_16x4_sum_cb_r01_23, reg_16x4_sum_cb_r45_67);
                reg_16x8_sum_cr_r0_r7 =
                    vcombine_u16(reg_16x4_sum_cr_r01_23, reg_16x4_sum_cr_r45_67);

                reg_8x8_sum_cb_r0_r7 = vqmovn_u16(reg_16x8_sum_cb_r0_r7);
                reg_8x8_sum_cr_r0_r7 = vqmovn_u16(reg_16x8_sum_cr_r0_r7);

                vst1_u8(pu1_out_pixel, reg_8x8_sum_cb_r0_r7);
                vst1_u8(pu1_out_pixel + u4_out_stride, reg_8x8_sum_cr_r0_r7);

                pu1_out_pixel += 8;

                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 3)) >> DOWNSCALER_Q;

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* Loop for the remaining height less than 8 */
        if(u4_rem_vert_loop)
        {
            u4_rem_vert_loop_by_4 = u4_rem_vert_loop >> 2;
            u4_rem_vert_loop = u4_rem_vert_loop % 4;
            u4_height_finished = (u4_num_iterations_vertical_by_8 << 3);
            pu1_src_j = pu1_src + (u4_height_finished * u4_in_stride);
            pu1_dst_j = pu1_dst + u4_height_finished;

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            /* 4 <= remaining height < 8 */
            if(u4_rem_vert_loop_by_4)
            {
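                /* u4_rem_vert_loop_by_4 is either 0 or 1 here, so the vertical */
                /* loop below runs at most once, covering one block of 4 rows   */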
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop_by_4; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop is going vertically in bottom direction  */
                        /* but the output pixels are stored in horizontal     */
                        /* direction in transpose manner                      */
                        /******************************************************/

                        reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);
                        reg_8x16_src_r1 = vld1q_u8(pu1_in_pixel + u4_in_stride);
                        reg_8x16_src_r2 = vld1q_u8(pu1_in_pixel + 2 * u4_in_stride);
                        reg_8x16_src_r3 = vld1q_u8(pu1_in_pixel + 3 * u4_in_stride);

                        reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r1);
                        reg_8x16x2_src_r1 = vuzpq_u8(reg_8x16_src_r2, reg_8x16_src_r3);

                        reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cb_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cb_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[0]));
                        reg_16x8_src_cb_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[0]));

                        reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));
                        reg_16x8_src_cr_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[1]));
                        reg_16x8_src_cr_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[1]));
                        reg_16x8_src_cr_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[1]));

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_cb_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r1),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r2),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r3),
                                                       reg_16x8_filt_coeff_grid);

                        reg_16x8_mul_cr_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r1),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r2),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r3),
                                                       reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                        reg_32x4_sum_cb_r1 = vpaddlq_s16(reg_16x8_mul_cb_r1);
                        reg_32x4_sum_cb_r2 = vpaddlq_s16(reg_16x8_mul_cb_r2);
                        reg_32x4_sum_cb_r3 = vpaddlq_s16(reg_16x8_mul_cb_r3);

                        reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);
                        reg_32x4_sum_cr_r1 = vpaddlq_s16(reg_16x8_mul_cr_r1);
                        reg_32x4_sum_cr_r2 = vpaddlq_s16(reg_16x8_mul_cr_r2);
                        reg_32x4_sum_cr_r3 = vpaddlq_s16(reg_16x8_mul_cr_r3);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3);
                        reg_32x4_sum_cb_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_cb_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23);
                        reg_32x4_sum_cb_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3);
                        reg_32x4_sum_cr_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_cr_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23);
                        reg_32x4_sum_cr_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_cb_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cb_r01, 7);
                        reg_16x4_sum_cr_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cr_r01, 7);

                        reg_16x8_sum_cb_cr_r0_r3 =
                            vcombine_u16(reg_16x4_sum_cb_r01_23, reg_16x4_sum_cr_r01_23);
                        reg_8x8_sum_cb_cr_r0_r3 = vmovn_u16(reg_16x8_sum_cb_cr_r0_r3);
                        vst1_lane_u32((uint32_t *) (pu1_out_pixel),
                                      vreinterpret_u32_u8(reg_8x8_sum_cb_cr_r0_r3), 0);
                        vst1_lane_u32((uint32_t *) (pu1_out_pixel + u4_out_stride),
                                      vreinterpret_u32_u8(reg_8x8_sum_cb_cr_r0_r3), 1);

                        pu1_out_pixel += 4;

                        pu1_in_pixel +=
                            (u4_src_vert_increments * (u4_in_stride << 2)) >> DOWNSCALER_Q;
                    }
                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }

            /* 1 <= remaining height < 4 */
            if(u4_rem_vert_loop)
            {
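                /* Tail of 1 to 3 chroma rows: one interleaved row is filtered */
                /* per iteration. After the uzp/add reduction the Cb sum sits  */
                /* in 16-bit lane 0 and the Cr sum in lane 1, i.e. byte lanes  */
                /* 0 and 2 of the reinterpreted vector, which is why lane 2 is */
                /* stored to the Cr output row                                 */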
                u4_height_finished =
                    ((u4_num_iterations_vertical_by_8 << 3) + (u4_rem_vert_loop_by_4 << 2));
                pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
                pu1_dst_j = pu1_dst + u4_height_finished;

                u4_center_pixel_pos = u4_center_pixel_pos_src;
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop is going vertically in bottom direction  */
                        /* but the output pixels are stored in horizontal     */
                        /* direction in transpose manner                      */
                        /******************************************************/

                        reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);

                        reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r0);
                        reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_cb_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0),
                                                       reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                        reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cr_r0);
                        reg_32x4_sum_cb_cr_r0 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_cr_r0, reg_32x4_zero);
                        reg_32x4_sum_cb_cr_r0 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_cb_cr_r0 = vqrshrun_n_s32(reg_32x4_sum_cb_cr_r0, 7);
                        vst1_lane_u8((pu1_out_pixel), vreinterpret_u8_u16(reg_16x4_sum_cb_cr_r0),
                                     0);
                        vst1_lane_u8((pu1_out_pixel + u4_out_stride),
                                     vreinterpret_u8_u16(reg_16x4_sum_cb_cr_r0), 2);

                        pu1_out_pixel += 1;

                        pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride)) >> DOWNSCALER_Q;
                    }

                    /* Update the context for next Loop Count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }
        }
    }
}