xref: /aosp_15_r20/external/libavc/common/arm/svc/isvc_intra_sampling_neon.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21  * *******************************************************************************
22  * * @file
23  *  isvc_intra_sampling_neon.c
24  *
25  * @brief
26  *  neon variants of intra sampling functions used by IBL mode
27  *
28  * *******************************************************************************
29  */
30 
31 #include <arm_neon.h>
32 #include <string.h>
33 
34 #include "ih264_typedefs.h"
35 #include "isvc_intra_resample.h"
36 
/**
 *******************************************************************************
 * @brief
 *  Dyadic (2x) upsampling of a luma block: the 8-bit reference samples are
 *  first filtered vertically with a 4-tap filter into a 16-bit temporary
 *  buffer (stride 12), then filtered horizontally with a 4-tap filter,
 *  rounded ((v + 512) >> 10) and saturated to 8 bits into the output.
 *
 * @param[in] pu1_inp_buf
 *  Pointer to 8-bit reference samples (row stride DYADIC_REF_W_Y)
 * @param[in] pi2_tmp_filt_buf
 *  Scratch buffer for the 16-bit vertically filtered samples (stride 12)
 * @param[out] pu1_out_buf
 *  Pointer to the 8-bit output block (16 rows, 16 cols written per row)
 * @param[in] i4_out_stride
 *  Output buffer row stride
 *******************************************************************************
 */
void isvc_interpolate_base_luma_dyadic_neon(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                            UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    WORD16 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp = pu1_inp_buf;
    UWORD8 *pu1_out = pu1_out_buf;
    WORD16 *pi2_tmp = pi2_tmp_filt_buf;

    int16x4_t i4_rslt_vert_16x4_1, i4_rslt_vert_16x4_2;
    uint8x8_t i4_samp_vert_8x8_0, i4_samp_vert_8x8_1, i4_samp_vert_8x8_2, i4_samp_vert_8x8_3;
    int16x8_t i4_rslt_vert_16x8_0, i4_rslt_vert_16x8_2;

    /* Horizontal interpolation */
    int32x4_t i4_rslt_horz_r0_1, i4_rslt_horz_r1_1, i4_rslt_horz_r0_2, i4_rslt_horz_r1_2;
    uint16x4_t i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp, i4_rslt_horz_r0_2_tmp,
        i4_rslt_horz_r1_2_tmp;
    uint16x8_t rslt_16x8_t_1, rslt_16x8_t_2;

    int16x4_t i4_samp_horz_16x4_0, i4_samp_horz_16x4_1, i4_samp_horz_16x4_2, i4_samp_horz_16x4_3,
        i4_samp_horz_16x4_4;
    int16x4_t i4_samp_horz_16x4_5, i4_samp_horz_16x4_6, i4_samp_horz_16x4_7, i4_samp_horz_16x4_8;
    /* Same filter taps as i4_coeff_0..3 below, kept as int16_t for the
     * widening vmull_n_s16/vmlal_n_s16 horizontal stage */
    int16_t i4_coeff_c0 = -3;
    int16_t i4_coeff_c1 = 28;
    int16_t i4_coeff_c2 = 8;
    int16_t i4_coeff_c3 = -1;
    int32x4x2_t i4_rslt_horz_r0_tmp32, i4_rslt_horz_r1_tmp32;
    /* Rounding offset for the final (v + 512) >> 10 */
    int32x4_t const_512_32x4 = vdupq_n_s32(512);

    /* Filter coefficient values for phase 4 */
    i4_coeff_0 = -3;
    i4_coeff_1 = 28;
    i4_coeff_2 = 8;
    i4_coeff_3 = -1;

    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;

    /* Vertical interpolation */
    {
        /* First 64 bits (left 8 columns): prime the 4-tap window with the
         * first four source rows */
        i4_samp_vert_8x8_0 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_1 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_2 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;

        /* y = 0: taps applied in reversed order (c3, c2, c1, c0) */
        i4_rslt_vert_16x8_0 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);

        vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_0);
        pi2_tmp += i4_filt_stride;

        /* Each iteration slides the 4-row window down by one source row and
         * emits two filtered rows: one with taps (c0..c3), one with the taps
         * reversed (c3..c0) */
        for(i4_y = 1; i4_y < 15; i4_y += 2)
        {
            i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
            i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
            i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
            i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

            i4_rslt_vert_16x8_0 =
                vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
            i4_rslt_vert_16x8_0 =
                vmlaq_n_s16(i4_rslt_vert_16x8_0,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
            i4_rslt_vert_16x8_0 =
                vmlaq_n_s16(i4_rslt_vert_16x8_0,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
            i4_rslt_vert_16x8_0 =
                vmlaq_n_s16(i4_rslt_vert_16x8_0,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);

            i4_rslt_vert_16x8_2 =
                vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
            i4_rslt_vert_16x8_2 =
                vmlaq_n_s16(i4_rslt_vert_16x8_2,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
            i4_rslt_vert_16x8_2 =
                vmlaq_n_s16(i4_rslt_vert_16x8_2,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
            i4_rslt_vert_16x8_2 =
                vmlaq_n_s16(i4_rslt_vert_16x8_2,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);

            vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
            pi2_tmp += i4_filt_stride;
            vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_2));
            pi2_tmp += i4_filt_stride;
            pu1_inp += i4_src_stride;
        }

        /* y = 15, y_phase = 4 */
        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

        i4_rslt_vert_16x8_0 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);

        vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
    }

    {
        /* Remaining 32 bits (columns 8..11): same filtering as above but on
         * 4-lane vectors, starting 8 samples into each row */
        pu1_inp = pu1_inp_buf + 8;
        pi2_tmp = pi2_tmp_filt_buf + 8;

        i4_samp_vert_8x8_0 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_1 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_2 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;

        /* y = 0: reversed taps, as in the 8-lane pass */
        i4_rslt_vert_16x4_1 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_2);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_1);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_0);

        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
        pi2_tmp += i4_filt_stride;

        /* Two filtered rows per iteration, window slides one source row */
        for(i4_y = 1; i4_y < 15; i4_y += 2)
        {
            i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
            i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
            i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
            i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

            i4_rslt_vert_16x4_1 = vmul_n_s16(
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
            i4_rslt_vert_16x4_1 = vmla_n_s16(
                i4_rslt_vert_16x4_1,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_1);
            i4_rslt_vert_16x4_1 = vmla_n_s16(
                i4_rslt_vert_16x4_1,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_2);
            i4_rslt_vert_16x4_1 = vmla_n_s16(
                i4_rslt_vert_16x4_1,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_3);

            i4_rslt_vert_16x4_2 = vmul_n_s16(
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
            i4_rslt_vert_16x4_2 = vmla_n_s16(
                i4_rslt_vert_16x4_2,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_2);
            i4_rslt_vert_16x4_2 = vmla_n_s16(
                i4_rslt_vert_16x4_2,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_1);
            i4_rslt_vert_16x4_2 = vmla_n_s16(
                i4_rslt_vert_16x4_2,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_0);

            vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
            pi2_tmp += i4_filt_stride;
            vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_2));
            pi2_tmp += i4_filt_stride;
            pu1_inp += i4_src_stride;
        }

        /* y = 15, y_phase = 4 */
        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

        i4_rslt_vert_16x4_1 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_1);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_2);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_3);

        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    }

    /* Horizontal interpolation: for each of the 16 rows, filter 9 overlapping
     * 4-lane windows of the 16-bit temp samples; even output columns use taps
     * (c3, c2, c1, c0) = (-1, 8, 28, -3), odd columns the reverse, then the
     * even/odd results are zipped back into column order */
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        i4_samp_horz_16x4_0 = vld1_s16(pi2_tmp);
        i4_samp_horz_16x4_1 = vld1_s16(pi2_tmp + 1);
        i4_samp_horz_16x4_2 = vld1_s16(pi2_tmp + 2);
        i4_samp_horz_16x4_3 = vld1_s16(pi2_tmp + 3);
        i4_samp_horz_16x4_4 = vld1_s16(pi2_tmp + 4);
        i4_samp_horz_16x4_5 = vld1_s16(pi2_tmp + 5);
        i4_samp_horz_16x4_6 = vld1_s16(pi2_tmp + 6);
        i4_samp_horz_16x4_7 = vld1_s16(pi2_tmp + 7);
        i4_samp_horz_16x4_8 = vld1_s16(pi2_tmp + 8);

        i4_rslt_horz_r0_1 =
            vmull_n_s16(i4_samp_horz_16x4_0, i4_coeff_c3); /* a0c3 a1c3  a2c3  a3c3 */
        i4_rslt_horz_r0_1 =
            vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_1,
                        i4_coeff_c2); /* a0c3+a1c2 a1c3+a2c2  a2c3+a3c2  a3c3+a4c2 */
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_2, i4_coeff_c1);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_3, i4_coeff_c0);
        /* i4_rslt_horz_r0_1 : contains res at even pos:0,2,4,6 */

        i4_rslt_horz_r1_1 =
            vmull_n_s16(i4_samp_horz_16x4_1, i4_coeff_c0); /* a1c0 a2c0  a3c0  a4c0 */
        i4_rslt_horz_r1_1 =
            vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_2,
                        i4_coeff_c1); /* a1c0+a2c1 a2c0+a3c1  a3c0+a4c1  a4c0+a5c1 */
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_3, i4_coeff_c2);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_4, i4_coeff_c3);
        /* i4_rslt_horz_r1_1 : contains res at odd pos:1,3,5,7 */

        i4_rslt_horz_r0_2 =
            vmull_n_s16(i4_samp_horz_16x4_4, i4_coeff_c3); /* a4c3 a5c3  a6c3  a7c3 */
        i4_rslt_horz_r0_2 =
            vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_5,
                        i4_coeff_c2); /* a4c3+a5c2 a5c3+a6c2  a6c3+a7c2  a7c3+a8c2 */
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_6, i4_coeff_c1);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_7, i4_coeff_c0);
        /* i4_rslt_horz_r0_2 : contains res at even pos:8,10,12,14 */

        i4_rslt_horz_r1_2 =
            vmull_n_s16(i4_samp_horz_16x4_5, i4_coeff_c0); /* a5c0 a6c0  a7c0  a8c0 */
        i4_rslt_horz_r1_2 =
            vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_6,
                        i4_coeff_c1); /* a5c0+a6c1 a6c0+a7c1  a7c0+a8c1  a8c0+a9c1 */
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_7, i4_coeff_c2);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_8, i4_coeff_c3);
        /* i4_rslt_horz_r1_2 : contains res at odd pos:9,11,13,15 */

        /* Interleave even/odd results back into column order */
        i4_rslt_horz_r0_tmp32 = vzipq_s32(i4_rslt_horz_r0_1, i4_rslt_horz_r1_1);
        i4_rslt_horz_r1_tmp32 = vzipq_s32(i4_rslt_horz_r0_2, i4_rslt_horz_r1_2);

        /* Add rounding offset, then (v + 512) >> 10 with unsigned saturation */
        i4_rslt_horz_r0_1 = vaddq_s32(i4_rslt_horz_r0_tmp32.val[0], const_512_32x4);
        i4_rslt_horz_r1_1 = vaddq_s32(i4_rslt_horz_r0_tmp32.val[1], const_512_32x4);
        i4_rslt_horz_r0_2 = vaddq_s32(i4_rslt_horz_r1_tmp32.val[0], const_512_32x4);
        i4_rslt_horz_r1_2 = vaddq_s32(i4_rslt_horz_r1_tmp32.val[1], const_512_32x4);

        i4_rslt_horz_r0_1_tmp = vqshrun_n_s32(i4_rslt_horz_r0_1, 10);
        i4_rslt_horz_r1_1_tmp = vqshrun_n_s32(i4_rslt_horz_r1_1, 10);

        i4_rslt_horz_r0_2_tmp = vqshrun_n_s32(i4_rslt_horz_r0_2, 10);
        i4_rslt_horz_r1_2_tmp = vqshrun_n_s32(i4_rslt_horz_r1_2, 10);

        rslt_16x8_t_1 = vcombine_u16(i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp);
        rslt_16x8_t_2 = vcombine_u16(i4_rslt_horz_r0_2_tmp, i4_rslt_horz_r1_2_tmp);

        /* Saturating narrow to 8 bits and store 16 output pixels */
        vst1_u8(pu1_out, vqmovn_u16(rslt_16x8_t_1));
        vst1_u8(pu1_out + 8, vqmovn_u16(rslt_16x8_t_2));

        pu1_out += i4_out_stride;
        pi2_tmp += i4_filt_stride;
    }
}
319 
/**
 *******************************************************************************
 * @brief
 *  Horizontal 2x upsampling of an 8x8 chroma block from the vertically
 *  filtered 16-bit samples. Even output columns use taps
 *  (16 - i4_phase_0, i4_phase_0); odd columns use
 *  (16 - i4_phase_1, i4_phase_1). Results are rounded ((v + 128) >> 8) and
 *  merged into every second byte of the output row (the other bytes of the
 *  existing output are preserved via a byte select — presumably the other
 *  chroma plane of an interleaved buffer; confirm against the caller).
 *
 * @param[in] pi2_tmp_filt_buf
 *  16-bit vertically filtered samples (row stride 6)
 * @param[out] pu1_out_buf
 *  8-bit output buffer, read-modify-written
 * @param[in] i4_out_stride
 *  Output buffer row stride
 * @param[in] i4_phase_0
 *  Filter phase for even output columns
 * @param[in] i4_phase_1
 *  Filter phase for odd output columns
 *******************************************************************************
 */
void isvc_horz_interpol_chroma_dyadic_neon(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                           WORD32 i4_out_stride, WORD32 i4_phase_0,
                                           WORD32 i4_phase_1)
{
    WORD32 i4_row;
    WORD16 *pi2_src = pi2_tmp_filt_buf;
    UWORD8 *pu1_dst = pu1_out_buf;
    const WORD32 i4_src_stride = 6;

    const WORD32 i4_c0 = 16 - i4_phase_0;
    const WORD32 i4_c1 = i4_phase_0;
    const WORD32 i4_c2 = 16 - i4_phase_1;
    const WORD32 i4_c3 = i4_phase_1;

    /* Mask selecting the low byte of every 16-bit lane (the positions this
     * plane's samples occupy) */
    const uint8x16_t mask_low_bytes = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    /* Two rows per iteration */
    for(i4_row = 0; i4_row < 8; i4_row += 2)
    {
        /* Three overlapping 8-lane windows per row: a[0..], a[1..], a[2..] */
        int16x8_t samp_r0_a = vld1q_s16(pi2_src);
        int16x8_t samp_r0_b = vld1q_s16(pi2_src + 1);
        int16x8_t samp_r0_c = vld1q_s16(pi2_src + 2);
        int16x8_t samp_r1_a = vld1q_s16(pi2_src + i4_src_stride);
        int16x8_t samp_r1_b = vld1q_s16(pi2_src + i4_src_stride + 1);
        int16x8_t samp_r1_c = vld1q_s16(pi2_src + i4_src_stride + 2);

        /* Even columns: a[i]*c0 + a[i+1]*c1 */
        int16x8_t even_r0 = vmlaq_n_s16(vmulq_n_s16(samp_r0_a, i4_c0), samp_r0_b, i4_c1);
        int16x8_t even_r1 = vmlaq_n_s16(vmulq_n_s16(samp_r1_a, i4_c0), samp_r1_b, i4_c1);

        /* Odd columns: a[i+1]*c2 + a[i+2]*c3 */
        int16x8_t odd_r0 = vmlaq_n_s16(vmulq_n_s16(samp_r0_b, i4_c2), samp_r0_c, i4_c3);
        int16x8_t odd_r1 = vmlaq_n_s16(vmulq_n_s16(samp_r1_b, i4_c2), samp_r1_c, i4_c3);

        /* Interleave even/odd back into column order, keep the low 8 lanes,
         * then round with (v + 128) >> 8 */
        int16x8_t out_r0 = vrshrq_n_s16(vzipq_s16(even_r0, odd_r0).val[0], 8);
        int16x8_t out_r1 = vrshrq_n_s16(vzipq_s16(even_r1, odd_r1).val[0], 8);

        /* Read-modify-write: replace only the low byte of each 16-bit lane */
        uint8x16_t dst_r0 = vld1q_u8(pu1_dst);
        uint8x16_t dst_r1 = vld1q_u8(pu1_dst + i4_out_stride);

        dst_r0 = vbslq_u8(mask_low_bytes, vreinterpretq_u8_s16(out_r0), dst_r0);
        dst_r1 = vbslq_u8(mask_low_bytes, vreinterpretq_u8_s16(out_r1), dst_r1);

        vst1q_u8(pu1_dst, dst_r0);
        vst1q_u8(pu1_dst + i4_out_stride, dst_r1);

        pi2_src += 2 * i4_src_stride;
        pu1_dst += 2 * i4_out_stride;
    }
}
401 
/**
 *******************************************************************************
 * @brief
 *  Vertical 2x upsampling of a chroma block: six 8-bit reference rows are
 *  filtered into eight 16-bit rows (temp buffer row stride 6). Even output
 *  rows blend source rows (k, k+1) with taps (16 - i4_phase_0, i4_phase_0);
 *  odd rows blend (k+1, k+2) with (16 - i4_phase_1, i4_phase_1), k = row/2.
 *
 * @param[in] pu1_inp_buf
 *  8-bit reference samples (row stride DYADIC_REF_W_C)
 * @param[out] pi2_tmp_filt_buf
 *  16-bit output scratch buffer (8 rows of 6 samples)
 * @param[in] i4_phase_0
 *  Filter phase for even output rows
 * @param[in] i4_phase_1
 *  Filter phase for odd output rows
 *******************************************************************************
 */
void isvc_vert_interpol_chroma_dyadic_neon(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                           WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD32 i4_row, i4_idx;
    const UWORD8 *pu1_src = pu1_inp_buf;

    const WORD32 i4_c0 = 16 - i4_phase_0;
    const WORD32 i4_c1 = i4_phase_0;
    const WORD32 i4_c2 = 16 - i4_phase_1;
    const WORD32 i4_c3 = i4_phase_1;

    int16x8_t ai2_src_rows[6];

    /* Load the six reference rows and widen them to 16 bits up-front */
    for(i4_idx = 0; i4_idx < 6; i4_idx++)
    {
        ai2_src_rows[i4_idx] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_src)));
        pu1_src += DYADIC_REF_W_C;
    }

    /* Rows 0..6: full 8-lane stores at stride 6; each store's two extra
     * lanes are overwritten by the next row's store */
    for(i4_row = 0; i4_row < 7; i4_row++)
    {
        const WORD32 i4_k = i4_row >> 1;
        int16x8_t i2_rslt;

        if(i4_row & 1)
        {
            /* Odd row: src[k+1]*c2 + src[k+2]*c3 */
            i2_rslt = vmlaq_n_s16(vmulq_n_s16(ai2_src_rows[i4_k + 1], i4_c2),
                                  ai2_src_rows[i4_k + 2], i4_c3);
        }
        else
        {
            /* Even row: src[k]*c0 + src[k+1]*c1 */
            i2_rslt = vmlaq_n_s16(vmulq_n_s16(ai2_src_rows[i4_k], i4_c0),
                                  ai2_src_rows[i4_k + 1], i4_c1);
        }

        vst1q_s16(pi2_tmp_filt_buf + 6 * i4_row, i2_rslt);
    }

    /* Row 7: only 6 of the 8 lanes fit before the end of the temp buffer,
     * so store the low half plus lanes 4 and 5 individually */
    {
        int16x8_t i2_rslt = vmlaq_n_s16(vmulq_n_s16(ai2_src_rows[4], i4_c2),
                                        ai2_src_rows[5], i4_c3);

        vst1_s16(pi2_tmp_filt_buf + 42, vget_low_s16(i2_rslt));
        vst1q_lane_s16(pi2_tmp_filt_buf + 46, i2_rslt, 4);
        vst1q_lane_s16(pi2_tmp_filt_buf + 47, i2_rslt, 5);
    }
}
485 }
486