/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_intra_sampling_neon.c
 *
 * @brief
 *  neon variants of intra sampling functions used by IBL mode
 *
 *******************************************************************************
 */

#include <arm_neon.h>
#include <string.h>

#include "ih264_typedefs.h"
#include "isvc_intra_resample.h"

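/**
 *******************************************************************************
 *
 * @brief
 *  NEON variant of dyadic (2x) intra upsampling for luma. As implemented
 *  below, a 12x12 region of reference-layer samples (stride DYADIC_REF_W_Y)
 *  is filtered vertically with the 4-tap coefficients {-3, 28, 8, -1} into a
 *  16x12 array of 16-bit sums (stride 12), which is then filtered
 *  horizontally, rounded (offset 512, right shift 10 to remove the combined
 *  32 x 32 filter gain) and saturated to produce the 16x16 output block.
 *
 * @param[in] pu1_inp_buf
 *  Pointer to the reference-layer input samples
 *
 * @param[in] pi2_tmp_filt_buf
 *  Pointer to a 16-bit scratch buffer for the vertically filtered samples
 *
 * @param[out] pu1_out_buf
 *  Pointer to the 16x16 output buffer
 *
 * @param[in] i4_out_stride
 *  Output buffer stride
 *
 *******************************************************************************
 */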
void isvc_interpolate_base_luma_dyadic_neon(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                            UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    WORD16 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp = pu1_inp_buf;
    UWORD8 *pu1_out = pu1_out_buf;
    WORD16 *pi2_tmp = pi2_tmp_filt_buf;

    int16x4_t i4_rslt_vert_16x4_1, i4_rslt_vert_16x4_2;
    uint8x8_t i4_samp_vert_8x8_0, i4_samp_vert_8x8_1, i4_samp_vert_8x8_2, i4_samp_vert_8x8_3;
    int16x8_t i4_rslt_vert_16x8_0, i4_rslt_vert_16x8_2;

    /* Horizontal interpolation */
    int32x4_t i4_rslt_horz_r0_1, i4_rslt_horz_r1_1, i4_rslt_horz_r0_2, i4_rslt_horz_r1_2;
    uint16x4_t i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp, i4_rslt_horz_r0_2_tmp,
        i4_rslt_horz_r1_2_tmp;
    uint16x8_t rslt_16x8_t_1, rslt_16x8_t_2;

    int16x4_t i4_samp_horz_16x4_0, i4_samp_horz_16x4_1, i4_samp_horz_16x4_2, i4_samp_horz_16x4_3,
        i4_samp_horz_16x4_4;
    int16x4_t i4_samp_horz_16x4_5, i4_samp_horz_16x4_6, i4_samp_horz_16x4_7, i4_samp_horz_16x4_8;
    int16_t i4_coeff_c0 = -3;
    int16_t i4_coeff_c1 = 28;
    int16_t i4_coeff_c2 = 8;
    int16_t i4_coeff_c3 = -1;
    int32x4x2_t i4_rslt_horz_r0_tmp32, i4_rslt_horz_r1_tmp32;
    int32x4_t const_512_32x4 = vdupq_n_s32(512);

    /* Filter coefficient values for phase 4 */
    i4_coeff_0 = -3;
    i4_coeff_1 = 28;
    i4_coeff_2 = 8;
    i4_coeff_3 = -1;

    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;

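    /* The vertical pass filters 12 input rows into 16 rows of 16-bit sums at */
    /* stride 12: columns 0-7 are handled eight at a time in the block below, */
    /* columns 8-11 are handled four at a time in the block that follows.     */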
    /* Vertical interpolation */
    {
        /* First 64 bits */
        i4_samp_vert_8x8_0 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_1 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_2 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;

        i4_rslt_vert_16x8_0 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);

        vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_0);
        pi2_tmp += i4_filt_stride;

        for(i4_y = 1; i4_y < 15; i4_y += 2)
        {
            i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
            i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
            i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
            i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

            i4_rslt_vert_16x8_0 =
                vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
            i4_rslt_vert_16x8_0 =
                vmlaq_n_s16(i4_rslt_vert_16x8_0,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
            i4_rslt_vert_16x8_0 =
                vmlaq_n_s16(i4_rslt_vert_16x8_0,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
            i4_rslt_vert_16x8_0 =
                vmlaq_n_s16(i4_rslt_vert_16x8_0,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);

            i4_rslt_vert_16x8_2 =
                vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
            i4_rslt_vert_16x8_2 =
                vmlaq_n_s16(i4_rslt_vert_16x8_2,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
            i4_rslt_vert_16x8_2 =
                vmlaq_n_s16(i4_rslt_vert_16x8_2,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
            i4_rslt_vert_16x8_2 =
                vmlaq_n_s16(i4_rslt_vert_16x8_2,
                            vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);

            vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
            pi2_tmp += i4_filt_stride;
            vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_2));
            pi2_tmp += i4_filt_stride;
            pu1_inp += i4_src_stride;
        }

        /* y = 15, y_phase = 4 */
        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

        i4_rslt_vert_16x8_0 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);

        vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
    }

    {
        /* Remaining 32 bits */
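        /* Columns 8-11 of the intermediate buffer: eight bytes are loaded    */
        /* per row, but only the low four widened lanes are filtered and a    */
        /* 4-sample (64-bit) store is used.                                   */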
        pu1_inp = pu1_inp_buf + 8;
        pi2_tmp = pi2_tmp_filt_buf + 8;

        i4_samp_vert_8x8_0 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_1 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_2 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);
        pu1_inp += i4_src_stride;

        i4_rslt_vert_16x4_1 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_2);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_1);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_0);

        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
        pi2_tmp += i4_filt_stride;

        for(i4_y = 1; i4_y < 15; i4_y += 2)
        {
            i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
            i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
            i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
            i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

            i4_rslt_vert_16x4_1 = vmul_n_s16(
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
            i4_rslt_vert_16x4_1 = vmla_n_s16(
                i4_rslt_vert_16x4_1,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_1);
            i4_rslt_vert_16x4_1 = vmla_n_s16(
                i4_rslt_vert_16x4_1,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_2);
            i4_rslt_vert_16x4_1 = vmla_n_s16(
                i4_rslt_vert_16x4_1,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_3);

            i4_rslt_vert_16x4_2 = vmul_n_s16(
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
            i4_rslt_vert_16x4_2 = vmla_n_s16(
                i4_rslt_vert_16x4_2,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_2);
            i4_rslt_vert_16x4_2 = vmla_n_s16(
                i4_rslt_vert_16x4_2,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_1);
            i4_rslt_vert_16x4_2 = vmla_n_s16(
                i4_rslt_vert_16x4_2,
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_0);

            vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
            pi2_tmp += i4_filt_stride;
            vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_2));
            pi2_tmp += i4_filt_stride;
            pu1_inp += i4_src_stride;
        }

        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const UWORD8 *) pu1_inp);

        i4_rslt_vert_16x4_1 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_1);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_2);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_3);

        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    }

    /* Horizontal interpolation */
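    /* Each intermediate row is filtered with the 4-tap coefficients in both  */
    /* orders: (c3, c2, c1, c0) for the even output columns and               */
    /* (c0, c1, c2, c3) for the odd ones. The even/odd results are            */
    /* interleaved with vzipq_s32, the rounding offset 512 is added and the   */
    /* sums are shifted right by 10 to remove the combined 32 x 32 filter     */
    /* gain before saturating narrowing to 8 bits.                            */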
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        i4_samp_horz_16x4_0 = vld1_s16(pi2_tmp);
        i4_samp_horz_16x4_1 = vld1_s16(pi2_tmp + 1);
        i4_samp_horz_16x4_2 = vld1_s16(pi2_tmp + 2);
        i4_samp_horz_16x4_3 = vld1_s16(pi2_tmp + 3);
        i4_samp_horz_16x4_4 = vld1_s16(pi2_tmp + 4);
        i4_samp_horz_16x4_5 = vld1_s16(pi2_tmp + 5);
        i4_samp_horz_16x4_6 = vld1_s16(pi2_tmp + 6);
        i4_samp_horz_16x4_7 = vld1_s16(pi2_tmp + 7);
        i4_samp_horz_16x4_8 = vld1_s16(pi2_tmp + 8);

        i4_rslt_horz_r0_1 =
            vmull_n_s16(i4_samp_horz_16x4_0, i4_coeff_c3); /* a0c3 a1c3 a2c3 a3c3 */
        i4_rslt_horz_r0_1 =
            vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_1,
                        i4_coeff_c2); /* a0c3+a1c2 a1c3+a2c2 a2c3+a3c2 a3c3+a4c2 */
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_2, i4_coeff_c1);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_3, i4_coeff_c0);
        /* i4_rslt_horz_r0_1 : contains res at even pos: 0,2,4,6 */

        i4_rslt_horz_r1_1 =
            vmull_n_s16(i4_samp_horz_16x4_1, i4_coeff_c0); /* a1c0 a2c0 a3c0 a4c0 */
        i4_rslt_horz_r1_1 =
            vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_2,
                        i4_coeff_c1); /* a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 a4c0+a5c1 */
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_3, i4_coeff_c2);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_4, i4_coeff_c3);
        /* i4_rslt_horz_r1_1 : contains res at odd pos: 1,3,5,7 */

        i4_rslt_horz_r0_2 =
            vmull_n_s16(i4_samp_horz_16x4_4, i4_coeff_c3); /* a4c3 a5c3 a6c3 a7c3 */
        i4_rslt_horz_r0_2 =
            vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_5,
                        i4_coeff_c2); /* a4c3+a5c2 a5c3+a6c2 a6c3+a7c2 a7c3+a8c2 */
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_6, i4_coeff_c1);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_7, i4_coeff_c0);
        /* i4_rslt_horz_r0_2 : contains res at even pos: 8,10,12,14 */

        i4_rslt_horz_r1_2 =
            vmull_n_s16(i4_samp_horz_16x4_5, i4_coeff_c0); /* a5c0 a6c0 a7c0 a8c0 */
        i4_rslt_horz_r1_2 =
            vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_6,
                        i4_coeff_c1); /* a5c0+a6c1 a6c0+a7c1 a7c0+a8c1 a8c0+a9c1 */
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_7, i4_coeff_c2);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_8, i4_coeff_c3);
        /* i4_rslt_horz_r1_2 : contains res at odd pos: 9,11,13,15 */

        i4_rslt_horz_r0_tmp32 = vzipq_s32(i4_rslt_horz_r0_1, i4_rslt_horz_r1_1);
        i4_rslt_horz_r1_tmp32 = vzipq_s32(i4_rslt_horz_r0_2, i4_rslt_horz_r1_2);

        i4_rslt_horz_r0_1 = vaddq_s32(i4_rslt_horz_r0_tmp32.val[0], const_512_32x4);
        i4_rslt_horz_r1_1 = vaddq_s32(i4_rslt_horz_r0_tmp32.val[1], const_512_32x4);
        i4_rslt_horz_r0_2 = vaddq_s32(i4_rslt_horz_r1_tmp32.val[0], const_512_32x4);
        i4_rslt_horz_r1_2 = vaddq_s32(i4_rslt_horz_r1_tmp32.val[1], const_512_32x4);

        i4_rslt_horz_r0_1_tmp = vqshrun_n_s32(i4_rslt_horz_r0_1, 10);
        i4_rslt_horz_r1_1_tmp = vqshrun_n_s32(i4_rslt_horz_r1_1, 10);

        i4_rslt_horz_r0_2_tmp = vqshrun_n_s32(i4_rslt_horz_r0_2, 10);
        i4_rslt_horz_r1_2_tmp = vqshrun_n_s32(i4_rslt_horz_r1_2, 10);

        rslt_16x8_t_1 = vcombine_u16(i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp);
        rslt_16x8_t_2 = vcombine_u16(i4_rslt_horz_r0_2_tmp, i4_rslt_horz_r1_2_tmp);

        vst1_u8(pu1_out, vqmovn_u16(rslt_16x8_t_1));
        vst1_u8(pu1_out + 8, vqmovn_u16(rslt_16x8_t_2));

        pu1_out += i4_out_stride;
        pi2_tmp += i4_filt_stride;
    }
}

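/**
 *******************************************************************************
 *
 * @brief
 *  NEON variant of the horizontal pass of dyadic (2x) chroma upsampling. As
 *  implemented below, each output sample is a 2-tap bilinear combination of
 *  neighbouring vertically filtered samples with weights (16 - phase) and
 *  phase. The sums are rounded and shifted right by 8 (the combined 16 x 16
 *  gain of the vertical and horizontal weights) and written into every other
 *  byte of the output rows through a byte-select mask; the bytes in between
 *  (presumably the other component of an interleaved chroma buffer) are left
 *  unchanged.
 *
 * @param[in] pi2_tmp_filt_buf
 *  Pointer to the 16-bit vertically filtered samples (stride 6)
 *
 * @param[out] pu1_out_buf
 *  Pointer to the interleaved chroma output buffer (8 rows are written)
 *
 * @param[in] i4_out_stride
 *  Output buffer stride
 *
 * @param[in] i4_phase_0
 *  Horizontal filter phase for the even output columns
 *
 * @param[in] i4_phase_1
 *  Horizontal filter phase for the odd output columns
 *
 *******************************************************************************
 */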
void isvc_horz_interpol_chroma_dyadic_neon(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                           WORD32 i4_out_stride, WORD32 i4_phase_0,
                                           WORD32 i4_phase_1)
{
    WORD32 i4_y;
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    UWORD8 *pu1_out = pu1_out_buf;
    WORD16 *pi2_tmp = pi2_tmp_filt_buf;
    WORD32 i4_filt_stride = 6;
    WORD32 i4_dst_stride = i4_out_stride;

    int16x8_t i4_samp_horz_16x8_r0_0, i4_samp_horz_16x8_r0_1, i4_samp_horz_16x8_r0_2;
    int16x8_t i4_samp_horz_16x8_r1_0, i4_samp_horz_16x8_r1_1, i4_samp_horz_16x8_r1_2;
    int16x8_t i4_rslt_horz_r0_1, i4_rslt_horz_r0_2;
    int16x8_t i4_rslt_horz_r1_1, i4_rslt_horz_r1_2;

    int16x8x2_t temp_horz_16x8_r0;
    int16x8x2_t temp_horz_16x8_r1;
    int16x8_t final_horz_16x8_r0_1;
    int16x8_t final_horz_16x8_r1_1;

    uint8x16_t i4_out_horz_8x16_r0, i4_out_horz_8x16_r1;
    uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    i4_coeff_0 = 16 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 16 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    /* Horizontal interpolation */
    for(i4_y = 0; i4_y < 8; i4_y += 2)
    {
        i4_samp_horz_16x8_r0_0 = vld1q_s16(pi2_tmp);     /* a0 a1 a2 a3 a4 a5 a6 a7 */
        i4_samp_horz_16x8_r0_1 = vld1q_s16(pi2_tmp + 1); /* a1 a2 a3 a4 a5 a6 a7 a8 */
        i4_samp_horz_16x8_r0_2 = vld1q_s16(pi2_tmp + 2); /* a2 a3 a4 a5 a6 a7 a8 a9 */

        i4_samp_horz_16x8_r1_0 = vld1q_s16(pi2_tmp + i4_filt_stride);
        i4_samp_horz_16x8_r1_1 = vld1q_s16(pi2_tmp + i4_filt_stride + 1);
        i4_samp_horz_16x8_r1_2 = vld1q_s16(pi2_tmp + (i4_filt_stride + 2));

        i4_rslt_horz_r0_1 =
            vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_0); /* a0c0 a1c0 a2c0 a3c0 */
        i4_rslt_horz_r0_2 =
            vmulq_n_s16(i4_samp_horz_16x8_r0_1, i4_coeff_2); /* a1c2 a2c2 a3c2 a4c2 */

        i4_rslt_horz_r0_1 = vmlaq_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x8_r0_1,
                                        i4_coeff_1); /* a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 */
        i4_rslt_horz_r0_2 = vmlaq_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x8_r0_2,
                                        i4_coeff_3); /* a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3 */

        i4_rslt_horz_r1_1 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_0);
        i4_rslt_horz_r1_2 = vmulq_n_s16(i4_samp_horz_16x8_r1_1, i4_coeff_2);

        i4_rslt_horz_r1_1 = vmlaq_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x8_r1_1, i4_coeff_1);
        i4_rslt_horz_r1_2 = vmlaq_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x8_r1_2, i4_coeff_3);

        temp_horz_16x8_r0 = vzipq_s16(i4_rslt_horz_r0_1, i4_rslt_horz_r0_2);
        temp_horz_16x8_r1 = vzipq_s16(i4_rslt_horz_r1_1, i4_rslt_horz_r1_2);

        final_horz_16x8_r0_1 = temp_horz_16x8_r0.val[0];
        final_horz_16x8_r1_1 = temp_horz_16x8_r1.val[0];

        final_horz_16x8_r0_1 = vrshrq_n_s16(final_horz_16x8_r0_1, 8);
        final_horz_16x8_r1_1 = vrshrq_n_s16(final_horz_16x8_r1_1, 8);

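        /* Merge the filtered samples into alternate bytes of the output      */
        /* rows; the byte-select mask keeps the remaining bytes of the        */
        /* interleaved output unchanged.                                      */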
        i4_out_horz_8x16_r0 = vld1q_u8(pu1_out);
        i4_out_horz_8x16_r1 = vld1q_u8(pu1_out + i4_dst_stride);

        i4_out_horz_8x16_r0 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r0_1),
                                       i4_out_horz_8x16_r0);
        i4_out_horz_8x16_r1 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r1_1),
                                       i4_out_horz_8x16_r1);

        vst1q_u8(pu1_out, i4_out_horz_8x16_r0);
        vst1q_u8(pu1_out + i4_dst_stride, i4_out_horz_8x16_r1);

        /* Incrementing ptrs */
        pi2_tmp += (i4_filt_stride << 1);
        pu1_out += (i4_dst_stride << 1);
    }
}

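/**
 *******************************************************************************
 *
 * @brief
 *  NEON variant of the vertical pass of dyadic (2x) chroma upsampling. As
 *  implemented below, six input rows (stride DYADIC_REF_W_C) are combined
 *  pairwise with 2-tap bilinear weights to produce eight rows of 16-bit
 *  vertically filtered samples at a stride of 6 (matching the horizontal
 *  pass above): even output rows use weights (16 - i4_phase_0, i4_phase_0),
 *  odd rows use (16 - i4_phase_1, i4_phase_1).
 *
 * @param[in] pu1_inp_buf
 *  Pointer to the reference-layer chroma input samples
 *
 * @param[out] pi2_tmp_filt_buf
 *  Pointer to the 16-bit intermediate buffer for vertically filtered samples
 *
 * @param[in] i4_phase_0
 *  Vertical filter phase for the even output rows
 *
 * @param[in] i4_phase_1
 *  Vertical filter phase for the odd output rows
 *
 *******************************************************************************
 */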
void isvc_vert_interpol_chroma_dyadic_neon(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                           WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_src_stride = DYADIC_REF_W_C;
    UWORD8 *pu1_inp = pu1_inp_buf;
    WORD16 *pi2_tmp = pi2_tmp_filt_buf;

    uint8x8_t i4_samp_vert_8x8_r0, i4_samp_vert_8x8_r1, i4_samp_vert_8x8_r2, i4_samp_vert_8x8_r3,
        i4_samp_vert_8x8_r4, i4_samp_vert_8x8_r5;

    int16x8_t i4_rslt_vert_16x8_r0, i4_rslt_vert_16x8_r1, i4_rslt_vert_16x8_r2,
        i4_rslt_vert_16x8_r3, i4_rslt_vert_16x8_r4, i4_rslt_vert_16x8_r5, i4_rslt_vert_16x8_r6,
        i4_rslt_vert_16x8_r7;

    i4_coeff_0 = 16 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 16 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    /* Vertical interpolation */
    i4_samp_vert_8x8_r0 = vld1_u8(pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_r1 = vld1_u8(pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_r2 = vld1_u8(pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_r3 = vld1_u8(pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_r4 = vld1_u8(pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_r5 = vld1_u8(pu1_inp);
    pu1_inp += i4_src_stride;

    i4_rslt_vert_16x8_r0 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_0);
    i4_rslt_vert_16x8_r0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_1);
    vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_r0);

    i4_rslt_vert_16x8_r1 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_2);
    i4_rslt_vert_16x8_r1 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r1, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_3);
    vst1q_s16(pi2_tmp + 6, i4_rslt_vert_16x8_r1);

    i4_rslt_vert_16x8_r2 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_0);
    i4_rslt_vert_16x8_r2 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_1);
    vst1q_s16(pi2_tmp + 12, i4_rslt_vert_16x8_r2);

    i4_rslt_vert_16x8_r3 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_2);
    i4_rslt_vert_16x8_r3 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r3, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_3);
    vst1q_s16(pi2_tmp + 18, i4_rslt_vert_16x8_r3);

    i4_rslt_vert_16x8_r4 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_0);
    i4_rslt_vert_16x8_r4 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r4, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_1);
    vst1q_s16(pi2_tmp + 24, i4_rslt_vert_16x8_r4);

    i4_rslt_vert_16x8_r5 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_2);
    i4_rslt_vert_16x8_r5 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r5, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_3);
    vst1q_s16(pi2_tmp + 30, i4_rslt_vert_16x8_r5);

    i4_rslt_vert_16x8_r6 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_0);
    i4_rslt_vert_16x8_r6 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r6, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_1);
    vst1q_s16(pi2_tmp + 36, i4_rslt_vert_16x8_r6);

    i4_rslt_vert_16x8_r7 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_2);
    i4_rslt_vert_16x8_r7 = vmlaq_n_s16(
        i4_rslt_vert_16x8_r7, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r5)), i4_coeff_3);
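    /* Last intermediate row: only six samples are stored (a 4-lane store     */
    /* plus two single-lane stores), avoiding a write past the end of the     */
    /* 8 x 6 intermediate array.                                              */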
    vst1_s16(pi2_tmp + 42, vget_low_s16(i4_rslt_vert_16x8_r7));
    vst1q_lane_s16(pi2_tmp + 46, i4_rslt_vert_16x8_r7, 4);
    vst1q_lane_s16(pi2_tmp + 47, i4_rslt_vert_16x8_r7, 5);
}