/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 *******************************************************************************
 *
 * @file
 *  isvce_svc_residual_pred_neon.c
 *
 * @brief
 *  Contains NEON implementations of functions used for SVC residual
 *  prediction
 *
 *******************************************************************************
 */
#include <arm_neon.h>

#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_size_defs.h"
#include "isvc_macros.h"
#include "isvc_structs.h"

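/**
 *******************************************************************************
 *
 * @brief
 *  Upsamples reference-layer luma residuals by 2x for SVC inter-layer
 *  residual prediction
 *
 * @param[in] ps_ref_array_positions
 *  Reference array positions (unused in this 2x special case)
 *
 * @param[in] ps_ref_array_phases
 *  Reference array phases (unused in this 2x special case)
 *
 * @param[in] ps_inp
 *  Buffer holding the reference-layer residual samples
 *
 * @param[out] ps_out
 *  Buffer receiving the upsampled residual samples
 *
 * @param[in] ps_scratch
 *  Scratch buffer for the intermediate, horizontally filtered samples
 *
 * @param[in] u4_ref_nnz
 *  Coded-block flags of the reference 4x4 blocks
 *
 * @param[in] u1_ref_tx_size
 *  When non-zero, the 8x8 reference block is filtered as a single unit;
 *  otherwise each 4x4 block is processed separately, gated by u4_ref_nnz
 *
 *******************************************************************************
 */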
void isvce_luma_residual_sampler_2x_neon(coordinates_t *ps_ref_array_positions,
                                         coordinates_t *ps_ref_array_phases,
                                         buffer_container_t *ps_inp, buffer_container_t *ps_out,
                                         buffer_container_t *ps_scratch, UWORD32 u4_ref_nnz,
                                         UWORD8 u1_ref_tx_size)
{
    WORD16 *pi2_inp_data = (WORD16 *) ps_inp->pv_data;
    WORD16 *pi2_out_res = (WORD16 *) ps_out->pv_data;
    WORD32 i4_inp_data_stride = ps_inp->i4_data_stride;
    WORD32 i4_out_res_stride = ps_out->i4_data_stride;
    WORD16 *pi2_refarray_buffer = (WORD16 *) ps_scratch->pv_data;
    WORD32 i4_blk_ctr;

    UNUSED(ps_ref_array_positions);
    UNUSED(ps_ref_array_phases);

    /* For 2x scaling, the offsets always point to the top-left pixel      */
    /* outside the MB, so refTransBlkIdc differs here. Since the phase of  */
    /* the first refArray position used for horizontal filtering exceeds   */
    /* 8, the first row and first column of the refArray are never used    */
    pi2_inp_data += 1 + i4_inp_data_stride;

    if((u1_ref_tx_size) && (0 != u4_ref_nnz))
    {
        WORD16 *pi2_ref_data_byte;
        WORD32 *pi4_ref_array;
        WORD32 i4_i, i4_j;

        /* ----------- Horizontal Interpolation ---------------- */
        int16x8_t i2_coeff_add_16x8_r0;
        int16x8_t i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1;
        int16x8_t i2_coeff_16x8_sl_r0_0, i2_coeff_16x8_sl_r0_1;
        int16x8_t result_16x8_r0_0, result_16x8_r0_1;

        int16x8_t i2_coeff_add_16x8_r1;
        int16x8_t i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1;
        int16x8_t i2_coeff_16x8_sl_r1_0, i2_coeff_16x8_sl_r1_1;
        int16x8_t result_16x8_r1_0, result_16x8_r1_1;
        int16x8x2_t final_result_16x8x2_r0, final_result_16x8x2_r1;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

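        /*
         * Per input row, the loop below is equivalent to the following scalar
         * sketch (a reference only; `in` is the 8-wide input row and `ref` the
         * 16-wide scratch row). Each interior output pair is a quarter/three-
         * quarter-phase bilinear tap, kept at 4x scale; the rounding shift is
         * deferred to the vertical pass:
         *
         *     ref[0] = in[0] << 2;
         *     for(k = 0; k < 7; k++)
         *     {
         *         ref[2 * k + 1] = 3 * in[k] + in[k + 1];
         *         ref[2 * k + 2] = in[k] + 3 * in[k + 1];
         *     }
         *     ref[15] = in[7] << 2;
         */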
        for(i4_i = 0; i4_i < BLK8x8SIZE; i4_i += 2)
        {
            i2_coeff_16x8_r0_0 = vld1q_s16(pi2_ref_data_byte);
            i2_coeff_16x8_r0_1 = vld1q_s16((pi2_ref_data_byte + 1));

            i2_coeff_16x8_r1_0 = vld1q_s16(pi2_ref_data_byte + i4_inp_data_stride);
            i2_coeff_16x8_r1_1 = vld1q_s16((pi2_ref_data_byte + i4_inp_data_stride + 1));

            i2_coeff_add_16x8_r0 = vaddq_s16(i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1);
            i2_coeff_16x8_sl_r0_0 = vshlq_n_s16(i2_coeff_16x8_r0_0, 1);
            i2_coeff_16x8_sl_r0_1 = vshlq_n_s16(i2_coeff_16x8_r0_1, 1);

            i2_coeff_add_16x8_r1 = vaddq_s16(i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1);
            i2_coeff_16x8_sl_r1_0 = vshlq_n_s16(i2_coeff_16x8_r1_0, 1);
            i2_coeff_16x8_sl_r1_1 = vshlq_n_s16(i2_coeff_16x8_r1_1, 1);

            result_16x8_r0_0 = vaddq_s16(i2_coeff_16x8_sl_r0_0, i2_coeff_add_16x8_r0);
            result_16x8_r0_1 = vaddq_s16(i2_coeff_16x8_sl_r0_1, i2_coeff_add_16x8_r0);

            result_16x8_r1_0 = vaddq_s16(i2_coeff_16x8_sl_r1_0, i2_coeff_add_16x8_r1);
            result_16x8_r1_1 = vaddq_s16(i2_coeff_16x8_sl_r1_1, i2_coeff_add_16x8_r1);

            final_result_16x8x2_r0 = vzipq_s16(result_16x8_r0_0, result_16x8_r0_1);
            final_result_16x8x2_r1 = vzipq_s16(result_16x8_r1_0, result_16x8_r1_1);

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8x2_r0.val[0])));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8x2_r0.val[0])));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8x2_r0.val[1])));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8x2_r0.val[1])));

            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            pi2_ref_data_byte += i4_inp_data_stride;

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8x2_r1.val[0])));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8x2_r1.val[0])));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8x2_r1.val[1])));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8x2_r1.val[1])));

            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
        }

        /* ----------- Vertical Interpolation ---------------- */
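        /*
         * Per column `c`, the code below is equivalent to the following scalar
         * sketch (a reference only; `ref` rows are the 4x-scaled horizontal
         * results). Interior output rows blend two ref rows, so the combined
         * scale of 16 is removed with a rounding shift by 4; the first and
         * last rows use a single ref row, hence the shift by 2:
         *
         *     out[0][c] = (ref[0][c] + 2) >> 2;
         *     for(j = 0; j < 7; j++)
         *     {
         *         out[2 * j + 1][c] = (3 * ref[j][c] + ref[j + 1][c] + 8) >> 4;
         *         out[2 * j + 2][c] = (ref[j][c] + 3 * ref[j + 1][c] + 8) >> 4;
         *     }
         *     out[15][c] = (ref[7][c] + 2) >> 2;
         */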
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;
        {
            WORD32 *pi4_ref_array_temp;
            WORD16 *pi2_out;
            int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r1_3,
                i4_horz_samp_32x4_r1_4;
            int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2, i4_horz_samp_32x4_r2_3,
                i4_horz_samp_32x4_r2_4;

            int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2, i4_horz_res_32x4_r1_3,
                i4_horz_res_32x4_r1_4;
            int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2, i4_horz_res_32x4_r2_3,
                i4_horz_res_32x4_r2_4;
            int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2, i4_horz_res_32x4_r3_3,
                i4_horz_res_32x4_r3_4;
            int32x4_t horz_add_32x4_r2_1, horz_add_32x4_r2_2, horz_add_32x4_r2_3,
                horz_add_32x4_r2_4;

            int16x8_t comb_horz_16x8_1, comb_horz_16x8_2, comb_horz_16x8_3, comb_horz_16x8_4;
            pi4_ref_array_temp = pi4_ref_array;
            pi2_out = pi2_out_res;

            i4_horz_samp_32x4_r1_1 = vld1q_s32(pi4_ref_array_temp);
            i4_horz_samp_32x4_r1_2 = vld1q_s32(pi4_ref_array_temp + 4);
            i4_horz_samp_32x4_r1_3 = vld1q_s32(pi4_ref_array_temp + 8);
            i4_horz_samp_32x4_r1_4 = vld1q_s32(pi4_ref_array_temp + 12);

            /* populate the first inter sample */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            pi2_out += i4_out_res_stride;

            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
                pi4_ref_array_temp += MB_SIZE;
                i4_horz_samp_32x4_r2_1 = vld1q_s32(pi4_ref_array_temp);
                i4_horz_samp_32x4_r2_2 = vld1q_s32(pi4_ref_array_temp + 4);
                i4_horz_samp_32x4_r2_3 = vld1q_s32(pi4_ref_array_temp + 8);
                i4_horz_samp_32x4_r2_4 = vld1q_s32(pi4_ref_array_temp + 12);

                horz_add_32x4_r2_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r2_1);
                horz_add_32x4_r2_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r2_2);
                horz_add_32x4_r2_3 = vaddq_s32(i4_horz_samp_32x4_r1_3, i4_horz_samp_32x4_r2_3);
                horz_add_32x4_r2_4 = vaddq_s32(i4_horz_samp_32x4_r1_4, i4_horz_samp_32x4_r2_4);

                i4_horz_res_32x4_r2_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r2_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r2_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r2_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_4, 1), horz_add_32x4_r2_4);

                i4_horz_res_32x4_r3_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r3_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r3_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r3_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_4, 1), horz_add_32x4_r2_4);

                i4_horz_res_32x4_r2_1 = vrshrq_n_s32(i4_horz_res_32x4_r2_1, 4);
                i4_horz_res_32x4_r2_2 = vrshrq_n_s32(i4_horz_res_32x4_r2_2, 4);
                i4_horz_res_32x4_r2_3 = vrshrq_n_s32(i4_horz_res_32x4_r2_3, 4);
                i4_horz_res_32x4_r2_4 = vrshrq_n_s32(i4_horz_res_32x4_r2_4, 4);

                i4_horz_res_32x4_r3_1 = vrshrq_n_s32(i4_horz_res_32x4_r3_1, 4);
                i4_horz_res_32x4_r3_2 = vrshrq_n_s32(i4_horz_res_32x4_r3_2, 4);
                i4_horz_res_32x4_r3_3 = vrshrq_n_s32(i4_horz_res_32x4_r3_3, 4);
                i4_horz_res_32x4_r3_4 = vrshrq_n_s32(i4_horz_res_32x4_r3_4, 4);

                comb_horz_16x8_1 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_1),
                                                vmovn_s32(i4_horz_res_32x4_r2_2));
                comb_horz_16x8_2 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_3),
                                                vmovn_s32(i4_horz_res_32x4_r2_4));

                comb_horz_16x8_3 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_1),
                                                vmovn_s32(i4_horz_res_32x4_r3_2));
                comb_horz_16x8_4 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_3),
                                                vmovn_s32(i4_horz_res_32x4_r3_4));

                /* populate 2 samples based on current coeffs */
                vst1q_s16(pi2_out, comb_horz_16x8_1);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_2);
                pi2_out += i4_out_res_stride;

                vst1q_s16(pi2_out, comb_horz_16x8_3);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_4);
                pi2_out += i4_out_res_stride;

                /* store the coeff 2 to coeff 1 */
                /* (used in next iteration) */
                i4_horz_samp_32x4_r1_1 = i4_horz_samp_32x4_r2_1;
                i4_horz_samp_32x4_r1_2 = i4_horz_samp_32x4_r2_2;
                i4_horz_samp_32x4_r1_3 = i4_horz_samp_32x4_r2_3;
                i4_horz_samp_32x4_r1_4 = i4_horz_samp_32x4_r2_4;
            }

            /* populate the last inter sample */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            /* horizontal loop updates */
            pi4_ref_array++;
            pi2_out_res++;
        }
    }
    else
    {
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                         */
        /* ----------------------------------------------------------------- */
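        /* u4_ref_nnz packs one coded-block flag per reference 4x4 block,  */
        /* four bits per block row: bits 0 and 1 gate the top two blocks,  */
        /* bits 4 and 5 the bottom two (hence the extra '>>= 2' after the  */
        /* second block). Blocks whose flag is 0 are skipped entirely.     */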
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
            /* If the reference-layer 4x4 block is not coded, no processing */
            if(0 != (u4_ref_nnz & 0x1))
            {
                int16x8_t i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_1;
                int16x8_t i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_1;
                int16x8_t i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_1;
                int16x8_t i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_1;
                int16x8_t i2_add_16x8_r0_0;
                int16x8_t i2_add_16x8_r1_0;
                int16x8_t i2_add_16x8_r2_0;
                int16x8_t i2_add_16x8_r3_0;
                int16x8_t i2_res_16x8_r0_0, i2_res_16x8_r0_1;
                int16x8_t i2_res_16x8_r1_0, i2_res_16x8_r1_1;
                int16x8_t i2_res_16x8_r2_0, i2_res_16x8_r2_1;
                int16x8_t i2_res_16x8_r3_0, i2_res_16x8_r3_1;
                int16x4_t i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r0_2;
                int16x4_t i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r1_2;
                int16x4_t i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r2_2;
                int16x4_t i4_horz_samp_16x4_r3_1, i4_horz_samp_16x4_r3_2;
                int32x4_t i4_horz_samp_32x4_r0_1, i4_horz_samp_32x4_r0_2;
                int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2;
                int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2;
                int32x4_t i4_horz_samp_32x4_r3_1, i4_horz_samp_32x4_r3_2;
                int32x4_t i4_horz_add_32x4_r1_1, i4_horz_add_32x4_r1_2;
                int32x4_t i4_horz_add_32x4_r2_1, i4_horz_add_32x4_r2_2;
                int32x4_t i4_horz_add_32x4_r3_1, i4_horz_add_32x4_r3_2;
                int16x4_t i4_horz_res_16x4_r0_1, i4_horz_res_16x4_r0_2;
                int16x4_t i4_horz_res_16x4_r1_1, i4_horz_res_16x4_r1_2;
                int16x4_t i4_horz_res_16x4_r2_1, i4_horz_res_16x4_r2_2;
                int16x4_t i4_horz_res_16x4_r3_1, i4_horz_res_16x4_r3_2;
                int16x4_t i4_horz_res_16x4_r4_1, i4_horz_res_16x4_r4_2;
                int16x4_t i4_horz_res_16x4_r5_1, i4_horz_res_16x4_r5_2;
                int16x4_t i4_horz_res_16x4_r6_1, i4_horz_res_16x4_r6_2;
                int16x4_t i4_horz_res_16x4_r7_1, i4_horz_res_16x4_r7_2;
                int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2;
                int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2;
                int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2;
                int32x4_t i4_horz_res_32x4_r4_1, i4_horz_res_32x4_r4_2;
                int32x4_t i4_horz_res_32x4_r5_1, i4_horz_res_32x4_r5_2;
                int32x4_t i4_horz_res_32x4_r6_1, i4_horz_res_32x4_r6_2;
                int16x8x2_t ti2_res_16x8x2_r0, ti2_res_16x8x2_r1;
                int16x8x2_t ti2_res_16x8x2_r2, ti2_res_16x8x2_r3;

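                /*
                 * This branch applies the same separable 2x filter as the 8x8
                 * path above, but at 4x4 -> 8x8 granularity: the horizontal
                 * pass below widens 4 input columns to 8 (kept at 4x scale),
                 * and the vertical pass then produces 8 output rows with
                 * rounding shifts of 2 (edge rows) and 4 (interior rows).
                 */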
                i2_coeff1_16x8_r0_0 = vld1q_s16(pi2_inp_data);
                i2_coeff1_16x8_r1_0 = vld1q_s16(pi2_inp_data + i4_inp_data_stride);
                i2_coeff1_16x8_r2_0 = vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1));
                i2_coeff1_16x8_r3_0 =
                    vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1) + i4_inp_data_stride);

                i2_coeff1_16x8_r0_1 = vextq_s16(i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_1 = vextq_s16(i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_1 = vextq_s16(i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_1 = vextq_s16(i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_0, 1);

                i2_add_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_coeff1_16x8_r0_0);
                i2_add_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_coeff1_16x8_r1_0);
                i2_add_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_coeff1_16x8_r2_0);
                i2_add_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_coeff1_16x8_r3_0);

                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                i2_coeff1_16x8_r0_1 = vshlq_n_s16(i2_coeff1_16x8_r0_1, 1);
                i2_coeff1_16x8_r1_1 = vshlq_n_s16(i2_coeff1_16x8_r1_1, 1);
                i2_coeff1_16x8_r2_1 = vshlq_n_s16(i2_coeff1_16x8_r2_1, 1);
                i2_coeff1_16x8_r3_1 = vshlq_n_s16(i2_coeff1_16x8_r3_1, 1);

                i2_res_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_0, i2_add_16x8_r0_0);
                i2_res_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_0, i2_add_16x8_r1_0);
                i2_res_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_0, i2_add_16x8_r2_0);
                i2_res_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_0, i2_add_16x8_r3_0);

                i2_res_16x8_r0_1 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_add_16x8_r0_0);
                i2_res_16x8_r1_1 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_add_16x8_r1_0);
                i2_res_16x8_r2_1 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_add_16x8_r2_0);
                i2_res_16x8_r3_1 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_add_16x8_r3_0);

                ti2_res_16x8x2_r0 = vzipq_s16(i2_res_16x8_r0_0, i2_res_16x8_r0_1);
                ti2_res_16x8x2_r1 = vzipq_s16(i2_res_16x8_r1_0, i2_res_16x8_r1_1);
                ti2_res_16x8x2_r2 = vzipq_s16(i2_res_16x8_r2_0, i2_res_16x8_r2_1);
                ti2_res_16x8x2_r3 = vzipq_s16(i2_res_16x8_r3_0, i2_res_16x8_r3_1);

                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                vst1q_s16(pi2_refarray_buffer + 1, ti2_res_16x8x2_r0.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer, i2_coeff1_16x8_r0_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 7, i2_coeff1_16x8_r0_0, 3);

                vst1q_s16(pi2_refarray_buffer + 9, ti2_res_16x8x2_r1.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer + 8, i2_coeff1_16x8_r1_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 15, i2_coeff1_16x8_r1_0, 3);

                vst1q_s16(pi2_refarray_buffer + 17, ti2_res_16x8x2_r2.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer + 16, i2_coeff1_16x8_r2_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 23, i2_coeff1_16x8_r2_0, 3);

                vst1q_s16(pi2_refarray_buffer + 25, ti2_res_16x8x2_r3.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer + 24, i2_coeff1_16x8_r3_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 31, i2_coeff1_16x8_r3_0, 3);
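
                /* pi2_refarray_buffer now holds 4 rows x 8 columns of */
                /* horizontally upsampled residuals, kept at 4x scale  */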

                i4_horz_samp_16x4_r0_1 = vld1_s16(pi2_refarray_buffer);
                i4_horz_samp_16x4_r0_2 = vld1_s16(pi2_refarray_buffer + 4);

                i4_horz_samp_16x4_r1_1 = vld1_s16(pi2_refarray_buffer + 8);
                i4_horz_samp_16x4_r1_2 = vld1_s16(pi2_refarray_buffer + 12);

                i4_horz_samp_16x4_r2_1 = vld1_s16(pi2_refarray_buffer + 16);
                i4_horz_samp_16x4_r2_2 = vld1_s16(pi2_refarray_buffer + 20);

                i4_horz_samp_16x4_r3_1 = vld1_s16(pi2_refarray_buffer + 24);
                i4_horz_samp_16x4_r3_2 = vld1_s16(pi2_refarray_buffer + 28);

                i4_horz_res_16x4_r0_1 = vrshr_n_s16(i4_horz_samp_16x4_r0_1, 2);
                i4_horz_res_16x4_r0_2 = vrshr_n_s16(i4_horz_samp_16x4_r0_2, 2);

                i4_horz_add_32x4_r1_1 = vaddl_s16(i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r1_1);
                i4_horz_add_32x4_r1_2 = vaddl_s16(i4_horz_samp_16x4_r0_2, i4_horz_samp_16x4_r1_2);

                i4_horz_add_32x4_r2_1 = vaddl_s16(i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r2_1);
                i4_horz_add_32x4_r2_2 = vaddl_s16(i4_horz_samp_16x4_r1_2, i4_horz_samp_16x4_r2_2);

                i4_horz_add_32x4_r3_1 = vaddl_s16(i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r3_1);
                i4_horz_add_32x4_r3_2 = vaddl_s16(i4_horz_samp_16x4_r2_2, i4_horz_samp_16x4_r3_2);

                i4_horz_samp_32x4_r0_1 = vshll_n_s16(i4_horz_samp_16x4_r0_1, 1);
                i4_horz_samp_32x4_r0_2 = vshll_n_s16(i4_horz_samp_16x4_r0_2, 1);

                i4_horz_samp_32x4_r1_1 = vshll_n_s16(i4_horz_samp_16x4_r1_1, 1);
                i4_horz_samp_32x4_r1_2 = vshll_n_s16(i4_horz_samp_16x4_r1_2, 1);

                i4_horz_samp_32x4_r2_1 = vshll_n_s16(i4_horz_samp_16x4_r2_1, 1);
                i4_horz_samp_32x4_r2_2 = vshll_n_s16(i4_horz_samp_16x4_r2_2, 1);

                i4_horz_samp_32x4_r3_1 = vshll_n_s16(i4_horz_samp_16x4_r3_1, 1);
                i4_horz_samp_32x4_r3_2 = vshll_n_s16(i4_horz_samp_16x4_r3_2, 1);

                i4_horz_res_32x4_r1_1 = vaddq_s32(i4_horz_samp_32x4_r0_1, i4_horz_add_32x4_r1_1);
                i4_horz_res_32x4_r1_2 = vaddq_s32(i4_horz_samp_32x4_r0_2, i4_horz_add_32x4_r1_2);

                i4_horz_res_32x4_r2_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r1_1);
                i4_horz_res_32x4_r2_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r1_2);

                i4_horz_res_32x4_r3_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r2_1);
                i4_horz_res_32x4_r3_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r2_2);

                i4_horz_res_32x4_r4_1 = vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r2_1);
                i4_horz_res_32x4_r4_2 = vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r2_2);

                i4_horz_res_32x4_r5_1 = vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r3_1);
                i4_horz_res_32x4_r5_2 = vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r3_2);

                i4_horz_res_32x4_r6_1 = vaddq_s32(i4_horz_samp_32x4_r3_1, i4_horz_add_32x4_r3_1);
                i4_horz_res_32x4_r6_2 = vaddq_s32(i4_horz_samp_32x4_r3_2, i4_horz_add_32x4_r3_2);

                i4_horz_res_16x4_r1_1 = vqrshrn_n_s32(i4_horz_res_32x4_r1_1, 4);
                i4_horz_res_16x4_r1_2 = vqrshrn_n_s32(i4_horz_res_32x4_r1_2, 4);

                i4_horz_res_16x4_r2_1 = vqrshrn_n_s32(i4_horz_res_32x4_r2_1, 4);
                i4_horz_res_16x4_r2_2 = vqrshrn_n_s32(i4_horz_res_32x4_r2_2, 4);

                i4_horz_res_16x4_r3_1 = vqrshrn_n_s32(i4_horz_res_32x4_r3_1, 4);
                i4_horz_res_16x4_r3_2 = vqrshrn_n_s32(i4_horz_res_32x4_r3_2, 4);

                i4_horz_res_16x4_r4_1 = vqrshrn_n_s32(i4_horz_res_32x4_r4_1, 4);
                i4_horz_res_16x4_r4_2 = vqrshrn_n_s32(i4_horz_res_32x4_r4_2, 4);

                i4_horz_res_16x4_r5_1 = vqrshrn_n_s32(i4_horz_res_32x4_r5_1, 4);
                i4_horz_res_16x4_r5_2 = vqrshrn_n_s32(i4_horz_res_32x4_r5_2, 4);

                i4_horz_res_16x4_r6_1 = vqrshrn_n_s32(i4_horz_res_32x4_r6_1, 4);
                i4_horz_res_16x4_r6_2 = vqrshrn_n_s32(i4_horz_res_32x4_r6_2, 4);

                i4_horz_res_16x4_r7_1 = vrshr_n_s16(i4_horz_samp_16x4_r3_1, 2);
                i4_horz_res_16x4_r7_2 = vrshr_n_s16(i4_horz_samp_16x4_r3_2, 2);

                vst1_s16(pi2_out_res, i4_horz_res_16x4_r0_1);
                vst1_s16(pi2_out_res + 4, i4_horz_res_16x4_r0_2);

                vst1_s16(pi2_out_res + i4_out_res_stride, i4_horz_res_16x4_r1_1);
                vst1_s16(pi2_out_res + i4_out_res_stride + 4, i4_horz_res_16x4_r1_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride << 1), i4_horz_res_16x4_r2_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride << 1) + 4, i4_horz_res_16x4_r2_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 3), i4_horz_res_16x4_r3_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 3) + 4, i4_horz_res_16x4_r3_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride << 2), i4_horz_res_16x4_r4_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride << 2) + 4, i4_horz_res_16x4_r4_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 5), i4_horz_res_16x4_r5_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 5) + 4, i4_horz_res_16x4_r5_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 6), i4_horz_res_16x4_r6_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 6) + 4, i4_horz_res_16x4_r6_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 7), i4_horz_res_16x4_r7_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 7) + 4, i4_horz_res_16x4_r7_2);

                pi2_out_res += BLK8x8SIZE;
            }
            else
            {
                pi2_out_res += BLK8x8SIZE;
            }

            /* Block level loop updates */
            if(1 == i4_blk_ctr)
            {
                pi2_inp_data -= SUB_BLK_WIDTH_4x4;
                pi2_inp_data += (i4_inp_data_stride * SUB_BLK_HEIGHT_4x4);
                pi2_out_res -= MB_SIZE;
                pi2_out_res += (i4_out_res_stride * BLK8x8SIZE);
                u4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += SUB_BLK_HEIGHT_4x4;
            }
            u4_ref_nnz >>= 1;
        }
        /* The above loop iterates over all the blocks */
    }
}

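/**
 *******************************************************************************
 *
 * @brief
 *  Computes the SAD between the actual residual (source minus prediction) and
 *  the residual predictor, i.e. the sum of ABS(src - pred - res) over the MB
 *
 * @param[in] ps_src
 *  Buffer holding the source samples
 *
 * @param[in] ps_pred
 *  Buffer holding the prediction samples
 *
 * @param[in] ps_res
 *  Buffer holding the predicted residual samples
 *
 * @param[in] u4_mb_wd
 *  Block width; the NEON path below requires 16
 *
 * @param[in] u4_mb_ht
 *  Block height; the NEON path below requires a multiple of 8
 *
 * @returns  The accumulated SAD
 *
 *******************************************************************************
 */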
UWORD32 isvce_get_sad_with_residual_pred_neon(buffer_container_t *ps_src,
                                              buffer_container_t *ps_pred,
                                              buffer_container_t *ps_res, UWORD32 u4_mb_wd,
                                              UWORD32 u4_mb_ht)
{
    UWORD32 i, j, u4_sad = 0;
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    UWORD32 u4_num_rows_per_loop = 8;
    UWORD32 u4_ht_by_8 = u4_mb_ht / u4_num_rows_per_loop;
    uint8x8_t src0, src1, src2, src3;
    uint8x8_t src4, src5, src6, src7;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t pred4, pred5, pred6, pred7;
    int16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8, res4_16x8, res5_16x8, res6_16x8,
        res7_16x8;
    uint16x8_t res0_u16x8, res1_u16x8, res2_u16x8, res3_u16x8, res4_u16x8, res5_u16x8, res6_u16x8,
        res7_u16x8;
    int16x8_t respred0_16x8, respred1_16x8, respred2_16x8, respred3_16x8, respred4_16x8,
        respred5_16x8, respred6_16x8, respred7_16x8;
    int16x8_t temp0_16x8, temp1_16x8, temp2_16x8, temp3_16x8, temp4_16x8, temp5_16x8, temp6_16x8,
        temp7_16x8;
    int32x4_t temp0_32x4;
    int32x2_t temp0_32x2;

    if((u4_mb_wd == 16) && (u4_mb_ht % 8 == 0))
    {
        for(i = 0; i < u4_ht_by_8; i++)
        {
            /* Each iteration of the inner loop processes 4 rows of 16   */
            /* pixels, so two iterations cover the 8 rows per outer pass */
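            /*
             * Per row, the accumulation below is equivalent to this scalar
             * sketch (a reference only; it mirrors the fallback at the bottom
             * of this function):
             *
             *     for(x = 0; x < 16; x++)
             *     {
             *         u4_sad += ABS(((WORD32) pu1_src[x]) - pu1_pred[x] - pi2_res[x]);
             *     }
             */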
            for(j = 0; j < 2; j++)
            {
                src0 = vld1_u8(pu1_src);
                src1 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                src2 = vld1_u8(pu1_src);
                src3 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                src4 = vld1_u8(pu1_src);
                src5 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                src6 = vld1_u8(pu1_src);
                src7 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                pred0 = vld1_u8(pu1_pred);
                pred1 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

                pred2 = vld1_u8(pu1_pred);
                pred3 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

                pred4 = vld1_u8(pu1_pred);
                pred5 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

                pred6 = vld1_u8(pu1_pred);
                pred7 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

                res0_u16x8 = vsubl_u8(src0, pred0);
                res1_u16x8 = vsubl_u8(src1, pred1);
                res2_u16x8 = vsubl_u8(src2, pred2);
                res3_u16x8 = vsubl_u8(src3, pred3);
                res4_u16x8 = vsubl_u8(src4, pred4);
                res5_u16x8 = vsubl_u8(src5, pred5);
                res6_u16x8 = vsubl_u8(src6, pred6);
                res7_u16x8 = vsubl_u8(src7, pred7);

                res0_16x8 = vreinterpretq_s16_u16(res0_u16x8);
                res1_16x8 = vreinterpretq_s16_u16(res1_u16x8);
                res2_16x8 = vreinterpretq_s16_u16(res2_u16x8);
                res3_16x8 = vreinterpretq_s16_u16(res3_u16x8);
                res4_16x8 = vreinterpretq_s16_u16(res4_u16x8);
                res5_16x8 = vreinterpretq_s16_u16(res5_u16x8);
                res6_16x8 = vreinterpretq_s16_u16(res6_u16x8);
                res7_16x8 = vreinterpretq_s16_u16(res7_u16x8);

                respred0_16x8 = vld1q_s16(pi2_res);
                respred1_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                respred2_16x8 = vld1q_s16(pi2_res);
                respred3_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                respred4_16x8 = vld1q_s16(pi2_res);
                respred5_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                respred6_16x8 = vld1q_s16(pi2_res);
                respred7_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                temp0_16x8 = vsubq_s16(res0_16x8, respred0_16x8);
                temp1_16x8 = vsubq_s16(res1_16x8, respred1_16x8);
                temp2_16x8 = vsubq_s16(res2_16x8, respred2_16x8);
                temp3_16x8 = vsubq_s16(res3_16x8, respred3_16x8);
                temp4_16x8 = vsubq_s16(res4_16x8, respred4_16x8);
                temp5_16x8 = vsubq_s16(res5_16x8, respred5_16x8);
                temp6_16x8 = vsubq_s16(res6_16x8, respred6_16x8);
                temp7_16x8 = vsubq_s16(res7_16x8, respred7_16x8);

                temp0_16x8 = vabsq_s16(temp0_16x8);
                temp1_16x8 = vabsq_s16(temp1_16x8);
                temp2_16x8 = vabsq_s16(temp2_16x8);
                temp3_16x8 = vabsq_s16(temp3_16x8);
                temp4_16x8 = vabsq_s16(temp4_16x8);
                temp5_16x8 = vabsq_s16(temp5_16x8);
                temp6_16x8 = vabsq_s16(temp6_16x8);
                temp7_16x8 = vabsq_s16(temp7_16x8);

                temp0_16x8 = vaddq_s16(temp0_16x8, temp1_16x8);
                temp1_16x8 = vaddq_s16(temp2_16x8, temp3_16x8);
                temp2_16x8 = vaddq_s16(temp4_16x8, temp5_16x8);
                temp3_16x8 = vaddq_s16(temp6_16x8, temp7_16x8);

                temp0_16x8 = vaddq_s16(temp0_16x8, temp1_16x8);
                temp1_16x8 = vaddq_s16(temp2_16x8, temp3_16x8);

                temp0_16x8 = vaddq_s16(temp0_16x8, temp1_16x8);

                temp0_32x4 = vpaddlq_s16(temp0_16x8);
                temp0_32x2 = vpadd_s32(vget_low_s32(temp0_32x4), vget_high_s32(temp0_32x4));

                u4_sad += vget_lane_s32(temp0_32x2, 0);
                u4_sad += vget_lane_s32(temp0_32x2, 1);
            }
        }
    }
    else
    {
        /* Scalar fallback for block sizes the NEON path above does not handle */
        for(i = 0; i < u4_mb_ht; i++)
        {
            for(j = 0; j < u4_mb_wd; j++)
            {
                WORD16 i2_src = pu1_src[j + i * i4_src_stride];
                WORD16 i2_pred = pu1_pred[j + i * i4_pred_stride];
                WORD16 i2_res = pi2_res[j + i * i4_res_stride];
                u4_sad += ABS(i2_src - i2_pred - i2_res);
            }
        }
    }

    return u4_sad;
}