/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
*******************************************************************************
*
* @file
*  isvce_residual_pred_neon.c
*
* @brief
*  Contains NEON functions used for SVC residual prediction
*
*******************************************************************************
*/
#include <arm_neon.h>

#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_size_defs.h"
#include "isvc_macros.h"
#include "isvc_structs.h"

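/**
 * @brief
 *  2x (dyadic) upsampler for reference layer luma residuals used in SVC
 *  inter-layer residual prediction. When the reference transform size is
 *  8x8 and the block is coded, the full 8x8 residual is interpolated to
 *  16x16; otherwise each coded 4x4 sub-block is interpolated to 8x8. Both
 *  passes use the bilinear half-sample pair (3 * a + b, a + 3 * b) for
 *  adjacent reference samples a and b, with edge replication at the block
 *  borders.
 */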
void isvce_luma_residual_sampler_2x_neon(coordinates_t *ps_ref_array_positions,
                                         coordinates_t *ps_ref_array_phases,
                                         buffer_container_t *ps_inp, buffer_container_t *ps_out,
                                         buffer_container_t *ps_scratch, UWORD32 u4_ref_nnz,
                                         UWORD8 u1_ref_tx_size)
{
    WORD16 *pi2_inp_data = (WORD16 *) ps_inp->pv_data;
    WORD16 *pi2_out_res = (WORD16 *) ps_out->pv_data;
    WORD32 i4_inp_data_stride = ps_inp->i4_data_stride;
    WORD32 i4_out_res_stride = ps_out->i4_data_stride;
    WORD16 *pi2_refarray_buffer = (WORD16 *) ps_scratch->pv_data;
    WORD32 i4_blk_ctr;

    UNUSED(ps_ref_array_positions);
    UNUSED(ps_ref_array_phases);

    /* For 2x scaling, the offsets always point to the TL pixel outside   */
    /* the MB. Hence refTransBlkIdc will differ; and since the phase of   */
    /* the first refArray position for horizontal filtering is > 8, the   */
    /* first row and first column of the refArray are never used.         */
    pi2_inp_data += 1 + i4_inp_data_stride;

    if((u1_ref_tx_size) && (0 != u4_ref_nnz))
    {
        WORD16 *pi2_ref_data_byte;
        WORD32 *pi4_ref_array;
        WORD32 i4_i, i4_j;

        /* ----------- Horizontal Interpolation ---------------- */
        int16x8_t i2_coeff_add_16x8_r0;
        int16x8_t i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1;
        int16x8_t i2_coeff_16x8_sl_r0_0, i2_coeff_16x8_sl_r0_1;
        int16x8_t result_16x8_r0_0, result_16x8_r0_1;

        int16x8_t i2_coeff_add_16x8_r1;
        int16x8_t i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1;
        int16x8_t i2_coeff_16x8_sl_r1_0, i2_coeff_16x8_sl_r1_1;
        int16x8_t result_16x8_r1_0, result_16x8_r1_1;
        int16x8x2_t final_result_16x8x2_r0, final_result_16x8x2_r1;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

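        /* Each output pair is (3 * a + b, a + 3 * b) for adjacent input */
        /* samples a and b: (a << 1) + (a + b) and (b << 1) + (a + b).   */
        /* Results are kept at 4x scale for the vertical pass; two input */
        /* rows are filtered per iteration.                              */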
        for(i4_i = 0; i4_i < BLK8x8SIZE; i4_i += 2)
        {
            i2_coeff_16x8_r0_0 = vld1q_s16(pi2_ref_data_byte);
            i2_coeff_16x8_r0_1 = vld1q_s16((pi2_ref_data_byte + 1));

            i2_coeff_16x8_r1_0 = vld1q_s16(pi2_ref_data_byte + i4_inp_data_stride);
            i2_coeff_16x8_r1_1 = vld1q_s16((pi2_ref_data_byte + i4_inp_data_stride + 1));

            i2_coeff_add_16x8_r0 = vaddq_s16(i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1);
            i2_coeff_16x8_sl_r0_0 = vshlq_n_s16(i2_coeff_16x8_r0_0, 1);
            i2_coeff_16x8_sl_r0_1 = vshlq_n_s16(i2_coeff_16x8_r0_1, 1);

            i2_coeff_add_16x8_r1 = vaddq_s16(i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1);
            i2_coeff_16x8_sl_r1_0 = vshlq_n_s16(i2_coeff_16x8_r1_0, 1);
            i2_coeff_16x8_sl_r1_1 = vshlq_n_s16(i2_coeff_16x8_r1_1, 1);

            result_16x8_r0_0 = vaddq_s16(i2_coeff_16x8_sl_r0_0, i2_coeff_add_16x8_r0);
            result_16x8_r0_1 = vaddq_s16(i2_coeff_16x8_sl_r0_1, i2_coeff_add_16x8_r0);

            result_16x8_r1_0 = vaddq_s16(i2_coeff_16x8_sl_r1_0, i2_coeff_add_16x8_r1);
            result_16x8_r1_1 = vaddq_s16(i2_coeff_16x8_sl_r1_1, i2_coeff_add_16x8_r1);

            final_result_16x8x2_r0 = vzipq_s16(result_16x8_r0_0, result_16x8_r0_1);
            final_result_16x8x2_r1 = vzipq_s16(result_16x8_r1_0, result_16x8_r1_1);

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8x2_r0.val[0])));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8x2_r0.val[0])));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8x2_r0.val[1])));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8x2_r0.val[1])));

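            /* The border columns replicate the edge sample, so 3 * a + b */
            /* degenerates to 4 * a, i.e. the edge sample shifted by 2.   */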
            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            pi2_ref_data_byte += i4_inp_data_stride;

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8x2_r1.val[0])));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8x2_r1.val[0])));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8x2_r1.val[1])));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8x2_r1.val[1])));

            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
        }

        /* ----------- Vertical Interpolation ---------------- */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;
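        /* The same 3:1/1:3 filter now runs down each column of the 16-wide */
        /* intermediate buffer (still at 4x scale). Interior output rows    */
        /* are rounded by >> 4; the first and last rows, filtered in only   */
        /* one direction, by >> 2.                                          */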
        {
            WORD32 *pi4_ref_array_temp;
            WORD16 *pi2_out;
            int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r1_3,
                i4_horz_samp_32x4_r1_4;
            int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2, i4_horz_samp_32x4_r2_3,
                i4_horz_samp_32x4_r2_4;

            int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2, i4_horz_res_32x4_r1_3,
                i4_horz_res_32x4_r1_4;
            int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2, i4_horz_res_32x4_r2_3,
                i4_horz_res_32x4_r2_4;
            int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2, i4_horz_res_32x4_r3_3,
                i4_horz_res_32x4_r3_4;
            int32x4_t horz_add_32x4_r2_1, horz_add_32x4_r2_2, horz_add_32x4_r2_3,
                horz_add_32x4_r2_4;

            int16x8_t comb_horz_16x8_1, comb_horz_16x8_2, comb_horz_16x8_3, comb_horz_16x8_4;
            pi4_ref_array_temp = pi4_ref_array;
            pi2_out = pi2_out_res;

            i4_horz_samp_32x4_r1_1 = vld1q_s32(pi4_ref_array_temp);
            i4_horz_samp_32x4_r1_2 = vld1q_s32(pi4_ref_array_temp + 4);
            i4_horz_samp_32x4_r1_3 = vld1q_s32(pi4_ref_array_temp + 8);
            i4_horz_samp_32x4_r1_4 = vld1q_s32(pi4_ref_array_temp + 12);

            /* populate the first interpolated row */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            pi2_out += i4_out_res_stride;

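            /* Each iteration consumes one new intermediate row and emits two */
            /* output rows: for adjacent intermediate rows a and b these are  */
            /* (3 * a + b) and (a + 3 * b), rounded back to sample scale.     */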
            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
                pi4_ref_array_temp += MB_SIZE;
                i4_horz_samp_32x4_r2_1 = vld1q_s32(pi4_ref_array_temp);
                i4_horz_samp_32x4_r2_2 = vld1q_s32(pi4_ref_array_temp + 4);
                i4_horz_samp_32x4_r2_3 = vld1q_s32(pi4_ref_array_temp + 8);
                i4_horz_samp_32x4_r2_4 = vld1q_s32(pi4_ref_array_temp + 12);

                horz_add_32x4_r2_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r2_1);
                horz_add_32x4_r2_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r2_2);
                horz_add_32x4_r2_3 = vaddq_s32(i4_horz_samp_32x4_r1_3, i4_horz_samp_32x4_r2_3);
                horz_add_32x4_r2_4 = vaddq_s32(i4_horz_samp_32x4_r1_4, i4_horz_samp_32x4_r2_4);

                i4_horz_res_32x4_r2_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r2_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r2_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r2_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_4, 1), horz_add_32x4_r2_4);

                i4_horz_res_32x4_r3_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r3_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r3_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r3_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_4, 1), horz_add_32x4_r2_4);

                i4_horz_res_32x4_r2_1 = vrshrq_n_s32(i4_horz_res_32x4_r2_1, 4);
                i4_horz_res_32x4_r2_2 = vrshrq_n_s32(i4_horz_res_32x4_r2_2, 4);
                i4_horz_res_32x4_r2_3 = vrshrq_n_s32(i4_horz_res_32x4_r2_3, 4);
                i4_horz_res_32x4_r2_4 = vrshrq_n_s32(i4_horz_res_32x4_r2_4, 4);

                i4_horz_res_32x4_r3_1 = vrshrq_n_s32(i4_horz_res_32x4_r3_1, 4);
                i4_horz_res_32x4_r3_2 = vrshrq_n_s32(i4_horz_res_32x4_r3_2, 4);
                i4_horz_res_32x4_r3_3 = vrshrq_n_s32(i4_horz_res_32x4_r3_3, 4);
                i4_horz_res_32x4_r3_4 = vrshrq_n_s32(i4_horz_res_32x4_r3_4, 4);

                comb_horz_16x8_1 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_1),
                                                vmovn_s32(i4_horz_res_32x4_r2_2));
                comb_horz_16x8_2 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_3),
                                                vmovn_s32(i4_horz_res_32x4_r2_4));

                comb_horz_16x8_3 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_1),
                                                vmovn_s32(i4_horz_res_32x4_r3_2));
                comb_horz_16x8_4 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_3),
                                                vmovn_s32(i4_horz_res_32x4_r3_4));

                /* store the two output rows computed from the current pair */
                vst1q_s16(pi2_out, comb_horz_16x8_1);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_2);
                pi2_out += i4_out_res_stride;

                vst1q_s16(pi2_out, comb_horz_16x8_3);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_4);
                pi2_out += i4_out_res_stride;

                /* carry the row 2 samples over as row 1 */
                /* (used in the next iteration)          */
                i4_horz_samp_32x4_r1_1 = i4_horz_samp_32x4_r2_1;
                i4_horz_samp_32x4_r1_2 = i4_horz_samp_32x4_r2_2;
                i4_horz_samp_32x4_r1_3 = i4_horz_samp_32x4_r2_3;
                i4_horz_samp_32x4_r1_4 = i4_horz_samp_32x4_r2_4;
            }

            /* populate the last interpolated row */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            /* horizontal loop updates */
            pi4_ref_array++;
            pi2_out_res++;
        }
    }
    else
    {
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                         */
        /* ----------------------------------------------------------------- */
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
            /* if reference layer is not coded then no processing */
            if(0 != (u4_ref_nnz & 0x1))
            {
                int16x8_t i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_1;
                int16x8_t i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_1;
                int16x8_t i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_1;
                int16x8_t i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_1;
                int16x8_t i2_add_16x8_r0_0;
                int16x8_t i2_add_16x8_r1_0;
                int16x8_t i2_add_16x8_r2_0;
                int16x8_t i2_add_16x8_r3_0;
                int16x8_t i2_res_16x8_r0_0, i2_res_16x8_r0_1;
                int16x8_t i2_res_16x8_r1_0, i2_res_16x8_r1_1;
                int16x8_t i2_res_16x8_r2_0, i2_res_16x8_r2_1;
                int16x8_t i2_res_16x8_r3_0, i2_res_16x8_r3_1;
                int16x4_t i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r0_2;
                int16x4_t i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r1_2;
                int16x4_t i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r2_2;
                int16x4_t i4_horz_samp_16x4_r3_1, i4_horz_samp_16x4_r3_2;
                int32x4_t i4_horz_samp_32x4_r0_1, i4_horz_samp_32x4_r0_2;
                int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2;
                int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2;
                int32x4_t i4_horz_samp_32x4_r3_1, i4_horz_samp_32x4_r3_2;
                int32x4_t i4_horz_add_32x4_r1_1, i4_horz_add_32x4_r1_2;
                int32x4_t i4_horz_add_32x4_r2_1, i4_horz_add_32x4_r2_2;
                int32x4_t i4_horz_add_32x4_r3_1, i4_horz_add_32x4_r3_2;
                int16x4_t i4_horz_res_16x4_r0_1, i4_horz_res_16x4_r0_2;
                int16x4_t i4_horz_res_16x4_r1_1, i4_horz_res_16x4_r1_2;
                int16x4_t i4_horz_res_16x4_r2_1, i4_horz_res_16x4_r2_2;
                int16x4_t i4_horz_res_16x4_r3_1, i4_horz_res_16x4_r3_2;
                int16x4_t i4_horz_res_16x4_r4_1, i4_horz_res_16x4_r4_2;
                int16x4_t i4_horz_res_16x4_r5_1, i4_horz_res_16x4_r5_2;
                int16x4_t i4_horz_res_16x4_r6_1, i4_horz_res_16x4_r6_2;
                int16x4_t i4_horz_res_16x4_r7_1, i4_horz_res_16x4_r7_2;
                int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2;
                int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2;
                int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2;
                int32x4_t i4_horz_res_32x4_r4_1, i4_horz_res_32x4_r4_2;
                int32x4_t i4_horz_res_32x4_r5_1, i4_horz_res_32x4_r5_2;
                int32x4_t i4_horz_res_32x4_r6_1, i4_horz_res_32x4_r6_2;
                int16x8x2_t ti2_res_16x8x2_r0, ti2_res_16x8x2_r1;
                int16x8x2_t ti2_res_16x8x2_r2, ti2_res_16x8x2_r3;

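                /* vextq_s16(v, v, 1) forms the x + 1 neighbours (the lanes  */
                /* that wrap around are never stored); the filter below then */
                /* computes the (3 * a + b, a + 3 * b) pairs at 4x scale, as */
                /* in the 8x8 path.                                          */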
                i2_coeff1_16x8_r0_0 = vld1q_s16(pi2_inp_data);
                i2_coeff1_16x8_r1_0 = vld1q_s16(pi2_inp_data + i4_inp_data_stride);
                i2_coeff1_16x8_r2_0 = vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1));
                i2_coeff1_16x8_r3_0 =
                    vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1) + i4_inp_data_stride);

                i2_coeff1_16x8_r0_1 = vextq_s16(i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_1 = vextq_s16(i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_1 = vextq_s16(i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_1 = vextq_s16(i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_0, 1);

                i2_add_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_coeff1_16x8_r0_0);
                i2_add_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_coeff1_16x8_r1_0);
                i2_add_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_coeff1_16x8_r2_0);
                i2_add_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_coeff1_16x8_r3_0);

                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                i2_coeff1_16x8_r0_1 = vshlq_n_s16(i2_coeff1_16x8_r0_1, 1);
                i2_coeff1_16x8_r1_1 = vshlq_n_s16(i2_coeff1_16x8_r1_1, 1);
                i2_coeff1_16x8_r2_1 = vshlq_n_s16(i2_coeff1_16x8_r2_1, 1);
                i2_coeff1_16x8_r3_1 = vshlq_n_s16(i2_coeff1_16x8_r3_1, 1);

                i2_res_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_0, i2_add_16x8_r0_0);
                i2_res_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_0, i2_add_16x8_r1_0);
                i2_res_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_0, i2_add_16x8_r2_0);
                i2_res_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_0, i2_add_16x8_r3_0);

                i2_res_16x8_r0_1 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_add_16x8_r0_0);
                i2_res_16x8_r1_1 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_add_16x8_r1_0);
                i2_res_16x8_r2_1 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_add_16x8_r2_0);
                i2_res_16x8_r3_1 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_add_16x8_r3_0);

                ti2_res_16x8x2_r0 = vzipq_s16(i2_res_16x8_r0_0, i2_res_16x8_r0_1);
                ti2_res_16x8x2_r1 = vzipq_s16(i2_res_16x8_r1_0, i2_res_16x8_r1_1);
                ti2_res_16x8x2_r2 = vzipq_s16(i2_res_16x8_r2_0, i2_res_16x8_r2_1);
                ti2_res_16x8x2_r3 = vzipq_s16(i2_res_16x8_r3_0, i2_res_16x8_r3_1);

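                /* A second << 1 brings the raw samples to 4x scale for the */
                /* replicated border columns of each 8-wide output row.     */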
                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                vst1q_s16(pi2_refarray_buffer + 1, ti2_res_16x8x2_r0.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer, i2_coeff1_16x8_r0_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 7, i2_coeff1_16x8_r0_0, 3);

                vst1q_s16(pi2_refarray_buffer + 9, ti2_res_16x8x2_r1.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer + 8, i2_coeff1_16x8_r1_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 15, i2_coeff1_16x8_r1_0, 3);

                vst1q_s16(pi2_refarray_buffer + 17, ti2_res_16x8x2_r2.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer + 16, i2_coeff1_16x8_r2_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 23, i2_coeff1_16x8_r2_0, 3);

                vst1q_s16(pi2_refarray_buffer + 25, ti2_res_16x8x2_r3.val[0]);
                vst1q_lane_s16(pi2_refarray_buffer + 24, i2_coeff1_16x8_r3_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 31, i2_coeff1_16x8_r3_0, 3);

                i4_horz_samp_16x4_r0_1 = vld1_s16(pi2_refarray_buffer);
                i4_horz_samp_16x4_r0_2 = vld1_s16(pi2_refarray_buffer + 4);

                i4_horz_samp_16x4_r1_1 = vld1_s16(pi2_refarray_buffer + 8);
                i4_horz_samp_16x4_r1_2 = vld1_s16(pi2_refarray_buffer + 12);

                i4_horz_samp_16x4_r2_1 = vld1_s16(pi2_refarray_buffer + 16);
                i4_horz_samp_16x4_r2_2 = vld1_s16(pi2_refarray_buffer + 20);

                i4_horz_samp_16x4_r3_1 = vld1_s16(pi2_refarray_buffer + 24);
                i4_horz_samp_16x4_r3_2 = vld1_s16(pi2_refarray_buffer + 28);

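                /* Vertical pass over the four 8-wide intermediate rows: the */
                /* two edge output rows are rounded by >> 2; the six         */
                /* interior rows combine adjacent intermediate rows as       */
                /* (3 * a + b) or (a + 3 * b) and are rounded by >> 4.       */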
                i4_horz_res_16x4_r0_1 = vrshr_n_s16(i4_horz_samp_16x4_r0_1, 2);
                i4_horz_res_16x4_r0_2 = vrshr_n_s16(i4_horz_samp_16x4_r0_2, 2);

                i4_horz_add_32x4_r1_1 = vaddl_s16(i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r1_1);
                i4_horz_add_32x4_r1_2 = vaddl_s16(i4_horz_samp_16x4_r0_2, i4_horz_samp_16x4_r1_2);

                i4_horz_add_32x4_r2_1 = vaddl_s16(i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r2_1);
                i4_horz_add_32x4_r2_2 = vaddl_s16(i4_horz_samp_16x4_r1_2, i4_horz_samp_16x4_r2_2);

                i4_horz_add_32x4_r3_1 = vaddl_s16(i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r3_1);
                i4_horz_add_32x4_r3_2 = vaddl_s16(i4_horz_samp_16x4_r2_2, i4_horz_samp_16x4_r3_2);

                i4_horz_samp_32x4_r0_1 = vshll_n_s16(i4_horz_samp_16x4_r0_1, 1);
                i4_horz_samp_32x4_r0_2 = vshll_n_s16(i4_horz_samp_16x4_r0_2, 1);

                i4_horz_samp_32x4_r1_1 = vshll_n_s16(i4_horz_samp_16x4_r1_1, 1);
                i4_horz_samp_32x4_r1_2 = vshll_n_s16(i4_horz_samp_16x4_r1_2, 1);

                i4_horz_samp_32x4_r2_1 = vshll_n_s16(i4_horz_samp_16x4_r2_1, 1);
                i4_horz_samp_32x4_r2_2 = vshll_n_s16(i4_horz_samp_16x4_r2_2, 1);

                i4_horz_samp_32x4_r3_1 = vshll_n_s16(i4_horz_samp_16x4_r3_1, 1);
                i4_horz_samp_32x4_r3_2 = vshll_n_s16(i4_horz_samp_16x4_r3_2, 1);

                i4_horz_res_32x4_r1_1 = vaddq_s32(i4_horz_samp_32x4_r0_1, i4_horz_add_32x4_r1_1);
                i4_horz_res_32x4_r1_2 = vaddq_s32(i4_horz_samp_32x4_r0_2, i4_horz_add_32x4_r1_2);

                i4_horz_res_32x4_r2_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r1_1);
                i4_horz_res_32x4_r2_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r1_2);

                i4_horz_res_32x4_r3_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r2_1);
                i4_horz_res_32x4_r3_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r2_2);

                i4_horz_res_32x4_r4_1 = vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r2_1);
                i4_horz_res_32x4_r4_2 = vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r2_2);

                i4_horz_res_32x4_r5_1 = vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r3_1);
                i4_horz_res_32x4_r5_2 = vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r3_2);

                i4_horz_res_32x4_r6_1 = vaddq_s32(i4_horz_samp_32x4_r3_1, i4_horz_add_32x4_r3_1);
                i4_horz_res_32x4_r6_2 = vaddq_s32(i4_horz_samp_32x4_r3_2, i4_horz_add_32x4_r3_2);

                i4_horz_res_16x4_r1_1 = vqrshrn_n_s32(i4_horz_res_32x4_r1_1, 4);
                i4_horz_res_16x4_r1_2 = vqrshrn_n_s32(i4_horz_res_32x4_r1_2, 4);

                i4_horz_res_16x4_r2_1 = vqrshrn_n_s32(i4_horz_res_32x4_r2_1, 4);
                i4_horz_res_16x4_r2_2 = vqrshrn_n_s32(i4_horz_res_32x4_r2_2, 4);

                i4_horz_res_16x4_r3_1 = vqrshrn_n_s32(i4_horz_res_32x4_r3_1, 4);
                i4_horz_res_16x4_r3_2 = vqrshrn_n_s32(i4_horz_res_32x4_r3_2, 4);

                i4_horz_res_16x4_r4_1 = vqrshrn_n_s32(i4_horz_res_32x4_r4_1, 4);
                i4_horz_res_16x4_r4_2 = vqrshrn_n_s32(i4_horz_res_32x4_r4_2, 4);

                i4_horz_res_16x4_r5_1 = vqrshrn_n_s32(i4_horz_res_32x4_r5_1, 4);
                i4_horz_res_16x4_r5_2 = vqrshrn_n_s32(i4_horz_res_32x4_r5_2, 4);

                i4_horz_res_16x4_r6_1 = vqrshrn_n_s32(i4_horz_res_32x4_r6_1, 4);
                i4_horz_res_16x4_r6_2 = vqrshrn_n_s32(i4_horz_res_32x4_r6_2, 4);

                i4_horz_res_16x4_r7_1 = vrshr_n_s16(i4_horz_samp_16x4_r3_1, 2);
                i4_horz_res_16x4_r7_2 = vrshr_n_s16(i4_horz_samp_16x4_r3_2, 2);

                vst1_s16(pi2_out_res, i4_horz_res_16x4_r0_1);
                vst1_s16(pi2_out_res + 4, i4_horz_res_16x4_r0_2);

                vst1_s16(pi2_out_res + i4_out_res_stride, i4_horz_res_16x4_r1_1);
                vst1_s16(pi2_out_res + i4_out_res_stride + 4, i4_horz_res_16x4_r1_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride << 1), i4_horz_res_16x4_r2_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride << 1) + 4, i4_horz_res_16x4_r2_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 3), i4_horz_res_16x4_r3_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 3) + 4, i4_horz_res_16x4_r3_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride << 2), i4_horz_res_16x4_r4_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride << 2) + 4, i4_horz_res_16x4_r4_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 5), i4_horz_res_16x4_r5_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 5) + 4, i4_horz_res_16x4_r5_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 6), i4_horz_res_16x4_r6_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 6) + 4, i4_horz_res_16x4_r6_2);

                vst1_s16(pi2_out_res + (i4_out_res_stride * 7), i4_horz_res_16x4_r7_1);
                vst1_s16(pi2_out_res + (i4_out_res_stride * 7) + 4, i4_horz_res_16x4_r7_2);

                pi2_out_res += BLK8x8SIZE;
            }
            else
            {
                pi2_out_res += BLK8x8SIZE;
            }

            /* Block level loop updates */
            if(1 == i4_blk_ctr)
            {
                pi2_inp_data -= SUB_BLK_WIDTH_4x4;
                pi2_inp_data += (i4_inp_data_stride * SUB_BLK_HEIGHT_4x4);
                pi2_out_res -= MB_SIZE;
                pi2_out_res += (i4_out_res_stride * BLK8x8SIZE);
                u4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += SUB_BLK_HEIGHT_4x4;
            }
            u4_ref_nnz >>= 1;
        }
        /* The above loop iterates over all the blocks */
    }
}

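/**
 * @brief
 *  Computes the SAD between the input residual (src - pred) and the
 *  inter-layer residual prediction, i.e. sum(abs(src - pred - res)).
 */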
UWORD32 isvce_get_sad_with_residual_pred_neon(buffer_container_t *ps_src,
                                              buffer_container_t *ps_pred,
                                              buffer_container_t *ps_res, UWORD32 u4_mb_wd,
                                              UWORD32 u4_mb_ht)
{
    UWORD32 i, j, u4_sad = 0;
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    UWORD32 u4_num_rows_per_loop = 8;
    UWORD32 u4_ht_by_8 = u4_mb_ht / u4_num_rows_per_loop;
    uint8x8_t src0, src1, src2, src3;
    uint8x8_t src4, src5, src6, src7;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t pred4, pred5, pred6, pred7;
    int16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8, res4_16x8, res5_16x8, res6_16x8,
        res7_16x8;
    uint16x8_t res0_u16x8, res1_u16x8, res2_u16x8, res3_u16x8, res4_u16x8, res5_u16x8, res6_u16x8,
        res7_u16x8;
    int16x8_t respred0_16x8, respred1_16x8, respred2_16x8, respred3_16x8, respred4_16x8,
        respred5_16x8, respred6_16x8, respred7_16x8;
    int16x8_t temp0_16x8, temp1_16x8, temp2_16x8, temp3_16x8, temp4_16x8, temp5_16x8, temp6_16x8,
        temp7_16x8;
    int32x4_t temp0_32x4;
    int32x2_t temp0_32x2;

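    /* Fast path for full-width (16 wide, multiple-of-8 high) blocks; */
    /* everything else takes the scalar fallback below.               */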
    if((u4_mb_wd == 16) && (u4_mb_ht % 8 == 0))
    {
        for(i = 0; i < u4_ht_by_8; i++)
        {
            /* Each iteration of this loop processes 4 rows of 16 bytes, */
            /* so the two iterations together cover 8 rows               */
            for(j = 0; j < 2; j++)
            {
                src0 = vld1_u8(pu1_src);
                src1 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                src2 = vld1_u8(pu1_src);
                src3 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                src4 = vld1_u8(pu1_src);
                src5 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                src6 = vld1_u8(pu1_src);
                src7 = vld1_u8(pu1_src + 8);

                pu1_src += i4_src_stride;

                pred0 = vld1_u8(pu1_pred);
                pred1 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

                pred2 = vld1_u8(pu1_pred);
                pred3 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

                pred4 = vld1_u8(pu1_pred);
                pred5 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

                pred6 = vld1_u8(pu1_pred);
                pred7 = vld1_u8(pu1_pred + 8);

                pu1_pred += i4_pred_stride;

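                /* vsubl_u8 wraps modulo 2^16; reinterpreting the result as */
                /* signed recovers the exact src - pred difference, which   */
                /* lies in [-255, 255].                                     */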
                res0_u16x8 = vsubl_u8(src0, pred0);
                res1_u16x8 = vsubl_u8(src1, pred1);
                res2_u16x8 = vsubl_u8(src2, pred2);
                res3_u16x8 = vsubl_u8(src3, pred3);
                res4_u16x8 = vsubl_u8(src4, pred4);
                res5_u16x8 = vsubl_u8(src5, pred5);
                res6_u16x8 = vsubl_u8(src6, pred6);
                res7_u16x8 = vsubl_u8(src7, pred7);

                res0_16x8 = vreinterpretq_s16_u16(res0_u16x8);
                res1_16x8 = vreinterpretq_s16_u16(res1_u16x8);
                res2_16x8 = vreinterpretq_s16_u16(res2_u16x8);
                res3_16x8 = vreinterpretq_s16_u16(res3_u16x8);
                res4_16x8 = vreinterpretq_s16_u16(res4_u16x8);
                res5_16x8 = vreinterpretq_s16_u16(res5_u16x8);
                res6_16x8 = vreinterpretq_s16_u16(res6_u16x8);
                res7_16x8 = vreinterpretq_s16_u16(res7_u16x8);

                respred0_16x8 = vld1q_s16(pi2_res);
                respred1_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                respred2_16x8 = vld1q_s16(pi2_res);
                respred3_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                respred4_16x8 = vld1q_s16(pi2_res);
                respred5_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                respred6_16x8 = vld1q_s16(pi2_res);
                respred7_16x8 = vld1q_s16(pi2_res + 8);

                pi2_res += i4_res_stride;

                temp0_16x8 = vsubq_s16(res0_16x8, respred0_16x8);
                temp1_16x8 = vsubq_s16(res1_16x8, respred1_16x8);
                temp2_16x8 = vsubq_s16(res2_16x8, respred2_16x8);
                temp3_16x8 = vsubq_s16(res3_16x8, respred3_16x8);
                temp4_16x8 = vsubq_s16(res4_16x8, respred4_16x8);
                temp5_16x8 = vsubq_s16(res5_16x8, respred5_16x8);
                temp6_16x8 = vsubq_s16(res6_16x8, respred6_16x8);
                temp7_16x8 = vsubq_s16(res7_16x8, respred7_16x8);

                temp0_16x8 = vabsq_s16(temp0_16x8);
                temp1_16x8 = vabsq_s16(temp1_16x8);
                temp2_16x8 = vabsq_s16(temp2_16x8);
                temp3_16x8 = vabsq_s16(temp3_16x8);
                temp4_16x8 = vabsq_s16(temp4_16x8);
                temp5_16x8 = vabsq_s16(temp5_16x8);
                temp6_16x8 = vabsq_s16(temp6_16x8);
                temp7_16x8 = vabsq_s16(temp7_16x8);

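                /* Fold the eight absolute-difference vectors with a pairwise */
                /* tree, then widen to 32 bits for the final reduction.       */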
                temp0_16x8 = vaddq_s16(temp0_16x8, temp1_16x8);
                temp1_16x8 = vaddq_s16(temp2_16x8, temp3_16x8);
                temp2_16x8 = vaddq_s16(temp4_16x8, temp5_16x8);
                temp3_16x8 = vaddq_s16(temp6_16x8, temp7_16x8);

                temp0_16x8 = vaddq_s16(temp0_16x8, temp1_16x8);
                temp1_16x8 = vaddq_s16(temp2_16x8, temp3_16x8);

                temp0_16x8 = vaddq_s16(temp0_16x8, temp1_16x8);

                temp0_32x4 = vpaddlq_s16(temp0_16x8);
                temp0_32x2 = vpadd_s32(vget_low_s32(temp0_32x4), vget_high_s32(temp0_32x4));

                u4_sad += vget_lane_s32(temp0_32x2, 0);
                u4_sad += vget_lane_s32(temp0_32x2, 1);
            }
        }
    }
    else
    {
        for(i = 0; i < u4_mb_ht; i++)
        {
            for(j = 0; j < u4_mb_wd; j++)
            {
                WORD16 i2_src = pu1_src[j + i * i4_src_stride];
                WORD16 i2_pred = pu1_pred[j + i * i4_pred_stride];
                WORD16 i2_res = pi2_res[j + i * i4_res_stride];
                u4_sad += ABS(i2_src - i2_pred - i2_res);
            }
        }
    }

    return u4_sad;
}
667