/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
 *******************************************************************************
 * @file
 *  ihevc_resi_trans_neon_32x32.c
 *
 * @brief
 *  Contains definitions of functions for computing residue and fwd transform
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - ihevc_resi_trans_32x32_neon()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>

/* System user files */
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_defs.h"
#include "ihevc_cmn_utils_neon.h"

#include "ihevc_trans_tables.h"
#include "ihevc_resi_trans.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on
 * input pixels
 *
 * @par Description:
 *  Performs residue calculation by subtracting prediction from source,
 * followed by forward transform
 *
 * @param[in] pu1_src
 *  Input 32x32 pixels
 *
 * @param[in] pu1_pred
 *  Prediction data
 *
 * @param[in] pi4_temp
 *  Temporary buffer of size 32x32
 *
 * @param[out] pi2_dst
 *  Output 32x32 coefficients
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction Stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] e_chroma_plane
 *  Enum signalling chroma plane
 *
 * @returns  Block SAD (sum of absolute residue values)
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
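/* A scalar sketch of what this routine computes, assuming the usual HEVC
 * forward-transform formulation (g_ai2_ihevc_trans_32 stands for the 32x32
 * kernel; the per-stage rounding/shift steps of the standard are omitted):
 *
 *   resi[y][x] = pu1_src[y * src_strd + x] - pu1_pred[y * pred_strd + x];
 *   tmp[v][y]  = sum over x of g_ai2_ihevc_trans_32[v][x] * resi[y][x];
 *   dst[u][v]  = sum over y of g_ai2_ihevc_trans_32[u][y] * tmp[v][y];
 *
 * The NEON code below factors each 1-D transform into even/odd butterflies
 * and also accumulates the block SAD (sum of |resi|) for the return value.
 */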
UWORD32 ihevc_resi_trans_32x32_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred,
    WORD32 *pi4_temp, WORD16 *pi2_dst, WORD32 src_strd, WORD32 pred_strd,
    WORD32 dst_strd, CHROMA_PLANE_ID_T e_chroma_plane)
{
    int16x8_t diff_16[4][2];
    WORD32 i;
    int32x2_t sad;
    int64x2_t tmp_a;
    UWORD32 u4_blk_sad = 0;
    WORD32 *pi4_temp_orig = pi4_temp;
    int16x8_t abs = vdupq_n_s16(0);
    int32x4_t sum_val = vdupq_n_s32(0);
    UNUSED(e_chroma_plane);

    // Stage 1
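    /* Each iteration of this loop handles two input rows (16 iterations x 2
     * rows = 32). For every row pair it computes the residue, folds the
     * absolute residues into the running SAD, and applies the first 1-D
     * 32-point partial butterfly. Results go out transposed: coefficient k
     * of a row is written at pi4_temp + 32 * k, and pi4_temp advances by 2
     * per iteration (one int32 result per row). */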
    for(i = 0; i < 16; i++)
    {

        uint8x16_t src_buff, pred_buff;
        abs = vdupq_n_s16(0);

        src_buff = vld1q_u8(pu1_src);
        pred_buff = vld1q_u8(pu1_pred);
        diff_16[0][0] = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff)));
        diff_16[1][0] = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff)));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[0][0]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[1][0]));

        src_buff = vld1q_u8(pu1_src + 16);
        pred_buff = vld1q_u8(pu1_pred + 16);
        diff_16[2][0] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff))));
        diff_16[2][0] = vcombine_s16(
            vget_high_s16(diff_16[2][0]), vget_low_s16(diff_16[2][0]));
        diff_16[3][0] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff))));
        diff_16[3][0] = vcombine_s16(
            vget_high_s16(diff_16[3][0]), vget_low_s16(diff_16[3][0]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[2][0]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[3][0]));
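        /* Above, vrev64q_s16 reverses lanes within each 64-bit half and the
         * vcombine swaps the halves, so diff_16[2]/diff_16[3] hold the
         * residues of samples 23..16 and 31..24 fully reversed. The
         * butterfly further down can then form r[k] +/- r[31-k] with plain
         * vector add/sub instead of lane shuffles. */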

        pu1_src += src_strd;
        pu1_pred += pred_strd;

        src_buff = vld1q_u8(pu1_src);
        pred_buff = vld1q_u8(pu1_pred);
        diff_16[0][1] = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff)));
        diff_16[1][1] = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff)));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[0][1]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[1][1]));

        src_buff = vld1q_u8(pu1_src + 16);
        pred_buff = vld1q_u8(pu1_pred + 16);
        diff_16[2][1] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff))));
        diff_16[2][1] = vcombine_s16(
            vget_high_s16(diff_16[2][1]), vget_low_s16(diff_16[2][1]));
        diff_16[3][1] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff))));
        diff_16[3][1] = vcombine_s16(
            vget_high_s16(diff_16[3][1]), vget_low_s16(diff_16[3][1]));

        abs = vaddq_s16(abs, vabsq_s16(diff_16[2][1]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[3][1]));

        sum_val = vaddq_s32(sum_val,vpaddlq_s16(abs));
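        /* abs is cleared at the top of every iteration: it accumulates 8
         * absolute residues (each <= 255) per 16-bit lane, at most
         * 8 * 255 = 2040 per lane, so it cannot overflow before the
         * vpaddlq_s16 above widens it into the 32-bit sum_val accumulator. */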

        pu1_src += src_strd;
        pu1_pred += pred_strd;
        {
            static const int16x8_t g_ai2_ihevc_trans_32_01_8 = { 64, 83, 64, 36, 64, 36, -64, -83 };

            static const int16x4_t g_ai2_ihevc_trans_32_4_04 = { 89, 75, 50, 18 };
            static const int16x4_t g_ai2_ihevc_trans_32_12_04 = { 75, -18, -89, -50 };
            static const int16x4_t g_ai2_ihevc_trans_32_20_04 = { 50, -89, 18, 75 };
            static const int16x4_t g_ai2_ihevc_trans_32_28_04 = { 18, -50, 75, -89 };

            static const int16x8_t g_ai2_ihevc_trans_32_2_07 = { 90, 87, 80, 70, 57, 43, 25, 9 };
            static const int16x8_t g_ai2_ihevc_trans_32_6_07 = { 87, 57, 9, -43, -80, -90, -70, -25 };
            static const int16x8_t g_ai2_ihevc_trans_32_10_07 = { 80, 9, -70, -87, -25, 57, 90, 43 };
            static const int16x8_t g_ai2_ihevc_trans_32_14_07 = { 70, -43, -87, 9, 90, 25, -80, -57 };
            static const int16x8_t g_ai2_ihevc_trans_32_18_07 = { 57, -80, -25, 90, -9, -87, 43, 70 };
            static const int16x8_t g_ai2_ihevc_trans_32_22_07 = { 43, -90, 57, 25, -87, 70, 9, -80 };
            static const int16x8_t g_ai2_ihevc_trans_32_26_07 = { 25, -70, 90, -80, 43, 9, -57, 87 };
            static const int16x8_t g_ai2_ihevc_trans_32_30_07 = { 9, -25, 43, -57, 70, -80, 87, -90 };

            static const int16x8_t g_ai2_ihevc_trans_32_1_07 = { 90, 90, 88, 85, 82, 78, 73, 67 };
            static const int16x8_t g_ai2_ihevc_trans_32_1_815 = { 61, 54, 46, 38, 31, 22, 13, 4 };
            static const int16x8_t g_ai2_ihevc_trans_32_3_07 = { 90, 82, 67, 46, 22, -4, -31, -54 };
            static const int16x8_t g_ai2_ihevc_trans_32_3_815 = { -73, -85, -90, -88, -78, -61, -38, -13 };
            static const int16x8_t g_ai2_ihevc_trans_32_5_07 = { 88, 67, 31, -13, -54, -82, -90, -78 };
            static const int16x8_t g_ai2_ihevc_trans_32_5_815 = { -46, -4, 38, 73, 90, 85, 61, 22 };
            static const int16x8_t g_ai2_ihevc_trans_32_7_07 = { 85, 46, -13, -67, -90, -73, -22, 38 };
            static const int16x8_t g_ai2_ihevc_trans_32_7_815 = { 82, 88, 54, -4, -61, -90, -78, -31 };
            static const int16x8_t g_ai2_ihevc_trans_32_9_07 = { 82, 22, -54, -90, -61, 13, 78, 85 };
            static const int16x8_t g_ai2_ihevc_trans_32_9_815 = { 31, -46, -90, -67, 4, 73, 88, 38 };
            static const int16x8_t g_ai2_ihevc_trans_32_11_07 = { 78, -4, -82, -73, 13, 85, 67, -22 };
            static const int16x8_t g_ai2_ihevc_trans_32_11_815 = { -88, -61, 31, 90, 54, -38, -90, -46 };
            static const int16x8_t g_ai2_ihevc_trans_32_13_07 = { 73, -31, -90, -22, 78, 67, -38, -90 };
            static const int16x8_t g_ai2_ihevc_trans_32_13_815 = { -13, 82, 61, -46, -88, -4, 85, 54 };
            static const int16x8_t g_ai2_ihevc_trans_32_15_07 = { 67, -54, -78, 38, 85, -22, -90, 4 };
            static const int16x8_t g_ai2_ihevc_trans_32_15_815 = { 90, 13, -88, -31, 82, 46, -73, -61 };
            static const int16x8_t g_ai2_ihevc_trans_32_17_07 = { 61, -73, -46, 82, 31, -88, -13, 90 };
            static const int16x8_t g_ai2_ihevc_trans_32_17_815 = { -4, -90, 22, 85, -38, -78, 54, 67 };
            static const int16x8_t g_ai2_ihevc_trans_32_19_07 = { 54, -85, -4, 88, -46, -61, 82, 13 };
            static const int16x8_t g_ai2_ihevc_trans_32_19_815 = { -90, 38, 67, -78, -22, 90, -31, -73 };
            static const int16x8_t g_ai2_ihevc_trans_32_21_07 = { 46, -90, 38, 54, -90, 31, 61, -88 };
            static const int16x8_t g_ai2_ihevc_trans_32_21_815 = { 22, 67, -85, 13, 73, -82, 4, 78 };
            static const int16x8_t g_ai2_ihevc_trans_32_23_07 = { 38, -88, 73, -4, -67, 90, -46, -31 };
            static const int16x8_t g_ai2_ihevc_trans_32_23_815 = { 85, -78, 13, 61, -90, 54, 22, -82 };
            static const int16x8_t g_ai2_ihevc_trans_32_25_07 = { 31, -78, 90, -61, 4, 54, -88, 82 };
            static const int16x8_t g_ai2_ihevc_trans_32_25_815 = { -38, -22, 73, -90, 67, -13, -46, 85 };
            static const int16x8_t g_ai2_ihevc_trans_32_27_07 = { 22, -61, 85, -90, 73, -38, -4, 46 };
            static const int16x8_t g_ai2_ihevc_trans_32_27_815 = { -78, 90, -82, 54, -13, -31, 67, -88 };
            static const int16x8_t g_ai2_ihevc_trans_32_29_07 = { 13, -38, 61, -78, 88, -90, 85, -73 };
            static const int16x8_t g_ai2_ihevc_trans_32_29_815 = { 54, -31, 4, 22, -46, 67, -82, 90 };
            static const int16x8_t g_ai2_ihevc_trans_32_31_07 = { 4, -13, 22, -31, 38, -46, 54, -61 };
            static const int16x8_t g_ai2_ihevc_trans_32_31_815 = { 67, -73, 78, -82, 85, -88, 90, -90 };

            int32x4x2_t a[32];

            const int16x8_t o1_1 = vsubq_s16(
                diff_16[1][1], diff_16[2][1]); /*R2(9-16) - R2(24-17)*/
            const int16x8_t o1_0 = vsubq_s16(
                diff_16[0][1], diff_16[3][1]); /*R2(1- 8) - R2(32-25)*/
            const int16x8_t o0_1 = vsubq_s16(
                diff_16[1][0], diff_16[2][0]); /*R1(9-16) - R1(24-17)*/
            const int16x8_t o0_0 = vsubq_s16(
                diff_16[0][0], diff_16[3][0]); /*R1(1- 8) - R1(32-25)*/
            const int16x8_t e0_0 = vaddq_s16(
                diff_16[0][0], diff_16[3][0]); /*R1(1- 8) + R1(32-25)*/
            int16x8_t e0_1 = vrev64q_s16(vaddq_s16(
                diff_16[1][0], diff_16[2][0])); /*R1(9-16) + R1(24-17)*/
            e0_1 = vcombine_s16(vget_high_s16(e0_1), vget_low_s16(e0_1));
            const int16x8_t e1_0 = vaddq_s16(
                diff_16[0][1], diff_16[3][1]); /*R2(1- 8) + R2(32-25)*/
            int16x8_t e1_1 = vrev64q_s16(vaddq_s16(
                diff_16[1][1], diff_16[2][1])); /*R2(9-16) + R2(24-17)*/
            e1_1 = vcombine_s16(vget_high_s16(e1_1), vget_low_s16(e1_1));

            const int16x8_t ee0 = vaddq_s16(e0_0, e0_1); /*E1(1- 8) + E1(16-9)*/
            const int16x8_t ee1 = vaddq_s16(e1_0, e1_1); /*E2(1- 8) + E2(16-9)*/
            const int16x8_t eo1 = vsubq_s16(e1_0, e1_1); /*E2(1- 8) - E2(16-9)*/
            const int16x8_t eo0 = vsubq_s16(e0_0, e0_1); /*E1(1- 8) - E1(16-9)*/
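            /* Standard HEVC partial-butterfly decomposition of a row r[]:
             *   o[k]  = r[k] - r[31-k],  e[k]  = r[k] + r[31-k]  (k = 0..15)
             *   eo[k] = e[k] - e[15-k],  ee[k] = e[k] + e[15-k]  (k = 0..7)
             * The o terms feed the odd coefficients 1, 3, ..., 31, eo feeds
             * 2, 6, ..., 30, and ee is split once more below for the rest. */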

            /*EE0(1-4) & EE1(1-4)*/
            const int16x8_t ee_a =
                vcombine_s16(vget_low_s16(ee0), vget_low_s16(ee1));
            /*EE0(8-5) & EE1(8-5)*/
            const int16x8_t ee_b = vcombine_s16(
                vrev64_s16(vget_high_s16(ee0)), vrev64_s16(vget_high_s16(ee1)));

            /*EE(1-4) - EE(8-5)*/
            const int16x8_t eeo = vsubq_s16(ee_a, ee_b);  //Q0
            /*EE(1-4) + EE(8-5)*/
            const int16x8_t eee = vaddq_s16(ee_a, ee_b);  //Q13

            /*EEEE Calculations*/
            const int32x2x2_t ee =
                vtrn_s32(vreinterpret_s32_s16(vget_low_s16(eee)),
                    vreinterpret_s32_s16(vget_high_s16(eee)));
            const int16x8_t eeee_a =
                vreinterpretq_s16_s32(vcombine_s32(ee.val[0], ee.val[0]));
            const int16x8_t eeee_b =
                vcombine_s16(vrev32_s16(vreinterpret_s16_s32(ee.val[1])),
                    vneg_s16(vrev32_s16(vreinterpret_s16_s32(ee.val[1]))));
            const int16x8_t eeee = vaddq_s16(eeee_a, eeee_b);  //q2
            const int16x4x2_t trans_eeee =
                vtrn_s16(vget_low_s16(eeee), vget_high_s16(eeee));
            const int16x4_t eeee_00 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[0]), 0));  //d8
            const int16x4_t eeee_10 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[0]), 1));  //d9
            const int16x4_t eeee_01 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[1]), 0));  //d10
            const int16x4_t eeee_11 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[1]), 1));  //d11
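            /* The last butterfly stage above is kept in one vector: lanes
             * 0-3 of eeee hold EEEE[k] = EEE[k] + EEE[3-k] and lanes 4-7
             * hold EEEO[k] = EEE[k] - EEE[3-k] (the reversed, negated copy
             * in eeee_b supplies the subtraction); the vdup_lane splits then
             * broadcast the row-0/row-1 pairs for the kernel below. */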

            /*Calculation of values 0 8 16 24*/
            a[0].val[0] =
                vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_01_8), eeee_00);
            a[0].val[0] = vmlal_s16(
                a[0].val[0], vget_high_s16(g_ai2_ihevc_trans_32_01_8), eeee_01);
            a[0].val[1] =
                vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_01_8), eeee_10);
            a[0].val[1] = vmlal_s16(
                a[0].val[1], vget_high_s16(g_ai2_ihevc_trans_32_01_8), eeee_11);

            int32x4x2_t val_8 = vzipq_s32(a[0].val[0], a[0].val[1]);

            /*Store*/
            vst1_s32(pi4_temp, vget_low_s32(val_8.val[0])); /*Value 0*/
            vst1_s32(pi4_temp + 256, vget_high_s32(val_8.val[0])); /*Value 8*/
            vst1_s32(pi4_temp + 512, vget_low_s32(val_8.val[1])); /*Value 16*/
            vst1_s32(pi4_temp + 768, vget_high_s32(val_8.val[1])); /*Value 24*/
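            /* The temp buffer is written coefficient-major: coefficient k of
             * the current row pair lands at pi4_temp + 32 * k (two int32
             * values, one per row), so each stage-1 output row holds one
             * coefficient across the whole block, i.e. the data is stored
             * already transposed for stage 2. */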

            /*Calculation of values 4 12 20 28*/
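            /* Coefficients 4, 12, 20 and 28 are 4-tap dot products of eeo
             * with one kernel row each. The vtrnq/vcombine steps below
             * transpose the four partial-sum vectors so the horizontal
             * reduction becomes three plain vector adds; the same
             * multiply/transpose/add pattern repeats for every coefficient
             * group in this function. */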
            /*Multiplications*/
            a[4].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_4_04, vget_low_s16(eeo));
            a[12].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_12_04, vget_low_s16(eeo));
            a[20].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_20_04, vget_low_s16(eeo));
            a[28].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_28_04, vget_low_s16(eeo));

            a[4].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_4_04, vget_high_s16(eeo));
            a[12].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_12_04, vget_high_s16(eeo));
            a[20].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_20_04, vget_high_s16(eeo));
            a[28].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_28_04, vget_high_s16(eeo));

            /*Transposes*/
            int32x4x2_t val_4_0 =
                vtrnq_s32(a[4].val[0], a[12].val[0]);  //q15 q5
            int32x4x2_t val_4_1 =
                vtrnq_s32(a[4].val[1], a[12].val[1]);  //q4 q12
            int32x4x2_t val_20_0 =
                vtrnq_s32(a[20].val[0], a[28].val[0]);  //q8 q2
            int32x4x2_t val_20_1 =
                vtrnq_s32(a[20].val[1], a[28].val[1]);  //q9 q13

            /*Swap*/
            a[4].val[0] = vcombine_s32(vget_low_s32(val_4_0.val[0]),
                vget_low_s32(val_20_0.val[0]));  //q12
            a[4].val[1] = vcombine_s32(vget_high_s32(val_4_0.val[0]),
                vget_high_s32(val_20_0.val[0]));  //q2

            a[12].val[0] = vcombine_s32(vget_low_s32(val_4_0.val[1]),
                vget_low_s32(val_20_0.val[1]));  //q4
            a[12].val[1] = vcombine_s32(vget_high_s32(val_4_0.val[1]),
                vget_high_s32(val_20_0.val[1]));  //q8

            /*Additions*/
            a[12].val[0] = vaddq_s32(a[12].val[0], a[4].val[0]);  //q4
            a[12].val[1] = vaddq_s32(a[12].val[1], a[4].val[1]);  //q8
            a[12].val[1] = vaddq_s32(a[12].val[1], a[12].val[0]);  //q8

            a[20].val[0] = vcombine_s32(vget_low_s32(val_4_1.val[0]),
                vget_low_s32(val_20_1.val[0]));  //q5
            a[20].val[1] = vcombine_s32(vget_high_s32(val_4_1.val[0]),
                vget_high_s32(val_20_1.val[0]));  //q13

            a[28].val[0] = vcombine_s32(vget_low_s32(val_4_1.val[1]),
                vget_low_s32(val_20_1.val[1]));  //q15
            a[28].val[1] = vcombine_s32(vget_high_s32(val_4_1.val[1]),
                vget_high_s32(val_20_1.val[1]));  //q9

            a[28].val[0] = vaddq_s32(a[28].val[0], a[20].val[0]);  //q15
            a[28].val[1] = vaddq_s32(a[28].val[1], a[20].val[1]);  //q5
            a[28].val[1] = vaddq_s32(a[28].val[1], a[28].val[0]);  //q15

            int32x4x2_t val_4 = vzipq_s32(a[12].val[1], a[28].val[1]);

            /*Store*/
            vst1_s32(pi4_temp + 128, vget_low_s32(val_4.val[0])); /*Value 4*/
            vst1_s32(pi4_temp + 384, vget_high_s32(val_4.val[0])); /*Value 12*/
            vst1_s32(pi4_temp + 640, vget_low_s32(val_4.val[1])); /*Value 20*/
            vst1_s32(pi4_temp + 896, vget_high_s32(val_4.val[1])); /*Value 28*/

            /*Calculation of value 2 6 10 14 18 22 26 30*/
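            /* Each of these eight coefficients is an 8-tap dot product of eo
             * with one kernel row: a vmull on the low four lanes plus a
             * vmlal on the high four, done once per row of the pair
             * (val[0]/val[1]). */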
            /*Multiplications*/
            a[2].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_2_07),
                vget_low_s16(eo0));  //q2
            a[6].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_6_07),
                vget_low_s16(eo0));  //q5
            a[10].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_10_07),
                vget_low_s16(eo0));  //q9
            a[14].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_14_07),
                vget_low_s16(eo0));  //q8

            a[14].val[0] = vmlal_s16(a[14].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_14_07), vget_high_s16(eo0));
            a[10].val[0] = vmlal_s16(a[10].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_10_07), vget_high_s16(eo0));
            a[6].val[0] = vmlal_s16(a[6].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_6_07), vget_high_s16(eo0));
            a[2].val[0] = vmlal_s16(a[2].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_2_07), vget_high_s16(eo0));

            a[2].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_2_07),
                vget_low_s16(eo1));  //q4
            a[6].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_6_07),
                vget_low_s16(eo1));  //q13
            a[10].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_10_07),
                vget_low_s16(eo1));  //q12
            a[14].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_14_07),
                vget_low_s16(eo1));  //q15

            a[14].val[1] = vmlal_s16(a[14].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_14_07), vget_high_s16(eo1));
            a[10].val[1] = vmlal_s16(a[10].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_10_07), vget_high_s16(eo1));
            a[6].val[1] = vmlal_s16(a[6].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_6_07), vget_high_s16(eo1));
            a[2].val[1] = vmlal_s16(a[2].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_2_07), vget_high_s16(eo1));

            /*Transposes*/
            int32x4x2_t val_26_0 = vtrnq_s32(a[2].val[0], a[6].val[0]);  //q2 q5
            int32x4x2_t val_1014_0 =
                vtrnq_s32(a[10].val[0], a[14].val[0]);  //q9 q8
            int32x4x2_t val_26_1 =
                vtrnq_s32(a[2].val[1], a[6].val[1]);  //q4 q13
            int32x4x2_t val_1014_1 =
                vtrnq_s32(a[10].val[1], a[14].val[1]);  //q12 q15

            /*Swap*/
            a[2].val[0] = vcombine_s32(vget_low_s32(val_26_0.val[0]),
                vget_low_s32(val_1014_0.val[0]));  //q2
            a[2].val[1] = vcombine_s32(vget_high_s32(val_26_0.val[0]),
                vget_high_s32(val_1014_0.val[0]));  //q9

            a[6].val[0] = vcombine_s32(vget_low_s32(val_26_0.val[1]),
                vget_low_s32(val_1014_0.val[1]));  //q5
            a[6].val[1] = vcombine_s32(vget_high_s32(val_26_0.val[1]),
                vget_high_s32(val_1014_0.val[1]));  //q8

            a[10].val[0] = vcombine_s32(vget_low_s32(val_26_1.val[0]),
                vget_low_s32(val_1014_1.val[0]));  //q4
            a[10].val[1] = vcombine_s32(vget_high_s32(val_26_1.val[0]),
                vget_high_s32(val_1014_1.val[0]));  //q12

            a[14].val[0] = vcombine_s32(vget_low_s32(val_26_1.val[1]),
                vget_low_s32(val_1014_1.val[1]));  //q13
            a[14].val[1] = vcombine_s32(vget_high_s32(val_26_1.val[1]),
                vget_high_s32(val_1014_1.val[1]));  //q15

            /*Additions*/
            a[2].val[0] = vaddq_s32(a[2].val[0], a[6].val[0]);  //q2
            a[2].val[1] = vaddq_s32(a[2].val[1], a[6].val[1]);  //q9
            a[2].val[1] = vaddq_s32(a[2].val[1], a[2].val[0]);  //q9

            a[10].val[0] = vaddq_s32(a[10].val[0], a[14].val[0]);  //q4
            a[10].val[1] = vaddq_s32(a[10].val[1], a[14].val[1]);  //q12
            a[10].val[1] = vaddq_s32(a[10].val[1], a[10].val[0]);  //q12

            int32x4x2_t val_2 = vzipq_s32(a[2].val[1], a[10].val[1]);  //q9 q12

            /*Store*/
            vst1_s32(pi4_temp + 64, vget_low_s32(val_2.val[0])); /*Value 2*/
            vst1_s32(pi4_temp + 192, vget_high_s32(val_2.val[0])); /*Value 6*/
            vst1_s32(pi4_temp + 320, vget_low_s32(val_2.val[1])); /*Value 10*/
            vst1_s32(pi4_temp + 448, vget_high_s32(val_2.val[1])); /*Value 14*/

            a[18].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_18_07),
                vget_low_s16(eo0));  //q0
            a[22].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_22_07),
                vget_low_s16(eo0));  //q5
            a[26].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_26_07),
                vget_low_s16(eo0));  //q9
            a[30].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_30_07),
                vget_low_s16(eo0));  //q15

            a[30].val[0] = vmlal_s16(a[30].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_30_07), vget_high_s16(eo0));
            a[26].val[0] = vmlal_s16(a[26].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_26_07), vget_high_s16(eo0));
            a[22].val[0] = vmlal_s16(a[22].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_22_07), vget_high_s16(eo0));
            a[18].val[0] = vmlal_s16(a[18].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_18_07), vget_high_s16(eo0));

            a[18].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_18_07),
                vget_low_s16(eo1));  //q4
            a[22].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_22_07),
                vget_low_s16(eo1));  //q8
            a[26].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_26_07),
                vget_low_s16(eo1));  //q12
            a[30].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_30_07),
                vget_low_s16(eo1));  //q18

            a[30].val[1] = vmlal_s16(a[30].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_30_07), vget_high_s16(eo1));
            a[26].val[1] = vmlal_s16(a[26].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_26_07), vget_high_s16(eo1));
            a[22].val[1] = vmlal_s16(a[22].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_22_07), vget_high_s16(eo1));
            a[18].val[1] = vmlal_s16(a[18].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_18_07), vget_high_s16(eo1));

            /*Transposes*/
            int32x4x2_t val_1822_0 =
                vtrnq_s32(a[18].val[0], a[22].val[0]);  //q2 q5
            int32x4x2_t val_2630_0 =
                vtrnq_s32(a[26].val[0], a[30].val[0]);  //q9 q8
            int32x4x2_t val_1822_1 =
                vtrnq_s32(a[18].val[1], a[22].val[1]);  //q4 q13
            int32x4x2_t val_2630_1 =
                vtrnq_s32(a[26].val[1], a[30].val[1]);  //q12 q15

            /*Swap*/
            a[18].val[0] = vcombine_s32(vget_low_s32(val_1822_0.val[0]),
                vget_low_s32(val_2630_0.val[0]));  //q2
            a[18].val[1] = vcombine_s32(vget_high_s32(val_1822_0.val[0]),
                vget_high_s32(val_2630_0.val[0]));  //q9

            a[22].val[0] = vcombine_s32(vget_low_s32(val_1822_0.val[1]),
                vget_low_s32(val_2630_0.val[1]));  //q5
            a[22].val[1] = vcombine_s32(vget_high_s32(val_1822_0.val[1]),
                vget_high_s32(val_2630_0.val[1]));  //q8

            a[26].val[0] = vcombine_s32(vget_low_s32(val_1822_1.val[0]),
                vget_low_s32(val_2630_1.val[0]));  //q4
            a[26].val[1] = vcombine_s32(vget_high_s32(val_1822_1.val[0]),
                vget_high_s32(val_2630_1.val[0]));  //q12

            a[30].val[0] = vcombine_s32(vget_low_s32(val_1822_1.val[1]),
                vget_low_s32(val_2630_1.val[1]));  //q13
            a[30].val[1] = vcombine_s32(vget_high_s32(val_1822_1.val[1]),
                vget_high_s32(val_2630_1.val[1]));  //q15

            /*Additions*/
            a[18].val[0] = vaddq_s32(a[18].val[0], a[22].val[0]);  //q2
            a[18].val[1] = vaddq_s32(a[18].val[1], a[22].val[1]);  //q9
            a[18].val[1] = vaddq_s32(a[18].val[1], a[18].val[0]);  //q9

            a[26].val[0] = vaddq_s32(a[26].val[0], a[30].val[0]);  //q4
            a[26].val[1] = vaddq_s32(a[26].val[1], a[30].val[1]);  //q12
            a[26].val[1] = vaddq_s32(a[26].val[1], a[26].val[0]);  //q12

            int32x4x2_t val_18 =
                vzipq_s32(a[18].val[1], a[26].val[1]);  //q9 q12

            /*Store*/
            vst1_s32(pi4_temp + 576, vget_low_s32(val_18.val[0])); /*Value 18*/
            vst1_s32(pi4_temp + 704, vget_high_s32(val_18.val[0])); /*Value 22*/
            vst1_s32(pi4_temp + 832, vget_low_s32(val_18.val[1])); /*Value 26*/
            vst1_s32(pi4_temp + 960, vget_high_s32(val_18.val[1])); /*Value 30*/

            /*Calculations for odd indexes*/
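            /* The odd coefficients 1, 3, ..., 31 need all sixteen odd
             * butterfly inputs, so each one accumulates four multiply steps:
             * low/high halves of o*_0 (o[0..7]) and of o*_1 (o[8..15]),
             * again per row of the pair. */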
            a[1].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_1_07),
                vget_low_s16(o0_0));  //q1
            a[3].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_3_07),
                vget_low_s16(o0_0));  //q5
            a[5].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_5_07),
                vget_low_s16(o0_0));  //q8
            a[7].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_7_07),
                vget_low_s16(o0_0));  //q12

            a[7].val[0] = vmlal_s16(a[7].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_7_07), vget_high_s16(o0_0));
            a[5].val[0] = vmlal_s16(a[5].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_5_07), vget_high_s16(o0_0));
            a[3].val[0] = vmlal_s16(a[3].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_3_07), vget_high_s16(o0_0));
            a[1].val[0] = vmlal_s16(a[1].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_1_07), vget_high_s16(o0_0));

            a[1].val[0] = vmlal_s16(a[1].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_1_815), vget_low_s16(o0_1));
            a[3].val[0] = vmlal_s16(a[3].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_3_815), vget_low_s16(o0_1));
            a[5].val[0] = vmlal_s16(a[5].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_5_815), vget_low_s16(o0_1));
            a[7].val[0] = vmlal_s16(a[7].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_7_815), vget_low_s16(o0_1));

            a[7].val[0] = vmlal_s16(a[7].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_7_815), vget_high_s16(o0_1));
            a[5].val[0] = vmlal_s16(a[5].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_5_815), vget_high_s16(o0_1));
            a[3].val[0] = vmlal_s16(a[3].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_3_815), vget_high_s16(o0_1));
            a[1].val[0] = vmlal_s16(a[1].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_1_815), vget_high_s16(o0_1));

            a[1].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_1_07),
                vget_low_s16(o1_0));  //q0
            a[3].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_3_07),
                vget_low_s16(o1_0));  //q4
            a[5].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_5_07),
                vget_low_s16(o1_0));  //q9
            a[7].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_7_07),
                vget_low_s16(o1_0));  //q13

            a[7].val[1] = vmlal_s16(a[7].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_7_07), vget_high_s16(o1_0));
            a[5].val[1] = vmlal_s16(a[5].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_5_07), vget_high_s16(o1_0));
            a[3].val[1] = vmlal_s16(a[3].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_3_07), vget_high_s16(o1_0));
            a[1].val[1] = vmlal_s16(a[1].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_1_07), vget_high_s16(o1_0));

            a[1].val[1] = vmlal_s16(a[1].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_1_815), vget_low_s16(o1_1));
            a[3].val[1] = vmlal_s16(a[3].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_3_815), vget_low_s16(o1_1));
            a[5].val[1] = vmlal_s16(a[5].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_5_815), vget_low_s16(o1_1));
            a[7].val[1] = vmlal_s16(a[7].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_7_815), vget_low_s16(o1_1));

            a[7].val[1] = vmlal_s16(a[7].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_7_815), vget_high_s16(o1_1));
            a[5].val[1] = vmlal_s16(a[5].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_5_815), vget_high_s16(o1_1));
            a[3].val[1] = vmlal_s16(a[3].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_3_815), vget_high_s16(o1_1));
            a[1].val[1] = vmlal_s16(a[1].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_1_815), vget_high_s16(o1_1));

            /*Transposes*/
            int32x4x2_t val_13_0 = vtrnq_s32(a[1].val[0], a[3].val[0]);  //q0 q4
            int32x4x2_t val_13_1 = vtrnq_s32(a[1].val[1], a[3].val[1]);  //q1 q5
            int32x4x2_t val_57_0 =
                vtrnq_s32(a[5].val[0], a[7].val[0]);  //q8 q12
            int32x4x2_t val_57_1 =
                vtrnq_s32(a[5].val[1], a[7].val[1]);  //q9 q13

            /*Swap*/
            a[1].val[0] = vcombine_s32(vget_low_s32(val_13_0.val[0]),
                vget_low_s32(val_57_0.val[0]));  //q0
            a[1].val[1] = vcombine_s32(vget_high_s32(val_13_0.val[0]),
                vget_high_s32(val_57_0.val[0]));  //q8

            a[3].val[0] = vcombine_s32(vget_low_s32(val_13_0.val[1]),
                vget_low_s32(val_57_0.val[1]));  //q1
            a[3].val[1] = vcombine_s32(vget_high_s32(val_13_0.val[1]),
                vget_high_s32(val_57_0.val[1]));  //q9

            a[5].val[0] = vcombine_s32(vget_low_s32(val_13_1.val[0]),
                vget_low_s32(val_57_1.val[0]));  //q4
            a[5].val[1] = vcombine_s32(vget_high_s32(val_13_1.val[0]),
                vget_high_s32(val_57_1.val[0]));  //q12

            a[7].val[0] = vcombine_s32(vget_low_s32(val_13_1.val[1]),
                vget_low_s32(val_57_1.val[1]));  //q5
            a[7].val[1] = vcombine_s32(vget_high_s32(val_13_1.val[1]),
                vget_high_s32(val_57_1.val[1]));  //q13

            /*Additions*/
            a[1].val[0] = vaddq_s32(a[1].val[0], a[3].val[0]);  //q0
            a[1].val[1] = vaddq_s32(a[1].val[1], a[3].val[1]);  //q8
            a[1].val[1] = vaddq_s32(a[1].val[1], a[1].val[0]);  //q8

            a[5].val[0] = vaddq_s32(a[5].val[0], a[7].val[0]);  //q1
            a[5].val[1] = vaddq_s32(a[5].val[1], a[7].val[1]);  //q9
            a[5].val[1] = vaddq_s32(a[5].val[1], a[5].val[0]);  //q9

            int32x4x2_t val_1 = vzipq_s32(a[1].val[1], a[5].val[1]);  //q8 q9

            /*Store*/
            vst1_s32(pi4_temp + 32, vget_low_s32(val_1.val[0])); /*Value 1*/
            vst1_s32(pi4_temp + 96, vget_high_s32(val_1.val[0])); /*Value 3*/
            vst1_s32(pi4_temp + 160, vget_low_s32(val_1.val[1])); /*Value 5*/
            vst1_s32(pi4_temp + 224, vget_high_s32(val_1.val[1])); /*Value 7*/

            a[9].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_9_07),
                vget_low_s16(o0_0));  //q2
            a[11].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_11_07),
                vget_low_s16(o0_0));  //q2
            a[13].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_13_07),
                vget_low_s16(o0_0));  //q2
            a[15].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_15_07),
                vget_low_s16(o0_0));  //q2

            a[15].val[0] = vmlal_s16(a[15].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_15_07), vget_high_s16(o0_0));
            a[13].val[0] = vmlal_s16(a[13].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_13_07), vget_high_s16(o0_0));
            a[11].val[0] = vmlal_s16(a[11].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_11_07), vget_high_s16(o0_0));
            a[9].val[0] = vmlal_s16(a[9].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_9_07), vget_high_s16(o0_0));

            a[9].val[0] = vmlal_s16(a[9].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_9_815), vget_low_s16(o0_1));
            a[11].val[0] = vmlal_s16(a[11].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_11_815), vget_low_s16(o0_1));
            a[13].val[0] = vmlal_s16(a[13].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_13_815), vget_low_s16(o0_1));
            a[15].val[0] = vmlal_s16(a[15].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_15_815), vget_low_s16(o0_1));

            a[15].val[0] = vmlal_s16(a[15].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_15_815),
                vget_high_s16(o0_1));
            a[13].val[0] = vmlal_s16(a[13].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_13_815),
                vget_high_s16(o0_1));
            a[11].val[0] = vmlal_s16(a[11].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_11_815),
                vget_high_s16(o0_1));
            a[9].val[0] = vmlal_s16(a[9].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_9_815), vget_high_s16(o0_1));

            a[9].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_9_07),
                vget_low_s16(o1_0));  //q2
            a[11].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_11_07),
                vget_low_s16(o1_0));  //q2
            a[13].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_13_07),
                vget_low_s16(o1_0));  //q2
            a[15].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_15_07),
                vget_low_s16(o1_0));  //q2

            a[15].val[1] = vmlal_s16(a[15].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_15_07), vget_high_s16(o1_0));
            a[13].val[1] = vmlal_s16(a[13].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_13_07), vget_high_s16(o1_0));
            a[11].val[1] = vmlal_s16(a[11].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_11_07), vget_high_s16(o1_0));
            a[9].val[1] = vmlal_s16(a[9].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_9_07), vget_high_s16(o1_0));

            a[9].val[1] = vmlal_s16(a[9].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_9_815), vget_low_s16(o1_1));
            a[11].val[1] = vmlal_s16(a[11].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_11_815), vget_low_s16(o1_1));
            a[13].val[1] = vmlal_s16(a[13].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_13_815), vget_low_s16(o1_1));
            a[15].val[1] = vmlal_s16(a[15].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_15_815), vget_low_s16(o1_1));

            a[15].val[1] = vmlal_s16(a[15].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_15_815),
                vget_high_s16(o1_1));
            a[13].val[1] = vmlal_s16(a[13].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_13_815),
                vget_high_s16(o1_1));
            a[11].val[1] = vmlal_s16(a[11].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_11_815),
                vget_high_s16(o1_1));
            a[9].val[1] = vmlal_s16(a[9].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_9_815), vget_high_s16(o1_1));

            int32x4x2_t val_911_0 =
                vtrnq_s32(a[9].val[0], a[11].val[0]);  //q0 q4
            int32x4x2_t val_911_1 =
                vtrnq_s32(a[9].val[1], a[11].val[1]);  //q1 q5
            int32x4x2_t val_1315_0 =
                vtrnq_s32(a[13].val[0], a[15].val[0]);  //q8 q12
            int32x4x2_t val_1315_1 =
                vtrnq_s32(a[13].val[1], a[15].val[1]);  //q9 q13

            a[9].val[0] = vcombine_s32(vget_low_s32(val_911_0.val[0]),
                vget_low_s32(val_1315_0.val[0]));  //q0
            a[9].val[1] = vcombine_s32(vget_high_s32(val_911_0.val[0]),
                vget_high_s32(val_1315_0.val[0]));  //q8

            a[11].val[0] = vcombine_s32(vget_low_s32(val_911_0.val[1]),
                vget_low_s32(val_1315_0.val[1]));  //q1
            a[11].val[1] = vcombine_s32(vget_high_s32(val_911_0.val[1]),
                vget_high_s32(val_1315_0.val[1]));  //q9

            a[13].val[0] = vcombine_s32(vget_low_s32(val_911_1.val[0]),
                vget_low_s32(val_1315_1.val[0]));  //q4
            a[13].val[1] = vcombine_s32(vget_high_s32(val_911_1.val[0]),
                vget_high_s32(val_1315_1.val[0]));  //q12

            a[15].val[0] = vcombine_s32(vget_low_s32(val_911_1.val[1]),
                vget_low_s32(val_1315_1.val[1]));  //q5
            a[15].val[1] = vcombine_s32(vget_high_s32(val_911_1.val[1]),
                vget_high_s32(val_1315_1.val[1]));  //q13

            a[9].val[0] = vaddq_s32(a[9].val[0], a[11].val[0]);  //q0
            a[9].val[1] = vaddq_s32(a[9].val[1], a[11].val[1]);  //q8
            a[9].val[1] = vaddq_s32(a[9].val[1], a[9].val[0]);  //q8

            a[13].val[0] = vaddq_s32(a[13].val[0], a[15].val[0]);  //q1
            a[13].val[1] = vaddq_s32(a[13].val[1], a[15].val[1]);  //q9
            a[13].val[1] = vaddq_s32(a[13].val[1], a[13].val[0]);  //q9

            int32x4x2_t val_9 = vzipq_s32(a[9].val[1], a[13].val[1]);  //q8 q9

            vst1_s32(pi4_temp + 288, vget_low_s32(val_9.val[0])); /*Value 9*/
            vst1_s32(pi4_temp + 352, vget_high_s32(val_9.val[0])); /*Value 11*/
            vst1_s32(pi4_temp + 416, vget_low_s32(val_9.val[1])); /*Value 13*/
            vst1_s32(pi4_temp + 480, vget_high_s32(val_9.val[1])); /*Value 15*/

            a[17].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_17_07),
                vget_low_s16(o0_0));  //q2
            a[19].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_19_07),
                vget_low_s16(o0_0));  //q2
            a[21].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_21_07),
                vget_low_s16(o0_0));  //q2
            a[23].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_23_07),
                vget_low_s16(o0_0));  //q2

            a[23].val[0] = vmlal_s16(a[23].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_23_07), vget_high_s16(o0_0));
            a[21].val[0] = vmlal_s16(a[21].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_21_07), vget_high_s16(o0_0));
            a[19].val[0] = vmlal_s16(a[19].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_19_07), vget_high_s16(o0_0));
            a[17].val[0] = vmlal_s16(a[17].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_17_07), vget_high_s16(o0_0));

            a[17].val[0] = vmlal_s16(a[17].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_17_815), vget_low_s16(o0_1));
            a[19].val[0] = vmlal_s16(a[19].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_19_815), vget_low_s16(o0_1));
            a[21].val[0] = vmlal_s16(a[21].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_21_815), vget_low_s16(o0_1));
            a[23].val[0] = vmlal_s16(a[23].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_23_815), vget_low_s16(o0_1));

            a[23].val[0] = vmlal_s16(a[23].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_23_815),
                vget_high_s16(o0_1));
            a[21].val[0] = vmlal_s16(a[21].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_21_815),
                vget_high_s16(o0_1));
            a[19].val[0] = vmlal_s16(a[19].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_19_815),
                vget_high_s16(o0_1));
            a[17].val[0] = vmlal_s16(a[17].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_17_815),
                vget_high_s16(o0_1));

            a[17].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_17_07),
                vget_low_s16(o1_0));  //q2
            a[19].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_19_07),
                vget_low_s16(o1_0));  //q2
            a[21].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_21_07),
                vget_low_s16(o1_0));  //q2
            a[23].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_23_07),
                vget_low_s16(o1_0));  //q2

            a[23].val[1] = vmlal_s16(a[23].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_23_07), vget_high_s16(o1_0));
            a[21].val[1] = vmlal_s16(a[21].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_21_07), vget_high_s16(o1_0));
            a[19].val[1] = vmlal_s16(a[19].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_19_07), vget_high_s16(o1_0));
            a[17].val[1] = vmlal_s16(a[17].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_17_07), vget_high_s16(o1_0));

            a[17].val[1] = vmlal_s16(a[17].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_17_815), vget_low_s16(o1_1));
            a[19].val[1] = vmlal_s16(a[19].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_19_815), vget_low_s16(o1_1));
            a[21].val[1] = vmlal_s16(a[21].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_21_815), vget_low_s16(o1_1));
            a[23].val[1] = vmlal_s16(a[23].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_23_815), vget_low_s16(o1_1));

            a[23].val[1] = vmlal_s16(a[23].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_23_815),
                vget_high_s16(o1_1));
            a[21].val[1] = vmlal_s16(a[21].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_21_815),
                vget_high_s16(o1_1));
            a[19].val[1] = vmlal_s16(a[19].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_19_815),
                vget_high_s16(o1_1));
            a[17].val[1] = vmlal_s16(a[17].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_17_815),
                vget_high_s16(o1_1));

            int32x4x2_t val_1719_0 =
                vtrnq_s32(a[17].val[0], a[19].val[0]);  //q0 q4
            int32x4x2_t val_1719_1 =
                vtrnq_s32(a[17].val[1], a[19].val[1]);  //q1 q5
            int32x4x2_t val_2123_0 =
                vtrnq_s32(a[21].val[0], a[23].val[0]);  //q8 q12
            int32x4x2_t val_2123_1 =
                vtrnq_s32(a[21].val[1], a[23].val[1]);  //q9 q13

            a[17].val[0] = vcombine_s32(vget_low_s32(val_1719_0.val[0]),
                vget_low_s32(val_2123_0.val[0]));  //q0
            a[17].val[1] = vcombine_s32(vget_high_s32(val_1719_0.val[0]),
                vget_high_s32(val_2123_0.val[0]));  //q8

            a[19].val[0] = vcombine_s32(vget_low_s32(val_1719_0.val[1]),
                vget_low_s32(val_2123_0.val[1]));  //q1
            a[19].val[1] = vcombine_s32(vget_high_s32(val_1719_0.val[1]),
                vget_high_s32(val_2123_0.val[1]));  //q9

            a[21].val[0] = vcombine_s32(vget_low_s32(val_1719_1.val[0]),
                vget_low_s32(val_2123_1.val[0]));  //q4
            a[21].val[1] = vcombine_s32(vget_high_s32(val_1719_1.val[0]),
                vget_high_s32(val_2123_1.val[0]));  //q12

            a[23].val[0] = vcombine_s32(vget_low_s32(val_1719_1.val[1]),
                vget_low_s32(val_2123_1.val[1]));  //q5
            a[23].val[1] = vcombine_s32(vget_high_s32(val_1719_1.val[1]),
                vget_high_s32(val_2123_1.val[1]));  //q13

            a[17].val[0] = vaddq_s32(a[17].val[0], a[19].val[0]);  //q0
            a[17].val[1] = vaddq_s32(a[17].val[1], a[19].val[1]);  //q8
            a[17].val[1] = vaddq_s32(a[17].val[1], a[17].val[0]);  //q8

            a[21].val[0] = vaddq_s32(a[21].val[0], a[23].val[0]);  //q1
            a[21].val[1] = vaddq_s32(a[21].val[1], a[23].val[1]);  //q9
            a[21].val[1] = vaddq_s32(a[21].val[1], a[21].val[0]);  //q9

            int32x4x2_t val_17 = vzipq_s32(a[17].val[1], a[21].val[1]);  //q8 q9

            vst1_s32(pi4_temp + 544, vget_low_s32(val_17.val[0])); /*Value 17*/
            vst1_s32(pi4_temp + 608, vget_high_s32(val_17.val[0])); /*Value 19*/
            vst1_s32(pi4_temp + 672, vget_low_s32(val_17.val[1])); /*Value 21*/
            vst1_s32(pi4_temp + 736, vget_high_s32(val_17.val[1])); /*Value 23*/

            a[25].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_25_07),
                vget_low_s16(o0_0));  //q2
            a[27].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_27_07),
                vget_low_s16(o0_0));  //q2
            a[29].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_29_07),
                vget_low_s16(o0_0));  //q2
            a[31].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_31_07),
                vget_low_s16(o0_0));  //q2

            a[31].val[0] = vmlal_s16(a[31].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_31_07), vget_high_s16(o0_0));
            a[29].val[0] = vmlal_s16(a[29].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_29_07), vget_high_s16(o0_0));
            a[27].val[0] = vmlal_s16(a[27].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_27_07), vget_high_s16(o0_0));
            a[25].val[0] = vmlal_s16(a[25].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_25_07), vget_high_s16(o0_0));

            a[25].val[0] = vmlal_s16(a[25].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_25_815), vget_low_s16(o0_1));
            a[27].val[0] = vmlal_s16(a[27].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_27_815), vget_low_s16(o0_1));
            a[29].val[0] = vmlal_s16(a[29].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_29_815), vget_low_s16(o0_1));
            a[31].val[0] = vmlal_s16(a[31].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_31_815), vget_low_s16(o0_1));

            a[31].val[0] = vmlal_s16(a[31].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_31_815),
                vget_high_s16(o0_1));
            a[29].val[0] = vmlal_s16(a[29].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_29_815),
                vget_high_s16(o0_1));
            a[27].val[0] = vmlal_s16(a[27].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_27_815),
                vget_high_s16(o0_1));
            a[25].val[0] = vmlal_s16(a[25].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_25_815),
                vget_high_s16(o0_1));

            a[25].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_25_07),
                vget_low_s16(o1_0));  //q2
            a[27].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_27_07),
                vget_low_s16(o1_0));  //q2
            a[29].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_29_07),
                vget_low_s16(o1_0));  //q2
            a[31].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_31_07),
                vget_low_s16(o1_0));  //q2

            a[31].val[1] = vmlal_s16(a[31].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_31_07), vget_high_s16(o1_0));
            a[29].val[1] = vmlal_s16(a[29].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_29_07), vget_high_s16(o1_0));
            a[27].val[1] = vmlal_s16(a[27].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_27_07), vget_high_s16(o1_0));
            a[25].val[1] = vmlal_s16(a[25].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_25_07), vget_high_s16(o1_0));

            a[25].val[1] = vmlal_s16(a[25].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_25_815), vget_low_s16(o1_1));
            a[27].val[1] = vmlal_s16(a[27].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_27_815), vget_low_s16(o1_1));
            a[29].val[1] = vmlal_s16(a[29].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_29_815), vget_low_s16(o1_1));
            a[31].val[1] = vmlal_s16(a[31].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_31_815), vget_low_s16(o1_1));

            a[31].val[1] = vmlal_s16(a[31].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_31_815),
                vget_high_s16(o1_1));
            a[29].val[1] = vmlal_s16(a[29].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_29_815),
                vget_high_s16(o1_1));
            a[27].val[1] = vmlal_s16(a[27].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_27_815),
                vget_high_s16(o1_1));
            a[25].val[1] = vmlal_s16(a[25].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_25_815),
                vget_high_s16(o1_1));

            int32x4x2_t val_2527_0 =
                vtrnq_s32(a[25].val[0], a[27].val[0]);  //q0 q4
            int32x4x2_t val_2527_1 =
                vtrnq_s32(a[25].val[1], a[27].val[1]);  //q1 q5
            int32x4x2_t val_2931_0 =
                vtrnq_s32(a[29].val[0], a[31].val[0]);  //q8 q12
            int32x4x2_t val_2931_1 =
                vtrnq_s32(a[29].val[1], a[31].val[1]);  //q9 q13

            a[25].val[0] = vcombine_s32(vget_low_s32(val_2527_0.val[0]),
                vget_low_s32(val_2931_0.val[0]));  //q0
            a[25].val[1] = vcombine_s32(vget_high_s32(val_2527_0.val[0]),
                vget_high_s32(val_2931_0.val[0]));  //q8

            a[27].val[0] = vcombine_s32(vget_low_s32(val_2527_0.val[1]),
                vget_low_s32(val_2931_0.val[1]));  //q1
            a[27].val[1] = vcombine_s32(vget_high_s32(val_2527_0.val[1]),
                vget_high_s32(val_2931_0.val[1]));  //q9

            a[29].val[0] = vcombine_s32(vget_low_s32(val_2527_1.val[0]),
                vget_low_s32(val_2931_1.val[0]));  //q4
            a[29].val[1] = vcombine_s32(vget_high_s32(val_2527_1.val[0]),
                vget_high_s32(val_2931_1.val[0]));  //q12

            a[31].val[0] = vcombine_s32(vget_low_s32(val_2527_1.val[1]),
                vget_low_s32(val_2931_1.val[1]));  //q5
            a[31].val[1] = vcombine_s32(vget_high_s32(val_2527_1.val[1]),
                vget_high_s32(val_2931_1.val[1]));  //q13

            a[25].val[0] = vaddq_s32(a[25].val[0], a[27].val[0]);  //q0
            a[25].val[1] = vaddq_s32(a[25].val[1], a[27].val[1]);  //q8
            a[25].val[1] = vaddq_s32(a[25].val[1], a[25].val[0]);  //q8

            a[29].val[0] = vaddq_s32(a[29].val[0], a[31].val[0]);  //q1
            a[29].val[1] = vaddq_s32(a[29].val[1], a[31].val[1]);  //q9
            a[29].val[1] = vaddq_s32(a[29].val[1], a[29].val[0]);  //q9

            int32x4x2_t val_25 = vzipq_s32(a[25].val[1], a[29].val[1]);  //q8 q9

            vst1_s32(pi4_temp + 800, vget_low_s32(val_25.val[0])); /*Value 25*/
            vst1_s32(pi4_temp + 864, vget_high_s32(val_25.val[0])); /*Value 27*/
            vst1_s32(pi4_temp + 928, vget_low_s32(val_25.val[1])); /*Value 29*/
            vst1_s32(pi4_temp + 992, vget_high_s32(val_25.val[1])); /*Value 31*/

            pi4_temp += 2;
        }
    }

1033     /*sad of the block*/
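    /* Horizontal reduction of the per-lane SAD accumulator: widen the four
       32-bit lanes pairwise into two 64-bit halves, add the halves back
       together as 32-bit lanes (a 32x32 SAD always fits in 32 bits) and
       extract lane 0. */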
    tmp_a = vpaddlq_s32(sum_val);
    sad = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_a)),
                      vreinterpret_s32_s64(vget_high_s64(tmp_a)));
    u4_blk_sad = vget_lane_s32(sad, 0);

    //Stage 2
    {
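        /* Stage 2: column transform of the 32x32 WORD32 intermediate
           produced by Stage 1. Each pi4_temp row holds one Stage-1
           coefficient across all 32 columns; every loop iteration below
           transforms one such row and writes the results down one column of
           pi2_dst. Each constant packs four consecutive entries of one row
           of the HEVC 32-point DCT matrix, named
           g_ai4_ihevc_trans_32_<row>_<lane range>; the _0_8/_1_8 pair folds
           rows 0, 16, 8 and 24 into a single multiply (see the lane mapping
           at the stores below). */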
        static const int32x4_t g_ai4_ihevc_trans_32_0_8 = { 64, -64, 83, -83 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_8 = { 64, 64, 36, 36 };

        static const int32x4_t g_ai4_ihevc_trans_32_4_04 = { 89, 75, 50, 18 };
        static const int32x4_t g_ai4_ihevc_trans_32_12_04 = { 75, -18, -89, -50 };
        static const int32x4_t g_ai4_ihevc_trans_32_20_04 = { 50, -89, 18, 75 };
        static const int32x4_t g_ai4_ihevc_trans_32_28_04 = { 18, -50, 75, -89 };

        static const int32x4_t g_ai4_ihevc_trans_32_2_03 = { 90, 87, 80, 70 };
        static const int32x4_t g_ai4_ihevc_trans_32_2_47 = { 57, 43, 25, 9 };
        static const int32x4_t g_ai4_ihevc_trans_32_6_03 = { 87, 57, 9, -43 };
        static const int32x4_t g_ai4_ihevc_trans_32_6_47 = { -80, -90, -70, -25 };
        static const int32x4_t g_ai4_ihevc_trans_32_10_03 = { 80, 9, -70, -87 };
        static const int32x4_t g_ai4_ihevc_trans_32_10_47 = { -25, 57, 90, 43 };
        static const int32x4_t g_ai4_ihevc_trans_32_14_03 = { 70, -43, -87, 9 };
        static const int32x4_t g_ai4_ihevc_trans_32_14_47 = { 90, 25, -80, -57 };
        static const int32x4_t g_ai4_ihevc_trans_32_18_03 = { 57, -80, -25, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_18_47 = { -9, -87, 43, 70 };
        static const int32x4_t g_ai4_ihevc_trans_32_22_03 = { 43, -90, 57, 25 };
        static const int32x4_t g_ai4_ihevc_trans_32_22_47 = { -87, 70, 9, -80 };
        static const int32x4_t g_ai4_ihevc_trans_32_26_03 = { 25, -70, 90, -80 };
        static const int32x4_t g_ai4_ihevc_trans_32_26_47 = { 43, 9, -57, 87 };
        static const int32x4_t g_ai4_ihevc_trans_32_30_03 = { 9, -25, 43, -57 };
        static const int32x4_t g_ai4_ihevc_trans_32_30_47 = { 70, -80, 87, -90 };

        static const int32x4_t g_ai4_ihevc_trans_32_1_03 = { 90, 90, 88, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_47 = { 82, 78, 73, 67 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_811 = { 61, 54, 46, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_1215 = { 31, 22, 13, 4 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_03 = { 90, 82, 67, 46 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_47 = { 22, -4, -31, -54 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_811 = { -73, -85, -90, -88 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_1215 = { -78, -61, -38, -13 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_03 = { 88, 67, 31, -13 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_47 = { -54, -82, -90, -78 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_811 = { -46, -4, 38, 73 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_1215 = { 90, 85, 61, 22 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_03 = { 85, 46, -13, -67 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_47 = { -90, -73, -22, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_811 = { 82, 88, 54, -4 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_1215 = { -61, -90, -78, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_03 = { 82, 22, -54, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_47 = { -61, 13, 78, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_811 = { 31, -46, -90, -67 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_1215 = { 4, 73, 88, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_03 = { 78, -4, -82, -73 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_47 = { 13, 85, 67, -22 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_811 = { -88, -61, 31, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_1215 = { 54, -38, -90, -46 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_03 = { 73, -31, -90, -22 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_47 = { 78, 67, -38, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_811 = { -13, 82, 61, -46 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_1215 = { -88, -4, 85, 54 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_03 = { 67, -54, -78, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_47 = { 85, -22, -90, 4 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_811 = { 90, 13, -88, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_1215 = { 82, 46, -73, -61 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_03 = { 61, -73, -46, 82 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_47 = { 31, -88, -13, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_811 = { -4, -90, 22, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_1215 = { -38, -78, 54, 67 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_03 = { 54, -85, -4, 88 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_47 = { -46, -61, 82, 13 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_811 = { -90, 38, 67, -78 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_1215 = { -22, 90, -31, -73 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_03 = { 46, -90, 38, 54 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_47 = { -90, 31, 61, -88 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_811 = { 22, 67, -85, 13 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_1215 = { 73, -82, 4, 78 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_03 = { 38, -88, 73, -4 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_47 = { -67, 90, -46, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_811 = { 85, -78, 13, 61 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_1215 = { -90, 54, 22, -82 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_03 = { 31, -78, 90, -61 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_47 = { 4, 54, -88, 82 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_811 = { -38, -22, 73, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_1215 = { 67, -13, -46, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_03 = { 22, -61, 85, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_47 = { 73, -38, -4, 46 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_811 = { -78, 90, -82, 54 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_1215 = { -13, -31, 67, -88 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_03 = { 13, -38, 61, -78 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_47 = { 88, -90, 85, -73 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_811 = { 54, -31, 4, 22 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_1215 = { -46, 67, -82, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_03 = { 4, -13, 22, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_47 = { 38, -46, 54, -61 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_811 = { 67, -73, 78, -82 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_1215 = { 85, -88, 90, -90 };

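        /* a[k] below accumulates the partial products for output row k of
           the current column. Each result is rounded with
           vrshrn_n_s32(..., 15); the single 15-bit shift presumably folds
           the nominal Stage-1 (4) and Stage-2 (11) shifts of the 32x32
           forward transform, Stage 1 having stored unshifted 32-bit sums. */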
        int32x4_t a[32];

        pi4_temp = pi4_temp_orig;
        for(i = 0; i < 32; i++)
        {
            int32x4_t temp_data[8];

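            /* temp_data[0..3]: elements 1-16 of the row in order.
               temp_data[4..7]: elements 17-32 fully reversed (vrev64q plus a
               half swap), so that lane j of temp_data[k] faces lane j of
               temp_data[7 - k] as its mirror element for the butterfly. */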
            temp_data[0] = vld1q_s32(pi4_temp);
            temp_data[1] = vld1q_s32(pi4_temp + 4);
            temp_data[2] = vld1q_s32(pi4_temp + 8);
            temp_data[3] = vld1q_s32(pi4_temp + 12);

            temp_data[4] = vrev64q_s32(vld1q_s32(pi4_temp + 16));
            temp_data[4] = vcombine_s32(
                vget_high_s32(temp_data[4]), vget_low_s32(temp_data[4]));

            temp_data[5] = vrev64q_s32(vld1q_s32(pi4_temp + 20));
            temp_data[5] = vcombine_s32(
                vget_high_s32(temp_data[5]), vget_low_s32(temp_data[5]));

            temp_data[6] = vrev64q_s32(vld1q_s32(pi4_temp + 24));
            temp_data[6] = vcombine_s32(
                vget_high_s32(temp_data[6]), vget_low_s32(temp_data[6]));

            temp_data[7] = vrev64q_s32(vld1q_s32(pi4_temp + 28));
            temp_data[7] = vcombine_s32(
                vget_high_s32(temp_data[7]), vget_low_s32(temp_data[7]));

            pi4_temp += 32;

            const int32x4_t o0 =
                vsubq_s32(temp_data[0], temp_data[7]); /*O(1- 4) = R(1- 4) - R(32-29)*/
            const int32x4_t o1 =
                vsubq_s32(temp_data[1], temp_data[6]); /*O(5- 8) = R(5- 8) - R(28-25)*/
            const int32x4_t o2 =
                vsubq_s32(temp_data[2], temp_data[5]); /*O(9-12) = R(9-12) - R(24-21)*/
            const int32x4_t o3 =
                vsubq_s32(temp_data[3], temp_data[4]); /*O(13-16) = R(13-16) - R(20-17)*/

            int32x4_t e3 = vrev64q_s32(
                vaddq_s32(temp_data[3], temp_data[4])); /*E(16-13) after swap*/
            e3 = vcombine_s32(vget_high_s32(e3), vget_low_s32(e3));
            int32x4_t e2 = vrev64q_s32(
                vaddq_s32(temp_data[2], temp_data[5])); /*E(12- 9) after swap*/
            e2 = vcombine_s32(vget_high_s32(e2), vget_low_s32(e2));

            const int32x4_t e1 =
                vaddq_s32(temp_data[1], temp_data[6]); /*E(5- 8)*/
            const int32x4_t e0 =
                vaddq_s32(temp_data[0], temp_data[7]); /*E(1- 4)*/

            const int32x4_t ee0 = vaddq_s32(e0, e3); /*EE(1-4) = E(1-4) + E(16-13)*/
            int32x4_t ee1 =
                vrev64q_s32(vaddq_s32(e1, e2)); /*EE(8-5) after swap*/
            ee1 = vcombine_s32(vget_high_s32(ee1), vget_low_s32(ee1));
            const int32x4_t eo1 = vsubq_s32(e1, e2); /*EO(5-8) = E(5-8) - E(12-9)*/
            const int32x4_t eo0 = vsubq_s32(e0, e3); /*EO(1-4) = E(1-4) - E(16-13)*/

            /*EE(1-4) - EE(8-5)*/
            const int32x4_t eeo = vsubq_s32(ee0, ee1);  //Q5
            /*EE(1-4) + EE(8-5)*/
            const int32x4_t eee = vaddq_s32(ee0, ee1);  //Q4

            /*EEEE Calculations*/
            const int32x4_t eeee = vcombine_s32(
                vadd_s32(vget_low_s32(eee), vrev64_s32(vget_high_s32(eee))),
                vsub_s32(
                    vget_low_s32(eee), vrev64_s32(vget_high_s32(eee))));  //q6
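            /* eeee lanes: [ EEE(1)+EEE(4), EEE(2)+EEE(3),
               EEE(1)-EEE(4), EEE(2)-EEE(3) ], i.e. EEEE(1-2) in the low
               half and EEEO(1-2) in the high half. */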

            /*Calculation of values 0 8 16 24*/
            /*Multiplications*/
            a[0] = vmulq_s32(g_ai4_ihevc_trans_32_0_8, eeee);
            a[0] = vmlaq_s32(a[0], g_ai4_ihevc_trans_32_1_8, vrev64q_s32(eeee));
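            /* After the multiply-accumulate, a[0] holds
               [64*(EEEE1+EEEE2), 64*(EEEE1-EEEE2),
                83*EEEO1+36*EEEO2, 36*EEEO1-83*EEEO2],
               i.e. values 0, 16, 8 and 24: hence the lane order 0/2/1/3 in
               the stores below. */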
            /*Shift*/
            int16x4_t val_0 = vrshrn_n_s32(a[0], 15);
            /*Store*/
            vst1_lane_s16(pi2_dst, val_0, 0); /*Value 0*/
            vst1_lane_s16(pi2_dst + 8 * dst_strd, val_0, 2); /*Value 8*/
            vst1_lane_s16(pi2_dst + 16 * dst_strd, val_0, 1); /*Value 16*/
            vst1_lane_s16(pi2_dst + 24 * dst_strd, val_0, 3); /*Value 24*/

            /*Calculation of values 4 12 20 28*/
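            /* Rows 4/12/20/28 are four-term dot products of eeo with the
               corresponding DCT rows. Each vmulq below keeps the four
               elementwise products in separate lanes; the vtrnq/vcombine
               regrouping then lets three vector adds finish all four
               horizontal sums at once. */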
            /*Multiplications*/
            a[4] = vmulq_s32(g_ai4_ihevc_trans_32_4_04, eeo);
            a[12] = vmulq_s32(g_ai4_ihevc_trans_32_12_04, eeo);
            a[20] = vmulq_s32(g_ai4_ihevc_trans_32_20_04, eeo);
            a[28] = vmulq_s32(g_ai4_ihevc_trans_32_28_04, eeo);
            /*Transposes*/
            int32x4x2_t val_412 = vtrnq_s32(a[4], a[12]);  //q0 q9
            int32x4x2_t val_2028 = vtrnq_s32(a[20], a[28]);  //q10 q11
            /*Swap*/
            a[4] = vcombine_s32(vget_low_s32(val_412.val[0]),
                vget_low_s32(val_2028.val[0]));  //q0
            a[12] = vcombine_s32(vget_low_s32(val_412.val[1]),
                vget_low_s32(val_2028.val[1]));  //q9
            a[20] = vcombine_s32(vget_high_s32(val_412.val[0]),
                vget_high_s32(val_2028.val[0]));  //q10
            a[28] = vcombine_s32(vget_high_s32(val_412.val[1]),
                vget_high_s32(val_2028.val[1]));  //q11
            /*Additions*/
            a[4] = vaddq_s32(a[4], a[12]);  //q0
            a[20] = vaddq_s32(a[20], a[28]);  //q10
            a[4] = vaddq_s32(a[4], a[20]);  //q0
            /*Shift*/
            int16x4_t val_4 = vrshrn_n_s32(a[4], 15);
            /*Store*/
            vst1_lane_s16(pi2_dst + 4 * dst_strd, val_4, 0); /*Value 4*/
            vst1_lane_s16(pi2_dst + 12 * dst_strd, val_4, 1); /*Value 12*/
            vst1_lane_s16(pi2_dst + 20 * dst_strd, val_4, 2); /*Value 20*/
            vst1_lane_s16(pi2_dst + 28 * dst_strd, val_4, 3); /*Value 28*/

            /*Calculation of values 2 6 10 14 18 22 26 30*/
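            /* Rows 2/6/10/14 (and 18/22/26/30 further down) are eight-term
               dot products: four terms from eo0 plus four from eo1. vadd_s32
               folds each 128-bit accumulator to two lanes and vpadd_s32
               completes the horizontal sum, packing four rows into one
               int16x4 ahead of the rounding shift and stores. */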
            /*Multiplications*/
            a[2] = vmulq_s32(g_ai4_ihevc_trans_32_2_03, eo0);  //q8
            a[6] = vmulq_s32(g_ai4_ihevc_trans_32_6_03, eo0);  //q2
            a[10] = vmulq_s32(g_ai4_ihevc_trans_32_10_03, eo0);  //q2
            a[14] = vmulq_s32(g_ai4_ihevc_trans_32_14_03, eo0);  //q2

            a[14] = vmlaq_s32(a[14], g_ai4_ihevc_trans_32_14_47, eo1);
            a[10] = vmlaq_s32(a[10], g_ai4_ihevc_trans_32_10_47, eo1);
            a[6] = vmlaq_s32(a[6], g_ai4_ihevc_trans_32_6_47, eo1);
            a[2] = vmlaq_s32(a[2], g_ai4_ihevc_trans_32_2_47, eo1);

            int32x2_t val_2 = vadd_s32(vget_low_s32(a[2]), vget_high_s32(a[2]));
            int32x2_t val_6 = vadd_s32(vget_low_s32(a[6]), vget_high_s32(a[6]));
            val_2 = vpadd_s32(val_2, val_6);

            int32x2_t val_10 =
                vadd_s32(vget_low_s32(a[10]), vget_high_s32(a[10]));
            int32x2_t val_14 =
                vadd_s32(vget_low_s32(a[14]), vget_high_s32(a[14]));
            val_10 = vpadd_s32(val_10, val_14);

            /*Shift*/
            int16x4_t val__2 =
                vrshrn_n_s32(vcombine_s32(val_2, val_10), 15);  //q9 q12

            /*Store*/
            vst1_lane_s16(pi2_dst + 2 * dst_strd, val__2, 0); /*Value 2*/
            vst1_lane_s16(pi2_dst + 6 * dst_strd, val__2, 1); /*Value 6*/
            vst1_lane_s16(pi2_dst + 10 * dst_strd, val__2, 2); /*Value 10*/
            vst1_lane_s16(pi2_dst + 14 * dst_strd, val__2, 3); /*Value 14*/

            a[18] = vmulq_s32(g_ai4_ihevc_trans_32_18_03, eo0);  //q2
            a[22] = vmulq_s32(g_ai4_ihevc_trans_32_22_03, eo0);  //q2
            a[26] = vmulq_s32(g_ai4_ihevc_trans_32_26_03, eo0);  //q2
            a[30] = vmulq_s32(g_ai4_ihevc_trans_32_30_03, eo0);  //q2

            a[30] = vmlaq_s32(a[30], g_ai4_ihevc_trans_32_30_47, eo1);
            a[26] = vmlaq_s32(a[26], g_ai4_ihevc_trans_32_26_47, eo1);
            a[22] = vmlaq_s32(a[22], g_ai4_ihevc_trans_32_22_47, eo1);
            a[18] = vmlaq_s32(a[18], g_ai4_ihevc_trans_32_18_47, eo1);

            int32x2_t val_18 =
                vadd_s32(vget_low_s32(a[18]), vget_high_s32(a[18]));
            int32x2_t val_22 =
                vadd_s32(vget_low_s32(a[22]), vget_high_s32(a[22]));
            val_18 = vpadd_s32(val_18, val_22);
            int32x2_t val_26 =
                vadd_s32(vget_low_s32(a[26]), vget_high_s32(a[26]));
            int32x2_t val_30 =
                vadd_s32(vget_low_s32(a[30]), vget_high_s32(a[30]));
            val_26 = vpadd_s32(val_26, val_30);

            int16x4_t val__18 =
                vrshrn_n_s32(vcombine_s32(val_18, val_26), 15);  //q9 q12

            vst1_lane_s16(pi2_dst + 18 * dst_strd, val__18, 0); /*Value 18*/
            vst1_lane_s16(pi2_dst + 22 * dst_strd, val__18, 1); /*Value 22*/
            vst1_lane_s16(pi2_dst + 26 * dst_strd, val__18, 2); /*Value 26*/
            vst1_lane_s16(pi2_dst + 30 * dst_strd, val__18, 3); /*Value 30*/

            /*Calculations for odd indexes*/
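            /* Odd output rows need the full 16-term dot product of O(1-16),
               spread across o0..o3 and the _03/_47/_811/_1215 tables. They
               are processed four rows at a time with the same vadd/vpadd
               horizontal reduction as above. */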
            a[7] = vmulq_s32(g_ai4_ihevc_trans_32_7_03, o0);  //q1
            a[5] = vmulq_s32(g_ai4_ihevc_trans_32_5_03, o0);  //q1
            a[3] = vmulq_s32(g_ai4_ihevc_trans_32_3_03, o0);  //q1
            a[1] = vmulq_s32(g_ai4_ihevc_trans_32_1_03, o0);  //q1

            a[1] = vmlaq_s32(a[1], g_ai4_ihevc_trans_32_1_47, o1);
            a[3] = vmlaq_s32(a[3], g_ai4_ihevc_trans_32_3_47, o1);
            a[5] = vmlaq_s32(a[5], g_ai4_ihevc_trans_32_5_47, o1);
            a[7] = vmlaq_s32(a[7], g_ai4_ihevc_trans_32_7_47, o1);

            a[7] = vmlaq_s32(a[7], g_ai4_ihevc_trans_32_7_811, o2);
            a[5] = vmlaq_s32(a[5], g_ai4_ihevc_trans_32_5_811, o2);
            a[3] = vmlaq_s32(a[3], g_ai4_ihevc_trans_32_3_811, o2);
            a[1] = vmlaq_s32(a[1], g_ai4_ihevc_trans_32_1_811, o2);

            a[1] = vmlaq_s32(a[1], g_ai4_ihevc_trans_32_1_1215, o3);
            int32x2_t val_1 = vadd_s32(vget_low_s32(a[1]), vget_high_s32(a[1]));
            a[3] = vmlaq_s32(a[3], g_ai4_ihevc_trans_32_3_1215, o3);
            int32x2_t val_3 = vadd_s32(vget_low_s32(a[3]), vget_high_s32(a[3]));
            val_1 = vpadd_s32(val_1, val_3);
            a[5] = vmlaq_s32(a[5], g_ai4_ihevc_trans_32_5_1215, o3);
            int32x2_t val_5 = vadd_s32(vget_low_s32(a[5]), vget_high_s32(a[5]));
            a[7] = vmlaq_s32(a[7], g_ai4_ihevc_trans_32_7_1215, o3);
            int32x2_t val_7 = vadd_s32(vget_low_s32(a[7]), vget_high_s32(a[7]));
            val_5 = vpadd_s32(val_5, val_7);

            /*Shift*/
            int16x4_t val__1 =
                vrshrn_n_s32(vcombine_s32(val_1, val_5), 15);  //q9 q12

            /*Store*/
            vst1_lane_s16(pi2_dst + 1 * dst_strd, val__1, 0); /*Value 1*/
            vst1_lane_s16(pi2_dst + 3 * dst_strd, val__1, 1); /*Value 3*/
            vst1_lane_s16(pi2_dst + 5 * dst_strd, val__1, 2); /*Value 5*/
            vst1_lane_s16(pi2_dst + 7 * dst_strd, val__1, 3); /*Value 7*/

            a[15] = vmulq_s32(g_ai4_ihevc_trans_32_15_03, o0);  //q1
            a[13] = vmulq_s32(g_ai4_ihevc_trans_32_13_03, o0);  //q1
            a[11] = vmulq_s32(g_ai4_ihevc_trans_32_11_03, o0);  //q1
            a[9] = vmulq_s32(g_ai4_ihevc_trans_32_9_03, o0);  //q1

            a[9] = vmlaq_s32(a[9], g_ai4_ihevc_trans_32_9_47, o1);
            a[11] = vmlaq_s32(a[11], g_ai4_ihevc_trans_32_11_47, o1);
            a[13] = vmlaq_s32(a[13], g_ai4_ihevc_trans_32_13_47, o1);
            a[15] = vmlaq_s32(a[15], g_ai4_ihevc_trans_32_15_47, o1);

            a[15] = vmlaq_s32(a[15], g_ai4_ihevc_trans_32_15_811, o2);
            a[13] = vmlaq_s32(a[13], g_ai4_ihevc_trans_32_13_811, o2);
            a[11] = vmlaq_s32(a[11], g_ai4_ihevc_trans_32_11_811, o2);
            a[9] = vmlaq_s32(a[9], g_ai4_ihevc_trans_32_9_811, o2);

            a[9] = vmlaq_s32(a[9], g_ai4_ihevc_trans_32_9_1215, o3);
            int32x2_t val_9 = vadd_s32(vget_low_s32(a[9]), vget_high_s32(a[9]));
            a[11] = vmlaq_s32(a[11], g_ai4_ihevc_trans_32_11_1215, o3);
            int32x2_t val_11 =
                vadd_s32(vget_low_s32(a[11]), vget_high_s32(a[11]));
            val_9 = vpadd_s32(val_9, val_11);
            a[13] = vmlaq_s32(a[13], g_ai4_ihevc_trans_32_13_1215, o3);
            int32x2_t val_13 =
                vadd_s32(vget_low_s32(a[13]), vget_high_s32(a[13]));
            a[15] = vmlaq_s32(a[15], g_ai4_ihevc_trans_32_15_1215, o3);
            int32x2_t val_15 =
                vadd_s32(vget_low_s32(a[15]), vget_high_s32(a[15]));
            val_13 = vpadd_s32(val_13, val_15);

            int16x4_t val__9 =
                vrshrn_n_s32(vcombine_s32(val_9, val_13), 15);  //q9 q12

            vst1_lane_s16(pi2_dst + 9 * dst_strd, val__9, 0); /*Value 9*/
            vst1_lane_s16(pi2_dst + 11 * dst_strd, val__9, 1); /*Value 11*/
            vst1_lane_s16(pi2_dst + 13 * dst_strd, val__9, 2); /*Value 13*/
            vst1_lane_s16(pi2_dst + 15 * dst_strd, val__9, 3); /*Value 15*/

            a[23] = vmulq_s32(g_ai4_ihevc_trans_32_23_03, o0);  //q1
            a[21] = vmulq_s32(g_ai4_ihevc_trans_32_21_03, o0);  //q1
            a[19] = vmulq_s32(g_ai4_ihevc_trans_32_19_03, o0);  //q1
            a[17] = vmulq_s32(g_ai4_ihevc_trans_32_17_03, o0);  //q1

            a[17] = vmlaq_s32(a[17], g_ai4_ihevc_trans_32_17_47, o1);
            a[19] = vmlaq_s32(a[19], g_ai4_ihevc_trans_32_19_47, o1);
            a[21] = vmlaq_s32(a[21], g_ai4_ihevc_trans_32_21_47, o1);
            a[23] = vmlaq_s32(a[23], g_ai4_ihevc_trans_32_23_47, o1);

            a[23] = vmlaq_s32(a[23], g_ai4_ihevc_trans_32_23_811, o2);
            a[21] = vmlaq_s32(a[21], g_ai4_ihevc_trans_32_21_811, o2);
            a[19] = vmlaq_s32(a[19], g_ai4_ihevc_trans_32_19_811, o2);
            a[17] = vmlaq_s32(a[17], g_ai4_ihevc_trans_32_17_811, o2);

            a[17] = vmlaq_s32(a[17], g_ai4_ihevc_trans_32_17_1215, o3);
            int32x2_t val_17 =
                vadd_s32(vget_low_s32(a[17]), vget_high_s32(a[17]));
            a[19] = vmlaq_s32(a[19], g_ai4_ihevc_trans_32_19_1215, o3);
            int32x2_t val_19 =
                vadd_s32(vget_low_s32(a[19]), vget_high_s32(a[19]));
            val_17 = vpadd_s32(val_17, val_19);
            a[21] = vmlaq_s32(a[21], g_ai4_ihevc_trans_32_21_1215, o3);
            int32x2_t val_21 =
                vadd_s32(vget_low_s32(a[21]), vget_high_s32(a[21]));
            a[23] = vmlaq_s32(a[23], g_ai4_ihevc_trans_32_23_1215, o3);
            int32x2_t val_23 =
                vadd_s32(vget_low_s32(a[23]), vget_high_s32(a[23]));
            val_21 = vpadd_s32(val_21, val_23);

            int16x4_t val__17 =
                vrshrn_n_s32(vcombine_s32(val_17, val_21), 15);  //q9 q12

            vst1_lane_s16(pi2_dst + 17 * dst_strd, val__17, 0); /*Value 17*/
            vst1_lane_s16(pi2_dst + 19 * dst_strd, val__17, 1); /*Value 19*/
            vst1_lane_s16(pi2_dst + 21 * dst_strd, val__17, 2); /*Value 21*/
            vst1_lane_s16(pi2_dst + 23 * dst_strd, val__17, 3); /*Value 23*/

            a[31] = vmulq_s32(g_ai4_ihevc_trans_32_31_03, o0);  //q10
            a[29] = vmulq_s32(g_ai4_ihevc_trans_32_29_03, o0);  //q1
            a[27] = vmulq_s32(g_ai4_ihevc_trans_32_27_03, o0);  //q1
            a[25] = vmulq_s32(g_ai4_ihevc_trans_32_25_03, o0);  //q1

            a[25] = vmlaq_s32(a[25], g_ai4_ihevc_trans_32_25_47, o1);
            a[27] = vmlaq_s32(a[27], g_ai4_ihevc_trans_32_27_47, o1);
            a[29] = vmlaq_s32(a[29], g_ai4_ihevc_trans_32_29_47, o1);
            a[31] = vmlaq_s32(a[31], g_ai4_ihevc_trans_32_31_47, o1);

            a[31] = vmlaq_s32(a[31], g_ai4_ihevc_trans_32_31_811, o2);
            a[29] = vmlaq_s32(a[29], g_ai4_ihevc_trans_32_29_811, o2);
            a[27] = vmlaq_s32(a[27], g_ai4_ihevc_trans_32_27_811, o2);
            a[25] = vmlaq_s32(a[25], g_ai4_ihevc_trans_32_25_811, o2);

            a[25] = vmlaq_s32(a[25], g_ai4_ihevc_trans_32_25_1215, o3);
            int32x2_t val_25 =
                vadd_s32(vget_low_s32(a[25]), vget_high_s32(a[25]));
            a[27] = vmlaq_s32(a[27], g_ai4_ihevc_trans_32_27_1215, o3);
            int32x2_t val_27 =
                vadd_s32(vget_low_s32(a[27]), vget_high_s32(a[27]));
            val_25 = vpadd_s32(val_25, val_27);
            a[29] = vmlaq_s32(a[29], g_ai4_ihevc_trans_32_29_1215, o3);
            int32x2_t val_29 =
                vadd_s32(vget_low_s32(a[29]), vget_high_s32(a[29]));
            a[31] = vmlaq_s32(a[31], g_ai4_ihevc_trans_32_31_1215, o3);
            int32x2_t val_31 =
                vadd_s32(vget_low_s32(a[31]), vget_high_s32(a[31]));
            val_29 = vpadd_s32(val_29, val_31);

            int16x4_t val__25 =
                vrshrn_n_s32(vcombine_s32(val_25, val_29), 15);  //q9 q12

            vst1_lane_s16(pi2_dst + 25 * dst_strd, val__25, 0); /*Value 25*/
            vst1_lane_s16(pi2_dst + 27 * dst_strd, val__25, 1); /*Value 27*/
            vst1_lane_s16(pi2_dst + 29 * dst_strd, val__25, 2); /*Value 29*/
            vst1_lane_s16(pi2_dst + 31 * dst_strd, val__25, 3); /*Value 31*/

            pi2_dst++;
        }
    }
    return u4_blk_sad;
}