/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 *******************************************************************************
 * @file
 *  ihevc_resi_trans_neon_32x32.c
 *
 * @brief
 *  Contains definitions of functions for computing residue and fwd transform
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - ihevc_resi_trans_32x32_neon()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_defs.h"
#include "ihevc_cmn_utils_neon.h"

#include "ihevc_trans_tables.h"
#include "ihevc_resi_trans.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on
 *  input pixels
 *
 * @par Description:
 *  Performs residue calculation by subtracting the prediction from the
 *  source, followed by the forward transform
 *
 * @param[in] pu1_src
 *  Input 32x32 pixels
 *
 * @param[in] pu1_pred
 *  Prediction data
 *
 * @param[in] pi4_temp
 *  Temporary buffer of size 32x32 (32-bit) holding the first-stage output
 *
 * @param[out] pi2_dst
 *  Output 32x32 coefficients
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output stride
 *
 * @param[in] e_chroma_plane
 *  Enum signalling chroma plane
 *
 * @returns  SAD of the 32x32 residue block
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
UWORD32 ihevc_resi_trans_32x32_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred,
    WORD32 *pi4_temp, WORD16 *pi2_dst, WORD32 src_strd, WORD32 pred_strd,
    WORD32 dst_strd, CHROMA_PLANE_ID_T e_chroma_plane)
{
    int16x8_t diff_16[4][2];
    WORD32 i;
    int32x2_t sad;
    int64x2_t tmp_a;
    UWORD32 u4_blk_sad = 0;
    WORD32 *pi4_temp_orig = pi4_temp;
    int16x8_t abs = vdupq_n_s16(0);
    int32x4_t sum_val = vdupq_n_s32(0);
    UNUSED(e_chroma_plane);

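    /* Two-stage row-column forward transform (a sketch of the data flow as
     * read from the code below): Stage 1 applies the 32-point 1-D transform
     * along each row of the residue and stores 32-bit partial results into
     * pi4_temp, transposed so that coefficient k of every row lands in temp
     * row k (offset k * 32). Stage 2 applies the second 1-D transform to
     * each temp row and writes the rounded 16-bit coefficients to pi2_dst.
     * The residue SAD is accumulated on the fly in Stage 1. */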
    // Stage 1
    for(i = 0; i < 16; i++)
    {

        uint8x16_t src_buff, pred_buff;
        abs = vdupq_n_s16(0);

        src_buff = vld1q_u8(pu1_src);
        pred_buff = vld1q_u8(pu1_pred);
        diff_16[0][0] = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff)));
        diff_16[1][0] = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff)));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[0][0]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[1][0]));

        src_buff = vld1q_u8(pu1_src + 16);
        pred_buff = vld1q_u8(pu1_pred + 16);
        diff_16[2][0] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff))));
        diff_16[2][0] = vcombine_s16(
            vget_high_s16(diff_16[2][0]), vget_low_s16(diff_16[2][0]));
        diff_16[3][0] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff))));
        diff_16[3][0] = vcombine_s16(
            vget_high_s16(diff_16[3][0]), vget_low_s16(diff_16[3][0]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[2][0]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[3][0]));

        pu1_src += src_strd;
        pu1_pred += pred_strd;

        src_buff = vld1q_u8(pu1_src);
        pred_buff = vld1q_u8(pu1_pred);
        diff_16[0][1] = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff)));
        diff_16[1][1] = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff)));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[0][1]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[1][1]));

        src_buff = vld1q_u8(pu1_src + 16);
        pred_buff = vld1q_u8(pu1_pred + 16);
        diff_16[2][1] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(src_buff), vget_low_u8(pred_buff))));
        diff_16[2][1] = vcombine_s16(
            vget_high_s16(diff_16[2][1]), vget_low_s16(diff_16[2][1]));
        diff_16[3][1] = vrev64q_s16(vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(src_buff), vget_high_u8(pred_buff))));
        diff_16[3][1] = vcombine_s16(
            vget_high_s16(diff_16[3][1]), vget_low_s16(diff_16[3][1]));

        abs = vaddq_s16(abs, vabsq_s16(diff_16[2][1]));
        abs = vaddq_s16(abs, vabsq_s16(diff_16[3][1]));

        sum_val = vaddq_s32(sum_val, vpaddlq_s16(abs));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
        {
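            /* The constant vectors below appear to be rows of the HEVC
             * 32-point DCT matrix (g_ai2_ihevc_trans_32 in
             * ihevc_trans_tables.h), pre-split into the column subsets each
             * butterfly stage needs: _01_8 for the EEEE stage, _*_04 for
             * the EEO stage, _*_07 for the EO stage, and the _07/_815 pairs
             * for the 16 odd-index outputs. */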
            static const int16x8_t g_ai2_ihevc_trans_32_01_8 = { 64, 83, 64, 36, 64, 36, -64, -83 };

            static const int16x4_t g_ai2_ihevc_trans_32_4_04 = { 89, 75, 50, 18 };
            static const int16x4_t g_ai2_ihevc_trans_32_12_04 = { 75, -18, -89, -50 };
            static const int16x4_t g_ai2_ihevc_trans_32_20_04 = { 50, -89, 18, 75 };
            static const int16x4_t g_ai2_ihevc_trans_32_28_04 = { 18, -50, 75, -89 };

            static const int16x8_t g_ai2_ihevc_trans_32_2_07 = { 90, 87, 80, 70, 57, 43, 25, 9 };
            static const int16x8_t g_ai2_ihevc_trans_32_6_07 = { 87, 57, 9, -43, -80, -90, -70, -25 };
            static const int16x8_t g_ai2_ihevc_trans_32_10_07 = { 80, 9, -70, -87, -25, 57, 90, 43 };
            static const int16x8_t g_ai2_ihevc_trans_32_14_07 = { 70, -43, -87, 9, 90, 25, -80, -57 };
            static const int16x8_t g_ai2_ihevc_trans_32_18_07 = { 57, -80, -25, 90, -9, -87, 43, 70 };
            static const int16x8_t g_ai2_ihevc_trans_32_22_07 = { 43, -90, 57, 25, -87, 70, 9, -80 };
            static const int16x8_t g_ai2_ihevc_trans_32_26_07 = { 25, -70, 90, -80, 43, 9, -57, 87 };
            static const int16x8_t g_ai2_ihevc_trans_32_30_07 = { 9, -25, 43, -57, 70, -80, 87, -90 };

            static const int16x8_t g_ai2_ihevc_trans_32_1_07 = { 90, 90, 88, 85, 82, 78, 73, 67 };
            static const int16x8_t g_ai2_ihevc_trans_32_1_815 = { 61, 54, 46, 38, 31, 22, 13, 4 };
            static const int16x8_t g_ai2_ihevc_trans_32_3_07 = { 90, 82, 67, 46, 22, -4, -31, -54 };
            static const int16x8_t g_ai2_ihevc_trans_32_3_815 = { -73, -85, -90, -88, -78, -61, -38, -13 };
            static const int16x8_t g_ai2_ihevc_trans_32_5_07 = { 88, 67, 31, -13, -54, -82, -90, -78 };
            static const int16x8_t g_ai2_ihevc_trans_32_5_815 = { -46, -4, 38, 73, 90, 85, 61, 22 };
            static const int16x8_t g_ai2_ihevc_trans_32_7_07 = { 85, 46, -13, -67, -90, -73, -22, 38 };
            static const int16x8_t g_ai2_ihevc_trans_32_7_815 = { 82, 88, 54, -4, -61, -90, -78, -31 };
            static const int16x8_t g_ai2_ihevc_trans_32_9_07 = { 82, 22, -54, -90, -61, 13, 78, 85 };
            static const int16x8_t g_ai2_ihevc_trans_32_9_815 = { 31, -46, -90, -67, 4, 73, 88, 38 };
            static const int16x8_t g_ai2_ihevc_trans_32_11_07 = { 78, -4, -82, -73, 13, 85, 67, -22 };
            static const int16x8_t g_ai2_ihevc_trans_32_11_815 = { -88, -61, 31, 90, 54, -38, -90, -46 };
            static const int16x8_t g_ai2_ihevc_trans_32_13_07 = { 73, -31, -90, -22, 78, 67, -38, -90 };
            static const int16x8_t g_ai2_ihevc_trans_32_13_815 = { -13, 82, 61, -46, -88, -4, 85, 54 };
            static const int16x8_t g_ai2_ihevc_trans_32_15_07 = { 67, -54, -78, 38, 85, -22, -90, 4 };
            static const int16x8_t g_ai2_ihevc_trans_32_15_815 = { 90, 13, -88, -31, 82, 46, -73, -61 };
            static const int16x8_t g_ai2_ihevc_trans_32_17_07 = { 61, -73, -46, 82, 31, -88, -13, 90 };
            static const int16x8_t g_ai2_ihevc_trans_32_17_815 = { -4, -90, 22, 85, -38, -78, 54, 67 };
            static const int16x8_t g_ai2_ihevc_trans_32_19_07 = { 54, -85, -4, 88, -46, -61, 82, 13 };
            static const int16x8_t g_ai2_ihevc_trans_32_19_815 = { -90, 38, 67, -78, -22, 90, -31, -73 };
            static const int16x8_t g_ai2_ihevc_trans_32_21_07 = { 46, -90, 38, 54, -90, 31, 61, -88 };
            static const int16x8_t g_ai2_ihevc_trans_32_21_815 = { 22, 67, -85, 13, 73, -82, 4, 78 };
            static const int16x8_t g_ai2_ihevc_trans_32_23_07 = { 38, -88, 73, -4, -67, 90, -46, -31 };
            static const int16x8_t g_ai2_ihevc_trans_32_23_815 = { 85, -78, 13, 61, -90, 54, 22, -82 };
            static const int16x8_t g_ai2_ihevc_trans_32_25_07 = { 31, -78, 90, -61, 4, 54, -88, 82 };
            static const int16x8_t g_ai2_ihevc_trans_32_25_815 = { -38, -22, 73, -90, 67, -13, -46, 85 };
            static const int16x8_t g_ai2_ihevc_trans_32_27_07 = { 22, -61, 85, -90, 73, -38, -4, 46 };
            static const int16x8_t g_ai2_ihevc_trans_32_27_815 = { -78, 90, -82, 54, -13, -31, 67, -88 };
            static const int16x8_t g_ai2_ihevc_trans_32_29_07 = { 13, -38, 61, -78, 88, -90, 85, -73 };
            static const int16x8_t g_ai2_ihevc_trans_32_29_815 = { 54, -31, 4, 22, -46, 67, -82, 90 };
            static const int16x8_t g_ai2_ihevc_trans_32_31_07 = { 4, -13, 22, -31, 38, -46, 54, -61 };
            static const int16x8_t g_ai2_ihevc_trans_32_31_815 = { 67, -73, 78, -82, 85, -88, 90, -90 };

            int32x4x2_t a[32];

            const int16x8_t o1_1 = vsubq_s16(
                diff_16[1][1], diff_16[2][1]); /*R2(9-16) - R2(24-17)*/
            const int16x8_t o1_0 = vsubq_s16(
                diff_16[0][1], diff_16[3][1]); /*R2(1- 8) - R2(32-25)*/
            const int16x8_t o0_1 = vsubq_s16(
                diff_16[1][0], diff_16[2][0]); /*R1(9-16) - R1(24-17)*/
            const int16x8_t o0_0 = vsubq_s16(
                diff_16[0][0], diff_16[3][0]); /*R1(1- 8) - R1(32-25)*/
            const int16x8_t e0_0 = vaddq_s16(
                diff_16[0][0], diff_16[3][0]); /*R1(1- 8) + R1(32-25)*/
            int16x8_t e0_1 = vrev64q_s16(vaddq_s16(
                diff_16[1][0], diff_16[2][0])); /*R1(9-16) + R1(24-17)*/
            e0_1 = vcombine_s16(vget_high_s16(e0_1), vget_low_s16(e0_1));
            const int16x8_t e1_0 = vaddq_s16(
                diff_16[0][1], diff_16[3][1]); /*R2(1- 8) + R2(32-25)*/
            int16x8_t e1_1 = vrev64q_s16(vaddq_s16(
                diff_16[1][1], diff_16[2][1])); /*R2(9-16) + R2(24-17)*/
            e1_1 = vcombine_s16(vget_high_s16(e1_1), vget_low_s16(e1_1));

            const int16x8_t ee0 = vaddq_s16(e0_0, e0_1); /*E1(1- 8) + E1(16-9)*/
            const int16x8_t ee1 = vaddq_s16(e1_0, e1_1); /*E2(1- 8) + E2(16-9)*/
            const int16x8_t eo1 = vsubq_s16(e1_0, e1_1); /*E2(1- 8) - E2(16-9)*/
            const int16x8_t eo0 = vsubq_s16(e0_0, e0_1); /*E1(1- 8) - E1(16-9)*/

            /*EE0(1-4) & EE1(1-4)*/
            const int16x8_t ee_a =
                vcombine_s16(vget_low_s16(ee0), vget_low_s16(ee1));
            /*EE0(8-5) & EE1(8-5)*/
            const int16x8_t ee_b = vcombine_s16(
                vrev64_s16(vget_high_s16(ee0)), vrev64_s16(vget_high_s16(ee1)));

            /*EE(1-4) - EE(8-5)*/
            const int16x8_t eeo = vsubq_s16(ee_a, ee_b); //Q0
            /*EE(1-4) + EE(8-5)*/
            const int16x8_t eee = vaddq_s16(ee_a, ee_b); //Q13

            /*EEEE Calculations*/
            const int32x2x2_t ee =
                vtrn_s32(vreinterpret_s32_s16(vget_low_s16(eee)),
                    vreinterpret_s32_s16(vget_high_s16(eee)));
            const int16x8_t eeee_a =
                vreinterpretq_s16_s32(vcombine_s32(ee.val[0], ee.val[0]));
            const int16x8_t eeee_b =
                vcombine_s16(vrev32_s16(vreinterpret_s16_s32(ee.val[1])),
                    vneg_s16(vrev32_s16(vreinterpret_s16_s32(ee.val[1]))));
            const int16x8_t eeee = vaddq_s16(eeee_a, eeee_b); //q2
            const int16x4x2_t trans_eeee =
                vtrn_s16(vget_low_s16(eeee), vget_high_s16(eeee));
            const int16x4_t eeee_00 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[0]), 0)); //d8
            const int16x4_t eeee_10 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[0]), 1)); //d9
            const int16x4_t eeee_01 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[1]), 0)); //d10
            const int16x4_t eeee_11 = vreinterpret_s16_s32(vdup_lane_s32(
                vreinterpret_s32_s16(trans_eeee.val[1]), 1)); //d11

            /*Calculation of values 0 8 16 24*/
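            /* eeee_00/eeee_01 hold {EEEE(1), EEEO(1)} and {EEEE(2), EEEO(2)}
             * of row 1 (eeee_10/eeee_11 likewise for row 2), so one
             * vmull/vmlal pair against {64, 83, 64, 36 | 64, 36, -64, -83}
             * yields, per row, the four outputs {coeff 0, 8, 16, 24} in a
             * single int32x4_t; the vzip below pairs the two rows for the
             * transposed store. */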
            a[0].val[0] =
                vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_01_8), eeee_00);
            a[0].val[0] = vmlal_s16(
                a[0].val[0], vget_high_s16(g_ai2_ihevc_trans_32_01_8), eeee_01);
            a[0].val[1] =
                vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_01_8), eeee_10);
            a[0].val[1] = vmlal_s16(
                a[0].val[1], vget_high_s16(g_ai2_ihevc_trans_32_01_8), eeee_11);

            int32x4x2_t val_8 = vzipq_s32(a[0].val[0], a[0].val[1]);

            /*Store*/
            vst1_s32(pi4_temp, vget_low_s32(val_8.val[0])); /*Value 0*/
            vst1_s32(pi4_temp + 256, vget_high_s32(val_8.val[0])); /*Value 8*/
            vst1_s32(pi4_temp + 512, vget_low_s32(val_8.val[1])); /*Value 16*/
            vst1_s32(pi4_temp + 768, vget_high_s32(val_8.val[1])); /*Value 24*/
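            /* Transposed store: coefficient k of this row pair goes to temp
             * row k, i.e. pi4_temp + k * 32, two lanes (one per input row)
             * at a time; pi4_temp += 2 at the end of the loop advances to
             * the next column pair. */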

            /*Calculation of values 4 12 20 28*/
            /*Multiplications*/
            a[4].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_4_04, vget_low_s16(eeo));
            a[12].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_12_04, vget_low_s16(eeo));
            a[20].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_20_04, vget_low_s16(eeo));
            a[28].val[0] =
                vmull_s16(g_ai2_ihevc_trans_32_28_04, vget_low_s16(eeo));

            a[4].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_4_04, vget_high_s16(eeo));
            a[12].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_12_04, vget_high_s16(eeo));
            a[20].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_20_04, vget_high_s16(eeo));
            a[28].val[1] =
                vmull_s16(g_ai2_ihevc_trans_32_28_04, vget_high_s16(eeo));

            /*Transposes*/
            int32x4x2_t val_4_0 =
                vtrnq_s32(a[4].val[0], a[12].val[0]); //q15 q5
            int32x4x2_t val_4_1 =
                vtrnq_s32(a[4].val[1], a[12].val[1]); //q4 q12
            int32x4x2_t val_20_0 =
                vtrnq_s32(a[20].val[0], a[28].val[0]); //q8 q2
            int32x4x2_t val_20_1 =
                vtrnq_s32(a[20].val[1], a[28].val[1]); //q9 q13

            /*Swap*/
            a[4].val[0] = vcombine_s32(vget_low_s32(val_4_0.val[0]),
                vget_low_s32(val_20_0.val[0])); //q12
            a[4].val[1] = vcombine_s32(vget_high_s32(val_4_0.val[0]),
                vget_high_s32(val_20_0.val[0])); //q2

            a[12].val[0] = vcombine_s32(vget_low_s32(val_4_0.val[1]),
                vget_low_s32(val_20_0.val[1])); //q4
            a[12].val[1] = vcombine_s32(vget_high_s32(val_4_0.val[1]),
                vget_high_s32(val_20_0.val[1])); //q8

            /*Additions*/
            a[12].val[0] = vaddq_s32(a[12].val[0], a[4].val[0]); //q4
            a[12].val[1] = vaddq_s32(a[12].val[1], a[4].val[1]); //q8
            a[12].val[1] = vaddq_s32(a[12].val[1], a[12].val[0]); //q8

            a[20].val[0] = vcombine_s32(vget_low_s32(val_4_1.val[0]),
                vget_low_s32(val_20_1.val[0])); //q5
            a[20].val[1] = vcombine_s32(vget_high_s32(val_4_1.val[0]),
                vget_high_s32(val_20_1.val[0])); //q13

            a[28].val[0] = vcombine_s32(vget_low_s32(val_4_1.val[1]),
                vget_low_s32(val_20_1.val[1])); //q15
            a[28].val[1] = vcombine_s32(vget_high_s32(val_4_1.val[1]),
                vget_high_s32(val_20_1.val[1])); //q9

            a[28].val[0] = vaddq_s32(a[28].val[0], a[20].val[0]); //q15
            a[28].val[1] = vaddq_s32(a[28].val[1], a[20].val[1]); //q5
            a[28].val[1] = vaddq_s32(a[28].val[1], a[28].val[0]); //q15

            int32x4x2_t val_4 = vzipq_s32(a[12].val[1], a[28].val[1]);

            /*Store*/
            vst1_s32(pi4_temp + 128, vget_low_s32(val_4.val[0])); /*Value 4*/
            vst1_s32(pi4_temp + 384, vget_high_s32(val_4.val[0])); /*Value 12*/
            vst1_s32(pi4_temp + 640, vget_low_s32(val_4.val[1])); /*Value 20*/
            vst1_s32(pi4_temp + 896, vget_high_s32(val_4.val[1])); /*Value 28*/

            /*Calculation of value 2 6 10 14 18 22 26 30*/
            /*Multiplications*/
            a[2].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_2_07),
                vget_low_s16(eo0)); //q2
            a[6].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_6_07),
                vget_low_s16(eo0)); //q5
            a[10].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_10_07),
                vget_low_s16(eo0)); //q9
            a[14].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_14_07),
                vget_low_s16(eo0)); //q8

            a[14].val[0] = vmlal_s16(a[14].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_14_07), vget_high_s16(eo0));
            a[10].val[0] = vmlal_s16(a[10].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_10_07), vget_high_s16(eo0));
            a[6].val[0] = vmlal_s16(a[6].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_6_07), vget_high_s16(eo0));
            a[2].val[0] = vmlal_s16(a[2].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_2_07), vget_high_s16(eo0));

            a[2].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_2_07),
                vget_low_s16(eo1)); //q4
            a[6].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_6_07),
                vget_low_s16(eo1)); //q13
            a[10].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_10_07),
                vget_low_s16(eo1)); //q12
            a[14].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_14_07),
                vget_low_s16(eo1)); //q15

            a[14].val[1] = vmlal_s16(a[14].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_14_07), vget_high_s16(eo1));
            a[10].val[1] = vmlal_s16(a[10].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_10_07), vget_high_s16(eo1));
            a[6].val[1] = vmlal_s16(a[6].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_6_07), vget_high_s16(eo1));
            a[2].val[1] = vmlal_s16(a[2].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_2_07), vget_high_s16(eo1));

            /*Transposes*/
            int32x4x2_t val_26_0 = vtrnq_s32(a[2].val[0], a[6].val[0]); //q2 q5
            int32x4x2_t val_1014_0 =
                vtrnq_s32(a[10].val[0], a[14].val[0]); //q9 q8
            int32x4x2_t val_26_1 =
                vtrnq_s32(a[2].val[1], a[6].val[1]); //q4 q13
            int32x4x2_t val_1014_1 =
                vtrnq_s32(a[10].val[1], a[14].val[1]); //q12 q15

            /*Swap*/
            a[2].val[0] = vcombine_s32(vget_low_s32(val_26_0.val[0]),
                vget_low_s32(val_1014_0.val[0])); //q2
            a[2].val[1] = vcombine_s32(vget_high_s32(val_26_0.val[0]),
                vget_high_s32(val_1014_0.val[0])); //q9

            a[6].val[0] = vcombine_s32(vget_low_s32(val_26_0.val[1]),
                vget_low_s32(val_1014_0.val[1])); //q5
            a[6].val[1] = vcombine_s32(vget_high_s32(val_26_0.val[1]),
                vget_high_s32(val_1014_0.val[1])); //q8

            a[10].val[0] = vcombine_s32(vget_low_s32(val_26_1.val[0]),
                vget_low_s32(val_1014_1.val[0])); //q4
            a[10].val[1] = vcombine_s32(vget_high_s32(val_26_1.val[0]),
                vget_high_s32(val_1014_1.val[0])); //q12

            a[14].val[0] = vcombine_s32(vget_low_s32(val_26_1.val[1]),
                vget_low_s32(val_1014_1.val[1])); //q13
            a[14].val[1] = vcombine_s32(vget_high_s32(val_26_1.val[1]),
                vget_high_s32(val_1014_1.val[1])); //q15

            /*Additions*/
            a[2].val[0] = vaddq_s32(a[2].val[0], a[6].val[0]); //q2
            a[2].val[1] = vaddq_s32(a[2].val[1], a[6].val[1]); //q9
            a[2].val[1] = vaddq_s32(a[2].val[1], a[2].val[0]); //q9

            a[10].val[0] = vaddq_s32(a[10].val[0], a[14].val[0]); //q4
            a[10].val[1] = vaddq_s32(a[10].val[1], a[14].val[1]); //q12
            a[10].val[1] = vaddq_s32(a[10].val[1], a[10].val[0]); //q12

            int32x4x2_t val_2 = vzipq_s32(a[2].val[1], a[10].val[1]); //q9 q12

            /*Store*/
            vst1_s32(pi4_temp + 64, vget_low_s32(val_2.val[0])); /*Value 2*/
            vst1_s32(pi4_temp + 192, vget_high_s32(val_2.val[0])); /*Value 6*/
            vst1_s32(pi4_temp + 320, vget_low_s32(val_2.val[1])); /*Value 10*/
            vst1_s32(pi4_temp + 448, vget_high_s32(val_2.val[1])); /*Value 14*/

            a[18].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_18_07),
                vget_low_s16(eo0)); //q0
            a[22].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_22_07),
                vget_low_s16(eo0)); //q5
            a[26].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_26_07),
                vget_low_s16(eo0)); //q9
            a[30].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_30_07),
                vget_low_s16(eo0)); //q15

            a[30].val[0] = vmlal_s16(a[30].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_30_07), vget_high_s16(eo0));
            a[26].val[0] = vmlal_s16(a[26].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_26_07), vget_high_s16(eo0));
            a[22].val[0] = vmlal_s16(a[22].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_22_07), vget_high_s16(eo0));
            a[18].val[0] = vmlal_s16(a[18].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_18_07), vget_high_s16(eo0));

            a[18].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_18_07),
                vget_low_s16(eo1)); //q4
            a[22].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_22_07),
                vget_low_s16(eo1)); //q8
            a[26].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_26_07),
                vget_low_s16(eo1)); //q12
            a[30].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_30_07),
                vget_low_s16(eo1)); //q18

            a[30].val[1] = vmlal_s16(a[30].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_30_07), vget_high_s16(eo1));
            a[26].val[1] = vmlal_s16(a[26].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_26_07), vget_high_s16(eo1));
            a[22].val[1] = vmlal_s16(a[22].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_22_07), vget_high_s16(eo1));
            a[18].val[1] = vmlal_s16(a[18].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_18_07), vget_high_s16(eo1));

            /*Transposes*/
            int32x4x2_t val_1822_0 =
                vtrnq_s32(a[18].val[0], a[22].val[0]); //q2 q5
            int32x4x2_t val_2630_0 =
                vtrnq_s32(a[26].val[0], a[30].val[0]); //q9 q8
            int32x4x2_t val_1822_1 =
                vtrnq_s32(a[18].val[1], a[22].val[1]); //q4 q13
            int32x4x2_t val_2630_1 =
                vtrnq_s32(a[26].val[1], a[30].val[1]); //q12 q15

            /*Swap*/
            a[18].val[0] = vcombine_s32(vget_low_s32(val_1822_0.val[0]),
                vget_low_s32(val_2630_0.val[0])); //q2
            a[18].val[1] = vcombine_s32(vget_high_s32(val_1822_0.val[0]),
                vget_high_s32(val_2630_0.val[0])); //q9

            a[22].val[0] = vcombine_s32(vget_low_s32(val_1822_0.val[1]),
                vget_low_s32(val_2630_0.val[1])); //q5
            a[22].val[1] = vcombine_s32(vget_high_s32(val_1822_0.val[1]),
                vget_high_s32(val_2630_0.val[1])); //q8

            a[26].val[0] = vcombine_s32(vget_low_s32(val_1822_1.val[0]),
                vget_low_s32(val_2630_1.val[0])); //q4
            a[26].val[1] = vcombine_s32(vget_high_s32(val_1822_1.val[0]),
                vget_high_s32(val_2630_1.val[0])); //q12

            a[30].val[0] = vcombine_s32(vget_low_s32(val_1822_1.val[1]),
                vget_low_s32(val_2630_1.val[1])); //q13
            a[30].val[1] = vcombine_s32(vget_high_s32(val_1822_1.val[1]),
                vget_high_s32(val_2630_1.val[1])); //q15

            /*Additions*/
            a[18].val[0] = vaddq_s32(a[18].val[0], a[22].val[0]); //q2
            a[18].val[1] = vaddq_s32(a[18].val[1], a[22].val[1]); //q9
            a[18].val[1] = vaddq_s32(a[18].val[1], a[18].val[0]); //q9

            a[26].val[0] = vaddq_s32(a[26].val[0], a[30].val[0]); //q4
            a[26].val[1] = vaddq_s32(a[26].val[1], a[30].val[1]); //q12
            a[26].val[1] = vaddq_s32(a[26].val[1], a[26].val[0]); //q12

            int32x4x2_t val_18 =
                vzipq_s32(a[18].val[1], a[26].val[1]); //q9 q12

            /*Store*/
            vst1_s32(pi4_temp + 576, vget_low_s32(val_18.val[0])); /*Value 18*/
            vst1_s32(pi4_temp + 704, vget_high_s32(val_18.val[0])); /*Value 22*/
            vst1_s32(pi4_temp + 832, vget_low_s32(val_18.val[1])); /*Value 26*/
            vst1_s32(pi4_temp + 960, vget_high_s32(val_18.val[1])); /*Value 30*/

            /*Calculations for odd indexes*/
            a[1].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_1_07),
                vget_low_s16(o0_0)); //q1
            a[3].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_3_07),
                vget_low_s16(o0_0)); //q5
            a[5].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_5_07),
                vget_low_s16(o0_0)); //q8
            a[7].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_7_07),
                vget_low_s16(o0_0)); //q12

            a[7].val[0] = vmlal_s16(a[7].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_7_07), vget_high_s16(o0_0));
            a[5].val[0] = vmlal_s16(a[5].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_5_07), vget_high_s16(o0_0));
            a[3].val[0] = vmlal_s16(a[3].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_3_07), vget_high_s16(o0_0));
            a[1].val[0] = vmlal_s16(a[1].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_1_07), vget_high_s16(o0_0));

            a[1].val[0] = vmlal_s16(a[1].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_1_815), vget_low_s16(o0_1));
            a[3].val[0] = vmlal_s16(a[3].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_3_815), vget_low_s16(o0_1));
            a[5].val[0] = vmlal_s16(a[5].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_5_815), vget_low_s16(o0_1));
            a[7].val[0] = vmlal_s16(a[7].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_7_815), vget_low_s16(o0_1));

            a[7].val[0] = vmlal_s16(a[7].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_7_815), vget_high_s16(o0_1));
            a[5].val[0] = vmlal_s16(a[5].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_5_815), vget_high_s16(o0_1));
            a[3].val[0] = vmlal_s16(a[3].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_3_815), vget_high_s16(o0_1));
            a[1].val[0] = vmlal_s16(a[1].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_1_815), vget_high_s16(o0_1));

            a[1].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_1_07),
                vget_low_s16(o1_0)); //q0
            a[3].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_3_07),
                vget_low_s16(o1_0)); //q4
            a[5].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_5_07),
                vget_low_s16(o1_0)); //q9
            a[7].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_7_07),
                vget_low_s16(o1_0)); //q13

            a[7].val[1] = vmlal_s16(a[7].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_7_07), vget_high_s16(o1_0));
            a[5].val[1] = vmlal_s16(a[5].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_5_07), vget_high_s16(o1_0));
            a[3].val[1] = vmlal_s16(a[3].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_3_07), vget_high_s16(o1_0));
            a[1].val[1] = vmlal_s16(a[1].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_1_07), vget_high_s16(o1_0));

            a[1].val[1] = vmlal_s16(a[1].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_1_815), vget_low_s16(o1_1));
            a[3].val[1] = vmlal_s16(a[3].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_3_815), vget_low_s16(o1_1));
            a[5].val[1] = vmlal_s16(a[5].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_5_815), vget_low_s16(o1_1));
            a[7].val[1] = vmlal_s16(a[7].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_7_815), vget_low_s16(o1_1));

            a[7].val[1] = vmlal_s16(a[7].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_7_815), vget_high_s16(o1_1));
            a[5].val[1] = vmlal_s16(a[5].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_5_815), vget_high_s16(o1_1));
            a[3].val[1] = vmlal_s16(a[3].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_3_815), vget_high_s16(o1_1));
            a[1].val[1] = vmlal_s16(a[1].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_1_815), vget_high_s16(o1_1));

            /*Transposes*/
            int32x4x2_t val_13_0 = vtrnq_s32(a[1].val[0], a[3].val[0]); //q0 q4
            int32x4x2_t val_13_1 = vtrnq_s32(a[1].val[1], a[3].val[1]); //q1 q5
            int32x4x2_t val_57_0 =
                vtrnq_s32(a[5].val[0], a[7].val[0]); //q8 q12
            int32x4x2_t val_57_1 =
                vtrnq_s32(a[5].val[1], a[7].val[1]); //q9 q13

            /*Swap*/
            a[1].val[0] = vcombine_s32(vget_low_s32(val_13_0.val[0]),
                vget_low_s32(val_57_0.val[0])); //q0
            a[1].val[1] = vcombine_s32(vget_high_s32(val_13_0.val[0]),
                vget_high_s32(val_57_0.val[0])); //q8

            a[3].val[0] = vcombine_s32(vget_low_s32(val_13_0.val[1]),
                vget_low_s32(val_57_0.val[1])); //q1
            a[3].val[1] = vcombine_s32(vget_high_s32(val_13_0.val[1]),
                vget_high_s32(val_57_0.val[1])); //q9

            a[5].val[0] = vcombine_s32(vget_low_s32(val_13_1.val[0]),
                vget_low_s32(val_57_1.val[0])); //q4
            a[5].val[1] = vcombine_s32(vget_high_s32(val_13_1.val[0]),
                vget_high_s32(val_57_1.val[0])); //q12

            a[7].val[0] = vcombine_s32(vget_low_s32(val_13_1.val[1]),
                vget_low_s32(val_57_1.val[1])); //q5
            a[7].val[1] = vcombine_s32(vget_high_s32(val_13_1.val[1]),
                vget_high_s32(val_57_1.val[1])); //q13

            /*Additions*/
            a[1].val[0] = vaddq_s32(a[1].val[0], a[3].val[0]); //q0
            a[1].val[1] = vaddq_s32(a[1].val[1], a[3].val[1]); //q8
            a[1].val[1] = vaddq_s32(a[1].val[1], a[1].val[0]); //q8

            a[5].val[0] = vaddq_s32(a[5].val[0], a[7].val[0]); //q1
            a[5].val[1] = vaddq_s32(a[5].val[1], a[7].val[1]); //q9
            a[5].val[1] = vaddq_s32(a[5].val[1], a[5].val[0]); //q9

            int32x4x2_t val_1 = vzipq_s32(a[1].val[1], a[5].val[1]); //q8 q9

            /*Store*/
            vst1_s32(pi4_temp + 32, vget_low_s32(val_1.val[0])); /*Value 1*/
            vst1_s32(pi4_temp + 96, vget_high_s32(val_1.val[0])); /*Value 3*/
            vst1_s32(pi4_temp + 160, vget_low_s32(val_1.val[1])); /*Value 5*/
            vst1_s32(pi4_temp + 224, vget_high_s32(val_1.val[1])); /*Value 7*/

            a[9].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_9_07),
                vget_low_s16(o0_0)); //q2
            a[11].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_11_07),
                vget_low_s16(o0_0)); //q2
            a[13].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_13_07),
                vget_low_s16(o0_0)); //q2
            a[15].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_15_07),
                vget_low_s16(o0_0)); //q2

            a[15].val[0] = vmlal_s16(a[15].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_15_07), vget_high_s16(o0_0));
            a[13].val[0] = vmlal_s16(a[13].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_13_07), vget_high_s16(o0_0));
            a[11].val[0] = vmlal_s16(a[11].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_11_07), vget_high_s16(o0_0));
            a[9].val[0] = vmlal_s16(a[9].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_9_07), vget_high_s16(o0_0));

            a[9].val[0] = vmlal_s16(a[9].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_9_815), vget_low_s16(o0_1));
            a[11].val[0] = vmlal_s16(a[11].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_11_815), vget_low_s16(o0_1));
            a[13].val[0] = vmlal_s16(a[13].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_13_815), vget_low_s16(o0_1));
            a[15].val[0] = vmlal_s16(a[15].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_15_815), vget_low_s16(o0_1));

            a[15].val[0] = vmlal_s16(a[15].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_15_815),
                vget_high_s16(o0_1));
            a[13].val[0] = vmlal_s16(a[13].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_13_815),
                vget_high_s16(o0_1));
            a[11].val[0] = vmlal_s16(a[11].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_11_815),
                vget_high_s16(o0_1));
            a[9].val[0] = vmlal_s16(a[9].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_9_815), vget_high_s16(o0_1));

            a[9].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_9_07),
                vget_low_s16(o1_0)); //q2
            a[11].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_11_07),
                vget_low_s16(o1_0)); //q2
            a[13].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_13_07),
                vget_low_s16(o1_0)); //q2
            a[15].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_15_07),
                vget_low_s16(o1_0)); //q2

            a[15].val[1] = vmlal_s16(a[15].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_15_07), vget_high_s16(o1_0));
            a[13].val[1] = vmlal_s16(a[13].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_13_07), vget_high_s16(o1_0));
            a[11].val[1] = vmlal_s16(a[11].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_11_07), vget_high_s16(o1_0));
            a[9].val[1] = vmlal_s16(a[9].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_9_07), vget_high_s16(o1_0));

            a[9].val[1] = vmlal_s16(a[9].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_9_815), vget_low_s16(o1_1));
            a[11].val[1] = vmlal_s16(a[11].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_11_815), vget_low_s16(o1_1));
            a[13].val[1] = vmlal_s16(a[13].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_13_815), vget_low_s16(o1_1));
            a[15].val[1] = vmlal_s16(a[15].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_15_815), vget_low_s16(o1_1));

            a[15].val[1] = vmlal_s16(a[15].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_15_815),
                vget_high_s16(o1_1));
            a[13].val[1] = vmlal_s16(a[13].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_13_815),
                vget_high_s16(o1_1));
            a[11].val[1] = vmlal_s16(a[11].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_11_815),
                vget_high_s16(o1_1));
            a[9].val[1] = vmlal_s16(a[9].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_9_815), vget_high_s16(o1_1));

            int32x4x2_t val_911_0 =
                vtrnq_s32(a[9].val[0], a[11].val[0]); //q0 q4
            int32x4x2_t val_911_1 =
                vtrnq_s32(a[9].val[1], a[11].val[1]); //q1 q5
            int32x4x2_t val_1315_0 =
                vtrnq_s32(a[13].val[0], a[15].val[0]); //q8 q12
            int32x4x2_t val_1315_1 =
                vtrnq_s32(a[13].val[1], a[15].val[1]); //q9 q13

            a[9].val[0] = vcombine_s32(vget_low_s32(val_911_0.val[0]),
                vget_low_s32(val_1315_0.val[0])); //q0
            a[9].val[1] = vcombine_s32(vget_high_s32(val_911_0.val[0]),
                vget_high_s32(val_1315_0.val[0])); //q8

            a[11].val[0] = vcombine_s32(vget_low_s32(val_911_0.val[1]),
                vget_low_s32(val_1315_0.val[1])); //q1
            a[11].val[1] = vcombine_s32(vget_high_s32(val_911_0.val[1]),
                vget_high_s32(val_1315_0.val[1])); //q9

            a[13].val[0] = vcombine_s32(vget_low_s32(val_911_1.val[0]),
                vget_low_s32(val_1315_1.val[0])); //q4
            a[13].val[1] = vcombine_s32(vget_high_s32(val_911_1.val[0]),
                vget_high_s32(val_1315_1.val[0])); //q12

            a[15].val[0] = vcombine_s32(vget_low_s32(val_911_1.val[1]),
                vget_low_s32(val_1315_1.val[1])); //q5
            a[15].val[1] = vcombine_s32(vget_high_s32(val_911_1.val[1]),
                vget_high_s32(val_1315_1.val[1])); //q13

            a[9].val[0] = vaddq_s32(a[9].val[0], a[11].val[0]); //q0
            a[9].val[1] = vaddq_s32(a[9].val[1], a[11].val[1]); //q8
            a[9].val[1] = vaddq_s32(a[9].val[1], a[9].val[0]); //q8

            a[13].val[0] = vaddq_s32(a[13].val[0], a[15].val[0]); //q1
            a[13].val[1] = vaddq_s32(a[13].val[1], a[15].val[1]); //q9
            a[13].val[1] = vaddq_s32(a[13].val[1], a[13].val[0]); //q9

            int32x4x2_t val_9 = vzipq_s32(a[9].val[1], a[13].val[1]); //q8 q9

            vst1_s32(pi4_temp + 288, vget_low_s32(val_9.val[0])); /*Value 9*/
            vst1_s32(pi4_temp + 352, vget_high_s32(val_9.val[0])); /*Value 11*/
            vst1_s32(pi4_temp + 416, vget_low_s32(val_9.val[1])); /*Value 13*/
            vst1_s32(pi4_temp + 480, vget_high_s32(val_9.val[1])); /*Value 15*/

            a[17].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_17_07),
                vget_low_s16(o0_0)); //q2
            a[19].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_19_07),
                vget_low_s16(o0_0)); //q2
            a[21].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_21_07),
                vget_low_s16(o0_0)); //q2
            a[23].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_23_07),
                vget_low_s16(o0_0)); //q2

            a[23].val[0] = vmlal_s16(a[23].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_23_07), vget_high_s16(o0_0));
            a[21].val[0] = vmlal_s16(a[21].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_21_07), vget_high_s16(o0_0));
            a[19].val[0] = vmlal_s16(a[19].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_19_07), vget_high_s16(o0_0));
            a[17].val[0] = vmlal_s16(a[17].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_17_07), vget_high_s16(o0_0));

            a[17].val[0] = vmlal_s16(a[17].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_17_815), vget_low_s16(o0_1));
            a[19].val[0] = vmlal_s16(a[19].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_19_815), vget_low_s16(o0_1));
            a[21].val[0] = vmlal_s16(a[21].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_21_815), vget_low_s16(o0_1));
            a[23].val[0] = vmlal_s16(a[23].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_23_815), vget_low_s16(o0_1));

            a[23].val[0] = vmlal_s16(a[23].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_23_815),
                vget_high_s16(o0_1));
            a[21].val[0] = vmlal_s16(a[21].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_21_815),
                vget_high_s16(o0_1));
            a[19].val[0] = vmlal_s16(a[19].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_19_815),
                vget_high_s16(o0_1));
            a[17].val[0] = vmlal_s16(a[17].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_17_815),
                vget_high_s16(o0_1));

            a[17].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_17_07),
                vget_low_s16(o1_0)); //q2
            a[19].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_19_07),
                vget_low_s16(o1_0)); //q2
            a[21].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_21_07),
                vget_low_s16(o1_0)); //q2
            a[23].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_23_07),
                vget_low_s16(o1_0)); //q2

            a[23].val[1] = vmlal_s16(a[23].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_23_07), vget_high_s16(o1_0));
            a[21].val[1] = vmlal_s16(a[21].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_21_07), vget_high_s16(o1_0));
            a[19].val[1] = vmlal_s16(a[19].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_19_07), vget_high_s16(o1_0));
            a[17].val[1] = vmlal_s16(a[17].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_17_07), vget_high_s16(o1_0));

            a[17].val[1] = vmlal_s16(a[17].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_17_815), vget_low_s16(o1_1));
            a[19].val[1] = vmlal_s16(a[19].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_19_815), vget_low_s16(o1_1));
            a[21].val[1] = vmlal_s16(a[21].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_21_815), vget_low_s16(o1_1));
            a[23].val[1] = vmlal_s16(a[23].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_23_815), vget_low_s16(o1_1));

            a[23].val[1] = vmlal_s16(a[23].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_23_815),
                vget_high_s16(o1_1));
            a[21].val[1] = vmlal_s16(a[21].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_21_815),
                vget_high_s16(o1_1));
            a[19].val[1] = vmlal_s16(a[19].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_19_815),
                vget_high_s16(o1_1));
            a[17].val[1] = vmlal_s16(a[17].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_17_815),
                vget_high_s16(o1_1));

            int32x4x2_t val_1719_0 =
                vtrnq_s32(a[17].val[0], a[19].val[0]); //q0 q4
            int32x4x2_t val_1719_1 =
                vtrnq_s32(a[17].val[1], a[19].val[1]); //q1 q5
            int32x4x2_t val_2123_0 =
                vtrnq_s32(a[21].val[0], a[23].val[0]); //q8 q12
            int32x4x2_t val_2123_1 =
                vtrnq_s32(a[21].val[1], a[23].val[1]); //q9 q13

            a[17].val[0] = vcombine_s32(vget_low_s32(val_1719_0.val[0]),
                vget_low_s32(val_2123_0.val[0])); //q0
            a[17].val[1] = vcombine_s32(vget_high_s32(val_1719_0.val[0]),
                vget_high_s32(val_2123_0.val[0])); //q8

            a[19].val[0] = vcombine_s32(vget_low_s32(val_1719_0.val[1]),
                vget_low_s32(val_2123_0.val[1])); //q1
            a[19].val[1] = vcombine_s32(vget_high_s32(val_1719_0.val[1]),
                vget_high_s32(val_2123_0.val[1])); //q9

            a[21].val[0] = vcombine_s32(vget_low_s32(val_1719_1.val[0]),
                vget_low_s32(val_2123_1.val[0])); //q4
            a[21].val[1] = vcombine_s32(vget_high_s32(val_1719_1.val[0]),
                vget_high_s32(val_2123_1.val[0])); //q12

            a[23].val[0] = vcombine_s32(vget_low_s32(val_1719_1.val[1]),
                vget_low_s32(val_2123_1.val[1])); //q5
            a[23].val[1] = vcombine_s32(vget_high_s32(val_1719_1.val[1]),
                vget_high_s32(val_2123_1.val[1])); //q13

            a[17].val[0] = vaddq_s32(a[17].val[0], a[19].val[0]); //q0
            a[17].val[1] = vaddq_s32(a[17].val[1], a[19].val[1]); //q8
            a[17].val[1] = vaddq_s32(a[17].val[1], a[17].val[0]); //q8

            a[21].val[0] = vaddq_s32(a[21].val[0], a[23].val[0]); //q1
            a[21].val[1] = vaddq_s32(a[21].val[1], a[23].val[1]); //q9
            a[21].val[1] = vaddq_s32(a[21].val[1], a[21].val[0]); //q9

            int32x4x2_t val_17 = vzipq_s32(a[17].val[1], a[21].val[1]); //q8 q9

            vst1_s32(pi4_temp + 544, vget_low_s32(val_17.val[0])); /*Value 17*/
            vst1_s32(pi4_temp + 608, vget_high_s32(val_17.val[0])); /*Value 19*/
            vst1_s32(pi4_temp + 672, vget_low_s32(val_17.val[1])); /*Value 21*/
            vst1_s32(pi4_temp + 736, vget_high_s32(val_17.val[1])); /*Value 23*/

            a[25].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_25_07),
                vget_low_s16(o0_0)); //q2
            a[27].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_27_07),
                vget_low_s16(o0_0)); //q2
            a[29].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_29_07),
                vget_low_s16(o0_0)); //q2
            a[31].val[0] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_31_07),
                vget_low_s16(o0_0)); //q2

            a[31].val[0] = vmlal_s16(a[31].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_31_07), vget_high_s16(o0_0));
            a[29].val[0] = vmlal_s16(a[29].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_29_07), vget_high_s16(o0_0));
            a[27].val[0] = vmlal_s16(a[27].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_27_07), vget_high_s16(o0_0));
            a[25].val[0] = vmlal_s16(a[25].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_25_07), vget_high_s16(o0_0));

            a[25].val[0] = vmlal_s16(a[25].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_25_815), vget_low_s16(o0_1));
            a[27].val[0] = vmlal_s16(a[27].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_27_815), vget_low_s16(o0_1));
            a[29].val[0] = vmlal_s16(a[29].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_29_815), vget_low_s16(o0_1));
            a[31].val[0] = vmlal_s16(a[31].val[0],
                vget_low_s16(g_ai2_ihevc_trans_32_31_815), vget_low_s16(o0_1));

            a[31].val[0] = vmlal_s16(a[31].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_31_815),
                vget_high_s16(o0_1));
            a[29].val[0] = vmlal_s16(a[29].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_29_815),
                vget_high_s16(o0_1));
            a[27].val[0] = vmlal_s16(a[27].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_27_815),
                vget_high_s16(o0_1));
            a[25].val[0] = vmlal_s16(a[25].val[0],
                vget_high_s16(g_ai2_ihevc_trans_32_25_815),
                vget_high_s16(o0_1));

            a[25].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_25_07),
                vget_low_s16(o1_0)); //q2
            a[27].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_27_07),
                vget_low_s16(o1_0)); //q2
            a[29].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_29_07),
                vget_low_s16(o1_0)); //q2
            a[31].val[1] = vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_31_07),
                vget_low_s16(o1_0)); //q2

            a[31].val[1] = vmlal_s16(a[31].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_31_07), vget_high_s16(o1_0));
            a[29].val[1] = vmlal_s16(a[29].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_29_07), vget_high_s16(o1_0));
            a[27].val[1] = vmlal_s16(a[27].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_27_07), vget_high_s16(o1_0));
            a[25].val[1] = vmlal_s16(a[25].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_25_07), vget_high_s16(o1_0));

            a[25].val[1] = vmlal_s16(a[25].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_25_815), vget_low_s16(o1_1));
            a[27].val[1] = vmlal_s16(a[27].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_27_815), vget_low_s16(o1_1));
            a[29].val[1] = vmlal_s16(a[29].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_29_815), vget_low_s16(o1_1));
            a[31].val[1] = vmlal_s16(a[31].val[1],
                vget_low_s16(g_ai2_ihevc_trans_32_31_815), vget_low_s16(o1_1));

            a[31].val[1] = vmlal_s16(a[31].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_31_815),
                vget_high_s16(o1_1));
            a[29].val[1] = vmlal_s16(a[29].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_29_815),
                vget_high_s16(o1_1));
            a[27].val[1] = vmlal_s16(a[27].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_27_815),
                vget_high_s16(o1_1));
            a[25].val[1] = vmlal_s16(a[25].val[1],
                vget_high_s16(g_ai2_ihevc_trans_32_25_815),
                vget_high_s16(o1_1));

            int32x4x2_t val_2527_0 =
                vtrnq_s32(a[25].val[0], a[27].val[0]); //q0 q4
            int32x4x2_t val_2527_1 =
                vtrnq_s32(a[25].val[1], a[27].val[1]); //q1 q5
            int32x4x2_t val_2931_0 =
                vtrnq_s32(a[29].val[0], a[31].val[0]); //q8 q12
            int32x4x2_t val_2931_1 =
                vtrnq_s32(a[29].val[1], a[31].val[1]); //q9 q13

            a[25].val[0] = vcombine_s32(vget_low_s32(val_2527_0.val[0]),
                vget_low_s32(val_2931_0.val[0])); //q0
            a[25].val[1] = vcombine_s32(vget_high_s32(val_2527_0.val[0]),
                vget_high_s32(val_2931_0.val[0])); //q8

            a[27].val[0] = vcombine_s32(vget_low_s32(val_2527_0.val[1]),
                vget_low_s32(val_2931_0.val[1])); //q1
            a[27].val[1] = vcombine_s32(vget_high_s32(val_2527_0.val[1]),
                vget_high_s32(val_2931_0.val[1])); //q9

            a[29].val[0] = vcombine_s32(vget_low_s32(val_2527_1.val[0]),
                vget_low_s32(val_2931_1.val[0])); //q4
            a[29].val[1] = vcombine_s32(vget_high_s32(val_2527_1.val[0]),
                vget_high_s32(val_2931_1.val[0])); //q12

            a[31].val[0] = vcombine_s32(vget_low_s32(val_2527_1.val[1]),
                vget_low_s32(val_2931_1.val[1])); //q5
            a[31].val[1] = vcombine_s32(vget_high_s32(val_2527_1.val[1]),
                vget_high_s32(val_2931_1.val[1])); //q13

            a[25].val[0] = vaddq_s32(a[25].val[0], a[27].val[0]); //q0
            a[25].val[1] = vaddq_s32(a[25].val[1], a[27].val[1]); //q8
            a[25].val[1] = vaddq_s32(a[25].val[1], a[25].val[0]); //q8

            a[29].val[0] = vaddq_s32(a[29].val[0], a[31].val[0]); //q1
            a[29].val[1] = vaddq_s32(a[29].val[1], a[31].val[1]); //q9
            a[29].val[1] = vaddq_s32(a[29].val[1], a[29].val[0]); //q9

            int32x4x2_t val_25 = vzipq_s32(a[25].val[1], a[29].val[1]); //q8 q9

            vst1_s32(pi4_temp + 800, vget_low_s32(val_25.val[0])); /*Value 25*/
            vst1_s32(pi4_temp + 864, vget_high_s32(val_25.val[0])); /*Value 27*/
            vst1_s32(pi4_temp + 928, vget_low_s32(val_25.val[1])); /*Value 29*/
            vst1_s32(pi4_temp + 992, vget_high_s32(val_25.val[1])); /*Value 31*/

            pi4_temp += 2;
        }
    }

    /*sad of the block*/
    tmp_a = vpaddlq_s32(sum_val);
    sad = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_a)),
        vreinterpret_s32_s64(vget_high_s64(tmp_a)));
    u4_blk_sad = vget_lane_s32(sad, 0);
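    /* u4_blk_sad now holds the sum of absolute residue values for the whole
     * 32x32 block, accumulated per row pair in Stage 1 and reduced here. */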

    //Stage 2
    {
        static const int32x4_t g_ai4_ihevc_trans_32_0_8 = { 64, -64, 83, -83 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_8 = { 64, 64, 36, 36 };

        static const int32x4_t g_ai4_ihevc_trans_32_4_04 = { 89, 75, 50, 18 };
        static const int32x4_t g_ai4_ihevc_trans_32_12_04 = { 75, -18, -89, -50 };
        static const int32x4_t g_ai4_ihevc_trans_32_20_04 = { 50, -89, 18, 75 };
        static const int32x4_t g_ai4_ihevc_trans_32_28_04 = { 18, -50, 75, -89 };

        static const int32x4_t g_ai4_ihevc_trans_32_2_03 = { 90, 87, 80, 70 };
        static const int32x4_t g_ai4_ihevc_trans_32_2_47 = { 57, 43, 25, 9 };
        static const int32x4_t g_ai4_ihevc_trans_32_6_03 = { 87, 57, 9, -43 };
        static const int32x4_t g_ai4_ihevc_trans_32_6_47 = { -80, -90, -70, -25 };
        static const int32x4_t g_ai4_ihevc_trans_32_10_03 = { 80, 9, -70, -87 };
        static const int32x4_t g_ai4_ihevc_trans_32_10_47 = { -25, 57, 90, 43 };
        static const int32x4_t g_ai4_ihevc_trans_32_14_03 = { 70, -43, -87, 9 };
        static const int32x4_t g_ai4_ihevc_trans_32_14_47 = { 90, 25, -80, -57 };
        static const int32x4_t g_ai4_ihevc_trans_32_18_03 = { 57, -80, -25, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_18_47 = { -9, -87, 43, 70 };
        static const int32x4_t g_ai4_ihevc_trans_32_22_03 = { 43, -90, 57, 25 };
        static const int32x4_t g_ai4_ihevc_trans_32_22_47 = { -87, 70, 9, -80 };
        static const int32x4_t g_ai4_ihevc_trans_32_26_03 = { 25, -70, 90, -80 };
        static const int32x4_t g_ai4_ihevc_trans_32_26_47 = { 43, 9, -57, 87 };
        static const int32x4_t g_ai4_ihevc_trans_32_30_03 = { 9, -25, 43, -57 };
        static const int32x4_t g_ai4_ihevc_trans_32_30_47 = { 70, -80, 87, -90 };

        static const int32x4_t g_ai4_ihevc_trans_32_1_03 = { 90, 90, 88, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_47 = { 82, 78, 73, 67 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_811 = { 61, 54, 46, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_1_1215 = { 31, 22, 13, 4 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_03 = { 90, 82, 67, 46 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_47 = { 22, -4, -31, -54 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_811 = { -73, -85, -90, -88 };
        static const int32x4_t g_ai4_ihevc_trans_32_3_1215 = { -78, -61, -38, -13 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_03 = { 88, 67, 31, -13 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_47 = { -54, -82, -90, -78 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_811 = { -46, -4, 38, 73 };
        static const int32x4_t g_ai4_ihevc_trans_32_5_1215 = { 90, 85, 61, 22 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_03 = { 85, 46, -13, -67 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_47 = { -90, -73, -22, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_811 = { 82, 88, 54, -4 };
        static const int32x4_t g_ai4_ihevc_trans_32_7_1215 = { -61, -90, -78, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_03 = { 82, 22, -54, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_47 = { -61, 13, 78, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_811 = { 31, -46, -90, -67 };
        static const int32x4_t g_ai4_ihevc_trans_32_9_1215 = { 4, 73, 88, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_03 = { 78, -4, -82, -73 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_47 = { 13, 85, 67, -22 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_811 = { -88, -61, 31, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_11_1215 = { 54, -38, -90, -46 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_03 = { 73, -31, -90, -22 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_47 = { 78, 67, -38, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_811 = { -13, 82, 61, -46 };
        static const int32x4_t g_ai4_ihevc_trans_32_13_1215 = { -88, -4, 85, 54 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_03 = { 67, -54, -78, 38 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_47 = { 85, -22, -90, 4 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_811 = { 90, 13, -88, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_15_1215 = { 82, 46, -73, -61 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_03 = { 61, -73, -46, 82 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_47 = { 31, -88, -13, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_811 = { -4, -90, 22, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_17_1215 = { -38, -78, 54, 67 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_03 = { 54, -85, -4, 88 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_47 = { -46, -61, 82, 13 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_811 = { -90, 38, 67, -78 };
        static const int32x4_t g_ai4_ihevc_trans_32_19_1215 = { -22, 90, -31, -73 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_03 = { 46, -90, 38, 54 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_47 = { -90, 31, 61, -88 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_811 = { 22, 67, -85, 13 };
        static const int32x4_t g_ai4_ihevc_trans_32_21_1215 = { 73, -82, 4, 78 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_03 = { 38, -88, 73, -4 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_47 = { -67, 90, -46, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_811 = { 85, -78, 13, 61 };
        static const int32x4_t g_ai4_ihevc_trans_32_23_1215 = { -90, 54, 22, -82 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_03 = { 31, -78, 90, -61 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_47 = { 4, 54, -88, 82 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_811 = { -38, -22, 73, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_25_1215 = { 67, -13, -46, 85 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_03 = { 22, -61, 85, -90 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_47 = { 73, -38, -4, 46 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_811 = { -78, 90, -82, 54 };
        static const int32x4_t g_ai4_ihevc_trans_32_27_1215 = { -13, -31, 67, -88 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_03 = { 13, -38, 61, -78 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_47 = { 88, -90, 85, -73 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_811 = { 54, -31, 4, 22 };
        static const int32x4_t g_ai4_ihevc_trans_32_29_1215 = { -46, 67, -82, 90 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_03 = { 4, -13, 22, -31 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_47 = { 38, -46, 54, -61 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_811 = { 67, -73, 78, -82 };
        static const int32x4_t g_ai4_ihevc_trans_32_31_1215 = { 85, -88, 90, -90 };

        int32x4_t a[32];

        pi4_temp = pi4_temp_orig;
        for(i = 0; i < 32; i++)
        {
            int32x4_t temp_data[8];

            temp_data[0] = vld1q_s32(pi4_temp);
            temp_data[1] = vld1q_s32(pi4_temp + 4);
            temp_data[2] = vld1q_s32(pi4_temp + 8);
            temp_data[3] = vld1q_s32(pi4_temp + 12);

            temp_data[4] = vrev64q_s32(vld1q_s32(pi4_temp + 16));
            temp_data[4] = vcombine_s32(
                vget_high_s32(temp_data[4]), vget_low_s32(temp_data[4]));

            temp_data[5] = vrev64q_s32(vld1q_s32(pi4_temp + 20));
            temp_data[5] = vcombine_s32(
                vget_high_s32(temp_data[5]), vget_low_s32(temp_data[5]));

            temp_data[6] = vrev64q_s32(vld1q_s32(pi4_temp + 24));
            temp_data[6] = vcombine_s32(
                vget_high_s32(temp_data[6]), vget_low_s32(temp_data[6]));

            temp_data[7] = vrev64q_s32(vld1q_s32(pi4_temp + 28));
            temp_data[7] = vcombine_s32(
                vget_high_s32(temp_data[7]), vget_low_s32(temp_data[7]));

            pi4_temp += 32;
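            /* One temp row = one Stage 1 coefficient index across all 32
             * input rows. The upper 16 values are loaded mirrored
             * (vrev64q plus half swap) so that the butterfly below can form
             * t(k) +/- t(33-k) with plain vector adds/subs. */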

            const int32x4_t o0 =
                vsubq_s32(temp_data[0], temp_data[7]); /*O(1-4) = t(1-4) - t(32-29)*/
            const int32x4_t o1 =
                vsubq_s32(temp_data[1], temp_data[6]); /*O(5-8) = t(5-8) - t(28-25)*/
            const int32x4_t o2 =
                vsubq_s32(temp_data[2], temp_data[5]); /*O(9-12) = t(9-12) - t(24-21)*/
            const int32x4_t o3 =
                vsubq_s32(temp_data[3], temp_data[4]); /*O(13-16) = t(13-16) - t(20-17)*/

            int32x4_t e3 = vrev64q_s32(
                vaddq_s32(temp_data[3], temp_data[4])); /*E(13-16) = t(13-16) + t(20-17)*/
            e3 = vcombine_s32(vget_high_s32(e3), vget_low_s32(e3)); /*-> E(16-13)*/
            int32x4_t e2 = vrev64q_s32(
                vaddq_s32(temp_data[2], temp_data[5])); /*E(9-12) = t(9-12) + t(24-21)*/
            e2 = vcombine_s32(vget_high_s32(e2), vget_low_s32(e2)); /*-> E(12-9)*/

            const int32x4_t e1 =
                vaddq_s32(temp_data[1], temp_data[6]); /*E(5-8) = t(5-8) + t(28-25)*/
            const int32x4_t e0 =
                vaddq_s32(temp_data[0], temp_data[7]); /*E(1-4) = t(1-4) + t(32-29)*/

            const int32x4_t ee0 = vaddq_s32(e0, e3); /*EE(1-4) = E(1-4) + E(16-13)*/
            int32x4_t ee1 =
                vrev64q_s32(vaddq_s32(e1, e2)); /*EE(5-8) = E(5-8) + E(12-9)*/
            ee1 = vcombine_s32(vget_high_s32(ee1), vget_low_s32(ee1)); /*-> EE(8-5)*/
            const int32x4_t eo1 = vsubq_s32(e1, e2); /*EO(5-8) = E(5-8) - E(12-9)*/
            const int32x4_t eo0 = vsubq_s32(e0, e3); /*EO(1-4) = E(1-4) - E(16-13)*/

            /*EE(1-4) - EE(8-5)*/
            const int32x4_t eeo = vsubq_s32(ee0, ee1); //Q5
            /*EE(1-4) + EE(8-5)*/
            const int32x4_t eee = vaddq_s32(ee0, ee1); //Q4

            /*EEEE Calculations*/
            const int32x4_t eeee = vcombine_s32(
                vadd_s32(vget_low_s32(eee), vrev64_s32(vget_high_s32(eee))),
                vsub_s32(
                    vget_low_s32(eee), vrev64_s32(vget_high_s32(eee)))); //q6
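            /* eeee = {EEE(1)+EEE(4), EEE(2)+EEE(3), EEE(1)-EEE(4),
             * EEE(2)-EEE(3)} = {EEEE(1-2), EEEO(1-2)}: the final 2-point
             * butterfly of the even-even-even path, computed once for the
             * whole row. */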

            /*Calculation of values 0 8 16 24*/
            /*Multiplications*/
            a[0] = vmulq_s32(g_ai4_ihevc_trans_32_0_8, eeee);
            a[0] = vmlaq_s32(a[0], g_ai4_ihevc_trans_32_1_8, vrev64q_s32(eeee));
            /*Shift*/
            int16x4_t val_0 = vrshrn_n_s32(a[0], 15);
            /*Store*/
            vst1_lane_s16(pi2_dst, val_0, 0); /*Value 0*/
            vst1_lane_s16(pi2_dst + 8 * dst_strd, val_0, 2); /*Value 8*/
            vst1_lane_s16(pi2_dst + 16 * dst_strd, val_0, 1); /*Value 16*/
            vst1_lane_s16(pi2_dst + 24 * dst_strd, val_0, 3); /*Value 24*/
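            /* vrshrn_n_s32(.., 15) applies the full forward-transform
             * rounding shift in one step; 15 appears to correspond to
             * shift1 + shift2 (4 + 11) for an 8-bit 32x32 TU, deferred
             * entirely to Stage 2 since Stage 1 kept exact 32-bit sums.
             * Each output coefficient j of this pass is scattered to row j
             * of pi2_dst via j * dst_strd. */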
1212
1213 /*Calculation of values 4 12 20 28*/
1214 /*Multiplications*/
1215 a[4] = vmulq_s32(g_ai4_ihevc_trans_32_4_04, eeo);
1216 a[12] = vmulq_s32(g_ai4_ihevc_trans_32_12_04, eeo);
1217 a[20] = vmulq_s32(g_ai4_ihevc_trans_32_20_04, eeo);
1218 a[28] = vmulq_s32(g_ai4_ihevc_trans_32_28_04, eeo);
1219 /*Transposes*/
1220 int32x4x2_t val_412 = vtrnq_s32(a[4], a[12]); //q0 q9
1221 int32x4x2_t val_2028 = vtrnq_s32(a[20], a[28]); //q10 q11
1222 /*Swap*/
1223 a[4] = vcombine_s32(vget_low_s32(val_412.val[0]),
1224 vget_low_s32(val_2028.val[0])); //q0
1225 a[12] = vcombine_s32(vget_low_s32(val_412.val[1]),
1226 vget_low_s32(val_2028.val[1])); //q9
1227 a[20] = vcombine_s32(vget_high_s32(val_412.val[0]),
1228 vget_high_s32(val_2028.val[0])); //q10
1229 a[28] = vcombine_s32(vget_high_s32(val_412.val[1]),
1230 vget_high_s32(val_2028.val[1])); //q11
            /*Additions*/
            a[4] = vaddq_s32(a[4], a[12]); //q0
            a[20] = vaddq_s32(a[20], a[28]); //q10
            a[4] = vaddq_s32(a[4], a[20]); //q0
            /*Shift*/
            int16x4_t val_4 = vrshrn_n_s32(a[4], 15);
            /*Store*/
            vst1_lane_s16(pi2_dst + 4 * dst_strd, val_4, 0); /*Value 4*/
            vst1_lane_s16(pi2_dst + 12 * dst_strd, val_4, 1); /*Value 12*/
            vst1_lane_s16(pi2_dst + 20 * dst_strd, val_4, 2); /*Value 20*/
            vst1_lane_s16(pi2_dst + 28 * dst_strd, val_4, 3); /*Value 28*/

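            /* Rows 2, 6, ..., 30 are 8-tap dot products with EO(1-8): the
             * _03 tables hold columns 1-4 (multiplied against eo0) and the
             * _47 tables columns 5-8 (accumulated against eo1); the
             * vadd_s32/vpadd_s32 steps then reduce each 4-lane accumulator
             * to a single sum per output row. */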
            /*Calculation of values 2 6 10 14 18 22 26 30*/
            /*Multiplications*/
            a[2] = vmulq_s32(g_ai4_ihevc_trans_32_2_03, eo0); //q8
            a[6] = vmulq_s32(g_ai4_ihevc_trans_32_6_03, eo0); //q2
            a[10] = vmulq_s32(g_ai4_ihevc_trans_32_10_03, eo0); //q2
            a[14] = vmulq_s32(g_ai4_ihevc_trans_32_14_03, eo0); //q2

            a[14] = vmlaq_s32(a[14], g_ai4_ihevc_trans_32_14_47, eo1);
            a[10] = vmlaq_s32(a[10], g_ai4_ihevc_trans_32_10_47, eo1);
            a[6] = vmlaq_s32(a[6], g_ai4_ihevc_trans_32_6_47, eo1);
            a[2] = vmlaq_s32(a[2], g_ai4_ihevc_trans_32_2_47, eo1);

            int32x2_t val_2 = vadd_s32(vget_low_s32(a[2]), vget_high_s32(a[2]));
            int32x2_t val_6 = vadd_s32(vget_low_s32(a[6]), vget_high_s32(a[6]));
            val_2 = vpadd_s32(val_2, val_6);

            int32x2_t val_10 =
                vadd_s32(vget_low_s32(a[10]), vget_high_s32(a[10]));
            int32x2_t val_14 =
                vadd_s32(vget_low_s32(a[14]), vget_high_s32(a[14]));
            val_10 = vpadd_s32(val_10, val_14);

            /*Shift*/
            int16x4_t val__2 =
                vrshrn_n_s32(vcombine_s32(val_2, val_10), 15); //q9 q12

            /*Store*/
            vst1_lane_s16(pi2_dst + 2 * dst_strd, val__2, 0); /*Value 2*/
            vst1_lane_s16(pi2_dst + 6 * dst_strd, val__2, 1); /*Value 6*/
            vst1_lane_s16(pi2_dst + 10 * dst_strd, val__2, 2); /*Value 10*/
            vst1_lane_s16(pi2_dst + 14 * dst_strd, val__2, 3); /*Value 14*/

            a[18] = vmulq_s32(g_ai4_ihevc_trans_32_18_03, eo0); //q2
            a[22] = vmulq_s32(g_ai4_ihevc_trans_32_22_03, eo0); //q2
            a[26] = vmulq_s32(g_ai4_ihevc_trans_32_26_03, eo0); //q2
            a[30] = vmulq_s32(g_ai4_ihevc_trans_32_30_03, eo0); //q2

            a[30] = vmlaq_s32(a[30], g_ai4_ihevc_trans_32_30_47, eo1);
            a[26] = vmlaq_s32(a[26], g_ai4_ihevc_trans_32_26_47, eo1);
            a[22] = vmlaq_s32(a[22], g_ai4_ihevc_trans_32_22_47, eo1);
            a[18] = vmlaq_s32(a[18], g_ai4_ihevc_trans_32_18_47, eo1);

            int32x2_t val_18 =
                vadd_s32(vget_low_s32(a[18]), vget_high_s32(a[18]));
            int32x2_t val_22 =
                vadd_s32(vget_low_s32(a[22]), vget_high_s32(a[22]));
            val_18 = vpadd_s32(val_18, val_22);
            int32x2_t val_26 =
                vadd_s32(vget_low_s32(a[26]), vget_high_s32(a[26]));
            int32x2_t val_30 =
                vadd_s32(vget_low_s32(a[30]), vget_high_s32(a[30]));
            val_26 = vpadd_s32(val_26, val_30);

            int16x4_t val__18 =
                vrshrn_n_s32(vcombine_s32(val_18, val_26), 15); //q9 q12

            vst1_lane_s16(pi2_dst + 18 * dst_strd, val__18, 0); /*Value 18*/
            vst1_lane_s16(pi2_dst + 22 * dst_strd, val__18, 1); /*Value 22*/
            vst1_lane_s16(pi2_dst + 26 * dst_strd, val__18, 2); /*Value 26*/
            vst1_lane_s16(pi2_dst + 30 * dst_strd, val__18, 3); /*Value 30*/

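            /* Odd rows: dst(k), k = 1, 3, ..., 31, is a 16-tap dot product
             * with O(1-16). A scalar sketch of what the intrinsics below
             * compute, with coef[k][n] denoting the DCT matrix entries that
             * the g_ai4_ihevc_trans_32_k_{03,47,811,1215} tables pack four
             * columns at a time:
             *
             *     for(k = 1; k < 32; k += 2)
             *     {
             *         WORD32 sum = 0;
             *         for(n = 0; n < 16; n++)
             *             sum += coef[k][n] * O[n];
             *         pi2_dst[k * dst_strd] = (sum + (1 << 14)) >> 15;
             *     }
             *
             * Each vmulq/vmlaq accumulates four columns; the vadd_s32 and
             * vpadd_s32 pairs reduce each accumulator to one row's sum. */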
            /*Calculations for odd indexes*/
            a[7] = vmulq_s32(g_ai4_ihevc_trans_32_7_03, o0); //q1
            a[5] = vmulq_s32(g_ai4_ihevc_trans_32_5_03, o0); //q1
            a[3] = vmulq_s32(g_ai4_ihevc_trans_32_3_03, o0); //q1
            a[1] = vmulq_s32(g_ai4_ihevc_trans_32_1_03, o0); //q1

            a[1] = vmlaq_s32(a[1], g_ai4_ihevc_trans_32_1_47, o1);
            a[3] = vmlaq_s32(a[3], g_ai4_ihevc_trans_32_3_47, o1);
            a[5] = vmlaq_s32(a[5], g_ai4_ihevc_trans_32_5_47, o1);
            a[7] = vmlaq_s32(a[7], g_ai4_ihevc_trans_32_7_47, o1);

            a[7] = vmlaq_s32(a[7], g_ai4_ihevc_trans_32_7_811, o2);
            a[5] = vmlaq_s32(a[5], g_ai4_ihevc_trans_32_5_811, o2);
            a[3] = vmlaq_s32(a[3], g_ai4_ihevc_trans_32_3_811, o2);
            a[1] = vmlaq_s32(a[1], g_ai4_ihevc_trans_32_1_811, o2);

            a[1] = vmlaq_s32(a[1], g_ai4_ihevc_trans_32_1_1215, o3);
            int32x2_t val_1 = vadd_s32(vget_low_s32(a[1]), vget_high_s32(a[1]));
            a[3] = vmlaq_s32(a[3], g_ai4_ihevc_trans_32_3_1215, o3);
            int32x2_t val_3 = vadd_s32(vget_low_s32(a[3]), vget_high_s32(a[3]));
            val_1 = vpadd_s32(val_1, val_3);
            a[5] = vmlaq_s32(a[5], g_ai4_ihevc_trans_32_5_1215, o3);
            int32x2_t val_5 = vadd_s32(vget_low_s32(a[5]), vget_high_s32(a[5]));
            a[7] = vmlaq_s32(a[7], g_ai4_ihevc_trans_32_7_1215, o3);
            int32x2_t val_7 = vadd_s32(vget_low_s32(a[7]), vget_high_s32(a[7]));
            val_5 = vpadd_s32(val_5, val_7);

            /*Shift*/
            int16x4_t val__1 =
                vrshrn_n_s32(vcombine_s32(val_1, val_5), 15); //q9 q12

            /*Store*/
            vst1_lane_s16(pi2_dst + 1 * dst_strd, val__1, 0); /*Value 1*/
            vst1_lane_s16(pi2_dst + 3 * dst_strd, val__1, 1); /*Value 3*/
            vst1_lane_s16(pi2_dst + 5 * dst_strd, val__1, 2); /*Value 5*/
            vst1_lane_s16(pi2_dst + 7 * dst_strd, val__1, 3); /*Value 7*/

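            /* Rows 9-15, 17-23 and 25-31 below repeat the same
             * multiply-accumulate and pairwise-reduce pattern with their
             * own coefficient rows. */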
            a[15] = vmulq_s32(g_ai4_ihevc_trans_32_15_03, o0); //q1
            a[13] = vmulq_s32(g_ai4_ihevc_trans_32_13_03, o0); //q1
            a[11] = vmulq_s32(g_ai4_ihevc_trans_32_11_03, o0); //q1
            a[9] = vmulq_s32(g_ai4_ihevc_trans_32_9_03, o0); //q1

            a[9] = vmlaq_s32(a[9], g_ai4_ihevc_trans_32_9_47, o1);
            a[11] = vmlaq_s32(a[11], g_ai4_ihevc_trans_32_11_47, o1);
            a[13] = vmlaq_s32(a[13], g_ai4_ihevc_trans_32_13_47, o1);
            a[15] = vmlaq_s32(a[15], g_ai4_ihevc_trans_32_15_47, o1);

            a[15] = vmlaq_s32(a[15], g_ai4_ihevc_trans_32_15_811, o2);
            a[13] = vmlaq_s32(a[13], g_ai4_ihevc_trans_32_13_811, o2);
            a[11] = vmlaq_s32(a[11], g_ai4_ihevc_trans_32_11_811, o2);
            a[9] = vmlaq_s32(a[9], g_ai4_ihevc_trans_32_9_811, o2);

            a[9] = vmlaq_s32(a[9], g_ai4_ihevc_trans_32_9_1215, o3);
            int32x2_t val_9 = vadd_s32(vget_low_s32(a[9]), vget_high_s32(a[9]));
            a[11] = vmlaq_s32(a[11], g_ai4_ihevc_trans_32_11_1215, o3);
            int32x2_t val_11 =
                vadd_s32(vget_low_s32(a[11]), vget_high_s32(a[11]));
            val_9 = vpadd_s32(val_9, val_11);
            a[13] = vmlaq_s32(a[13], g_ai4_ihevc_trans_32_13_1215, o3);
            int32x2_t val_13 =
                vadd_s32(vget_low_s32(a[13]), vget_high_s32(a[13]));
            a[15] = vmlaq_s32(a[15], g_ai4_ihevc_trans_32_15_1215, o3);
            int32x2_t val_15 =
                vadd_s32(vget_low_s32(a[15]), vget_high_s32(a[15]));
            val_13 = vpadd_s32(val_13, val_15);

            int16x4_t val__9 =
                vrshrn_n_s32(vcombine_s32(val_9, val_13), 15); //q9 q12

            vst1_lane_s16(pi2_dst + 9 * dst_strd, val__9, 0); /*Value 9*/
            vst1_lane_s16(pi2_dst + 11 * dst_strd, val__9, 1); /*Value 11*/
            vst1_lane_s16(pi2_dst + 13 * dst_strd, val__9, 2); /*Value 13*/
            vst1_lane_s16(pi2_dst + 15 * dst_strd, val__9, 3); /*Value 15*/

            a[23] = vmulq_s32(g_ai4_ihevc_trans_32_23_03, o0); //q1
            a[21] = vmulq_s32(g_ai4_ihevc_trans_32_21_03, o0); //q1
            a[19] = vmulq_s32(g_ai4_ihevc_trans_32_19_03, o0); //q1
            a[17] = vmulq_s32(g_ai4_ihevc_trans_32_17_03, o0); //q1

            a[17] = vmlaq_s32(a[17], g_ai4_ihevc_trans_32_17_47, o1);
            a[19] = vmlaq_s32(a[19], g_ai4_ihevc_trans_32_19_47, o1);
            a[21] = vmlaq_s32(a[21], g_ai4_ihevc_trans_32_21_47, o1);
            a[23] = vmlaq_s32(a[23], g_ai4_ihevc_trans_32_23_47, o1);

            a[23] = vmlaq_s32(a[23], g_ai4_ihevc_trans_32_23_811, o2);
            a[21] = vmlaq_s32(a[21], g_ai4_ihevc_trans_32_21_811, o2);
            a[19] = vmlaq_s32(a[19], g_ai4_ihevc_trans_32_19_811, o2);
            a[17] = vmlaq_s32(a[17], g_ai4_ihevc_trans_32_17_811, o2);

            a[17] = vmlaq_s32(a[17], g_ai4_ihevc_trans_32_17_1215, o3);
            int32x2_t val_17 =
                vadd_s32(vget_low_s32(a[17]), vget_high_s32(a[17]));
            a[19] = vmlaq_s32(a[19], g_ai4_ihevc_trans_32_19_1215, o3);
            int32x2_t val_19 =
                vadd_s32(vget_low_s32(a[19]), vget_high_s32(a[19]));
            val_17 = vpadd_s32(val_17, val_19);
            a[21] = vmlaq_s32(a[21], g_ai4_ihevc_trans_32_21_1215, o3);
            int32x2_t val_21 =
                vadd_s32(vget_low_s32(a[21]), vget_high_s32(a[21]));
            a[23] = vmlaq_s32(a[23], g_ai4_ihevc_trans_32_23_1215, o3);
            int32x2_t val_23 =
                vadd_s32(vget_low_s32(a[23]), vget_high_s32(a[23]));
            val_21 = vpadd_s32(val_21, val_23);

            int16x4_t val__17 =
                vrshrn_n_s32(vcombine_s32(val_17, val_21), 15); //q9 q12

            vst1_lane_s16(pi2_dst + 17 * dst_strd, val__17, 0); /*Value 17*/
            vst1_lane_s16(pi2_dst + 19 * dst_strd, val__17, 1); /*Value 19*/
            vst1_lane_s16(pi2_dst + 21 * dst_strd, val__17, 2); /*Value 21*/
            vst1_lane_s16(pi2_dst + 23 * dst_strd, val__17, 3); /*Value 23*/

            a[31] = vmulq_s32(g_ai4_ihevc_trans_32_31_03, o0); //q10
            a[29] = vmulq_s32(g_ai4_ihevc_trans_32_29_03, o0); //q1
            a[27] = vmulq_s32(g_ai4_ihevc_trans_32_27_03, o0); //q1
            a[25] = vmulq_s32(g_ai4_ihevc_trans_32_25_03, o0); //q1

            a[25] = vmlaq_s32(a[25], g_ai4_ihevc_trans_32_25_47, o1);
            a[27] = vmlaq_s32(a[27], g_ai4_ihevc_trans_32_27_47, o1);
            a[29] = vmlaq_s32(a[29], g_ai4_ihevc_trans_32_29_47, o1);
            a[31] = vmlaq_s32(a[31], g_ai4_ihevc_trans_32_31_47, o1);

            a[31] = vmlaq_s32(a[31], g_ai4_ihevc_trans_32_31_811, o2);
            a[29] = vmlaq_s32(a[29], g_ai4_ihevc_trans_32_29_811, o2);
            a[27] = vmlaq_s32(a[27], g_ai4_ihevc_trans_32_27_811, o2);
            a[25] = vmlaq_s32(a[25], g_ai4_ihevc_trans_32_25_811, o2);

            a[25] = vmlaq_s32(a[25], g_ai4_ihevc_trans_32_25_1215, o3);
            int32x2_t val_25 =
                vadd_s32(vget_low_s32(a[25]), vget_high_s32(a[25]));
            a[27] = vmlaq_s32(a[27], g_ai4_ihevc_trans_32_27_1215, o3);
            int32x2_t val_27 =
                vadd_s32(vget_low_s32(a[27]), vget_high_s32(a[27]));
            val_25 = vpadd_s32(val_25, val_27);
            a[29] = vmlaq_s32(a[29], g_ai4_ihevc_trans_32_29_1215, o3);
            int32x2_t val_29 =
                vadd_s32(vget_low_s32(a[29]), vget_high_s32(a[29]));
            a[31] = vmlaq_s32(a[31], g_ai4_ihevc_trans_32_31_1215, o3);
            int32x2_t val_31 =
                vadd_s32(vget_low_s32(a[31]), vget_high_s32(a[31]));
            val_29 = vpadd_s32(val_29, val_31);

            int16x4_t val__25 =
                vrshrn_n_s32(vcombine_s32(val_25, val_29), 15); //q9 q12

            vst1_lane_s16(pi2_dst + 25 * dst_strd, val__25, 0); /*Value 25*/
            vst1_lane_s16(pi2_dst + 27 * dst_strd, val__25, 1); /*Value 27*/
            vst1_lane_s16(pi2_dst + 29 * dst_strd, val__25, 2); /*Value 29*/
            vst1_lane_s16(pi2_dst + 31 * dst_strd, val__25, 3); /*Value 31*/

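            /* One iteration writes one full column of the 32x32 output
             * (rows addressed as k * dst_strd); step to the next column. */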
            pi2_dst++;
        }
    }
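    /* SAD of the residue block, accumulated during the residue pass */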
    return u4_blk_sad;
}
