1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /**
22 *******************************************************************************
23 * @file
24 * ihevc_resi_trans_neon.c
25 *
26 * @brief
27 * Contains definitions of functions for computing residue and fwd transform
28 *
29 * @author
30 * Ittiam
31 *
32 * @par List of Functions:
33 * - ihevc_resi_trans_4x4_neon()
34 * - ihevc_resi_trans_4x4_ttype1_neon()
35 * - ihevc_resi_trans_8x8_neon()
36 * - ihevc_resi_trans_16x16_neon()
37 * @remarks
38 * None
39 *
40 *******************************************************************************
41 */
42
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46 /* System include files */
47 #include <stdio.h>
48 #include <string.h>
49
50 /* System user files */
51 #include "ihevc_typedefs.h"
52 #include "ihevc_macros.h"
53 #include "ihevc_defs.h"
54 #include "ihevc_cmn_utils_neon.h"
55
56 #include "ihevc_trans_tables.h"
57 #include "ihevc_resi_trans.h"
58
59 /*****************************************************************************/
60 /* Function Definitions */
61 /*****************************************************************************/
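/**
*******************************************************************************
*
* @brief
*  This function performs residue calculation and forward transform on
*  input pixels
*
* @par Description:
*  Performs residue calculation by subtracting the prediction from the source,
*  followed by the forward transform
*
* @param[in] pu1_src
*  Input 4x4 pixels
*
* @param[in] pu1_pred
*  Prediction data
*
* @param[in] pi4_temp
*  Temporary buffer (unused)
*
* @param[out] pi2_dst
*  Output 4x4 coefficients
*
* @param[in] src_strd
*  Input stride
*
* @param[in] pred_strd
*  Prediction Stride
*
* @param[in] dst_strd
*  Output Stride
*
* @param[in] e_chroma_plane
*  Enum signalling chroma plane
*
* @returns block sad
*
* @remarks
*  None
*
*******************************************************************************
*/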
UWORD32 ihevc_resi_trans_4x4_neon(
    UWORD8 *pu1_src,
    UWORD8 *pu1_pred,
    WORD32 *pi4_temp,
    WORD16 *pi2_dst,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 dst_strd,
    CHROMA_PLANE_ID_T e_chroma_plane)
71 {
72 UWORD32 sad;
73 uint8x16_t inp_buf, pred_buf;
74 int16x8_t diff_1, diff_2;
75 int16x4_t diff_1_low, diff_1_high, diff_2_low, diff_2_high;
76 int16x8_t e_01, o_32;
77 int16x4_t e_0, e_1, o_0, o_1;
78 int32x4_t e_0_a_e_1, e_0_s_e_1;
79 int32x4_t temp1, temp2, temp3, temp4;
80 int32x4_t o_1_m_trans_10, o_1_m_trans_11;
81 int32x4_t e_03, e_12, o_03, o_12;
82 int16x4_t out_0, out_1, out_2, out_3;
83 uint16x8_t abs;
84 uint32x4_t b;
85 uint64x2_t c;
86
87 (void)pi4_temp;
88 if(e_chroma_plane == NULL_PLANE)
89 {
90 inp_buf = load_unaligned_u8q(pu1_src, src_strd);
91 pred_buf = load_unaligned_u8q(pu1_pred, pred_strd);
92 }
93 else
94 {
95 inp_buf = load_unaligned_u8qi(pu1_src + e_chroma_plane, src_strd);
96 pred_buf = load_unaligned_u8qi(pu1_pred + e_chroma_plane, pred_strd);
97 }
98
99 abs = vabdl_u8(vget_low_u8(inp_buf), vget_low_u8(pred_buf));
100 abs = vabal_u8(abs, vget_high_u8(inp_buf), vget_high_u8(pred_buf));
101 b = vpaddlq_u16(abs);
102 c = vpaddlq_u32(b);
103 sad = vget_lane_u32(vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
104 vreinterpret_u32_u64(vget_high_u64(c))),
105 0);
106
107 diff_1 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(inp_buf), vget_low_u8(pred_buf)));
108 diff_2 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(inp_buf), vget_high_u8(pred_buf)));
109
110 diff_1_low = vget_low_s16(diff_1);
111 diff_1_high = vget_high_s16(diff_1);
112 diff_2_low = vget_low_s16(diff_2);
113 diff_2_high = vget_high_s16(diff_2);
114
115 transpose_s16_4x4d(&diff_1_low, &diff_1_high, &diff_2_low, &diff_2_high);
116 diff_1 = vcombine_s16(diff_1_low, diff_1_high);
117 diff_2 = vcombine_s16(diff_2_high, diff_2_low);
118
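    /* A scalar sketch of the 4-point forward DCT pass implemented by the
     * intrinsics below, with r0..r3 the four residues along the transform
     * direction and 64/83/36 the g_ai2_ihevc_trans_4 constants used here:
     *
     *     e0 = r0 + r3;            e1 = r1 + r2;
     *     o0 = r0 - r3;            o1 = r1 - r2;
     *     F0 = 64 * (e0 + e1);     F2 = 64 * (e0 - e1);
     *     F1 = 83 * o0 + 36 * o1;  F3 = 36 * o0 - 83 * o1;
     */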
119 e_01 = vaddq_s16(diff_1, diff_2);
120 o_32 = vsubq_s16(diff_1, diff_2);
121
122 e_0 = vget_low_s16(e_01);
123 e_1 = vget_high_s16(e_01);
124 o_0 = vget_high_s16(o_32);
125 o_1 = vget_low_s16(o_32);
126
127 e_0_a_e_1 = vaddl_s16(e_0, e_1);
128 e_0_s_e_1 = vsubl_s16(e_0, e_1);
129
130 temp1 = vmulq_n_s32(e_0_a_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
131 temp2 = vmulq_n_s32(e_0_s_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
132
133 o_1_m_trans_10 = vmull_n_s16(o_1, (WORD32)g_ai2_ihevc_trans_4[1][0]);
134 o_1_m_trans_11 = vmull_n_s16(o_1, (WORD32)g_ai2_ihevc_trans_4[1][1]);
135
136 temp3 = vmlal_n_s16(o_1_m_trans_10, o_0, (WORD32)g_ai2_ihevc_trans_4[1][1]);
137 temp4 = vmlsl_n_s16(o_1_m_trans_11, o_0, (WORD32)g_ai2_ihevc_trans_4[1][0]);
138
139 transpose_s32_4x4(&temp1, &temp3, &temp2, &temp4);
140
141 e_03 = vaddq_s32(temp1, temp4);
142 e_12 = vaddq_s32(temp3, temp2);
143 o_03 = vsubq_s32(temp1, temp4);
144 o_12 = vsubq_s32(temp3, temp2);
145
146 e_0_a_e_1 = vaddq_s32(e_03, e_12);
147 e_0_s_e_1 = vsubq_s32(e_03, e_12);
148
149 temp1 = vmulq_n_s32(e_0_a_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
150 temp2 = vmulq_n_s32(e_0_s_e_1, (WORD32)g_ai2_ihevc_trans_4[0][0]);
151
152 o_1_m_trans_10 = vmulq_n_s32(o_03, (WORD32)g_ai2_ihevc_trans_4[1][0]);
153 o_1_m_trans_11 = vmulq_n_s32(o_03, (WORD32)g_ai2_ihevc_trans_4[1][1]);
154
155 temp3 = vmlaq_n_s32(o_1_m_trans_10, o_12, (WORD32)g_ai2_ihevc_trans_4[1][1]);
156 temp4 = vmlsq_n_s32(o_1_m_trans_11, o_12, (WORD32)g_ai2_ihevc_trans_4[1][0]);
157
158 out_0 = vrshrn_n_s32(temp1, 9);
159 out_1 = vrshrn_n_s32(temp3, 9);
160 out_2 = vrshrn_n_s32(temp2, 9);
161 out_3 = vrshrn_n_s32(temp4, 9);
162
163 vst1_s16(pi2_dst, out_0);
164 vst1_s16(pi2_dst + dst_strd, out_1);
165 vst1_s16(pi2_dst + 2 * dst_strd, out_2);
166 vst1_s16(pi2_dst + 3 * dst_strd, out_3);
167
168 return sad;
169 }
170
171 /**
172 *******************************************************************************
173 *
174 * @brief
175 * This function performs residue calculation and forward transform type 1
176 * on input pixels
177 *
178 * @par Description:
* Performs residue calculation by subtracting the prediction from the source,
* followed by the forward transform
181 *
182 * @param[in] pu1_src
183 * Input 4x4 pixels
184 *
185 * @param[in] pu1_pred
186 * Prediction data
187 *
* @param[in] pi4_temp
*  Temporary buffer (unused)
190 *
191 * @param[out] pi2_dst
192 * Output 4x4 coefficients
193 *
194 * @param[in] src_strd
195 * Input stride
196 *
197 * @param[in] pred_strd
198 * Prediction Stride
199 *
200 * @param[in] dst_strd
201 * Output Stride
202 *
203 * @param[in] e_chroma_plane
* Enum signalling chroma plane
205 *
206 * @returns block sad
207 *
208 * @remarks
209 * None
210 *
211 *******************************************************************************
212 */
UWORD32 ihevc_resi_trans_4x4_ttype1_neon(
    UWORD8 *pu1_src,
    UWORD8 *pu1_pred,
    WORD32 *pi4_temp,
    WORD16 *pi2_dst,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 dst_strd,
    CHROMA_PLANE_ID_T e_chroma_plane)
222 {
223 UWORD32 sad;
224 int16x4_t src0_4x16b;
225 int16x4_t src1_4x16b;
226 int16x4_t src2_4x16b;
227 int16x4_t src3_4x16b;
228 int32x4_t src0_4x32b;
229 int32x4_t src1_4x32b;
230 int32x4_t src2_4x32b;
231 int32x4_t src3_4x32b;
232 /*load source and pred values */
233 const uint8x16_t src_u8 = load_unaligned_u8q(pu1_src, src_strd);
234 const uint8x16_t pred_u8 = load_unaligned_u8q(pu1_pred, pred_strd);
235
236 const int16x8_t src_reg0 =
237 vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8)));
238 const int16x8_t src_reg1 =
239 vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(src_u8), vget_high_u8(pred_u8)));
240
241 int32x4_t add_val = vdupq_n_s32(1);
242
243 uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8));
244 uint32x4_t b;
245 uint64x2_t c;
246 UNUSED(e_chroma_plane);
247 abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(pred_u8));
248 b = vpaddlq_u16(abs);
249 c = vpaddlq_u32(b);
250 sad = vget_lane_u32(vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
251 vreinterpret_u32_u64(vget_high_u64(c))),
252 0);
253
254 (void)pi4_temp;
255
256 /************************* 4x4 16bit Transpose ***********************/
257 src0_4x16b = vget_low_s16(src_reg0);
258 src1_4x16b = vget_high_s16(src_reg0);
259 src2_4x16b = vget_low_s16(src_reg1);
260 src3_4x16b = vget_high_s16(src_reg1);
261
262 transpose_s16_4x4d(&src0_4x16b, &src1_4x16b, &src2_4x16b, &src3_4x16b);
263
264 /************************** 4x4 Transpose End *************************/
265
266 /* Residue + Forward Transform 1st stage */
267 /* coeff2_4x32b = 74 74 74 74 */
268 const int32x4_t coeff2_4x32b =
269 vdupq_n_s32(74); //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[2][0]);
270 /* coeff0_4x32b = 29 29 29 29 */
271 const int32x4_t coeff0_4x32b =
272 vdupq_n_s32(29); //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[0][0]);
273 /* coeff1_4x32b = 55 55 55 55 */
274 const int32x4_t coeff1_4x32b =
275 vdupq_n_s32(55); //vld1q_s32(&g_ai4_ihevc_trans_dst_intr_4[1][0]);
276
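    /* Scalar view of the 4-point DST factorisation computed below (a
     * reference sketch only), with r0..r3 the residues along the transform
     * direction:
     *
     *     c0 = r0 + r3;  c1 = r1 + r3;  c2 = r0 - r1;  c3 = 74 * r2;
     *     F0 = 29*c0 + 55*c1 + c3   =  29*r0 + 55*r1 + 74*r2 + 84*r3
     *     F1 = 74*(r0 + r1 - r3)    =  74*r0 + 74*r1          - 74*r3
     *     F2 = 29*c2 + 55*c0 - c3   =  84*r0 - 29*r1 - 74*r2 + 55*r3
     *     F3 = 55*c2 - 29*c1 + c3   =  55*r0 - 84*r1 + 74*r2 - 29*r3
     */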
277 /* c0 to c3 calculation */
278 int32x4_t c0_4x32b = vaddl_s16(src0_4x16b, src3_4x16b); /* r0+r3 */
279 int32x4_t c1_4x32b = vaddl_s16(src1_4x16b, src3_4x16b); /* r1+r3 */
280 int32x4_t c2_4x32b = vsubl_s16(src0_4x16b, src1_4x16b); /* r0-r1 */
281 int32x4_t c3_4x32b = vmulq_s32(vmovl_s16(src2_4x16b), coeff2_4x32b); /* 74*r2 */
282 src0_4x16b = vadd_s16(src0_4x16b, src1_4x16b); /* r0+r1 */
283
284 src1_4x32b = vsubl_s16(src0_4x16b, src3_4x16b); /* r0+r1-r3 */
285 src0_4x32b = vmlaq_s32(c3_4x32b, c0_4x32b, coeff0_4x32b); /* 29*c0 + c3 */
286 src2_4x32b = vmulq_s32(c2_4x32b, coeff0_4x32b); /* 29*c2 - c3 */
287 src3_4x32b = vmlaq_s32(c3_4x32b, c2_4x32b, coeff1_4x32b); /* 55*c2 + c3 */
288 src2_4x32b = vsubq_s32(src2_4x32b, c3_4x32b);
289
290 src0_4x32b = vmlaq_s32(src0_4x32b, c1_4x32b, coeff1_4x32b); /* 29*c0 + 55*c1 + c3 */
291 src2_4x32b = vmlaq_s32(src2_4x32b, c0_4x32b, coeff1_4x32b); /* 29*c2 + 55*c0 - c3 */
292 c1_4x32b = vmulq_s32(c1_4x32b, coeff0_4x32b); /* 55*c2 - 29*c1 + c3 */
293 src1_4x32b = vmulq_s32(src1_4x32b, coeff2_4x32b); /*74*(r0+r1-r3)*/
294 src3_4x32b = vsubq_s32(src3_4x32b, c1_4x32b);
295
296 /* result + add */
297 src1_4x32b = vaddq_s32(src1_4x32b, add_val);
298 src0_4x32b = vaddq_s32(src0_4x32b, add_val);
299 src2_4x32b = vaddq_s32(src2_4x32b, add_val);
300 src3_4x32b = vaddq_s32(src3_4x32b, add_val);
301 /* result >> shift */
302 src1_4x32b = vshrq_n_s32(src1_4x32b, 1);
303 src0_4x32b = vshrq_n_s32(src0_4x32b, 1);
304 src2_4x32b = vshrq_n_s32(src2_4x32b, 1);
305 src3_4x32b = vshrq_n_s32(src3_4x32b, 1);
306 /* Forward transform 2nd stage */
307 {
308 /************************* 4x4 32bit Transpose ***********************/
309
310 transpose_s32_4x4(&src0_4x32b, &src1_4x32b, &src2_4x32b, &src3_4x32b);
311
312 /************************** 4x4 Transpose End *************************/
313
314 /* add value */
315 add_val = vdupq_n_s32(128);
316 c0_4x32b = vaddq_s32(src0_4x32b, src3_4x32b); /* r0+r3 */
317 c1_4x32b = vaddq_s32(src1_4x32b, src3_4x32b); /* r1+r3 */
318 c2_4x32b = vsubq_s32(src0_4x32b, src1_4x32b); /* r0-r1 */
319 c3_4x32b = vmulq_s32(src2_4x32b, coeff2_4x32b); /* 74*r2 */
320 src1_4x32b = vaddq_s32(src0_4x32b, src1_4x32b); /* r0+r1 */
321
322 src1_4x32b = vsubq_s32(src1_4x32b, src3_4x32b); /* r0+r1-r3 */
323 src0_4x32b = vmlaq_s32(c3_4x32b, c0_4x32b, coeff0_4x32b); /* 29*c0 + c3 */
324 src2_4x32b = vmulq_s32(c2_4x32b, coeff0_4x32b); /* 29*c2 - c3 */
325 src3_4x32b = vmlaq_s32(c3_4x32b, c2_4x32b, coeff1_4x32b); /* 55*c2 + c3 */
326 src2_4x32b = vsubq_s32(src2_4x32b, c3_4x32b);
327
328 src0_4x32b = vmlaq_s32(src0_4x32b, c1_4x32b, coeff1_4x32b); /* 29*c0 + 55*c1 + c3 */
329 src2_4x32b = vmlaq_s32(src2_4x32b, c0_4x32b, coeff1_4x32b); /* 29*c2 + 55*c0 - c3 */
330 c1_4x32b = vmulq_s32(c1_4x32b, coeff0_4x32b); /* 55*c2 - 29*c1 + c3 */
331 src1_4x32b = vmulq_s32(src1_4x32b, coeff2_4x32b); /*74*(r0+r1-r3)*/
332 src3_4x32b = vsubq_s32(src3_4x32b, c1_4x32b);
333
334 /* result + add */
335 src1_4x32b = vaddq_s32(src1_4x32b, add_val);
336 src0_4x32b = vaddq_s32(src0_4x32b, add_val);
337 src2_4x32b = vaddq_s32(src2_4x32b, add_val);
338 src3_4x32b = vaddq_s32(src3_4x32b, add_val);
339
340 src1_4x32b = vshrq_n_s32(src1_4x32b, 8);
341 src0_4x32b = vshrq_n_s32(src0_4x32b, 8);
342 src2_4x32b = vshrq_n_s32(src2_4x32b, 8);
343 src3_4x32b = vshrq_n_s32(src3_4x32b, 8);
344
345 vst1_s16((pi2_dst + dst_strd), vmovn_s32(src1_4x32b));
346 vst1_s16(pi2_dst, vmovn_s32(src0_4x32b));
347 vst1_s16((pi2_dst + 2 * dst_strd), vmovn_s32(src2_4x32b));
348 vst1_s16((pi2_dst + 3 * dst_strd), vmovn_s32(src3_4x32b));
349 }
350 return sad;
351 }
352
353 /**
354 *******************************************************************************
355 *
356 * @brief
357 * This function performs residue calculation and forward transform on
358 * input pixels
359 *
360 * @par Description:
* Performs residue calculation by subtracting the prediction from the source,
* followed by the forward transform
363 *
364 * @param[in] pu1_src
365 * Input 8x8 pixels
366 *
367 * @param[in] pu1_pred
368 * Prediction data
369 *
* @param[in] pi4_temp
*  Temporary buffer (unused)
372 *
373 * @param[out] pi2_dst
374 * Output 8x8 coefficients
375 *
376 * @param[in] src_strd
377 * Input stride
378 *
379 * @param[in] pred_strd
380 * Prediction Stride
381 *
382 * @param[in] dst_strd
383 * Output Stride
384 *
385 * @param[in] e_chroma_plane
* Enum signalling chroma plane
*
* @returns block sad
389 *
390 * @remarks
391 * None
392 *
393 *******************************************************************************
394 */
UWORD32 ihevc_resi_trans_8x8_neon(
    UWORD8 *pu1_src,
    UWORD8 *pu1_pred,
    WORD32 *pi4_temp,
    WORD16 *pi2_dst,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 dst_strd,
    CHROMA_PLANE_ID_T e_chroma_plane)
404 {
405 int16x8_t diff_16[8];
406 int16x8_t abs = vdupq_n_s16(0);
407 int32x4_t tmp_a;
408 int64x2_t tmp_b;
409 int32x2_t sad_v;
410 int32x4x2_t a0, a1, a2, a3, a4, a5, a6, a7;
411 UWORD32 sad;
412
413 (void)pi4_temp;
414 // stage 1
415 for(int k = 0; k < 8; k++)
416 {
417 if(NULL_PLANE == e_chroma_plane)
418 {
419 diff_16[k] = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(pu1_src), vld1_u8(pu1_pred)));
420 }
421 else
422 {
423 diff_16[k] = vreinterpretq_s16_u16(vsubl_u8(vld2_u8(pu1_src).val[e_chroma_plane],
424 vld2_u8(pu1_pred).val[e_chroma_plane]));
425 }
426 pu1_src += src_strd;
427 pu1_pred += pred_strd;
428 abs = vaddq_s16(abs, vabsq_s16(diff_16[k]));
429 }
430
431 tmp_a = vpaddlq_s16(abs);
432 tmp_b = vpaddlq_s32(tmp_a);
433 sad_v = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_b)),
434 vreinterpret_s32_s64(vget_high_s64(tmp_b)));
435 sad = vget_lane_s32(sad_v, 0);
436
437 transpose_s16_8x8(
438 &diff_16[0],
439 &diff_16[1],
440 &diff_16[2],
441 &diff_16[3],
442 &diff_16[4],
443 &diff_16[5],
444 &diff_16[6],
445 &diff_16[7]);
446
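    /* Stage 1 below is the even/odd split of the 8-point DCT, written out in
     * scalar form purely as a reference sketch, with C0..C7 the eight
     * residues along the transform direction:
     *
     *     e[i] = C[i] + C[7 - i],  o[i] = C[i] - C[7 - i],  i = 0..3
     *     F0 = 64*(e0 + e1 + e2 + e3)       F4 = 64*(e0 - e1 - e2 + e3)
     *     F2 = 83*(e0 - e3) + 36*(e1 - e2)  F6 = 36*(e0 - e3) - 83*(e1 - e2)
     *     F1 = 89*o0 + 75*o1 + 50*o2 + 18*o3
     *     F3 = 75*o0 - 18*o1 - 89*o2 - 50*o3
     *     F5 = 50*o0 - 89*o1 + 18*o2 + 75*o3
     *     F7 = 18*o0 - 50*o1 + 75*o2 - 89*o3
     */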
447 {
448 const int16x8_t o3 = vsubq_s16(diff_16[3], diff_16[4]); /*C3 - C4*/
449 const int16x8_t o2 = vsubq_s16(diff_16[2], diff_16[5]); /*C2 - C5*/
450 const int16x8_t o1 = vsubq_s16(diff_16[1], diff_16[6]); /*C1 - C6*/
451 const int16x8_t o0 = vsubq_s16(diff_16[0], diff_16[7]); /*C0 - C7*/
452 const int16x8_t e0 = vaddq_s16(diff_16[0], diff_16[7]); /*C0 + C7*/
453 const int16x8_t e1 = vaddq_s16(diff_16[1], diff_16[6]); /*C1 + C6*/
454 const int16x8_t e2 = vaddq_s16(diff_16[2], diff_16[5]); /*C2 + C5*/
455 const int16x8_t e3 = vaddq_s16(diff_16[3], diff_16[4]); /*C3 + C4*/
456
457 const int16x8_t ee0 = vaddq_s16(e0, e3); /*C0 + C3 + C4 + C7*/
458 const int16x8_t ee1 = vaddq_s16(e1, e2); /*C1 + C2 + C5 + C6*/
459 const int16x8_t eo0 = vsubq_s16(e0, e3); /*C0 - C3 - C4 + C7*/
460 const int16x8_t eo1 = vsubq_s16(e1, e2); /*C1 - C2 - C5 + C6*/
461
462 /*C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7*/
463 const int16x8_t eee = vaddq_s16(ee1, ee0);
464 /*C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7*/
465 const int16x8_t eeo = vsubq_s16(ee0, ee1);
466
467 /*F2[0] of 83*(C0 - C3 - C4 + C7)*/
468 a2.val[0] = vmull_n_s16(vget_low_s16(eo0), 83);
469 /*F6[0] of 36*(C0 - C3 - C4 + C7)*/
470 a6.val[0] = vmull_n_s16(vget_low_s16(eo0), 36);
471 /*F2[1] of 83*(C0 - C3 - C4 + C7)*/
472 a2.val[1] = vmull_n_s16(vget_high_s16(eo0), 83);
473 /*F6[1] of 36*(C0 - C3 - C4 + C7)*/
474 a6.val[1] = vmull_n_s16(vget_high_s16(eo0), 36);
475
476 /*F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)*/
477 a6.val[1] = vmlsl_n_s16(a6.val[1], vget_high_s16(eo1), 83);
478 /*F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)*/
479 a2.val[1] = vmlal_n_s16(a2.val[1], vget_high_s16(eo1), 36);
480 /*F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)*/
481 a6.val[0] = vmlsl_n_s16(a6.val[0], vget_low_s16(eo1), 83);
482 /*F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)*/
483 a2.val[0] = vmlal_n_s16(a2.val[0], vget_low_s16(eo1), 36);
484
485 /*F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)*/
486 a0.val[0] = vshll_n_s16(vget_low_s16(eee), 6);
487 /*F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)*/
488 a0.val[1] = vshll_n_s16(vget_high_s16(eee), 6);
489 /*F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)*/
490 a4.val[0] = vshll_n_s16(vget_low_s16(eeo), 6);
491 /*F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)*/
492 a4.val[1] = vshll_n_s16(vget_high_s16(eeo), 6);
493
494 a7.val[0] = vmull_n_s16(vget_low_s16(o0), 18); /*F7[0] = 18*(C0 - C7)*/
495 a5.val[0] = vmull_n_s16(vget_low_s16(o0), 50); /*F5[0] = 50*(C0 - C7)*/
496 a3.val[0] = vmull_n_s16(vget_low_s16(o0), 75); /*F3[0] = 75*(C0 - C7)*/
497 a1.val[0] = vmull_n_s16(vget_low_s16(o0), 89); /*F1[0] = 89*(C0 - C7)*/
498 a1.val[1] = vmull_n_s16(vget_high_s16(o0), 89); /*F1[1] = 89*(C0 - C7)*/
499 a3.val[1] = vmull_n_s16(vget_high_s16(o0), 75); /*F3[1] = 75*(C0 - C7)*/
500 a5.val[1] = vmull_n_s16(vget_high_s16(o0), 50); /*F5[1] = 50*(C0 - C7)*/
501 a7.val[1] = vmull_n_s16(vget_high_s16(o0), 18); /*F7[1] = 18*(C0 - C7)*/
502
503 /*F7[0] = 18*(C0 - C7) - 50*(C1 - C6)*/
504 a7.val[0] = vmlsl_n_s16(a7.val[0], vget_low_s16(o1), 50);
505 /*F5[0] = 50*(C0 - C7) - 89*(C1 - C6)*/
506 a5.val[0] = vmlsl_n_s16(a5.val[0], vget_low_s16(o1), 89);
507 /*F3[0] = 75*(C0 - C7) - 18*(C1 - C6)*/
508 a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o1), 18);
509 /*F1[0] = 89*(C0 - C7) + 75*(C1 - C6)*/
510 a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o1), 75);
511 /*F1[1] = 89*(C0 - C7) + 75*(C1 - C6)*/
512 a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o1), 75);
513 /*F3[1] = 75*(C0 - C7) - 18*(C1 - C6)*/
514 a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o1), 18);
515 /*F5[1] = 50*(C0 - C7) - 89*(C1 - C6)*/
516 a5.val[1] = vmlsl_n_s16(a5.val[1], vget_high_s16(o1), 89);
517 /*F7[1] = 18*(C0 - C7) - 50*(C1 - C6)*/
518 a7.val[1] = vmlsl_n_s16(a7.val[1], vget_high_s16(o1), 50);
519
520 /*F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)*/
521 a7.val[0] = vmlal_n_s16(a7.val[0], vget_low_s16(o2), 75);
522 /*F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)*/
523 a5.val[0] = vmlal_n_s16(a5.val[0], vget_low_s16(o2), 18);
524 /*F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)*/
525 a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o2), 89);
526 /*F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)*/
527 a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o2), 50);
528 /*F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)*/
529 a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o2), 50);
530 /*F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)*/
531 a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o2), 89);
532 /*F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)*/
533 a5.val[1] = vmlal_n_s16(a5.val[1], vget_high_s16(o2), 18);
534 /*F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)*/
535 a7.val[1] = vmlal_n_s16(a7.val[1], vget_high_s16(o2), 75);
536
537 /*F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)*/
538 a7.val[0] = vmlsl_n_s16(a7.val[0], vget_low_s16(o3), 89);
539 /*F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)*/
540 a5.val[0] = vmlal_n_s16(a5.val[0], vget_low_s16(o3), 75);
541 /*F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)*/
542 a3.val[0] = vmlsl_n_s16(a3.val[0], vget_low_s16(o3), 50);
543 /*F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)*/
544 a1.val[0] = vmlal_n_s16(a1.val[0], vget_low_s16(o3), 18);
545 /*F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)*/
546 a1.val[1] = vmlal_n_s16(a1.val[1], vget_high_s16(o3), 18);
547 /*F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)*/
548 a3.val[1] = vmlsl_n_s16(a3.val[1], vget_high_s16(o3), 50);
549 /*F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)*/
550 a5.val[1] = vmlal_n_s16(a5.val[1], vget_high_s16(o3), 75);
551 /*F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)*/
552 a7.val[1] = vmlsl_n_s16(a7.val[1], vget_high_s16(o3), 89);
553 }
554
555 //Stage 2
556 {
557 int32x4_t h0, h1, h2, h3, h4, h5, h6, h7;
558 int32x4_t e0_2, e1_2, e2_2, e3_2;
559 int32x4_t o0_2, o1_2, o2_2, o3_2;
560 int32x4_t ee1_2, eo1_2, eo0_2, ee0_2;
561 int16x4_t row0, row1, row2, row3, row4, row5, row6, row7;
562
        /*Transposing first half (F0-F3) of transform stage 1 (1)*/
564 int32x4x2_t b1 = vtrnq_s32(a0.val[1], a1.val[1]);
565 int32x4x2_t b3 = vtrnq_s32(a2.val[1], a3.val[1]);
566 int32x4x2_t b0 = vtrnq_s32(a0.val[0], a1.val[0]);
567 int32x4x2_t b2 = vtrnq_s32(a2.val[0], a3.val[0]);
568
        /*Transposing first half (F0-F3) of transform stage 1 (2)*/
570 a0.val[0] = vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b2.val[0]));
571 a2.val[0] = vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b2.val[0]));
572 a1.val[0] = vcombine_s32(vget_low_s32(b0.val[1]), vget_low_s32(b2.val[1]));
573 a3.val[0] = vcombine_s32(vget_high_s32(b0.val[1]), vget_high_s32(b2.val[1]));
574 a0.val[1] = vcombine_s32(vget_low_s32(b1.val[0]), vget_low_s32(b3.val[0]));
575 a2.val[1] = vcombine_s32(vget_high_s32(b1.val[0]), vget_high_s32(b3.val[0]));
576 a1.val[1] = vcombine_s32(vget_low_s32(b1.val[1]), vget_low_s32(b3.val[1]));
577 a3.val[1] = vcombine_s32(vget_high_s32(b1.val[1]), vget_high_s32(b3.val[1]));
578
579 o0_2 = vsubq_s32(a0.val[0], a3.val[1]); /*B0 - B7*/
580 o1_2 = vsubq_s32(a1.val[0], a2.val[1]); /*B1 - B6*/
581 o2_2 = vsubq_s32(a2.val[0], a1.val[1]); /*B2 - B5*/
582 o3_2 = vsubq_s32(a3.val[0], a0.val[1]); /*B3 - B4*/
583 e3_2 = vaddq_s32(a3.val[0], a0.val[1]); /*B3 + B4*/
584 e2_2 = vaddq_s32(a2.val[0], a1.val[1]); /*B2 + B5*/
585 e1_2 = vaddq_s32(a1.val[0], a2.val[1]); /*B1 + B6*/
586 e0_2 = vaddq_s32(a0.val[0], a3.val[1]); /*B0 + B7*/
587
588 eo1_2 = vsubq_s32(e1_2, e2_2); /*B1 - B2 - B5 + B6*/
589 ee1_2 = vaddq_s32(e1_2, e2_2); /*B1 + B2 + B5 + B6*/
590 eo0_2 = vsubq_s32(e0_2, e3_2); /*B0 - B3 - B4 + B7*/
591 ee0_2 = vaddq_s32(e0_2, e3_2); /*B0 + B3 + B4 + B7*/
592
593 /* F4 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7*/
594 h4 = vsubq_s32(ee0_2, ee1_2);
595 /* F0 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7*/
596 h0 = vaddq_s32(ee0_2, ee1_2);
        /* Truncating last 5 bits in H0 (the stage-2 factor of 64 is folded into the shift)*/
598 row0 = vrshrn_n_s32(h0, 5);
599 /*First half-row of row 1 of transform stage 2 (H0) stored*/
600 vst1_s16(pi2_dst, row0);
        /* Truncating last 5 bits in H4 (the stage-2 factor of 64 is folded into the shift)*/
602 row4 = vrshrn_n_s32(h4, 5);
603 /*First half-row of row 5 of transform stage 2 (H4) stored*/
604 vst1_s16(pi2_dst + 4 * dst_strd, row4);
605
606 /* F6 = 36*(B0 - B3 - B4 + B7) */
607 h6 = vmulq_n_s32(eo0_2, 36);
608 /* F2 = 83*(B0 - B3 - B4 + B7) */
609 h2 = vmulq_n_s32(eo0_2, 83);
610 /*H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)*/
611 h2 = vmlaq_n_s32(h2, eo1_2, 36);
612 /*H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)*/
613 h6 = vmlsq_n_s32(h6, eo1_2, 83);
614 /* Truncating last 11 bits in H6*/
615 row6 = vrshrn_n_s32(h6, 11);
616 /*First half-row of row 7 of transform stage 2 (H6) stored*/
617 vst1_s16(pi2_dst + 6 * dst_strd, row6);
618 /* Truncating last 11 bits in H2*/
619 row2 = vrshrn_n_s32(h2, 11);
620 /*First half-row of row 3 of transform stage 2 (H2) stored*/
621 vst1_s16(pi2_dst + 2 * dst_strd, row2);
622
623 h1 = vmulq_n_s32(o0_2, 89); /* H1 = 89*(B0 - B7) */
624 h3 = vmulq_n_s32(o0_2, 75); /* H3 = 75*(B0 - B7) */
625 h5 = vmulq_n_s32(o0_2, 50); /* H5 = 50*(B0 - B7) */
626 h7 = vmulq_n_s32(o0_2, 18); /* H7 = 18*(B0 - B7) */
627
628 h7 = vmlsq_n_s32(h7, o1_2, 50); /* H7 = 18*(B0 - B7) - 50*(B1 - B6) */
629 h5 = vmlsq_n_s32(h5, o1_2, 89); /* H5 = 50*(B0 - B7) - 89*(B1 - B6) */
630 h3 = vmlsq_n_s32(h3, o1_2, 18); /* H3 = 75*(B0 - B7) - 18*(B1 - B6) */
631 h1 = vmlaq_n_s32(h1, o1_2, 75); /* H1 = 89*(B0 - B7) + 75*(B1 - B6) */
632
633 /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) */
634 h1 = vmlaq_n_s32(h1, o2_2, 50);
635 /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) */
636 h3 = vmlsq_n_s32(h3, o2_2, 89);
637 /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) */
638 h5 = vmlaq_n_s32(h5, o2_2, 18);
639 /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) */
640 h7 = vmlaq_n_s32(h7, o2_2, 75);
641
642 /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) */
643 h7 = vmlsq_n_s32(h7, o3_2, 89);
644 /* Truncating last 11 bits in H7*/
645 row7 = vrshrn_n_s32(h7, 11);
646 /*First half-row of row 8 of transform stage 2 (H7) stored*/
647 vst1_s16(pi2_dst + 7 * dst_strd, row7);
648 /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) */
649 h5 = vmlaq_n_s32(h5, o3_2, 75);
650 /* Truncating last 11 bits in H5*/
651 row5 = vrshrn_n_s32(h5, 11);
652 /*First half-row of row 6 of transform stage 2 (H5) stored*/
653 vst1_s16(pi2_dst + 5 * dst_strd, row5);
654 /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) */
655 h3 = vmlsq_n_s32(h3, o3_2, 50);
656 /* Truncating last 11 bits in H3*/
657 row3 = vrshrn_n_s32(h3, 11);
658 /*First half-row of row 4 of transform stage 2 (H3) stored*/
659 vst1_s16(pi2_dst + 3 * dst_strd, row3);
660 /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) */
661 h1 = vmlaq_n_s32(h1, o3_2, 18);
662 /* Truncating last 11 bits in H1*/
663 row1 = vrshrn_n_s32(h1, 11);
664 /*First half-row of row 2 of transform stage 2 (H1) stored*/
665 vst1_s16(pi2_dst + dst_strd, row1);
666 }
667
668 pi2_dst += 4;
669
670 {
671 int32x4_t h0, h1, h2, h3, h4, h5, h6, h7;
672 int32x4_t e0_2, e1_2, e2_2, e3_2;
673 int32x4_t o0_2, o1_2, o2_2, o3_2;
674 int32x4_t ee1_2, eo1_2, eo0_2, ee0_2;
675 int16x4_t row0, row1, row2, row3, row4, row5, row6, row7;
676
677 /*Transposing second half of transform stage 1 (1)*/
678 int32x4x2_t b1 = vtrnq_s32(a4.val[1], a5.val[1]);
679 int32x4x2_t b3 = vtrnq_s32(a6.val[1], a7.val[1]);
680 int32x4x2_t b0 = vtrnq_s32(a4.val[0], a5.val[0]);
681 int32x4x2_t b2 = vtrnq_s32(a6.val[0], a7.val[0]);
682
683 /*Transposing second half of transform stage 1 (2)*/
684 a0.val[0] = vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b2.val[0]));
685 a2.val[0] = vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b2.val[0]));
686 a1.val[0] = vcombine_s32(vget_low_s32(b0.val[1]), vget_low_s32(b2.val[1]));
687 a3.val[0] = vcombine_s32(vget_high_s32(b0.val[1]), vget_high_s32(b2.val[1]));
688 a0.val[1] = vcombine_s32(vget_low_s32(b1.val[0]), vget_low_s32(b3.val[0]));
689 a2.val[1] = vcombine_s32(vget_high_s32(b1.val[0]), vget_high_s32(b3.val[0]));
690 a1.val[1] = vcombine_s32(vget_low_s32(b1.val[1]), vget_low_s32(b3.val[1]));
691 a3.val[1] = vcombine_s32(vget_high_s32(b1.val[1]), vget_high_s32(b3.val[1]));
692
693 o0_2 = vsubq_s32(a0.val[0], a3.val[1]); /*B0 - B7*/
694 o1_2 = vsubq_s32(a1.val[0], a2.val[1]); /*B1 - B6*/
695 o2_2 = vsubq_s32(a2.val[0], a1.val[1]); /*B2 - B5*/
696 o3_2 = vsubq_s32(a3.val[0], a0.val[1]); /*B3 - B4*/
697 e3_2 = vaddq_s32(a3.val[0], a0.val[1]); /*B3 + B4*/
698 e2_2 = vaddq_s32(a2.val[0], a1.val[1]); /*B2 + B5*/
699 e1_2 = vaddq_s32(a1.val[0], a2.val[1]); /*B1 + B6*/
700 e0_2 = vaddq_s32(a0.val[0], a3.val[1]); /*B0 + B7*/
701
702 eo1_2 = vsubq_s32(e1_2, e2_2); /*B1 - B2 - B5 + B6*/
703 ee1_2 = vaddq_s32(e1_2, e2_2); /*B1 + B2 + B5 + B6*/
704 eo0_2 = vsubq_s32(e0_2, e3_2); /*B0 - B3 - B4 + B7*/
705 ee0_2 = vaddq_s32(e0_2, e3_2); /*B0 + B3 + B4 + B7*/
706
707 /* F4 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7*/
708 h4 = vsubq_s32(ee0_2, ee1_2);
709 /* F0 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7*/
710 h0 = vaddq_s32(ee0_2, ee1_2);
        /* Truncating last 5 bits in H0 (the stage-2 factor of 64 is folded into the shift)*/
712 row0 = vrshrn_n_s32(h0, 5);
        /*Second half-row of row 1 of transform stage 2 (H0) stored*/
714 vst1_s16(pi2_dst, row0);
        /* Truncating last 5 bits in H4 (the stage-2 factor of 64 is folded into the shift)*/
716 row4 = vrshrn_n_s32(h4, 5);
        /*Second half-row of row 5 of transform stage 2 (H4) stored*/
718 vst1_s16(pi2_dst + 4 * dst_strd, row4);
719
720 /* F6 = 36*(B0 - B3 - B4 + B7) */
721 h6 = vmulq_n_s32(eo0_2, 36);
722 /* F2 = 83*(B0 - B3 - B4 + B7) */
723 h2 = vmulq_n_s32(eo0_2, 83);
724 /*H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)*/
725 h2 = vmlaq_n_s32(h2, eo1_2, 36);
726 /*H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)*/
727 h6 = vmlsq_n_s32(h6, eo1_2, 83);
728 /* Truncating last 11 bits in H6*/
729 row6 = vrshrn_n_s32(h6, 11);
        /*Second half-row of row 7 of transform stage 2 (H6) stored*/
731 vst1_s16(pi2_dst + 6 * dst_strd, row6);
732 /* Truncating last 11 bits in H2*/
733 row2 = vrshrn_n_s32(h2, 11);
        /*Second half-row of row 3 of transform stage 2 (H2) stored*/
735 vst1_s16(pi2_dst + 2 * dst_strd, row2);
736
737 h1 = vmulq_n_s32(o0_2, 89); /* H1 = 89*(B0 - B7) */
738 h3 = vmulq_n_s32(o0_2, 75); /* H3 = 75*(B0 - B7) */
739 h5 = vmulq_n_s32(o0_2, 50); /* H5 = 50*(B0 - B7) */
740 h7 = vmulq_n_s32(o0_2, 18); /* H7 = 18*(B0 - B7) */
741
742 h7 = vmlsq_n_s32(h7, o1_2, 50); /* H7 = 18*(B0 - B7) - 50*(B1 - B6) */
743 h5 = vmlsq_n_s32(h5, o1_2, 89); /* H5 = 50*(B0 - B7) - 89*(B1 - B6) */
744 h3 = vmlsq_n_s32(h3, o1_2, 18); /* H3 = 75*(B0 - B7) - 18*(B1 - B6) */
745 h1 = vmlaq_n_s32(h1, o1_2, 75); /* H1 = 89*(B0 - B7) + 75*(B1 - B6) */
746
747 /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) */
748 h1 = vmlaq_n_s32(h1, o2_2, 50);
749 /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) */
750 h3 = vmlsq_n_s32(h3, o2_2, 89);
751 /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) */
752 h5 = vmlaq_n_s32(h5, o2_2, 18);
753 /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) */
754 h7 = vmlaq_n_s32(h7, o2_2, 75);
755
756 /* H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) */
757 h7 = vmlsq_n_s32(h7, o3_2, 89);
758 /* Truncating last 11 bits in H7*/
759 row7 = vrshrn_n_s32(h7, 11);
        /*Second half-row of row 8 of transform stage 2 (H7) stored*/
761 vst1_s16(pi2_dst + 7 * dst_strd, row7);
762 /* H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) */
763 h5 = vmlaq_n_s32(h5, o3_2, 75);
764 /* Truncating last 11 bits in H5*/
765 row5 = vrshrn_n_s32(h5, 11);
        /*Second half-row of row 6 of transform stage 2 (H5) stored*/
767 vst1_s16(pi2_dst + 5 * dst_strd, row5);
768 /* H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) */
769 h3 = vmlsq_n_s32(h3, o3_2, 50);
770 /* Truncating last 11 bits in H3*/
771 row3 = vrshrn_n_s32(h3, 11);
        /*Second half-row of row 4 of transform stage 2 (H3) stored*/
773 vst1_s16(pi2_dst + 3 * dst_strd, row3);
774 /* H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) */
775 h1 = vmlaq_n_s32(h1, o3_2, 18);
776 /* Truncating last 11 bits in H1*/
777 row1 = vrshrn_n_s32(h1, 11);
        /*Second half-row of row 2 of transform stage 2 (H1) stored*/
779 vst1_s16(pi2_dst + dst_strd, row1);
780 }
781 return sad;
782 }
783
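/* Load 16 rows of 8 pixels each; for chroma, de-interleave the two planes and
 * keep the one selected by e_chroma_plane. */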
static INLINE void load(const uint8_t *a, int stride, uint8x8_t *b,
                        CHROMA_PLANE_ID_T e_chroma_plane)
786 {
787 int i;
788
789 if(e_chroma_plane == NULL_PLANE)
790 {
791 for (i = 0; i < 16; i++)
792 {
793 b[i] = vld1_u8(a);
794 a += stride;
795 }
796 }
797 else
798 {
799 for (i = 0; i < 16; i++)
800 {
801 b[i] = vld2_u8(a).val[e_chroma_plane];
802 a += stride;
803 }
804 }
805 }
806
807 // Store 8 16x8 values, assuming stride == 16.
static INLINE void store(WORD16 *a, int16x8_t *b /*[8]*/)
809 {
810 int i;
811
812 for (i = 0; i < 8; i++)
813 {
814 vst1q_s16(a, b[i]);
815 a += 16;
816 }
817 }
818
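/* Initial butterfly of the 16-point transform:
 * b[i] = a[i] + a[15 - i] and b[8 + i] = a[7 - i] - a[8 + i] for i = 0..7. */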
static INLINE void cross_input_16(int16x8_t *a /*[16]*/, int16x8_t *b /*[16]*/)
820 {
821 b[0] = vaddq_s16(a[0], a[15]);
822 b[1] = vaddq_s16(a[1], a[14]);
823 b[2] = vaddq_s16(a[2], a[13]);
824 b[3] = vaddq_s16(a[3], a[12]);
825 b[4] = vaddq_s16(a[4], a[11]);
826 b[5] = vaddq_s16(a[5], a[10]);
827 b[6] = vaddq_s16(a[6], a[9]);
828 b[7] = vaddq_s16(a[7], a[8]);
829
830 b[8] = vsubq_s16(a[7], a[8]);
831 b[9] = vsubq_s16(a[6], a[9]);
832 b[10] = vsubq_s16(a[5], a[10]);
833 b[11] = vsubq_s16(a[4], a[11]);
834 b[12] = vsubq_s16(a[3], a[12]);
835 b[13] = vsubq_s16(a[2], a[13]);
836 b[14] = vsubq_s16(a[1], a[14]);
837 b[15] = vsubq_s16(a[0], a[15]);
838 }
839
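/* Same butterfly as cross_input_16, applied to the 32-bit second-pass data. */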
static INLINE void cross_input_32(int32x4x2_t *a /*[16][2]*/, int32x4x2_t *b /*[16][2]*/)
841 {
842 WORD32 i;
843 for(i = 0; i < 2; i++)
844 {
845 b[0].val[i] = vaddq_s32(a[0].val[i], a[15].val[i]);
846 b[1].val[i] = vaddq_s32(a[1].val[i], a[14].val[i]);
847 b[2].val[i] = vaddq_s32(a[2].val[i], a[13].val[i]);
848 b[3].val[i] = vaddq_s32(a[3].val[i], a[12].val[i]);
849 b[4].val[i] = vaddq_s32(a[4].val[i], a[11].val[i]);
850 b[5].val[i] = vaddq_s32(a[5].val[i], a[10].val[i]);
851 b[6].val[i] = vaddq_s32(a[6].val[i], a[9].val[i]);
852 b[7].val[i] = vaddq_s32(a[7].val[i], a[8].val[i]);
853
854 b[8].val[i] = vsubq_s32(a[7].val[i], a[8].val[i]);
855 b[9].val[i] = vsubq_s32(a[6].val[i], a[9].val[i]);
856 b[10].val[i] = vsubq_s32(a[5].val[i], a[10].val[i]);
857 b[11].val[i] = vsubq_s32(a[4].val[i], a[11].val[i]);
858 b[12].val[i] = vsubq_s32(a[3].val[i], a[12].val[i]);
859 b[13].val[i] = vsubq_s32(a[2].val[i], a[13].val[i]);
860 b[14].val[i] = vsubq_s32(a[1].val[i], a[14].val[i]);
861 b[15].val[i] = vsubq_s32(a[0].val[i], a[15].val[i]);
862 }
863 }
864
static INLINE int32x4_t diff(uint8x8_t *a /*[16]*/, uint8x8_t *b /*[16]*/, int16x8_t *c /*[16]*/)
866 {
867 int i;
868 int16x8_t abs = vdupq_n_s16(0);
869
870 for (i = 0; i < 16; i++)
871 {
872 c[i] = vreinterpretq_s16_u16(vsubl_u8(a[i], b[i]));
873 abs = vaddq_s16(abs, vabsq_s16(c[i]));
874 }
875 return vpaddlq_s16(abs);
876 }
877
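/* b[i] = (a[i] + (1 << 12)) >> 13, narrowed to 16 bits: the combined
 * down-shift of the two stages of the 16x16 forward transform. */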
static INLINE void partial_round_shift(int32x4x2_t *a, int16x8_t *b /*[16]*/)
879 {
880 WORD32 shift = 13, add;
881 add = 1 << (shift - 1);
882
883 const int32x4_t vecadd = vdupq_n_s32(add);
884 b[0] = vcombine_s16(
885 vshrn_n_s32(vaddq_s32(a[0].val[0], vecadd), 13),
886 vshrn_n_s32(vaddq_s32(a[0].val[1], vecadd), 13));
887 b[1] = vcombine_s16(
888 vshrn_n_s32(vaddq_s32(a[1].val[0], vecadd), 13),
889 vshrn_n_s32(vaddq_s32(a[1].val[1], vecadd), 13));
890 b[2] = vcombine_s16(
891 vshrn_n_s32(vaddq_s32(a[2].val[0], vecadd), 13),
892 vshrn_n_s32(vaddq_s32(a[2].val[1], vecadd), 13));
893 b[3] = vcombine_s16(
894 vshrn_n_s32(vaddq_s32(a[3].val[0], vecadd), 13),
895 vshrn_n_s32(vaddq_s32(a[3].val[1], vecadd), 13));
896 b[4] = vcombine_s16(
897 vshrn_n_s32(vaddq_s32(a[4].val[0], vecadd), 13),
898 vshrn_n_s32(vaddq_s32(a[4].val[1], vecadd), 13));
899 b[5] = vcombine_s16(
900 vshrn_n_s32(vaddq_s32(a[5].val[0], vecadd), 13),
901 vshrn_n_s32(vaddq_s32(a[5].val[1], vecadd), 13));
902 b[6] = vcombine_s16(
903 vshrn_n_s32(vaddq_s32(a[6].val[0], vecadd), 13),
904 vshrn_n_s32(vaddq_s32(a[6].val[1], vecadd), 13));
905 b[7] = vcombine_s16(
906 vshrn_n_s32(vaddq_s32(a[7].val[0], vecadd), 13),
907 vshrn_n_s32(vaddq_s32(a[7].val[1], vecadd), 13));
908 b[8] = vcombine_s16(
909 vshrn_n_s32(vaddq_s32(a[8].val[0], vecadd), 13),
910 vshrn_n_s32(vaddq_s32(a[8].val[1], vecadd), 13));
911 b[9] = vcombine_s16(
912 vshrn_n_s32(vaddq_s32(a[9].val[0], vecadd), 13),
913 vshrn_n_s32(vaddq_s32(a[9].val[1], vecadd), 13));
914 b[10] = vcombine_s16(
915 vshrn_n_s32(vaddq_s32(a[10].val[0], vecadd), 13),
916 vshrn_n_s32(vaddq_s32(a[10].val[1], vecadd), 13));
917 b[11] = vcombine_s16(
918 vshrn_n_s32(vaddq_s32(a[11].val[0], vecadd), 13),
919 vshrn_n_s32(vaddq_s32(a[11].val[1], vecadd), 13));
920 b[12] = vcombine_s16(
921 vshrn_n_s32(vaddq_s32(a[12].val[0], vecadd), 13),
922 vshrn_n_s32(vaddq_s32(a[12].val[1], vecadd), 13));
923 b[13] = vcombine_s16(
924 vshrn_n_s32(vaddq_s32(a[13].val[0], vecadd), 13),
925 vshrn_n_s32(vaddq_s32(a[13].val[1], vecadd), 13));
926 b[14] = vcombine_s16(
927 vshrn_n_s32(vaddq_s32(a[14].val[0], vecadd), 13),
928 vshrn_n_s32(vaddq_s32(a[14].val[1], vecadd), 13));
929 b[15] = vcombine_s16(
930 vshrn_n_s32(vaddq_s32(a[15].val[0], vecadd), 13),
931 vshrn_n_s32(vaddq_s32(a[15].val[1], vecadd), 13));
932 }
933
static INLINE int32x4_t
add4(int32x4_t row1_low, int32x4_t row1_high, int32x4_t row2_low, int32x4_t row2_high)
936 {
937 int32x4_t sum1, sum2;
938 sum1 = vaddq_s32(row1_low, row1_high);
939 sum2 = vaddq_s32(row2_low, row2_high);
940 return vaddq_s32(sum1, sum2);
941 }
942
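/* row1 = c*(a + b), row2 = c*(a - b), widening the 16-bit inputs to 32 bits. */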
static INLINE void butterfly_one_coeff_16_32(
    int16x8_t a, int16x8_t b, int16_t c, int32x4x2_t *row1, int32x4x2_t *row2)
945 {
946 const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
947 const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
949 row1->val[0] = vmlal_n_s16(a0, vget_low_s16(b), c);
950 row1->val[1] = vmlal_n_s16(a1, vget_high_s16(b), c);
951 row2->val[0] = vmlsl_n_s16(a0, vget_low_s16(b), c);
952 row2->val[1] = vmlsl_n_s16(a1, vget_high_s16(b), c);
953 }
954
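/* row1 = c1*a + c0*b, row2 = c0*a - c1*b, widening the 16-bit inputs to 32 bits. */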
static INLINE void butterfly_two_coeff_16_32(
    int16x8_t a, int16x8_t b, int16_t c0, int16_t c1, int32x4x2_t *row1, int32x4x2_t *row2)
957 {
958 const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
959 const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
960 const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
961 const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
962 row1->val[0] = vmlal_n_s16(a2, vget_low_s16(b), c0);
963 row1->val[1] = vmlal_n_s16(a3, vget_high_s16(b), c0);
964 row2->val[0] = vmlsl_n_s16(a0, vget_low_s16(b), c1);
965 row2->val[1] = vmlsl_n_s16(a1, vget_high_s16(b), c1);
966 }
967
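/* 32-bit variant of butterfly_one_coeff_16_32: row1 = c*(a + b), row2 = c*(a - b). */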
static INLINE void butterfly_one_coeff_32_32(
    int32x4x2_t a, int32x4x2_t b, int32_t c, int32x4x2_t *row1, int32x4x2_t *row2)
970 {
971 const int32x4_t a0 = vmulq_n_s32(a.val[0], c);
972 const int32x4_t a1 = vmulq_n_s32(a.val[1], c);
973 row1->val[0] = vmlaq_n_s32(a0, b.val[0], c);
974 row1->val[1] = vmlaq_n_s32(a1, b.val[1], c);
975 row2->val[0] = vmlsq_n_s32(a0, b.val[0], c);
976 row2->val[1] = vmlsq_n_s32(a1, b.val[1], c);
977 }
978
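/* 32-bit variant of butterfly_two_coeff_16_32: row1 = c1*a + c0*b, row2 = c0*a - c1*b. */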
static INLINE void butterfly_two_coeff_32_32(
    int32x4x2_t a, int32x4x2_t b, int32_t c0, int32_t c1, int32x4x2_t *row1, int32x4x2_t *row2)
981 {
982 const int32x4_t a0 = vmulq_n_s32(a.val[0], c0);
983 const int32x4_t a1 = vmulq_n_s32(a.val[1], c0);
984 const int32x4_t a2 = vmulq_n_s32(a.val[0], c1);
985 const int32x4_t a3 = vmulq_n_s32(a.val[1], c1);
986 row1->val[0] = vmlaq_n_s32(a2, b.val[0], c0);
987 row1->val[1] = vmlaq_n_s32(a3, b.val[1], c0);
988 row2->val[0] = vmlsq_n_s32(a0, b.val[0], c1);
989 row2->val[1] = vmlsq_n_s32(a1, b.val[1], c1);
990 }
991
992 // Transpose 8x8 to a new location. Don't use transpose_neon.h because those
993 // are all in-place.
static INLINE void transpose_8x8(int32x4x2_t *a /*[8][2]*/, int32x4x2_t *b)
995 {
996 const int32x4x2_t c0 = vtrnq_s32(a[0].val[0], a[1].val[0]);
997 const int32x4x2_t c1 = vtrnq_s32(a[2].val[0], a[3].val[0]);
998 const int32x4x2_t c2 = vtrnq_s32(a[4].val[0], a[5].val[0]);
999 const int32x4x2_t c3 = vtrnq_s32(a[6].val[0], a[7].val[0]);
1000 const int32x4x2_t c4 = vtrnq_s32(a[0].val[1], a[1].val[1]);
1001 const int32x4x2_t c5 = vtrnq_s32(a[2].val[1], a[3].val[1]);
1002 const int32x4x2_t c6 = vtrnq_s32(a[4].val[1], a[5].val[1]);
1003 const int32x4x2_t c7 = vtrnq_s32(a[6].val[1], a[7].val[1]);
1004
1005 const int32x4x2_t d0 = vtrnq_s64_to_s32(c0.val[0], c1.val[0]);
1006 const int32x4x2_t d1 = vtrnq_s64_to_s32(c0.val[1], c1.val[1]);
1007 const int32x4x2_t d2 = vtrnq_s64_to_s32(c2.val[0], c3.val[0]);
1008 const int32x4x2_t d3 = vtrnq_s64_to_s32(c2.val[1], c3.val[1]);
1009 const int32x4x2_t d4 = vtrnq_s64_to_s32(c4.val[0], c5.val[0]);
1010 const int32x4x2_t d5 = vtrnq_s64_to_s32(c4.val[1], c5.val[1]);
1011 const int32x4x2_t d6 = vtrnq_s64_to_s32(c6.val[0], c7.val[0]);
1012 const int32x4x2_t d7 = vtrnq_s64_to_s32(c6.val[1], c7.val[1]);
1013
1014 b[0].val[0] = d0.val[0];
1015 b[0].val[1] = d2.val[0];
1016 b[1].val[0] = d1.val[0];
1017 b[1].val[1] = d3.val[0];
1018 b[2].val[0] = d0.val[1];
1019 b[2].val[1] = d2.val[1];
1020 b[3].val[0] = d1.val[1];
1021 b[3].val[1] = d3.val[1];
1022 b[4].val[0] = d4.val[0];
1023 b[4].val[1] = d6.val[0];
1024 b[5].val[0] = d5.val[0];
1025 b[5].val[1] = d7.val[0];
1026 b[6].val[0] = d4.val[1];
1027 b[6].val[1] = d6.val[1];
1028 b[7].val[0] = d5.val[1];
1029 b[7].val[1] = d7.val[1];
1030 }
1031
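/* First pass of the 16-point forward transform: 16-bit butterflied residues in
 * (in[0..7] = sums, in[8..15] = differences from cross_input_16), 32-bit
 * coefficients out, with out[k] holding frequency k. */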
static void dct_body_16_32(int16x8_t *in /*[16]*/, int32x4x2_t *out /*[16]*/)
1033 {
1034 int16x8_t s[8];
1035 int16x8_t x[4];
1036 int32x4x2_t tmp0, tmp1, tmp2, tmp3;
1037 int32x4x2_t tmp4, tmp5, tmp6, tmp7;
1038
1039 s[0] = vaddq_s16(in[0], in[7]);
1040 s[1] = vaddq_s16(in[1], in[6]);
1041 s[2] = vaddq_s16(in[2], in[5]);
1042 s[3] = vaddq_s16(in[3], in[4]);
1043 s[4] = vsubq_s16(in[3], in[4]);
1044 s[5] = vsubq_s16(in[2], in[5]);
1045 s[6] = vsubq_s16(in[1], in[6]);
1046 s[7] = vsubq_s16(in[0], in[7]);
1047
1048 x[0] = vaddq_s16(s[0], s[3]);
1049 x[1] = vaddq_s16(s[1], s[2]);
1050 x[2] = vsubq_s16(s[1], s[2]);
1051 x[3] = vsubq_s16(s[0], s[3]);
1052
1053 // Type 1
    // out[0] = 64 * (x0 + x1)
    // out[8] = 64 * (x0 - x1)
1056 butterfly_one_coeff_16_32(x[0], x[1], 64, &out[0], &out[8]);
1057
    // out[4] = 83*x3 + 36*x2
    // out[12] = 36*x3 - 83*x2
1060 butterfly_two_coeff_16_32(x[3], x[2], 36, 83, &out[4], &out[12]);
1061
1062 // Type 2
1063 butterfly_two_coeff_16_32(s[7], s[4], 18, 89, &tmp0, &tmp1);
1064 butterfly_two_coeff_16_32(s[5], s[6], 75, 50, &tmp2, &tmp3);
1065
1066 out[2].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1067 out[2].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1068
1069 out[14].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1070 out[14].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1071
1072 butterfly_two_coeff_16_32(s[7], s[4], 75, 50, &tmp0, &tmp1);
1073 butterfly_two_coeff_16_32(s[5], s[6], -89, 18, &tmp2, &tmp3);
1074
1075 out[10].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1076 out[10].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1077
1078 out[6].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1079 out[6].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1080
1081 // Type 3
1082 butterfly_two_coeff_16_32(in[8], in[15], 9, -90, &tmp0, &tmp1);
1083 butterfly_two_coeff_16_32(in[9], in[14], 87, 25, &tmp2, &tmp3);
1084 butterfly_two_coeff_16_32(in[10], in[13], 43, -80, &tmp4, &tmp5);
1085 butterfly_two_coeff_16_32(in[11], in[12], 70, 57, &tmp6, &tmp7);
1086
1087 out[1].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1088 out[1].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1089
1090 out[15].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1091 out[15].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1092
1093 butterfly_two_coeff_16_32(in[8], in[15], 87, -25, &tmp0, &tmp1);
1094 butterfly_two_coeff_16_32(in[9], in[14], -70, -57, &tmp2, &tmp3);
1095 butterfly_two_coeff_16_32(in[10], in[13], 9, -90, &tmp4, &tmp5);
1096 butterfly_two_coeff_16_32(in[11], in[12], -80, 43, &tmp6, &tmp7);
1097
1098 out[3].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1099 out[3].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1100
1101 out[13].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1102 out[13].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1103
1104 butterfly_two_coeff_16_32(in[8], in[15], 43, -80, &tmp0, &tmp1);
1105 butterfly_two_coeff_16_32(in[9], in[14], 9, 90, &tmp2, &tmp3);
1106 butterfly_two_coeff_16_32(in[10], in[13], 57, 70, &tmp4, &tmp5);
1107 butterfly_two_coeff_16_32(in[11], in[12], -87, -25, &tmp6, &tmp7);
1108
1109 out[5].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1110 out[5].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1111
1112 out[11].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1113 out[11].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1114
1115 butterfly_two_coeff_16_32(in[8], in[15], 70, -57, &tmp0, &tmp1);
1116 butterfly_two_coeff_16_32(in[9], in[14], -80, 43, &tmp2, &tmp3);
1117 butterfly_two_coeff_16_32(in[10], in[13], -87, 25, &tmp4, &tmp5);
1118 butterfly_two_coeff_16_32(in[11], in[12], 90, -9, &tmp6, &tmp7);
1119
1120 out[7].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1121 out[7].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1122
1123 out[9].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1124 out[9].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1125 }
1126
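/* Same butterflies as dct_body_16_32, operating on the 32-bit data of the
 * second pass. */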
static void dct_body_32_32(int32x4x2_t *in /*[16]*/, int32x4x2_t *out /*[16]*/)
1128 {
1129 int32x4x2_t s[8];
1130 int32x4x2_t x[4];
1131 int32x4x2_t tmp0, tmp1, tmp2, tmp3;
1132 int32x4x2_t tmp4, tmp5, tmp6, tmp7;
1133 WORD32 i;
1134
1135 for(i = 0; i < 2; i++)
1136 {
1137 s[0].val[i] = vaddq_s32(in[0].val[i], in[7].val[i]);
1138 s[1].val[i] = vaddq_s32(in[1].val[i], in[6].val[i]);
1139 s[2].val[i] = vaddq_s32(in[2].val[i], in[5].val[i]);
1140 s[3].val[i] = vaddq_s32(in[3].val[i], in[4].val[i]);
1141 s[4].val[i] = vsubq_s32(in[3].val[i], in[4].val[i]);
1142 s[5].val[i] = vsubq_s32(in[2].val[i], in[5].val[i]);
1143 s[6].val[i] = vsubq_s32(in[1].val[i], in[6].val[i]);
1144 s[7].val[i] = vsubq_s32(in[0].val[i], in[7].val[i]);
1145
1146 x[0].val[i] = vaddq_s32(s[0].val[i], s[3].val[i]);
1147 x[1].val[i] = vaddq_s32(s[1].val[i], s[2].val[i]);
1148 x[2].val[i] = vsubq_s32(s[1].val[i], s[2].val[i]);
1149 x[3].val[i] = vsubq_s32(s[0].val[i], s[3].val[i]);
1150 }
1151
1152 // Type 1
    // out[0] = 64 * (x0 + x1)
    // out[8] = 64 * (x0 - x1)
1155 butterfly_one_coeff_32_32(x[0], x[1], 64, &out[0], &out[8]);
    // out[4] = 83*x3 + 36*x2
    // out[12] = 36*x3 - 83*x2
1158 butterfly_two_coeff_32_32(x[3], x[2], 36, 83, &out[4], &out[12]);
1159
1160 // Type 2
1161 butterfly_two_coeff_32_32(s[7], s[4], 18, 89, &tmp0, &tmp1);
1162 butterfly_two_coeff_32_32(s[5], s[6], 75, 50, &tmp2, &tmp3);
1163
1164 out[2].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1165 out[2].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1166
1167 out[14].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1168 out[14].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1169
1170 butterfly_two_coeff_32_32(s[7], s[4], 75, 50, &tmp0, &tmp1);
1171 butterfly_two_coeff_32_32(s[5], s[6], -89, 18, &tmp2, &tmp3);
1172
1173 out[10].val[0] = vaddq_s32(tmp0.val[0], tmp2.val[0]);
1174 out[10].val[1] = vaddq_s32(tmp0.val[1], tmp2.val[1]);
1175
1176 out[6].val[0] = vaddq_s32(tmp1.val[0], tmp3.val[0]);
1177 out[6].val[1] = vaddq_s32(tmp1.val[1], tmp3.val[1]);
1178
1179 // Type 3
1180 butterfly_two_coeff_32_32(in[8], in[15], 9, -90, &tmp0, &tmp1);
1181 butterfly_two_coeff_32_32(in[9], in[14], 87, 25, &tmp2, &tmp3);
1182 butterfly_two_coeff_32_32(in[10], in[13], 43, -80, &tmp4, &tmp5);
1183 butterfly_two_coeff_32_32(in[11], in[12], 70, 57, &tmp6, &tmp7);
1184
1185 out[1].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1186 out[1].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1187
1188 out[15].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1189 out[15].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1190
1191 butterfly_two_coeff_32_32(in[8], in[15], 87, -25, &tmp0, &tmp1);
1192 butterfly_two_coeff_32_32(in[9], in[14], -70, -57, &tmp2, &tmp3);
1193 butterfly_two_coeff_32_32(in[10], in[13], 9, -90, &tmp4, &tmp5);
1194 butterfly_two_coeff_32_32(in[11], in[12], -80, 43, &tmp6, &tmp7);
1195
1196 out[3].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1197 out[3].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1198
1199 out[13].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1200 out[13].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1201
1202 butterfly_two_coeff_32_32(in[8], in[15], 43, -80, &tmp0, &tmp1);
1203 butterfly_two_coeff_32_32(in[9], in[14], 9, 90, &tmp2, &tmp3);
1204 butterfly_two_coeff_32_32(in[10], in[13], 57, 70, &tmp4, &tmp5);
1205 butterfly_two_coeff_32_32(in[11], in[12], -87, -25, &tmp6, &tmp7);
1206
1207 out[5].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1208 out[5].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1209
1210 out[11].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1211 out[11].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1212
1213 butterfly_two_coeff_32_32(in[8], in[15], 70, -57, &tmp0, &tmp1);
1214 butterfly_two_coeff_32_32(in[9], in[14], -80, 43, &tmp2, &tmp3);
1215 butterfly_two_coeff_32_32(in[10], in[13], -87, 25, &tmp4, &tmp5);
1216 butterfly_two_coeff_32_32(in[11], in[12], 90, -9, &tmp6, &tmp7);
1217
1218 out[7].val[0] = add4(tmp0.val[0], tmp3.val[0], tmp4.val[0], tmp7.val[0]);
1219 out[7].val[1] = add4(tmp0.val[1], tmp3.val[1], tmp4.val[1], tmp7.val[1]);
1220
1221 out[9].val[0] = add4(tmp1.val[0], tmp2.val[0], tmp5.val[0], tmp6.val[0]);
1222 out[9].val[1] = add4(tmp1.val[1], tmp2.val[1], tmp5.val[1], tmp6.val[1]);
1223 }
1224
1225 /**
1226 *******************************************************************************
1227 *
1228 * @brief
1229 * This function performs residue calculation and forward transform on
1230 * input pixels
1231 *
1232 * @par Description:
* Performs residue calculation by subtracting the prediction from the source,
* followed by the forward transform
1235 *
1236 * @param[in] pu1_src
1237 * Input 16x16 pixels
1238 *
1239 * @param[in] pu1_pred
1240 * Prediction data
1241 *
* @param[in] pi4_temp
*  Temporary buffer (unused)
1244 *
1245 * @param[out] pi2_dst
1246 * Output 16x16 coefficients
1247 *
1248 * @param[in] src_strd
1249 * Input stride
1250 *
1251 * @param[in] pred_strd
1252 * Prediction Stride
1253 *
1254 * @param[in] dst_strd
1255 * Output Stride
1256 *
1257 * @param[in] e_chroma_plane
* Enum signalling chroma plane
*
* @returns block sad
1261 *
1262 * @remarks
1263 * None
1264 *
1265 *******************************************************************************
1266 */
UWORD32 ihevc_resi_trans_16x16_neon(
    UWORD8 *pu1_src,
    UWORD8 *pu1_pred,
    WORD32 *pi4_temp,
    WORD16 *pi2_dst,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 dst_strd,
    CHROMA_PLANE_ID_T e_chroma_plane)
1276 {
1277 UWORD32 u4_blk_sad = 0;
1278 WORD32 chroma_flag;
1279 uint8x8_t temp0[16], temp1[16];
1280 int16x8_t temp2[16], temp3[16];
1281 int32x4_t tmp_a, tmp_b;
1282 int64x2_t tmp_c;
1283 int32x2_t sad_v;
1284 int32x4x2_t out0[16], out1[16], temp4[16], temp5[16];
1285
1286 (void)pi4_temp;
1287 chroma_flag = e_chroma_plane != NULL_PLANE;
1288 /* Residue + Forward Transform 1st stage */
1289 // Left half.
1290 load(pu1_src, src_strd, temp0, e_chroma_plane);
1291 load(pu1_pred, pred_strd, temp1, e_chroma_plane);
1292
1293 tmp_a = diff(temp0, temp1, temp2);
1294 cross_input_16(temp2, temp3);
1295 dct_body_16_32(temp3, out0);
1296
1297 // Right half.
1298 load(pu1_src + 8 * (1 + chroma_flag), src_strd, temp0, e_chroma_plane);
1299 load(pu1_pred + 8 * (1 + chroma_flag), pred_strd, temp1, e_chroma_plane);
1300
1301 tmp_b = diff(temp0, temp1, temp2);
1302 cross_input_16(temp2, temp3);
1303 dct_body_16_32(temp3, out1);
1304
1305 tmp_a = vaddq_s32(tmp_a, tmp_b);
1306 tmp_c = vpaddlq_s32(tmp_a);
1307 sad_v = vadd_s32(vreinterpret_s32_s64(vget_low_s64(tmp_c)),
1308 vreinterpret_s32_s64(vget_high_s64(tmp_c)));
1309 u4_blk_sad = vget_lane_s32(sad_v, 0);
1310
1311
    // Transpose the top-left and top-right quarters into one contiguous
    // location to process the top half.
1314 transpose_8x8(&out0[0], &temp4[0]);
1315 transpose_8x8(&out1[0], &temp4[8]);
1316
1317 cross_input_32(temp4, temp5);
1318 dct_body_32_32(temp5, temp4);
1319 partial_round_shift(temp4, temp2);
1320 transpose_s16_8x8(
1321 &temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]);
1322 transpose_s16_8x8(
1323 &temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], &temp2[13], &temp2[14], &temp2[15]);
1324
1325 store(pi2_dst, &temp2[0]);
1326 store(pi2_dst + 8, &temp2[8]);
1327 pi2_dst += 8 * dst_strd;
1328
    // Transpose the bottom-left and bottom-right quarters into one contiguous
    // location to process the bottom half.
1331 transpose_8x8(&out0[8], &out1[0]);
1332 transpose_s32_8x8(
1333 &out1[8], &out1[9], &out1[10], &out1[11], &out1[12], &out1[13], &out1[14], &out1[15]);
1334
1335 cross_input_32(out1, temp5);
1336 dct_body_32_32(temp5, temp4);
1337 partial_round_shift(temp4, temp2);
1338 transpose_s16_8x8(
1339 &temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]);
1340 transpose_s16_8x8(
1341 &temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], &temp2[13], &temp2[14], &temp2[15]);
1342 store(pi2_dst, &temp2[0]);
1343 store(pi2_dst + 8, &temp2[8]);
1344
1345 return u4_blk_sad;
1346 }
1347