/*
 * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
23*dfc6aa5cSAndroid Build Coastguard Worker
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"

#include <arm_neon.h>
35*dfc6aa5cSAndroid Build Coastguard Worker
36*dfc6aa5cSAndroid Build Coastguard Worker
/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
 * (Discrete Cosine Transform) on one block of samples. It uses the same
 * calculations and produces exactly the same output as IJG's original
 * jpeg_fdct_islow() function, which can be found in jfdctint.c.
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.298631336 =  2446 * 2^-13
 *    0.390180644 =  3196 * 2^-13
 *    0.541196100 =  4433 * 2^-13
 *    0.765366865 =  6270 * 2^-13
 *    0.899976223 =  7373 * 2^-13
 *    1.175875602 =  9633 * 2^-13
 *    1.501321110 = 12299 * 2^-13
 *    1.847759065 = 15137 * 2^-13
 *    1.961570560 = 16069 * 2^-13
 *    2.053119869 = 16819 * 2^-13
 *    2.562915447 = 20995 * 2^-13
 *    3.072711026 = 25172 * 2^-13
 *
 * See jfdctint.c for further details of the DCT algorithm. Where possible,
 * the variable names and comments here in jsimd_fdct_islow_neon() match up
 * with those in jpeg_fdct_islow().
 */
60*dfc6aa5cSAndroid Build Coastguard Worker
/* Fixed-point layout.  The F_* multipliers below are real-valued constants
 * scaled by 2^CONST_BITS; PASS1_BITS is the number of extra fractional bits
 * that the first DCT pass leaves in its results.
 */
#define CONST_BITS  13
#define PASS1_BITS  2

/* Rounding right-shift amounts applied to the 32-bit products:
 * DESCALE_P1 after pass 1 (keeps 2^PASS1_BITS of scaling in the result),
 * DESCALE_P2 after pass 2 (also removes the scaling pass 1 left behind).
 */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)

/* F_a_bcd is the value a.bcd... rounded to the nearest integer after
 * multiplication by 2^13, e.g. F_0_541 = round(0.541196100 * 8192) = 4433.
 */
#define F_0_298  2446
#define F_0_390  3196
#define F_0_541  4433
#define F_0_765  6270
#define F_0_899  7373
#define F_1_175  9633
#define F_1_501  12299
#define F_1_847  15137
#define F_1_961  16069
#define F_2_053  16819
#define F_2_562  20995
#define F_3_072  25172
79*dfc6aa5cSAndroid Build Coastguard Worker
80*dfc6aa5cSAndroid Build Coastguard Worker
81*dfc6aa5cSAndroid Build Coastguard Worker ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
82*dfc6aa5cSAndroid Build Coastguard Worker F_0_298, -F_0_390, F_0_541, F_0_765,
83*dfc6aa5cSAndroid Build Coastguard Worker -F_0_899, F_1_175, F_1_501, -F_1_847,
84*dfc6aa5cSAndroid Build Coastguard Worker -F_1_961, F_2_053, -F_2_562, F_3_072
85*dfc6aa5cSAndroid Build Coastguard Worker };
86*dfc6aa5cSAndroid Build Coastguard Worker
/* Forward DCT (accurate integer method) of one block, computed in place.
 *
 * data: on entry, the samples of the block stored as rows of DCTSIZE
 *       elements; on return, the same storage holds the DCT coefficients.
 *       The two 32-element de-interleaving loads below assume that four rows
 *       are exactly 32 contiguous elements, i.e. an 8x8 block.  The s16
 *       load/store intrinsics are applied to `data` directly, so DCTELEM is
 *       expected to be a 16-bit signed type.
 *
 * The transform is separable: pass 1 applies the 1-D DCT along every row and
 * leaves its results scaled up by 2^PASS1_BITS, the block is transposed, and
 * pass 2 applies the same 1-D DCT along every column while shifting that
 * extra scaling back out.  Every multiplication by a fixed-point constant
 * widens to 32 bits and is narrowed back to 16 bits with a rounding right
 * shift.
 */
void jsimd_fdct_islow_neon(DCTELEM *data)
{
  /* Load DCT constants.  consts.val[r] is row r of
   * jsimd_fdct_islow_neon_consts; the lane index selects the column.
   */
#ifdef HAVE_VLD1_S16_X3
  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
#else
  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif

  /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
   * are used, followed by vuzp to transpose the block such that we have a
   * column of samples per vector - allowing all rows to be processed at once.
   *
   * vld4q_s16 distributes 32 consecutive elements over four vectors by
   * (index mod 4), so val[k] holds columns k and k + 4 of four rows in
   * alternating lanes.
   */
  int16x8x4_t s_rows_0123 = vld4q_s16(data);
  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);

  /* vuzpq_s16 gathers the even lanes of both inputs into val[0] (column k of
   * rows 0-7) and the odd lanes into val[1] (column k + 4 of rows 0-7).
   */
  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);

  int16x8_t col0 = cols_04.val[0];
  int16x8_t col1 = cols_15.val[0];
  int16x8_t col2 = cols_26.val[0];
  int16x8_t col3 = cols_37.val[0];
  int16x8_t col4 = cols_04.val[1];
  int16x8_t col5 = cols_15.val[1];
  int16x8_t col6 = cols_26.val[1];
  int16x8_t col7 = cols_37.val[1];

  /* Pass 1: process rows. */

  /* Butterfly stage: sums feed the even part, differences the odd part. */
  int16x8_t tmp0 = vaddq_s16(col0, col7);
  int16x8_t tmp7 = vsubq_s16(col0, col7);
  int16x8_t tmp1 = vaddq_s16(col1, col6);
  int16x8_t tmp6 = vsubq_s16(col1, col6);
  int16x8_t tmp2 = vaddq_s16(col2, col5);
  int16x8_t tmp5 = vsubq_s16(col2, col5);
  int16x8_t tmp3 = vaddq_s16(col3, col4);
  int16x8_t tmp4 = vsubq_s16(col3, col4);

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);

  /* Outputs 0 and 4 need no multiplication; they are only scaled up by
   * 2^PASS1_BITS to match the other pass 1 outputs.
   */
  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);

  /* z1 = (tmp12 + tmp13) * F_0_541, kept in 32 bits (low/high halves). */
  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
  int32x4_t z1_l =
    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
  int32x4_t z1_h =
    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);

  /* Output 2 = z1 + tmp13 * F_0_765 */
  int32x4_t col2_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
  int32x4_t col2_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));

  /* Output 6 = z1 + tmp12 * (-F_1_847) */
  int32x4_t col6_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
  int32x4_t col6_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));

  /* Odd part */
  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
  /* sqrt(2) * c3 */
  /* z5 = (z3 + z4) * F_1_175, formed as a multiply plus multiply-accumulate
   * so that the sum z3 + z4 is never narrowed to 16 bits.
   */
  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);

  /* sqrt(2) * (-c1+c3+c5-c7) */
  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
  /* sqrt(2) * ( c1+c3-c5+c7) */
  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
  /* sqrt(2) * ( c1+c3+c5-c7) */
  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
  /* sqrt(2) * ( c1+c3-c5-c7) */
  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);

  /* sqrt(2) * (c7-c3) */
  /* z1_l/z1_h are reused here: the even-part product they held has already
   * been consumed by outputs 2 and 6.
   */
  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
  /* sqrt(2) * (-c1-c3) */
  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
  /* sqrt(2) * (-c3-c5) */
  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
  /* sqrt(2) * (c5-c3) */
  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);

  z3_l = vaddq_s32(z3_l, z5_l);
  z3_h = vaddq_s32(z3_h, z5_h);
  z4_l = vaddq_s32(z4_l, z5_l);
  z4_h = vaddq_s32(z4_h, z5_h);

  /* Output 7 = tmp4 + z1 + z3 */
  tmp4_l = vaddq_s32(tmp4_l, z1_l);
  tmp4_h = vaddq_s32(tmp4_h, z1_h);
  tmp4_l = vaddq_s32(tmp4_l, z3_l);
  tmp4_h = vaddq_s32(tmp4_h, z3_h);
  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
                      vrshrn_n_s32(tmp4_h, DESCALE_P1));

  /* Output 5 = tmp5 + z2 + z4 */
  tmp5_l = vaddq_s32(tmp5_l, z2_l);
  tmp5_h = vaddq_s32(tmp5_h, z2_h);
  tmp5_l = vaddq_s32(tmp5_l, z4_l);
  tmp5_h = vaddq_s32(tmp5_h, z4_h);
  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
                      vrshrn_n_s32(tmp5_h, DESCALE_P1));

  /* Output 3 = tmp6 + z2 + z3 */
  tmp6_l = vaddq_s32(tmp6_l, z2_l);
  tmp6_h = vaddq_s32(tmp6_h, z2_h);
  tmp6_l = vaddq_s32(tmp6_l, z3_l);
  tmp6_h = vaddq_s32(tmp6_h, z3_h);
  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
                      vrshrn_n_s32(tmp6_h, DESCALE_P1));

  /* Output 1 = tmp7 + z1 + z4 */
  tmp7_l = vaddq_s32(tmp7_l, z1_l);
  tmp7_h = vaddq_s32(tmp7_h, z1_h);
  tmp7_l = vaddq_s32(tmp7_l, z4_l);
  tmp7_h = vaddq_s32(tmp7_h, z4_h);
  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
                      vrshrn_n_s32(tmp7_h, DESCALE_P1));

  /* Transpose to work on columns in pass 2. */
  /* Three stages: 16-bit transposes of adjacent pairs, 32-bit transposes of
   * those results, then 32-bit zips to assemble each full row.
   */
  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);

  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
                                      vreinterpretq_s32_s16(cols_45.val[0]));
  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
                                      vreinterpretq_s32_s16(cols_45.val[1]));
  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
                                      vreinterpretq_s32_s16(cols_67.val[0]));
  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
                                      vreinterpretq_s32_s16(cols_67.val[1]));

  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);

  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);

  /* Pass 2: process columns. */
  /* Same arithmetic as pass 1; only the final shifts differ (rounding right
   * shift by PASS1_BITS for outputs 0/4, DESCALE_P2 for the others).
   */

  tmp0 = vaddq_s16(row0, row7);
  tmp7 = vsubq_s16(row0, row7);
  tmp1 = vaddq_s16(row1, row6);
  tmp6 = vsubq_s16(row1, row6);
  tmp2 = vaddq_s16(row2, row5);
  tmp5 = vsubq_s16(row2, row5);
  tmp3 = vaddq_s16(row3, row4);
  tmp4 = vsubq_s16(row3, row4);

  /* Even part */
  tmp10 = vaddq_s16(tmp0, tmp3);
  tmp13 = vsubq_s16(tmp0, tmp3);
  tmp11 = vaddq_s16(tmp1, tmp2);
  tmp12 = vsubq_s16(tmp1, tmp2);

  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);

  /* z1 = (tmp12 + tmp13) * F_0_541 */
  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);

  /* Output 2 = z1 + tmp13 * F_0_765 */
  int32x4_t row2_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
  int32x4_t row2_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));

  /* Output 6 = z1 + tmp12 * (-F_1_847) */
  int32x4_t row6_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
  int32x4_t row6_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));

  /* Odd part */
  z1 = vaddq_s16(tmp4, tmp7);
  z2 = vaddq_s16(tmp5, tmp6);
  z3 = vaddq_s16(tmp4, tmp6);
  z4 = vaddq_s16(tmp5, tmp7);
  /* sqrt(2) * c3 */
  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);

  /* sqrt(2) * (-c1+c3+c5-c7) */
  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
  /* sqrt(2) * ( c1+c3-c5+c7) */
  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
  /* sqrt(2) * ( c1+c3+c5-c7) */
  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
  /* sqrt(2) * ( c1+c3-c5-c7) */
  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);

  /* sqrt(2) * (c7-c3) */
  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
  /* sqrt(2) * (-c1-c3) */
  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
  /* sqrt(2) * (-c3-c5) */
  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
  /* sqrt(2) * (c5-c3) */
  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);

  z3_l = vaddq_s32(z3_l, z5_l);
  z3_h = vaddq_s32(z3_h, z5_h);
  z4_l = vaddq_s32(z4_l, z5_l);
  z4_h = vaddq_s32(z4_h, z5_h);

  /* Output 7 = tmp4 + z1 + z3 */
  tmp4_l = vaddq_s32(tmp4_l, z1_l);
  tmp4_h = vaddq_s32(tmp4_h, z1_h);
  tmp4_l = vaddq_s32(tmp4_l, z3_l);
  tmp4_h = vaddq_s32(tmp4_h, z3_h);
  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
                      vrshrn_n_s32(tmp4_h, DESCALE_P2));

  /* Output 5 = tmp5 + z2 + z4 */
  tmp5_l = vaddq_s32(tmp5_l, z2_l);
  tmp5_h = vaddq_s32(tmp5_h, z2_h);
  tmp5_l = vaddq_s32(tmp5_l, z4_l);
  tmp5_h = vaddq_s32(tmp5_h, z4_h);
  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
                      vrshrn_n_s32(tmp5_h, DESCALE_P2));

  /* Output 3 = tmp6 + z2 + z3 */
  tmp6_l = vaddq_s32(tmp6_l, z2_l);
  tmp6_h = vaddq_s32(tmp6_h, z2_h);
  tmp6_l = vaddq_s32(tmp6_l, z3_l);
  tmp6_h = vaddq_s32(tmp6_h, z3_h);
  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
                      vrshrn_n_s32(tmp6_h, DESCALE_P2));

  /* Output 1 = tmp7 + z1 + z4 */
  tmp7_l = vaddq_s32(tmp7_l, z1_l);
  tmp7_h = vaddq_s32(tmp7_h, z1_h);
  tmp7_l = vaddq_s32(tmp7_l, z4_l);
  tmp7_h = vaddq_s32(tmp7_h, z4_h);
  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
                      vrshrn_n_s32(tmp7_h, DESCALE_P2));

  /* Write the coefficients back over the input block, one row per store. */
  vst1q_s16(data + 0 * DCTSIZE, row0);
  vst1q_s16(data + 1 * DCTSIZE, row1);
  vst1q_s16(data + 2 * DCTSIZE, row2);
  vst1q_s16(data + 3 * DCTSIZE, row3);
  vst1q_s16(data + 4 * DCTSIZE, row4);
  vst1q_s16(data + 5 * DCTSIZE, row5);
  vst1q_s16(data + 6 * DCTSIZE, row6);
  vst1q_s16(data + 7 * DCTSIZE, row7);
}
377