xref: /aosp_15_r20/external/libjpeg-turbo/simd/arm/jfdctfst-neon.c (revision dfc6aa5c1cfd4bc4e2018dc74aa96e29ee49c6da)
1*dfc6aa5cSAndroid Build Coastguard Worker /*
2*dfc6aa5cSAndroid Build Coastguard Worker  * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
3*dfc6aa5cSAndroid Build Coastguard Worker  *
4*dfc6aa5cSAndroid Build Coastguard Worker  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
5*dfc6aa5cSAndroid Build Coastguard Worker  *
6*dfc6aa5cSAndroid Build Coastguard Worker  * This software is provided 'as-is', without any express or implied
7*dfc6aa5cSAndroid Build Coastguard Worker  * warranty.  In no event will the authors be held liable for any damages
8*dfc6aa5cSAndroid Build Coastguard Worker  * arising from the use of this software.
9*dfc6aa5cSAndroid Build Coastguard Worker  *
10*dfc6aa5cSAndroid Build Coastguard Worker  * Permission is granted to anyone to use this software for any purpose,
11*dfc6aa5cSAndroid Build Coastguard Worker  * including commercial applications, and to alter it and redistribute it
12*dfc6aa5cSAndroid Build Coastguard Worker  * freely, subject to the following restrictions:
13*dfc6aa5cSAndroid Build Coastguard Worker  *
14*dfc6aa5cSAndroid Build Coastguard Worker  * 1. The origin of this software must not be misrepresented; you must not
15*dfc6aa5cSAndroid Build Coastguard Worker  *    claim that you wrote the original software. If you use this software
16*dfc6aa5cSAndroid Build Coastguard Worker  *    in a product, an acknowledgment in the product documentation would be
17*dfc6aa5cSAndroid Build Coastguard Worker  *    appreciated but is not required.
18*dfc6aa5cSAndroid Build Coastguard Worker  * 2. Altered source versions must be plainly marked as such, and must not be
19*dfc6aa5cSAndroid Build Coastguard Worker  *    misrepresented as being the original software.
20*dfc6aa5cSAndroid Build Coastguard Worker  * 3. This notice may not be removed or altered from any source distribution.
21*dfc6aa5cSAndroid Build Coastguard Worker  */
22*dfc6aa5cSAndroid Build Coastguard Worker 
23*dfc6aa5cSAndroid Build Coastguard Worker #define JPEG_INTERNALS
24*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jinclude.h"
25*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jpeglib.h"
26*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimd.h"
27*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jdct.h"
28*dfc6aa5cSAndroid Build Coastguard Worker #include "../../jsimddct.h"
29*dfc6aa5cSAndroid Build Coastguard Worker #include "../jsimd.h"
30*dfc6aa5cSAndroid Build Coastguard Worker #include "align.h"
31*dfc6aa5cSAndroid Build Coastguard Worker 
32*dfc6aa5cSAndroid Build Coastguard Worker #include <arm_neon.h>
33*dfc6aa5cSAndroid Build Coastguard Worker 
34*dfc6aa5cSAndroid Build Coastguard Worker 
/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
 * (Discrete Cosine Transform) on one block of samples.  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.382683433 = 12544 * 2^-15
 *    0.541196100 = 17792 * 2^-15
 *    0.707106781 = 23168 * 2^-15
 *    0.306562965 =  9984 * 2^-15
 *
 * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
 * the variable names and comments here in jsimd_fdct_ifast_neon() match up
 * with those in jpeg_fdct_ifast().
 */
50*dfc6aa5cSAndroid Build Coastguard Worker 
51*dfc6aa5cSAndroid Build Coastguard Worker #define F_0_382  12544
52*dfc6aa5cSAndroid Build Coastguard Worker #define F_0_541  17792
53*dfc6aa5cSAndroid Build Coastguard Worker #define F_0_707  23168
54*dfc6aa5cSAndroid Build Coastguard Worker #define F_0_306  9984
55*dfc6aa5cSAndroid Build Coastguard Worker 
56*dfc6aa5cSAndroid Build Coastguard Worker 
57*dfc6aa5cSAndroid Build Coastguard Worker ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
58*dfc6aa5cSAndroid Build Coastguard Worker   F_0_382, F_0_541, F_0_707, F_0_306
59*dfc6aa5cSAndroid Build Coastguard Worker };
60*dfc6aa5cSAndroid Build Coastguard Worker 
/* Run one 1-D pass of the fast FDCT butterfly network over eight vectors.
 *
 * v       In/out: v[k] holds element k of the 1-D transform for eight
 *         independent lanes.  All eight inputs are read before any output is
 *         written, so the update is safe to perform in place.
 * consts  The four multipliers from jsimd_fdct_ifast_neon_consts
 *         (lane 0 = 0.382, lane 1 = 0.541, lane 2 = 0.707, lane 3 = 0.306).
 *
 * Both passes of jsimd_fdct_ifast_neon() use exactly this sequence, so it
 * lives in one place.  Declared static inline so the compiler can expand it
 * at both call sites.  The variable names follow the original two-pass code.
 */
static inline void jsimd_fdct_ifast_pass_neon(int16x8_t *v,
                                              const int16x4_t consts)
{
  /* Phase 1: sums and differences of mirrored inputs. */
  int16x8_t tmp0 = vaddq_s16(v[0], v[7]);
  int16x8_t tmp7 = vsubq_s16(v[0], v[7]);
  int16x8_t tmp1 = vaddq_s16(v[1], v[6]);
  int16x8_t tmp6 = vsubq_s16(v[1], v[6]);
  int16x8_t tmp2 = vaddq_s16(v[2], v[5]);
  int16x8_t tmp5 = vsubq_s16(v[2], v[5]);
  int16x8_t tmp3 = vaddq_s16(v[3], v[4]);
  int16x8_t tmp4 = vsubq_s16(v[3], v[4]);

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);

  v[0] = vaddq_s16(tmp10, tmp11);             /* phase 3 */
  v[4] = vsubq_s16(tmp10, tmp11);

  /* z1 = (tmp12 + tmp13) * 0.707 */
  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
  v[2] = vaddq_s16(tmp13, z1);                /* phase 5 */
  v[6] = vsubq_s16(tmp13, z1);

  /* Odd part */
  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
  tmp11 = vaddq_s16(tmp5, tmp6);
  tmp12 = vaddq_s16(tmp6, tmp7);

  /* z5 = (tmp10 - tmp12) * 0.382 */
  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
  /* z2 = tmp10 * 0.541 + z5 */
  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
  z2 = vaddq_s16(z2, z5);
  /* z4 = tmp12 * 1.306 + z5.  The multiplier exceeds 1.0, so only its
   * fractional part (0.306) is applied by the multiply and tmp12 itself is
   * added separately.
   */
  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
  z5 = vaddq_s16(tmp12, z5);
  z4 = vaddq_s16(z4, z5);
  /* z3 = tmp11 * 0.707 */
  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);

  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
  int16x8_t z13 = vsubq_s16(tmp7, z3);

  v[5] = vaddq_s16(z13, z2);                  /* phase 6 */
  v[3] = vsubq_s16(z13, z2);
  v[1] = vaddq_s16(z11, z4);
  v[7] = vsubq_s16(z11, z4);
}


/* Perform the fast integer forward DCT, in place, on one 8x8 block.
 *
 * data  Pointer to DCTSIZE * DCTSIZE coefficients stored row by row.  The
 *       block is read in full, transformed along rows and then along
 *       columns, and the result is written back over the input.
 */
void jsimd_fdct_ifast_neon(DCTELEM *data)
{
  /* Load DCT conversion constants. */
  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);

  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
   * are used, followed by vuzp to transpose the block such that we have a
   * column of samples per vector - allowing all rows to be processed at once.
   */
  int16x8x4_t data1 = vld4q_s16(data);
  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);

  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);

  /* Working set shared by both passes: columns first, rows after the
   * transpose below.
   */
  int16x8_t v[8];
  v[0] = cols_04.val[0];
  v[1] = cols_15.val[0];
  v[2] = cols_26.val[0];
  v[3] = cols_37.val[0];
  v[4] = cols_04.val[1];
  v[5] = cols_15.val[1];
  v[6] = cols_26.val[1];
  v[7] = cols_37.val[1];

  /* Pass 1: process rows. */
  jsimd_fdct_ifast_pass_neon(v, consts);

  /* Transpose to work on columns in pass 2. */
  int16x8x2_t cols_01 = vtrnq_s16(v[0], v[1]);
  int16x8x2_t cols_23 = vtrnq_s16(v[2], v[3]);
  int16x8x2_t cols_45 = vtrnq_s16(v[4], v[5]);
  int16x8x2_t cols_67 = vtrnq_s16(v[6], v[7]);

  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
                                      vreinterpretq_s32_s16(cols_45.val[0]));
  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
                                      vreinterpretq_s32_s16(cols_45.val[1]));
  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
                                      vreinterpretq_s32_s16(cols_67.val[0]));
  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
                                      vreinterpretq_s32_s16(cols_67.val[1]));

  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);

  v[0] = vreinterpretq_s16_s32(rows_04.val[0]);
  v[1] = vreinterpretq_s16_s32(rows_15.val[0]);
  v[2] = vreinterpretq_s16_s32(rows_26.val[0]);
  v[3] = vreinterpretq_s16_s32(rows_37.val[0]);
  v[4] = vreinterpretq_s16_s32(rows_04.val[1]);
  v[5] = vreinterpretq_s16_s32(rows_15.val[1]);
  v[6] = vreinterpretq_s16_s32(rows_26.val[1]);
  v[7] = vreinterpretq_s16_s32(rows_37.val[1]);

  /* Pass 2: process columns. */
  jsimd_fdct_ifast_pass_neon(v, consts);

  /* Write the eight result rows back over the input block. */
  vst1q_s16(data + 0 * DCTSIZE, v[0]);
  vst1q_s16(data + 1 * DCTSIZE, v[1]);
  vst1q_s16(data + 2 * DCTSIZE, v[2]);
  vst1q_s16(data + 3 * DCTSIZE, v[3]);
  vst1q_s16(data + 4 * DCTSIZE, v[4]);
  vst1q_s16(data + 5 * DCTSIZE, v[5]);
  vst1q_s16(data + 6 * DCTSIZE, v[6]);
  vst1q_s16(data + 7 * DCTSIZE, v[7]);
}
215