/*
 * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"

#include <arm_neon.h>

/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
 * (Discrete Cosine Transform) on one block of samples.  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.382683433 = 12544 * 2^-15
 *    0.541196100 = 17792 * 2^-15
 *    0.707106781 = 23168 * 2^-15
 *    0.306562965 =  9984 * 2^-15
 *
 * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
 * the variable names and comments here in jsimd_fdct_ifast_neon() match up
 * with those in jpeg_fdct_ifast().
 */

#define F_0_382  12544
#define F_0_541  17792
#define F_0_707  23168
#define F_0_306  9984


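/* vqdmulhq_lane_s16(a, b, lane) is a saturating doubling multiply returning
 * the high half, i.e. (2 * a * b[lane]) >> 16 = (a * b[lane]) >> 15, so
 * multiplying by one of the Q15 constants below scales by constant * 2^-15.
 */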
ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
  F_0_382, F_0_541, F_0_707, F_0_306
};

void jsimd_fdct_ifast_neon(DCTELEM *data)
{
  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
   * are used, followed by vuzp to transpose the block such that we have a
   * column of samples per vector - allowing all rows to be processed at once.
   */
  int16x8x4_t data1 = vld4q_s16(data);
  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);

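  /* After the de-interleaving loads, data1.val[i] holds columns i and i + 4
   * of rows 0-3 (interleaved) and data2.val[i] the same for rows 4-7; each
   * vuzp below separates the two columns and concatenates the row halves.
   */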
  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);

  int16x8_t col0 = cols_04.val[0];
  int16x8_t col1 = cols_15.val[0];
  int16x8_t col2 = cols_26.val[0];
  int16x8_t col3 = cols_37.val[0];
  int16x8_t col4 = cols_04.val[1];
  int16x8_t col5 = cols_15.val[1];
  int16x8_t col6 = cols_26.val[1];
  int16x8_t col7 = cols_37.val[1];

  /* Pass 1: process rows. */

  /* Load DCT conversion constants. */
  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);

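  /* Initial butterfly (phase 1): sums and differences of mirrored sample
   * pairs; lane n of each vector belongs to row n of the block.
   */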
  int16x8_t tmp0 = vaddq_s16(col0, col7);
  int16x8_t tmp7 = vsubq_s16(col0, col7);
  int16x8_t tmp1 = vaddq_s16(col1, col6);
  int16x8_t tmp6 = vsubq_s16(col1, col6);
  int16x8_t tmp2 = vaddq_s16(col2, col5);
  int16x8_t tmp5 = vsubq_s16(col2, col5);
  int16x8_t tmp3 = vaddq_s16(col3, col4);
  int16x8_t tmp4 = vsubq_s16(col3, col4);

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);

  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
  col4 = vsubq_s16(tmp10, tmp11);

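  /* z1 = (tmp12 + tmp13) * 0.707106781 (c4), as in jpeg_fdct_ifast(). */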
  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
  col6 = vsubq_s16(tmp13, z1);

  /* Odd part */
  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
  tmp11 = vaddq_s16(tmp5, tmp6);
  tmp12 = vaddq_s16(tmp6, tmp7);

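  /* The rotations below follow jpeg_fdct_ifast():
   *   z5 = (tmp10 - tmp12) * 0.382683433   (c6)
   *   z2 = tmp10 * 0.541196100 + z5        (c2 - c6)
   *   z4 = tmp12 * 1.306562965 + z5        (c2 + c6)
   *   z3 = tmp11 * 0.707106781             (c4)
   * Since a Q15 constant cannot represent a multiplier >= 1, the z4 term is
   * computed as tmp12 * 0.306562965 + (tmp12 + z5).
   */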
  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
  z2 = vaddq_s16(z2, z5);
  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
  z5 = vaddq_s16(tmp12, z5);
  z4 = vaddq_s16(z4, z5);
  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);

  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
  int16x8_t z13 = vsubq_s16(tmp7, z3);

  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
  col3 = vsubq_s16(z13, z2);
  col1 = vaddq_s16(z11, z4);
  col7 = vsubq_s16(z11, z4);

  /* Transpose to work on columns in pass 2. */
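  /* The 8x8 transpose is built in three stages: vtrn on 16-bit elements
   * transposes 2x2 sub-blocks, vtrn on 32-bit elements rearranges the 2x2
   * blocks of those, and vzip on 32-bit elements interleaves the resulting
   * 4x4 quadrants into whole rows.
   */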
  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);

  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
                                      vreinterpretq_s32_s16(cols_45.val[0]));
  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
                                      vreinterpretq_s32_s16(cols_45.val[1]));
  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
                                      vreinterpretq_s32_s16(cols_67.val[0]));
  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
                                      vreinterpretq_s32_s16(cols_67.val[1]));

  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);

  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);

  /* Pass 2: process columns. */
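  /* Pass 2 repeats the same butterflies and rotations on the transposed
   * data, so each lane now carries one column of the block.
   */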

  tmp0 = vaddq_s16(row0, row7);
  tmp7 = vsubq_s16(row0, row7);
  tmp1 = vaddq_s16(row1, row6);
  tmp6 = vsubq_s16(row1, row6);
  tmp2 = vaddq_s16(row2, row5);
  tmp5 = vsubq_s16(row2, row5);
  tmp3 = vaddq_s16(row3, row4);
  tmp4 = vsubq_s16(row3, row4);

  /* Even part */
  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
  tmp13 = vsubq_s16(tmp0, tmp3);
  tmp11 = vaddq_s16(tmp1, tmp2);
  tmp12 = vsubq_s16(tmp1, tmp2);

  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
  row4 = vsubq_s16(tmp10, tmp11);

  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
  row6 = vsubq_s16(tmp13, z1);

  /* Odd part */
  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
  tmp11 = vaddq_s16(tmp5, tmp6);
  tmp12 = vaddq_s16(tmp6, tmp7);

  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
  z2 = vaddq_s16(z2, z5);
  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
  z5 = vaddq_s16(tmp12, z5);
  z4 = vaddq_s16(z4, z5);
  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);

  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
  z13 = vsubq_s16(tmp7, z3);

  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
  row3 = vsubq_s16(z13, z2);
  row1 = vaddq_s16(z11, z4);
  row7 = vsubq_s16(z11, z4);

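  /* Store the transformed coefficients back to the block, one row of eight
   * DCTELEMs per store.
   */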
  vst1q_s16(data + 0 * DCTSIZE, row0);
  vst1q_s16(data + 1 * DCTSIZE, row1);
  vst1q_s16(data + 2 * DCTSIZE, row2);
  vst1q_s16(data + 3 * DCTSIZE, row3);
  vst1q_s16(data + 4 * DCTSIZE, row4);
  vst1q_s16(data + 5 * DCTSIZE, row5);
  vst1q_s16(data + 6 * DCTSIZE, row6);
  vst1q_s16(data + 7 * DCTSIZE, row7);
}