/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"                // INLINE, CONFIG_VP9_HIGHBITDEPTH
#include "vpx_dsp/txfm_common.h"         // cospi_*_64 constants
#include "vpx_dsp/arm/fdct_neon.h"       // butterfly_* helpers
#include "vpx_dsp/arm/transpose_neon.h"  // transpose_s16_8x8, transpose_s32_8x8_2

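// This header provides the two 1-D passes of the 8x8 forward DCT shared by
// the NEON fdct implementations. Each pass transforms all eight rows and
// then transposes, so running pass 1 followed by pass 2 yields the full 2-D
// transform. The scalar comments throughout refer to the reference C
// transform, where fdct_round_shift(x) is ROUND_POWER_OF_TWO(x,
// DCT_CONST_BITS) with DCT_CONST_BITS == 14, i.e. (x + 8192) >> 14, and
// cospi_N_64 is cos(N * pi / 64) scaled by 2^14 (for example
// cospi_16_64 == 11585, which is round(16384 * cos(pi / 4))).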
static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
                                                      int16x8_t *out) {
  int16x8_t s[8], x[4], t[2];

  s[0] = vaddq_s16(in[0], in[7]);
  s[1] = vaddq_s16(in[1], in[6]);
  s[2] = vaddq_s16(in[2], in[5]);
  s[3] = vaddq_s16(in[3], in[4]);
  s[4] = vsubq_s16(in[3], in[4]);
  s[5] = vsubq_s16(in[2], in[5]);
  s[6] = vsubq_s16(in[1], in[6]);
  s[7] = vsubq_s16(in[0], in[7]);
  // fdct4(step, step);
  x[0] = vaddq_s16(s[0], s[3]);
  x[1] = vaddq_s16(s[1], s[2]);
  x[2] = vsubq_s16(s[1], s[2]);
  x[3] = vsubq_s16(s[0], s[3]);

  // fdct4(step, step);
  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);

  // Stage 3
  x[0] = vaddq_s16(s[4], t[0]);
  x[1] = vsubq_s16(s[4], t[0]);
  x[2] = vsubq_s16(s[7], t[1]);
  x[3] = vaddq_s16(s[7], t[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);

  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
}
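
// For reference, the contracts of the butterfly helpers as used above, per
// lane (these follow directly from the scalar comments in this file):
//
//   butterfly_one_coeff_*(a, b, c, out0, out1):
//     *out0 = fdct_round_shift((a + b) * c)
//     *out1 = fdct_round_shift((a - b) * c)
//
//   butterfly_two_coeff*(a, b, c0, c1, out0, out1):
//     *out0 = fdct_round_shift(a * c0 + b * c1)
//     *out1 = fdct_round_shift(a * c1 - b * c0)
//
// The "_s16_fast" variant keeps all arithmetic in 16 bits; as its name
// suggests, it trades a little precision for speed, which is acceptable in
// pass 1 where the inputs are still small.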

static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
                                                      int16x8_t *out) {
  int16x8_t s[8], x[4], t[2];

  s[0] = vaddq_s16(in[0], in[7]);
  s[1] = vaddq_s16(in[1], in[6]);
  s[2] = vaddq_s16(in[2], in[5]);
  s[3] = vaddq_s16(in[3], in[4]);
  s[4] = vsubq_s16(in[3], in[4]);
  s[5] = vsubq_s16(in[2], in[5]);
  s[6] = vsubq_s16(in[1], in[6]);
  s[7] = vsubq_s16(in[0], in[7]);
  // fdct4(step, step);
  x[0] = vaddq_s16(s[0], s[3]);
  x[1] = vaddq_s16(s[1], s[2]);
  x[2] = vsubq_s16(s[1], s[2]);
  x[3] = vsubq_s16(s[0], s[3]);

  // fdct4(step, step);
  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
                                          &out[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
                                          &t[0]);

  // Stage 3
  x[0] = vaddq_s16(s[4], t[0]);
  x[1] = vsubq_s16(s[4], t[0]);
  x[2] = vsubq_s16(s[7], t[1]);
  x[3] = vaddq_s16(s[7], t[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);

  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
}
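
// Pass 2 is identical to pass 1 except for the two cospi_16_64 butterflies:
// by the second pass the coefficients have grown beyond what the plain
// 16-bit "fast" butterfly can round accurately, so the
// "_s16_s32_fast_narrow" variant is used, which (as the name indicates)
// widens the intermediate products to 32 bits and narrows the rounded
// result back to 16.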

static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
  int16x8_t out[8];
  vpx_fdct8x8_pass1_notranspose_neon(in, out);
  // transpose 8x8
  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                    &out[6], &out[7]);
  in[0] = out[0];
  in[1] = out[1];
  in[2] = out[2];
  in[3] = out[3];
  in[4] = out[4];
  in[5] = out[5];
  in[6] = out[6];
  in[7] = out[7];
}

static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
  int16x8_t out[8];
  vpx_fdct8x8_pass2_notranspose_neon(in, out);
  // transpose 8x8
  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                    &out[6], &out[7]);
  in[0] = out[0];
  in[1] = out[1];
  in[2] = out[2];
  in[3] = out[3];
  in[4] = out[4];
  in[5] = out[5];
  in[6] = out[6];
  in[7] = out[7];
}
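
// A caller obtains the full 2-D transform by running both passes on the
// same registers. A minimal sketch, modeled on the scalar reference
// transform (input/stride are placeholder names, and the store of the
// tran_low_t results is elided):
//
//   int16x8_t in[8];
//   int i;
//   for (i = 0; i < 8; ++i) {
//     // The reference transform scales the input up by 4 before pass 1.
//     in[i] = vshlq_n_s16(vld1q_s16(input + i * stride), 2);
//   }
//   vpx_fdct8x8_pass1_neon(in);  // rows, then transpose
//   vpx_fdct8x8_pass2_neon(in);  // columns, then transpose
//   for (i = 0; i < 8; ++i) {
//     // Final scaling from the reference: out = (x + (x < 0)) >> 1,
//     // i.e. halve, rounding towards zero.
//     const int16x8_t sign = vshrq_n_s16(in[i], 15);  // -1 if negative
//     in[i] = vshrq_n_s16(vsubq_s16(in[i], sign), 1);
//   }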

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
                                                             int32x4_t *right) {
  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];

  sl[0] = vaddq_s32(left[0], left[7]);
  sl[1] = vaddq_s32(left[1], left[6]);
  sl[2] = vaddq_s32(left[2], left[5]);
  sl[3] = vaddq_s32(left[3], left[4]);
  sl[4] = vsubq_s32(left[3], left[4]);
  sl[5] = vsubq_s32(left[2], left[5]);
  sl[6] = vsubq_s32(left[1], left[6]);
  sl[7] = vsubq_s32(left[0], left[7]);
  sr[0] = vaddq_s32(right[0], right[7]);
  sr[1] = vaddq_s32(right[1], right[6]);
  sr[2] = vaddq_s32(right[2], right[5]);
  sr[3] = vaddq_s32(right[3], right[4]);
  sr[4] = vsubq_s32(right[3], right[4]);
  sr[5] = vsubq_s32(right[2], right[5]);
  sr[6] = vsubq_s32(right[1], right[6]);
  sr[7] = vsubq_s32(right[0], right[7]);

  // fdct4(step, step);
  // x0 = s0 + s3;
  xl[0] = vaddq_s32(sl[0], sl[3]);
  xr[0] = vaddq_s32(sr[0], sr[3]);
  // x1 = s1 + s2;
  xl[1] = vaddq_s32(sl[1], sl[2]);
  xr[1] = vaddq_s32(sr[1], sr[2]);
  // x2 = s1 - s2;
  xl[2] = vsubq_s32(sl[1], sl[2]);
  xr[2] = vsubq_s32(sr[1], sr[2]);
  // x3 = s0 - s3;
  xl[3] = vsubq_s32(sl[0], sl[3]);
  xr[3] = vsubq_s32(sr[0], sr[3]);

  // fdct4(step, step);
  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
                               &left[0], &right[0], &left[4], &right[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
                          &left[2], &right[2], &left[6], &right[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
                               &tr[1], &tl[0], &tr[0]);

  // Stage 3
  xl[0] = vaddq_s32(sl[4], tl[0]);
  xr[0] = vaddq_s32(sr[4], tr[0]);
  xl[1] = vsubq_s32(sl[4], tl[0]);
  xr[1] = vsubq_s32(sr[4], tr[0]);
  xl[2] = vsubq_s32(sl[7], tl[1]);
  xr[2] = vsubq_s32(sr[7], tr[1]);
  xl[3] = vaddq_s32(sl[7], tl[1]);
  xr[3] = vaddq_s32(sr[7], tr[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
                          &left[1], &right[1], &left[7], &right[7]);

  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
                          &left[5], &right[5], &left[3], &right[3]);
}
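
// In the high bitdepth path a row of eight 32-bit coefficients no longer
// fits in one register, so each row is split across two int32x4_t vectors:
// left[i] holds columns 0-3 and right[i] holds columns 4-7 of row i, and
// every step above is therefore issued once per half.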

static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
                                                             int32x4_t *right) {
  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];

  sl[0] = vaddq_s32(left[0], left[7]);
  sl[1] = vaddq_s32(left[1], left[6]);
  sl[2] = vaddq_s32(left[2], left[5]);
  sl[3] = vaddq_s32(left[3], left[4]);
  sl[4] = vsubq_s32(left[3], left[4]);
  sl[5] = vsubq_s32(left[2], left[5]);
  sl[6] = vsubq_s32(left[1], left[6]);
  sl[7] = vsubq_s32(left[0], left[7]);
  sr[0] = vaddq_s32(right[0], right[7]);
  sr[1] = vaddq_s32(right[1], right[6]);
  sr[2] = vaddq_s32(right[2], right[5]);
  sr[3] = vaddq_s32(right[3], right[4]);
  sr[4] = vsubq_s32(right[3], right[4]);
  sr[5] = vsubq_s32(right[2], right[5]);
  sr[6] = vsubq_s32(right[1], right[6]);
  sr[7] = vsubq_s32(right[0], right[7]);

  // fdct4(step, step);
  // x0 = s0 + s3;
  xl[0] = vaddq_s32(sl[0], sl[3]);
  xr[0] = vaddq_s32(sr[0], sr[3]);
  // x1 = s1 + s2;
  xl[1] = vaddq_s32(sl[1], sl[2]);
  xr[1] = vaddq_s32(sr[1], sr[2]);
  // x2 = s1 - s2;
  xl[2] = vsubq_s32(sl[1], sl[2]);
  xr[2] = vsubq_s32(sr[1], sr[2]);
  // x3 = s0 - s3;
  xl[3] = vsubq_s32(sl[0], sl[3]);
  xr[3] = vsubq_s32(sr[0], sr[3]);

  // fdct4(step, step);
  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
                               &left[0], &right[0], &left[4], &right[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
                                     cospi_24_64, &left[2], &right[2], &left[6],
                                     &right[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
                               &tr[1], &tl[0], &tr[0]);

  // Stage 3
  xl[0] = vaddq_s32(sl[4], tl[0]);
  xr[0] = vaddq_s32(sr[4], tr[0]);
  xl[1] = vsubq_s32(sl[4], tl[0]);
  xr[1] = vsubq_s32(sr[4], tr[0]);
  xl[2] = vsubq_s32(sl[7], tl[1]);
  xr[2] = vsubq_s32(sr[7], tr[1]);
  xl[3] = vaddq_s32(sl[7], tl[1]);
  xr[3] = vaddq_s32(sr[7], tr[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
                                     cospi_28_64, &left[1], &right[1], &left[7],
                                     &right[7]);

  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
                                     cospi_12_64, &left[5], &right[5], &left[3],
                                     &right[3]);
}
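
// As in the 16-bit path, pass 2 swaps in higher-precision two-coefficient
// butterflies: butterfly_two_coeff_s32_s64_narrow (per its name) forms the
// products in 64 bits and narrows the rounded result back to 32, guarding
// against overflow now that the coefficients have grown through pass 1.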

static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
                                                 int32x4_t *right) {
  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
  transpose_s32_8x8_2(left, right, left, right);
}

static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
                                                 int32x4_t *right) {
  vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
  transpose_s32_8x8_2(left, right, left, right);
}
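
// Note that transpose_s32_8x8_2 is called with the same registers as both
// source and destination, transposing the (left, right) pair in place so the
// data is laid out for the next pass, mirroring the 16-bit helpers above.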

#endif  // CONFIG_VP9_HIGHBITDEPTH
#endif  // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_