1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
12 #define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
13
14 #include <arm_neon.h>
15
// First pass of the 8x8 forward DCT over eight int16x8_t vectors, without
// the final transpose. Pass-1 inputs are small enough that the 16-bit
// "fast" one-coefficient butterfly is exact here.
static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
                                                      int16x8_t *out) {
  int16x8_t step[8], a[4], c[2];
  int i;

  // Stage 1: butterfly of mirrored input pairs.
  // step[i] = in[i] + in[7 - i], step[7 - i] = in[i] - in[7 - i].
  for (i = 0; i < 4; ++i) {
    step[i] = vaddq_s16(in[i], in[7 - i]);
    step[7 - i] = vsubq_s16(in[i], in[7 - i]);
  }

  // fdct4 on the even half.
  a[0] = vaddq_s16(step[0], step[3]);
  a[1] = vaddq_s16(step[1], step[2]);
  a[2] = vsubq_s16(step[1], step[2]);
  a[3] = vsubq_s16(step[0], step[3]);

  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &out[0], &out[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s16_fast(step[6], step[5], cospi_16_64, &c[1], &c[0]);

  // Stage 3
  a[0] = vaddq_s16(step[4], c[0]);
  a[1] = vsubq_s16(step[4], c[0]);
  a[2] = vsubq_s16(step[7], c[1]);
  a[3] = vaddq_s16(step[7], c[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff(a[3], a[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff(a[2], a[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
}
62
// Second pass of the 8x8 forward DCT, without the final transpose.
// Identical flow to pass 1, but the one-coefficient butterflies widen to
// 32 bits and narrow back, since pass-2 intermediates can exceed the
// 16-bit "fast" variant's safe range.
static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
                                                      int16x8_t *out) {
  int16x8_t step[8], a[4], c[2];
  int i;

  // Stage 1: butterfly of mirrored input pairs.
  // step[i] = in[i] + in[7 - i], step[7 - i] = in[i] - in[7 - i].
  for (i = 0; i < 4; ++i) {
    step[i] = vaddq_s16(in[i], in[7 - i]);
    step[7 - i] = vsubq_s16(in[i], in[7 - i]);
  }

  // fdct4 on the even half.
  a[0] = vaddq_s16(step[0], step[3]);
  a[1] = vaddq_s16(step[1], step[2]);
  a[2] = vsubq_s16(step[1], step[2]);
  a[3] = vsubq_s16(step[0], step[3]);

  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s16_s32_fast_narrow(a[0], a[1], cospi_16_64, &out[0],
                                          &out[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s16_s32_fast_narrow(step[6], step[5], cospi_16_64, &c[1],
                                          &c[0]);

  // Stage 3
  a[0] = vaddq_s16(step[4], c[0]);
  a[1] = vsubq_s16(step[4], c[0]);
  a[2] = vsubq_s16(step[7], c[1]);
  a[3] = vaddq_s16(step[7], c[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff(a[3], a[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff(a[2], a[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
}
111
// Full first pass: row transform followed by an in-place 8x8 transpose so
// that pass 2 can reuse the same row-oriented kernel on the columns.
static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
  int i;
  int16x8_t t[8];

  vpx_fdct8x8_pass1_notranspose_neon(in, t);
  // transpose 8x8
  transpose_s16_8x8(&t[0], &t[1], &t[2], &t[3], &t[4], &t[5], &t[6], &t[7]);
  // Copy the transposed result back into the caller's buffer.
  for (i = 0; i < 8; ++i) {
    in[i] = t[i];
  }
}
127
// Full second pass: column transform (via the row kernel, since the data
// was transposed after pass 1) followed by an in-place 8x8 transpose to
// restore the natural orientation.
static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
  int i;
  int16x8_t t[8];

  vpx_fdct8x8_pass2_notranspose_neon(in, t);
  // transpose 8x8
  transpose_s16_8x8(&t[0], &t[1], &t[2], &t[3], &t[4], &t[5], &t[6], &t[7]);
  // Copy the transposed result back into the caller's buffer.
  for (i = 0; i < 8; ++i) {
    in[i] = t[i];
  }
}
143
144 #if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth first pass of the 8x8 forward DCT, without the final
// transpose. Each row is held as two int32x4_t vectors (left = lanes 0-3,
// right = lanes 4-7); every step is applied to both halves. Results are
// written back into left[]/right[] in place.
static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
                                                             int32x4_t *right) {
  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[2], tr[2];
  int i;

  // Stage 1: butterfly of mirrored input pairs on both halves.
  // s[i] = in[i] + in[7 - i], s[7 - i] = in[i] - in[7 - i].
  for (i = 0; i < 4; ++i) {
    sl[i] = vaddq_s32(left[i], left[7 - i]);
    sl[7 - i] = vsubq_s32(left[i], left[7 - i]);
    sr[i] = vaddq_s32(right[i], right[7 - i]);
    sr[7 - i] = vsubq_s32(right[i], right[7 - i]);
  }

  // fdct4 on the even half.
  // x0 = s0 + s3;
  xl[0] = vaddq_s32(sl[0], sl[3]);
  xr[0] = vaddq_s32(sr[0], sr[3]);
  // x1 = s1 + s2;
  xl[1] = vaddq_s32(sl[1], sl[2]);
  xr[1] = vaddq_s32(sr[1], sr[2]);
  // x2 = s1 - s2;
  xl[2] = vsubq_s32(sl[1], sl[2]);
  xr[2] = vsubq_s32(sr[1], sr[2]);
  // x3 = s0 - s3;
  xl[3] = vsubq_s32(sl[0], sl[3]);
  xr[3] = vsubq_s32(sr[0], sr[3]);

  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
                               &left[0], &right[0], &left[4], &right[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
                          &left[2], &right[2], &left[6], &right[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
                               &tr[1], &tl[0], &tr[0]);

  // Stage 3
  xl[0] = vaddq_s32(sl[4], tl[0]);
  xr[0] = vaddq_s32(sr[4], tr[0]);
  xl[1] = vsubq_s32(sl[4], tl[0]);
  xr[1] = vsubq_s32(sr[4], tr[0]);
  xl[2] = vsubq_s32(sl[7], tl[1]);
  xr[2] = vsubq_s32(sr[7], tr[1]);
  xl[3] = vaddq_s32(sl[7], tl[1]);
  xr[3] = vaddq_s32(sr[7], tr[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
                          &left[1], &right[1], &left[7], &right[7]);
  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
                          &left[5], &right[5], &left[3], &right[3]);
}
217
// High-bitdepth second pass of the 8x8 forward DCT, without the final
// transpose. Same flow as pass 1, but the two-coefficient butterflies use
// the 64-bit-accumulate/narrow variants because pass-2 intermediates can
// exceed the plain 32-bit product range.
static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
                                                             int32x4_t *right) {
  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[2], tr[2];
  int i;

  // Stage 1: butterfly of mirrored input pairs on both halves.
  // s[i] = in[i] + in[7 - i], s[7 - i] = in[i] - in[7 - i].
  for (i = 0; i < 4; ++i) {
    sl[i] = vaddq_s32(left[i], left[7 - i]);
    sl[7 - i] = vsubq_s32(left[i], left[7 - i]);
    sr[i] = vaddq_s32(right[i], right[7 - i]);
    sr[7 - i] = vsubq_s32(right[i], right[7 - i]);
  }

  // fdct4 on the even half.
  // x0 = s0 + s3;
  xl[0] = vaddq_s32(sl[0], sl[3]);
  xr[0] = vaddq_s32(sr[0], sr[3]);
  // x1 = s1 + s2;
  xl[1] = vaddq_s32(sl[1], sl[2]);
  xr[1] = vaddq_s32(sr[1], sr[2]);
  // x2 = s1 - s2;
  xl[2] = vsubq_s32(sl[1], sl[2]);
  xr[2] = vsubq_s32(sr[1], sr[2]);
  // x3 = s0 - s3;
  xl[3] = vsubq_s32(sl[0], sl[3]);
  xr[3] = vsubq_s32(sr[0], sr[3]);

  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
                               &left[0], &right[0], &left[4], &right[4]);
  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
                                     cospi_24_64, &left[2], &right[2], &left[6],
                                     &right[6]);

  // Stage 2
  // t0 = (s6 - s5) * cospi_16_64;
  // t1 = (s6 + s5) * cospi_16_64;
  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
                               &tr[1], &tl[0], &tr[0]);

  // Stage 3
  xl[0] = vaddq_s32(sl[4], tl[0]);
  xr[0] = vaddq_s32(sr[4], tr[0]);
  xl[1] = vsubq_s32(sl[4], tl[0]);
  xr[1] = vsubq_s32(sr[4], tr[0]);
  xl[2] = vsubq_s32(sl[7], tl[1]);
  xr[2] = vsubq_s32(sr[7], tr[1]);
  xl[3] = vaddq_s32(sl[7], tl[1]);
  xr[3] = vaddq_s32(sr[7], tr[1]);

  // Stage 4
  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
                                     cospi_28_64, &left[1], &right[1], &left[7],
                                     &right[7]);
  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
                                     cospi_12_64, &left[5], &right[5], &left[3],
                                     &right[3]);
}
293
// High-bitdepth first pass: transform in place, then transpose the 8x8
// block (stored as two 8-vector halves of int32x4_t) so that pass 2 can
// reuse the same row-oriented kernel on the columns.
static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
                                                 int32x4_t *right) {
  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
  transpose_s32_8x8_2(left, right, left, right);
}
299
// High-bitdepth second pass: transform in place, then transpose the 8x8
// block (stored as two 8-vector halves of int32x4_t) back to its natural
// orientation.
static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
                                                 int32x4_t *right) {
  vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
  transpose_s32_8x8_2(left, right, left, right);
}
305
306 #endif // CONFIG_VP9_HIGHBITDEPTH
307 #endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
308