/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include <arm_neon.h>
15*fb1b10abSAndroid Build Coastguard Worker
vpx_fdct8x8_pass1_notranspose_neon(int16x8_t * in,int16x8_t * out)16*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
17*fb1b10abSAndroid Build Coastguard Worker int16x8_t *out) {
18*fb1b10abSAndroid Build Coastguard Worker int16x8_t s[8], x[4], t[2];
19*fb1b10abSAndroid Build Coastguard Worker
20*fb1b10abSAndroid Build Coastguard Worker s[0] = vaddq_s16(in[0], in[7]);
21*fb1b10abSAndroid Build Coastguard Worker s[1] = vaddq_s16(in[1], in[6]);
22*fb1b10abSAndroid Build Coastguard Worker s[2] = vaddq_s16(in[2], in[5]);
23*fb1b10abSAndroid Build Coastguard Worker s[3] = vaddq_s16(in[3], in[4]);
24*fb1b10abSAndroid Build Coastguard Worker s[4] = vsubq_s16(in[3], in[4]);
25*fb1b10abSAndroid Build Coastguard Worker s[5] = vsubq_s16(in[2], in[5]);
26*fb1b10abSAndroid Build Coastguard Worker s[6] = vsubq_s16(in[1], in[6]);
27*fb1b10abSAndroid Build Coastguard Worker s[7] = vsubq_s16(in[0], in[7]);
28*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
29*fb1b10abSAndroid Build Coastguard Worker x[0] = vaddq_s16(s[0], s[3]);
30*fb1b10abSAndroid Build Coastguard Worker x[1] = vaddq_s16(s[1], s[2]);
31*fb1b10abSAndroid Build Coastguard Worker x[2] = vsubq_s16(s[1], s[2]);
32*fb1b10abSAndroid Build Coastguard Worker x[3] = vsubq_s16(s[0], s[3]);
33*fb1b10abSAndroid Build Coastguard Worker
34*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
35*fb1b10abSAndroid Build Coastguard Worker // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
36*fb1b10abSAndroid Build Coastguard Worker // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
37*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
38*fb1b10abSAndroid Build Coastguard Worker // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
39*fb1b10abSAndroid Build Coastguard Worker // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
40*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
41*fb1b10abSAndroid Build Coastguard Worker
42*fb1b10abSAndroid Build Coastguard Worker // Stage 2
43*fb1b10abSAndroid Build Coastguard Worker // t0 = (s6 - s5) * cospi_16_64;
44*fb1b10abSAndroid Build Coastguard Worker // t1 = (s6 + s5) * cospi_16_64;
45*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
46*fb1b10abSAndroid Build Coastguard Worker
47*fb1b10abSAndroid Build Coastguard Worker // Stage 3
48*fb1b10abSAndroid Build Coastguard Worker x[0] = vaddq_s16(s[4], t[0]);
49*fb1b10abSAndroid Build Coastguard Worker x[1] = vsubq_s16(s[4], t[0]);
50*fb1b10abSAndroid Build Coastguard Worker x[2] = vsubq_s16(s[7], t[1]);
51*fb1b10abSAndroid Build Coastguard Worker x[3] = vaddq_s16(s[7], t[1]);
52*fb1b10abSAndroid Build Coastguard Worker
53*fb1b10abSAndroid Build Coastguard Worker // Stage 4
54*fb1b10abSAndroid Build Coastguard Worker // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
55*fb1b10abSAndroid Build Coastguard Worker // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
56*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
57*fb1b10abSAndroid Build Coastguard Worker
58*fb1b10abSAndroid Build Coastguard Worker // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
59*fb1b10abSAndroid Build Coastguard Worker // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
60*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
61*fb1b10abSAndroid Build Coastguard Worker }
62*fb1b10abSAndroid Build Coastguard Worker
vpx_fdct8x8_pass2_notranspose_neon(int16x8_t * in,int16x8_t * out)63*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
64*fb1b10abSAndroid Build Coastguard Worker int16x8_t *out) {
65*fb1b10abSAndroid Build Coastguard Worker int16x8_t s[8], x[4], t[2];
66*fb1b10abSAndroid Build Coastguard Worker
67*fb1b10abSAndroid Build Coastguard Worker s[0] = vaddq_s16(in[0], in[7]);
68*fb1b10abSAndroid Build Coastguard Worker s[1] = vaddq_s16(in[1], in[6]);
69*fb1b10abSAndroid Build Coastguard Worker s[2] = vaddq_s16(in[2], in[5]);
70*fb1b10abSAndroid Build Coastguard Worker s[3] = vaddq_s16(in[3], in[4]);
71*fb1b10abSAndroid Build Coastguard Worker s[4] = vsubq_s16(in[3], in[4]);
72*fb1b10abSAndroid Build Coastguard Worker s[5] = vsubq_s16(in[2], in[5]);
73*fb1b10abSAndroid Build Coastguard Worker s[6] = vsubq_s16(in[1], in[6]);
74*fb1b10abSAndroid Build Coastguard Worker s[7] = vsubq_s16(in[0], in[7]);
75*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
76*fb1b10abSAndroid Build Coastguard Worker x[0] = vaddq_s16(s[0], s[3]);
77*fb1b10abSAndroid Build Coastguard Worker x[1] = vaddq_s16(s[1], s[2]);
78*fb1b10abSAndroid Build Coastguard Worker x[2] = vsubq_s16(s[1], s[2]);
79*fb1b10abSAndroid Build Coastguard Worker x[3] = vsubq_s16(s[0], s[3]);
80*fb1b10abSAndroid Build Coastguard Worker
81*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
82*fb1b10abSAndroid Build Coastguard Worker // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
83*fb1b10abSAndroid Build Coastguard Worker // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
84*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
85*fb1b10abSAndroid Build Coastguard Worker &out[4]);
86*fb1b10abSAndroid Build Coastguard Worker // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
87*fb1b10abSAndroid Build Coastguard Worker // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
88*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
89*fb1b10abSAndroid Build Coastguard Worker
90*fb1b10abSAndroid Build Coastguard Worker // Stage 2
91*fb1b10abSAndroid Build Coastguard Worker // t0 = (s6 - s5) * cospi_16_64;
92*fb1b10abSAndroid Build Coastguard Worker // t1 = (s6 + s5) * cospi_16_64;
93*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
94*fb1b10abSAndroid Build Coastguard Worker &t[0]);
95*fb1b10abSAndroid Build Coastguard Worker
96*fb1b10abSAndroid Build Coastguard Worker // Stage 3
97*fb1b10abSAndroid Build Coastguard Worker x[0] = vaddq_s16(s[4], t[0]);
98*fb1b10abSAndroid Build Coastguard Worker x[1] = vsubq_s16(s[4], t[0]);
99*fb1b10abSAndroid Build Coastguard Worker x[2] = vsubq_s16(s[7], t[1]);
100*fb1b10abSAndroid Build Coastguard Worker x[3] = vaddq_s16(s[7], t[1]);
101*fb1b10abSAndroid Build Coastguard Worker
102*fb1b10abSAndroid Build Coastguard Worker // Stage 4
103*fb1b10abSAndroid Build Coastguard Worker // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
104*fb1b10abSAndroid Build Coastguard Worker // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
105*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
106*fb1b10abSAndroid Build Coastguard Worker
107*fb1b10abSAndroid Build Coastguard Worker // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
108*fb1b10abSAndroid Build Coastguard Worker // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
109*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
110*fb1b10abSAndroid Build Coastguard Worker }
111*fb1b10abSAndroid Build Coastguard Worker
vpx_fdct8x8_pass1_neon(int16x8_t * in)112*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
113*fb1b10abSAndroid Build Coastguard Worker int16x8_t out[8];
114*fb1b10abSAndroid Build Coastguard Worker vpx_fdct8x8_pass1_notranspose_neon(in, out);
115*fb1b10abSAndroid Build Coastguard Worker // transpose 8x8
116*fb1b10abSAndroid Build Coastguard Worker transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
117*fb1b10abSAndroid Build Coastguard Worker &out[6], &out[7]);
118*fb1b10abSAndroid Build Coastguard Worker in[0] = out[0];
119*fb1b10abSAndroid Build Coastguard Worker in[1] = out[1];
120*fb1b10abSAndroid Build Coastguard Worker in[2] = out[2];
121*fb1b10abSAndroid Build Coastguard Worker in[3] = out[3];
122*fb1b10abSAndroid Build Coastguard Worker in[4] = out[4];
123*fb1b10abSAndroid Build Coastguard Worker in[5] = out[5];
124*fb1b10abSAndroid Build Coastguard Worker in[6] = out[6];
125*fb1b10abSAndroid Build Coastguard Worker in[7] = out[7];
126*fb1b10abSAndroid Build Coastguard Worker }
127*fb1b10abSAndroid Build Coastguard Worker
vpx_fdct8x8_pass2_neon(int16x8_t * in)128*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
129*fb1b10abSAndroid Build Coastguard Worker int16x8_t out[8];
130*fb1b10abSAndroid Build Coastguard Worker vpx_fdct8x8_pass2_notranspose_neon(in, out);
131*fb1b10abSAndroid Build Coastguard Worker // transpose 8x8
132*fb1b10abSAndroid Build Coastguard Worker transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
133*fb1b10abSAndroid Build Coastguard Worker &out[6], &out[7]);
134*fb1b10abSAndroid Build Coastguard Worker in[0] = out[0];
135*fb1b10abSAndroid Build Coastguard Worker in[1] = out[1];
136*fb1b10abSAndroid Build Coastguard Worker in[2] = out[2];
137*fb1b10abSAndroid Build Coastguard Worker in[3] = out[3];
138*fb1b10abSAndroid Build Coastguard Worker in[4] = out[4];
139*fb1b10abSAndroid Build Coastguard Worker in[5] = out[5];
140*fb1b10abSAndroid Build Coastguard Worker in[6] = out[6];
141*fb1b10abSAndroid Build Coastguard Worker in[7] = out[7];
142*fb1b10abSAndroid Build Coastguard Worker }
143*fb1b10abSAndroid Build Coastguard Worker
144*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t * left,int32x4_t * right)145*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
146*fb1b10abSAndroid Build Coastguard Worker int32x4_t *right) {
147*fb1b10abSAndroid Build Coastguard Worker int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
148*fb1b10abSAndroid Build Coastguard Worker
149*fb1b10abSAndroid Build Coastguard Worker sl[0] = vaddq_s32(left[0], left[7]);
150*fb1b10abSAndroid Build Coastguard Worker sl[1] = vaddq_s32(left[1], left[6]);
151*fb1b10abSAndroid Build Coastguard Worker sl[2] = vaddq_s32(left[2], left[5]);
152*fb1b10abSAndroid Build Coastguard Worker sl[3] = vaddq_s32(left[3], left[4]);
153*fb1b10abSAndroid Build Coastguard Worker sl[4] = vsubq_s32(left[3], left[4]);
154*fb1b10abSAndroid Build Coastguard Worker sl[5] = vsubq_s32(left[2], left[5]);
155*fb1b10abSAndroid Build Coastguard Worker sl[6] = vsubq_s32(left[1], left[6]);
156*fb1b10abSAndroid Build Coastguard Worker sl[7] = vsubq_s32(left[0], left[7]);
157*fb1b10abSAndroid Build Coastguard Worker sr[0] = vaddq_s32(right[0], right[7]);
158*fb1b10abSAndroid Build Coastguard Worker sr[1] = vaddq_s32(right[1], right[6]);
159*fb1b10abSAndroid Build Coastguard Worker sr[2] = vaddq_s32(right[2], right[5]);
160*fb1b10abSAndroid Build Coastguard Worker sr[3] = vaddq_s32(right[3], right[4]);
161*fb1b10abSAndroid Build Coastguard Worker sr[4] = vsubq_s32(right[3], right[4]);
162*fb1b10abSAndroid Build Coastguard Worker sr[5] = vsubq_s32(right[2], right[5]);
163*fb1b10abSAndroid Build Coastguard Worker sr[6] = vsubq_s32(right[1], right[6]);
164*fb1b10abSAndroid Build Coastguard Worker sr[7] = vsubq_s32(right[0], right[7]);
165*fb1b10abSAndroid Build Coastguard Worker
166*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
167*fb1b10abSAndroid Build Coastguard Worker // x0 = s0 + s3;
168*fb1b10abSAndroid Build Coastguard Worker xl[0] = vaddq_s32(sl[0], sl[3]);
169*fb1b10abSAndroid Build Coastguard Worker xr[0] = vaddq_s32(sr[0], sr[3]);
170*fb1b10abSAndroid Build Coastguard Worker // x1 = s1 + s2;
171*fb1b10abSAndroid Build Coastguard Worker xl[1] = vaddq_s32(sl[1], sl[2]);
172*fb1b10abSAndroid Build Coastguard Worker xr[1] = vaddq_s32(sr[1], sr[2]);
173*fb1b10abSAndroid Build Coastguard Worker // x2 = s1 - s2;
174*fb1b10abSAndroid Build Coastguard Worker xl[2] = vsubq_s32(sl[1], sl[2]);
175*fb1b10abSAndroid Build Coastguard Worker xr[2] = vsubq_s32(sr[1], sr[2]);
176*fb1b10abSAndroid Build Coastguard Worker // x3 = s0 - s3;
177*fb1b10abSAndroid Build Coastguard Worker xl[3] = vsubq_s32(sl[0], sl[3]);
178*fb1b10abSAndroid Build Coastguard Worker xr[3] = vsubq_s32(sr[0], sr[3]);
179*fb1b10abSAndroid Build Coastguard Worker
180*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
181*fb1b10abSAndroid Build Coastguard Worker // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
182*fb1b10abSAndroid Build Coastguard Worker // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
183*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
184*fb1b10abSAndroid Build Coastguard Worker &left[0], &right[0], &left[4], &right[4]);
185*fb1b10abSAndroid Build Coastguard Worker // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
186*fb1b10abSAndroid Build Coastguard Worker // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
187*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
188*fb1b10abSAndroid Build Coastguard Worker &left[2], &right[2], &left[6], &right[6]);
189*fb1b10abSAndroid Build Coastguard Worker
190*fb1b10abSAndroid Build Coastguard Worker // Stage 2
191*fb1b10abSAndroid Build Coastguard Worker // t0 = (s6 - s5) * cospi_16_64;
192*fb1b10abSAndroid Build Coastguard Worker // t1 = (s6 + s5) * cospi_16_64;
193*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
194*fb1b10abSAndroid Build Coastguard Worker &tr[1], &tl[0], &tr[0]);
195*fb1b10abSAndroid Build Coastguard Worker
196*fb1b10abSAndroid Build Coastguard Worker // Stage 3
197*fb1b10abSAndroid Build Coastguard Worker xl[0] = vaddq_s32(sl[4], tl[0]);
198*fb1b10abSAndroid Build Coastguard Worker xr[0] = vaddq_s32(sr[4], tr[0]);
199*fb1b10abSAndroid Build Coastguard Worker xl[1] = vsubq_s32(sl[4], tl[0]);
200*fb1b10abSAndroid Build Coastguard Worker xr[1] = vsubq_s32(sr[4], tr[0]);
201*fb1b10abSAndroid Build Coastguard Worker xl[2] = vsubq_s32(sl[7], tl[1]);
202*fb1b10abSAndroid Build Coastguard Worker xr[2] = vsubq_s32(sr[7], tr[1]);
203*fb1b10abSAndroid Build Coastguard Worker xl[3] = vaddq_s32(sl[7], tl[1]);
204*fb1b10abSAndroid Build Coastguard Worker xr[3] = vaddq_s32(sr[7], tr[1]);
205*fb1b10abSAndroid Build Coastguard Worker
206*fb1b10abSAndroid Build Coastguard Worker // Stage 4
207*fb1b10abSAndroid Build Coastguard Worker // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
208*fb1b10abSAndroid Build Coastguard Worker // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
209*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
210*fb1b10abSAndroid Build Coastguard Worker &left[1], &right[1], &left[7], &right[7]);
211*fb1b10abSAndroid Build Coastguard Worker
212*fb1b10abSAndroid Build Coastguard Worker // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
213*fb1b10abSAndroid Build Coastguard Worker // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
214*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
215*fb1b10abSAndroid Build Coastguard Worker &left[5], &right[5], &left[3], &right[3]);
216*fb1b10abSAndroid Build Coastguard Worker }
217*fb1b10abSAndroid Build Coastguard Worker
vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t * left,int32x4_t * right)218*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
219*fb1b10abSAndroid Build Coastguard Worker int32x4_t *right) {
220*fb1b10abSAndroid Build Coastguard Worker int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
221*fb1b10abSAndroid Build Coastguard Worker
222*fb1b10abSAndroid Build Coastguard Worker sl[0] = vaddq_s32(left[0], left[7]);
223*fb1b10abSAndroid Build Coastguard Worker sl[1] = vaddq_s32(left[1], left[6]);
224*fb1b10abSAndroid Build Coastguard Worker sl[2] = vaddq_s32(left[2], left[5]);
225*fb1b10abSAndroid Build Coastguard Worker sl[3] = vaddq_s32(left[3], left[4]);
226*fb1b10abSAndroid Build Coastguard Worker sl[4] = vsubq_s32(left[3], left[4]);
227*fb1b10abSAndroid Build Coastguard Worker sl[5] = vsubq_s32(left[2], left[5]);
228*fb1b10abSAndroid Build Coastguard Worker sl[6] = vsubq_s32(left[1], left[6]);
229*fb1b10abSAndroid Build Coastguard Worker sl[7] = vsubq_s32(left[0], left[7]);
230*fb1b10abSAndroid Build Coastguard Worker sr[0] = vaddq_s32(right[0], right[7]);
231*fb1b10abSAndroid Build Coastguard Worker sr[1] = vaddq_s32(right[1], right[6]);
232*fb1b10abSAndroid Build Coastguard Worker sr[2] = vaddq_s32(right[2], right[5]);
233*fb1b10abSAndroid Build Coastguard Worker sr[3] = vaddq_s32(right[3], right[4]);
234*fb1b10abSAndroid Build Coastguard Worker sr[4] = vsubq_s32(right[3], right[4]);
235*fb1b10abSAndroid Build Coastguard Worker sr[5] = vsubq_s32(right[2], right[5]);
236*fb1b10abSAndroid Build Coastguard Worker sr[6] = vsubq_s32(right[1], right[6]);
237*fb1b10abSAndroid Build Coastguard Worker sr[7] = vsubq_s32(right[0], right[7]);
238*fb1b10abSAndroid Build Coastguard Worker
239*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
240*fb1b10abSAndroid Build Coastguard Worker // x0 = s0 + s3;
241*fb1b10abSAndroid Build Coastguard Worker xl[0] = vaddq_s32(sl[0], sl[3]);
242*fb1b10abSAndroid Build Coastguard Worker xr[0] = vaddq_s32(sr[0], sr[3]);
243*fb1b10abSAndroid Build Coastguard Worker // x1 = s1 + s2;
244*fb1b10abSAndroid Build Coastguard Worker xl[1] = vaddq_s32(sl[1], sl[2]);
245*fb1b10abSAndroid Build Coastguard Worker xr[1] = vaddq_s32(sr[1], sr[2]);
246*fb1b10abSAndroid Build Coastguard Worker // x2 = s1 - s2;
247*fb1b10abSAndroid Build Coastguard Worker xl[2] = vsubq_s32(sl[1], sl[2]);
248*fb1b10abSAndroid Build Coastguard Worker xr[2] = vsubq_s32(sr[1], sr[2]);
249*fb1b10abSAndroid Build Coastguard Worker // x3 = s0 - s3;
250*fb1b10abSAndroid Build Coastguard Worker xl[3] = vsubq_s32(sl[0], sl[3]);
251*fb1b10abSAndroid Build Coastguard Worker xr[3] = vsubq_s32(sr[0], sr[3]);
252*fb1b10abSAndroid Build Coastguard Worker
253*fb1b10abSAndroid Build Coastguard Worker // fdct4(step, step);
254*fb1b10abSAndroid Build Coastguard Worker // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
255*fb1b10abSAndroid Build Coastguard Worker // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
256*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
257*fb1b10abSAndroid Build Coastguard Worker &left[0], &right[0], &left[4], &right[4]);
258*fb1b10abSAndroid Build Coastguard Worker // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
259*fb1b10abSAndroid Build Coastguard Worker // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
260*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
261*fb1b10abSAndroid Build Coastguard Worker cospi_24_64, &left[2], &right[2], &left[6],
262*fb1b10abSAndroid Build Coastguard Worker &right[6]);
263*fb1b10abSAndroid Build Coastguard Worker
264*fb1b10abSAndroid Build Coastguard Worker // Stage 2
265*fb1b10abSAndroid Build Coastguard Worker // t0 = (s6 - s5) * cospi_16_64;
266*fb1b10abSAndroid Build Coastguard Worker // t1 = (s6 + s5) * cospi_16_64;
267*fb1b10abSAndroid Build Coastguard Worker butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
268*fb1b10abSAndroid Build Coastguard Worker &tr[1], &tl[0], &tr[0]);
269*fb1b10abSAndroid Build Coastguard Worker
270*fb1b10abSAndroid Build Coastguard Worker // Stage 3
271*fb1b10abSAndroid Build Coastguard Worker xl[0] = vaddq_s32(sl[4], tl[0]);
272*fb1b10abSAndroid Build Coastguard Worker xr[0] = vaddq_s32(sr[4], tr[0]);
273*fb1b10abSAndroid Build Coastguard Worker xl[1] = vsubq_s32(sl[4], tl[0]);
274*fb1b10abSAndroid Build Coastguard Worker xr[1] = vsubq_s32(sr[4], tr[0]);
275*fb1b10abSAndroid Build Coastguard Worker xl[2] = vsubq_s32(sl[7], tl[1]);
276*fb1b10abSAndroid Build Coastguard Worker xr[2] = vsubq_s32(sr[7], tr[1]);
277*fb1b10abSAndroid Build Coastguard Worker xl[3] = vaddq_s32(sl[7], tl[1]);
278*fb1b10abSAndroid Build Coastguard Worker xr[3] = vaddq_s32(sr[7], tr[1]);
279*fb1b10abSAndroid Build Coastguard Worker
280*fb1b10abSAndroid Build Coastguard Worker // Stage 4
281*fb1b10abSAndroid Build Coastguard Worker // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
282*fb1b10abSAndroid Build Coastguard Worker // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
283*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
284*fb1b10abSAndroid Build Coastguard Worker cospi_28_64, &left[1], &right[1], &left[7],
285*fb1b10abSAndroid Build Coastguard Worker &right[7]);
286*fb1b10abSAndroid Build Coastguard Worker
287*fb1b10abSAndroid Build Coastguard Worker // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
288*fb1b10abSAndroid Build Coastguard Worker // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
289*fb1b10abSAndroid Build Coastguard Worker butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
290*fb1b10abSAndroid Build Coastguard Worker cospi_12_64, &left[5], &right[5], &left[3],
291*fb1b10abSAndroid Build Coastguard Worker &right[3]);
292*fb1b10abSAndroid Build Coastguard Worker }
293*fb1b10abSAndroid Build Coastguard Worker
vpx_highbd_fdct8x8_pass1_neon(int32x4_t * left,int32x4_t * right)294*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
295*fb1b10abSAndroid Build Coastguard Worker int32x4_t *right) {
296*fb1b10abSAndroid Build Coastguard Worker vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
297*fb1b10abSAndroid Build Coastguard Worker transpose_s32_8x8_2(left, right, left, right);
298*fb1b10abSAndroid Build Coastguard Worker }
299*fb1b10abSAndroid Build Coastguard Worker
vpx_highbd_fdct8x8_pass2_neon(int32x4_t * left,int32x4_t * right)300*fb1b10abSAndroid Build Coastguard Worker static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
301*fb1b10abSAndroid Build Coastguard Worker int32x4_t *right) {
302*fb1b10abSAndroid Build Coastguard Worker vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
303*fb1b10abSAndroid Build Coastguard Worker transpose_s32_8x8_2(left, right, left, right);
304*fb1b10abSAndroid Build Coastguard Worker }
305*fb1b10abSAndroid Build Coastguard Worker
306*fb1b10abSAndroid Build Coastguard Worker #endif // CONFIG_VP9_HIGHBITDEPTH
307*fb1b10abSAndroid Build Coastguard Worker #endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
308