/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "shift_neon.h"
#include "txfm_neon.h"

static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
                                                        int32x4_t *out) {
  // This is not quite the same as the other transposes defined in
  // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
  // unused by the following row transform.
  for (int j = 0; j < 8; ++j) {
    for (int i = 0; i < 16; ++i) {
      transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
    }
  }
}

// A note on butterfly helper naming:
//
// butterfly_[weight_indices]_neon
// e.g. butterfly_0321_neon
//                ^ Weights are applied as indices 0, 3, 2, 1
//                  (see more detail below)
//
// Each weight index is an index into the 4-tuple of the weight itself, plus
// related and negated constants: w=(w0, 1-w0, -w0, w0-1).
// This is then represented in the helper naming by referring to the lane index
// in the loaded tuple that each multiply is performed with:
//
//        in0   in1
//      /------------
// out0 | w[0]  w[1]   ==>  out0 = in0 * w[0] + in1 * w[1]
// out1 | w[2]  w[3]   ==>  out1 = in0 * w[2] + in1 * w[3]
//
// So for indices 0321 from the earlier example, we end up with:
//
//          in0       in1
//      /------------------
// out0 | (lane 0) (lane 3)   ==>  out0 = in0 *  w0 + in1 * (w0-1)
// out1 | (lane 2) (lane 1)   ==>  out1 = in0 * -w0 + in1 * (1-w0)
#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit)   \
  do {                                                                  \
    int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } };                   \
    int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
    x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2);        \
    *out = vrshlq_s32(x, v_bit);                                        \
  } while (false)
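
// In butterfly_half_neon above, wvecs holds (w0, w1, -w0, -w1): `lane / 2`
// selects the plain or negated copy of the weight pair and `lane % 2` selects
// w0 or w1, so lanes 0..3 map onto (w0, 1-w0, -w0, w0-1) in the shorthand of
// the naming note.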

static AOM_FORCE_INLINE void butterfly_0112_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_2312_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_0332_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_0130_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
                                                       int32x4_t *output,
                                                       const int size) {
  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
  int i = 0;
  do {
    const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
  } while (++i < size);
}

static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
    const int32x4_t *input, int32x4_t *output, const int size) {
  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
  int i = 0;
  do {
    const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
    const int32x4_t r1 = vmulq_s32(r0, sqrt2);
    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
  } while (++i < size);
}
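
// Scalar model of the rectangular scaling above (an illustrative sketch; the
// vector helpers multiply in 32 bits, so treat the 64-bit product here as an
// idealization). Both helpers scale by sqrt(2) in Q(NewSqrt2Bits) fixed
// point, with round_shift2_rect_array_s32_neon first rounding the input down
// by two bits. The name round_rect_scalar_model is ours, for this example.
static inline int32_t round_rect_scalar_model(int32_t x) {
  return (int32_t)(((int64_t)x * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >>
                   NewSqrt2Bits);
}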

#define LOAD_BUFFER_4XH(h)                                           \
  static AOM_FORCE_INLINE void load_buffer_4x##h(                    \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        a = vrev64_s16(a);                                           \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    }                                                                \
  }

// AArch32 does not permit the shift amount passed to vshll_n_s16 to be zero,
// so we must avoid instantiating that expression even though the compiler can
// prove that this code path is never taken when `shift == 0`.
#define shift_left_long_s16(a, shift) \
  ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
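
// For example (illustrative only): shift_left_long_s16(a, 0) selects the
// vmovl_s16(a) arm of the ternary, while shift_left_long_s16(a, 2) takes the
// vshll_n_s16(a, 2) path. The inner `(shift) == 0 ? 1 : (shift)` keeps the
// immediate in the dead branch non-zero so that AArch32 builds still compile.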

#define LOAD_BUFFER_WXH(w, h, shift)                                 \
  static AOM_FORCE_INLINE void load_buffer_##w##x##h(                \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    assert(w >= 8);                                                  \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          a = vrev64q_s16(a);                                        \
          int j2 = (w) / 8 - j - 1;                                  \
          in[i + (h) * (2 * j2 + 0)] =                               \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
          in[i + (h) * (2 * j2 + 1)] =                               \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
        }                                                            \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          in[i + (h) * (2 * j + 0)] =                                \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
          in[i + (h) * (2 * j + 1)] =                                \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
        }                                                            \
      }                                                              \
    }                                                                \
  }
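
// Layout note (illustrative): the loaders above store the block column-group
// first, so in[i + (h) * c] holds input row i, columns 4*c..4*c+3. Each run
// of h consecutive vectors therefore carries four complete columns, ready for
// the column transform. With fliplr set, vrev64q_s16 reverses within each
// 64-bit half and the j2/high-low swap reverses the halves and the groups.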

LOAD_BUFFER_4XH(4)
LOAD_BUFFER_4XH(8)
LOAD_BUFFER_4XH(16)
LOAD_BUFFER_4XH(32)
LOAD_BUFFER_WXH(8, 8, 2)
LOAD_BUFFER_WXH(16, 16, 2)
LOAD_BUFFER_WXH(32, 64, 0)
LOAD_BUFFER_WXH(64, 32, 2)
LOAD_BUFFER_WXH(64, 64, 0)

#if !CONFIG_REALTIME_ONLY
LOAD_BUFFER_WXH(16, 64, 0)
LOAD_BUFFER_WXH(64, 16, 2)
#endif  // !CONFIG_REALTIME_ONLY

#define STORE_BUFFER_WXH(w, h)                                \
  static AOM_FORCE_INLINE void store_buffer_##w##x##h(        \
      const int32x4_t *in, int32_t *out, int stride) {        \
    for (int i = 0; i < (w); ++i) {                           \
      for (int j = 0; j < (h) / 4; ++j) {                     \
        vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
      }                                                       \
    }                                                         \
  }
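
// Layout note (illustrative): the mirror of the loader layout above, so
// in[i + j * (w)] supplies coefficients 4*j..4*j+3 of output row i of the
// coefficient block.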

STORE_BUFFER_WXH(4, 4)
STORE_BUFFER_WXH(8, 4)
STORE_BUFFER_WXH(8, 8)
STORE_BUFFER_WXH(16, 4)
STORE_BUFFER_WXH(16, 16)
STORE_BUFFER_WXH(32, 4)
STORE_BUFFER_WXH(32, 32)
STORE_BUFFER_WXH(64, 32)

#if !CONFIG_REALTIME_ONLY
STORE_BUFFER_WXH(16, 32)
STORE_BUFFER_WXH(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
  const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);

  const int32x4_t a0 = vaddq_s32(in[0], in[3]);
  const int32x4_t a1 = vsubq_s32(in[0], in[3]);
  const int32x4_t a2 = vaddq_s32(in[1], in[2]);
  const int32x4_t a3 = vsubq_s32(in[1], in[2]);

  const int32x4_t b0 = vmulq_s32(a0, cospi32);
  const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
  const int32x4_t b2 = vmulq_s32(a2, cospi32);
  const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);

  const int32x4_t c0 = vaddq_s32(b0, b2);
  const int32x4_t c1 = vsubq_s32(b0, b2);
  const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
  const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);

  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t d0 = vrshlq_s32(c0, v_bit);
  const int32x4_t d1 = vrshlq_s32(c1, v_bit);
  const int32x4_t d2 = vrshlq_s32(c2, v_bit);
  const int32x4_t d3 = vrshlq_s32(c3, v_bit);

  out[0] = d0;
  out[1] = d2;
  out[2] = d1;
  out[3] = d3;
}
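
// In equation form (an illustrative summary of the function above, with c16,
// c32, c48 the weights loaded into cospi16_48 and cospi32):
//   out[0] = round_shift((in0 + in1 + in2 + in3) * c32, bit)
//   out[1] = round_shift((in0 - in3) * c16 + (in1 - in2) * c48, bit)
//   out[2] = round_shift((in0 + in3 - in1 - in2) * c32, bit)
//   out[3] = round_shift((in0 - in3) * c48 - (in1 - in2) * c16, bit)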

static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);

  const int32x4_t a0 = vaddq_s32(in[0], in[1]);
  const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
  const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
  const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);

  const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
  const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
  const int32x4_t b2 = vsubq_s32(a0, in[3]);

  const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
  const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
  const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);

  const int32x4_t d0 = vaddq_s32(c0, a3);
  const int32x4_t d1 = vsubq_s32(c1, a3);
  const int32x4_t d2 = vsubq_s32(c1, c0);

  const int32x4_t e0 = vaddq_s32(d2, a3);

  const int32x4_t v_bit = vdupq_n_s32(-bit);
  out[0] = vrshlq_s32(d0, v_bit);
  out[1] = vrshlq_s32(c2, v_bit);
  out[2] = vrshlq_s32(d1, v_bit);
  out[3] = vrshlq_s32(e0, v_bit);
}
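
// In equation form (an illustrative summary, with sk = sinpi_arr(bit)[k];
// out[3] uses the identity s1 + s2 = s4 to fold d2 + a3 into four terms):
//   out[0] = round_shift(s1*in0 + s2*in1 + s3*in2 + s4*in3, bit)
//   out[1] = round_shift(s3*(in0 + in1 - in3), bit)
//   out[2] = round_shift(s4*in0 - s1*in1 - s3*in2 + s2*in3, bit)
//   out[3] = round_shift(s2*in0 - s4*in1 + s3*in2 - s1*in3, bit)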

static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  (void)bit;
  int32x4_t fact = vdupq_n_s32(NewSqrt2);

  for (int i = 0; i < 4; i++) {
    const int32x4_t a_low = vmulq_s32(in[i], fact);
    out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
  }
}

void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
                             int input_stride, TX_TYPE tx_type, int bd) {
  (void)bd;

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);

  // Workspace for column/row-wise transforms.
  int32x4_t buf[4];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case IDTX:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    default: assert(0);
  }
}

// Butterfly pre-processing:
// e.g. n=4:
//   out[0] = in[0] + in[3]
//   out[1] = in[1] + in[2]
//   out[2] = in[1] - in[2]
//   out[3] = in[0] - in[3]

static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
                                               int32x4_t *output, int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vaddq_s32(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

// Butterfly post-processing:
// e.g. n=8:
//   out[0] = in0[0] + in1[3];
//   out[1] = in0[1] + in1[2];
//   out[2] = in0[1] - in1[2];
//   out[3] = in0[0] - in1[3];
//   out[4] = in0[7] - in1[4];
//   out[5] = in0[6] - in1[5];
//   out[6] = in0[6] + in1[5];
//   out[7] = in0[7] + in1[4];

static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
                                                const int32x4_t *in1,
                                                int32x4_t *output, int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // stage 1
  int32x4_t a[8];
  butterfly_dct_pre(in, a, 8);

  // stage 2
  int32x4_t b[8];
  butterfly_dct_pre(a, b, 4);
  butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);

  // stage 3
  int32x4_t c[8];
  butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
  butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
  butterfly_dct_post(a + 4, b + 4, c + 4, 4);

  // stage 4-5
  butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
  butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);

  out[0] = c[0];
  out[2] = c[2];
  out[4] = c[1];
  out[6] = c[3];
}

static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;

  // stage 0-1
  u0 = in[0];
  u1 = in[7];
  u2 = in[3];
  u3 = in[4];
  u4 = in[1];
  u5 = in[6];
  u6 = in[2];
  u7 = in[5];

  // stage 2
  v0 = u0;
  v1 = u1;
  butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
  v4 = u4;
  v5 = u5;
  butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);

  // stage 3
  u0 = vaddq_s32(v0, v2);
  u1 = vsubq_s32(v3, v1);
  u2 = vsubq_s32(v0, v2);
  u3 = vaddq_s32(v1, v3);
  u4 = vsubq_s32(v6, v4);
  u5 = vaddq_s32(v5, v7);
  u6 = vaddq_s32(v4, v6);
  u7 = vsubq_s32(v5, v7);

  // stage 4
  v0 = u0;
  v1 = u1;
  v2 = u2;
  v3 = u3;

  butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);

  // stage 5
  u0 = vaddq_s32(v0, v4);
  u1 = vaddq_s32(v1, v5);
  u2 = vaddq_s32(v2, v6);
  u3 = vsubq_s32(v7, v3);
  u4 = vsubq_s32(v0, v4);
  u5 = vsubq_s32(v1, v5);
  u6 = vsubq_s32(v2, v6);
  u7 = vaddq_s32(v3, v7);

  // stage 6
  butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
  butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
  butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);

  // stage 7
  out[0] = v1;
  out[1] = v6;
  out[2] = v3;
  out[3] = v4;
  out[4] = v5;
  out[5] = v2;
  out[6] = v7;
  out[7] = v0;
}

static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  (void)bit;
  out[0] = vshlq_n_s32(in[0], 1);
  out[1] = vshlq_n_s32(in[1], 1);
  out[2] = vshlq_n_s32(in[2], 1);
  out[3] = vshlq_n_s32(in[3], 1);
  out[4] = vshlq_n_s32(in[4], 1);
  out[5] = vshlq_n_s32(in[5], 1);
  out[6] = vshlq_n_s32(in[6], 1);
  out[7] = vshlq_n_s32(in[7], 1);
}

static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit,
                                                  int howmany) {
  const int stride = 8;
  int i = 0;
  do {
    highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit,
                                                   int howmany) {
  const int stride = 8;
  int i = 0;
  do {
    highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
                                                       int32x4_t *out, int bit,
                                                       int howmany) {
  (void)bit;
  const int stride = 8;
  int i = 0;
  do {
    highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Workspaces for column/row-wise transforms.
  int32x4_t buf0[16], buf1[16];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case IDTX:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    default: assert(0);
  }
}

static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                  int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u[16], v[16];

  // stage 1
  butterfly_dct_pre(in, u, 16);

  // stage 2
  butterfly_dct_pre(u, v, 8);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
  butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
  v[14] = u[14];
  v[15] = u[15];

  // stage 3
  butterfly_dct_pre(v, u, 4);
  u[4] = v[4];
  butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
  u[7] = v[7];
  butterfly_dct_post(v + 8, v + 8, u + 8, 8);

  // stage 4
  butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
  butterfly_dct_post(u + 4, u + 4, v + 4, 4);
  v[8] = u[8];
  butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
  butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
  v[11] = u[11];
  v[12] = u[12];
  v[15] = u[15];

  // stage 5
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];
  butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
  butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
  butterfly_dct_post(v + 8, v + 8, u + 8, 4);
  butterfly_dct_post(v + 12, v + 12, u + 12, 4);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];
  butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
  butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
  butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
  butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);

  out[0] = v[0];
  out[1] = v[8];
  out[2] = v[4];
  out[3] = v[12];
  out[4] = v[2];
  out[5] = v[10];
  out[6] = v[6];
  out[7] = v[14];
  out[8] = v[1];
  out[9] = v[9];
  out[10] = v[5];
  out[11] = v[13];
  out[12] = v[3];
  out[13] = v[11];
  out[14] = v[7];
  out[15] = v[15];
}

static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                   int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u[16], v[16];

  // stage 0-1
  u[0] = in[0];
  u[1] = in[15];
  u[2] = in[7];
  u[3] = in[8];
  u[4] = in[3];
  u[5] = in[12];
  u[6] = in[4];
  u[7] = in[11];
  u[8] = in[1];
  u[9] = in[14];
  u[10] = in[6];
  u[11] = in[9];
  u[12] = in[2];
  u[13] = in[13];
  u[14] = in[5];
  u[15] = in[10];

  // stage 2
  v[0] = u[0];
  v[1] = u[1];
  butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
  v[4] = u[4];
  v[5] = u[5];
  butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
  v[12] = u[12];
  v[13] = u[13];
  butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);

  // stage 3
  u[0] = vaddq_s32(v[0], v[2]);
  u[1] = vsubq_s32(v[3], v[1]);
  u[2] = vsubq_s32(v[0], v[2]);
  u[3] = vaddq_s32(v[1], v[3]);
  u[4] = vsubq_s32(v[6], v[4]);
  u[5] = vaddq_s32(v[5], v[7]);
  u[6] = vaddq_s32(v[4], v[6]);
  u[7] = vsubq_s32(v[5], v[7]);
  u[8] = vsubq_s32(v[10], v[8]);
  u[9] = vaddq_s32(v[9], v[11]);
  u[10] = vaddq_s32(v[8], v[10]);
  u[11] = vsubq_s32(v[9], v[11]);
  u[12] = vaddq_s32(v[12], v[14]);
  u[13] = vsubq_s32(v[15], v[13]);
  u[14] = vsubq_s32(v[12], v[14]);
  u[15] = vaddq_s32(v[13], v[15]);

  // stage 4
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);

  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];

  butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
  butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 5
  u[0] = vaddq_s32(v[0], v[4]);
  u[1] = vaddq_s32(v[1], v[5]);
  u[2] = vaddq_s32(v[2], v[6]);
  u[3] = vsubq_s32(v[7], v[3]);
  u[4] = vsubq_s32(v[0], v[4]);
  u[5] = vsubq_s32(v[1], v[5]);
  u[6] = vsubq_s32(v[2], v[6]);
  u[7] = vaddq_s32(v[3], v[7]);
  u[8] = vaddq_s32(v[8], v[12]);
  u[9] = vaddq_s32(v[9], v[13]);
  u[10] = vsubq_s32(v[14], v[10]);
  u[11] = vaddq_s32(v[11], v[15]);
  u[12] = vsubq_s32(v[8], v[12]);
  u[13] = vsubq_s32(v[9], v[13]);
  u[14] = vaddq_s32(v[10], v[14]);
  u[15] = vsubq_s32(v[11], v[15]);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];

  butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
  butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);

  // stage 7
  u[0] = vaddq_s32(v[0], v[8]);
  u[1] = vaddq_s32(v[1], v[9]);
  u[2] = vaddq_s32(v[2], v[10]);
  u[3] = vaddq_s32(v[3], v[11]);
  u[4] = vaddq_s32(v[4], v[12]);
  u[5] = vaddq_s32(v[5], v[13]);
  u[6] = vaddq_s32(v[6], v[14]);
  u[7] = vsubq_s32(v[15], v[7]);
  u[8] = vsubq_s32(v[0], v[8]);
  u[9] = vsubq_s32(v[1], v[9]);
  u[10] = vsubq_s32(v[2], v[10]);
  u[11] = vsubq_s32(v[3], v[11]);
  u[12] = vsubq_s32(v[4], v[12]);
  u[13] = vsubq_s32(v[5], v[13]);
  u[14] = vsubq_s32(v[6], v[14]);
  u[15] = vaddq_s32(v[7], v[15]);

  // stage 8
  butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
  butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
  butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
  butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 9
  out[0] = v[1];
  out[1] = v[14];
  out[2] = v[3];
  out[3] = v[12];
  out[4] = v[5];
  out[5] = v[10];
  out[6] = v[7];
  out[7] = v[8];
  out[8] = v[9];
  out[9] = v[6];
  out[10] = v[11];
  out[11] = v[4];
  out[12] = v[13];
  out[13] = v[2];
  out[14] = v[15];
  out[15] = v[0];
}

static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit) {
  (void)bit;
  const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
  const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));

  for (int i = 0; i < 16; i++) {
    int32x4_t a = vmulq_s32(in[i], fact);
    a = vaddq_s32(a, offset);
    out[i] = vshrq_n_s32(a, NewSqrt2Bits);
  }
}
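
// Note: the explicit offset-add followed by vshrq_n_s32 above is equivalent
// to a single vrshrq_n_s32 rounding shift; the transform scales each input by
// 2 * sqrt(2) in Q(NewSqrt2Bits) fixed point.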

static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
                                  const int howmany) {
  const int stride = 16;
  int i = 0;
  do {
    highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out,
                                   int bit, int howmany) {
  const int stride = 16;
  int i = 0;
  do {
    highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit, int howmany) {
  const int stride = 16;
  int i = 0;
  do {
    highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff,
                               int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Workspaces for column/row-wise transforms.
  int32x4_t buf0[64], buf1[64];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case DCT_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case IDTX:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    default: assert(0);
  }
}

typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
                                          int stride, int bit, int lr_flip);
typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
                                               int32x4_t *out, int stride,
                                               int bit, int lr_flip,
                                               int howmany, int hm_stride);

typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
                                          int bit, int stride);
typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
                                               int32_t *out, int bit,
                                               int howmany, int hm_stride,
                                               int stride);

// Construct component kernels that include the load_buffer and store_buffer
// stages to avoid the need to spill loaded data to the stack between these and
// the txfm kernel calls.
// The TRANSFORM_*_ONE variants are only ever called with a howmany of one, so
// they omit the loop entirely.
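
// For example, TRANSFORM_COL_ONE(fdct8, 8) below expands to:
//
//   static void highbd_fdct8_col_neon(const int16_t *input,
//                                     int32x4_t *output, int stride,
//                                     int cos_bit, int lr_flip) {
//     int32x4_t buf0[8];
//     load_buffer_4x8(input, buf0, stride, lr_flip);
//     highbd_fdct8_x4_neon(buf0, output, cos_bit);
//   }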

#define TRANSFORM_COL_ONE(name, n)                                     \
  static void highbd_##name##_col_neon(const int16_t *input,           \
                                       int32x4_t *output, int stride,  \
                                       int cos_bit, int lr_flip) {     \
    int32x4_t buf0[n];                                                 \
    load_buffer_4x##n(input, buf0, stride, lr_flip);                   \
    highbd_##name##_x4_neon(buf0, output, cos_bit);                    \
  }

#define TRANSFORM_COL_MANY(name, n)                                      \
  static void highbd_##name##_col_many_neon(                             \
      const int16_t *input, int32x4_t *output, int stride, int cos_bit,  \
      int lr_flip, int howmany, int hm_stride) {                         \
    int i = 0;                                                           \
    do {                                                                 \
      int32x4_t buf0[n];                                                 \
      load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip);           \
      highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit);    \
    } while (++i < howmany);                                             \
  }

#define TRANSFORM_ROW_ONE(name, n)                                        \
  static void highbd_##name##_row_neon(                                   \
      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
    int32x4_t buf0[n];                                                    \
    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
    store_buffer_##n##x4(buf0, output, stride);                           \
  }

#define TRANSFORM_ROW_RECT_ONE(name, n)                                   \
  static void highbd_##name##_row_rect_neon(                              \
      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
    int32x4_t buf0[n];                                                    \
    highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
    round_rect_array_s32_neon(buf0, buf0, (n));                           \
    store_buffer_##n##x4(buf0, output, stride);                           \
  }

#define TRANSFORM_ROW_MANY(name, n)                                     \
  static void highbd_##name##_row_many_neon(                            \
      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
      int hm_stride, int stride) {                                      \
    int i = 0;                                                          \
    do {                                                                \
      int32x4_t buf0[n];                                                \
      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);    \
      store_buffer_##n##x4(buf0, output + 4 * i, stride);               \
    } while (++i < howmany);                                            \
  }

#define TRANSFORM_ROW_RECT_MANY(name, n)                                \
  static void highbd_##name##_row_rect_many_neon(                       \
      const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
      int hm_stride, int stride) {                                      \
    int i = 0;                                                          \
    do {                                                                \
      int32x4_t buf0[n];                                                \
      highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);    \
      round_rect_array_s32_neon(buf0, buf0, (n));                       \
      store_buffer_##n##x4(buf0, output + 4 * i, stride);               \
    } while (++i < howmany);                                            \
  }

TRANSFORM_COL_ONE(fdct8, 8)
TRANSFORM_COL_ONE(fadst8, 8)
TRANSFORM_COL_ONE(fidentity8, 8)

TRANSFORM_COL_MANY(fdct4, 4)
TRANSFORM_COL_MANY(fdct8, 8)
TRANSFORM_COL_MANY(fdct16, 16)
TRANSFORM_COL_MANY(fadst4, 4)
TRANSFORM_COL_MANY(fadst8, 8)
TRANSFORM_COL_MANY(fadst16, 16)
TRANSFORM_COL_MANY(fidentity4, 4)
TRANSFORM_COL_MANY(fidentity8, 8)
TRANSFORM_COL_MANY(fidentity16, 16)

TRANSFORM_ROW_ONE(fdct16, 16)
TRANSFORM_ROW_ONE(fadst16, 16)
TRANSFORM_ROW_ONE(fidentity16, 16)

TRANSFORM_ROW_RECT_ONE(fdct8, 8)
TRANSFORM_ROW_RECT_ONE(fadst8, 8)
TRANSFORM_ROW_RECT_ONE(fidentity8, 8)

#if !CONFIG_REALTIME_ONLY
TRANSFORM_ROW_MANY(fdct4, 4)
TRANSFORM_ROW_MANY(fdct8, 8)
TRANSFORM_ROW_MANY(fadst4, 4)
TRANSFORM_ROW_MANY(fadst8, 8)
TRANSFORM_ROW_MANY(fidentity4, 4)
TRANSFORM_ROW_MANY(fidentity8, 8)
#endif  // !CONFIG_REALTIME_ONLY

TRANSFORM_ROW_RECT_MANY(fdct4, 4)
TRANSFORM_ROW_RECT_MANY(fdct8, 8)
TRANSFORM_ROW_RECT_MANY(fdct16, 16)
TRANSFORM_ROW_RECT_MANY(fadst4, 4)
TRANSFORM_ROW_RECT_MANY(fadst8, 8)
TRANSFORM_ROW_RECT_MANY(fadst16, 16)
TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
TRANSFORM_ROW_RECT_MANY(fidentity16, 16)

static const fwd_transform_1d_col_many_neon
    col_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_col_many_neon,       // DCT_DCT
      highbd_fadst8_col_many_neon,      // ADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_ADST
      highbd_fadst8_col_many_neon,      // ADST_ADST
      highbd_fadst8_col_many_neon,      // FLIPADST_DCT
      highbd_fdct8_col_many_neon,       // DCT_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_col_many_neon,      // ADST_FLIPADST
      highbd_fadst8_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_col_many_neon,  // IDTX
      highbd_fdct8_col_many_neon,       // V_DCT
      highbd_fidentity8_col_many_neon,  // H_DCT
      highbd_fadst8_col_many_neon,      // V_ADST
      highbd_fidentity8_col_many_neon,  // H_ADST
      highbd_fadst8_col_many_neon,      // V_FLIPADST
      highbd_fidentity8_col_many_neon   // H_FLIPADST
    };

static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_col_neon,       // DCT_DCT
  highbd_fadst8_col_neon,      // ADST_DCT
  highbd_fdct8_col_neon,       // DCT_ADST
  highbd_fadst8_col_neon,      // ADST_ADST
  highbd_fadst8_col_neon,      // FLIPADST_DCT
  highbd_fdct8_col_neon,       // DCT_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_col_neon,      // ADST_FLIPADST
  highbd_fadst8_col_neon,      // FLIPADST_ADST
  highbd_fidentity8_col_neon,  // IDTX
  highbd_fdct8_col_neon,       // V_DCT
  highbd_fidentity8_col_neon,  // H_DCT
  highbd_fadst8_col_neon,      // V_ADST
  highbd_fidentity8_col_neon,  // H_ADST
  highbd_fadst8_col_neon,      // V_FLIPADST
  highbd_fidentity8_col_neon   // H_FLIPADST
};

static const fwd_transform_1d_col_many_neon
    col_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_col_many_neon,       // DCT_DCT
      highbd_fadst16_col_many_neon,      // ADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_ADST
      highbd_fadst16_col_many_neon,      // ADST_ADST
      highbd_fadst16_col_many_neon,      // FLIPADST_DCT
      highbd_fdct16_col_many_neon,       // DCT_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_col_many_neon,      // ADST_FLIPADST
      highbd_fadst16_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_col_many_neon,  // IDTX
      highbd_fdct16_col_many_neon,       // V_DCT
      highbd_fidentity16_col_many_neon,  // H_DCT
      highbd_fadst16_col_many_neon,      // V_ADST
      highbd_fidentity16_col_many_neon,  // H_ADST
      highbd_fadst16_col_many_neon,      // V_FLIPADST
      highbd_fidentity16_col_many_neon   // H_FLIPADST
    };

static const fwd_transform_1d_col_many_neon
    col_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_col_many_neon,       // DCT_DCT
      highbd_fadst4_col_many_neon,      // ADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_ADST
      highbd_fadst4_col_many_neon,      // ADST_ADST
      highbd_fadst4_col_many_neon,      // FLIPADST_DCT
      highbd_fdct4_col_many_neon,       // DCT_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_col_many_neon,      // ADST_FLIPADST
      highbd_fadst4_col_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_col_many_neon,  // IDTX
      highbd_fdct4_col_many_neon,       // V_DCT
      highbd_fidentity4_col_many_neon,  // H_DCT
      highbd_fadst4_col_many_neon,      // V_ADST
      highbd_fidentity4_col_many_neon,  // H_ADST
      highbd_fadst4_col_many_neon,      // V_FLIPADST
      highbd_fidentity4_col_many_neon   // H_FLIPADST
    };

static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
  highbd_fdct16_row_neon,       // DCT_DCT
  highbd_fdct16_row_neon,       // ADST_DCT
  highbd_fadst16_row_neon,      // DCT_ADST
  highbd_fadst16_row_neon,      // ADST_ADST
  highbd_fdct16_row_neon,       // FLIPADST_DCT
  highbd_fadst16_row_neon,      // DCT_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_FLIPADST
  highbd_fadst16_row_neon,      // ADST_FLIPADST
  highbd_fadst16_row_neon,      // FLIPADST_ADST
  highbd_fidentity16_row_neon,  // IDTX
  highbd_fidentity16_row_neon,  // V_DCT
  highbd_fdct16_row_neon,       // H_DCT
  highbd_fidentity16_row_neon,  // V_ADST
  highbd_fadst16_row_neon,      // H_ADST
  highbd_fidentity16_row_neon,  // V_FLIPADST
  highbd_fadst16_row_neon       // H_FLIPADST
};

static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
      highbd_fdct16_row_rect_many_neon,       // DCT_DCT
      highbd_fdct16_row_rect_many_neon,       // ADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_ADST
      highbd_fadst16_row_rect_many_neon,      // ADST_ADST
      highbd_fdct16_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst16_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst16_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity16_row_rect_many_neon,  // IDTX
      highbd_fidentity16_row_rect_many_neon,  // V_DCT
      highbd_fdct16_row_rect_many_neon,       // H_DCT
      highbd_fidentity16_row_rect_many_neon,  // V_ADST
      highbd_fadst16_row_rect_many_neon,      // H_ADST
      highbd_fidentity16_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst16_row_rect_many_neon       // H_FLIPADST
    };

#if !CONFIG_REALTIME_ONLY
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_many_neon,       // DCT_DCT
      highbd_fdct8_row_many_neon,       // ADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_ADST
      highbd_fadst8_row_many_neon,      // ADST_ADST
      highbd_fdct8_row_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_many_neon,  // IDTX
      highbd_fidentity8_row_many_neon,  // V_DCT
      highbd_fdct8_row_many_neon,       // H_DCT
      highbd_fidentity8_row_many_neon,  // V_ADST
      highbd_fadst8_row_many_neon,      // H_ADST
      highbd_fidentity8_row_many_neon,  // V_FLIPADST
      highbd_fadst8_row_many_neon       // H_FLIPADST
    };
#endif  // !CONFIG_REALTIME_ONLY

static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
      highbd_fdct8_row_rect_many_neon,       // DCT_DCT
      highbd_fdct8_row_rect_many_neon,       // ADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_ADST
      highbd_fadst8_row_rect_many_neon,      // ADST_ADST
      highbd_fdct8_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst8_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst8_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity8_row_rect_many_neon,  // IDTX
      highbd_fidentity8_row_rect_many_neon,  // V_DCT
      highbd_fdct8_row_rect_many_neon,       // H_DCT
      highbd_fidentity8_row_rect_many_neon,  // V_ADST
      highbd_fadst8_row_rect_many_neon,      // H_ADST
      highbd_fidentity8_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst8_row_rect_many_neon       // H_FLIPADST
    };

static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
  highbd_fdct8_row_rect_neon,       // DCT_DCT
  highbd_fdct8_row_rect_neon,       // ADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_ADST
  highbd_fadst8_row_rect_neon,      // ADST_ADST
  highbd_fdct8_row_rect_neon,       // FLIPADST_DCT
  highbd_fadst8_row_rect_neon,      // DCT_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // ADST_FLIPADST
  highbd_fadst8_row_rect_neon,      // FLIPADST_ADST
  highbd_fidentity8_row_rect_neon,  // IDTX
  highbd_fidentity8_row_rect_neon,  // V_DCT
  highbd_fdct8_row_rect_neon,       // H_DCT
  highbd_fidentity8_row_rect_neon,  // V_ADST
  highbd_fadst8_row_rect_neon,      // H_ADST
  highbd_fidentity8_row_rect_neon,  // V_FLIPADST
  highbd_fadst8_row_rect_neon       // H_FLIPADST
};

#if !CONFIG_REALTIME_ONLY
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_many_neon,       // DCT_DCT
      highbd_fdct4_row_many_neon,       // ADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_ADST
      highbd_fadst4_row_many_neon,      // ADST_ADST
      highbd_fdct4_row_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_many_neon,  // IDTX
      highbd_fidentity4_row_many_neon,  // V_DCT
      highbd_fdct4_row_many_neon,       // H_DCT
      highbd_fidentity4_row_many_neon,  // V_ADST
      highbd_fadst4_row_many_neon,      // H_ADST
      highbd_fidentity4_row_many_neon,  // V_FLIPADST
      highbd_fadst4_row_many_neon       // H_FLIPADST
    };
#endif  // !CONFIG_REALTIME_ONLY

static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
      highbd_fdct4_row_rect_many_neon,       // DCT_DCT
      highbd_fdct4_row_rect_many_neon,       // ADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_ADST
      highbd_fadst4_row_rect_many_neon,      // ADST_ADST
      highbd_fdct4_row_rect_many_neon,       // FLIPADST_DCT
      highbd_fadst4_row_rect_many_neon,      // DCT_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // ADST_FLIPADST
      highbd_fadst4_row_rect_many_neon,      // FLIPADST_ADST
      highbd_fidentity4_row_rect_many_neon,  // IDTX
      highbd_fidentity4_row_rect_many_neon,  // V_DCT
      highbd_fdct4_row_rect_many_neon,       // H_DCT
      highbd_fidentity4_row_rect_many_neon,  // V_ADST
      highbd_fadst4_row_rect_many_neon,      // H_ADST
      highbd_fidentity4_row_rect_many_neon,  // V_FLIPADST
      highbd_fadst4_row_rect_many_neon       // H_FLIPADST
    };
1534
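// Forward 32-point DCT on four interleaved columns, one per int32x4_t lane.
// The nine butterfly stages below ping-pong between the buf0/buf1 workspaces;
// each stage either passes terms through unchanged or combines pairs of them
// with the butterfly helpers, and the final stage writes the results out in
// bit-reversed order.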
static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
                                  int cos_bit) {
  const int32_t *const cospi = cospi_arr_s32(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // Workspaces for intermediate transform steps.
  int32x4_t buf0[32];
  int32x4_t buf1[32];

  // stage 1
  butterfly_dct_pre(input, buf1, 32);

  // stage 2
  butterfly_dct_pre(buf1, buf0, 16);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
                      v_cos_bit);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  butterfly_dct_pre(buf0, buf1, 8);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
                      v_cos_bit);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);

  // stage 4
  butterfly_dct_pre(buf1, buf0, 4);
  buf0[4] = buf1[4];
  butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
                      v_cos_bit);
  buf0[7] = buf1[7];
  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
                      v_cos_bit);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
                      v_cos_bit);
  butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
  buf1[8] = buf0[8];
  butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
                      v_cos_bit);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
  butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);

  // stage 6
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];

  butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
                      v_cos_bit);
  butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
                      v_cos_bit);
  butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
  butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
  buf0[16] = buf1[16];
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];

  butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
                      v_cos_bit);
  butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
                      v_cos_bit);

  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];
  butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
                      v_cos_bit);
  butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
  butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
  butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
  butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);

  // stage 8
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];
  butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
                      v_cos_bit);
  butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
                      v_cos_bit);
  butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
                      v_cos_bit);

  // stage 9
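  // Write the results in bit-reversed index order: output[i] takes the
  // element of buf0 whose 5-bit index is the bit-reversal of i, e.g.
  // output[1] = buf0[16] and output[3] = buf0[24].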
  output[0] = buf0[0];
  output[1] = buf0[16];
  output[2] = buf0[8];
  output[3] = buf0[24];
  output[4] = buf0[4];
  output[5] = buf0[20];
  output[6] = buf0[12];
  output[7] = buf0[28];
  output[8] = buf0[2];
  output[9] = buf0[18];
  output[10] = buf0[10];
  output[11] = buf0[26];
  output[12] = buf0[6];
  output[13] = buf0[22];
  output[14] = buf0[14];
  output[15] = buf0[30];
  output[16] = buf0[1];
  output[17] = buf0[17];
  output[18] = buf0[9];
  output[19] = buf0[25];
  output[20] = buf0[5];
  output[21] = buf0[21];
  output[22] = buf0[13];
  output[23] = buf0[29];
  output[24] = buf0[3];
  output[25] = buf0[19];
  output[26] = buf0[11];
  output[27] = buf0[27];
  output[28] = buf0[7];
  output[29] = buf0[23];
  output[30] = buf0[15];
  output[31] = buf0[31];
}

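// Forward 64-point DCT on four interleaved columns. Same structure as
// highbd_fdct32_x4_neon above, extended to eleven stages with a separate
// workspace (x1..x10) per stage.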
static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
                                  int8_t cos_bit) {
  const int32_t *const cospi = cospi_arr_s32(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int32x4_t x1[64];
  butterfly_dct_pre(input, x1, 64);

  // stage 2
  int32x4_t x2[64];
  butterfly_dct_pre(x1, x2, 32);
  x2[32] = x1[32];
  x2[33] = x1[33];
  x2[34] = x1[34];
  x2[35] = x1[35];
  x2[36] = x1[36];
  x2[37] = x1[37];
  x2[38] = x1[38];
  x2[39] = x1[39];
  butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
  x2[56] = x1[56];
  x2[57] = x1[57];
  x2[58] = x1[58];
  x2[59] = x1[59];
  x2[60] = x1[60];
  x2[61] = x1[61];
  x2[62] = x1[62];
  x2[63] = x1[63];

  // stage 3
  int32x4_t x3[64];
  butterfly_dct_pre(x2, x3, 16);
  x3[16] = x2[16];
  x3[17] = x2[17];
  x3[18] = x2[18];
  x3[19] = x2[19];
  butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
  x3[28] = x2[28];
  x3[29] = x2[29];
  x3[30] = x2[30];
  x3[31] = x2[31];
  butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);

  // stage 4
  int32x4_t x4[64];
  butterfly_dct_pre(x3, x4, 8);
  x4[8] = x3[8];
  x4[9] = x3[9];
  butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
  x4[14] = x3[14];
  x4[15] = x3[15];
  butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
  x4[32] = x3[32];
  x4[33] = x3[33];
  x4[34] = x3[34];
  x4[35] = x3[35];
  butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
  x4[44] = x3[44];
  x4[45] = x3[45];
  x4[46] = x3[46];
  x4[47] = x3[47];
  x4[48] = x3[48];
  x4[49] = x3[49];
  x4[50] = x3[50];
  x4[51] = x3[51];
  x4[60] = x3[60];
  x4[61] = x3[61];
  x4[62] = x3[62];
  x4[63] = x3[63];

  // stage 5
  int32x4_t x5[64];
  butterfly_dct_pre(x4, x5, 4);
  x5[4] = x4[4];
  butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
  x5[7] = x4[7];
  butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
  x5[16] = x4[16];
  x5[17] = x4[17];
  butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
  x5[22] = x4[22];
  x5[23] = x4[23];
  x5[24] = x4[24];
  x5[25] = x4[25];
  x5[30] = x4[30];
  x5[31] = x4[31];
  butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
  butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);

  // stage 6
  int32x4_t x6[64];
  butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
  butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
  x6[8] = x5[8];
  butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
  x6[11] = x5[11];
  x6[12] = x5[12];
  x6[15] = x5[15];
  butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
  butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
  x6[32] = x5[32];
  x6[33] = x5[33];
  butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
  butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
  x6[38] = x5[38];
  x6[39] = x5[39];
  x6[40] = x5[40];
  x6[41] = x5[41];
  butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
  butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
  x6[46] = x5[46];
  x6[47] = x5[47];
  x6[48] = x5[48];
  x6[49] = x5[49];
  x6[54] = x5[54];
  x6[55] = x5[55];
  x6[56] = x5[56];
  x6[57] = x5[57];
  x6[62] = x5[62];
  x6[63] = x5[63];

  // stage 7
  int32x4_t x7[64];
  x7[0] = x6[0];
  x7[1] = x6[1];
  x7[2] = x6[2];
  x7[3] = x6[3];
  butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
  butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
  butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
  x7[16] = x6[16];
  butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
  x7[19] = x6[19];
  x7[20] = x6[20];
  butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
  x7[23] = x6[23];
  x7[24] = x6[24];
  x7[27] = x6[27];
  x7[28] = x6[28];
  x7[31] = x6[31];
  butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  int32x4_t x8[64];
  x8[0] = x7[0];
  x8[1] = x7[1];
  x8[2] = x7[2];
  x8[3] = x7[3];
  x8[4] = x7[4];
  x8[5] = x7[5];
  x8[6] = x7[6];
  x8[7] = x7[7];

  butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
  butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
  butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
  butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
  butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
  x8[32] = x7[32];
  butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
  butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
  x8[35] = x7[35];
  x8[36] = x7[36];
  butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
  butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
  x8[39] = x7[39];
  x8[40] = x7[40];
  butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
  butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
  x8[43] = x7[43];
  x8[44] = x7[44];
  butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
  butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
  x8[47] = x7[47];
  x8[48] = x7[48];
  x8[51] = x7[51];
  x8[52] = x7[52];
  x8[55] = x7[55];
  x8[56] = x7[56];
  x8[59] = x7[59];
  x8[60] = x7[60];
  x8[63] = x7[63];

  // stage 9
  int32x4_t x9[64];
  x9[0] = x8[0];
  x9[1] = x8[1];
  x9[2] = x8[2];
  x9[3] = x8[3];
  x9[4] = x8[4];
  x9[5] = x8[5];
  x9[6] = x8[6];
  x9[7] = x8[7];
  x9[8] = x8[8];
  x9[9] = x8[9];
  x9[10] = x8[10];
  x9[11] = x8[11];
  x9[12] = x8[12];
  x9[13] = x8[13];
  x9[14] = x8[14];
  x9[15] = x8[15];
  butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
  butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
  butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
  butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
  butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
  butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
  butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
  butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
  butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);

  // stage 10
  int32x4_t x10[64];
  x10[0] = x9[0];
  x10[1] = x9[1];
  x10[2] = x9[2];
  x10[3] = x9[3];
  x10[4] = x9[4];
  x10[5] = x9[5];
  x10[6] = x9[6];
  x10[7] = x9[7];
  x10[8] = x9[8];
  x10[9] = x9[9];
  x10[10] = x9[10];
  x10[11] = x9[11];
  x10[12] = x9[12];
  x10[13] = x9[13];
  x10[14] = x9[14];
  x10[15] = x9[15];
  x10[16] = x9[16];
  x10[17] = x9[17];
  x10[18] = x9[18];
  x10[19] = x9[19];
  x10[20] = x9[20];
  x10[21] = x9[21];
  x10[22] = x9[22];
  x10[23] = x9[23];
  x10[24] = x9[24];
  x10[25] = x9[25];
  x10[26] = x9[26];
  x10[27] = x9[27];
  x10[28] = x9[28];
  x10[29] = x9[29];
  x10[30] = x9[30];
  x10[31] = x9[31];
  butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
  butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
  butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
  butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
  butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
  butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
  butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
  butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
  butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
  butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
  butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
  butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
  butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
  butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
  butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
  butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);

  // stage 11
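  // As in the 32-point transform, write the results in bit-reversed index
  // order (a 6-bit reversal here, e.g. output[1] = x10[32]).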
  output[0] = x10[0];
  output[1] = x10[32];
  output[2] = x10[16];
  output[3] = x10[48];
  output[4] = x10[8];
  output[5] = x10[40];
  output[6] = x10[24];
  output[7] = x10[56];
  output[8] = x10[4];
  output[9] = x10[36];
  output[10] = x10[20];
  output[11] = x10[52];
  output[12] = x10[12];
  output[13] = x10[44];
  output[14] = x10[28];
  output[15] = x10[60];
  output[16] = x10[2];
  output[17] = x10[34];
  output[18] = x10[18];
  output[19] = x10[50];
  output[20] = x10[10];
  output[21] = x10[42];
  output[22] = x10[26];
  output[23] = x10[58];
  output[24] = x10[6];
  output[25] = x10[38];
  output[26] = x10[22];
  output[27] = x10[54];
  output[28] = x10[14];
  output[29] = x10[46];
  output[30] = x10[30];
  output[31] = x10[62];
  output[32] = x10[1];
  output[33] = x10[33];
  output[34] = x10[17];
  output[35] = x10[49];
  output[36] = x10[9];
  output[37] = x10[41];
  output[38] = x10[25];
  output[39] = x10[57];
  output[40] = x10[5];
  output[41] = x10[37];
  output[42] = x10[21];
  output[43] = x10[53];
  output[44] = x10[13];
  output[45] = x10[45];
  output[46] = x10[29];
  output[47] = x10[61];
  output[48] = x10[3];
  output[49] = x10[35];
  output[50] = x10[19];
  output[51] = x10[51];
  output[52] = x10[11];
  output[53] = x10[43];
  output[54] = x10[27];
  output[55] = x10[59];
  output[56] = x10[7];
  output[57] = x10[39];
  output[58] = x10[23];
  output[59] = x10[55];
  output[60] = x10[15];
  output[61] = x10[47];
  output[62] = x10[31];
  output[63] = x10[63];
}

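// The 32-point identity transform scales each input by 4, implemented as a
// left shift by 2; cos_bit is unused.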
static void highbd_fidentity32_x4_neon(const int32x4_t *input,
                                       int32x4_t *output, int cos_bit) {
  (void)cos_bit;
  for (int i = 0; i < 32; i++) {
    output[i] = vshlq_n_s32(input[i], 2);
  }
}

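// Instantiate the 32-point column/row wrappers referenced by the function
// tables below via the TRANSFORM_*_MANY helper macros.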
TRANSFORM_COL_MANY(fdct32, 32)
TRANSFORM_COL_MANY(fidentity32, 32)

static const fwd_transform_1d_col_many_neon
    col_highbd_txfm32_x4_arr[TX_TYPES] = {
      highbd_fdct32_col_many_neon,       // DCT_DCT
      NULL,                              // ADST_DCT
      NULL,                              // DCT_ADST
      NULL,                              // ADST_ADST
      NULL,                              // FLIPADST_DCT
      NULL,                              // DCT_FLIPADST
      NULL,                              // FLIPADST_FLIPADST
      NULL,                              // ADST_FLIPADST
      NULL,                              // FLIPADST_ADST
      highbd_fidentity32_col_many_neon,  // IDTX
      NULL,                              // V_DCT
      NULL,                              // H_DCT
      NULL,                              // V_ADST
      NULL,                              // H_ADST
      NULL,                              // V_FLIPADST
      NULL                               // H_FLIPADST
    };

TRANSFORM_ROW_MANY(fdct32, 32)
TRANSFORM_ROW_MANY(fidentity32, 32)

static const fwd_transform_1d_row_many_neon
    row_highbd_txfm32_x4_arr[TX_TYPES] = {
      highbd_fdct32_row_many_neon,       // DCT_DCT
      NULL,                              // ADST_DCT
      NULL,                              // DCT_ADST
      NULL,                              // ADST_ADST
      NULL,                              // FLIPADST_DCT
      NULL,                              // DCT_FLIPADST
      NULL,                              // FLIPADST_FLIPADST
      NULL,                              // ADST_FLIPADST
      NULL,                              // FLIPADST_ADST
      highbd_fidentity32_row_many_neon,  // IDTX
      NULL,                              // V_DCT
      NULL,                              // H_DCT
      NULL,                              // V_ADST
      NULL,                              // H_ADST
      NULL,                              // V_FLIPADST
      NULL                               // H_FLIPADST
    };

TRANSFORM_ROW_RECT_MANY(fdct32, 32)
TRANSFORM_ROW_RECT_MANY(fidentity32, 32)

static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
      highbd_fdct32_row_rect_many_neon,       // DCT_DCT
      NULL,                                   // ADST_DCT
      NULL,                                   // DCT_ADST
      NULL,                                   // ADST_ADST
      NULL,                                   // FLIPADST_DCT
      NULL,                                   // DCT_FLIPADST
      NULL,                                   // FLIPADST_FLIPADST
      NULL,                                   // ADST_FLIPADST
      NULL,                                   // FLIPADST_ADST
      highbd_fidentity32_row_rect_many_neon,  // IDTX
      NULL,                                   // V_DCT
      NULL,                                   // H_DCT
      NULL,                                   // V_ADST
      NULL,                                   // H_ADST
      NULL,                                   // V_FLIPADST
      NULL                                    // H_FLIPADST
    };

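// The 2D transform wrappers below share a common shape: a column transform
// (up/down flips are applied by adjusting the input pointer and stride,
// left/right flips by writing the output columns in reverse via an offset and
// a negative hm_stride), a rounding shift, a transpose, then a row transform
// that writes the coefficient buffer.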
void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm8_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm16_xn_arr[tx_type];
  int bit = av1_fwd_cos_bit_col[2][1];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Column-wise transform.
  int32x4_t buf0[32];
  if (lr_flip) {
    col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
             /*hm_stride=*/-8);
  } else {
    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
             /*hm_stride=*/8);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 32);

  int32x4_t buf1[32];
  transpose_arrays_s32_16x8(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
}

void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm16_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm8_xn_arr[tx_type];
  int bit = av1_fwd_cos_bit_col[1][2];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform.
  int32x4_t buf0[32];
  if (lr_flip) {
    col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
             /*hm_stride=*/-16);
  } else {
    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
             /*hm_stride=*/16);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 32);

  int32x4_t buf1[32];
  transpose_arrays_s32_8x16(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
}

#if !CONFIG_REALTIME_ONLY
void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  int bitcol = av1_fwd_cos_bit_col[0][2];
  int bitrow = av1_fwd_cos_bit_row[0][2];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm16_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm4_xn_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform.
  int32x4_t buf0[16];
  if (lr_flip) {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
             /*hm_stride=*/0);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
             /*hm_stride=*/0);
  }
  shift_right_1_round_s32_x4(buf0, buf0, 16);

  int32x4_t buf1[16];
  transpose_arrays_s32_4x16(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
}
#endif

void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  int bitcol = av1_fwd_cos_bit_col[2][0];
  int bitrow = av1_fwd_cos_bit_row[2][0];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm4_xn_arr[tx_type];
  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  // Column-wise transform.
  int32x4_t buf0[16];
  if (lr_flip) {
    col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
             /*hm_stride=*/-4);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
             /*hm_stride=*/4);
  }

  shift_right_1_round_s32_x4(buf0, buf0, 16);
  transpose_arrays_s32_4x16(buf0, buf0);

  // Row-wise transform.
  row_txfm(buf0, coeff, bitrow, /*stride=*/4);
}

void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm32_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm16_xn_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[2][3];
  int bitrow = av1_fwd_cos_bit_row[2][3];

  // Column-wise transform.
  int32x4_t buf0[128];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
           /*hm_stride=*/32);
  shift_right_4_round_s32_x4(buf0, buf0, 128);

  int32x4_t buf1[128];
  transpose_arrays_s32_16x32(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
}

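// Only DCT_DCT is defined for transforms with a 64-sample dimension, so the
// wrappers for those block sizes ignore tx_type.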
void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  int bitcol = av1_fwd_cos_bit_col[3][4];
  int bitrow = av1_fwd_cos_bit_row[3][4];

  // Column-wise transform.
  int32x4_t buf0[512];
  load_buffer_32x64(input, buf0, stride, 0);
  for (int i = 0; i < 8; i++) {
    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 512);

  int32x4_t buf1[512];
  transpose_arrays_s32_32x64(buf0, buf1);

  // Row-wise transform.
  for (int i = 0; i < 16; i++) {
    highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
  }
  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
  store_buffer_32x32(buf1, coeff, /*stride=*/32);
}

void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  int bitcol = av1_fwd_cos_bit_col[4][3];
  int bitrow = av1_fwd_cos_bit_row[4][3];

  // Column-wise transform.
  int32x4_t buf0[512];
  load_buffer_64x32(input, buf0, stride, 0);
  for (int i = 0; i < 16; i++) {
    highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
  }
  shift_right_4_round_s32_x4(buf0, buf0, 512);

  int32x4_t buf1[512];
  transpose_arrays_s32_64x32(buf0, buf1);

  // Row-wise transform.
  for (int i = 0; i < 8; i++) {
    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
  }
  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
  store_buffer_64x32(buf1, coeff, /*stride=*/32);
}

void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm16_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm32_x4_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[3][2];
  int bitrow = av1_fwd_cos_bit_row[3][2];

  // Column-wise transform.
  int32x4_t buf0[128];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
           /*hm_stride=*/16);
  shift_right_4_round_s32_x4(buf0, buf0, 128);

  int32x4_t buf1[128];
  transpose_arrays_s32_32x16(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
}

#if !CONFIG_REALTIME_ONLY
void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm32_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm8_xn_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[1][3];
  int bitrow = av1_fwd_cos_bit_row[1][3];

  // Column-wise transform.
  int32x4_t buf0[64];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
           /*hm_stride=*/32);
  shift_right_2_round_s32_x4(buf0, buf0, 64);

  int32x4_t buf1[64];
  transpose_arrays_s32_8x32(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
}

void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm8_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm32_x4_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[3][1];
  int bitrow = av1_fwd_cos_bit_row[3][1];

  // Column-wise transform.
  int32x4_t buf0[64];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
           /*hm_stride=*/8);
  shift_right_2_round_s32_x4(buf0, buf0, 64);

  int32x4_t buf1[64];
  transpose_arrays_s32_32x8(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8);
}
#endif

void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;
  int bitcol = av1_fwd_cos_bit_col[0][1];
  int bitrow = av1_fwd_cos_bit_row[0][1];
  const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm4_xn_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Column-wise transform.
  int32x4_t buf0[8];
  col_txfm(input, buf0, stride, bitcol, lr_flip);
  shift_right_1_round_s32_x4(buf0, buf0, 8);

  int32x4_t buf1[8];
  transpose_arrays_s32_4x8(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
}

void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[1][0];
  const int bitrow = av1_fwd_cos_bit_row[1][0];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm4_xn_arr[tx_type];
  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  // Column-wise transform.
  int32x4_t buf0[8];
  if (lr_flip) {
    col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
             /*hm_stride=*/-4);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
             /*hm_stride=*/4);
  }

  shift_right_1_round_s32_x4(buf0, buf0, 8);

  int32x4_t buf1[8];
  transpose_arrays_s32_8x4(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*stride=*/4);
}

#if !CONFIG_REALTIME_ONLY
void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[2][4];
  const int bitrow = av1_fwd_cos_bit_row[2][4];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);

  // Column-wise transform.
  int32x4_t buf0[256];
  load_buffer_16x64(input, buf0, stride, lr_flip);
  for (int i = 0; i < 4; i++) {
    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 256);

  int32x4_t buf1[256];
  transpose_arrays_s32_16x64(buf0, buf1);

  // Row-wise transform.
  highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
  store_buffer_16x32(buf1, coeff, /*stride=*/32);
}

void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[4][2];
  const int bitrow = av1_fwd_cos_bit_row[4][2];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform.
  int32x4_t buf0[256];
  load_buffer_64x16(input, buf0, stride, lr_flip);
  highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
  shift_right_4_round_s32_x4(buf0, buf0, 256);

  int32x4_t buf1[256];
  transpose_arrays_s32_64x16(buf0, buf1);

  // Row-wise transform.
  for (int i = 0; i < 4; i++) {
    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
  }
  store_buffer_64x16(buf1, coeff, /*stride=*/16);
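  // For the 64-point dimension only the lower-frequency half of the
  // coefficients is retained; zero the second half of the coefficient area.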
  memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
}
#endif

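// For 32x32 the forward cos-bit is the constant 12 in both dimensions, so it
// is passed directly below rather than looked up in the cos-bit tables.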
void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm32_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm32_x4_arr[tx_type];

  // Column-wise transform.
  int32x4_t buf0[256];
  col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
           /*hm_stride=*/32);
  shift_right_4_round_s32_x4(buf0, buf0, 256);

  int32x4_t buf1[256];
  transpose_arrays_s32_32x32(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
           /*stride=*/32);
}

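// 64x64 is DCT_DCT only. The row transform loop below only processes the
// first 32 of the 64 transposed rows: the high-frequency half of the
// coefficients is discarded for 64-point dimensions, so there is no need to
// compute it.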
void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;

  // Column-wise transform.
  int32x4_t buf0[1024];
  load_buffer_64x64(input, buf0, stride, 0);
  for (int col = 0; col < 16; col++) {
    highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 1024);

  int32x4_t buf1[1024];
  transpose_arrays_s32_64x64(buf0, buf1);

  // Row-wise transform.
  for (int col = 0; col < 8; col++) {
    highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
  }
  shift_right_2_round_s32_x4(buf1, buf1, 512);
  store_buffer_64x32(buf1, output, /*stride=*/32);
}
