/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "shift_neon.h"
#include "txfm_neon.h"

static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
                                                        int32x4_t *out) {
  // This is not quite the same as the other transposes defined in
  // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
  // unused by the following row transform.
  for (int j = 0; j < 8; ++j) {
    for (int i = 0; i < 16; ++i) {
      transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
    }
  }
}

// A note on butterfly helper naming:
//
// butterfly_[weight_indices]_neon
// e.g. butterfly_0321_neon
//                ^ Weights are applied as indices 0, 3, 2, 1
//                  (see more detail below)
//
// Weight indices are treated as an index into the 4-tuple of the loaded
// weight pair plus its negations: w = (w0, w1, -w0, -w1), where w0 and w1 are
// the two cosine weights loaded from the cospi table. The helper naming then
// refers to the lane index in this tuple that each multiply is performed
// with:
//
//         in0   in1
//      /------------
// out0 |  w[0]  w[1]   ==>  out0 = in0 * w[0] + in1 * w[1]
// out1 |  w[2]  w[3]   ==>  out1 = in0 * w[2] + in1 * w[3]
//
// So for indices 0321 from the earlier example, we end up with:
//
//          in0       in1
//      /------------------
// out0 | (lane 0) (lane 3)   ==>  out0 = in0 *  w0 + in1 * -w1
// out1 | (lane 2) (lane 1)   ==>  out1 = in0 * -w0 + in1 *  w1

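// For instance, butterfly_0112_neon below selects lanes (0, 1) for out0 and
// lanes (1, 2) for out1, so with v_bit = -bit it computes:
//
//   out0 = round_shift(in0 * w0 + in1 *  w1, bit)
//   out1 = round_shift(in0 * w1 + in1 * -w0, bit)
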
#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit)   \
  do {                                                                  \
    int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } };                   \
    int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
    x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2);        \
    *out = vrshlq_s32(x, v_bit);                                        \
  } while (false)

static AOM_FORCE_INLINE void butterfly_0112_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_2312_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_0332_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_0130_neon(
    const int32_t *cospi, const int widx0, const int32x4_t n0,
    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
    const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
}

static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
  int32x2_t w01 = vld1_s32(cospi + 2 * 32);
  butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
  butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
}

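// NewSqrt2 is round(2^NewSqrt2Bits * sqrt(2)), so these helpers compute
// round(input[i] * sqrt(2)) elementwise: the extra scaling needed by
// rectangular (non-square) transform sizes. The round_shift2 variant first
// applies a rounding right shift by 2.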
static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
                                                       int32x4_t *output,
                                                       const int size) {
  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
  int i = 0;
  do {
    const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
  } while (++i < size);
}

static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
    const int32x4_t *input, int32x4_t *output, const int size) {
  const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
  int i = 0;
  do {
    const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
    const int32x4_t r1 = vmulq_s32(r0, sqrt2);
    output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
  } while (++i < size);
}

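// Load a 4-wide column block of int16_t pixels, widening to 32 bits and
// pre-multiplying by 1 << 2 (the input up-shift stage of the forward
// transform for these block sizes).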
#define LOAD_BUFFER_4XH(h)                                           \
  static AOM_FORCE_INLINE void load_buffer_4x##h(                    \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        a = vrev64_s16(a);                                           \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        int16x4_t a = vld1_s16(input + i * stride);                  \
        in[i] = vshll_n_s16(a, 2);                                   \
      }                                                              \
    }                                                                \
  }

// AArch32 does not permit the shift amount passed to vshll_n_s16 to be zero,
// so we must avoid even forming that expression, despite the compiler being
// able to prove that the code path is never taken when `shift == 0`.
#define shift_left_long_s16(a, shift) \
  ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
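// e.g. shift_left_long_s16(a, 2) evaluates to vshll_n_s16(a, 2), while
// shift_left_long_s16(a, 0) evaluates to vmovl_s16(a), widening only.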

#define LOAD_BUFFER_WXH(w, h, shift)                                 \
  static AOM_FORCE_INLINE void load_buffer_##w##x##h(                \
      const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
    assert(w >= 8);                                                  \
    if (fliplr) {                                                    \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          a = vrev64q_s16(a);                                        \
          int j2 = (w) / 8 - j - 1;                                  \
          in[i + (h) * (2 * j2 + 0)] =                               \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
          in[i + (h) * (2 * j2 + 1)] =                               \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
        }                                                            \
      }                                                              \
    } else {                                                         \
      for (int i = 0; i < (h); ++i) {                                \
        for (int j = 0; j < (w) / 8; ++j) {                          \
          int16x8_t a = vld1q_s16(input + i * stride + j * 8);       \
          in[i + (h) * (2 * j + 0)] =                                \
              shift_left_long_s16(vget_low_s16(a), (shift));         \
          in[i + (h) * (2 * j + 1)] =                                \
              shift_left_long_s16(vget_high_s16(a), (shift));        \
        }                                                            \
      }                                                              \
    }                                                                \
  }
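
// Note: the in[] buffers filled by the loaders above are arranged as w/4
// contiguous groups of h vectors, each group holding an h-tall strip of four
// columns, ready for the 4-wide column transforms.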

LOAD_BUFFER_4XH(4)
LOAD_BUFFER_4XH(8)
LOAD_BUFFER_4XH(16)
LOAD_BUFFER_4XH(32)
LOAD_BUFFER_WXH(8, 8, 2)
LOAD_BUFFER_WXH(16, 16, 2)
LOAD_BUFFER_WXH(32, 64, 0)
LOAD_BUFFER_WXH(64, 32, 2)
LOAD_BUFFER_WXH(64, 64, 0)

#if !CONFIG_REALTIME_ONLY
LOAD_BUFFER_WXH(16, 64, 0)
LOAD_BUFFER_WXH(64, 16, 2)
#endif  // !CONFIG_REALTIME_ONLY

#define STORE_BUFFER_WXH(w, h)                                \
  static AOM_FORCE_INLINE void store_buffer_##w##x##h(        \
      const int32x4_t *in, int32_t *out, int stride) {        \
    for (int i = 0; i < (w); ++i) {                           \
      for (int j = 0; j < (h) / 4; ++j) {                     \
        vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
      }                                                       \
    }                                                         \
  }

STORE_BUFFER_WXH(4, 4)
STORE_BUFFER_WXH(8, 4)
STORE_BUFFER_WXH(8, 8)
STORE_BUFFER_WXH(16, 4)
STORE_BUFFER_WXH(16, 16)
STORE_BUFFER_WXH(32, 4)
STORE_BUFFER_WXH(32, 32)
STORE_BUFFER_WXH(64, 32)

#if !CONFIG_REALTIME_ONLY
STORE_BUFFER_WXH(16, 32)
STORE_BUFFER_WXH(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

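// Forward transform kernels. The _x4 suffix means that four columns are
// transformed in parallel, one column per lane of each int32x4_t.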
static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
  const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);

  const int32x4_t a0 = vaddq_s32(in[0], in[3]);
  const int32x4_t a1 = vsubq_s32(in[0], in[3]);
  const int32x4_t a2 = vaddq_s32(in[1], in[2]);
  const int32x4_t a3 = vsubq_s32(in[1], in[2]);

  const int32x4_t b0 = vmulq_s32(a0, cospi32);
  const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
  const int32x4_t b2 = vmulq_s32(a2, cospi32);
  const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);

  const int32x4_t c0 = vaddq_s32(b0, b2);
  const int32x4_t c1 = vsubq_s32(b0, b2);
  const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
  const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);

  const int32x4_t v_bit = vdupq_n_s32(-bit);
  const int32x4_t d0 = vrshlq_s32(c0, v_bit);
  const int32x4_t d1 = vrshlq_s32(c1, v_bit);
  const int32x4_t d2 = vrshlq_s32(c2, v_bit);
  const int32x4_t d3 = vrshlq_s32(c3, v_bit);

  out[0] = d0;
  out[1] = d2;
  out[2] = d1;
  out[3] = d3;
}

static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);

  const int32x4_t a0 = vaddq_s32(in[0], in[1]);
  const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
  const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
  const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);

  const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
  const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
  const int32x4_t b2 = vsubq_s32(a0, in[3]);

  const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
  const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
  const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);

  const int32x4_t d0 = vaddq_s32(c0, a3);
  const int32x4_t d1 = vsubq_s32(c1, a3);
  const int32x4_t d2 = vsubq_s32(c1, c0);

  const int32x4_t e0 = vaddq_s32(d2, a3);

  const int32x4_t v_bit = vdupq_n_s32(-bit);
  out[0] = vrshlq_s32(d0, v_bit);
  out[1] = vrshlq_s32(c2, v_bit);
  out[2] = vrshlq_s32(d1, v_bit);
  out[3] = vrshlq_s32(e0, v_bit);
}

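// The identity "transforms" are pure scalings: by sqrt(2) for size 4, by 2
// for size 8 and by 2 * sqrt(2) for size 16. The irrational factors are
// applied as fixed-point multiplications by NewSqrt2 followed by a rounding
// shift by NewSqrt2Bits.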
static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  (void)bit;
  int32x4_t fact = vdupq_n_s32(NewSqrt2);

  for (int i = 0; i < 4; i++) {
    const int32x4_t a_low = vmulq_s32(in[i], fact);
    out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
  }
}

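// 2D transform: column transform, transpose, row transform, store. ud_flip
// is handled by adjusting the input pointer and stride; lr_flip is handled by
// the fliplr argument when loading.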
void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
                             int input_stride, TX_TYPE tx_type, int bd) {
  (void)bd;

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);

  // Workspace for column/row-wise transforms.
  int32x4_t buf[4];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case IDTX:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_DCT:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_ADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 0);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, buf, input_stride, 1);
      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      transpose_arrays_s32_4x4(buf, buf);
      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
      store_buffer_4x4(buf, coeff, /*stride=*/4);
      break;
    default: assert(0);
  }
}

// Butterfly pre-processing:
// e.g. n=4:
//   out[0] = in[0] + in[3]
//   out[1] = in[1] + in[2]
//   out[2] = in[1] - in[2]
//   out[3] = in[0] - in[3]

static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
                                               int32x4_t *output, int n) {
  for (int i = 0; i < n / 2; ++i) {
    output[i] = vaddq_s32(input[i], input[n - i - 1]);
  }
  for (int i = 0; i < n / 2; ++i) {
    output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
  }
}

// Butterfly post-processing:
// e.g. n=8:
//   out[0] = in0[0] + in1[3];
//   out[1] = in0[1] + in1[2];
//   out[2] = in0[1] - in1[2];
//   out[3] = in0[0] - in1[3];
//   out[4] = in0[7] - in1[4];
//   out[5] = in0[6] - in1[5];
//   out[6] = in0[6] + in1[5];
//   out[7] = in0[7] + in1[4];

static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
                                                const int32x4_t *in1,
                                                int32x4_t *output, int n) {
  for (int i = 0; i < n / 4; ++i) {
    output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
  }
  for (int i = 0; i < n / 4; ++i) {
    output[(3 * n) / 4 + i] =
        vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
  }
}

static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  // stage 1
  int32x4_t a[8];
  butterfly_dct_pre(in, a, 8);

  // stage 2
  int32x4_t b[8];
  butterfly_dct_pre(a, b, 4);
  butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);

  // stage 3
  int32x4_t c[8];
  butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
  butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
  butterfly_dct_post(a + 4, b + 4, c + 4, 4);

  // stage 4-5
  butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
  butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);

  out[0] = c[0];
  out[2] = c[2];
  out[4] = c[1];
  out[6] = c[3];
}

static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
  int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;

  // stage 0-1
  u0 = in[0];
  u1 = in[7];
  u2 = in[3];
  u3 = in[4];
  u4 = in[1];
  u5 = in[6];
  u6 = in[2];
  u7 = in[5];

  // stage 2
  v0 = u0;
  v1 = u1;
  butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
  v4 = u4;
  v5 = u5;
  butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);

  // stage 3
  u0 = vaddq_s32(v0, v2);
  u1 = vsubq_s32(v3, v1);
  u2 = vsubq_s32(v0, v2);
  u3 = vaddq_s32(v1, v3);
  u4 = vsubq_s32(v6, v4);
  u5 = vaddq_s32(v5, v7);
  u6 = vaddq_s32(v4, v6);
  u7 = vsubq_s32(v5, v7);

  // stage 4
  v0 = u0;
  v1 = u1;
  v2 = u2;
  v3 = u3;

  butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);

  // stage 5
  u0 = vaddq_s32(v0, v4);
  u1 = vaddq_s32(v1, v5);
  u2 = vaddq_s32(v2, v6);
  u3 = vsubq_s32(v7, v3);
  u4 = vsubq_s32(v0, v4);
  u5 = vsubq_s32(v1, v5);
  u6 = vsubq_s32(v2, v6);
  u7 = vaddq_s32(v3, v7);

  // stage 6
  butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
  butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
  butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
  butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);

  // stage 7
  out[0] = v1;
  out[1] = v6;
  out[2] = v3;
  out[3] = v4;
  out[4] = v5;
  out[5] = v2;
  out[6] = v7;
  out[7] = v0;
}

static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
                                                       int32x4_t *out,
                                                       int bit) {
  (void)bit;
  out[0] = vshlq_n_s32(in[0], 1);
  out[1] = vshlq_n_s32(in[1], 1);
  out[2] = vshlq_n_s32(in[2], 1);
  out[3] = vshlq_n_s32(in[3], 1);
  out[4] = vshlq_n_s32(in[4], 1);
  out[5] = vshlq_n_s32(in[5], 1);
  out[6] = vshlq_n_s32(in[6], 1);
  out[7] = vshlq_n_s32(in[7], 1);
}

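// The _xn wrappers apply the corresponding _x4 kernel `howmany` times, once
// per adjacent group of four columns.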
static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
                                                  int32x4_t *out, int bit,
                                                  int howmany) {
  const int stride = 8;
  int i = 0;
  do {
    highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
                                                   int32x4_t *out, int bit,
                                                   int howmany) {
  const int stride = 8;
  int i = 0;
  do {
    highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
                                                       int32x4_t *out, int bit,
                                                       int howmany) {
  (void)bit;
  const int stride = 8;
  int i = 0;
  do {
    highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Workspaces for column/row-wise transforms.
  int32x4_t buf0[16], buf1[16];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case IDTX:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_DCT:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_ADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case V_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 0);
      highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    case H_FLIPADST:
      load_buffer_8x8(input, buf0, stride, 1);
      highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
      shift_right_1_round_s32_x4(buf0, buf0, 16);
      transpose_arrays_s32_8x8(buf0, buf1);
      highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
      store_buffer_8x8(buf1, coeff, /*stride=*/8);
      break;
    default: assert(0);
  }
}

static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                  int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u[16], v[16];

  // stage 1
  butterfly_dct_pre(in, u, 16);

  // stage 2
  butterfly_dct_pre(u, v, 8);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
  butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
  v[14] = u[14];
  v[15] = u[15];

  // stage 3
  butterfly_dct_pre(v, u, 4);
  u[4] = v[4];
  butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
  u[7] = v[7];
  butterfly_dct_post(v + 8, v + 8, u + 8, 8);

  // stage 4
  butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
  butterfly_dct_post(u + 4, u + 4, v + 4, 4);
  v[8] = u[8];
  butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
  butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
  v[11] = u[11];
  v[12] = u[12];
  v[15] = u[15];

  // stage 5
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];
  butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
  butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
  butterfly_dct_post(v + 8, v + 8, u + 8, 4);
  butterfly_dct_post(v + 12, v + 12, u + 12, 4);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];
  butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
  butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
  butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
  butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);

  out[0] = v[0];
  out[1] = v[8];
  out[2] = v[4];
  out[3] = v[12];
  out[4] = v[2];
  out[5] = v[10];
  out[6] = v[6];
  out[7] = v[14];
  out[8] = v[1];
  out[9] = v[9];
  out[10] = v[5];
  out[11] = v[13];
  out[12] = v[3];
  out[13] = v[11];
  out[14] = v[7];
  out[15] = v[15];
}

static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                   int bit) {
  const int32_t *const cospi = cospi_arr_s32(bit);
  const int32x4_t v_bit = vdupq_n_s32(-bit);

  int32x4_t u[16], v[16];

  // stage 0-1
  u[0] = in[0];
  u[1] = in[15];
  u[2] = in[7];
  u[3] = in[8];
  u[4] = in[3];
  u[5] = in[12];
  u[6] = in[4];
  u[7] = in[11];
  u[8] = in[1];
  u[9] = in[14];
  u[10] = in[6];
  u[11] = in[9];
  u[12] = in[2];
  u[13] = in[13];
  u[14] = in[5];
  u[15] = in[10];

  // stage 2
  v[0] = u[0];
  v[1] = u[1];
  butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
  v[4] = u[4];
  v[5] = u[5];
  butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
  v[8] = u[8];
  v[9] = u[9];
  butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
  v[12] = u[12];
  v[13] = u[13];
  butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);

  // stage 3
  u[0] = vaddq_s32(v[0], v[2]);
  u[1] = vsubq_s32(v[3], v[1]);
  u[2] = vsubq_s32(v[0], v[2]);
  u[3] = vaddq_s32(v[1], v[3]);
  u[4] = vsubq_s32(v[6], v[4]);
  u[5] = vaddq_s32(v[5], v[7]);
  u[6] = vaddq_s32(v[4], v[6]);
  u[7] = vsubq_s32(v[5], v[7]);
  u[8] = vsubq_s32(v[10], v[8]);
  u[9] = vaddq_s32(v[9], v[11]);
  u[10] = vaddq_s32(v[8], v[10]);
  u[11] = vsubq_s32(v[9], v[11]);
  u[12] = vaddq_s32(v[12], v[14]);
  u[13] = vsubq_s32(v[15], v[13]);
  u[14] = vsubq_s32(v[12], v[14]);
  u[15] = vaddq_s32(v[13], v[15]);

  // stage 4
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);

  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];

  butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
  butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 5
  u[0] = vaddq_s32(v[0], v[4]);
  u[1] = vaddq_s32(v[1], v[5]);
  u[2] = vaddq_s32(v[2], v[6]);
  u[3] = vsubq_s32(v[7], v[3]);
  u[4] = vsubq_s32(v[0], v[4]);
  u[5] = vsubq_s32(v[1], v[5]);
  u[6] = vsubq_s32(v[2], v[6]);
  u[7] = vaddq_s32(v[3], v[7]);
  u[8] = vaddq_s32(v[8], v[12]);
  u[9] = vaddq_s32(v[9], v[13]);
  u[10] = vsubq_s32(v[14], v[10]);
  u[11] = vaddq_s32(v[11], v[15]);
  u[12] = vsubq_s32(v[8], v[12]);
  u[13] = vsubq_s32(v[9], v[13]);
  u[14] = vaddq_s32(v[10], v[14]);
  u[15] = vsubq_s32(v[11], v[15]);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];

  butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
  butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);

  // stage 7
  u[0] = vaddq_s32(v[0], v[8]);
  u[1] = vaddq_s32(v[1], v[9]);
  u[2] = vaddq_s32(v[2], v[10]);
  u[3] = vaddq_s32(v[3], v[11]);
  u[4] = vaddq_s32(v[4], v[12]);
  u[5] = vaddq_s32(v[5], v[13]);
  u[6] = vaddq_s32(v[6], v[14]);
  u[7] = vsubq_s32(v[15], v[7]);
  u[8] = vsubq_s32(v[0], v[8]);
  u[9] = vsubq_s32(v[1], v[9]);
  u[10] = vsubq_s32(v[2], v[10]);
  u[11] = vsubq_s32(v[3], v[11]);
  u[12] = vsubq_s32(v[4], v[12]);
  u[13] = vsubq_s32(v[5], v[13]);
  u[14] = vsubq_s32(v[6], v[14]);
  u[15] = vaddq_s32(v[7], v[15]);

  // stage 8
  butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
  butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
  butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
  butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
  butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
  butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
  butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
  butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);

  // stage 9
  out[0] = v[1];
  out[1] = v[14];
  out[2] = v[3];
  out[3] = v[12];
  out[4] = v[5];
  out[5] = v[10];
  out[6] = v[7];
  out[7] = v[8];
  out[8] = v[9];
  out[9] = v[6];
  out[10] = v[11];
  out[11] = v[4];
  out[12] = v[13];
  out[13] = v[2];
  out[14] = v[15];
  out[15] = v[0];
}

static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit) {
  (void)bit;
  const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
  const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));

  for (int i = 0; i < 16; i++) {
    int32x4_t a = vmulq_s32(in[i], fact);
    a = vaddq_s32(a, offset);
    out[i] = vshrq_n_s32(a, NewSqrt2Bits);
  }
}

static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
                                  const int howmany) {
  const int stride = 16;
  int i = 0;
  do {
    highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
                                   int howmany) {
  const int stride = 16;
  int i = 0;
  do {
    highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
                                       int bit, int howmany) {
  const int stride = 16;
  int i = 0;
  do {
    highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit);
  } while (++i < howmany);
}

void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Workspaces for column/row-wise transforms.
  int32x4_t buf0[64], buf1[64];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case DCT_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case ADST_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case FLIPADST_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case IDTX:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_DCT:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_ADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case V_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 0);
      highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    case H_FLIPADST:
      load_buffer_16x16(input, buf0, stride, 1);
      highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
      shift_right_2_round_s32_x4(buf0, buf0, 64);
      transpose_arrays_s32_16x16(buf0, buf1);
      highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
      store_buffer_16x16(buf1, coeff, /*stride=*/16);
      break;
    default: assert(0);
  }
}

typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
                                          int stride, int bit, int lr_flip);
typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
                                               int32x4_t *out, int stride,
                                               int bit, int lr_flip,
                                               int howmany, int hm_stride);

typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
                                          int bit, int stride);
typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
                                               int32_t *out, int bit,
                                               int howmany, int hm_stride,
                                               int stride);

// Construct component kernels that include the load_buffer and store_buffer
// stages to avoid the need to spill loaded data to the stack between these and
// the txfm kernel calls.
// The TRANSFORM_*_ONE cases are only ever called in situations where the
// howmany parameter would be one, so no need for the loop at all in these
// cases.

#define TRANSFORM_COL_ONE(name, n)                                    \
  static void highbd_##name##_col_neon(const int16_t *input,          \
                                       int32x4_t *output, int stride, \
                                       int cos_bit, int lr_flip) {    \
    int32x4_t buf0[n];                                                \
    load_buffer_4x##n(input, buf0, stride, lr_flip);                  \
    highbd_##name##_x4_neon(buf0, output, cos_bit);                   \
  }
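
// For reference, the instantiation TRANSFORM_COL_ONE(fdct8, 8) below expands
// to:
//
//   static void highbd_fdct8_col_neon(const int16_t *input,
//                                     int32x4_t *output, int stride,
//                                     int cos_bit, int lr_flip) {
//     int32x4_t buf0[8];
//     load_buffer_4x8(input, buf0, stride, lr_flip);
//     highbd_fdct8_x4_neon(buf0, output, cos_bit);
//   }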
1218 
1219 #define TRANSFORM_COL_MANY(name, n)                                     \
1220   static void highbd_##name##_col_many_neon(                            \
1221       const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
1222       int lr_flip, int howmany, int hm_stride) {                        \
1223     int i = 0;                                                          \
1224     do {                                                                \
1225       int32x4_t buf0[n];                                                \
1226       load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip);          \
1227       highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit);   \
1228     } while (++i < howmany);                                            \
1229   }
1230 
1231 #define TRANSFORM_ROW_ONE(name, n)                                        \
1232   static void highbd_##name##_row_neon(                                   \
1233       const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
1234     int32x4_t buf0[n];                                                    \
1235     highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
1236     store_buffer_##n##x4(buf0, output, stride);                           \
1237   }
1238 
1239 #define TRANSFORM_ROW_RECT_ONE(name, n)                                   \
1240   static void highbd_##name##_row_rect_neon(                              \
1241       const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
1242     int32x4_t buf0[n];                                                    \
1243     highbd_##name##_x4_neon(input, buf0, cos_bit);                        \
1244     round_rect_array_s32_neon(buf0, buf0, (n));                           \
1245     store_buffer_##n##x4(buf0, output, stride);                           \
1246   }
1247 
1248 #define TRANSFORM_ROW_MANY(name, n)                                      \
1249   static void highbd_##name##_row_many_neon(                             \
1250       const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
1251       int hm_stride, int stride) {                                       \
1252     int i = 0;                                                           \
1253     do {                                                                 \
1254       int32x4_t buf0[n];                                                 \
1255       highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
1256       store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
1257     } while (++i < howmany);                                             \
1258   }
1259 
1260 #define TRANSFORM_ROW_RECT_MANY(name, n)                                 \
1261   static void highbd_##name##_row_rect_many_neon(                        \
1262       const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
1263       int hm_stride, int stride) {                                       \
1264     int i = 0;                                                           \
1265     do {                                                                 \
1266       int32x4_t buf0[n];                                                 \
1267       highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit);     \
1268       round_rect_array_s32_neon(buf0, buf0, (n));                        \
1269       store_buffer_##n##x4(buf0, output + 4 * i, stride);                \
1270     } while (++i < howmany);                                             \
1271   }
1272 
1273 TRANSFORM_COL_ONE(fdct8, 8)
1274 TRANSFORM_COL_ONE(fadst8, 8)
1275 TRANSFORM_COL_ONE(fidentity8, 8)
1276 
1277 TRANSFORM_COL_MANY(fdct4, 4)
1278 TRANSFORM_COL_MANY(fdct8, 8)
1279 TRANSFORM_COL_MANY(fdct16, 16)
1280 TRANSFORM_COL_MANY(fadst4, 4)
1281 TRANSFORM_COL_MANY(fadst8, 8)
1282 TRANSFORM_COL_MANY(fadst16, 16)
1283 TRANSFORM_COL_MANY(fidentity4, 4)
1284 TRANSFORM_COL_MANY(fidentity8, 8)
1285 TRANSFORM_COL_MANY(fidentity16, 16)
1286 
1287 TRANSFORM_ROW_ONE(fdct16, 16)
1288 TRANSFORM_ROW_ONE(fadst16, 16)
1289 TRANSFORM_ROW_ONE(fidentity16, 16)
1290 
1291 TRANSFORM_ROW_RECT_ONE(fdct8, 8)
1292 TRANSFORM_ROW_RECT_ONE(fadst8, 8)
1293 TRANSFORM_ROW_RECT_ONE(fidentity8, 8)
1294 
1295 #if !CONFIG_REALTIME_ONLY
1296 TRANSFORM_ROW_MANY(fdct4, 4)
1297 TRANSFORM_ROW_MANY(fdct8, 8)
1298 TRANSFORM_ROW_MANY(fadst4, 4)
1299 TRANSFORM_ROW_MANY(fadst8, 8)
1300 TRANSFORM_ROW_MANY(fidentity4, 4)
1301 TRANSFORM_ROW_MANY(fidentity8, 8)
1302 #endif
1303 
1304 TRANSFORM_ROW_RECT_MANY(fdct4, 4)
1305 TRANSFORM_ROW_RECT_MANY(fdct8, 8)
1306 TRANSFORM_ROW_RECT_MANY(fdct16, 16)
1307 TRANSFORM_ROW_RECT_MANY(fadst4, 4)
1308 TRANSFORM_ROW_RECT_MANY(fadst8, 8)
1309 TRANSFORM_ROW_RECT_MANY(fadst16, 16)
1310 TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
1311 TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
1312 TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
1313 
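// Per-TX_TYPE dispatch tables: the column tables select the vertical 1D
// transform and the row tables the horizontal one. FLIPADST entries reuse the
// fadst kernels, since the flips themselves are applied while loading
// (lr_flip) or via ud_adjust_input_and_stride.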
1314 static const fwd_transform_1d_col_many_neon
1315     col_highbd_txfm8_xn_arr[TX_TYPES] = {
1316       highbd_fdct8_col_many_neon,       // DCT_DCT
1317       highbd_fadst8_col_many_neon,      // ADST_DCT
1318       highbd_fdct8_col_many_neon,       // DCT_ADST
1319       highbd_fadst8_col_many_neon,      // ADST_ADST
1320       highbd_fadst8_col_many_neon,      // FLIPADST_DCT
1321       highbd_fdct8_col_many_neon,       // DCT_FLIPADST
1322       highbd_fadst8_col_many_neon,      // FLIPADST_FLIPADST
1323       highbd_fadst8_col_many_neon,      // ADST_FLIPADST
1324       highbd_fadst8_col_many_neon,      // FLIPADST_ADST
1325       highbd_fidentity8_col_many_neon,  // IDTX
1326       highbd_fdct8_col_many_neon,       // V_DCT
1327       highbd_fidentity8_col_many_neon,  // H_DCT
1328       highbd_fadst8_col_many_neon,      // V_ADST
1329       highbd_fidentity8_col_many_neon,  // H_ADST
1330       highbd_fadst8_col_many_neon,      // V_FLIPADST
1331       highbd_fidentity8_col_many_neon   // H_FLIPADST
1332     };
1333 
1334 static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
1335   highbd_fdct8_col_neon,       // DCT_DCT
1336   highbd_fadst8_col_neon,      // ADST_DCT
1337   highbd_fdct8_col_neon,       // DCT_ADST
1338   highbd_fadst8_col_neon,      // ADST_ADST
1339   highbd_fadst8_col_neon,      // FLIPADST_DCT
1340   highbd_fdct8_col_neon,       // DCT_FLIPADST
1341   highbd_fadst8_col_neon,      // FLIPADST_FLIPADST
1342   highbd_fadst8_col_neon,      // ADST_FLIPADST
1343   highbd_fadst8_col_neon,      // FLIPADST_ADST
1344   highbd_fidentity8_col_neon,  // IDTX
1345   highbd_fdct8_col_neon,       // V_DCT
1346   highbd_fidentity8_col_neon,  // H_DCT
1347   highbd_fadst8_col_neon,      // V_ADST
1348   highbd_fidentity8_col_neon,  // H_ADST
1349   highbd_fadst8_col_neon,      // V_FLIPADST
1350   highbd_fidentity8_col_neon   // H_FLIPADST
1351 };
1352 
1353 static const fwd_transform_1d_col_many_neon
1354     col_highbd_txfm16_xn_arr[TX_TYPES] = {
1355       highbd_fdct16_col_many_neon,       // DCT_DCT
1356       highbd_fadst16_col_many_neon,      // ADST_DCT
1357       highbd_fdct16_col_many_neon,       // DCT_ADST
1358       highbd_fadst16_col_many_neon,      // ADST_ADST
1359       highbd_fadst16_col_many_neon,      // FLIPADST_DCT
1360       highbd_fdct16_col_many_neon,       // DCT_FLIPADST
1361       highbd_fadst16_col_many_neon,      // FLIPADST_FLIPADST
1362       highbd_fadst16_col_many_neon,      // ADST_FLIPADST
1363       highbd_fadst16_col_many_neon,      // FLIPADST_ADST
1364       highbd_fidentity16_col_many_neon,  // IDTX
1365       highbd_fdct16_col_many_neon,       // V_DCT
1366       highbd_fidentity16_col_many_neon,  // H_DCT
1367       highbd_fadst16_col_many_neon,      // V_ADST
1368       highbd_fidentity16_col_many_neon,  // H_ADST
1369       highbd_fadst16_col_many_neon,      // V_FLIPADST
1370       highbd_fidentity16_col_many_neon   // H_FLIPADST
1371     };
1372 
1373 static const fwd_transform_1d_col_many_neon
1374     col_highbd_txfm4_xn_arr[TX_TYPES] = {
1375       highbd_fdct4_col_many_neon,       // DCT_DCT
1376       highbd_fadst4_col_many_neon,      // ADST_DCT
1377       highbd_fdct4_col_many_neon,       // DCT_ADST
1378       highbd_fadst4_col_many_neon,      // ADST_ADST
1379       highbd_fadst4_col_many_neon,      // FLIPADST_DCT
1380       highbd_fdct4_col_many_neon,       // DCT_FLIPADST
1381       highbd_fadst4_col_many_neon,      // FLIPADST_FLIPADST
1382       highbd_fadst4_col_many_neon,      // ADST_FLIPADST
1383       highbd_fadst4_col_many_neon,      // FLIPADST_ADST
1384       highbd_fidentity4_col_many_neon,  // IDTX
1385       highbd_fdct4_col_many_neon,       // V_DCT
1386       highbd_fidentity4_col_many_neon,  // H_DCT
1387       highbd_fadst4_col_many_neon,      // V_ADST
1388       highbd_fidentity4_col_many_neon,  // H_ADST
1389       highbd_fadst4_col_many_neon,      // V_FLIPADST
1390       highbd_fidentity4_col_many_neon   // H_FLIPADST
1391     };
1392 
1393 static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
1394   highbd_fdct16_row_neon,       // DCT_DCT
1395   highbd_fdct16_row_neon,       // ADST_DCT
1396   highbd_fadst16_row_neon,      // DCT_ADST
1397   highbd_fadst16_row_neon,      // ADST_ADST
1398   highbd_fdct16_row_neon,       // FLIPADST_DCT
1399   highbd_fadst16_row_neon,      // DCT_FLIPADST
1400   highbd_fadst16_row_neon,      // FLIPADST_FLIPADST
1401   highbd_fadst16_row_neon,      // ADST_FLIPADST
1402   highbd_fadst16_row_neon,      // FLIPADST_ADST
1403   highbd_fidentity16_row_neon,  // IDTX
1404   highbd_fidentity16_row_neon,  // V_DCT
1405   highbd_fdct16_row_neon,       // H_DCT
1406   highbd_fidentity16_row_neon,  // V_ADST
1407   highbd_fadst16_row_neon,      // H_ADST
1408   highbd_fidentity16_row_neon,  // V_FLIPADST
1409   highbd_fadst16_row_neon       // H_FLIPADST
1410 };
1411 
1412 static const fwd_transform_1d_row_many_neon
1413     row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
1414       highbd_fdct16_row_rect_many_neon,       // DCT_DCT
1415       highbd_fdct16_row_rect_many_neon,       // ADST_DCT
1416       highbd_fadst16_row_rect_many_neon,      // DCT_ADST
1417       highbd_fadst16_row_rect_many_neon,      // ADST_ADST
1418       highbd_fdct16_row_rect_many_neon,       // FLIPADST_DCT
1419       highbd_fadst16_row_rect_many_neon,      // DCT_FLIPADST
1420       highbd_fadst16_row_rect_many_neon,      // FLIPADST_FLIPADST
1421       highbd_fadst16_row_rect_many_neon,      // ADST_FLIPADST
1422       highbd_fadst16_row_rect_many_neon,      // FLIPADST_ADST
1423       highbd_fidentity16_row_rect_many_neon,  // IDTX
1424       highbd_fidentity16_row_rect_many_neon,  // V_DCT
1425       highbd_fdct16_row_rect_many_neon,       // H_DCT
1426       highbd_fidentity16_row_rect_many_neon,  // V_ADST
1427       highbd_fadst16_row_rect_many_neon,      // H_ADST
1428       highbd_fidentity16_row_rect_many_neon,  // V_FLIPADST
1429       highbd_fadst16_row_rect_many_neon       // H_FLIPADST
1430     };
1431 
1432 #if !CONFIG_REALTIME_ONLY
1433 static const fwd_transform_1d_row_many_neon
1434     row_highbd_txfm8_xn_arr[TX_TYPES] = {
1435       highbd_fdct8_row_many_neon,       // DCT_DCT
1436       highbd_fdct8_row_many_neon,       // ADST_DCT
1437       highbd_fadst8_row_many_neon,      // DCT_ADST
1438       highbd_fadst8_row_many_neon,      // ADST_ADST
1439       highbd_fdct8_row_many_neon,       // FLIPADST_DCT
1440       highbd_fadst8_row_many_neon,      // DCT_FLIPADST
1441       highbd_fadst8_row_many_neon,      // FLIPADST_FLIPADST
1442       highbd_fadst8_row_many_neon,      // ADST_FLIPADST
1443       highbd_fadst8_row_many_neon,      // FLIPADST_ADST
1444       highbd_fidentity8_row_many_neon,  // IDTX
1445       highbd_fidentity8_row_many_neon,  // V_DCT
1446       highbd_fdct8_row_many_neon,       // H_DCT
1447       highbd_fidentity8_row_many_neon,  // V_ADST
1448       highbd_fadst8_row_many_neon,      // H_ADST
1449       highbd_fidentity8_row_many_neon,  // V_FLIPADST
1450       highbd_fadst8_row_many_neon       // H_FLIPADST
1451     };
1452 #endif
1453 
1454 static const fwd_transform_1d_row_many_neon
1455     row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
1456       highbd_fdct8_row_rect_many_neon,       // DCT_DCT
1457       highbd_fdct8_row_rect_many_neon,       // ADST_DCT
1458       highbd_fadst8_row_rect_many_neon,      // DCT_ADST
1459       highbd_fadst8_row_rect_many_neon,      // ADST_ADST
1460       highbd_fdct8_row_rect_many_neon,       // FLIPADST_DCT
1461       highbd_fadst8_row_rect_many_neon,      // DCT_FLIPADST
1462       highbd_fadst8_row_rect_many_neon,      // FLIPADST_FLIPADST
1463       highbd_fadst8_row_rect_many_neon,      // ADST_FLIPADST
1464       highbd_fadst8_row_rect_many_neon,      // FLIPADST_ADST
1465       highbd_fidentity8_row_rect_many_neon,  // IDTX
1466       highbd_fidentity8_row_rect_many_neon,  // V_DCT
1467       highbd_fdct8_row_rect_many_neon,       // H_DCT
1468       highbd_fidentity8_row_rect_many_neon,  // V_ADST
1469       highbd_fadst8_row_rect_many_neon,      // H_ADST
1470       highbd_fidentity8_row_rect_many_neon,  // V_FLIPADST
1471       highbd_fadst8_row_rect_many_neon       // H_FLIPADST
1472     };
1473 
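// Note: despite the plain _row_ name, these entries are the _rect_ kernels;
// the only caller of this table in this file (8x4) is a rectangular size.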
1474 static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
1475   highbd_fdct8_row_rect_neon,       // DCT_DCT
1476   highbd_fdct8_row_rect_neon,       // ADST_DCT
1477   highbd_fadst8_row_rect_neon,      // DCT_ADST
1478   highbd_fadst8_row_rect_neon,      // ADST_ADST
1479   highbd_fdct8_row_rect_neon,       // FLIPADST_DCT
1480   highbd_fadst8_row_rect_neon,      // DCT_FLIPADST
1481   highbd_fadst8_row_rect_neon,      // FLIPADST_FLIPADST
1482   highbd_fadst8_row_rect_neon,      // ADST_FLIPADST
1483   highbd_fadst8_row_rect_neon,      // FLIPADST_ADST
1484   highbd_fidentity8_row_rect_neon,  // IDTX
1485   highbd_fidentity8_row_rect_neon,  // V_DCT
1486   highbd_fdct8_row_rect_neon,       // H_DCT
1487   highbd_fidentity8_row_rect_neon,  // V_ADST
1488   highbd_fadst8_row_rect_neon,      // H_ADST
1489   highbd_fidentity8_row_rect_neon,  // V_FLIPADST
1490   highbd_fadst8_row_rect_neon       // H_FLIPADST
1491 };
1492 
1493 #if !CONFIG_REALTIME_ONLY
1494 static const fwd_transform_1d_row_many_neon
1495     row_highbd_txfm4_xn_arr[TX_TYPES] = {
1496       highbd_fdct4_row_many_neon,       // DCT_DCT
1497       highbd_fdct4_row_many_neon,       // ADST_DCT
1498       highbd_fadst4_row_many_neon,      // DCT_ADST
1499       highbd_fadst4_row_many_neon,      // ADST_ADST
1500       highbd_fdct4_row_many_neon,       // FLIPADST_DCT
1501       highbd_fadst4_row_many_neon,      // DCT_FLIPADST
1502       highbd_fadst4_row_many_neon,      // FLIPADST_FLIPADST
1503       highbd_fadst4_row_many_neon,      // ADST_FLIPADST
1504       highbd_fadst4_row_many_neon,      // FLIPADST_ADST
1505       highbd_fidentity4_row_many_neon,  // IDTX
1506       highbd_fidentity4_row_many_neon,  // V_DCT
1507       highbd_fdct4_row_many_neon,       // H_DCT
1508       highbd_fidentity4_row_many_neon,  // V_ADST
1509       highbd_fadst4_row_many_neon,      // H_ADST
1510       highbd_fidentity4_row_many_neon,  // V_FLIPADST
1511       highbd_fadst4_row_many_neon       // H_FLIPADST
1512     };
1513 #endif
1514 
1515 static const fwd_transform_1d_row_many_neon
1516     row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
1517       highbd_fdct4_row_rect_many_neon,       // DCT_DCT
1518       highbd_fdct4_row_rect_many_neon,       // ADST_DCT
1519       highbd_fadst4_row_rect_many_neon,      // DCT_ADST
1520       highbd_fadst4_row_rect_many_neon,      // ADST_ADST
1521       highbd_fdct4_row_rect_many_neon,       // FLIPADST_DCT
1522       highbd_fadst4_row_rect_many_neon,      // DCT_FLIPADST
1523       highbd_fadst4_row_rect_many_neon,      // FLIPADST_FLIPADST
1524       highbd_fadst4_row_rect_many_neon,      // ADST_FLIPADST
1525       highbd_fadst4_row_rect_many_neon,      // FLIPADST_ADST
1526       highbd_fidentity4_row_rect_many_neon,  // IDTX
1527       highbd_fidentity4_row_rect_many_neon,  // V_DCT
1528       highbd_fdct4_row_rect_many_neon,       // H_DCT
1529       highbd_fidentity4_row_rect_many_neon,  // V_ADST
1530       highbd_fadst4_row_rect_many_neon,      // H_ADST
1531       highbd_fidentity4_row_rect_many_neon,  // V_FLIPADST
1532       highbd_fadst4_row_rect_many_neon       // H_FLIPADST
1533     };
1534 
1535 static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
1536                                   int cos_bit) {
1537   const int32_t *const cospi = cospi_arr_s32(cos_bit);
1538   const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
1539 
1540   // Workspaces for intermediate transform steps.
1541   int32x4_t buf0[32];
1542   int32x4_t buf1[32];
1543 
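  // Stages 1-8 follow the standard radix-2 DCT decomposition:
  // butterfly_dct_pre forms sum/difference pairs from the two halves of its
  // input, and butterfly_dct_post recombines partial results after the cospi
  // rotations.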
1544   // stage 1
1545   butterfly_dct_pre(input, buf1, 32);
1546 
1547   // stage 2
1548   butterfly_dct_pre(buf1, buf0, 16);
1549   buf0[16] = buf1[16];
1550   buf0[17] = buf1[17];
1551   buf0[18] = buf1[18];
1552   buf0[19] = buf1[19];
1553   butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
1554                       v_cos_bit);
1555   butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
1556                       v_cos_bit);
1557   butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
1558                       v_cos_bit);
1559   butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
1560                       v_cos_bit);
1561   buf0[28] = buf1[28];
1562   buf0[29] = buf1[29];
1563   buf0[30] = buf1[30];
1564   buf0[31] = buf1[31];
1565 
1566   // stage 3
1567   butterfly_dct_pre(buf0, buf1, 8);
1568   buf1[8] = buf0[8];
1569   buf1[9] = buf0[9];
1570   butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
1571                       v_cos_bit);
1572   butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
1573                       v_cos_bit);
1574   buf1[14] = buf0[14];
1575   buf1[15] = buf0[15];
1576   butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);
1577 
1578   // stage 4
1579   butterfly_dct_pre(buf1, buf0, 4);
1580   buf0[4] = buf1[4];
1581   butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
1582                       v_cos_bit);
1583   buf0[7] = buf1[7];
1584   butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
1585   buf0[16] = buf1[16];
1586   buf0[17] = buf1[17];
1587   butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
1588                       v_cos_bit);
1589   butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
1590                       v_cos_bit);
1591   butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
1592                       v_cos_bit);
1593   butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
1594                       v_cos_bit);
1595   buf0[22] = buf1[22];
1596   buf0[23] = buf1[23];
1597   buf0[24] = buf1[24];
1598   buf0[25] = buf1[25];
1599   buf0[30] = buf1[30];
1600   buf0[31] = buf1[31];
1601 
1602   // stage 5
1603   butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
1604                       v_cos_bit);
1605   butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
1606                       v_cos_bit);
1607   butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
1608   buf1[8] = buf0[8];
1609   butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
1610                       v_cos_bit);
1611   butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
1612                       v_cos_bit);
1613   buf1[11] = buf0[11];
1614   buf1[12] = buf0[12];
1615   buf1[15] = buf0[15];
1616   butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
1617   butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);
1618 
1619   // stage 6
1620   buf0[0] = buf1[0];
1621   buf0[1] = buf1[1];
1622   buf0[2] = buf1[2];
1623   buf0[3] = buf1[3];
1624 
1625   butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
1626                       v_cos_bit);
1627   butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
1628                       v_cos_bit);
1629   butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
1630                       v_cos_bit);
1631   butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
1632   butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
1633   buf0[16] = buf1[16];
1634   buf0[19] = buf1[19];
1635   buf0[20] = buf1[20];
1636 
1637   butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
1638                       v_cos_bit);
1639   butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
1640                       v_cos_bit);
1641   butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
1642                       v_cos_bit);
1643 
1644   buf0[23] = buf1[23];
1645   buf0[24] = buf1[24];
1646   buf0[27] = buf1[27];
1647   buf0[28] = buf1[28];
1648   buf0[31] = buf1[31];
1649 
1650   // stage 7
1651   buf1[0] = buf0[0];
1652   buf1[1] = buf0[1];
1653   buf1[2] = buf0[2];
1654   buf1[3] = buf0[3];
1655   buf1[4] = buf0[4];
1656   buf1[5] = buf0[5];
1657   buf1[6] = buf0[6];
1658   buf1[7] = buf0[7];
1659   butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
1660                       v_cos_bit);
1661   butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
1662                       v_cos_bit);
1663   butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
1664                       v_cos_bit);
1665   butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
1666                       v_cos_bit);
1667   butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
1668   butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
1669   butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
1670   butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);
1671 
1672   // stage 8
1673   buf0[0] = buf1[0];
1674   buf0[1] = buf1[1];
1675   buf0[2] = buf1[2];
1676   buf0[3] = buf1[3];
1677   buf0[4] = buf1[4];
1678   buf0[5] = buf1[5];
1679   buf0[6] = buf1[6];
1680   buf0[7] = buf1[7];
1681   buf0[8] = buf1[8];
1682   buf0[9] = buf1[9];
1683   buf0[10] = buf1[10];
1684   buf0[11] = buf1[11];
1685   buf0[12] = buf1[12];
1686   buf0[13] = buf1[13];
1687   buf0[14] = buf1[14];
1688   buf0[15] = buf1[15];
1689   butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
1690                       v_cos_bit);
1691   butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
1692                       v_cos_bit);
1693   butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
1694                       v_cos_bit);
1695   butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
1696                       v_cos_bit);
1697   butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
1698                       v_cos_bit);
1699   butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
1700                       v_cos_bit);
1701   butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
1702                       v_cos_bit);
1703   butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
1704                       v_cos_bit);
1705 
1706   // stage 9
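  // The DCT coefficients emerge in bit-reversed index order (5 bits here),
  // so copy them out with the corresponding permutation.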
1707   output[0] = buf0[0];
1708   output[1] = buf0[16];
1709   output[2] = buf0[8];
1710   output[3] = buf0[24];
1711   output[4] = buf0[4];
1712   output[5] = buf0[20];
1713   output[6] = buf0[12];
1714   output[7] = buf0[28];
1715   output[8] = buf0[2];
1716   output[9] = buf0[18];
1717   output[10] = buf0[10];
1718   output[11] = buf0[26];
1719   output[12] = buf0[6];
1720   output[13] = buf0[22];
1721   output[14] = buf0[14];
1722   output[15] = buf0[30];
1723   output[16] = buf0[1];
1724   output[17] = buf0[17];
1725   output[18] = buf0[9];
1726   output[19] = buf0[25];
1727   output[20] = buf0[5];
1728   output[21] = buf0[21];
1729   output[22] = buf0[13];
1730   output[23] = buf0[29];
1731   output[24] = buf0[3];
1732   output[25] = buf0[19];
1733   output[26] = buf0[11];
1734   output[27] = buf0[27];
1735   output[28] = buf0[7];
1736   output[29] = buf0[23];
1737   output[30] = buf0[15];
1738   output[31] = buf0[31];
1739 }
1740 
1741 static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
1742                                   int8_t cos_bit) {
1743   const int32_t *const cospi = cospi_arr_s32(cos_bit);
1744   const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
1745 
1746   // stage 1
1747   int32x4_t x1[64];
1748   butterfly_dct_pre(input, x1, 64);
1749 
1750   // stage 2
1751   int32x4_t x2[64];
1752   butterfly_dct_pre(x1, x2, 32);
1753   x2[32] = x1[32];
1754   x2[33] = x1[33];
1755   x2[34] = x1[34];
1756   x2[35] = x1[35];
1757   x2[36] = x1[36];
1758   x2[37] = x1[37];
1759   x2[38] = x1[38];
1760   x2[39] = x1[39];
1761   butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
1762   butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
1763   butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
1764   butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
1765   butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
1766   butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
1767   butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
1768   butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
1769   x2[56] = x1[56];
1770   x2[57] = x1[57];
1771   x2[58] = x1[58];
1772   x2[59] = x1[59];
1773   x2[60] = x1[60];
1774   x2[61] = x1[61];
1775   x2[62] = x1[62];
1776   x2[63] = x1[63];
1777 
1778   // stage 3
1779   int32x4_t x3[64];
1780   butterfly_dct_pre(x2, x3, 16);
1781   x3[16] = x2[16];
1782   x3[17] = x2[17];
1783   x3[18] = x2[18];
1784   x3[19] = x2[19];
1785   butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
1786   butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
1787   butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
1788   butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
1789   x3[28] = x2[28];
1790   x3[29] = x2[29];
1791   x3[30] = x2[30];
1792   x3[31] = x2[31];
1793   butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);
1794 
1795   // stage 4
1796   int32x4_t x4[64];
1797   butterfly_dct_pre(x3, x4, 8);
1798   x4[8] = x3[8];
1799   x4[9] = x3[9];
1800   butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
1801   butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
1802   x4[14] = x3[14];
1803   x4[15] = x3[15];
1804   butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
1805   x4[32] = x3[32];
1806   x4[33] = x3[33];
1807   x4[34] = x3[34];
1808   x4[35] = x3[35];
1809   butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
1810   butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
1811   butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
1812   butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
1813   butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
1814   butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
1815   butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
1816   butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
1817   x4[44] = x3[44];
1818   x4[45] = x3[45];
1819   x4[46] = x3[46];
1820   x4[47] = x3[47];
1821   x4[48] = x3[48];
1822   x4[49] = x3[49];
1823   x4[50] = x3[50];
1824   x4[51] = x3[51];
1825   x4[60] = x3[60];
1826   x4[61] = x3[61];
1827   x4[62] = x3[62];
1828   x4[63] = x3[63];
1829 
1830   // stage 5
1831   int32x4_t x5[64];
1832   butterfly_dct_pre(x4, x5, 4);
1833   x5[4] = x4[4];
1834   butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
1835   x5[7] = x4[7];
1836   butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
1837   x5[16] = x4[16];
1838   x5[17] = x4[17];
1839   butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
1840   butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
1841   butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
1842   butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
1843   x5[22] = x4[22];
1844   x5[23] = x4[23];
1845   x5[24] = x4[24];
1846   x5[25] = x4[25];
1847   x5[30] = x4[30];
1848   x5[31] = x4[31];
1849   butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
1850   butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);
1851 
1852   // stage 6
1853   int32x4_t x6[64];
1854   butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
1855   butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
1856   butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
1857   x6[8] = x5[8];
1858   butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
1859   butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
1860   x6[11] = x5[11];
1861   x6[12] = x5[12];
1862   x6[15] = x5[15];
1863   butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
1864   butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
1865   x6[32] = x5[32];
1866   x6[33] = x5[33];
1867   butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
1868   butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
1869   butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
1870   butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
1871   x6[38] = x5[38];
1872   x6[39] = x5[39];
1873   x6[40] = x5[40];
1874   x6[41] = x5[41];
1875   butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
1876   butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
1877   butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
1878   butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
1879   x6[46] = x5[46];
1880   x6[47] = x5[47];
1881   x6[48] = x5[48];
1882   x6[49] = x5[49];
1883   x6[54] = x5[54];
1884   x6[55] = x5[55];
1885   x6[56] = x5[56];
1886   x6[57] = x5[57];
1887   x6[62] = x5[62];
1888   x6[63] = x5[63];
1889 
1890   // stage 7
1891   int32x4_t x7[64];
1892   x7[0] = x6[0];
1893   x7[1] = x6[1];
1894   x7[2] = x6[2];
1895   x7[3] = x6[3];
1896   butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
1897   butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
1898   butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
1899   butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
1900   x7[16] = x6[16];
1901   butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
1902   butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
1903   x7[19] = x6[19];
1904   x7[20] = x6[20];
1905   butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
1906   butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
1907   x7[23] = x6[23];
1908   x7[24] = x6[24];
1909   x7[27] = x6[27];
1910   x7[28] = x6[28];
1911   x7[31] = x6[31];
1912   butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
1913   butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
1914   butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
1915   butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);
1916 
1917   // stage 8
1918   int32x4_t x8[64];
1919   x8[0] = x7[0];
1920   x8[1] = x7[1];
1921   x8[2] = x7[2];
1922   x8[3] = x7[3];
1923   x8[4] = x7[4];
1924   x8[5] = x7[5];
1925   x8[6] = x7[6];
1926   x8[7] = x7[7];
1927 
1928   butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
1929   butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
1930   butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
1931   butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
1932   butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
1933   butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
1934   butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
1935   butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
1936   x8[32] = x7[32];
1937   butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
1938   butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
1939   x8[35] = x7[35];
1940   x8[36] = x7[36];
1941   butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
1942   butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
1943   x8[39] = x7[39];
1944   x8[40] = x7[40];
1945   butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
1946   butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
1947   x8[43] = x7[43];
1948   x8[44] = x7[44];
1949   butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
1950   butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
1951   x8[47] = x7[47];
1952   x8[48] = x7[48];
1953   x8[51] = x7[51];
1954   x8[52] = x7[52];
1955   x8[55] = x7[55];
1956   x8[56] = x7[56];
1957   x8[59] = x7[59];
1958   x8[60] = x7[60];
1959   x8[63] = x7[63];
1960 
1961   // stage 9
1962   int32x4_t x9[64];
1963   x9[0] = x8[0];
1964   x9[1] = x8[1];
1965   x9[2] = x8[2];
1966   x9[3] = x8[3];
1967   x9[4] = x8[4];
1968   x9[5] = x8[5];
1969   x9[6] = x8[6];
1970   x9[7] = x8[7];
1971   x9[8] = x8[8];
1972   x9[9] = x8[9];
1973   x9[10] = x8[10];
1974   x9[11] = x8[11];
1975   x9[12] = x8[12];
1976   x9[13] = x8[13];
1977   x9[14] = x8[14];
1978   x9[15] = x8[15];
1979   butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
1980   butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
1981   butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
1982   butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
1983   butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
1984   butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
1985   butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
1986   butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
1987   butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
1988   butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
1989   butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
1990   butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
1991   butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
1992   butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
1993   butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
1994   butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);
1995 
1996   // stage 10
1997   int32x4_t x10[64];
1998   x10[0] = x9[0];
1999   x10[1] = x9[1];
2000   x10[2] = x9[2];
2001   x10[3] = x9[3];
2002   x10[4] = x9[4];
2003   x10[5] = x9[5];
2004   x10[6] = x9[6];
2005   x10[7] = x9[7];
2006   x10[8] = x9[8];
2007   x10[9] = x9[9];
2008   x10[10] = x9[10];
2009   x10[11] = x9[11];
2010   x10[12] = x9[12];
2011   x10[13] = x9[13];
2012   x10[14] = x9[14];
2013   x10[15] = x9[15];
2014   x10[16] = x9[16];
2015   x10[17] = x9[17];
2016   x10[18] = x9[18];
2017   x10[19] = x9[19];
2018   x10[20] = x9[20];
2019   x10[21] = x9[21];
2020   x10[22] = x9[22];
2021   x10[23] = x9[23];
2022   x10[24] = x9[24];
2023   x10[25] = x9[25];
2024   x10[26] = x9[26];
2025   x10[27] = x9[27];
2026   x10[28] = x9[28];
2027   x10[29] = x9[29];
2028   x10[30] = x9[30];
2029   x10[31] = x9[31];
2030   butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
2031   butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
2032   butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
2033   butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
2034   butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
2035   butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
2036   butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
2037   butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
2038   butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
2039   butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
2040   butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
2041   butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
2042   butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
2043   butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
2044   butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
2045   butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);
2046 
2047   // stage 11
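  // As in highbd_fdct32_x4_neon above, the coefficients are written out in
  // bit-reversed index order (here over 6 bits).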
2048   output[0] = x10[0];
2049   output[1] = x10[32];
2050   output[2] = x10[16];
2051   output[3] = x10[48];
2052   output[4] = x10[8];
2053   output[5] = x10[40];
2054   output[6] = x10[24];
2055   output[7] = x10[56];
2056   output[8] = x10[4];
2057   output[9] = x10[36];
2058   output[10] = x10[20];
2059   output[11] = x10[52];
2060   output[12] = x10[12];
2061   output[13] = x10[44];
2062   output[14] = x10[28];
2063   output[15] = x10[60];
2064   output[16] = x10[2];
2065   output[17] = x10[34];
2066   output[18] = x10[18];
2067   output[19] = x10[50];
2068   output[20] = x10[10];
2069   output[21] = x10[42];
2070   output[22] = x10[26];
2071   output[23] = x10[58];
2072   output[24] = x10[6];
2073   output[25] = x10[38];
2074   output[26] = x10[22];
2075   output[27] = x10[54];
2076   output[28] = x10[14];
2077   output[29] = x10[46];
2078   output[30] = x10[30];
2079   output[31] = x10[62];
2080   output[32] = x10[1];
2081   output[33] = x10[33];
2082   output[34] = x10[17];
2083   output[35] = x10[49];
2084   output[36] = x10[9];
2085   output[37] = x10[41];
2086   output[38] = x10[25];
2087   output[39] = x10[57];
2088   output[40] = x10[5];
2089   output[41] = x10[37];
2090   output[42] = x10[21];
2091   output[43] = x10[53];
2092   output[44] = x10[13];
2093   output[45] = x10[45];
2094   output[46] = x10[29];
2095   output[47] = x10[61];
2096   output[48] = x10[3];
2097   output[49] = x10[35];
2098   output[50] = x10[19];
2099   output[51] = x10[51];
2100   output[52] = x10[11];
2101   output[53] = x10[43];
2102   output[54] = x10[27];
2103   output[55] = x10[59];
2104   output[56] = x10[7];
2105   output[57] = x10[39];
2106   output[58] = x10[23];
2107   output[59] = x10[55];
2108   output[60] = x10[15];
2109   output[61] = x10[47];
2110   output[62] = x10[31];
2111   output[63] = x10[63];
2112 }
2113 
2114 static void highbd_fidentity32_x4_neon(const int32x4_t *input,
2115                                        int32x4_t *output, int cos_bit) {
2116   (void)cos_bit;
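  // The 32-point identity transform simply scales each coefficient by 4
  // (<< 2); no cospi rotations are needed.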
2117   for (int i = 0; i < 32; i++) {
2118     output[i] = vshlq_n_s32(input[i], 2);
2119   }
2120 }
2121 
2122 TRANSFORM_COL_MANY(fdct32, 32)
2123 TRANSFORM_COL_MANY(fidentity32, 32)
2124 
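// AV1 defines no 32-point ADST, so only the DCT_DCT and IDTX entries are
// populated; the remaining TX_TYPEs never reach these tables.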
2125 static const fwd_transform_1d_col_many_neon
2126     col_highbd_txfm32_x4_arr[TX_TYPES] = {
2127       highbd_fdct32_col_many_neon,       // DCT_DCT
2128       NULL,                              // ADST_DCT
2129       NULL,                              // DCT_ADST
2130       NULL,                              // ADST_ADST
2131       NULL,                              // FLIPADST_DCT
2132       NULL,                              // DCT_FLIPADST
2133       NULL,                              // FLIPADST_FLIPADST
2134       NULL,                              // ADST_FLIPADST
2135       NULL,                              // FLIPADST_ADST
2136       highbd_fidentity32_col_many_neon,  // IDTX
2137       NULL,                              // V_DCT
2138       NULL,                              // H_DCT
2139       NULL,                              // V_ADST
2140       NULL,                              // H_ADST
2141       NULL,                              // V_FLIPADST
2142       NULL                               // H_FLIPADST
2143     };
2144 
2145 TRANSFORM_ROW_MANY(fdct32, 32)
2146 TRANSFORM_ROW_MANY(fidentity32, 32)
2147 
2148 static const fwd_transform_1d_row_many_neon
2149     row_highbd_txfm32_x4_arr[TX_TYPES] = {
2150       highbd_fdct32_row_many_neon,       // DCT_DCT
2151       NULL,                              // ADST_DCT
2152       NULL,                              // DCT_ADST
2153       NULL,                              // ADST_ADST
2154       NULL,                              // FLIPADST_DCT
2155       NULL,                              // DCT_FLIPADST
2156       NULL,                              // FLIPADST_FLIPADST
2157       NULL,                              // ADST_FLIPADST
2158       NULL,                              // FLIPADST_ADST
2159       highbd_fidentity32_row_many_neon,  // IDTX
2160       NULL,                              // V_DCT
2161       NULL,                              // H_DCT
2162       NULL,                              // V_ADST
2163       NULL,                              // H_ADST
2164       NULL,                              // V_FLIPADST
2165       NULL                               // H_FLIPADST
2166     };
2167 
2168 TRANSFORM_ROW_RECT_MANY(fdct32, 32)
2169 TRANSFORM_ROW_RECT_MANY(fidentity32, 32)
2170 
2171 static const fwd_transform_1d_row_many_neon
2172     row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
2173       highbd_fdct32_row_rect_many_neon,       // DCT_DCT
2174       NULL,                                   // ADST_DCT
2175       NULL,                                   // DCT_ADST
2176       NULL,                                   // ADST_ADST
2177       NULL,                                   // FLIPADST_DCT
2178       NULL,                                   // DCT_FLIPADST
2179       NULL,                                   // FLIPADST_FLIPADST
2180       NULL,                                   // ADST_FLIPADST
2181       NULL,                                   // FLIPADST_ADST
2182       highbd_fidentity32_row_rect_many_neon,  // IDTX
2183       NULL,                                   // V_DCT
2184       NULL,                                   // H_DCT
2185       NULL,                                   // V_ADST
2186       NULL,                                   // H_ADST
2187       NULL,                                   // V_FLIPADST
2188       NULL                                    // H_FLIPADST
2189     };
2190 
2191 void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
2192                               TX_TYPE tx_type, int bd) {
2193   (void)bd;
2194   const fwd_transform_1d_col_many_neon col_txfm =
2195       col_highbd_txfm8_xn_arr[tx_type];
2196   const fwd_transform_1d_row_many_neon row_txfm =
2197       row_rect_highbd_txfm16_xn_arr[tx_type];
2198   int bit = av1_fwd_cos_bit_col[2][1];
2199 
2200   int ud_flip, lr_flip;
2201   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2202   ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
2203 
2204   // Column-wise transform.
2205   int32x4_t buf0[32];
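  // For lr_flip the four 4-column groups are written in reverse order:
  // start at the last group (buf0 + 3 * 8) and step backwards with a
  // negative hm_stride.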
2206   if (lr_flip) {
2207     col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
2208              /*hm_stride=*/-8);
2209   } else {
2210     col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
2211              /*hm_stride=*/8);
2212   }
2213   shift_right_2_round_s32_x4(buf0, buf0, 32);
2214 
2215   int32x4_t buf1[32];
2216   transpose_arrays_s32_16x8(buf0, buf1);
2217 
2218   // Row-wise transform.
2219   row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
2220 }
2221 
2222 void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
2223                               TX_TYPE tx_type, int bd) {
2224   (void)bd;
2225   const fwd_transform_1d_col_many_neon col_txfm =
2226       col_highbd_txfm16_xn_arr[tx_type];
2227   const fwd_transform_1d_row_many_neon row_txfm =
2228       row_rect_highbd_txfm8_xn_arr[tx_type];
2229   int bit = av1_fwd_cos_bit_col[1][2];
2230 
2231   int ud_flip, lr_flip;
2232   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2233   ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
2234 
2235   // Column-wise transform.
2236   int32x4_t buf0[32];
2237   if (lr_flip) {
2238     col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
2239              /*hm_stride=*/-16);
2240   } else {
2241     col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
2242              /*hm_stride=*/16);
2243   }
2244   shift_right_2_round_s32_x4(buf0, buf0, 32);
2245 
2246   int32x4_t buf1[32];
2247   transpose_arrays_s32_8x16(buf0, buf1);
2248 
2249   // Row-wise transform.
2250   row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
2251 }
2252 
2253 #if !CONFIG_REALTIME_ONLY
2254 void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
2255                               TX_TYPE tx_type, int bd) {
2256   (void)bd;
2257   int bitcol = av1_fwd_cos_bit_col[0][2];
2258   int bitrow = av1_fwd_cos_bit_row[0][2];
2259   const fwd_transform_1d_col_many_neon col_txfm =
2260       col_highbd_txfm16_xn_arr[tx_type];
2261   const fwd_transform_1d_row_many_neon row_txfm =
2262       row_highbd_txfm4_xn_arr[tx_type];
2263 
2264   int ud_flip, lr_flip;
2265   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2266   ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
2267 
2268   // Column-wise transform.
2269   int32x4_t buf0[16];
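  // Only a single 4-wide column group here, so lr_flip is handled entirely
  // inside load_buffer and hm_stride is unused.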
2270   if (lr_flip) {
2271     col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
2272              /*hm_stride=*/0);
2273   } else {
2274     col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
2275              /*hm_stride=*/0);
2276   }
2277   shift_right_1_round_s32_x4(buf0, buf0, 16);
2278 
2279   int32x4_t buf1[16];
2280   transpose_arrays_s32_4x16(buf0, buf1);
2281 
2282   // Row-wise transform.
2283   row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
2284 }
2285 #endif
2286 
2287 void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
2288                               TX_TYPE tx_type, int bd) {
2289   (void)bd;
2290   int bitcol = av1_fwd_cos_bit_col[2][0];
2291   int bitrow = av1_fwd_cos_bit_row[2][0];
2292   const fwd_transform_1d_col_many_neon col_txfm =
2293       col_highbd_txfm4_xn_arr[tx_type];
2294   const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];
2295 
2296   int ud_flip, lr_flip;
2297   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2298   ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
2299 
2300   // Column-wise transform.
2301   int32x4_t buf0[16];
2302   if (lr_flip) {
2303     col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
2304              /*hm_stride=*/-4);
2305   } else {
2306     col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
2307              /*hm_stride=*/4);
2308   }
2309 
2310   shift_right_1_round_s32_x4(buf0, buf0, 16);
2311   transpose_arrays_s32_4x16(buf0, buf0);
2312 
2313   // Row-wise transform.
2314   row_txfm(buf0, coeff, bitrow, /*stride=*/4);
2315 }
2316 
2317 void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
2318                                TX_TYPE tx_type, int bd) {
2319   (void)bd;
2320   const fwd_transform_1d_col_many_neon col_txfm =
2321       col_highbd_txfm32_x4_arr[tx_type];
2322   const fwd_transform_1d_row_many_neon row_txfm =
2323       row_rect_highbd_txfm16_xn_arr[tx_type];
2324   int bitcol = av1_fwd_cos_bit_col[2][3];
2325   int bitrow = av1_fwd_cos_bit_row[2][3];
2326 
2327   // Column-wise transform.
2328   int32x4_t buf0[128];
2329   col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
2330            /*hm_stride=*/32);
2331   shift_right_4_round_s32_x4(buf0, buf0, 128);
2332 
2333   int32x4_t buf1[128];
2334   transpose_arrays_s32_16x32(buf0, buf1);
2335 
2336   // Row-wise transform.
2337   row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
2338 }
2339 
2340 void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
2341                                TX_TYPE tx_type, int bd) {
2342   (void)bd;
2343   (void)tx_type;
2344   int bitcol = av1_fwd_cos_bit_col[3][4];
2345   int bitrow = av1_fwd_cos_bit_row[3][4];
2346 
2347   // Column-wise transform.
2348   int32x4_t buf0[512];
2349   load_buffer_32x64(input, buf0, stride, 0);
2350   for (int i = 0; i < 8; i++) {
2351     highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
2352   }
2353   shift_right_2_round_s32_x4(buf0, buf0, 512);
2354 
2355   int32x4_t buf1[512];
2356   transpose_arrays_s32_32x64(buf0, buf1);
2357 
2358   // Row-wise transform.
2359   for (int i = 0; i < 16; i++) {
2360     highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
2361   }
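  // Round-shift by 2 combined with the rectangular-size scaling correction.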
2362   round_shift2_rect_array_s32_neon(buf1, buf1, 512);
2363   store_buffer_32x32(buf1, coeff, /*stride=*/32);
2364 }
2365 
2366 void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
2367                                TX_TYPE tx_type, int bd) {
2368   (void)bd;
2369   (void)tx_type;
2370   int bitcol = av1_fwd_cos_bit_col[4][3];
2371   int bitrow = av1_fwd_cos_bit_row[4][3];
2372 
2373   // Column-wise transform.
2374   int32x4_t buf0[512];
2375   load_buffer_64x32(input, buf0, stride, 0);
2376   for (int i = 0; i < 16; i++) {
2377     highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
2378   }
2379   shift_right_4_round_s32_x4(buf0, buf0, 512);
2380 
2381   int32x4_t buf1[512];
2382   transpose_arrays_s32_64x32(buf0, buf1);
2383 
2384   // Row-wise transform.
2385   for (int i = 0; i < 8; i++) {
2386     highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
2387   }
2388   round_shift2_rect_array_s32_neon(buf1, buf1, 512);
2389   store_buffer_64x32(buf1, coeff, /*stride=*/32);
2390 }
2391 
2392 void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
2393                                TX_TYPE tx_type, int bd) {
2394   (void)bd;
2395   const fwd_transform_1d_col_many_neon col_txfm =
2396       col_highbd_txfm16_xn_arr[tx_type];
2397   const fwd_transform_1d_row_many_neon row_txfm =
2398       row_rect_highbd_txfm32_x4_arr[tx_type];
2399   int bitcol = av1_fwd_cos_bit_col[3][2];
2400   int bitrow = av1_fwd_cos_bit_row[3][2];
2401 
2402   // Column-wise transform.
2403   int32x4_t buf0[128];
2404   col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
2405            /*hm_stride=*/16);
2406   shift_right_4_round_s32_x4(buf0, buf0, 128);
2407 
2408   int32x4_t buf1[128];
2409   transpose_arrays_s32_32x16(buf0, buf1);
2410 
2411   // Row-wise transform.
2412   row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
2413 }
2414 
2415 #if !CONFIG_REALTIME_ONLY
2416 void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
2417                               TX_TYPE tx_type, int bd) {
2418   (void)bd;
2419   const fwd_transform_1d_col_many_neon col_txfm =
2420       col_highbd_txfm32_x4_arr[tx_type];
2421   const fwd_transform_1d_row_many_neon row_txfm =
2422       row_highbd_txfm8_xn_arr[tx_type];
2423   int bitcol = av1_fwd_cos_bit_col[1][3];
2424   int bitrow = av1_fwd_cos_bit_row[1][3];
2425 
2426   // Column-wise transform.
2427   int32x4_t buf0[64];
2428   col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
2429            /*hm_stride=*/32);
2430   shift_right_2_round_s32_x4(buf0, buf0, 64);
2431 
2432   int32x4_t buf1[64];
2433   transpose_arrays_s32_8x32(buf0, buf1);
2434 
2435   // Row-wise transform.
2436   row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
2437 }
2438 
2439 void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
2440                               TX_TYPE tx_type, int bd) {
2441   (void)bd;
2442   const fwd_transform_1d_col_many_neon col_txfm =
2443       col_highbd_txfm8_xn_arr[tx_type];
2444   const fwd_transform_1d_row_many_neon row_txfm =
2445       row_highbd_txfm32_x4_arr[tx_type];
2446   int bitcol = av1_fwd_cos_bit_col[3][1];
2447   int bitrow = av1_fwd_cos_bit_row[3][1];
2448 
2449   // Column-wise transform.
2450   int32x4_t buf0[64];
2451   col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
2452            /*hm_stride=*/8);
2453   shift_right_2_round_s32_x4(buf0, buf0, 64);
2454 
2455   int32x4_t buf1[64];
2456   transpose_arrays_s32_32x8(buf0, buf1);
2457 
2458   // Row-wise transform.
2459   row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8);
2460 }
2461 #endif
2462 
2463 void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
2464                              TX_TYPE tx_type, int bd) {
2465   (void)bd;
2466   int bitcol = av1_fwd_cos_bit_col[0][1];
2467   int bitrow = av1_fwd_cos_bit_row[0][1];
2468   const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
2469   const fwd_transform_1d_row_many_neon row_txfm =
2470       row_rect_highbd_txfm4_xn_arr[tx_type];
2471 
2472   int ud_flip, lr_flip;
2473   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2474   ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
2475 
2476   // Column-wise transform.
2477   int32x4_t buf0[8];
2478   col_txfm(input, buf0, stride, bitcol, lr_flip);
2479   shift_right_1_round_s32_x4(buf0, buf0, 8);
2480 
2481   int32x4_t buf1[8];
2482   transpose_arrays_s32_4x8(buf0, buf1);
2483 
2484   // Row-wise transform.
2485   row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
2486 }
2487 
2488 void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
2489                              TX_TYPE tx_type, int bd) {
2490   (void)bd;
2491   const int bitcol = av1_fwd_cos_bit_col[1][0];
2492   const int bitrow = av1_fwd_cos_bit_row[1][0];
2493   const fwd_transform_1d_col_many_neon col_txfm =
2494       col_highbd_txfm4_xn_arr[tx_type];
2495   const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];
2496 
2497   int ud_flip, lr_flip;
2498   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2499   ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
2500 
2501   // Column-wise transform.
2502   int32x4_t buf0[8];
2503   if (lr_flip) {
2504     col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
2505              /*hm_stride=*/-4);
2506   } else {
2507     col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
2508              /*hm_stride=*/4);
2509   }
2510 
2511   shift_right_1_round_s32_x4(buf0, buf0, 8);
2512 
2513   int32x4_t buf1[8];
2514   transpose_arrays_s32_8x4(buf0, buf1);
2515 
2516   // Row-wise transform.
2517   row_txfm(buf1, coeff, bitrow, /*stride=*/4);
2518 }
2519 
2520 #if !CONFIG_REALTIME_ONLY
2521 void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
2522                                TX_TYPE tx_type, int bd) {
2523   (void)bd;
2524   const int bitcol = av1_fwd_cos_bit_col[2][4];
2525   const int bitrow = av1_fwd_cos_bit_row[2][4];
2526 
2527   int ud_flip, lr_flip;
2528   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2529   ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);
2530 
2531   // Column-wise transform.
2532   int32x4_t buf0[256];
2533   load_buffer_16x64(input, buf0, stride, lr_flip);
2534   for (int i = 0; i < 4; i++) {
2535     highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
2536   }
2537   shift_right_2_round_s32_x4(buf0, buf0, 256);
2538 
2539   int32x4_t buf1[256];
2540   transpose_arrays_s32_16x64(buf0, buf1);
2541 
2542   // Row-wise transform.
2543   highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
2544   store_buffer_16x32(buf1, coeff, /*stride=*/32);
2545 }
2546 
2547 void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
2548                                TX_TYPE tx_type, int bd) {
2549   (void)bd;
2550   const int bitcol = av1_fwd_cos_bit_col[4][2];
2551   const int bitrow = av1_fwd_cos_bit_row[4][2];
2552 
2553   int ud_flip, lr_flip;
2554   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2555   ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
2556 
2557   // Column-wise transform.
2558   int32x4_t buf0[256];
2559   load_buffer_64x16(input, buf0, stride, lr_flip);
2560   highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
2561   shift_right_4_round_s32_x4(buf0, buf0, 256);
2562 
2563   int32x4_t buf1[256];
2564   transpose_arrays_s32_64x16(buf0, buf1);
2565 
2566   // Row-wise transform.
2567   for (int i = 0; i < 4; i++) {
2568     highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
2569   }
2570   store_buffer_64x16(buf1, coeff, /*stride=*/16);
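  // Zero the upper half of the output: AV1 only signals the first 32
  // coefficients of a 64-point row transform.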
2571   memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
2572 }
2573 #endif
2574 
2575 void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
2576                                int stride, TX_TYPE tx_type, int bd) {
2577   (void)bd;
2578   const fwd_transform_1d_col_many_neon col_txfm =
2579       col_highbd_txfm32_x4_arr[tx_type];
2580   const fwd_transform_1d_row_many_neon row_txfm =
2581       row_highbd_txfm32_x4_arr[tx_type];
2582 
2583   // Column-wise transform.
2584   int32x4_t buf0[256];
2585   col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
2586            /*hm_stride=*/32);
2587   shift_right_4_round_s32_x4(buf0, buf0, 256);
2588 
2589   int32x4_t buf1[256];
2590   transpose_arrays_s32_32x32(buf0, buf1);
2591 
2592   // Row-wise transform.
2593   row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
2594            /*stride=*/32);
2595 }
2596 
2597 void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
2598                                int stride, TX_TYPE tx_type, int bd) {
2599   (void)bd;
2600   (void)tx_type;
2601 
2602   // Column-wise transform.
2603   int32x4_t buf0[1024];
2604   load_buffer_64x64(input, buf0, stride, 0);
2605   for (int col = 0; col < 16; col++) {
2606     highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
2607   }
2608   shift_right_2_round_s32_x4(buf0, buf0, 1024);
2609 
2610   int32x4_t buf1[1024];
2611   transpose_arrays_s32_64x64(buf0, buf1);
2612 
2613   // Row-wise transform.
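  // Only the 32 rows kept by the truncated transpose above are transformed;
  // AV1 discards the high-frequency half of 64-point transform output.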
2614   for (int col = 0; col < 8; col++) {
2615     highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
2616   }
2617   shift_right_2_round_s32_x4(buf1, buf1, 512);
2618   store_buffer_64x32(buf1, output, /*stride=*/32);
2619 }
2620