/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "aom_dsp/aom_dsp_common.h"  // For AOM_FORCE_INLINE.
#include "config/aom_config.h"

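// Helpers for transposing small fixed-size blocks of elements held in Neon
// registers, used by (among others) the loop filter code (see
// loop_filter_transpose_u16_4x8q below). Naming convention:
// transpose_elems_* take individual vector arguments, transpose_arrays_*
// take arrays of vectors, and *_inplace_* variants overwrite their inputs.
//
// Illustrative usage sketch (not part of the library; `src` and `stride` are
// assumed to be provided by the caller):
//
//   uint8x8_t rows[8];
//   for (int i = 0; i < 8; ++i) rows[i] = vld1_u8(src + i * stride);
//   transpose_elems_inplace_u8_8x8(&rows[0], &rows[1], &rows[2], &rows[3],
//                                  &rows[4], &rows[5], &rows[6], &rows[7]);
//   // rows[i] now holds column i of the original 8x8 block.
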
static inline void transpose_elems_u8_8x8(
    uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
    uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static inline void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2, uint8x8_t *a3,
                                                  uint8x8_t *a4, uint8x8_t *a5,
                                                  uint8x8_t *a6,
                                                  uint8x8_t *a7) {
  transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
                         a4, a5, a6, a7);
}

static inline void transpose_arrays_u8_8x8(const uint8x8_t *in,
                                           uint8x8_t *out) {
  transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                         &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                         &out[6], &out[7]);
}

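// Transpose 16 rows of 8 bytes (x[0..15]) into 8 rows of 16 bytes (d[0..7])
// by zipping progressively wider elements.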
static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
                                                      uint8x16_t *d) {
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
  uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
  uint8x8x2_t w3 = vzip_u8(x[6], x[7]);

  uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
  uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
  uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
  uint8x8x2_t w11 = vzip_u8(x[14], x[15]);

  uint16x4x2_t w4 =
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
  uint16x4x2_t w5 =
      vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
  uint16x4x2_t w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
  uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
                              vreinterpret_u16_u8(w11.val[0]));

  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                             vreinterpret_u32_u16(w5.val[0]));
  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                             vreinterpret_u32_u16(w5.val[1]));
  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                              vreinterpret_u32_u16(w13.val[0]));
  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                              vreinterpret_u32_u16(w13.val[1]));

  // Store first 4-line result
  d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));

  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
  w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
                 vreinterpret_u16_u8(w11.val[1]));

  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                vreinterpret_u32_u16(w5.val[0]));
  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                vreinterpret_u32_u16(w5.val[1]));
  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                 vreinterpret_u32_u16(w13.val[0]));
  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                 vreinterpret_u32_u16(w13.val[1]));

  // Store second 4-line result
  d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
}

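// Transpose 8 rows of 16 bytes (x[0..7]) into 16 rows of 8 bytes (d[0..15]).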
static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
                                                      uint8x8_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                              vreinterpretq_u16_u8(w1.val[1]));
  uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                              vreinterpretq_u16_u8(w3.val[1]));

  uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
                              vreinterpretq_u32_u16(w5.val[0]));
  uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
                              vreinterpretq_u32_u16(w7.val[0]));
  uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
                               vreinterpretq_u32_u16(w5.val[1]));
  uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
                               vreinterpretq_u32_u16(w7.val[1]));

  d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
  d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
  d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
  d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
  d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
  d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
  d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
  d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
  d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
  d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
  d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
  d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
  d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
  d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
  d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
  d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
}

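// Transpose the two 64-bit halves of a pair of 128-bit vectors, returning
// u16-typed results. AArch64 has native 64-bit transposes
// (vtrn1q_u64/vtrn2q_u64); on 32-bit Arm the same shuffle is assembled from
// vget_low/vget_high and vcombine.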
static inline uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

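// Transpose a 16x16 block of bytes from x[0..15] into d[0..15], resolving the
// final 64-bit swaps with aom_vtrnq_u64_to_u16.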
static inline void transpose_arrays_u8_16x16(const uint8x16_t *x,
                                             uint8x16_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
  uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
  uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
  uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);

  uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
                               vreinterpretq_u16_u8(w5.val[0]));
  uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
                               vreinterpretq_u16_u8(w7.val[0]));

  uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                               vreinterpretq_u32_u16(w9.val[0]));
  uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                               vreinterpretq_u32_u16(w11.val[0]));
  uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                               vreinterpretq_u32_u16(w9.val[1]));
  uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                               vreinterpretq_u32_u16(w11.val[1]));

  uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[0] = vreinterpretq_u8_u16(d01.val[0]);
  d[1] = vreinterpretq_u8_u16(d01.val[1]);
  uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[2] = vreinterpretq_u8_u16(d23.val[0]);
  d[3] = vreinterpretq_u8_u16(d23.val[1]);
  uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[4] = vreinterpretq_u8_u16(d45.val[0]);
  d[5] = vreinterpretq_u8_u16(d45.val[1]);
  uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[6] = vreinterpretq_u8_u16(d67.val[0]);
  d[7] = vreinterpretq_u8_u16(d67.val[1]);

  // upper half
  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                 vreinterpretq_u16_u8(w1.val[1]));
  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                 vreinterpretq_u16_u8(w3.val[1]));
  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
                  vreinterpretq_u16_u8(w5.val[1]));
  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
                  vreinterpretq_u16_u8(w7.val[1]));

  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                  vreinterpretq_u32_u16(w9.val[0]));
  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                  vreinterpretq_u32_u16(w11.val[0]));
  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                  vreinterpretq_u32_u16(w9.val[1]));
  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                  vreinterpretq_u32_u16(w11.val[1]));

  d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[8] = vreinterpretq_u8_u16(d01.val[0]);
  d[9] = vreinterpretq_u8_u16(d01.val[1]);
  d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[10] = vreinterpretq_u8_u16(d23.val[0]);
  d[11] = vreinterpretq_u8_u16(d23.val[1]);
  d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[12] = vreinterpretq_u8_u16(d45.val[0]);
  d[13] = vreinterpretq_u8_u16(d45.val[1]);
  d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[14] = vreinterpretq_u8_u16(d67.val[0]);
  d[15] = vreinterpretq_u8_u16(d67.val[1]);
}

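// Transpose a 32x16 block presented as 16 vector pairs, where x[i].val[0]
// holds the left 16 columns of row i and x[i].val[1] the right 16 columns, by
// splitting it into two independent 16x16 transposes.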
static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
                                                       uint8x16_t *d) {
  uint8x16_t x2[32];
  for (int i = 0; i < 16; ++i) {
    x2[i] = x[i].val[0];
    x2[i + 16] = x[i].val[1];
  }
  transpose_arrays_u8_16x16(x2, d);
  transpose_arrays_u8_16x16(x2 + 16, d + 16);
}

static inline void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2,
                                                  uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static inline void transpose_elems_inplace_u8_16x4(uint8x16_t *a0,
                                                   uint8x16_t *a1,
                                                   uint8x16_t *a2,
                                                   uint8x16_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015
  // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115
  // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215
  // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315

  const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1);
  const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08  18  28  38  012 112 212 312
  // c0.val[1]: 02 12 22 32 06 16 26 36 010 110 210 310 014 114 214 314
  // c1.val[0]: 01 11 21 31 05 15 25 35 09  19  29  39  013 113 213 313
  // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  *a0 = vreinterpretq_u8_u16(c0.val[0]);
  *a1 = vreinterpretq_u8_u16(c1.val[0]);
  *a2 = vreinterpretq_u8_u16(c0.val[1]);
  *a3 = vreinterpretq_u8_u16(c1.val[1]);
}

static inline void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
                                                  uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21  02 03 22 23
  // c0.val[1]: 10 11 30 31  12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static inline void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
                                          uint8x8_t a2, uint8x8_t a3,
                                          uint8x8_t a4, uint8x8_t a5,
                                          uint8x8_t a6, uint8x8_t a7,
                                          uint8x8_t *o0, uint8x8_t *o1,
                                          uint8x8_t *o2, uint8x8_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *o0 = d0.val[0];
  *o1 = d0.val[1];
  *o2 = d1.val[0];
  *o3 = d1.val[1];
}

static inline void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
  // Input:
  // 00 01 02 03
  // 10 11 12 13
  // 20 21 22 23
  // 30 31 32 33

  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));

  // Output:
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

static inline void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
  // 4x8 Input:
  // a[0]: 00 01 02 03 04 05 06 07
  // a[1]: 10 11 12 13 14 15 16 17
  // a[2]: 20 21 22 23 24 25 26 27
  // a[3]: 30 31 32 33 34 35 36 37

  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  // 8x4 Output:
  // a[0]: 00 10 20 30 04 14 24 34
  // a[1]: 01 11 21 31 05 15 25 35
  // a[2]: 02 12 22 32 06 16 26 36
  // a[3]: 03 13 23 33 07 17 27 37
  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_array_inplace_u16_4x8 above will produce the same
// reversals, but with the order of the low halves also restored relative to
// the high halves. This is preferable because it puts all values from the
// same source row back together, but some post-processing is inevitable.
static inline void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}

static inline void transpose_elems_u16_4x8(
    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
    uint16x8_t *o2, uint16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const uint16x8_t b0 = vcombine_u16(a0, a4);
  const uint16x8_t b1 = vcombine_u16(a1, a5);
  const uint16x8_t b2 = vcombine_u16(a2, a6);
  const uint16x8_t b3 = vcombine_u16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
  const uint16x8x2_t c1 = vtrnq_u16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpretq_u16_u32(d0.val[0]);
  *o1 = vreinterpretq_u16_u32(d1.val[0]);
  *o2 = vreinterpretq_u16_u32(d0.val[1]);
  *o3 = vreinterpretq_u16_u32(d1.val[1]);
}

static inline void transpose_elems_s16_4x8(
    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
    int16x8_t *o2, int16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const int16x8_t b0 = vcombine_s16(a0, a4);
  const int16x8_t b1 = vcombine_s16(a1, a5);
  const int16x8_t b2 = vcombine_s16(a2, a6);
  const int16x8_t b3 = vcombine_s16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
  const int16x8x2_t c1 = vtrnq_s16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));

  *o0 = vreinterpretq_s16_s32(d0.val[0]);
  *o1 = vreinterpretq_s16_s32(d1.val[0]);
  *o2 = vreinterpretq_s16_s32(d0.val[1]);
  *o3 = vreinterpretq_s16_s32(d1.val[1]);
}

static inline void transpose_elems_inplace_u16_8x8(
    uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
    uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

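// Signed-element counterpart of aom_vtrnq_u64_to_u16 above.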
static inline int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s16_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s16_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
  return b0;
}

static inline void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                                   int16x8_t *a2, int16x8_t *a3,
                                                   int16x8_t *a4, int16x8_t *a5,
                                                   int16x8_t *a6,
                                                   int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static inline void transpose_arrays_s16_8x8(const int16x8_t *a,
                                            int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  out[0] = d0.val[0];
  out[1] = d1.val[0];
  out[2] = d2.val[0];
  out[3] = d3.val[0];
  out[4] = d0.val[1];
  out[5] = d1.val[1];
  out[6] = d2.val[1];
  out[7] = d3.val[1];
}

static inline void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1,
                                                   int16x8_t *a2,
                                                   int16x8_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));

  *a0 = vreinterpretq_s16_s32(c0.val[0]);
  *a1 = vreinterpretq_s16_s32(c1.val[0]);
  *a2 = vreinterpretq_s16_s32(c0.val[1]);
  *a3 = vreinterpretq_s16_s32(c1.val[1]);
}

static inline void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
                                                   uint16x4_t *a1,
                                                   uint16x4_t *a2,
                                                   uint16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  const uint16x4x2_t b1 = vtrn_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b1.val[0]));
  const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                                   vreinterpret_u32_u16(b1.val[1]));

  *a0 = vreinterpret_u16_u32(c0.val[0]);
  *a1 = vreinterpret_u16_u32(c1.val[0]);
  *a2 = vreinterpret_u16_u32(c0.val[1]);
  *a3 = vreinterpret_u16_u32(c1.val[1]);
}

static inline void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
                                                   int16x4_t *a2,
                                                   int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static inline int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s32_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s32_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b0;
}

static inline void transpose_elems_s32_4x4(const int32x4_t a0,
                                           const int32x4_t a1,
                                           const int32x4_t a2,
                                           const int32x4_t a3, int32x4_t *o0,
                                           int32x4_t *o1, int32x4_t *o2,
                                           int32x4_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(a0, a1);
  const int32x4x2_t b1 = vtrnq_s32(a2, a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *o0 = c0.val[0];
  *o1 = c1.val[0];
  *o2 = c0.val[1];
  *o3 = c1.val[1];
}

static inline void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                                   int32x4_t *a2,
                                                   int32x4_t *a3) {
  transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
}

static inline void transpose_arrays_s32_4x4(const int32x4_t *in,
                                            int32x4_t *out) {
  transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
                          &out[3]);
}

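// Generic transpose of a width x height grid of int32 elements (both
// dimensions multiples of four), processed as 4x4 tiles. Both `in` and `out`
// are assumed to be laid out as vertical strips of four columns: vector
// in[j * height + i * 4 + k] holds row (i * 4 + k) of column strip j, which
// is what the index arithmetic below relies on.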
static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
                                                        int32x4_t *out,
                                                        const int width,
                                                        const int height) {
  const int h = height >> 2;
  const int w = width >> 2;
  for (int j = 0; j < w; j++) {
    for (int i = 0; i < h; i++) {
      transpose_arrays_s32_4x4(in + j * height + i * 4,
                               out + i * width + j * 4);
    }
  }
}

#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h)                    \
  static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
      const int32x4_t *in, int32x4_t *out) {                   \
    transpose_arrays_s32_4nx4n(in, out, w, h);                 \
  }

TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)

#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
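
// The macro above expands to fixed-size wrappers, e.g.
// transpose_arrays_s32_8x8(in, out), which transposes an 8x8 grid of int32
// (16 int32x4_t vectors in and out) stored in the strip layout described
// before transpose_arrays_s32_4nx4n.

// Portable equivalents of the AArch64 vtrn1q_s64/vtrn2q_s64 intrinsics:
// select the even (low) or odd (high) 64-bit lanes of a pair of vectors.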
static inline int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn1q_s64(a, b);
#else
  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
#endif
}

static inline int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn2q_s64(a, b);
#else
  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
#endif
}

static inline void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
                                           int32x4_t a2, int32x4_t a3,
                                           int32x4_t a4, int32x4_t a5,
                                           int32x4_t a6, int32x4_t a7,
                                           int32x4x2_t *o0, int32x4x2_t *o1,
                                           int32x4x2_t *o2, int32x4x2_t *o3) {
  // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A ]^T => [ A^T B^T ]
  // [ B ]

  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T

  o0->val[0] = a0;
  o1->val[0] = a1;
  o2->val[0] = a2;
  o3->val[0] = a3;

  o0->val[1] = a4;
  o1->val[1] = a5;
  o2->val[1] = a6;
  o3->val[1] = a7;
}

static inline void transpose_elems_inplace_s32_8x8(
    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
  // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A B ]^T => [ A^T C^T ]
  // [ C D ]      [ B^T D^T ]

  int32x4_t q0_v1 = a0->val[0];
  int32x4_t q0_v2 = a1->val[0];
  int32x4_t q0_v3 = a2->val[0];
  int32x4_t q0_v4 = a3->val[0];

  int32x4_t q1_v1 = a0->val[1];
  int32x4_t q1_v2 = a1->val[1];
  int32x4_t q1_v3 = a2->val[1];
  int32x4_t q1_v4 = a3->val[1];

  int32x4_t q2_v1 = a4->val[0];
  int32x4_t q2_v2 = a5->val[0];
  int32x4_t q2_v3 = a6->val[0];
  int32x4_t q2_v4 = a7->val[0];

  int32x4_t q3_v1 = a4->val[1];
  int32x4_t q3_v2 = a5->val[1];
  int32x4_t q3_v3 = a6->val[1];
  int32x4_t q3_v4 = a7->val[1];

  transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4);  // A^T
  transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4);  // B^T
  transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4);  // C^T
  transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4);  // D^T

  a0->val[0] = q0_v1;
  a1->val[0] = q0_v2;
  a2->val[0] = q0_v3;
  a3->val[0] = q0_v4;

  a0->val[1] = q2_v1;
  a1->val[1] = q2_v2;
  a2->val[1] = q2_v3;
  a3->val[1] = q2_v4;

  a4->val[0] = q1_v1;
  a5->val[0] = q1_v2;
  a6->val[0] = q1_v3;
  a7->val[0] = q1_v4;

  a4->val[1] = q3_v1;
  a5->val[1] = q3_v2;
  a6->val[1] = q3_v3;
  a7->val[1] = q3_v4;
}

static inline void transpose_arrays_s16_4x4(const int16x4_t *const in,
                                            int16x4_t *const out) {
  int16x4_t a0 = in[0];
  int16x4_t a1 = in[1];
  int16x4_t a2 = in[2];
  int16x4_t a3 = in[3];

  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);

  out[0] = a0;
  out[1] = a1;
  out[2] = a2;
  out[3] = a3;
}

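// Transpose 8 rows of 4 int16 elements (in[0..7]) into 4 rows of 8
// (out[0..3]). The AArch64 path widens each row pair with vzip1q_s16 on
// zero-padded vectors and finishes with 64-bit zips; the Armv7 path, which
// has no 64-bit zip, gets the same interleave from vzip_s16 and reassembles
// the 64-bit halves with vextq_s32.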
static inline void transpose_arrays_s16_4x8(const int16x4_t *const in,
                                            int16x8_t *const out) {
#if AOM_ARCH_AARCH64
  const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
                                  vcombine_s16(in[1], vdup_n_s16(0)));
  const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
                                  vcombine_s16(in[3], vdup_n_s16(0)));
  const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
                                  vcombine_s16(in[5], vdup_n_s16(0)));
  const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
                                  vcombine_s16(in[7], vdup_n_s16(0)));
#else
  int16x4x2_t temp;
  temp = vzip_s16(in[0], in[1]);
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[2], in[3]);
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[4], in[5]);
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[6], in[7]);
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

#if AOM_ARCH_AARCH64
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}

static inline void transpose_arrays_s16_8x4(const int16x8_t *const in,
                                            int16x4_t *const out) {
  // Swap 16 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
                                    vreinterpretq_u32_s16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
                                    vreinterpretq_u32_s16(b1.val[1]));

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37

  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
}

static inline void transpose_arrays_s64_4x4(const int64x2_t *in,
                                            int64x2_t *out) {
  // Perform a 4x4 matrix transpose going from:
  // in[0] = 00 01
  // in[1] = 02 03
  // in[2] = 10 11
  // in[3] = 12 13
  // in[4] = 20 21
  // in[5] = 22 23
  // in[6] = 30 31
  // in[7] = 32 33
  //
  // to:
  // out[0] = 00 10
  // out[1] = 20 30
  // out[2] = 01 11
  // out[3] = 21 31
  // out[4] = 02 12
  // out[5] = 22 32
  // out[6] = 03 13
  // out[7] = 23 33

  out[0] = aom_vtrn1q_s64(in[0], in[2]);
  out[1] = aom_vtrn1q_s64(in[4], in[6]);
  out[2] = aom_vtrn2q_s64(in[0], in[2]);
  out[3] = aom_vtrn2q_s64(in[4], in[6]);
  out[4] = aom_vtrn1q_s64(in[1], in[3]);
  out[5] = aom_vtrn1q_s64(in[5], in[7]);
  out[6] = aom_vtrn2q_s64(in[1], in[3]);
  out[7] = aom_vtrn2q_s64(in[5], in[7]);
}

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_