1 /*
2  * Copyright © 2024, VideoLAN and dav1d authors
3  * Copyright © 2024, Luca Barbato
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "src/ppc/dav1d_types.h"
29 #include "src/ppc/itx.h"
30 #include "src/ppc/utils.h"
31 
32 #if BITDEPTH == 8
33 
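/*
 * Vector helpers for the 8-bit path. The u8x16/i16x8/i32x4 (and u64x2) vector
 * typedefs and the u8h_to_i16()/u8l_to_i16()/i16h_to_i32()/i16l_to_i32()
 * widening helpers used below come from the ppc headers included above; the
 * "h"/"l" suffixes refer to the high and low half of a vector. The LOAD_* and
 * STORE_* macros move rows of pixels or coefficients between memory and
 * registers with the unaligned vec_xl()/vec_xst()/vec_xst_len() intrinsics.
 */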
34 #define LOAD_4(src, stride, a, b, c, d) \
35 {  \
36     uint8_t *s = src; \
37     a = vec_xl(0, s); \
38     s += stride; \
39     b = vec_xl(0, s); \
40     s += stride; \
41     c = vec_xl(0, s); \
42     s += stride; \
43     d = vec_xl(0, s); \
44 }
45 
46 #define LOAD_DECLARE_2_I16(src, a, b) \
47     i16x8 a = vec_xl(0, src); \
48     i16x8 b = vec_xl(0, src + 8);
49 
50 #define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \
51     i32x4 a = i16h_to_i32(sa); \
52     i32x4 b = i16l_to_i32(sa); \
53     i32x4 c = i16h_to_i32(sb); \
54     i32x4 d = i16l_to_i32(sb);
55 
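/*
 * Naming convention for the coefficient macros: a name like c0 or c3 is one
 * row of four coefficients widened to 32-bit lanes, while a fused name like
 * c01 or c23 is the same data kept as eight 16-bit lanes (two rows per
 * vector). The *_in/*_out transform macros below generally consume or
 * produce both forms, so 32-bit arithmetic and 16-bit packing can be mixed.
 */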
56 #define LOAD_COEFF_4(coeff) \
57     LOAD_DECLARE_2_I16(coeff, c01, c23) \
58     UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3)
59 
60 #define LOAD_SCALE_COEFF_4x8(coeff, scale) \
61     LOAD_DECLARE_2_I16(coeff, c04, c15) \
62     LOAD_DECLARE_2_I16(coeff+16, c26, c37) \
63     i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \
64     i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \
65     i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \
66     i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \
67     c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
68     c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
69     UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
70     c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
71     c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
72     UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)
73 
74 #define LOAD_SCALE_COEFF_8x4(coeff, scale) \
75     LOAD_DECLARE_2_I16(coeff, c01, c23) \
76     LOAD_DECLARE_2_I16(coeff+16, c45, c67) \
77     c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
78     c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
79     UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
80     c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
81     c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
82     UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)
83 
84 #define LOAD_COEFF_8x8(coeff) \
85     LOAD_DECLARE_2_I16(coeff, c0, c1) \
86     LOAD_DECLARE_2_I16(coeff+16, c2, c3) \
87     LOAD_DECLARE_2_I16(coeff+32, c4, c5) \
88     LOAD_DECLARE_2_I16(coeff+48, c6, c7) \
89     UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, c1h, c1l) \
90     UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \
91     UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \
92     UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l) \
93 
94 #define LOAD_COEFF_4x16(coeff) \
95     LOAD_DECLARE_2_I16(coeff,    a0b0, c0d0) \
96     LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \
97     LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \
98     LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \
99     UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \
100     UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \
101     UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \
102     UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3)
103 
104 #define LOAD_DECLARE_4(src, stride, a, b, c, d) \
105     u8x16 a, b, c, d; \
106     LOAD_4(src, stride, a, b, c, d)
107 
108 #define STORE_LEN(l, dst, stride, a, b, c, d) \
109 { \
110     uint8_t *dst2 = dst; \
111     vec_xst_len(a, dst2, l); \
112     dst2 += stride; \
113     vec_xst_len(b, dst2, l); \
114     dst2 += stride; \
115     vec_xst_len(c, dst2, l); \
116     dst2 += stride; \
117     vec_xst_len(d, dst2, l); \
118 }
119 
120 #define STORE_4(dst, stride, a, b, c, d) \
121     STORE_LEN(4, dst, stride, a, b, c, d)
122 
123 #define STORE_8(dst, stride, ab, cd, ef, gh) \
124     STORE_LEN(8, dst, stride, ab, cd, ef, gh)
125 
126 #define STORE_16(dst, stride, l0, l1, l2, l3) \
127 { \
128     uint8_t *dst##2 = dst; \
129     vec_xst(l0, 0, dst##2); \
130     dst##2 += stride; \
131     vec_xst(l1, 0, dst##2); \
132     dst##2 += stride; \
133     vec_xst(l2, 0, dst##2); \
134     dst##2 += stride; \
135     vec_xst(l3, 0, dst##2); \
136 }
137 
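/*
 * Add a transformed residual to the destination pixels. The coefficients are
 * rounded with +8 and shifted right by 4 (the final >>4 of the 8bpc output
 * stage), added to the widened pixels with saturating adds, and packed back
 * to unsigned bytes with vec_packsu(), which also clips to [0, 255].
 */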
138 #define APPLY_COEFF_4(a, b, c, d, c01, c23) \
139 { \
140     u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
141     u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \
142  \
143     c01 = vec_adds(c01, vec_splat_s16(8)); \
144     c23 = vec_adds(c23, vec_splat_s16(8)); \
145     c01 = vec_sra(c01, vec_splat_u16(4)); \
146     c23 = vec_sra(c23, vec_splat_u16(4)); \
147  \
148     i16x8 abs = u8h_to_i16(ab); \
149     i16x8 cds = u8h_to_i16(cd); \
150  \
151     abs = vec_adds(abs, c01); \
152     cds = vec_adds(cds, c23); \
153  \
154     a = vec_packsu(abs, abs); \
155     c = vec_packsu(cds, cds); \
156  \
157     b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \
158     d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \
159 }
160 
161 #define APPLY_COEFF_8x4(ab, cd, c01, c23) \
162 { \
163     i16x8 abs = u8h_to_i16(ab); \
164     i16x8 cds = u8h_to_i16(cd); \
165     c01 = vec_adds(c01, vec_splat_s16(8)); \
166     c23 = vec_adds(c23, vec_splat_s16(8)); \
167     c01 = vec_sra(c01, vec_splat_u16(4)); \
168     c23 = vec_sra(c23, vec_splat_u16(4)); \
169  \
170     abs = vec_adds(abs, c01); \
171     cds = vec_adds(cds, c23); \
172  \
173     ab = vec_packsu(abs, abs); \
174     cd = vec_packsu(cds, cds); \
175 }
176 
177 #define APPLY_COEFF_16x4(a, b, c, d, \
178                          c00c01, c02c03, c04c05, c06c07, \
179                          c08c09, c10c11, c12c13, c14c15) \
180 { \
181     i16x8 ah = u8h_to_i16(a); \
182     i16x8 al = u8l_to_i16(a); \
183     i16x8 bh = u8h_to_i16(b); \
184     i16x8 bl = u8l_to_i16(b); \
185     i16x8 ch = u8h_to_i16(c); \
186     i16x8 cl = u8l_to_i16(c); \
187     i16x8 dh = u8h_to_i16(d); \
188     i16x8 dl = u8l_to_i16(d); \
189     SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \
190     SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \
191  \
192     ah = vec_adds(ah, c00c01); \
193     al = vec_adds(al, c02c03); \
194     bh = vec_adds(bh, c04c05); \
195     bl = vec_adds(bl, c06c07); \
196     ch = vec_adds(ch, c08c09); \
197     cl = vec_adds(cl, c10c11); \
198     dh = vec_adds(dh, c12c13); \
199     dl = vec_adds(dl, c14c15); \
200  \
201     a = vec_packsu(ah, al); \
202     b = vec_packsu(bh, bl); \
203     c = vec_packsu(ch, cl); \
204     d = vec_packsu(dh, dl); \
205 }
206 
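/*
 * 4-point inverse DCT on 32-bit lanes, using the usual dav1d 12-bit
 * fixed-point constants: 2896 ~= 4096/sqrt(2), 1567 ~= 4096*sin(pi/8) and
 * 3784 ~= 4096*cos(pi/8); the +2048 and >>12 round that scaling away again.
 * Illustrative scalar form of one lane:
 *   t0 = ((c0 + c2) * 2896 + 2048) >> 12;
 *   t1 = ((c0 - c2) * 2896 + 2048) >> 12;
 *   t2 = (c1 * 1567 - c3 * 3784 + 2048) >> 12;
 *   t3 = (c1 * 3784 + c3 * 1567 + 2048) >> 12;
 *   out = { t0 + t3, t1 + t2, t1 - t2, t0 - t3 };
 */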
207 #define IDCT_4_INNER(c0, c1, c2, c3) \
208 { \
209     i32x4 o0 = vec_add(c0, c2); \
210     i32x4 o1 = vec_sub(c0, c2); \
211  \
212     i32x4 v2896 = vec_splats(2896); \
213     i32x4 v1567 = vec_splats(1567); \
214     i32x4 v3784 = vec_splats(3784); \
215     i32x4 v2048 = vec_splats(2048); \
216  \
217     o0 = vec_mul(o0, v2896); \
218     o1 = vec_mul(o1, v2896); \
219  \
220     i32x4 o2a = vec_mul(c1, v1567); \
221     i32x4 o2b = vec_mul(c3, v3784); \
222     i32x4 o3a = vec_mul(c1, v3784); \
223     i32x4 o3b = vec_mul(c3, v1567); \
224  \
225     i32x4 o2 = vec_sub(o2a, o2b); \
226     i32x4 o3 = vec_add(o3a, o3b); \
227  \
228     u32x4 v12 = vec_splat_u32(12); \
229  \
230     o0 = vec_add(o0, v2048); \
231     o1 = vec_add(o1, v2048); \
232     o2 = vec_add(o2, v2048); \
233     o3 = vec_add(o3, v2048); \
234  \
235     o0 = vec_sra(o0, v12); \
236     o1 = vec_sra(o1, v12); \
237     o2 = vec_sra(o2, v12); \
238     o3 = vec_sra(o3, v12); \
239  \
240     c0 = vec_add(o0, o3); \
241     c1 = vec_add(o1, o2); \
242     c2 = vec_sub(o1, o2); \
243     c3 = vec_sub(o0, o3); \
244  \
245 }
246 
247 #define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \
248     IDCT_4_INNER(c0, c1, c2, c3) \
249     c03 = vec_packs(c0, c3); \
250     c12 = vec_packs(c1, c2); \
251 
252 #define dct_4_in(c0, c1, c2, c3, c01, c23) \
253 { \
254     IDCT_4_INNER(c0, c1, c2, c3) \
255     c01 = vec_packs(c0, c1); \
256     c23 = vec_packs(c2, c3); \
257     c0 = i16h_to_i32(c01); \
258     c1 = i16l_to_i32(c01); \
259     c2 = i16h_to_i32(c23); \
260     c3 = i16l_to_i32(c23); \
261 }
262 
263 #define dct_4_out(c0, c1, c2, c3, c01, c23) \
264     IDCT_4_INNER(c0, c1, c2, c3) \
265     c01 = vec_packs(c0, c1); \
266     c23 = vec_packs(c2, c3); \
267 
268 
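/*
 * 4-point identity transform: a scale by sqrt(2). vec_mradds(c, 1697*8, 0)
 * computes roughly c * 13576 / 32768 ~= 0.4142 * c, so c + vec_mradds(...)
 * ~= 1.4142 * c -- the same scale the 32-bit path implements further down as
 * (x * 5793 + 2048) >> 12 in IDENTITY_4_I32.
 */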
269 #define IDENTITY_4(c01, c23) \
270 { \
271     i16x8 v1697 = vec_splats((int16_t)(1697*8)); \
272     i16x8 o01 = vec_mradds(c01, v1697, vec_splat_s16(0)); \
273     i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \
274     c01 = vec_adds(c01, o01); \
275     c23 = vec_adds(c23, o23); \
276 }
277 
278 #define identity_4_in(c0, c1, c2, c3, c01, c23) \
279 { \
280     IDENTITY_4(c01, c23) \
281     c0 = i16h_to_i32(c01); \
282     c1 = i16l_to_i32(c01); \
283     c2 = i16h_to_i32(c23); \
284     c3 = i16l_to_i32(c23); \
285 }
286 
287 #define identity_4_out(c0, c1, c2, c3, c01, c23) \
288 { \
289     c01 = vec_packs(c0, c1); \
290     c23 = vec_packs(c2, c3); \
291     IDENTITY_4(c01, c23) \
292 }
293 
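/*
 * 4-point inverse ADST. 1321, 2482, 3344 and 3803 are the 12-bit sine
 * factors of the AV1 adst4; the i1/n1 split folds the +2048 rounding bias
 * into each output exactly once before the final >>12. The outputs can be
 * assigned to any destination registers, which is how the flipadst_4_*
 * wrappers reuse this body with the output order reversed.
 */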
294 #define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \
295 { \
296     i32x4 v1321 = vec_splats(1321); \
297     i32x4 v3803 = vec_splats(3803); \
298     i32x4 v2482 = vec_splats(2482); \
299     i32x4 v3344 = vec_splats(3344); \
300     i32x4 v2048 = vec_splats(2048); \
301     i32x4 i0_v1321 = vec_mul(c0, v1321); \
302     i32x4 i0_v2482 = vec_mul(c0, v2482); \
303     i32x4 i0_v3803 = vec_mul(c0, v3803); \
304     i32x4 i1 = vec_mul(c1, v3344); \
305     i32x4 i2_v1321 = vec_mul(c2, v1321); \
306     i32x4 i2_v2482 = vec_mul(c2, v2482); \
307     i32x4 i2_v3803 = vec_mul(c2, v3803); \
308     i32x4 i3_v1321 = vec_mul(c3, v1321); \
309     i32x4 i3_v2482 = vec_mul(c3, v2482); \
310     i32x4 i3_v3803 = vec_mul(c3, v3803); \
311  \
312     i32x4 n1 = vec_sub(i1, v2048); \
313     i1 = vec_add(i1, v2048); \
314  \
315  \
316     i32x4 o0 = vec_add(i0_v1321, i2_v3803); \
317     i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \
318     i32x4 o2 = vec_sub(c0, c2); \
319     i32x4 o3 = vec_add(i0_v3803, i2_v2482); \
320  \
321     o0 = vec_add(o0, i3_v2482); \
322     o1 = vec_sub(o1, i3_v3803); \
323     o2 = vec_add(o2, c3); \
324     o3 = vec_sub(o3, i3_v1321); \
325  \
326     o0 = vec_add(o0, i1); \
327     o1 = vec_add(o1, i1); \
328     o2 = vec_mul(o2, v3344); \
329     o3 = vec_sub(o3, n1); \
330  \
331     o2 = vec_add(o2, v2048); \
332  \
333     oc0 = vec_sra(o0, vec_splat_u32(12)); \
334     oc1 = vec_sra(o1, vec_splat_u32(12)); \
335     oc2 = vec_sra(o2, vec_splat_u32(12)); \
336     oc3 = vec_sra(o3, vec_splat_u32(12)); \
337 }
338 
339 #define adst_4_in(c0, c1, c2, c3, c01, c23) \
340 { \
341     ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
342 }
343 
344 #define flipadst_4_in(c0, c1, c2, c3, c01, c23) \
345 { \
346     ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
347 }
348 
349 #define adst_4_out(c0, c1, c2, c3, c01, c23) \
350 { \
351     ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
352     c01 = vec_packs(c0, c1); \
353     c23 = vec_packs(c2, c3); \
354 }
355 
356 #define flipadst_4_out(c0, c1, c2, c3, c01, c23) \
357 { \
358     ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
359     c01 = vec_packs(c0, c1); \
360     c23 = vec_packs(c2, c3); \
361 }
362 
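/*
 * DC-only fast paths, taken when eob < 1: only coeff[0] is non-zero, so the
 * residual collapses to a single value. The repeated (dc * 181 + 128) >> 8
 * steps are ~1/sqrt(2) scalings (one extra for rectangular blocks), the
 * (dc + rnd) >> shift matches the inter-pass rounding of the full transform,
 * and the final (dc * 181 + 128 + 2048) >> 12 appears to fold the second-pass
 * 1/sqrt(2) together with the output >>4. The splatted result is then added
 * to n groups of four destination rows.
 */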
363 static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
364 {
365     int dc = coeff[0];
366     const int rnd = (1 << shift) >> 1;
367     if (is_rect2)
368         dc = (dc * 181 + 128) >> 8;
369     dc = (dc * 181 + 128) >> 8;
370     dc = (dc + rnd) >> shift;
371     dc = (dc * 181 + 128 + 2048) >> 12;
372 
373     i16x8 vdc = vec_splats((int16_t)dc);
374     coeff[0] = 0;
375     for (int i = 0; i < n; i++, dst += 4 * stride) {
376         LOAD_DECLARE_4(dst, stride, a, b, c, d)
377 
378         i16x8 as = u8h_to_i16(a);
379         i16x8 bs = u8h_to_i16(b);
380         i16x8 cs = u8h_to_i16(c);
381         i16x8 ds = u8h_to_i16(d);
382 
383         as = vec_adds(as, vdc);
384         bs = vec_adds(bs, vdc);
385         cs = vec_adds(cs, vdc);
386         ds = vec_adds(ds, vdc);
387 
388         a = vec_packsu(as, as);
389         b = vec_packsu(bs, bs);
390         c = vec_packsu(cs, cs);
391         d = vec_packsu(ds, ds);
392 
393         STORE_4(dst, stride, a, b, c, d)
394     }
395 }
396 
397 static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
398 {
399     int dc = coeff[0];
400     const int rnd = (1 << shift) >> 1;
401     if (is_rect2)
402         dc = (dc * 181 + 128) >> 8;
403     dc = (dc * 181 + 128) >> 8;
404     dc = (dc + rnd) >> shift;
405     dc = (dc * 181 + 128 + 2048) >> 12;
406 
407     i16x8 vdc = vec_splats((int16_t)dc);
408     coeff[0] = 0;
409 
410     for (int i = 0; i < n; i++, dst += 4 * stride) {
411         LOAD_DECLARE_4(dst, stride, a, b, c, d)
412 
413         i16x8 as = u8h_to_i16(a);
414         i16x8 bs = u8h_to_i16(b);
415         i16x8 cs = u8h_to_i16(c);
416         i16x8 ds = u8h_to_i16(d);
417 
418         as = vec_adds(as, vdc);
419         bs = vec_adds(bs, vdc);
420         cs = vec_adds(cs, vdc);
421         ds = vec_adds(ds, vdc);
422 
423         a = vec_packsu(as, as);
424         b = vec_packsu(bs, bs);
425         c = vec_packsu(cs, cs);
426         d = vec_packsu(ds, ds);
427 
428         STORE_8(dst, stride, a, b, c, d)
429     }
430 }
431 
432 static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
433 {
434     int dc = coeff[0];
435     const int rnd = (1 << shift) >> 1;
436     if (is_rect2)
437         dc = (dc * 181 + 128) >> 8;
438     dc = (dc * 181 + 128) >> 8;
439     dc = (dc + rnd) >> shift;
440     dc = (dc * 181 + 128 + 2048) >> 12;
441 
442     i16x8 vdc = vec_splats((int16_t)dc);
443     coeff[0] = 0;
444 
445     for (int i = 0; i < n; i++, dst += 4 * stride) {
446         LOAD_DECLARE_4(dst, stride, a, b, c, d)
447 
448         i16x8 ah = u8h_to_i16(a);
449         i16x8 bh = u8h_to_i16(b);
450         i16x8 ch = u8h_to_i16(c);
451         i16x8 dh = u8h_to_i16(d);
452         i16x8 al = u8l_to_i16(a);
453         i16x8 bl = u8l_to_i16(b);
454         i16x8 cl = u8l_to_i16(c);
455         i16x8 dl = u8l_to_i16(d);
456 
457         ah = vec_adds(ah, vdc);
458         bh = vec_adds(bh, vdc);
459         ch = vec_adds(ch, vdc);
460         dh = vec_adds(dh, vdc);
461         al = vec_adds(al, vdc);
462         bl = vec_adds(bl, vdc);
463         cl = vec_adds(cl, vdc);
464         dl = vec_adds(dl, vdc);
465 
466         a = vec_packsu(ah, al);
467         b = vec_packsu(bh, bl);
468         c = vec_packsu(ch, cl);
469         d = vec_packsu(dh, dl);
470 
471         STORE_16(dst, stride, a, b, c, d)
472     }
473 }
474 
475 void dav1d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
476                                               int16_t *const coeff, const int eob)
477 {
478     assert(eob >= 0);
479 
480     if (eob < 1) {
481         return dc_only_4xN(dst, stride, coeff, 1, 0, 0);
482     }
483 
484     LOAD_COEFF_4(coeff)
485 
486     dct_4_in(c0, c1, c2, c3, c01, c23)
487 
488     TRANSPOSE4_I32(c0, c1, c2, c3)
489 
490     memset(coeff, 0, sizeof(*coeff) * 4 * 4);
491 
492     dct_4_out(c0, c1, c2, c3, c01, c23)
493 
494     LOAD_DECLARE_4(dst, stride, a, b, c, d)
495 
496     APPLY_COEFF_4(a, b, c, d, c01, c23)
497 
498     STORE_4(dst, stride, a, b, c, d)
499 }
500 
501 void dav1d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(pixel *dst, const ptrdiff_t stride,
502                                               coef *const coeff, const int eob)
503 {
504     LOAD_COEFF_4(coeff)
505 
506     u32x4 v2 = vec_splat_u32(2);
507 
508     c0 = vec_sra(c0, v2);
509     c1 = vec_sra(c1, v2);
510     c2 = vec_sra(c2, v2);
511     c3 = vec_sra(c3, v2);
512 
513     i32x4 t0 = vec_add(c0, c1);
514     i32x4 t2 = vec_sub(c2, c3);
515     i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
516     i32x4 t3 = vec_sub(t4, c3);
517     i32x4 t1 = vec_sub(t4, c1);
518     c0 = vec_sub(t0, t3);
519     c1 = t3;
520     c2 = t1;
521     c3 = vec_add(t2, t1);
522 
523     memset(coeff, 0, sizeof(*coeff) * 4 * 4);
524 
525     TRANSPOSE4_I32(c0, c1, c2, c3)
526 
527     t0 = vec_add(c0, c1);
528     t2 = vec_sub(c2, c3);
529     t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
530     t3 = vec_sub(t4, c3);
531     t1 = vec_sub(t4, c1);
532     c0 = vec_sub(t0, t3);
533     c1 = t3;
534     c2 = t1;
535     c3 = vec_add(t2, t1);
536 
537     c01 = vec_packs(c0, c1);
538     c23 = vec_packs(c2, c3);
539 
540     LOAD_DECLARE_4(dst, stride, a, b, c, d)
541 
542     u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b);
543     u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d);
544 
545     i16x8 abs = u8h_to_i16(ab);
546     i16x8 cds = u8h_to_i16(cd);
547 
548     abs = vec_adds(abs, c01);
549     cds = vec_adds(cds, c23);
550 
551     a = vec_packsu(abs, abs);
552     c = vec_packsu(cds, cds);
553 
554     b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a);
555     d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c);
556 
557     STORE_4(dst, stride, a, b, c, d)
558 }
559 
560 #define inv_txfm_fn4x4(type1, type2) \
561 void dav1d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
562                                                           int16_t *const coeff, const int eob) \
563 { \
564     LOAD_COEFF_4(coeff) \
565     type1##_4_in(c0, c1, c2, c3, c01, c23) \
566     memset(coeff, 0, sizeof(*coeff) * 4 * 4); \
567     TRANSPOSE4_I32(c0, c1, c2, c3) \
568     type2##_4_out(c0, c1, c2, c3, c01, c23) \
569     LOAD_DECLARE_4(dst, stride, a, b, c, d) \
570     APPLY_COEFF_4(a, b, c, d, c01, c23) \
571     STORE_4(dst, stride, a, b, c, d) \
572 }
573 
574 inv_txfm_fn4x4(adst,     dct     )
575 inv_txfm_fn4x4(dct,      adst    )
576 inv_txfm_fn4x4(dct,      flipadst)
577 inv_txfm_fn4x4(flipadst, dct     )
578 inv_txfm_fn4x4(adst,     flipadst)
579 inv_txfm_fn4x4(flipadst, adst    )
580 inv_txfm_fn4x4(identity, dct     )
581 inv_txfm_fn4x4(dct,      identity)
582 inv_txfm_fn4x4(identity, flipadst)
583 inv_txfm_fn4x4(flipadst, identity)
584 inv_txfm_fn4x4(identity, adst   )
585 inv_txfm_fn4x4(adst,     identity)
586 inv_txfm_fn4x4(identity, identity)
587 inv_txfm_fn4x4(adst,     adst    )
588 inv_txfm_fn4x4(flipadst, flipadst)
589 
590 
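/*
 * 8-point inverse DCT. The even coefficients go through dct4_for_dct8(); the
 * odd half is two rotations with 799/4017 (~4096*sin/cos(pi/16)) and
 * 3406/2276 (~4096*cos/sin(3*pi/16)), rounded with +2048 / >>12, followed by
 * a saturating 16-bit butterfly and a *181, +128, >>8 (~1/sqrt(2)) stage for
 * t5/t6. The results stay packed as 16-bit pairs (c03, c12, c74, c65).
 */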
591 #define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
592     dct4_for_dct8(c0, c2, c4, c6, c03, c12) \
593  \
594     i32x4 v799 = vec_splats(799); \
595     i32x4 v4017 = vec_splats(4017); \
596     i32x4 v3406 = vec_splats(3406); \
597     i32x4 v2276 = vec_splats(2276); \
598     i32x4 v2048 = vec_splats(2048); \
599     u32x4 v12 = vec_splat_u32(12); \
600  \
601     i32x4 c1v799 = vec_mul(c1, v799); \
602     i32x4 c7v4017 = vec_mul(c7, v4017); \
603     i32x4 c5v3406 = vec_mul(c5, v3406); \
604     i32x4 c3v2276 = vec_mul(c3, v2276); \
605     i32x4 c5v2276 = vec_mul(c5, v2276); \
606     i32x4 c3v3406 = vec_mul(c3, v3406); \
607     i32x4 c1v4017 = vec_mul(c1, v4017); \
608     i32x4 c7v799 = vec_mul(c7, v799); \
609  \
610     i32x4 t4a = vec_subs(c1v799, c7v4017); \
611     i32x4 t5a = vec_subs(c5v3406, c3v2276); \
612     i32x4 t6a = vec_adds(c5v2276, c3v3406); \
613     i32x4 t7a = vec_adds(c1v4017, c7v799); \
614  \
615     t4a = vec_adds(t4a, v2048); \
616     t5a = vec_adds(t5a, v2048); \
617     t6a = vec_adds(t6a, v2048); \
618     t7a = vec_adds(t7a, v2048); \
619  \
620     t4a = vec_sra(t4a, v12); \
621     t7a = vec_sra(t7a, v12); \
622     t5a = vec_sra(t5a, v12); \
623     t6a = vec_sra(t6a, v12); \
624  \
625     i16x8 t7at4a = vec_packs(t7a, t4a); \
626     i16x8 t6at5a = vec_packs(t6a, t5a); \
627  \
628     i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \
629     t6at5a = vec_subs(t7at4a, t6at5a); \
630  \
631     t6a = i16h_to_i32(t6at5a); \
632     t5a = i16l_to_i32(t6at5a); \
633  \
634     i32x4 t6 = vec_add(t6a, t5a); \
635     i32x4 t5 = vec_sub(t6a, t5a); \
636  \
637     t6 = vec_mul(t6, vec_splats(181)); \
638     t5 = vec_mul(t5, vec_splats(181)); \
639     t6 = vec_add(t6, vec_splats(128)); \
640     t5 = vec_add(t5, vec_splats(128)); \
641  \
642     t6 = vec_sra(t6, vec_splat_u32(8)); \
643     t5 = vec_sra(t5, vec_splat_u32(8)); \
644  \
645     i16x8 t6t5 = vec_packs(t6, t5); \
646  \
647     c74 = vec_subs(c03, t7t4); \
648     c65 = vec_subs(c12, t6t5); \
649     c03 = vec_adds(c03, t7t4); \
650     c12 = vec_adds(c12, t6t5); \
651 
652 #define UNPACK_4_I16_I32(t0, t1, t2, t3) \
653     t0 = i16h_to_i32(t0##t1); \
654     t1 = i16l_to_i32(t0##t1); \
655     t2 = i16h_to_i32(t2##t3); \
656     t3 = i16l_to_i32(t2##t3);
657 
658 #define UNPACK_PAIR_I16_I32(hi, lo, v) \
659     hi = i16h_to_i32(v); \
660     lo = i16l_to_i32(v); \
661 
662 
663 #define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) \
664 { \
665     i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \
666     IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \
667     UNPACK_4_I16_I32(c0, c3, c1, c2) \
668     UNPACK_4_I16_I32(c7, c4, c6, c5) \
669 }
670 
671 #define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
672 { \
673     i16x8 c03, c12, c74, c65; \
674     IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
675     c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \
676     c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \
677     c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \
678     c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \
679 }
680 
681 #define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
682                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
683                    c0, c1, c2, c3, c4, c5, c6, c7) \
684 { \
685     dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \
686     dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \
687 }
688 
689 #define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
690                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
691                     c0, c1, c2, c3, c4, c5, c6, c7) \
692 { \
693     i16x8 c03h, c12h, c74h, c65h; \
694     i16x8 c03l, c12l, c74l, c65l; \
695     { \
696         IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \
697     } \
698     { \
699         IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \
700     } \
701     c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \
702     c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \
703     c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \
704     c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \
705     c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \
706     c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \
707     c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \
708     c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \
709 }
710 
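/*
 * 8-point identity transform: a plain doubling, done with saturating adds so
 * the 16-bit intermediates clip instead of wrapping.
 */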
711 #define IDENTITY_8(c01, c23, c45, c67) \
712 { \
713     c01 = vec_adds(c01, c01); \
714     c23 = vec_adds(c23, c23); \
715     c45 = vec_adds(c45, c45); \
716     c67 = vec_adds(c67, c67); \
717 }
718 
719 #define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
720 { \
721     IDENTITY_8(c01, c23, c45, c67) \
722     UNPACK_PAIR_I16_I32(c0, c1, c01) \
723     UNPACK_PAIR_I16_I32(c2, c3, c23) \
724     UNPACK_PAIR_I16_I32(c4, c5, c45) \
725     UNPACK_PAIR_I16_I32(c6, c7, c67) \
726 }
727 
728 #define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
729     c01 = vec_packs(c0, c1); \
730     c23 = vec_packs(c2, c3); \
731     c45 = vec_packs(c4, c5); \
732     c67 = vec_packs(c6, c7); \
733     IDENTITY_8(c01, c23, c45, c67)
734 
735 #define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
736                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
737                         c0, c1, c2, c3, c4, c5, c6, c7) \
738 { \
739     IDENTITY_8(c0, c1, c2, c3) \
740     IDENTITY_8(c4, c5, c6, c7) \
741     UNPACK_PAIR_I16_I32(c0h, c0l, c0) \
742     UNPACK_PAIR_I16_I32(c1h, c1l, c1) \
743     UNPACK_PAIR_I16_I32(c2h, c2l, c2) \
744     UNPACK_PAIR_I16_I32(c3h, c3l, c3) \
745     UNPACK_PAIR_I16_I32(c4h, c4l, c4) \
746     UNPACK_PAIR_I16_I32(c5h, c5l, c5) \
747     UNPACK_PAIR_I16_I32(c6h, c6l, c6) \
748     UNPACK_PAIR_I16_I32(c7h, c7l, c7) \
749 }
750 
751 #define PACK_4(c0, c1, c2, c3, \
752                c0h, c1h, c2h, c3h, \
753                c0l, c1l, c2l, c3l) \
754 { \
755     c0 = vec_packs(c0h, c0l); \
756     c1 = vec_packs(c1h, c1l); \
757     c2 = vec_packs(c2h, c2l); \
758     c3 = vec_packs(c3h, c3l); \
759 }
760 
761 #define DECLARE_PACK_4(c0, c1, c2, c3, \
762                        c0h, c1h, c2h, c3h, \
763                        c0l, c1l, c2l, c3l) \
764     i16x8 c0, c1, c2, c3; \
765     PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l);
766 
767 #define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
768                c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
769                c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
770 { \
771     c0 = vec_packs(c0h, c0l); \
772     c1 = vec_packs(c1h, c1l); \
773     c2 = vec_packs(c2h, c2l); \
774     c3 = vec_packs(c3h, c3l); \
775     c4 = vec_packs(c4h, c4l); \
776     c5 = vec_packs(c5h, c5l); \
777     c6 = vec_packs(c6h, c6l); \
778     c7 = vec_packs(c7h, c7l); \
779 }
780 
781 #define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
782                          c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
783                          c0, c1, c2, c3, c4, c5, c6, c7) \
784 { \
785     PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
786            c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
787            c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
788     IDENTITY_8(c0, c1, c2, c3) \
789     IDENTITY_8(c4, c5, c6, c7) \
790 }
791 
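/*
 * Butterfly helpers. DECLARE_MUL_PAIR_I32 precomputes the four cross
 * products of a coefficient pair and a constant pair, ADD_SUB_PAIR combines
 * them into the rotation r0 = ca*va + cb*vb, r1 = ca*vb - cb*va, and
 * SCALE_ROUND_4 applies a shared +rnd / >>shift rounding to four vectors.
 */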
792 #define DECLARE_SPLAT_I32(val) \
793     i32x4 v##val = vec_splats(val);
794 
795 #define DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \
796     i32x4 ca##va = vec_mul(ca, va); \
797     i32x4 cb##vb = vec_mul(cb, vb); \
798     i32x4 ca##vb = vec_mul(ca, vb); \
799     i32x4 cb##va = vec_mul(cb, va);
800 
801 #define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
802     r0 = vec_adds(ca##va, cb##vb); \
803     r1 = vec_subs(ca##vb, cb##va);
804 
805 #define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
806     i32x4 r0, r1; \
807     ADD_SUB_PAIR(r0, r1, ca, cb, va, vb)
808 
809 #define SCALE_ROUND_4(a, b, c, d, rnd, shift) \
810     a = vec_adds(a, rnd); \
811     b = vec_adds(b, rnd); \
812     c = vec_adds(c, rnd); \
813     d = vec_adds(d, rnd); \
814     a = vec_sra(a, shift); \
815     b = vec_sra(b, shift); \
816     c = vec_sra(c, shift); \
817     d = vec_sra(d, shift);
818 
819 #define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
820                      o0, o1, o2, o3, o4, o5, o6, o7) \
821 { \
822     DECLARE_SPLAT_I32(4076) \
823     DECLARE_SPLAT_I32(401) \
824  \
825     DECLARE_SPLAT_I32(3612) \
826     DECLARE_SPLAT_I32(1931) \
827  \
828     DECLARE_SPLAT_I32(2598) \
829     DECLARE_SPLAT_I32(3166) \
830  \
831     DECLARE_SPLAT_I32(1189) \
832     DECLARE_SPLAT_I32(3920) \
833  \
834     DECLARE_SPLAT_I32(3784) \
835     DECLARE_SPLAT_I32(1567) \
836  \
837     DECLARE_SPLAT_I32(2048) \
838     u32x4 v12 = vec_splat_u32(12); \
839  \
840     DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \
841     DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \
842     DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \
843     DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \
844  \
845     DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \
846     DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \
847     DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \
848     DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \
849  \
850     SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \
851     SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
852  \
853     i32x4 t0 = vec_add(t0a, t4a); \
854     i32x4 t1 = vec_add(t1a, t5a); \
855     i32x4 t2 = vec_add(t2a, t6a); \
856     i32x4 t3 = vec_add(t3a, t7a); \
857     i32x4 t4 = vec_sub(t0a, t4a); \
858     i32x4 t5 = vec_sub(t1a, t5a); \
859     i32x4 t6 = vec_sub(t2a, t6a); \
860     i32x4 t7 = vec_sub(t3a, t7a); \
861  \
862     i16x8 t0t1 = vec_packs(t0, t1); \
863     i16x8 t2t3 = vec_packs(t2, t3); \
864     i16x8 t4t5 = vec_packs(t4, t5); \
865     i16x8 t6t7 = vec_packs(t6, t7); \
866  \
867     UNPACK_4_I16_I32(t4, t5, t6, t7) \
868     UNPACK_4_I16_I32(t0, t1, t2, t3) \
869  \
870     DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \
871     DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \
872  \
873     ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \
874     ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \
875  \
876     SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
877   \
878     o0 = vec_add(t0, t2); \
879     o1 = vec_add(t4a, t6a); \
880     o7 = vec_add(t1, t3); \
881     o6 = vec_add(t5a, t7a); \
882     t2 = vec_sub(t0, t2); \
883     t3 = vec_sub(t1, t3); \
884     t6 = vec_sub(t4a, t6a); \
885     t7 = vec_sub(t5a, t7a); \
886  \
887     i16x8 o7##o1 = vec_packs(o7, o1); \
888     i16x8 o0##o6 = vec_packs(o0, o6); \
889     t2t3 = vec_packs(t2, t3); \
890     t6t7 = vec_packs(t6, t7); \
891  \
892     UNPACK_4_I16_I32(t2, t3, t6, t7) \
893     UNPACK_4_I16_I32(o7, o1, o0, o6) \
894  \
895     o7 = -o7; \
896     o1 = -o1; \
897  \
898     o3 = vec_add(t2, t3); \
899     o4 = vec_sub(t2, t3); \
900     o5 = vec_sub(t6, t7); \
901     o2 = vec_add(t6, t7); \
902  \
903     i32x4 v181 = vec_splats(181); \
904     i32x4 v128 = vec_splats(128); \
905     u32x4 v8 = vec_splat_u32(8); \
906  \
907     o2 = vec_mul(o2, v181); \
908     o3 = vec_mul(o3, v181); \
909     o4 = vec_mul(o4, v181); \
910     o5 = vec_mul(o5, v181); \
911  \
912     SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \
913  \
914     o3 = -o3; \
915     o5 = -o5; \
916 }
917 
918 #define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
919 {\
920     ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
921                  c0, c1, c2, c3, c4, c5, c6, c7) \
922     c01 = vec_packs(c0, c1); \
923     c23 = vec_packs(c2, c3); \
924     c45 = vec_packs(c4, c5); \
925     c67 = vec_packs(c6, c7); \
926     UNPACK_PAIR_I16_I32(c0, c1, c01) \
927     UNPACK_PAIR_I16_I32(c2, c3, c23) \
928     UNPACK_PAIR_I16_I32(c4, c5, c45) \
929     UNPACK_PAIR_I16_I32(c6, c7, c67) \
930 }
931 
932 #define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
933 {\
934     ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
935                  c0, c1, c2, c3, c4, c5, c6, c7) \
936     c01 = vec_packs(c0, c1); \
937     c23 = vec_packs(c2, c3); \
938     c45 = vec_packs(c4, c5); \
939     c67 = vec_packs(c6, c7); \
940 }
941 
942 #define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
943                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
944                     c0, c1, c2, c3, c4, c5, c6, c7) \
945 { \
946     ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
947                  c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
948     ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
949                  c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
950 }
951 
952 #define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
953                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
954                     c0, c1, c2, c3, c4, c5, c6, c7) \
955 { \
956     ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
957                  c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
958     ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
959                  c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
960     PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
961            c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
962            c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
963 }
964 
965 #define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
966 {\
967     ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
968                  c7, c6, c5, c4, c3, c2, c1, c0) \
969     c01 = vec_packs(c0, c1); \
970     c23 = vec_packs(c2, c3); \
971     c45 = vec_packs(c4, c5); \
972     c67 = vec_packs(c6, c7); \
973     UNPACK_PAIR_I16_I32(c0, c1, c01) \
974     UNPACK_PAIR_I16_I32(c2, c3, c23) \
975     UNPACK_PAIR_I16_I32(c4, c5, c45) \
976     UNPACK_PAIR_I16_I32(c6, c7, c67) \
977 }
978 
979 #define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
980 {\
981     ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
982                  c7, c6, c5, c4, c3, c2, c1, c0) \
983     c01 = vec_packs(c0, c1); \
984     c23 = vec_packs(c2, c3); \
985     c45 = vec_packs(c4, c5); \
986     c67 = vec_packs(c6, c7); \
987 }
988 
989 #define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
990                         c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
991                         c0, c1, c2, c3, c4, c5, c6, c7) \
992 { \
993     ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
994                  c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
995     ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
996                  c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
997 }
998 
999 #define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1000                          c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1001                          c0, c1, c2, c3, c4, c5, c6, c7) \
1002 { \
1003     ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1004                  c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
1005     ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1006                  c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
1007     PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
1008            c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1009            c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
1010 }
1011 
1012 void dav1d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1013                                               int16_t *const coeff, const int eob)
1014 {
1015     i16x8 v = vec_splats((int16_t)(2896*8));
1016 
1017     if (eob < 1) {
1018         return dc_only_4xN(dst, stride, coeff, 2, 1, 0);
1019     }
1020 
1021     LOAD_SCALE_COEFF_4x8(coeff, v)
1022 
1023     dct_4_in(c0, c1, c2, c3, c01, c23)
1024     dct_4_in(c4, c5, c6, c7, c45, c67)
1025 
1026 
1027     memset(coeff, 0, sizeof(*coeff) * 4 * 8);
1028 
1029     TRANSPOSE4_I32(c0, c1, c2, c3);
1030     TRANSPOSE4_I32(c4, c5, c6, c7);
1031 
1032     dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
1033 
1034     LOAD_DECLARE_4(dst, stride, a, b, cc, d)
1035     LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
1036 
1037     APPLY_COEFF_4(a, b, cc, d, c01, c23)
1038     APPLY_COEFF_4(e, f, g, hh, c45, c67)
1039 
1040     STORE_4(dst, stride, a, b, cc, d)
1041     STORE_4(dst + 4 * stride, stride, e, f, g, hh)
1042 }
1043 
1044 
1045 #define inv_txfm_fn4x8(type1, type2) \
1046 void dav1d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1047                                                           int16_t *const coeff, const int eob) \
1048 { \
1049     i16x8 v = vec_splats((int16_t)(2896*8)); \
1050     LOAD_SCALE_COEFF_4x8(coeff, v) \
1051     type1##_4_in(c0, c1, c2, c3, c01, c23) \
1052     type1##_4_in(c4, c5, c6, c7, c45, c67) \
1053     memset(coeff, 0, sizeof(*coeff) * 4 * 8); \
1054     TRANSPOSE4_I32(c0, c1, c2, c3); \
1055     TRANSPOSE4_I32(c4, c5, c6, c7); \
1056     type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
1057     LOAD_DECLARE_4(dst, stride, a, b, c, d) \
1058     LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
1059     APPLY_COEFF_4(a, b, c, d, c01, c23) \
1060     APPLY_COEFF_4(e, f, g, h, c45, c67) \
1061     STORE_4(dst, stride, a, b, c, d) \
1062     STORE_4(dst + 4 * stride, stride, e, f, g, h) \
1063 }
1064 
1065 inv_txfm_fn4x8(adst,     dct     )
1066 inv_txfm_fn4x8(dct,      adst    )
1067 inv_txfm_fn4x8(dct,      flipadst)
1068 inv_txfm_fn4x8(flipadst, dct     )
1069 inv_txfm_fn4x8(adst,     flipadst)
1070 inv_txfm_fn4x8(flipadst, adst    )
1071 inv_txfm_fn4x8(identity, dct     )
1072 inv_txfm_fn4x8(dct,      identity)
1073 inv_txfm_fn4x8(identity, flipadst)
1074 inv_txfm_fn4x8(flipadst, identity)
1075 inv_txfm_fn4x8(identity, adst   )
1076 inv_txfm_fn4x8(adst,     identity)
1077 inv_txfm_fn4x8(identity, identity)
1078 inv_txfm_fn4x8(adst,     adst    )
1079 inv_txfm_fn4x8(flipadst, flipadst)
1080 
1081 
1082 void dav1d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1083                                               int16_t *const coeff, const int eob)
1084 {
1085     i16x8 v = vec_splats((int16_t)(2896*8));
1086 
1087     if (eob < 1) {
1088         return dc_only_8xN(dst, stride, coeff, 1, 1, 0);
1089     }
1090 
1091     LOAD_SCALE_COEFF_8x4(coeff, v)
1092 
1093     dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
1094 
1095     memset(coeff, 0, sizeof(*coeff) * 8 * 4);
1096 
1097     TRANSPOSE4_I32(c0, c1, c2, c3)
1098     TRANSPOSE4_I32(c4, c5, c6, c7)
1099 
1100     dct_4_out(c0, c1, c2, c3, c01, c23)
1101     dct_4_out(c4, c5, c6, c7, c45, c67)
1102 
1103     LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh)
1104 
1105     i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45);
1106     i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45);
1107     i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67);
1108     i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67);
1109 
1110     APPLY_COEFF_8x4(ae, bf, c04, c15)
1111     APPLY_COEFF_8x4(cg, dh, c26, c37)
1112 
1113     STORE_8(dst, stride, ae, bf, cg, dh)
1114 }
1115 
1116 
1117 #define inv_txfm_fn8x4(type1, type2) \
1118 void dav1d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1119                                                           int16_t *const coeff, const int eob) \
1120 { \
1121     i16x8 v = vec_splats((int16_t)(2896*8)); \
1122     LOAD_SCALE_COEFF_8x4(coeff, v) \
1123     type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
1124     memset(coeff, 0, sizeof(*coeff) * 8 * 4); \
1125     TRANSPOSE4_I32(c0, c1, c2, c3) \
1126     TRANSPOSE4_I32(c4, c5, c6, c7) \
1127     type2##_4_out(c0, c1, c2, c3, c01, c23) \
1128     type2##_4_out(c4, c5, c6, c7, c45, c67) \
1129     LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \
1130     i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \
1131     i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \
1132     i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \
1133     i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \
1134     APPLY_COEFF_8x4(ae, bf, c04, c15) \
1135     APPLY_COEFF_8x4(cg, dh, c26, c37) \
1136     STORE_8(dst, stride, ae, bf, cg, dh) \
1137 }
1138 inv_txfm_fn8x4(adst,     dct     )
1139 inv_txfm_fn8x4(dct,      adst    )
1140 inv_txfm_fn8x4(dct,      flipadst)
1141 inv_txfm_fn8x4(flipadst, dct     )
1142 inv_txfm_fn8x4(adst,     flipadst)
1143 inv_txfm_fn8x4(flipadst, adst    )
1144 inv_txfm_fn8x4(identity, dct     )
1145 inv_txfm_fn8x4(dct,      identity)
1146 inv_txfm_fn8x4(identity, flipadst)
1147 inv_txfm_fn8x4(flipadst, identity)
1148 inv_txfm_fn8x4(identity, adst   )
1149 inv_txfm_fn8x4(adst,     identity)
1150 inv_txfm_fn8x4(identity, identity)
1151 inv_txfm_fn8x4(adst,     adst    )
1152 inv_txfm_fn8x4(flipadst, flipadst)
1153 
1154 void dav1d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1155                                               int16_t *const coeff, const int eob)
1156 {
1157     if (eob < 1) {
1158         return dc_only_8xN(dst, stride, coeff, 2, 0, 1);
1159     }
1160 
1161     LOAD_COEFF_8x8(coeff)
1162 
1163     dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
1164                c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
1165                c0, c1, c2, c3, c4, c5, c6, c7)
1166 
1167     memset(coeff, 0, sizeof(*coeff) * 8 * 8);
1168 
1169     SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1))
1170     SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1))
1171     SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1))
1172     SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1))
1173 
1174     TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
1175                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l)
1176 
1177     dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
1178                 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
1179                 c0, c1, c2, c3, c4, c5, c6, c7)
1180 
1181     LOAD_DECLARE_4(dst, stride, a, b, cc, d)
1182     LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
1183 
1184     APPLY_COEFF_8x4(a, b, c0, c1)
1185     APPLY_COEFF_8x4(cc, d, c2, c3)
1186     APPLY_COEFF_8x4(e, f, c4, c5)
1187     APPLY_COEFF_8x4(g, hh, c6, c7)
1188 
1189     STORE_8(dst, stride, a, b, cc, d)
1190     STORE_8(dst + 4 * stride, stride, e, f, g, hh)
1191 }
1192 
1193 #define inv_txfm_fn8x8(type1, type2) \
1194 void dav1d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1195                                                           int16_t *const coeff, const int eob) \
1196 { \
1197     LOAD_COEFF_8x8(coeff) \
1198     type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1199                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1200                    c0, c1, c2, c3, c4, c5, c6, c7) \
1201     SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \
1202     SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \
1203     SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \
1204     SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \
1205     memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
1206     TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1207                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
1208     type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1209                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1210                     c0, c1, c2, c3, c4, c5, c6, c7) \
1211     LOAD_DECLARE_4(dst, stride, a, b, c, d) \
1212     LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
1213     APPLY_COEFF_8x4(a, b, c0, c1) \
1214     APPLY_COEFF_8x4(c, d, c2, c3) \
1215     APPLY_COEFF_8x4(e, f, c4, c5) \
1216     APPLY_COEFF_8x4(g, h, c6, c7) \
1217     STORE_8(dst, stride, a, b, c, d) \
1218     STORE_8(dst + 4 * stride, stride, e, f, g, h) \
1219 }
1220 inv_txfm_fn8x8(adst,     dct     )
1221 inv_txfm_fn8x8(dct,      adst    )
1222 inv_txfm_fn8x8(dct,      flipadst)
1223 inv_txfm_fn8x8(flipadst, dct     )
1224 inv_txfm_fn8x8(adst,     flipadst)
1225 inv_txfm_fn8x8(flipadst, adst    )
1226 inv_txfm_fn8x8(dct,      identity)
1227 inv_txfm_fn8x8(flipadst, identity)
1228 inv_txfm_fn8x8(adst,     identity)
1229 inv_txfm_fn8x8(adst,     adst    )
1230 inv_txfm_fn8x8(flipadst, flipadst)
1231 
1232 // The identity_8 row pass (x2) and the inter-pass scale (>>1) cancel out, so both are skipped.
1233 #define inv_txfm_fn8x8_identity(type2) \
1234 void dav1d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1235                                                          int16_t *const coeff, const int eob) \
1236 { \
1237     LOAD_COEFF_8x8(coeff) \
1238     memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
1239     TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1240                    c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
1241     type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1242                     c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1243                     c0, c1, c2, c3, c4, c5, c6, c7) \
1244     LOAD_DECLARE_4(dst, stride, a, b, c, d) \
1245     LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
1246     APPLY_COEFF_8x4(a, b, c0, c1) \
1247     APPLY_COEFF_8x4(c, d, c2, c3) \
1248     APPLY_COEFF_8x4(e, f, c4, c5) \
1249     APPLY_COEFF_8x4(g, h, c6, c7) \
1250     STORE_8(dst, stride, a, b, c, d) \
1251     STORE_8(dst + 4 * stride, stride, e, f, g, h) \
1252 }
1253 inv_txfm_fn8x8_identity(dct     )
1254 inv_txfm_fn8x8_identity(flipadst)
1255 inv_txfm_fn8x8_identity(adst    )
1256 inv_txfm_fn8x8_identity(identity)
1257 
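/*
 * Clamp eight 32-bit vectors to the int16_t range by packing to i16 (which
 * saturates) and unpacking again; the ab/cd/ef/gh outputs are only scratch
 * here. This plays the role of the int16 clamping between stages in the
 * reference C transforms.
 */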
1258 #define CLIP16_I32_8(a, b, c, d, e, f, g, h, \
1259                      ab, cd, ef, gh) \
1260 { \
1261     ab = vec_packs(a, b); \
1262     cd = vec_packs(c, d); \
1263     ef = vec_packs(e, f); \
1264     gh = vec_packs(g, h); \
1265     UNPACK_PAIR_I16_I32(a, b, ab) \
1266     UNPACK_PAIR_I16_I32(c, d, cd) \
1267     UNPACK_PAIR_I16_I32(e, f, ef) \
1268     UNPACK_PAIR_I16_I32(g, h, gh) \
1269 }
1270 
1271 #define MUL_4_INPLACE(a, b, c, d, v) \
1272     a = vec_mul(a, v); \
1273     b = vec_mul(b, v); \
1274     c = vec_mul(c, v); \
1275     d = vec_mul(d, v); \
1276 
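/*
 * 16-point identity transform: a scale by 2*sqrt(2) ~= 2.8284. The 16-bit
 * form computes vec_mradds(v, 1697*16, 2*v) ~= 0.8284*v + 2*v, and the
 * 32-bit IDENTITY_16_4_I32 below computes 2*v + ((v*1697 + 1024) >> 11),
 * which is the same scale.
 */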
1277 #define IDENTITY_16_V(v) \
1278 { \
1279     i16x8 v_ = vec_adds(v, v); \
1280     v = vec_mradds(v, v1697_16, v_); \
1281 }
1282 
1283 #define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
1284                           c08c09, c10c11, c12c13, c14c15) \
1285 { \
1286     i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \
1287     IDENTITY_16_V(c00c01) \
1288     IDENTITY_16_V(c02c03) \
1289     IDENTITY_16_V(c04c05) \
1290     IDENTITY_16_V(c06c07) \
1291     IDENTITY_16_V(c08c09) \
1292     IDENTITY_16_V(c10c11) \
1293     IDENTITY_16_V(c12c13) \
1294     IDENTITY_16_V(c14c15) \
1295 }
1296 
1297 #define IDENTITY_16_4_I32(a, b, c, d) \
1298 { \
1299     i32x4 a2 = vec_add(a, a); \
1300     i32x4 b2 = vec_add(b, b); \
1301     i32x4 c2 = vec_add(c, c); \
1302     i32x4 d2 = vec_add(d, d); \
1303     MUL_4_INPLACE(a, b, c, d, v1697) \
1304     SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \
1305     a = vec_add(a2, a); \
1306     b = vec_add(b2, b); \
1307     c = vec_add(c2, c); \
1308     d = vec_add(d2, d); \
1309 }
1310 
1311 
1312 #define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1313                        c08, c09, c10, c11, c12, c13, c14, c15, \
1314                        c00c01, c02c03, c04c05, c06c07, \
1315                        c08c09, c10c11, c12c13, c14c15) \
1316 { \
1317     DECLARE_SPLAT_I32(1697) \
1318     DECLARE_SPLAT_I32(1024) \
1319     IDENTITY_16_4_I32(c00, c01, c02, c03) \
1320     IDENTITY_16_4_I32(c04, c05, c06, c07) \
1321     IDENTITY_16_4_I32(c08, c09, c10, c11) \
1322     IDENTITY_16_4_I32(c12, c13, c14, c15) \
1323 }
1324 
1325 #define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
1326                         c08, c09, c10, c11, c12, c13, c14, c15, \
1327                         c00c01, c02c03, c04c05, c06c07, \
1328                         c08c09, c10c11, c12c13, c14c15) \
1329 { \
1330     PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
1331            c00, c02, c04, c06, c08, c10, c12, c14, \
1332            c01, c03, c05, c07, c09, c11, c13, c15)  \
1333     IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
1334                       c08c09, c10c11, c12c13, c14c15) \
1335 }
1336 
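/*
 * 16-point inverse DCT. The even coefficients reuse IDCT_8_INNER; the odd
 * half applies the 401/4076, 3166/2598, 1931/3612 and 3920/1189 rotations,
 * clamps the 32-bit intermediates back to int16 range with CLIP16_I32_8
 * between stages, then finishes with the 1567/3784 rotation and the
 * *181, +128, >>8 (~1/sqrt(2)) stage before the final butterflies against
 * the even-half outputs.
 */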
1337 #define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \
1338                       c08, c09, c10, c11, c12, c13, c14, c15, \
1339                       c00c03, c01c02, c07c04, c06c05, \
1340                       c08c11, c09c10, c14c13, c15c12) \
1341     IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \
1342                  c00c03, c01c02, c07c04, c06c05) \
1343     DECLARE_SPLAT_I32(128) \
1344     DECLARE_SPLAT_I32(181) \
1345     DECLARE_SPLAT_I32(401) \
1346     DECLARE_SPLAT_I32(4076) \
1347     DECLARE_SPLAT_I32(3166) \
1348     DECLARE_SPLAT_I32(2598) \
1349     DECLARE_SPLAT_I32(1931) \
1350     DECLARE_SPLAT_I32(3612) \
1351     DECLARE_SPLAT_I32(3920) \
1352     DECLARE_SPLAT_I32(1189) \
1353     DECLARE_SPLAT_I32(1567) \
1354     DECLARE_SPLAT_I32(3784) \
1355 \
1356     DECLARE_MUL_PAIR_I32(c01, c15,  v401, v4076) \
1357     DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \
1358     DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \
1359     DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \
1360 \
1361     DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076,  v401) \
1362     DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \
1363     DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \
1364     DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \
1365 \
1366     SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \
1367     SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \
1368 \
1369     CLIP16_I32_8(t15a, t08a, t14a, t09a, \
1370                  t13a, t10a, t12a, t11a, \
1371                  c08c11, c09c10, c14c13, c15c12) \
1372     DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \
1373     DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \
1374     DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \
1375     DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \
1376 \
1377     CLIP16_I32_8(t08, t09, t11, t10, \
1378                  t12, t13, t15, t14, \
1379                  c08c11, c09c10, c14c13, c15c12) \
1380 \
1381     DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \
1382     DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \
1383     \
1384     ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \
1385     ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \
1386     t10a = -t10a; \
1387 \
1388     SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \
1389 \
1390     ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \
1391     ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \
1392     ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \
1393     ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \
1394 \
1395     CLIP16_I32_8(t08a, t11a, t09, t10, \
1396                  t15a, t12a, t14, t13, \
1397                  c08c11, c09c10, c14c13, c15c12) \
1398     ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \
1399     ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \
1400 \
1401     MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \
1402     SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \
1403 \
1404     DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \
1405                    t15a, t14, t08a, t09, \
1406                    t12, t13a, t11,  t10a) \
1407 \
1408     c15c12 = vec_subs(c00c03, t15at12); \
1409     c14c13 = vec_subs(c01c02, t14t13a); \
1410     c08c11 = vec_subs(c07c04, t08at11); \
1411     c09c10 = vec_subs(c06c05, t09t10a); \
1412     c00c03 = vec_adds(c00c03, t15at12); \
1413     c01c02 = vec_adds(c01c02, t14t13a); \
1414     c07c04 = vec_adds(c07c04, t08at11); \
1415     c06c05 = vec_adds(c06c05, t09t10a); \
1416 
1417 #define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
1418                    c08, c09, c10, c11, c12, c13, c14, c15, \
1419                    c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1420 \
1421     i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \
1422     IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1423                   c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
1424     c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \
1425     c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \
1426     c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \
1427     c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \
1428     c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \
1429     c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \
1430     c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \
1431     c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12); \
1432 
1433 #define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1434                   c08, c09, c10, c11, c12, c13, c14, c15, \
1435                   c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
1436 \
1437     IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1438                   c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
1439     UNPACK_PAIR_I16_I32(c00, c03, c00c03) \
1440     UNPACK_PAIR_I16_I32(c01, c02, c01c02) \
1441     UNPACK_PAIR_I16_I32(c07, c04, c07c04) \
1442     UNPACK_PAIR_I16_I32(c06, c05, c06c05) \
1443     UNPACK_PAIR_I16_I32(c08, c11, c08c11) \
1444     UNPACK_PAIR_I16_I32(c09, c10, c09c10) \
1445     UNPACK_PAIR_I16_I32(c14, c13, c14c13) \
1446     UNPACK_PAIR_I16_I32(c15, c12, c15c12) \
1447 
1448 
1449 #define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1450                    cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1451                    a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1452     dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
1453     dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
1454     dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
1455     dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
1456 
1457 
1458 #define PACK_4x4(c00, c01, c02, c03, \
1459                  c04, c05, c06, c07, \
1460                  c08, c09, c10, c11, \
1461                  c12, c13, c14, c15, \
1462                  c00c01, c02c03, c04c05, c06c07, \
1463                  c08c09, c10c11, c12c13, c14c15) \
1464 { \
1465     c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \
1466     c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \
1467     c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \
1468     c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \
1469 }
1470 
1471 
1472 
1473 #define dct_4x4_out(c00, c01, c02, c03, \
1474                     c04, c05, c06, c07, \
1475                     c08, c09, c10, c11, \
1476                     c12, c13, c14, c15, \
1477                     c00c01, c02c03, c04c05, c06c07, \
1478                     c08c09, c10c11, c12c13, c14c15) \
1479 { \
1480     IDCT_4_INNER(c00, c01, c02, c03) \
1481     IDCT_4_INNER(c04, c05, c06, c07) \
1482     IDCT_4_INNER(c08, c09, c10, c11) \
1483     IDCT_4_INNER(c12, c13, c14, c15) \
1484 \
1485     PACK_4x4(c00, c01, c02, c03, \
1486              c04, c05, c06, c07, \
1487              c08, c09, c10, c11, \
1488              c12, c13, c14, c15, \
1489              c00c01, c02c03, c04c05, c06c07, \
1490              c08c09, c10c11, c12c13, c14c15) \
1491 }
1492 
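/*
 * 32-bit form of the 4-point identity scale: (x * 5793 + 2048) >> 12, i.e.
 * roughly x * sqrt(2).
 */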
1493 #define IDENTITY_4_I32(a, b, c, d) \
1494 { \
1495     DECLARE_SPLAT_I32(5793) \
1496     DECLARE_SPLAT_I32(2048) \
1497     MUL_4_INPLACE(a, b, c, d, v5793) \
1498     SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \
1499 }
1500 
1501 #define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1502                        cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1503                        a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1504 { \
1505     IDENTITY_4_I32(cA0, cA1, cA2, cA3) \
1506     IDENTITY_4_I32(cB0, cB1, cB2, cB3) \
1507     IDENTITY_4_I32(cC0, cC1, cC2, cC3) \
1508     IDENTITY_4_I32(cD0, cD1, cD2, cD3) \
1509 }
1510 
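/* Identity final pass: pack the i32 intermediates to i16 first, then run the
   16-bit IDENTITY_4 on each packed pair. */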
1511 #define identity_4x4_out(c00, c01, c02, c03, \
1512                          c04, c05, c06, c07, \
1513                          c08, c09, c10, c11, \
1514                          c12, c13, c14, c15, \
1515                          c00c01, c02c03, c04c05, c06c07, \
1516                          c08c09, c10c11, c12c13, c14c15) \
1517 { \
1518     PACK_4x4(c00, c01, c02, c03, \
1519              c04, c05, c06, c07, \
1520              c08, c09, c10, c11, \
1521              c12, c13, c14, c15, \
1522              c00c01, c02c03, c04c05, c06c07, \
1523              c08c09, c10c11, c12c13, c14c15) \
1524     IDENTITY_4(c00c01, c02c03) \
1525     IDENTITY_4(c04c05, c06c07) \
1526     IDENTITY_4(c08c09, c10c11) \
1527     IDENTITY_4(c12c13, c14c15) \
1528 }
1529 
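/* ADST first pass (adst_4_in) applied independently to each of the four groups. */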
1530 #define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1531                     cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1532                     a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1533     adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
1534     adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
1535     adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
1536     adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
1537 
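/* ADST final pass: in-place ADST_INNER_4 on each group, then pack to 16 bits. */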
1538 #define adst_4x4_out(c00, c01, c02, c03, \
1539                      c04, c05, c06, c07, \
1540                      c08, c09, c10, c11, \
1541                      c12, c13, c14, c15, \
1542                      c00c01, c02c03, c04c05, c06c07, \
1543                      c08c09, c10c11, c12c13, c14c15) \
1544 { \
1545     ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \
1546     ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \
1547     ADST_INNER_4(c08, c09, c10, c11, c08, c09, c10, c11) \
1548     ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \
1549 \
1550     PACK_4x4(c00, c01, c02, c03, \
1551              c04, c05, c06, c07, \
1552              c08, c09, c10, c11, \
1553              c12, c13, c14, c15, \
1554              c00c01, c02c03, c04c05, c06c07, \
1555              c08c09, c10c11, c12c13, c14c15) \
1556 }
1557 
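/* Flipped-ADST variants: same shape as the adst_4x4 macros, except the _out form
   writes the ADST_INNER_4 results in reversed order (c03..c00, etc.). */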
1558 #define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1559                         cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1560                         a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1561     flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
1562     flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
1563     flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
1564     flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
1565 
1566 #define flipadst_4x4_out(c00, c01, c02, c03, \
1567                          c04, c05, c06, c07, \
1568                          c08, c09, c10, c11, \
1569                          c12, c13, c14, c15, \
1570                          c00c01, c02c03, c04c05, c06c07, \
1571                          c08c09, c10c11, c12c13, c14c15) \
1572 { \
1573     ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \
1574     ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \
1575     ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) \
1576     ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \
1577 \
1578     PACK_4x4(c00, c01, c02, c03, \
1579              c04, c05, c06, c07, \
1580              c08, c09, c10, c11, \
1581              c12, c13, c14, c15, \
1582              c00c01, c02c03, c04c05, c06c07, \
1583              c08c09, c10c11, c12c13, c14c15) \
1584 }
1585 
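/* 16-point inverse ADST butterfly network using the AV1 16-point ADST constants
   (201/4091, 995/3973, ...). Intermediates are rounded with bias 2048, shift 12 and
   clipped to the 16-bit range via CLIP16_I32_8; outputs o04..o11 are scaled by 181
   with bias 128, shift 8, and the odd-indexed outputs are negated. */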
1586 #define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \
1587                       c08, c09, c10, c11, c12, c13, c14, c15, \
1588                       o00, o01, o02, o03, o04, o05, o06, o07, \
1589                       o08, o09, o10, o11, o12, o13, o14, o15, \
1590                       c00c01, c02c03, c04c05, c06c07) \
1591     DECLARE_SPLAT_I32(2048); \
1592     u32x4 v12 = vec_splat_u32(12); \
1593     DECLARE_SPLAT_I32(4091) \
1594     DECLARE_SPLAT_I32(201) \
1595     DECLARE_SPLAT_I32(3973) \
1596     DECLARE_SPLAT_I32(995) \
1597     DECLARE_SPLAT_I32(3703) \
1598     DECLARE_SPLAT_I32(1751) \
1599     DECLARE_SPLAT_I32(3290) \
1600     DECLARE_SPLAT_I32(2440) \
1601     DECLARE_SPLAT_I32(2751) \
1602     DECLARE_SPLAT_I32(3035) \
1603     DECLARE_SPLAT_I32(2106) \
1604     DECLARE_SPLAT_I32(3513) \
1605     DECLARE_SPLAT_I32(1380) \
1606     DECLARE_SPLAT_I32(3857) \
1607     DECLARE_SPLAT_I32(601) \
1608     DECLARE_SPLAT_I32(4052) \
1609 \
1610     DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \
1611     DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \
1612     DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \
1613     DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \
1614     DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \
1615     DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \
1616     DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \
1617     DECLARE_MUL_PAIR_I32(c01, c14,  v601, v4052) \
1618 \
1619     DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201) \
1620     DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \
1621     DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \
1622     DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \
1623     DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \
1624     DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \
1625     DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \
1626     DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14,  v601, v4052) \
1627 \
1628     SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \
1629     SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \
1630     SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
1631     SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \
1632 \
1633     DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \
1634     DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \
1635     DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \
1636     DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \
1637     DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \
1638     DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \
1639     DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \
1640     DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \
1641 \
1642     CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \
1643                  c00c01, c02c03, c04c05, c06c07); \
1644     CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \
1645                  c00c01, c02c03, c04c05, c06c07); \
1646 \
1647     DECLARE_SPLAT_I32(4017) \
1648     DECLARE_SPLAT_I32(799) \
1649     DECLARE_SPLAT_I32(2276) \
1650     DECLARE_SPLAT_I32(3406) \
1651 \
1652     DECLARE_MUL_PAIR_I32(t08a, t09a, v4017,  v799); \
1653     DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \
1654     DECLARE_MUL_PAIR_I32(t13a, t12a,  v799, v4017); \
1655     DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \
1656 \
1657     ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017,  v799); \
1658     ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \
1659     ADD_SUB_PAIR(t13, t12, t13a, t12a,  v799, v4017); \
1660     ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \
1661 \
1662     SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
1663     SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \
1664 \
1665     ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \
1666     ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \
1667     ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \
1668     ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \
1669     ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \
1670     ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \
1671     ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \
1672     ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \
1673 \
1674     CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \
1675                  c00c01, c02c03, c04c05, c06c07) \
1676     CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \
1677                  c00c01, c02c03, c04c05, c06c07) \
1678 \
1679     DECLARE_SPLAT_I32(3784) \
1680     DECLARE_SPLAT_I32(1567) \
1681 \
1682     DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \
1683     DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \
1684     DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \
1685     DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \
1686 \
1687     ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \
1688     ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \
1689     ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \
1690     ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \
1691 \
1692     SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \
1693     SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \
1694 \
1695     ADD_SUB_PAIR(o00, t02a, t00,  t02,,) \
1696     ADD_SUB_PAIR(o15, t03a, t01,  t03,,) \
1697     ADD_SUB_PAIR(o03, t06,  t04a, t06a,,) \
1698     ADD_SUB_PAIR(o12, t07,  t05a, t07a,,) \
1699     ADD_SUB_PAIR(o01, t10,  t08a, t10a,,) \
1700     ADD_SUB_PAIR(o14, t11,  t09a, t11a,,) \
1701     ADD_SUB_PAIR(o02, t14a, t12,  t14,,) \
1702     ADD_SUB_PAIR(o13, t15a, t13,  t15,,) \
1703 \
1704     CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \
1705                  c00c01, c02c03, c04c05, c06c07) \
1706     CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \
1707                  c00c01, c02c03, c04c05, c06c07) \
1708 \
1709     DECLARE_SPLAT_I32(181) \
1710     DECLARE_SPLAT_I32(128) \
1711     u32x4 v8 = vec_splat_u32(8); \
1712 \
1713     ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \
1714     ADD_SUB_PAIR(o04, o11, t06,  t07,,) \
1715     ADD_SUB_PAIR(o06, o09, t10,  t11,,) \
1716     ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \
1717 \
1718     MUL_4_INPLACE(o07, o08, o04, o11, v181) \
1719     MUL_4_INPLACE(o06, o09, o05, o10, v181) \
1720 \
1721     SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \
1722     SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \
1723 \
1724     o01 = -o01; \
1725     o03 = -o03; \
1726     o05 = -o05; \
1727     o07 = -o07; \
1728     o09 = -o09; \
1729     o11 = -o11; \
1730     o13 = -o13; \
1731     o15 = -o15; \
1732 
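/* adst_16_in/adst_16_out wrap ADST_INNER_16 with the results written in place; the
   _out variant additionally packs the sixteen i32x4 outputs into i16x8 pairs. */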
1733 #define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1734                    c08, c09, c10, c11, c12, c13, c14, c15, \
1735                    c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1736 { \
1737     ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1738                   c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1739                   c00c01, c02c03, c04c05, c06c07) \
1740 }
1741 
1742 #define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
1743                     c08, c09, c10, c11, c12, c13, c14, c15, \
1744                     c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1745 { \
1746     ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1747                   c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1748                   c00c01, c02c03, c04c05, c06c07) \
1749     PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
1750            c00, c02, c04, c06, c08, c10, c12, c14, \
1751            c01, c03, c05, c07, c09, c11, c13, c15) \
1752 }
1753 
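/* Flipped-ADST 16-point variants: identical to adst_16_in/out except that the
   outputs are produced in reversed order (c15..c00). */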
1754 #define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1755                        c08, c09, c10, c11, c12, c13, c14, c15, \
1756                        c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1757 { \
1758     ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1759                   c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
1760                   c00c01, c02c03, c04c05, c06c07) \
1761 }
1762 
1763 #define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
1764                         c08, c09, c10, c11, c12, c13, c14, c15, \
1765                         c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1766 { \
1767     ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1768                   c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
1769                   c00c01, c02c03, c04c05, c06c07) \
1770     PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
1771            c00, c02, c04, c06, c08, c10, c12, c14, \
1772            c01, c03, c05, c07, c09, c11, c13, c15) \
1773 }
1774 
1775 
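/* 4x16 inverse DCT-DCT: DC-only fast path when eob < 1, otherwise a 4-point first
   pass, intermediate rounding (bias 1, shift 1), a 4x16 transpose, a 16-point second
   pass, and addition of the residual to the destination rows. */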
1776 void dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1777                                                int16_t *const coeff, const int eob
1778                                                HIGHBD_DECL_SUFFIX)
1779 {
1780     if (eob < 1) {
1781         return dc_only_4xN(dst, stride, coeff, 4, 0, 1);
1782     }
1783 
1784     LOAD_COEFF_4x16(coeff)
1785 
1786     dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
1787                cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
1788                a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
1789 
1790     memset(coeff, 0, sizeof(*coeff) * 4 * 16);
1791 
1792     SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1))
1793     SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1))
1794     SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1))
1795     SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1))
1796     TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
1797                       cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3)
1798 
1799     dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
1800                cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
1801                a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)
1802 
1803     LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03)
1804     LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07)
1805     LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11)
1806     LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15)
1807 
1808     APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0);
1809     APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1);
1810     APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2);
1811     APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3);
1812 
1813     STORE_4(dst, stride,               l00, l01, l02, l03);
1814     STORE_4(dst + 4 * stride, stride,  l04, l05, l06, l07);
1815     STORE_4(dst + 8 * stride, stride,  l08, l09, l10, l11);
1816     STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15);
1817 }
1818 
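/* Generates the remaining first/second transform combinations for 4x16 blocks;
   same structure as the dct_dct function above, but without the DC-only early-out. */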
1819 #define inv_txfm_fn4x16(type1, type2) \
1820 void dav1d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1821                                                           int16_t *const coeff, const int eob) \
1822 { \
1823     LOAD_COEFF_4x16(coeff) \
1824     type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1825                    cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1826                    a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1827     memset(coeff, 0, sizeof(*coeff) * 4 * 16); \
1828     SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \
1829     SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) \
1830     SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \
1831     SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \
1832     TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1833                       cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \
1834     type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1835                    cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1836                    a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1837     LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \
1838     LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \
1839     LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \
1840     LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \
1841     APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \
1842     APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \
1843     APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \
1844     APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \
1845     STORE_4(dst, stride,               l00, l01, l02, l03); \
1846     STORE_4(dst + 4 * stride, stride,  l04, l05, l06, l07); \
1847     STORE_4(dst + 8 * stride, stride,  l08, l09, l10, l11); \
1848     STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \
1849 }
1850 inv_txfm_fn4x16(adst,     dct     )
1851 inv_txfm_fn4x16(dct,      adst    )
1852 inv_txfm_fn4x16(dct,      flipadst)
1853 inv_txfm_fn4x16(flipadst, dct     )
1854 inv_txfm_fn4x16(adst,     flipadst)
1855 inv_txfm_fn4x16(flipadst, adst    )
1856 inv_txfm_fn4x16(identity, dct     )
1857 inv_txfm_fn4x16(dct,      identity)
1858 inv_txfm_fn4x16(identity, flipadst)
1859 inv_txfm_fn4x16(flipadst, identity)
1860 inv_txfm_fn4x16(identity, adst    )
1861 inv_txfm_fn4x16(adst,     identity)
1862 inv_txfm_fn4x16(identity, identity)
1863 inv_txfm_fn4x16(adst,     adst    )
1864 inv_txfm_fn4x16(flipadst, flipadst)
1865 
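/* 16x4 inverse DCT-DCT: DC-only fast path when eob < 1, otherwise a 16-point first
   pass, intermediate rounding (bias 1, shift 1), per-sub-block 4x4 transposes, a
   4-point second pass, and addition of the residual to the four destination rows. */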
1866 void dav1d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1867                                                int16_t *const coeff, const int eob)
1868 {
1869 
1870     if (eob < 1) {
1871         return dc_only_16xN(dst, stride, coeff, 1, 0, 1);
1872     }
1873 
1874     LOAD_DECLARE_2_I16(coeff, c00c01, c02c03)
1875     LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07)
1876     LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11)
1877     LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15)
1878     UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03)
1879     UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07)
1880     UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11)
1881     UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15)
1882 
1883     dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07,
1884               c08, c09, c10, c11, c12, c13, c14, c15,
1885               c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15)
1886     memset(coeff, 0, sizeof(*coeff) * 16 * 4);
1887     SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1))
1888     SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1))
1889     SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1))
1890     SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1))
1891 
1892     TRANSPOSE4_I32(c00, c01, c02, c03);
1893     TRANSPOSE4_I32(c04, c05, c06, c07);
1894     TRANSPOSE4_I32(c08, c09, c10, c11);
1895     TRANSPOSE4_I32(c12, c13, c14, c15);
1896 
1897     dct_4x4_out(c00, c01, c02, c03,
1898                 c04, c05, c06, c07,
1899                 c08, c09, c10, c11,
1900                 c12, c13, c14, c15,
1901                 c00c01, c02c03, c04c05, c06c07,
1902                 c08c09, c10c11, c12c13, c14c15)
1903 
1904     LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3)
1905 
1906     APPLY_COEFF_16x4(l0, l1, l2, l3,
1907                      c00c01, c02c03, c04c05, c06c07,
1908                      c08c09, c10c11, c12c13, c14c15)
1909 
1910     STORE_16(dst, stride, l0, l1, l2, l3)
1911 }
1912 
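/* Generates the remaining first/second transform combinations for 16x4 blocks;
   identity-first variants are handled separately below. */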
1913 #define inv_txfm_fn16x4(type1, type2) \
1914 void dav1d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1915                                                           int16_t *const coeff, const int eob) \
1916 { \
1917     LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
1918     LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
1919     LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
1920     LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
1921     UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
1922     UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
1923     UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
1924     UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
1925     type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1926                   c08, c09, c10, c11, c12, c13, c14, c15, \
1927                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1928     memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
1929     SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
1930     SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
1931     SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
1932     SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
1933     TRANSPOSE4_I32(c00, c01, c02, c03); \
1934     TRANSPOSE4_I32(c04, c05, c06, c07); \
1935     TRANSPOSE4_I32(c08, c09, c10, c11); \
1936     TRANSPOSE4_I32(c12, c13, c14, c15); \
1937     type2##_4x4_out(c00, c01, c02, c03, \
1938                     c04, c05, c06, c07, \
1939                     c08, c09, c10, c11, \
1940                     c12, c13, c14, c15, \
1941                     c00c01, c02c03, c04c05, c06c07, \
1942                     c08c09, c10c11, c12c13, c14c15); \
1943     LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
1944     APPLY_COEFF_16x4(l0, l1, l2, l3, \
1945                      c00c01, c02c03, c04c05, c06c07, \
1946                      c08c09, c10c11, c12c13, c14c15) \
1947     STORE_16(dst, stride, l0, l1, l2, l3) \
1948 }
1949 
1950 inv_txfm_fn16x4(adst,     dct     )
1951 inv_txfm_fn16x4(dct,      adst    )
1952 inv_txfm_fn16x4(dct,      flipadst)
1953 inv_txfm_fn16x4(flipadst, dct     )
1954 inv_txfm_fn16x4(adst,     flipadst)
1955 inv_txfm_fn16x4(flipadst, adst    )
1956 inv_txfm_fn16x4(dct,      identity)
1957 inv_txfm_fn16x4(flipadst, identity)
1958 inv_txfm_fn16x4(adst,     identity)
1959 inv_txfm_fn16x4(identity, identity)
1960 inv_txfm_fn16x4(adst,     adst    )
1961 inv_txfm_fn16x4(flipadst, flipadst)
1962 
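/* Identity-first 16x4 variants need an extra CLIP16_I32_8 clamp of the scaled
   intermediates before the transpose, hence this separate macro. */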
1963 #define inv_txfm_fn16x4_identity(type2) \
1964 void dav1d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1965                                                           int16_t *const coeff, const int eob) \
1966 { \
1967     LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
1968     LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
1969     LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
1970     LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
1971     UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
1972     UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
1973     UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
1974     UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
1975     identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1976                   c08, c09, c10, c11, c12, c13, c14, c15, \
1977                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1978     memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
1979     SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
1980     SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
1981     SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
1982     SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
1983     CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \
1984     CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \
1985     TRANSPOSE4_I32(c00, c01, c02, c03); \
1986     TRANSPOSE4_I32(c04, c05, c06, c07); \
1987     TRANSPOSE4_I32(c08, c09, c10, c11); \
1988     TRANSPOSE4_I32(c12, c13, c14, c15); \
1989     type2##_4x4_out(c00, c01, c02, c03, \
1990                     c04, c05, c06, c07, \
1991                     c08, c09, c10, c11, \
1992                     c12, c13, c14, c15, \
1993                     c00c01, c02c03, c04c05, c06c07, \
1994                     c08c09, c10c11, c12c13, c14c15); \
1995     LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
1996     APPLY_COEFF_16x4(l0, l1, l2, l3, \
1997                      c00c01, c02c03, c04c05, c06c07, \
1998                      c08c09, c10c11, c12c13, c14c15) \
1999     STORE_16(dst, stride, l0, l1, l2, l3) \
2000 }
2001 
2002 inv_txfm_fn16x4_identity(dct)
2003 inv_txfm_fn16x4_identity(adst)
2004 inv_txfm_fn16x4_identity(flipadst)
2005 
2006 #endif // BITDEPTH
2007