1 /*
2 * Copyright © 2024, VideoLAN and dav1d authors
3 * Copyright © 2024, Luca Barbato
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "src/ppc/dav1d_types.h"
29 #include "src/ppc/itx.h"
30 #include "src/ppc/utils.h"
31
32 #if BITDEPTH == 8
33
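/*
 * 8 bpc inverse transforms for POWER9 (suffix _pwr9).  Coefficients are
 * loaded as i16x8 vectors, widened to i32x4 pairs for the 1-D passes, and
 * packed back with saturation before being added to the destination.  The
 * trig constants used throughout are the usual AV1 Q12 (x4096) values.
 */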
34 #define LOAD_4(src, stride, a, b, c, d) \
35 { \
36 uint8_t *s = src; \
37 a = vec_xl(0, s); \
38 s += stride; \
39 b = vec_xl(0, s); \
40 s += stride; \
41 c = vec_xl(0, s); \
42 s += stride; \
43 d = vec_xl(0, s); \
44 }
45
46 #define LOAD_DECLARE_2_I16(src, a, b) \
47 i16x8 a = vec_xl(0, src); \
48 i16x8 b = vec_xl(0, src + 8);
49
50 #define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \
51 i32x4 a = i16h_to_i32(sa); \
52 i32x4 b = i16l_to_i32(sa); \
53 i32x4 c = i16h_to_i32(sb); \
54 i32x4 d = i16l_to_i32(sb);
55
56 #define LOAD_COEFF_4(coeff) \
57 LOAD_DECLARE_2_I16(coeff, c01, c23) \
58 UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3)
59
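/*
 * Rectangular (rect2) blocks are pre-scaled by 1/sqrt(2) while loading:
 * vec_mradds(c, 2896*8, 0) computes (c * 23168 + 16384) >> 15, i.e. roughly
 * c * 2896 / 4096.
 */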
60 #define LOAD_SCALE_COEFF_4x8(coeff, scale) \
61 LOAD_DECLARE_2_I16(coeff, c04, c15) \
62 LOAD_DECLARE_2_I16(coeff+16, c26, c37) \
63 i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \
64 i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \
65 i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \
66 i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \
67 c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
68 c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
69 UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
70 c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
71 c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
72 UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)
73
74 #define LOAD_SCALE_COEFF_8x4(coeff, scale) \
75 LOAD_DECLARE_2_I16(coeff, c01, c23) \
76 LOAD_DECLARE_2_I16(coeff+16, c45, c67) \
77 c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \
78 c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \
79 UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \
80 c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \
81 c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \
82 UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7)
83
84 #define LOAD_COEFF_8x8(coeff) \
85 LOAD_DECLARE_2_I16(coeff, c0, c1) \
86 LOAD_DECLARE_2_I16(coeff+16, c2, c3) \
87 LOAD_DECLARE_2_I16(coeff+32, c4, c5) \
88 LOAD_DECLARE_2_I16(coeff+48, c6, c7) \
89 UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, c1h, c1l) \
90 UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \
91 UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \
92 UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l) \
93
94 #define LOAD_COEFF_4x16(coeff) \
95 LOAD_DECLARE_2_I16(coeff, a0b0, c0d0) \
96 LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \
97 LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \
98 LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \
99 UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \
100 UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \
101 UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \
102 UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3)
103
104 #define LOAD_DECLARE_4(src, stride, a, b, c, d) \
105 u8x16 a, b, c, d; \
106 LOAD_4(src, stride, a, b, c, d)
107
108 #define STORE_LEN(l, dst, stride, a, b, c, d) \
109 { \
110 uint8_t *dst2 = dst; \
111 vec_xst_len(a, dst2, l); \
112 dst2 += stride; \
113 vec_xst_len(b, dst2, l); \
114 dst2 += stride; \
115 vec_xst_len(c, dst2, l); \
116 dst2 += stride; \
117 vec_xst_len(d, dst2, l); \
118 }
119
120 #define STORE_4(dst, stride, a, b, c, d) \
121 STORE_LEN(4, dst, stride, a, b, c, d)
122
123 #define STORE_8(dst, stride, ab, cd, ef, gh) \
124 STORE_LEN(8, dst, stride, ab, cd, ef, gh)
125
126 #define STORE_16(dst, stride, l0, l1, l2, l3) \
127 { \
128 uint8_t *dst##2 = dst; \
129 vec_xst(l0, 0, dst##2); \
130 dst##2 += stride; \
131 vec_xst(l1, 0, dst##2); \
132 dst##2 += stride; \
133 vec_xst(l2, 0, dst##2); \
134 dst##2 += stride; \
135 vec_xst(l3, 0, dst##2); \
136 }
137
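/*
 * Output helpers: round the second-pass results with (c + 8) >> 4, add them
 * to the widened destination pixels with saturation, and pack back to
 * unsigned 8-bit.
 */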
138 #define APPLY_COEFF_4(a, b, c, d, c01, c23) \
139 { \
140 u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
141 u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \
142 \
143 c01 = vec_adds(c01, vec_splat_s16(8)); \
144 c23 = vec_adds(c23, vec_splat_s16(8)); \
145 c01 = vec_sra(c01, vec_splat_u16(4)); \
146 c23 = vec_sra(c23, vec_splat_u16(4)); \
147 \
148 i16x8 abs = u8h_to_i16(ab); \
149 i16x8 cds = u8h_to_i16(cd); \
150 \
151 abs = vec_adds(abs, c01); \
152 cds = vec_adds(cds, c23); \
153 \
154 a = vec_packsu(abs, abs); \
155 c = vec_packsu(cds, cds); \
156 \
157 b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \
158 d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \
159 }
160
161 #define APPLY_COEFF_8x4(ab, cd, c01, c23) \
162 { \
163 i16x8 abs = u8h_to_i16(ab); \
164 i16x8 cds = u8h_to_i16(cd); \
165 c01 = vec_adds(c01, vec_splat_s16(8)); \
166 c23 = vec_adds(c23, vec_splat_s16(8)); \
167 c01 = vec_sra(c01, vec_splat_u16(4)); \
168 c23 = vec_sra(c23, vec_splat_u16(4)); \
169 \
170 abs = vec_adds(abs, c01); \
171 cds = vec_adds(cds, c23); \
172 \
173 ab = vec_packsu(abs, abs); \
174 cd = vec_packsu(cds, cds); \
175 }
176
177 #define APPLY_COEFF_16x4(a, b, c, d, \
178 c00c01, c02c03, c04c05, c06c07, \
179 c08c09, c10c11, c12c13, c14c15) \
180 { \
181 i16x8 ah = u8h_to_i16(a); \
182 i16x8 al = u8l_to_i16(a); \
183 i16x8 bh = u8h_to_i16(b); \
184 i16x8 bl = u8l_to_i16(b); \
185 i16x8 ch = u8h_to_i16(c); \
186 i16x8 cl = u8l_to_i16(c); \
187 i16x8 dh = u8h_to_i16(d); \
188 i16x8 dl = u8l_to_i16(d); \
189 SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \
190 SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \
191 \
192 ah = vec_adds(ah, c00c01); \
193 al = vec_adds(al, c02c03); \
194 bh = vec_adds(bh, c04c05); \
195 bl = vec_adds(bl, c06c07); \
196 ch = vec_adds(ch, c08c09); \
197 cl = vec_adds(cl, c10c11); \
198 dh = vec_adds(dh, c12c13); \
199 dl = vec_adds(dl, c14c15); \
200 \
201 a = vec_packsu(ah, al); \
202 b = vec_packsu(bh, bl); \
203 c = vec_packsu(ch, cl); \
204 d = vec_packsu(dh, dl); \
205 }
206
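/*
 * 4-point inverse DCT on i32x4 lanes (one column per lane).  2896 ~= 4096 /
 * sqrt(2), 1567 ~= 4096 * sin(pi / 8), 3784 ~= 4096 * cos(pi / 8); each
 * result is rounded with +2048 and >> 12.  Illustrative scalar sketch of
 * what one lane computes (not part of the build):
 *
 *   t0 = ((c0 + c2) * 2896 + 2048) >> 12;
 *   t1 = ((c0 - c2) * 2896 + 2048) >> 12;
 *   t2 = (c1 * 1567 - c3 * 3784 + 2048) >> 12;
 *   t3 = (c1 * 3784 + c3 * 1567 + 2048) >> 12;
 *   out0 = t0 + t3;  out1 = t1 + t2;  out2 = t1 - t2;  out3 = t0 - t3;
 */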
207 #define IDCT_4_INNER(c0, c1, c2, c3) \
208 { \
209 i32x4 o0 = vec_add(c0, c2); \
210 i32x4 o1 = vec_sub(c0, c2); \
211 \
212 i32x4 v2896 = vec_splats(2896); \
213 i32x4 v1567 = vec_splats(1567); \
214 i32x4 v3784 = vec_splats(3784); \
215 i32x4 v2048 = vec_splats(2048); \
216 \
217 o0 = vec_mul(o0, v2896); \
218 o1 = vec_mul(o1, v2896); \
219 \
220 i32x4 o2a = vec_mul(c1, v1567); \
221 i32x4 o2b = vec_mul(c3, v3784); \
222 i32x4 o3a = vec_mul(c1, v3784); \
223 i32x4 o3b = vec_mul(c3, v1567); \
224 \
225 i32x4 o2 = vec_sub(o2a, o2b); \
226 i32x4 o3 = vec_add(o3a, o3b); \
227 \
228 u32x4 v12 = vec_splat_u32(12); \
229 \
230 o0 = vec_add(o0, v2048); \
231 o1 = vec_add(o1, v2048); \
232 o2 = vec_add(o2, v2048); \
233 o3 = vec_add(o3, v2048); \
234 \
235 o0 = vec_sra(o0, v12); \
236 o1 = vec_sra(o1, v12); \
237 o2 = vec_sra(o2, v12); \
238 o3 = vec_sra(o3, v12); \
239 \
240 c0 = vec_add(o0, o3); \
241 c1 = vec_add(o1, o2); \
242 c2 = vec_sub(o1, o2); \
243 c3 = vec_sub(o0, o3); \
244 \
245 }
246
247 #define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \
248 IDCT_4_INNER(c0, c1, c2, c3) \
249 c03 = vec_packs(c0, c3); \
250 c12 = vec_packs(c1, c2); \
251
252 #define dct_4_in(c0, c1, c2, c3, c01, c23) \
253 { \
254 IDCT_4_INNER(c0, c1, c2, c3) \
255 c01 = vec_packs(c0, c1); \
256 c23 = vec_packs(c2, c3); \
257 c0 = i16h_to_i32(c01); \
258 c1 = i16l_to_i32(c01); \
259 c2 = i16h_to_i32(c23); \
260 c3 = i16l_to_i32(c23); \
261 }
262
263 #define dct_4_out(c0, c1, c2, c3, c01, c23) \
264 IDCT_4_INNER(c0, c1, c2, c3) \
265 c01 = vec_packs(c0, c1); \
266 c23 = vec_packs(c2, c3); \
267
268
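/*
 * 4-point identity scales by sqrt(2): vec_mradds with 1697*8 yields roughly
 * c * 1697 / 4096 ~= c * (sqrt(2) - 1), which is then added back to c.
 */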
269 #define IDENTITY_4(c01, c23) \
270 { \
271 i16x8 v1697 = vec_splats((int16_t)(1697*8)); \
272 i16x8 o01 = vec_mradds(c01, v1697, vec_splat_s16(0)); \
273 i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \
274 c01 = vec_adds(c01, o01); \
275 c23 = vec_adds(c23, o23); \
276 }
277
278 #define identity_4_in(c0, c1, c2, c3, c01, c23) \
279 { \
280 IDENTITY_4(c01, c23) \
281 c0 = i16h_to_i32(c01); \
282 c1 = i16l_to_i32(c01); \
283 c2 = i16h_to_i32(c23); \
284 c3 = i16l_to_i32(c23); \
285 }
286
287 #define identity_4_out(c0, c1, c2, c3, c01, c23) \
288 { \
289 c01 = vec_packs(c0, c1); \
290 c23 = vec_packs(c2, c3); \
291 IDENTITY_4(c01, c23) \
292 }
293
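/*
 * 4-point inverse ADST.  1321, 2482, 3344 and 3803 are the AV1 Q12 sinpi
 * constants; results are rounded with +2048 and >> 12.
 */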
294 #define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \
295 { \
296 i32x4 v1321 = vec_splats(1321); \
297 i32x4 v3803 = vec_splats(3803); \
298 i32x4 v2482 = vec_splats(2482); \
299 i32x4 v3344 = vec_splats(3344); \
300 i32x4 v2048 = vec_splats(2048); \
301 i32x4 i0_v1321 = vec_mul(c0, v1321); \
302 i32x4 i0_v2482 = vec_mul(c0, v2482); \
303 i32x4 i0_v3803 = vec_mul(c0, v3803); \
304 i32x4 i1 = vec_mul(c1, v3344); \
305 i32x4 i2_v1321 = vec_mul(c2, v1321); \
306 i32x4 i2_v2482 = vec_mul(c2, v2482); \
307 i32x4 i2_v3803 = vec_mul(c2, v3803); \
308 i32x4 i3_v1321 = vec_mul(c3, v1321); \
309 i32x4 i3_v2482 = vec_mul(c3, v2482); \
310 i32x4 i3_v3803 = vec_mul(c3, v3803); \
311 \
312 i32x4 n1 = vec_sub(i1, v2048); \
313 i1 = vec_add(i1, v2048); \
314 \
315 \
316 i32x4 o0 = vec_add(i0_v1321, i2_v3803); \
317 i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \
318 i32x4 o2 = vec_sub(c0, c2); \
319 i32x4 o3 = vec_add(i0_v3803, i2_v2482); \
320 \
321 o0 = vec_add(o0, i3_v2482); \
322 o1 = vec_sub(o1, i3_v3803); \
323 o2 = vec_add(o2, c3); \
324 o3 = vec_sub(o3, i3_v1321); \
325 \
326 o0 = vec_add(o0, i1); \
327 o1 = vec_add(o1, i1); \
328 o2 = vec_mul(o2, v3344); \
329 o3 = vec_sub(o3, n1); \
330 \
331 o2 = vec_add(o2, v2048); \
332 \
333 oc0 = vec_sra(o0, vec_splat_u32(12)); \
334 oc1 = vec_sra(o1, vec_splat_u32(12)); \
335 oc2 = vec_sra(o2, vec_splat_u32(12)); \
336 oc3 = vec_sra(o3, vec_splat_u32(12)); \
337 }
338
339 #define adst_4_in(c0, c1, c2, c3, c01, c23) \
340 { \
341 ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
342 }
343
344 #define flipadst_4_in(c0, c1, c2, c3, c01, c23) \
345 { \
346 ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
347 }
348
349 #define adst_4_out(c0, c1, c2, c3, c01, c23) \
350 { \
351 ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \
352 c01 = vec_packs(c0, c1); \
353 c23 = vec_packs(c2, c3); \
354 }
355
356 #define flipadst_4_out(c0, c1, c2, c3, c01, c23) \
357 { \
358 ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \
359 c01 = vec_packs(c0, c1); \
360 c23 = vec_packs(c2, c3); \
361 }
362
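/*
 * DC-only fast paths, used when eob < 1: only coeff[0] is non-zero, so both
 * transform passes collapse to scaling the DC value (181 / 256 ~= 1/sqrt(2)
 * per step, plus the optional rect2 scale and the inter-pass shift), which
 * is then splatted and added to every pixel of the block.
 */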
363 static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
364 {
365 int dc = coeff[0];
366 const int rnd = (1 << shift) >> 1;
367 if (is_rect2)
368 dc = (dc * 181 + 128) >> 8;
369 dc = (dc * 181 + 128) >> 8;
370 dc = (dc + rnd) >> shift;
371 dc = (dc * 181 + 128 + 2048) >> 12;
372
373 i16x8 vdc = vec_splats((int16_t)dc);
374 coeff[0] = 0;
375 for (int i = 0; i < n; i++, dst += 4 * stride) {
376 LOAD_DECLARE_4(dst, stride, a, b, c, d)
377
378 i16x8 as = u8h_to_i16(a);
379 i16x8 bs = u8h_to_i16(b);
380 i16x8 cs = u8h_to_i16(c);
381 i16x8 ds = u8h_to_i16(d);
382
383 as = vec_adds(as, vdc);
384 bs = vec_adds(bs, vdc);
385 cs = vec_adds(cs, vdc);
386 ds = vec_adds(ds, vdc);
387
388 a = vec_packsu(as, as);
389 b = vec_packsu(bs, bs);
390 c = vec_packsu(cs, cs);
391 d = vec_packsu(ds, ds);
392
393 STORE_4(dst, stride, a, b, c, d)
394 }
395 }
396
397 static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
398 {
399 int dc = coeff[0];
400 const int rnd = (1 << shift) >> 1;
401 if (is_rect2)
402 dc = (dc * 181 + 128) >> 8;
403 dc = (dc * 181 + 128) >> 8;
404 dc = (dc + rnd) >> shift;
405 dc = (dc * 181 + 128 + 2048) >> 12;
406
407 i16x8 vdc = vec_splats((int16_t)dc);
408 coeff[0] = 0;
409
410 for (int i = 0; i < n; i++, dst += 4 * stride) {
411 LOAD_DECLARE_4(dst, stride, a, b, c, d)
412
413 i16x8 as = u8h_to_i16(a);
414 i16x8 bs = u8h_to_i16(b);
415 i16x8 cs = u8h_to_i16(c);
416 i16x8 ds = u8h_to_i16(d);
417
418 as = vec_adds(as, vdc);
419 bs = vec_adds(bs, vdc);
420 cs = vec_adds(cs, vdc);
421 ds = vec_adds(ds, vdc);
422
423 a = vec_packsu(as, as);
424 b = vec_packsu(bs, bs);
425 c = vec_packsu(cs, cs);
426 d = vec_packsu(ds, ds);
427
428 STORE_8(dst, stride, a, b, c, d)
429 }
430 }
431
432 static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift)
433 {
434 int dc = coeff[0];
435 const int rnd = (1 << shift) >> 1;
436 if (is_rect2)
437 dc = (dc * 181 + 128) >> 8;
438 dc = (dc * 181 + 128) >> 8;
439 dc = (dc + rnd) >> shift;
440 dc = (dc * 181 + 128 + 2048) >> 12;
441
442 i16x8 vdc = vec_splats((int16_t)dc);
443 coeff[0] = 0;
444
445 for (int i = 0; i < n; i++, dst += 4 * stride) {
446 LOAD_DECLARE_4(dst, stride, a, b, c, d)
447
448 i16x8 ah = u8h_to_i16(a);
449 i16x8 bh = u8h_to_i16(b);
450 i16x8 ch = u8h_to_i16(c);
451 i16x8 dh = u8h_to_i16(d);
452 i16x8 al = u8l_to_i16(a);
453 i16x8 bl = u8l_to_i16(b);
454 i16x8 cl = u8l_to_i16(c);
455 i16x8 dl = u8l_to_i16(d);
456
457 ah = vec_adds(ah, vdc);
458 bh = vec_adds(bh, vdc);
459 ch = vec_adds(ch, vdc);
460 dh = vec_adds(dh, vdc);
461 al = vec_adds(al, vdc);
462 bl = vec_adds(bl, vdc);
463 cl = vec_adds(cl, vdc);
464 dl = vec_adds(dl, vdc);
465
466 a = vec_packsu(ah, al);
467 b = vec_packsu(bh, bl);
468 c = vec_packsu(ch, cl);
469 d = vec_packsu(dh, dl);
470
471 STORE_16(dst, stride, a, b, c, d)
472 }
473 }
474
475 void dav1d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
476 int16_t *const coeff, const int eob)
477 {
478 assert(eob >= 0);
479
480 if (eob < 1) {
481 return dc_only_4xN(dst, stride, coeff, 1, 0, 0);
482 }
483
484 LOAD_COEFF_4(coeff)
485
486 dct_4_in(c0, c1, c2, c3, c01, c23)
487
488 TRANSPOSE4_I32(c0, c1, c2, c3)
489
490 memset(coeff, 0, sizeof(*coeff) * 4 * 4);
491
492 dct_4_out(c0, c1, c2, c3, c01, c23)
493
494 LOAD_DECLARE_4(dst, stride, a, b, c, d)
495
496 APPLY_COEFF_4(a, b, c, d, c01, c23)
497
498 STORE_4(dst, stride, a, b, c, d)
499 }
500
501 void dav1d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(pixel *dst, const ptrdiff_t stride,
502 coef *const coeff, const int eob)
503 {
504 LOAD_COEFF_4(coeff)
505
506 u32x4 v2 = vec_splat_u32(2);
507
508 c0 = vec_sra(c0, v2);
509 c1 = vec_sra(c1, v2);
510 c2 = vec_sra(c2, v2);
511 c3 = vec_sra(c3, v2);
512
513 i32x4 t0 = vec_add(c0, c1);
514 i32x4 t2 = vec_sub(c2, c3);
515 i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
516 i32x4 t3 = vec_sub(t4, c3);
517 i32x4 t1 = vec_sub(t4, c1);
518 c0 = vec_sub(t0, t3);
519 c1 = t3;
520 c2 = t1;
521 c3 = vec_add(t2, t1);
522
523 memset(coeff, 0, sizeof(*coeff) * 4 * 4);
524
525 TRANSPOSE4_I32(c0, c1, c2, c3)
526
527 t0 = vec_add(c0, c1);
528 t2 = vec_sub(c2, c3);
529 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1));
530 t3 = vec_sub(t4, c3);
531 t1 = vec_sub(t4, c1);
532 c0 = vec_sub(t0, t3);
533 c1 = t3;
534 c2 = t1;
535 c3 = vec_add(t2, t1);
536
537 c01 = vec_packs(c0, c1);
538 c23 = vec_packs(c2, c3);
539
540 LOAD_DECLARE_4(dst, stride, a, b, c, d)
541
542 u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b);
543 u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d);
544
545 i16x8 abs = u8h_to_i16(ab);
546 i16x8 cds = u8h_to_i16(cd);
547
548 abs = vec_adds(abs, c01);
549 cds = vec_adds(cds, c23);
550
551 a = vec_packsu(abs, abs);
552 c = vec_packsu(cds, cds);
553
554 b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a);
555 d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c);
556
557 STORE_4(dst, stride, a, b, c, d)
558 }
559
560 #define inv_txfm_fn4x4(type1, type2) \
561 void dav1d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
562 int16_t *const coeff, const int eob) \
563 { \
564 LOAD_COEFF_4(coeff) \
565 type1##_4_in(c0, c1, c2, c3, c01, c23) \
566 memset(coeff, 0, sizeof(*coeff) * 4 * 4); \
567 TRANSPOSE4_I32(c0, c1, c2, c3) \
568 type2##_4_out(c0, c1, c2, c3, c01, c23) \
569 LOAD_DECLARE_4(dst, stride, a, b, c, d) \
570 APPLY_COEFF_4(a, b, c, d, c01, c23) \
571 STORE_4(dst, stride, a, b, c, d) \
572 }
573
574 inv_txfm_fn4x4(adst, dct )
575 inv_txfm_fn4x4(dct, adst )
576 inv_txfm_fn4x4(dct, flipadst)
577 inv_txfm_fn4x4(flipadst, dct )
578 inv_txfm_fn4x4(adst, flipadst)
579 inv_txfm_fn4x4(flipadst, adst )
580 inv_txfm_fn4x4(identity, dct )
581 inv_txfm_fn4x4(dct, identity)
582 inv_txfm_fn4x4(identity, flipadst)
583 inv_txfm_fn4x4(flipadst, identity)
584 inv_txfm_fn4x4(identity, adst )
585 inv_txfm_fn4x4(adst, identity)
586 inv_txfm_fn4x4(identity, identity)
587 inv_txfm_fn4x4(adst, adst )
588 inv_txfm_fn4x4(flipadst, flipadst)
589
590
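/*
 * 8-point inverse DCT.  The even half reuses the 4-point DCT via
 * dct4_for_dct8; the odd half rotates with the Q12 pairs 799/4017
 * (~= 4096 * sin/cos(pi / 16)) and 2276/3406 (~= 4096 * sin/cos(3 * pi / 16)),
 * and the final t5/t6 butterfly scales by 181 / 256 ~= 1/sqrt(2).
 * vec_packs doubles as the 16-bit clamp on the intermediates.
 */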
591 #define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
592 dct4_for_dct8(c0, c2, c4, c6, c03, c12) \
593 \
594 i32x4 v799 = vec_splats(799); \
595 i32x4 v4017 = vec_splats(4017); \
596 i32x4 v3406 = vec_splats(3406); \
597 i32x4 v2276 = vec_splats(2276); \
598 i32x4 v2048 = vec_splats(2048); \
599 u32x4 v12 = vec_splat_u32(12); \
600 \
601 i32x4 c1v799 = vec_mul(c1, v799); \
602 i32x4 c7v4017 = vec_mul(c7, v4017); \
603 i32x4 c5v3406 = vec_mul(c5, v3406); \
604 i32x4 c3v2276 = vec_mul(c3, v2276); \
605 i32x4 c5v2276 = vec_mul(c5, v2276); \
606 i32x4 c3v3406 = vec_mul(c3, v3406); \
607 i32x4 c1v4017 = vec_mul(c1, v4017); \
608 i32x4 c7v799 = vec_mul(c7, v799); \
609 \
610 i32x4 t4a = vec_subs(c1v799, c7v4017); \
611 i32x4 t5a = vec_subs(c5v3406, c3v2276); \
612 i32x4 t6a = vec_adds(c5v2276, c3v3406); \
613 i32x4 t7a = vec_adds(c1v4017, c7v799); \
614 \
615 t4a = vec_adds(t4a, v2048); \
616 t5a = vec_adds(t5a, v2048); \
617 t6a = vec_adds(t6a, v2048); \
618 t7a = vec_adds(t7a, v2048); \
619 \
620 t4a = vec_sra(t4a, v12); \
621 t7a = vec_sra(t7a, v12); \
622 t5a = vec_sra(t5a, v12); \
623 t6a = vec_sra(t6a, v12); \
624 \
625 i16x8 t7at4a = vec_packs(t7a, t4a); \
626 i16x8 t6at5a = vec_packs(t6a, t5a); \
627 \
628 i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \
629 t6at5a = vec_subs(t7at4a, t6at5a); \
630 \
631 t6a = i16h_to_i32(t6at5a); \
632 t5a = i16l_to_i32(t6at5a); \
633 \
634 i32x4 t6 = vec_add(t6a, t5a); \
635 i32x4 t5 = vec_sub(t6a, t5a); \
636 \
637 t6 = vec_mul(t6, vec_splats(181)); \
638 t5 = vec_mul(t5, vec_splats(181)); \
639 t6 = vec_add(t6, vec_splats(128)); \
640 t5 = vec_add(t5, vec_splats(128)); \
641 \
642 t6 = vec_sra(t6, vec_splat_u32(8)); \
643 t5 = vec_sra(t5, vec_splat_u32(8)); \
644 \
645 i16x8 t6t5 = vec_packs(t6, t5); \
646 \
647 c74 = vec_subs(c03, t7t4); \
648 c65 = vec_subs(c12, t6t5); \
649 c03 = vec_adds(c03, t7t4); \
650 c12 = vec_adds(c12, t6t5); \
651
652 #define UNPACK_4_I16_I32(t0, t1, t2, t3) \
653 t0 = i16h_to_i32(t0##t1); \
654 t1 = i16l_to_i32(t0##t1); \
655 t2 = i16h_to_i32(t2##t3); \
656 t3 = i16l_to_i32(t2##t3);
657
658 #define UNPACK_PAIR_I16_I32(hi, lo, v) \
659 hi = i16h_to_i32(v); \
660 lo = i16l_to_i32(v); \
661
662
663 #define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) \
664 { \
665 i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \
666 IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \
667 UNPACK_4_I16_I32(c0, c3, c1, c2) \
668 UNPACK_4_I16_I32(c7, c4, c6, c5) \
669 }
670
671 #define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
672 { \
673 i16x8 c03, c12, c74, c65; \
674 IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \
675 c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \
676 c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \
677 c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \
678 c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \
679 }
680
681 #define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
682 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
683 c0, c1, c2, c3, c4, c5, c6, c7) \
684 { \
685 dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \
686 dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \
687 }
688
689 #define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
690 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
691 c0, c1, c2, c3, c4, c5, c6, c7) \
692 { \
693 i16x8 c03h, c12h, c74h, c65h; \
694 i16x8 c03l, c12l, c74l, c65l; \
695 { \
696 IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \
697 } \
698 { \
699 IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \
700 } \
701 c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \
702 c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \
703 c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \
704 c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \
705 c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \
706 c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \
707 c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \
708 c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \
709 }
710
711 #define IDENTITY_8(c01, c23, c45, c67) \
712 { \
713 c01 = vec_adds(c01, c01); \
714 c23 = vec_adds(c23, c23); \
715 c45 = vec_adds(c45, c45); \
716 c67 = vec_adds(c67, c67); \
717 }
718
719 #define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
720 { \
721 IDENTITY_8(c01, c23, c45, c67) \
722 UNPACK_PAIR_I16_I32(c0, c1, c01) \
723 UNPACK_PAIR_I16_I32(c2, c3, c23) \
724 UNPACK_PAIR_I16_I32(c4, c5, c45) \
725 UNPACK_PAIR_I16_I32(c6, c7, c67) \
726 }
727
728 #define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
729 c01 = vec_packs(c0, c1); \
730 c23 = vec_packs(c2, c3); \
731 c45 = vec_packs(c4, c5); \
732 c67 = vec_packs(c6, c7); \
733 IDENTITY_8(c01, c23, c45, c67)
734
735 #define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
736 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
737 c0, c1, c2, c3, c4, c5, c6, c7) \
738 { \
739 IDENTITY_8(c0, c1, c2, c3) \
740 IDENTITY_8(c4, c5, c6, c7) \
741 UNPACK_PAIR_I16_I32(c0h, c0l, c0) \
742 UNPACK_PAIR_I16_I32(c1h, c1l, c1) \
743 UNPACK_PAIR_I16_I32(c2h, c2l, c2) \
744 UNPACK_PAIR_I16_I32(c3h, c3l, c3) \
745 UNPACK_PAIR_I16_I32(c4h, c4l, c4) \
746 UNPACK_PAIR_I16_I32(c5h, c5l, c5) \
747 UNPACK_PAIR_I16_I32(c6h, c6l, c6) \
748 UNPACK_PAIR_I16_I32(c7h, c7l, c7) \
749 }
750
751 #define PACK_4(c0, c1, c2, c3, \
752 c0h, c1h, c2h, c3h, \
753 c0l, c1l, c2l, c3l) \
754 { \
755 c0 = vec_packs(c0h, c0l); \
756 c1 = vec_packs(c1h, c1l); \
757 c2 = vec_packs(c2h, c2l); \
758 c3 = vec_packs(c3h, c3l); \
759 }
760
761 #define DECLARE_PACK_4(c0, c1, c2, c3, \
762 c0h, c1h, c2h, c3h, \
763 c0l, c1l, c2l, c3l) \
764 i16x8 c0, c1, c2, c3; \
765 PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l);
766
767 #define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
768 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
769 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
770 { \
771 c0 = vec_packs(c0h, c0l); \
772 c1 = vec_packs(c1h, c1l); \
773 c2 = vec_packs(c2h, c2l); \
774 c3 = vec_packs(c3h, c3l); \
775 c4 = vec_packs(c4h, c4l); \
776 c5 = vec_packs(c5h, c5l); \
777 c6 = vec_packs(c6h, c6l); \
778 c7 = vec_packs(c7h, c7l); \
779 }
780
781 #define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
782 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
783 c0, c1, c2, c3, c4, c5, c6, c7) \
784 { \
785 PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
786 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
787 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
788 IDENTITY_8(c0, c1, c2, c3) \
789 IDENTITY_8(c4, c5, c6, c7) \
790 }
791
792 #define DECLARE_SPLAT_I32(val) \
793 i32x4 v##val = vec_splats(val);
794
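/*
 * Rotation helpers: DECLARE_MUL_PAIR_I32 precomputes the four products of a
 * coefficient pair with a constant pair, and ADD_SUB_PAIR combines them into
 * r0 = ca*va + cb*vb, r1 = ca*vb - cb*va.  Passing empty constant arguments
 * degenerates the pair into a plain add/sub butterfly.
 */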
795 #define DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \
796 i32x4 ca##va = vec_mul(ca, va); \
797 i32x4 cb##vb = vec_mul(cb, vb); \
798 i32x4 ca##vb = vec_mul(ca, vb); \
799 i32x4 cb##va = vec_mul(cb, va);
800
801 #define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
802 r0 = vec_adds(ca##va, cb##vb); \
803 r1 = vec_subs(ca##vb, cb##va);
804
805 #define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \
806 i32x4 r0, r1; \
807 ADD_SUB_PAIR(r0, r1, ca, cb, va, vb)
808
809 #define SCALE_ROUND_4(a, b, c, d, rnd, shift) \
810 a = vec_adds(a, rnd); \
811 b = vec_adds(b, rnd); \
812 c = vec_adds(c, rnd); \
813 d = vec_adds(d, rnd); \
814 a = vec_sra(a, shift); \
815 b = vec_sra(b, shift); \
816 c = vec_sra(c, shift); \
817 d = vec_sra(d, shift);
818
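/*
 * 8-point inverse ADST.  The first-stage rotations use the Q12 pairs
 * 4076/401, 3612/1931, 2598/3166 and 1189/3920 (~= 4096 * cos/sin(k * pi / 32)
 * for k = 1, 5, 9, 13), the second stage uses 3784/1567 (~= 4096 *
 * cos/sin(pi / 8)), and the last stage scales by 181 / 256 ~= 1/sqrt(2).
 */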
819 #define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
820 o0, o1, o2, o3, o4, o5, o6, o7) \
821 { \
822 DECLARE_SPLAT_I32(4076) \
823 DECLARE_SPLAT_I32(401) \
824 \
825 DECLARE_SPLAT_I32(3612) \
826 DECLARE_SPLAT_I32(1931) \
827 \
828 DECLARE_SPLAT_I32(2598) \
829 DECLARE_SPLAT_I32(3166) \
830 \
831 DECLARE_SPLAT_I32(1189) \
832 DECLARE_SPLAT_I32(3920) \
833 \
834 DECLARE_SPLAT_I32(3784) \
835 DECLARE_SPLAT_I32(1567) \
836 \
837 DECLARE_SPLAT_I32(2048) \
838 u32x4 v12 = vec_splat_u32(12); \
839 \
840 DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \
841 DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \
842 DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \
843 DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \
844 \
845 DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \
846 DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \
847 DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \
848 DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \
849 \
850 SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \
851 SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
852 \
853 i32x4 t0 = vec_add(t0a, t4a); \
854 i32x4 t1 = vec_add(t1a, t5a); \
855 i32x4 t2 = vec_add(t2a, t6a); \
856 i32x4 t3 = vec_add(t3a, t7a); \
857 i32x4 t4 = vec_sub(t0a, t4a); \
858 i32x4 t5 = vec_sub(t1a, t5a); \
859 i32x4 t6 = vec_sub(t2a, t6a); \
860 i32x4 t7 = vec_sub(t3a, t7a); \
861 \
862 i16x8 t0t1 = vec_packs(t0, t1); \
863 i16x8 t2t3 = vec_packs(t2, t3); \
864 i16x8 t4t5 = vec_packs(t4, t5); \
865 i16x8 t6t7 = vec_packs(t6, t7); \
866 \
867 UNPACK_4_I16_I32(t4, t5, t6, t7) \
868 UNPACK_4_I16_I32(t0, t1, t2, t3) \
869 \
870 DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \
871 DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \
872 \
873 ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \
874 ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \
875 \
876 SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \
877 \
878 o0 = vec_add(t0, t2); \
879 o1 = vec_add(t4a, t6a); \
880 o7 = vec_add(t1, t3); \
881 o6 = vec_add(t5a, t7a); \
882 t2 = vec_sub(t0, t2); \
883 t3 = vec_sub(t1, t3); \
884 t6 = vec_sub(t4a, t6a); \
885 t7 = vec_sub(t5a, t7a); \
886 \
887 i16x8 o7##o1 = vec_packs(o7, o1); \
888 i16x8 o0##o6 = vec_packs(o0, o6); \
889 t2t3 = vec_packs(t2, t3); \
890 t6t7 = vec_packs(t6, t7); \
891 \
892 UNPACK_4_I16_I32(t2, t3, t6, t7) \
893 UNPACK_4_I16_I32(o7, o1, o0, o6) \
894 \
895 o7 = -o7; \
896 o1 = -o1; \
897 \
898 o3 = vec_add(t2, t3); \
899 o4 = vec_sub(t2, t3); \
900 o5 = vec_sub(t6, t7); \
901 o2 = vec_add(t6, t7); \
902 \
903 i32x4 v181 = vec_splats(181); \
904 i32x4 v128 = vec_splats(128); \
905 u32x4 v8 = vec_splat_u32(8); \
906 \
907 o2 = vec_mul(o2, v181); \
908 o3 = vec_mul(o3, v181); \
909 o4 = vec_mul(o4, v181); \
910 o5 = vec_mul(o5, v181); \
911 \
912 SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \
913 \
914 o3 = -o3; \
915 o5 = -o5; \
916 }
917
918 #define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
919 {\
920 ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
921 c0, c1, c2, c3, c4, c5, c6, c7) \
922 c01 = vec_packs(c0, c1); \
923 c23 = vec_packs(c2, c3); \
924 c45 = vec_packs(c4, c5); \
925 c67 = vec_packs(c6, c7); \
926 UNPACK_PAIR_I16_I32(c0, c1, c01) \
927 UNPACK_PAIR_I16_I32(c2, c3, c23) \
928 UNPACK_PAIR_I16_I32(c4, c5, c45) \
929 UNPACK_PAIR_I16_I32(c6, c7, c67) \
930 }
931
932 #define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
933 {\
934 ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
935 c0, c1, c2, c3, c4, c5, c6, c7) \
936 c01 = vec_packs(c0, c1); \
937 c23 = vec_packs(c2, c3); \
938 c45 = vec_packs(c4, c5); \
939 c67 = vec_packs(c6, c7); \
940 }
941
942 #define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
943 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
944 c0, c1, c2, c3, c4, c5, c6, c7) \
945 { \
946 ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
947 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
948 ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
949 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
950 }
951
952 #define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
953 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
954 c0, c1, c2, c3, c4, c5, c6, c7) \
955 { \
956 ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
957 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \
958 ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
959 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
960 PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
961 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
962 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
963 }
964
965 #define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
966 {\
967 ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
968 c7, c6, c5, c4, c3, c2, c1, c0) \
969 c01 = vec_packs(c0, c1); \
970 c23 = vec_packs(c2, c3); \
971 c45 = vec_packs(c4, c5); \
972 c67 = vec_packs(c6, c7); \
973 UNPACK_PAIR_I16_I32(c0, c1, c01) \
974 UNPACK_PAIR_I16_I32(c2, c3, c23) \
975 UNPACK_PAIR_I16_I32(c4, c5, c45) \
976 UNPACK_PAIR_I16_I32(c6, c7, c67) \
977 }
978
979 #define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
980 {\
981 ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \
982 c7, c6, c5, c4, c3, c2, c1, c0) \
983 c01 = vec_packs(c0, c1); \
984 c23 = vec_packs(c2, c3); \
985 c45 = vec_packs(c4, c5); \
986 c67 = vec_packs(c6, c7); \
987 }
988
989 #define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
990 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
991 c0, c1, c2, c3, c4, c5, c6, c7) \
992 { \
993 ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
994 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
995 ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
996 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
997 }
998
999 #define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1000 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1001 c0, c1, c2, c3, c4, c5, c6, c7) \
1002 { \
1003 ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1004 c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \
1005 ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1006 c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \
1007 PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \
1008 c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1009 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
1010 }
1011
1012 void dav1d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1013 int16_t *const coeff, const int eob)
1014 {
1015 i16x8 v = vec_splats((int16_t)(2896*8));
1016
1017 if (eob < 1) {
1018 return dc_only_4xN(dst, stride, coeff, 2, 1, 0);
1019 }
1020
1021 LOAD_SCALE_COEFF_4x8(coeff, v)
1022
1023 dct_4_in(c0, c1, c2, c3, c01, c23)
1024 dct_4_in(c4, c5, c6, c7, c45, c67)
1025
1026
1027 memset(coeff, 0, sizeof(*coeff) * 4 * 8);
1028
1029 TRANSPOSE4_I32(c0, c1, c2, c3);
1030 TRANSPOSE4_I32(c4, c5, c6, c7);
1031
1032 dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
1033
1034 LOAD_DECLARE_4(dst, stride, a, b, cc, d)
1035 LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
1036
1037 APPLY_COEFF_4(a, b, cc, d, c01, c23)
1038 APPLY_COEFF_4(e, f, g, hh, c45, c67)
1039
1040 STORE_4(dst, stride, a, b, cc, d)
1041 STORE_4(dst + 4 * stride, stride, e, f, g, hh)
1042 }
1043
1044
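/*
 * Generic 4x8 template: pre-scale by 1/sqrt(2) (rect2), run the 4-point
 * type1 transform on the rows, transpose, run the 8-point type2 transform
 * on the columns, then add the rounded result to the destination.
 */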
1045 #define inv_txfm_fn4x8(type1, type2) \
1046 void dav1d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1047 int16_t *const coeff, const int eob) \
1048 { \
1049 i16x8 v = vec_splats((int16_t)(2896*8)); \
1050 LOAD_SCALE_COEFF_4x8(coeff, v) \
1051 type1##_4_in(c0, c1, c2, c3, c01, c23) \
1052 type1##_4_in(c4, c5, c6, c7, c45, c67) \
1053 memset(coeff, 0, sizeof(*coeff) * 4 * 8); \
1054 TRANSPOSE4_I32(c0, c1, c2, c3); \
1055 TRANSPOSE4_I32(c4, c5, c6, c7); \
1056 type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
1057 LOAD_DECLARE_4(dst, stride, a, b, c, d) \
1058 LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
1059 APPLY_COEFF_4(a, b, c, d, c01, c23) \
1060 APPLY_COEFF_4(e, f, g, h, c45, c67) \
1061 STORE_4(dst, stride, a, b, c, d) \
1062 STORE_4(dst + 4 * stride, stride, e, f, g, h) \
1063 }
1064
1065 inv_txfm_fn4x8(adst, dct )
1066 inv_txfm_fn4x8(dct, adst )
1067 inv_txfm_fn4x8(dct, flipadst)
1068 inv_txfm_fn4x8(flipadst, dct )
1069 inv_txfm_fn4x8(adst, flipadst)
1070 inv_txfm_fn4x8(flipadst, adst )
1071 inv_txfm_fn4x8(identity, dct )
1072 inv_txfm_fn4x8(dct, identity)
1073 inv_txfm_fn4x8(identity, flipadst)
1074 inv_txfm_fn4x8(flipadst, identity)
1075 inv_txfm_fn4x8(identity, adst )
1076 inv_txfm_fn4x8(adst, identity)
1077 inv_txfm_fn4x8(identity, identity)
1078 inv_txfm_fn4x8(adst, adst )
1079 inv_txfm_fn4x8(flipadst, flipadst)
1080
1081
1082 void dav1d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1083 int16_t *const coeff, const int eob)
1084 {
1085 i16x8 v = vec_splats((int16_t)(2896*8));
1086
1087 if (eob < 1) {
1088 return dc_only_8xN(dst, stride, coeff, 1, 1, 0);
1089 }
1090
1091 LOAD_SCALE_COEFF_8x4(coeff, v)
1092
1093 dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67)
1094
1095 memset(coeff, 0, sizeof(*coeff) * 8 * 4);
1096
1097 TRANSPOSE4_I32(c0, c1, c2, c3)
1098 TRANSPOSE4_I32(c4, c5, c6, c7)
1099
1100 dct_4_out(c0, c1, c2, c3, c01, c23)
1101 dct_4_out(c4, c5, c6, c7, c45, c67)
1102
1103 LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh)
1104
1105 i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45);
1106 i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45);
1107 i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67);
1108 i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67);
1109
1110 APPLY_COEFF_8x4(ae, bf, c04, c15)
1111 APPLY_COEFF_8x4(cg, dh, c26, c37)
1112
1113 STORE_8(dst, stride, ae, bf, cg, dh)
1114 }
1115
1116
1117 #define inv_txfm_fn8x4(type1, type2) \
1118 void dav1d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1119 int16_t *const coeff, const int eob) \
1120 { \
1121 i16x8 v = vec_splats((int16_t)(2896*8)); \
1122 LOAD_SCALE_COEFF_8x4(coeff, v) \
1123 type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \
1124 memset(coeff, 0, sizeof(*coeff) * 8 * 4); \
1125 TRANSPOSE4_I32(c0, c1, c2, c3) \
1126 TRANSPOSE4_I32(c4, c5, c6, c7) \
1127 type2##_4_out(c0, c1, c2, c3, c01, c23) \
1128 type2##_4_out(c4, c5, c6, c7, c45, c67) \
1129 LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \
1130 i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \
1131 i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \
1132 i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \
1133 i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \
1134 APPLY_COEFF_8x4(ae, bf, c04, c15) \
1135 APPLY_COEFF_8x4(cg, dh, c26, c37) \
1136 STORE_8(dst, stride, ae, bf, cg, dh) \
1137 }
1138 inv_txfm_fn8x4(adst, dct )
1139 inv_txfm_fn8x4(dct, adst )
1140 inv_txfm_fn8x4(dct, flipadst)
1141 inv_txfm_fn8x4(flipadst, dct )
1142 inv_txfm_fn8x4(adst, flipadst)
1143 inv_txfm_fn8x4(flipadst, adst )
1144 inv_txfm_fn8x4(identity, dct )
1145 inv_txfm_fn8x4(dct, identity)
1146 inv_txfm_fn8x4(identity, flipadst)
1147 inv_txfm_fn8x4(flipadst, identity)
1148 inv_txfm_fn8x4(identity, adst )
1149 inv_txfm_fn8x4(adst, identity)
1150 inv_txfm_fn8x4(identity, identity)
1151 inv_txfm_fn8x4(adst, adst )
1152 inv_txfm_fn8x4(flipadst, flipadst)
1153
1154 void dav1d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
1155 int16_t *const coeff, const int eob)
1156 {
1157 if (eob < 1) {
1158 return dc_only_8xN(dst, stride, coeff, 2, 0, 1);
1159 }
1160
1161 LOAD_COEFF_8x8(coeff)
1162
1163 dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
1164 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
1165 c0, c1, c2, c3, c4, c5, c6, c7)
1166
1167 memset(coeff, 0, sizeof(*coeff) * 8 * 8);
1168
1169 SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1))
1170 SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1))
1171 SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1))
1172 SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1))
1173
1174 TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
1175 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l)
1176
1177 dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,
1178 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,
1179 c0, c1, c2, c3, c4, c5, c6, c7)
1180
1181 LOAD_DECLARE_4(dst, stride, a, b, cc, d)
1182 LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh)
1183
1184 APPLY_COEFF_8x4(a, b, c0, c1)
1185 APPLY_COEFF_8x4(cc, d, c2, c3)
1186 APPLY_COEFF_8x4(e, f, c4, c5)
1187 APPLY_COEFF_8x4(g, hh, c6, c7)
1188
1189 STORE_8(dst, stride, a, b, cc, d)
1190 STORE_8(dst + 4 * stride, stride, e, f, g, hh)
1191 }
1192
1193 #define inv_txfm_fn8x8(type1, type2) \
1194 void dav1d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1195 int16_t *const coeff, const int eob) \
1196 { \
1197 LOAD_COEFF_8x8(coeff) \
1198 type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1199 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1200 c0, c1, c2, c3, c4, c5, c6, c7) \
1201 SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \
1202 SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \
1203 SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \
1204 SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \
1205 memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
1206 TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1207 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
1208 type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1209 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1210 c0, c1, c2, c3, c4, c5, c6, c7) \
1211 LOAD_DECLARE_4(dst, stride, a, b, c, d) \
1212 LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
1213 APPLY_COEFF_8x4(a, b, c0, c1) \
1214 APPLY_COEFF_8x4(c, d, c2, c3) \
1215 APPLY_COEFF_8x4(e, f, c4, c5) \
1216 APPLY_COEFF_8x4(g, h, c6, c7) \
1217 STORE_8(dst, stride, a, b, c, d) \
1218 STORE_8(dst + 4 * stride, stride, e, f, g, h) \
1219 }
1220 inv_txfm_fn8x8(adst, dct )
1221 inv_txfm_fn8x8(dct, adst )
1222 inv_txfm_fn8x8(dct, flipadst)
1223 inv_txfm_fn8x8(flipadst, dct )
1224 inv_txfm_fn8x8(adst, flipadst)
1225 inv_txfm_fn8x8(flipadst, adst )
1226 inv_txfm_fn8x8(dct, identity)
1227 inv_txfm_fn8x8(flipadst, identity)
1228 inv_txfm_fn8x8(adst, identity)
1229 inv_txfm_fn8x8(adst, adst )
1230 inv_txfm_fn8x8(flipadst, flipadst)
1231
1232 // the identity first pass (x2) cancels against the inter-pass rounding ((x + 1) >> 1), so both are skipped
1233 #define inv_txfm_fn8x8_identity(type2) \
1234 void dav1d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
1235 int16_t *const coeff, const int eob) \
1236 { \
1237 LOAD_COEFF_8x8(coeff) \
1238 memset(coeff, 0, sizeof(*coeff) * 8 * 8); \
1239 TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1240 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \
1241 type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \
1242 c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \
1243 c0, c1, c2, c3, c4, c5, c6, c7) \
1244 LOAD_DECLARE_4(dst, stride, a, b, c, d) \
1245 LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \
1246 APPLY_COEFF_8x4(a, b, c0, c1) \
1247 APPLY_COEFF_8x4(c, d, c2, c3) \
1248 APPLY_COEFF_8x4(e, f, c4, c5) \
1249 APPLY_COEFF_8x4(g, h, c6, c7) \
1250 STORE_8(dst, stride, a, b, c, d) \
1251 STORE_8(dst + 4 * stride, stride, e, f, g, h) \
1252 }
1253 inv_txfm_fn8x8_identity(dct )
1254 inv_txfm_fn8x8_identity(flipadst)
1255 inv_txfm_fn8x8_identity(adst )
1256 inv_txfm_fn8x8_identity(identity)
1257
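/*
 * Narrow eight i32x4 vectors to i16 with saturation and widen them back,
 * clamping intermediate values to the 16-bit range between stages.
 */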
1258 #define CLIP16_I32_8(a, b, c, d, e, f, g, h, \
1259 ab, cd, ef, gh) \
1260 { \
1261 ab = vec_packs(a, b); \
1262 cd = vec_packs(c, d); \
1263 ef = vec_packs(e, f); \
1264 gh = vec_packs(g, h); \
1265 UNPACK_PAIR_I16_I32(a, b, ab) \
1266 UNPACK_PAIR_I16_I32(c, d, cd) \
1267 UNPACK_PAIR_I16_I32(e, f, ef) \
1268 UNPACK_PAIR_I16_I32(g, h, gh) \
1269 }
1270
1271 #define MUL_4_INPLACE(a, b, c, d, v) \
1272 a = vec_mul(a, v); \
1273 b = vec_mul(b, v); \
1274 c = vec_mul(c, v); \
1275 d = vec_mul(d, v); \
1276
1277 #define IDENTITY_16_V(v) \
1278 { \
1279 i16x8 v_ = vec_adds(v, v); \
1280 v = vec_mradds(v, v1697_16, v_); \
1281 }
1282
1283 #define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
1284 c08c09, c10c11, c12c13, c14c15) \
1285 { \
1286 i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \
1287 IDENTITY_16_V(c00c01) \
1288 IDENTITY_16_V(c02c03) \
1289 IDENTITY_16_V(c04c05) \
1290 IDENTITY_16_V(c06c07) \
1291 IDENTITY_16_V(c08c09) \
1292 IDENTITY_16_V(c10c11) \
1293 IDENTITY_16_V(c12c13) \
1294 IDENTITY_16_V(c14c15) \
1295 }
1296
1297 #define IDENTITY_16_4_I32(a, b, c, d) \
1298 { \
1299 i32x4 a2 = vec_add(a, a); \
1300 i32x4 b2 = vec_add(b, b); \
1301 i32x4 c2 = vec_add(c, c); \
1302 i32x4 d2 = vec_add(d, d); \
1303 MUL_4_INPLACE(a, b, c, d, v1697) \
1304 SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \
1305 a = vec_add(a2, a); \
1306 b = vec_add(b2, b); \
1307 c = vec_add(c2, c); \
1308 d = vec_add(d2, d); \
1309 }
1310
1311
1312 #define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1313 c08, c09, c10, c11, c12, c13, c14, c15, \
1314 c00c01, c02c03, c04c05, c06c07, \
1315 c08c09, c10c11, c12c13, c14c15) \
1316 { \
1317 DECLARE_SPLAT_I32(1697) \
1318 DECLARE_SPLAT_I32(1024) \
1319 IDENTITY_16_4_I32(c00, c01, c02, c03) \
1320 IDENTITY_16_4_I32(c04, c05, c06, c07) \
1321 IDENTITY_16_4_I32(c08, c09, c10, c11) \
1322 IDENTITY_16_4_I32(c12, c13, c14, c15) \
1323 }
1324
1325 #define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
1326 c08, c09, c10, c11, c12, c13, c14, c15, \
1327 c00c01, c02c03, c04c05, c06c07, \
1328 c08c09, c10c11, c12c13, c14c15) \
1329 { \
1330 PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
1331 c00, c02, c04, c06, c08, c10, c12, c14, \
1332 c01, c03, c05, c07, c09, c11, c13, c15) \
1333 IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \
1334 c08c09, c10c11, c12c13, c14c15) \
1335 }
1336
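/*
 * 16-point inverse DCT: the even half is the 8-point DCT on the even
 * coefficients; the odd half rotates with the Q12 pairs 401/4076, 3166/2598,
 * 1931/3612 and 3920/1189 (~= 4096 * sin/cos(k * pi / 32) for k = 1, 9, 5, 13)
 * and finishes with 181 / 256 ~= 1/sqrt(2) butterflies.
 */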
1337 #define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \
1338 c08, c09, c10, c11, c12, c13, c14, c15, \
1339 c00c03, c01c02, c07c04, c06c05, \
1340 c08c11, c09c10, c14c13, c15c12) \
1341 IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \
1342 c00c03, c01c02, c07c04, c06c05) \
1343 DECLARE_SPLAT_I32(128) \
1344 DECLARE_SPLAT_I32(181) \
1345 DECLARE_SPLAT_I32(401) \
1346 DECLARE_SPLAT_I32(4076) \
1347 DECLARE_SPLAT_I32(3166) \
1348 DECLARE_SPLAT_I32(2598) \
1349 DECLARE_SPLAT_I32(1931) \
1350 DECLARE_SPLAT_I32(3612) \
1351 DECLARE_SPLAT_I32(3920) \
1352 DECLARE_SPLAT_I32(1189) \
1353 DECLARE_SPLAT_I32(1567) \
1354 DECLARE_SPLAT_I32(3784) \
1355 \
1356 DECLARE_MUL_PAIR_I32(c01, c15, v401, v4076) \
1357 DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \
1358 DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \
1359 DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \
1360 \
1361 DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076, v401) \
1362 DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \
1363 DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \
1364 DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \
1365 \
1366 SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \
1367 SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \
1368 \
1369 CLIP16_I32_8(t15a, t08a, t14a, t09a, \
1370 t13a, t10a, t12a, t11a, \
1371 c08c11, c09c10, c14c13, c15c12) \
1372 DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \
1373 DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \
1374 DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \
1375 DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \
1376 \
1377 CLIP16_I32_8(t08, t09, t11, t10, \
1378 t12, t13, t15, t14, \
1379 c08c11, c09c10, c14c13, c15c12) \
1380 \
1381 DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \
1382 DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \
1383 \
1384 ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \
1385 ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \
1386 t10a = -t10a; \
1387 \
1388 SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \
1389 \
1390 ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \
1391 ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \
1392 ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \
1393 ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \
1394 \
1395 CLIP16_I32_8(t08a, t11a, t09, t10, \
1396 t15a, t12a, t14, t13, \
1397 c08c11, c09c10, c14c13, c15c12) \
1398 ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \
1399 ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \
1400 \
1401 MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \
1402 SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \
1403 \
1404 DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \
1405 t15a, t14, t08a, t09, \
1406 t12, t13a, t11, t10a) \
1407 \
1408 c15c12 = vec_subs(c00c03, t15at12); \
1409 c14c13 = vec_subs(c01c02, t14t13a); \
1410 c08c11 = vec_subs(c07c04, t08at11); \
1411 c09c10 = vec_subs(c06c05, t09t10a); \
1412 c00c03 = vec_adds(c00c03, t15at12); \
1413 c01c02 = vec_adds(c01c02, t14t13a); \
1414 c07c04 = vec_adds(c07c04, t08at11); \
1415 c06c05 = vec_adds(c06c05, t09t10a); \
1416
1417 #define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
1418 c08, c09, c10, c11, c12, c13, c14, c15, \
1419 c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
1420 \
1421 i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \
1422 IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1423 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
1424 c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \
1425 c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \
1426 c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \
1427 c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \
1428 c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \
1429 c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \
1430 c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \
1431 c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12); \
1432
1433 #define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
1434 c08, c09, c10, c11, c12, c13, c14, c15, \
1435 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
1436 \
1437 IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
1438 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \
1439 UNPACK_PAIR_I16_I32(c00, c03, c00c03) \
1440 UNPACK_PAIR_I16_I32(c01, c02, c01c02) \
1441 UNPACK_PAIR_I16_I32(c07, c04, c07c04) \
1442 UNPACK_PAIR_I16_I32(c06, c05, c06c05) \
1443 UNPACK_PAIR_I16_I32(c08, c11, c08c11) \
1444 UNPACK_PAIR_I16_I32(c09, c10, c09c10) \
1445 UNPACK_PAIR_I16_I32(c14, c13, c14c13) \
1446 UNPACK_PAIR_I16_I32(c15, c12, c15c12) \
1447
1448
1449 #define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1450 cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1451 a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1452 dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
1453 dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
1454 dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
1455 dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
1456
1457
1458 #define PACK_4x4(c00, c01, c02, c03, \
1459 c04, c05, c06, c07, \
1460 c08, c09, c10, c11, \
1461 c12, c13, c14, c15, \
1462 c00c01, c02c03, c04c05, c06c07, \
1463 c08c09, c10c11, c12c13, c14c15) \
1464 { \
1465 c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \
1466 c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \
1467 c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \
1468 c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \
1469 }
1470
1471
1472
1473 #define dct_4x4_out(c00, c01, c02, c03, \
1474 c04, c05, c06, c07, \
1475 c08, c09, c10, c11, \
1476 c12, c13, c14, c15, \
1477 c00c01, c02c03, c04c05, c06c07, \
1478 c08c09, c10c11, c12c13, c14c15) \
1479 { \
1480 IDCT_4_INNER(c00, c01, c02, c03) \
1481 IDCT_4_INNER(c04, c05, c06, c07) \
1482 IDCT_4_INNER(c08, c09, c10, c11) \
1483 IDCT_4_INNER(c12, c13, c14, c15) \
1484 \
1485 PACK_4x4(c00, c01, c02, c03, \
1486 c04, c05, c06, c07, \
1487 c08, c09, c10, c11, \
1488 c12, c13, c14, c15, \
1489 c00c01, c02c03, c04c05, c06c07, \
1490 c08c09, c10c11, c12c13, c14c15) \
1491 }
1492
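/*
 * 32-bit identity-4 helper: 5793 / 4096 ~= sqrt(2), applied with +2048
 * rounding and >> 12.
 */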
1493 #define IDENTITY_4_I32(a, b, c, d) \
1494 { \
1495 DECLARE_SPLAT_I32(5793) \
1496 DECLARE_SPLAT_I32(2048) \
1497 MUL_4_INPLACE(a, b, c, d, v5793) \
1498 SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \
1499 }
1500
1501 #define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1502 cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1503 a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1504 { \
1505 IDENTITY_4_I32(cA0, cA1, cA2, cA3) \
1506 IDENTITY_4_I32(cB0, cB1, cB2, cB3) \
1507 IDENTITY_4_I32(cC0, cC1, cC2, cC3) \
1508 IDENTITY_4_I32(cD0, cD1, cD2, cD3) \
1509 }
1510
1511 #define identity_4x4_out(c00, c01, c02, c03, \
1512 c04, c05, c06, c07, \
1513 c08, c09, c10, c11, \
1514 c12, c13, c14, c15, \
1515 c00c01, c02c03, c04c05, c06c07, \
1516 c08c09, c10c11, c12c13, c14c15) \
1517 { \
1518 PACK_4x4(c00, c01, c02, c03, \
1519 c04, c05, c06, c07, \
1520 c08, c09, c10, c11, \
1521 c12, c13, c14, c15, \
1522 c00c01, c02c03, c04c05, c06c07, \
1523 c08c09, c10c11, c12c13, c14c15) \
1524 IDENTITY_4(c00c01, c02c03) \
1525 IDENTITY_4(c04c05, c06c07) \
1526 IDENTITY_4(c08c09, c10c11) \
1527 IDENTITY_4(c12c13, c14c15) \
1528 }
1529
1530 #define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1531 cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1532 a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1533 adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
1534 adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
1535 adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
1536 adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
1537
1538 #define adst_4x4_out(c00, c01, c02, c03, \
1539 c04, c05, c06, c07, \
1540 c08, c09, c10, c11, \
1541 c12, c13, c14, c15, \
1542 c00c01, c02c03, c04c05, c06c07, \
1543 c08c09, c10c11, c12c13, c14c15) \
1544 { \
1545 ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \
1546 ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \
1547 ADST_INNER_4(c08, c09, c10, c11, c08, c09, c10, c11) \
1548 ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \
1549 \
1550 PACK_4x4(c00, c01, c02, c03, \
1551 c04, c05, c06, c07, \
1552 c08, c09, c10, c11, \
1553 c12, c13, c14, c15, \
1554 c00c01, c02c03, c04c05, c06c07, \
1555 c08c09, c10c11, c12c13, c14c15) \
1556 }
1557
1558 #define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
1559 cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
1560 a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
1561 flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \
1562 flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \
1563 flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \
1564 flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3)
1565
1566 #define flipadst_4x4_out(c00, c01, c02, c03, \
1567 c04, c05, c06, c07, \
1568 c08, c09, c10, c11, \
1569 c12, c13, c14, c15, \
1570 c00c01, c02c03, c04c05, c06c07, \
1571 c08c09, c10c11, c12c13, c14c15) \
1572 { \
1573 ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \
1574 ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \
1575 ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) \
1576 ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \
1577 \
1578 PACK_4x4(c00, c01, c02, c03, \
1579 c04, c05, c06, c07, \
1580 c08, c09, c10, c11, \
1581 c12, c13, c14, c15, \
1582 c00c01, c02c03, c04c05, c06c07, \
1583 c08c09, c10c11, c12c13, c14c15) \
1584 }
1585
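/*
 * 16-point inverse ADST.  The first-stage rotation constants are the Q12
 * pairs 4091/201 through 601/4052 (~= 4096 * cos/sin(k * pi / 64) for
 * k = 1, 5, ..., 29); later stages reuse the pi/16, 3*pi/16 and pi/8
 * rotations and the 1/sqrt(2) scale.
 */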
1586 #define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \
1587 c08, c09, c10, c11, c12, c13, c14, c15, \
1588 o00, o01, o02, o03, o04, o05, o06, o07, \
1589 o08, o09, o10, o11, o12, o13, o14, o15, \
1590 c00c01, c02c03, c04c05, c06c07) \
1591 DECLARE_SPLAT_I32(2048); \
1592 u32x4 v12 = vec_splat_u32(12); \
1593 DECLARE_SPLAT_I32(4091) \
1594 DECLARE_SPLAT_I32(201) \
1595 DECLARE_SPLAT_I32(3973) \
1596 DECLARE_SPLAT_I32(995) \
1597 DECLARE_SPLAT_I32(3703) \
1598 DECLARE_SPLAT_I32(1751) \
1599 DECLARE_SPLAT_I32(3290) \
1600 DECLARE_SPLAT_I32(2440) \
1601 DECLARE_SPLAT_I32(2751) \
1602 DECLARE_SPLAT_I32(3035) \
1603 DECLARE_SPLAT_I32(2106) \
1604 DECLARE_SPLAT_I32(3513) \
1605 DECLARE_SPLAT_I32(1380) \
1606 DECLARE_SPLAT_I32(3857) \
1607 DECLARE_SPLAT_I32(601) \
1608 DECLARE_SPLAT_I32(4052) \
1609 \
1610 DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \
1611 DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \
1612 DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \
1613 DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \
1614 DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \
1615 DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \
1616 DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \
1617 DECLARE_MUL_PAIR_I32(c01, c14, v601, v4052) \
1618 \
1619 DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201);\
1620 DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \
1621 DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \
1622 DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \
1623 DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \
1624 DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \
1625 DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \
1626 DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14, v601, v4052) \
1627 \
1628 SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \
1629 SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \
1630 SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
1631 SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \
1632 \
1633 DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \
1634 DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \
1635 DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \
1636 DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \
1637 DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \
1638 DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \
1639 DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \
1640 DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \
1641 \
1642 CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \
1643 c00c01, c02c03, c04c05, c06c07); \
1644 CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \
1645 c00c01, c02c03, c04c05, c06c07); \
1646 \
1647 DECLARE_SPLAT_I32(4017) \
1648 DECLARE_SPLAT_I32(799) \
1649 DECLARE_SPLAT_I32(2276) \
1650 DECLARE_SPLAT_I32(3406) \
1651 \
1652 DECLARE_MUL_PAIR_I32(t08a, t09a, v4017, v799); \
1653 DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \
1654 DECLARE_MUL_PAIR_I32(t13a, t12a, v799, v4017); \
1655 DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \
1656 \
1657 ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017, v799); \
1658 ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \
1659 ADD_SUB_PAIR(t13, t12, t13a, t12a, v799, v4017); \
1660 ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \
1661 \
1662 SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \
1663 SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \
1664 \
1665 ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \
1666 ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \
1667 ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \
1668 ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \
1669 ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \
1670 ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \
1671 ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \
1672 ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \
1673 \
1674 CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \
1675 c00c01, c02c03, c04c05, c06c07) \
1676 CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \
1677 c00c01, c02c03, c04c05, c06c07) \
1678 \
    DECLARE_SPLAT_I32(3784) \
    DECLARE_SPLAT_I32(1567) \
    \
    DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \
    DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \
    DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \
    \
    ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \
    ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \
    ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \
    ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \
    \
    SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \
    SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \
    \
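    /* Butterfly into the outer outputs (o00..o03, o12..o15) and the \
     * temporaries that still need the final scaling, then clamp. */ \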
    ADD_SUB_PAIR(o00, t02a, t00, t02,,) \
    ADD_SUB_PAIR(o15, t03a, t01, t03,,) \
    ADD_SUB_PAIR(o03, t06, t04a, t06a,,) \
    ADD_SUB_PAIR(o12, t07, t05a, t07a,,) \
    ADD_SUB_PAIR(o01, t10, t08a, t10a,,) \
    ADD_SUB_PAIR(o14, t11, t09a, t11a,,) \
    ADD_SUB_PAIR(o02, t14a, t12, t14,,) \
    ADD_SUB_PAIR(o13, t15a, t13, t15,,) \
    \
    CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \
                 c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \
                 c00c01, c02c03, c04c05, c06c07) \
    \
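    /* The middle outputs o04..o11 come from add/sub butterflies scaled by \
     * (x * 181 + 128) >> 8, an approximation of x / sqrt(2). */ \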
    DECLARE_SPLAT_I32(181) \
    DECLARE_SPLAT_I32(128) \
    u32x4 v8 = vec_splat_u32(8); \
    \
    ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \
    ADD_SUB_PAIR(o04, o11, t06, t07,,) \
    ADD_SUB_PAIR(o06, o09, t10, t11,,) \
    ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \
    \
    MUL_4_INPLACE(o07, o08, o04, o11, v181) \
    MUL_4_INPLACE(o06, o09, o05, o10, v181) \
    \
    SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \
    SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \
    \
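    /* adst16 negates the odd-numbered outputs. */ \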
    o01 = -o01; \
    o03 = -o03; \
    o05 = -o05; \
    o07 = -o07; \
    o09 = -o09; \
    o11 = -o11; \
    o13 = -o13; \
    o15 = -o15;

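// adst_16_in runs the 16-point adst and leaves the results in the sixteen
// 32-bit row vectors; adst_16_out additionally packs them back into the
// eight 16-bit pair vectors (c00c01 .. c14c15) for the store path.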
#define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07) \
}

#define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                    c08, c09, c10, c11, c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07) \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
}

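// The flipadst variants feed ADST_INNER_16 the destination registers in
// reverse order, so the transform results come out flipped.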
#define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                       c08, c09, c10, c11, c12, c13, c14, c15, \
                       c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
                  c00c01, c02c03, c04c05, c06c07) \
}

#define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \
                        c08, c09, c10, c11, c12, c13, c14, c15, \
                        c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
{ \
    ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \
                  c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \
                  c00c01, c02c03, c04c05, c06c07) \
    PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \
           c00, c02, c04, c06, c08, c10, c12, c14, \
           c01, c03, c05, c07, c09, c11, c13, c15) \
}

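// 4x16 DCT_DCT: a 4-point DCT across each row, an inter-pass rounding of
// (x + 1) >> 1, a 4x16 transpose, then a 16-point DCT down each column,
// with the result added to the destination block.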
void dav1d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                               int16_t *const coeff, const int eob
                                               HIGHBD_DECL_SUFFIX)
{
    if (eob < 1) {
        return dc_only_4xN(dst, stride, coeff, 4, 0, 1);
    }

    LOAD_COEFF_4x16(coeff)

    dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)

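    // The coefficients are consumed: clear the buffer for the next block,
    // apply the inter-pass rounding (x + 1) >> 1, and transpose to columns.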
    memset(coeff, 0, sizeof(*coeff) * 4 * 16);

    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1))
    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3)

    dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3,
               cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3,
               a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3)

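    // Add the reconstructed residual to the destination, four rows at a time.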
    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03)
    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07)
    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11)
    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15)

    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0);
    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1);
    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2);
    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3);

    STORE_4(dst, stride, l00, l01, l02, l03);
    STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07);
    STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11);
    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15);
}

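// Generates the remaining 4x16 type1_type2 combinations; same structure as
// the dct_dct case above, minus the eob fast path.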
#define inv_txfm_fn4x16(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                           int16_t *const coeff, const int eob) \
{ \
    LOAD_COEFF_4x16(coeff) \
    type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    memset(coeff, 0, sizeof(*coeff) * 4 * 16); \
    SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \
    TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                      cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \
    type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \
                   cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \
                   a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \
    LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \
    LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \
    LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \
    LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \
    APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \
    APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \
    APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \
    APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \
    STORE_4(dst, stride, l00, l01, l02, l03); \
    STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07); \
    STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11); \
    STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \
}

inv_txfm_fn4x16(adst,     dct     )
inv_txfm_fn4x16(dct,      adst    )
inv_txfm_fn4x16(dct,      flipadst)
inv_txfm_fn4x16(flipadst, dct     )
inv_txfm_fn4x16(adst,     flipadst)
inv_txfm_fn4x16(flipadst, adst    )
inv_txfm_fn4x16(identity, dct     )
inv_txfm_fn4x16(dct,      identity)
inv_txfm_fn4x16(identity, flipadst)
inv_txfm_fn4x16(flipadst, identity)
inv_txfm_fn4x16(identity, adst    )
inv_txfm_fn4x16(adst,     identity)
inv_txfm_fn4x16(identity, identity)
inv_txfm_fn4x16(adst,     adst    )
inv_txfm_fn4x16(flipadst, flipadst)

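// 16x4 DCT_DCT: the transposed counterpart of the 4x16 case; a 16-point DCT
// across the four rows, the (x + 1) >> 1 rounding, four 4x4 transposes, then
// a 4-point DCT down the sixteen columns.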
void dav1d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride,
                                               int16_t *const coeff, const int eob)
{
    if (eob < 1) {
        return dc_only_16xN(dst, stride, coeff, 1, 0, 1);
    }

    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03)
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07)
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11)
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15)
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03)
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07)
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11)
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15)

    dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07,
              c08, c09, c10, c11, c12, c13, c14, c15,
              c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15)
    memset(coeff, 0, sizeof(*coeff) * 16 * 4);
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1))
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1))

    TRANSPOSE4_I32(c00, c01, c02, c03);
    TRANSPOSE4_I32(c04, c05, c06, c07);
    TRANSPOSE4_I32(c08, c09, c10, c11);
    TRANSPOSE4_I32(c12, c13, c14, c15);

    dct_4x4_out(c00, c01, c02, c03,
                c04, c05, c06, c07,
                c08, c09, c10, c11,
                c12, c13, c14, c15,
                c00c01, c02c03, c04c05, c06c07,
                c08c09, c10c11, c12c13, c14c15)

    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3)

    APPLY_COEFF_16x4(l0, l1, l2, l3,
                     c00c01, c02c03, c04c05, c06c07,
                     c08c09, c10c11, c12c13, c14c15)

    STORE_16(dst, stride, l0, l1, l2, l3)
}

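// Generates most of the remaining 16x4 combinations; identity-first variants
// that need an extra 16-bit clamp after the row pass are emitted separately
// below.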
#define inv_txfm_fn16x4(type1, type2) \
void dav1d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                           int16_t *const coeff, const int eob) \
{ \
    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
    type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                  c08, c09, c10, c11, c12, c13, c14, c15, \
                  c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
    TRANSPOSE4_I32(c00, c01, c02, c03); \
    TRANSPOSE4_I32(c04, c05, c06, c07); \
    TRANSPOSE4_I32(c08, c09, c10, c11); \
    TRANSPOSE4_I32(c12, c13, c14, c15); \
    type2##_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15); \
    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
    APPLY_COEFF_16x4(l0, l1, l2, l3, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
    STORE_16(dst, stride, l0, l1, l2, l3) \
}

inv_txfm_fn16x4(adst,     dct     )
inv_txfm_fn16x4(dct,      adst    )
inv_txfm_fn16x4(dct,      flipadst)
inv_txfm_fn16x4(flipadst, dct     )
inv_txfm_fn16x4(adst,     flipadst)
inv_txfm_fn16x4(flipadst, adst    )
inv_txfm_fn16x4(dct,      identity)
inv_txfm_fn16x4(flipadst, identity)
inv_txfm_fn16x4(adst,     identity)
inv_txfm_fn16x4(identity, identity)
inv_txfm_fn16x4(adst,     adst    )
inv_txfm_fn16x4(flipadst, flipadst)

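// Identity-first 16x4 variants: identical to inv_txfm_fn16x4 apart from an
// extra 16-bit clamp between the identity row pass and the transpose.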
#define inv_txfm_fn16x4_identity(type2) \
void dav1d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \
                                                          int16_t *const coeff, const int eob) \
{ \
    LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \
    LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \
    LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \
    LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \
    UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \
    UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \
    UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \
    UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \
    identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \
                   c08, c09, c10, c11, c12, c13, c14, c15, \
                   c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \
    memset(coeff, 0, sizeof(*coeff) * 16 * 4); \
    SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \
    SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \
    CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \
    CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \
    TRANSPOSE4_I32(c00, c01, c02, c03); \
    TRANSPOSE4_I32(c04, c05, c06, c07); \
    TRANSPOSE4_I32(c08, c09, c10, c11); \
    TRANSPOSE4_I32(c12, c13, c14, c15); \
    type2##_4x4_out(c00, c01, c02, c03, \
                    c04, c05, c06, c07, \
                    c08, c09, c10, c11, \
                    c12, c13, c14, c15, \
                    c00c01, c02c03, c04c05, c06c07, \
                    c08c09, c10c11, c12c13, c14c15); \
    LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \
    APPLY_COEFF_16x4(l0, l1, l2, l3, \
                     c00c01, c02c03, c04c05, c06c07, \
                     c08c09, c10c11, c12c13, c14c15) \
    STORE_16(dst, stride, l0, l1, l2, l3) \
}

inv_txfm_fn16x4_identity(dct)
inv_txfm_fn16x4_identity(adst)
inv_txfm_fn16x4_identity(flipadst)

#endif // BITDEPTH