xref: /aosp_15_r20/external/libaom/aom_dsp/x86/intrapred_utils.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
12*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
13*77c1e3ccSAndroid Build Coastguard Worker 
14*77c1e3ccSAndroid Build Coastguard Worker #include <emmintrin.h>  // SSE2
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
17*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
18*77c1e3ccSAndroid Build Coastguard Worker 
// Table of byte indices: 8 rows of 16 entries, 16-byte aligned.
//
// Row k begins each 8-entry half with k zero entries. After those, the low
// half lists the indices k, k+2, k+4, ... and the high half lists
// k+1, k+3, k+5, ...  Row 0 is therefore "all even indices, then all odd
// indices" of a 16-byte vector.
//
// NOTE(review): presumably used as a byte-shuffle control (one row per
// starting offset) by the directional intra predictors; the consumers are not
// visible in this header -- confirm against the callers.
static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
};
29*77c1e3ccSAndroid Build Coastguard Worker 
// Table of byte indices: 16 rows of 16 entries, 16-byte aligned.
//
// Entry [k][i] is (i - k) when i >= k and 0 otherwise, i.e. row k is the
// identity sequence 0, 1, 2, ... delayed by k positions with index 0 filling
// the leading slots. Row 0 is the identity; row 15 is all zeros.
//
// NOTE(review): presumably used as a byte-shuffle control that moves a vector
// up by k bytes while replicating its first byte; the consumers are not
// visible in this header -- confirm against the callers.
static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
48*77c1e3ccSAndroid Build Coastguard Worker 
// Table of int lane masks: 8 rows of 8 entries, 32-byte aligned.
//
// Row k has its first k + 1 entries set to -1 (all bits set) and the rest set
// to 0, so successive rows enable one more leading lane each. The rows are
// laid out two per source line below.
//
// NOTE(review): presumably used as a per-lane mask for 256-bit masked loads;
// the consumers are not visible in this header -- confirm against the callers.
static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
  { -1, 0, 0, 0, 0, 0, 0, 0 },       { -1, -1, 0, 0, 0, 0, 0, 0 },
  { -1, -1, -1, 0, 0, 0, 0, 0 },     { -1, -1, -1, -1, 0, 0, 0, 0 },
  { -1, -1, -1, -1, -1, 0, 0, 0 },   { -1, -1, -1, -1, -1, -1, 0, 0 },
  { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
};
55*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a block of 4 rows x 16 columns of bytes.
//
// x: four input vectors, each holding one 16-byte row.
// d: sixteen output vectors. The low 4 bytes of d[k] are
//    { x[0][k], x[1][k], x[2][k], x[3][k] }, i.e. column k of the input.
//    The upper 12 bytes of each output contain other columns or zeros and
//    are only a by-product of the shuffles.
static inline void transpose4x16_sse2(__m128i *x, __m128i *d) {
  // Interleave the bytes of rows (0,1) and rows (2,3), low and high halves.
  const __m128i rows01_lo = _mm_unpacklo_epi8(x[0], x[1]);
  const __m128i rows23_lo = _mm_unpacklo_epi8(x[2], x[3]);
  const __m128i rows01_hi = _mm_unpackhi_epi8(x[0], x[1]);
  const __m128i rows23_hi = _mm_unpackhi_epi8(x[2], x[3]);

  // After the 16-bit interleave every 32-bit lane is one complete column.
  const __m128i cols_0_3 = _mm_unpacklo_epi16(rows01_lo, rows23_lo);
  const __m128i cols_8_11 = _mm_unpacklo_epi16(rows01_hi, rows23_hi);
  const __m128i cols_4_7 = _mm_unpackhi_epi16(rows01_lo, rows23_lo);
  const __m128i cols_12_15 = _mm_unpackhi_epi16(rows01_hi, rows23_hi);

  // Regroup the columns so that lane 0 of each 64-bit half is c, c+4, ...
  const __m128i c0_c8_c1_c9 = _mm_unpacklo_epi32(cols_0_3, cols_8_11);
  const __m128i c4_c12_c5_c13 = _mm_unpacklo_epi32(cols_4_7, cols_12_15);
  const __m128i c2_c10_c3_c11 = _mm_unpackhi_epi32(cols_0_3, cols_8_11);
  const __m128i c6_c14_c7_c15 = _mm_unpackhi_epi32(cols_4_7, cols_12_15);

  // d[k] for k = 0..3 holds columns k, k+8, k+4, k+12 in its four lanes.
  d[0] = _mm_unpacklo_epi64(c0_c8_c1_c9, c4_c12_c5_c13);
  d[1] = _mm_unpackhi_epi64(c0_c8_c1_c9, c4_c12_c5_c13);
  d[2] = _mm_unpacklo_epi64(c2_c10_c3_c11, c6_c14_c7_c15);
  d[3] = _mm_unpackhi_epi64(c2_c10_c3_c11, c6_c14_c7_c15);

  // Byte shifts bring the remaining columns down into the low lane:
  // lane 2 (shift 8) -> column k+4, lane 1 (shift 4) -> column k+8,
  // lane 3 (shift 12) -> column k+12.
  for (int k = 0; k < 4; ++k) {
    const __m128i packed = d[k];
    d[k + 4] = _mm_srli_si128(packed, 8);
    d[k + 8] = _mm_srli_si128(packed, 4);
    d[k + 12] = _mm_srli_si128(packed, 12);
  }
}
93*77c1e3ccSAndroid Build Coastguard Worker 
// Finishes the transpose for eight source columns.
//
// pairs: eight vectors; pairs[i] holds the bytes of source rows 2*i and
//        2*i + 1 interleaved for the eight columns being processed.
// out:   receives eight vectors, one per column, each holding that column's
//        16 bytes in row order.
static inline void transpose16x16_half_sse2(const __m128i *pairs,
                                            __m128i *out) {
  __m128i quad0, quad1, quad2, quad3;
  __m128i top_a, top_b, bottom_a, bottom_b;

  // Columns 0-3 of this half: low 16-bit lanes of each row-pair vector.
  quad0 = _mm_unpacklo_epi16(pairs[0], pairs[1]);  // rows 0-3
  quad1 = _mm_unpacklo_epi16(pairs[2], pairs[3]);  // rows 4-7
  quad2 = _mm_unpacklo_epi16(pairs[4], pairs[5]);  // rows 8-11
  quad3 = _mm_unpacklo_epi16(pairs[6], pairs[7]);  // rows 12-15

  top_a = _mm_unpacklo_epi32(quad0, quad1);     // rows 0-7, two columns
  top_b = _mm_unpackhi_epi32(quad0, quad1);     // rows 0-7, next two columns
  bottom_a = _mm_unpacklo_epi32(quad2, quad3);  // rows 8-15, two columns
  bottom_b = _mm_unpackhi_epi32(quad2, quad3);  // rows 8-15, next two columns

  out[0] = _mm_unpacklo_epi64(top_a, bottom_a);
  out[1] = _mm_unpackhi_epi64(top_a, bottom_a);
  out[2] = _mm_unpacklo_epi64(top_b, bottom_b);
  out[3] = _mm_unpackhi_epi64(top_b, bottom_b);

  // Columns 4-7 of this half: high 16-bit lanes of each row-pair vector.
  quad0 = _mm_unpackhi_epi16(pairs[0], pairs[1]);
  quad1 = _mm_unpackhi_epi16(pairs[2], pairs[3]);
  quad2 = _mm_unpackhi_epi16(pairs[4], pairs[5]);
  quad3 = _mm_unpackhi_epi16(pairs[6], pairs[7]);

  top_a = _mm_unpacklo_epi32(quad0, quad1);
  top_b = _mm_unpackhi_epi32(quad0, quad1);
  bottom_a = _mm_unpacklo_epi32(quad2, quad3);
  bottom_b = _mm_unpackhi_epi32(quad2, quad3);

  out[4] = _mm_unpacklo_epi64(top_a, bottom_a);
  out[5] = _mm_unpackhi_epi64(top_a, bottom_a);
  out[6] = _mm_unpacklo_epi64(top_b, bottom_b);
  out[7] = _mm_unpackhi_epi64(top_b, bottom_b);
}

// Transposes a 16x16 block of bytes.
//
// x: sixteen input vectors, one 16-byte row each.
// d: sixteen output vectors; byte r of d[c] equals byte c of x[r].
//
// The rows of x are re-read after d[0..7] have been written, so d must not
// overlap x.
static inline void transpose16x16_sse2(__m128i *x, __m128i *d) {
  __m128i pairs[8];

  // Source columns 0-7 -> d[0..7].
  for (int i = 0; i < 8; ++i) {
    pairs[i] = _mm_unpacklo_epi8(x[2 * i], x[2 * i + 1]);
  }
  transpose16x16_half_sse2(pairs, d);

  // Source columns 8-15 -> d[8..15].
  for (int i = 0; i < 8; ++i) {
    pairs[i] = _mm_unpackhi_epi8(x[2 * i], x[2 * i + 1]);
  }
  transpose16x16_half_sse2(pairs, d + 8);
}
183*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes one 16x16 tile of bytes from src into dst.
//
// src:      top-left byte of the source tile.
// pitchSrc: distance in bytes between consecutive source rows.
// dst:      top-left byte of the destination tile.
// pitchDst: distance in bytes between consecutive destination rows.
//
// Both tiles are accessed with unaligned 16-byte loads/stores, so neither
// pointer needs any particular alignment. All 16 source rows are loaded
// before the first destination row is stored.
static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
                               uint8_t *dst, ptrdiff_t pitchDst) {
  __m128i rows[16];
  __m128i cols[16];

  for (int row = 0; row < 16; ++row) {
    const uint8_t *const line = src + row * pitchSrc;
    rows[row] = _mm_loadu_si128((const __m128i *)line);
  }

  transpose16x16_sse2(rows, cols);

  for (int row = 0; row < 16; ++row) {
    uint8_t *const line = dst + row * pitchDst;
    _mm_storeu_si128((__m128i *)line, cols[row]);
  }
}
196*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a byte image in 16x16 tiles.
//
// src, pitchSrc: source buffer and its row pitch in bytes.
// dst, pitchDst: destination buffer and its row pitch in bytes.
// width:  number of source rows to cover (equivalently, destination columns).
// height: number of source columns to cover (equivalently, destination rows).
//
// The source tile at (row i, column j) is written to the destination tile at
// (row j, column i). Every step processes a full 16x16 tile, so a width or
// height that is not a multiple of 16 still reads and writes a whole tile
// past that bound.
static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
                      ptrdiff_t pitchDst, int width, int height) {
  for (int src_col = 0; src_col < height; src_col += 16) {
    for (int src_row = 0; src_row < width; src_row += 16) {
      const uint8_t *const src_tile = src + src_row * pitchSrc + src_col;
      uint8_t *const dst_tile = dst + src_col * pitchDst + src_row;
      transpose_TX_16X16(src_tile, pitchSrc, dst_tile, pitchDst);
    }
  }
}
204*77c1e3ccSAndroid Build Coastguard Worker 
205*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
206