xref: /aosp_15_r20/external/libaom/aom_dsp/x86/intrapred_utils.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
12 #define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
13 
14 #include <emmintrin.h>  // SSE2
15 #include "aom/aom_integer.h"
16 #include "config/aom_config.h"
17 #include "config/aom_dsp_rtcd.h"
18 
19 static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
20   { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
21   { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
22   { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
23   { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
24   { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
25   { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
26   { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
27   { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
28 };
29 
30 static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
31   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
32   { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
33   { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
34   { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
35   { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
36   { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
37   { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
38   { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
39   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
40   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
41   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
42   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
43   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
44   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
45   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
46   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
47 };
48 
49 static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
50   { -1, 0, 0, 0, 0, 0, 0, 0 },       { -1, -1, 0, 0, 0, 0, 0, 0 },
51   { -1, -1, -1, 0, 0, 0, 0, 0 },     { -1, -1, -1, -1, 0, 0, 0, 0 },
52   { -1, -1, -1, -1, -1, 0, 0, 0 },   { -1, -1, -1, -1, -1, -1, 0, 0 },
53   { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
54 };
55 
// Transposes a block of 4 rows by 16 columns of bytes.
//
// x: four input vectors; x[r] holds the 16 bytes of row r.
// d: sixteen output vectors. On return the low 4 bytes of d[c] are column c
//    of the input, i.e. byte c of x[0], x[1], x[2] and x[3], in that order.
//
// Bytes 4..15 of each d[c] are leftovers of the shuffles (other columns and
// shifted-in zeros) and are not part of the result. Every read of x happens
// before the first store to d.
static inline void transpose4x16_sse2(__m128i *x, __m128i *d) {
  // Interleave the row pairs (0,1) and (2,3) byte-wise, separately for
  // columns 0..7 (lo) and columns 8..15 (hi).
  const __m128i r01_lo = _mm_unpacklo_epi8(x[0], x[1]);
  const __m128i r23_lo = _mm_unpacklo_epi8(x[2], x[3]);
  const __m128i r01_hi = _mm_unpackhi_epi8(x[0], x[1]);
  const __m128i r23_hi = _mm_unpackhi_epi8(x[2], x[3]);

  // After the 16-bit interleave every 32-bit lane is one complete column.
  const __m128i c0_3 = _mm_unpacklo_epi16(r01_lo, r23_lo);    // cols 0..3
  const __m128i c8_11 = _mm_unpacklo_epi16(r01_hi, r23_hi);   // cols 8..11
  const __m128i c4_7 = _mm_unpackhi_epi16(r01_lo, r23_lo);    // cols 4..7
  const __m128i c12_15 = _mm_unpackhi_epi16(r01_hi, r23_hi);  // cols 12..15

  // Regroup the columns so that each of d[0..3] ends up holding columns
  // i, i + 8, i + 4, i + 12 in its four 32-bit lanes.
  const __m128i g0 = _mm_unpacklo_epi32(c0_3, c8_11);   // cols 0, 8, 1, 9
  const __m128i g2 = _mm_unpacklo_epi32(c4_7, c12_15);  // cols 4, 12, 5, 13
  const __m128i g1 = _mm_unpackhi_epi32(c0_3, c8_11);   // cols 2, 10, 3, 11
  const __m128i g3 = _mm_unpackhi_epi32(c4_7, c12_15);  // cols 6, 14, 7, 15

  d[0] = _mm_unpacklo_epi64(g0, g2);  // cols 0, 8, 4, 12
  d[1] = _mm_unpackhi_epi64(g0, g2);  // cols 1, 9, 5, 13
  d[2] = _mm_unpacklo_epi64(g1, g3);  // cols 2, 10, 6, 14
  d[3] = _mm_unpackhi_epi64(g1, g3);  // cols 3, 11, 7, 15

  // Byte-shift the remaining lanes of d[0..3] down to the low 4 bytes to
  // produce columns 4..15.
  for (int i = 0; i < 4; ++i) {
    d[4 + i] = _mm_srli_si128(d[i], 8);    // lane 2: column i + 4
    d[8 + i] = _mm_srli_si128(d[i], 4);    // lane 1: column i + 8
    d[12 + i] = _mm_srli_si128(d[i], 12);  // lane 3: column i + 12
  }
}
93 
// Transposes a 16x16 block of bytes.
//
// x: sixteen input vectors; x[r] holds the 16 bytes of row r.
// d: sixteen output vectors; on return d[c] holds column c of the input,
//    i.e. byte c of x[0], x[1], ..., x[15], in that order.
//
// x is read again after d[0..7] have been stored, so d must not overlap x.
static inline void transpose16x16_sse2(__m128i *x, __m128i *d) {
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  __m128i w10, w11, w12, w13, w14, w15;

  // Lower half: input columns 0..7.
  // Byte-interleave adjacent row pairs; w0..w3 cover rows 0..7 and w8..w11
  // cover rows 8..15.
  w0 = _mm_unpacklo_epi8(x[0], x[1]);
  w1 = _mm_unpacklo_epi8(x[2], x[3]);
  w2 = _mm_unpacklo_epi8(x[4], x[5]);
  w3 = _mm_unpacklo_epi8(x[6], x[7]);

  w8 = _mm_unpacklo_epi8(x[8], x[9]);
  w9 = _mm_unpacklo_epi8(x[10], x[11]);
  w10 = _mm_unpacklo_epi8(x[12], x[13]);
  w11 = _mm_unpacklo_epi8(x[14], x[15]);

  // Columns 0..3: merge to 4 rows per 32-bit lane, then 8 rows per 64-bit
  // lane.
  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store output rows 0..3 (rows 0..7 in the low qword, 8..15 in the high).
  d[0] = _mm_unpacklo_epi64(w6, w14);
  d[1] = _mm_unpackhi_epi64(w6, w14);
  d[2] = _mm_unpacklo_epi64(w7, w15);
  d[3] = _mm_unpackhi_epi64(w7, w15);

  // Columns 4..7: same merge using the high halves of the row-pair vectors.
  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store output rows 4..7.
  d[4] = _mm_unpacklo_epi64(w6, w14);
  d[5] = _mm_unpackhi_epi64(w6, w14);
  d[6] = _mm_unpacklo_epi64(w7, w15);
  d[7] = _mm_unpackhi_epi64(w7, w15);

  // Upper half: input columns 8..15, processed exactly like the lower half.
  w0 = _mm_unpackhi_epi8(x[0], x[1]);
  w1 = _mm_unpackhi_epi8(x[2], x[3]);
  w2 = _mm_unpackhi_epi8(x[4], x[5]);
  w3 = _mm_unpackhi_epi8(x[6], x[7]);

  w8 = _mm_unpackhi_epi8(x[8], x[9]);
  w9 = _mm_unpackhi_epi8(x[10], x[11]);
  w10 = _mm_unpackhi_epi8(x[12], x[13]);
  w11 = _mm_unpackhi_epi8(x[14], x[15]);

  // Columns 8..11.
  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store output rows 8..11.
  d[8] = _mm_unpacklo_epi64(w6, w14);
  d[9] = _mm_unpackhi_epi64(w6, w14);
  d[10] = _mm_unpacklo_epi64(w7, w15);
  d[11] = _mm_unpackhi_epi64(w7, w15);

  // Columns 12..15.
  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store output rows 12..15.
  d[12] = _mm_unpacklo_epi64(w6, w14);
  d[13] = _mm_unpackhi_epi64(w6, w14);
  d[14] = _mm_unpacklo_epi64(w7, w15);
  d[15] = _mm_unpackhi_epi64(w7, w15);
}
183 
// Transposes one 16x16 tile of bytes from src into dst.
//
// src:      top-left byte of the source tile; 16 bytes are read from each of
//           16 rows spaced pitchSrc bytes apart.
// pitchSrc: distance in bytes between consecutive source rows.
// dst:      top-left byte of the destination tile; 16 bytes are written to
//           each of 16 rows spaced pitchDst bytes apart.
// pitchDst: distance in bytes between consecutive destination rows.
//
// Unaligned loads and stores are used, so neither pointer needs any
// particular alignment. All 16 source rows are loaded before anything is
// stored.
static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
                               uint8_t *dst, ptrdiff_t pitchDst) {
  __m128i r[16];
  __m128i d[16];
  for (int j = 0; j < 16; j++) {
    // Keep the const qualifier of src in the cast; the load only reads.
    r[j] = _mm_loadu_si128((const __m128i *)(src + j * pitchSrc));
  }
  transpose16x16_sse2(r, d);
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
  }
}
196 
// Transposes a byte image in 16x16 tiles.
//
// src:      source image, read as `width` rows by `height` columns.
// pitchSrc: distance in bytes between consecutive source rows.
// dst:      destination image, written as `height` rows by `width` columns.
// pitchDst: distance in bytes between consecutive destination rows.
// width:    number of destination columns (equals the number of source rows).
// height:   number of destination rows (equals the number of source columns).
//
// Each step moves a complete 16x16 tile, so width and height must be
// multiples of 16; any other value makes the last tile reach past the stated
// extent in both src and dst.
static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
                      ptrdiff_t pitchDst, int width, int height) {
  for (int j = 0; j < height; j += 16) {
    for (int i = 0; i < width; i += 16) {
      // The source tile at (row i, column j) lands at (row j, column i).
      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
                         dst + j * pitchDst + i, pitchDst);
    }
  }
}
204 
205 #endif  // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
206