/*
 * Copyright 2022 The LibYuv Project Authors. All rights reserved.
 *
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"

#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
#include "libyuv/loongson_intrinsics.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

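// Interleave the low halves (vilvl.b) and high halves (vilvh.b) of two
// byte-vector pairs: (in0, in1) -> out0/out1 and (in2, in3) -> out2/out3.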
#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3)   \
  {                                                           \
    DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \
  }

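// Same interleave pattern as ILVLH_B, on 16-bit halfword elements.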
#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3)   \
  {                                                           \
    DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \
  }

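// Same interleave pattern, on 32-bit word elements.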
#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3)   \
  {                                                           \
    DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \
  }

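// Same interleave pattern, on 64-bit doubleword elements.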
#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3)   \
  {                                                           \
    DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \
  }

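// Store four vectors to four consecutive rows of _dst, then advance _dst
// by four rows (_strideN is N * _stride, precomputed by the caller).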
#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \
                 _stride3, _stride4)                                  \
  {                                                                   \
    __lsx_vst(_dst0, _dst, 0);                                        \
    __lsx_vstx(_dst1, _dst, _stride);                                 \
    __lsx_vstx(_dst2, _dst, _stride2);                                \
    __lsx_vstx(_dst3, _dst, _stride3);                                \
    _dst += _stride4;                                                 \
  }

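// Store two vectors to two consecutive rows of _dst, then advance _dst
// by two rows.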
#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \
  {                                                     \
    __lsx_vst(_dst0, _dst, 0);                          \
    __lsx_vstx(_dst1, _dst, _stride);                   \
    _dst += _stride2;                                   \
  }

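// C fallback: transpose a width x 16 block as two width x 8 halves; the
// lower 8 source rows land 8 columns over in the transposed output.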
void TransposeWx16_C(const uint8_t* src,
                     int src_stride,
                     uint8_t* dst,
                     int dst_stride,
                     int width) {
  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
                 width);
}

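// C fallback for interleaved UV: transpose a width x 16 block as two
// width x 8 halves into separate U and V destination planes.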
void TransposeUVWx16_C(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst_a,
                       int dst_stride_a,
                       uint8_t* dst_b,
                       int dst_stride_b,
                       int width) {
  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                   width);
  TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
                   dst_stride_a, (dst_b + 8), dst_stride_b, width);
}

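// LSX transpose: each loop iteration transposes one 16x16 byte tile by
// loading 16 rows and interleaving at byte, halfword, word and doubleword
// granularity. Only full 16-column tiles are processed here.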
void TransposeWx16_LSX(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst,
                       int dst_stride,
                       int width) {
  int x;
  int len = width / 16;
  uint8_t* s;
  int src_stride2 = src_stride << 1;
  int src_stride3 = src_stride + src_stride2;
  int src_stride4 = src_stride2 << 1;
  int dst_stride2 = dst_stride << 1;
  int dst_stride3 = dst_stride + dst_stride2;
  int dst_stride4 = dst_stride2 << 1;
  __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;

  for (x = 0; x < len; x++) {
    s = (uint8_t*)src;
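    // Load rows 0-7 of the tile and interleave them down to word
    // granularity (reg0-reg7, then res0-res7).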
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
    ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
    ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
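    // Load rows 8-15 and interleave them the same way, reusing reg0-reg7.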
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
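    // Merge the two 8-row groups at word/doubleword granularity and store
    // the 16 transposed rows, four rows per LSX_ST_4.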
    res8 = __lsx_vilvl_w(reg4, reg0);
    res9 = __lsx_vilvh_w(reg4, reg0);
    ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
    LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
             dst_stride4);
    res8 = __lsx_vilvl_w(reg5, reg1);
    res9 = __lsx_vilvh_w(reg5, reg1);
    ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
    LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
             dst_stride4);
    res8 = __lsx_vilvl_w(reg6, reg2);
    res9 = __lsx_vilvh_w(reg6, reg2);
    ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
    LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
             dst_stride4);
    res8 = __lsx_vilvl_w(reg7, reg3);
    res9 = __lsx_vilvh_w(reg7, reg3);
    ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
    LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
             dst_stride4);
    src += 16;
  }
}

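// LSX transpose for interleaved UV: each iteration reads a 16-row x
// 16-byte tile (8 UV pairs wide) and writes 8 transposed rows of 16 bytes
// to each of the two destination planes.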
void TransposeUVWx16_LSX(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
                         int dst_stride_a,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width) {
  int x;
  int len = width / 8;
  uint8_t* s;
  int src_stride2 = src_stride << 1;
  int src_stride3 = src_stride + src_stride2;
  int src_stride4 = src_stride2 << 1;
  int dst_stride_a2 = dst_stride_a << 1;
  int dst_stride_b2 = dst_stride_b << 1;
  __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;

  for (x = 0; x < len; x++) {
    s = (uint8_t*)src;
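    // Load rows 0-7 and interleave them down to word granularity.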
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
    ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
    ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
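    // Load rows 8-15 and interleave them likewise.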
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
    src0 = __lsx_vld(s, 0);
    src1 = __lsx_vldx(s, src_stride);
    src2 = __lsx_vldx(s, src_stride2);
    src3 = __lsx_vldx(s, src_stride3);
    s += src_stride4;
    ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
    ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
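    // Merge the two row groups and de-interleave: dst0/dst2 go to plane A
    // and dst1/dst3 to plane B, two rows per store.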
    res8 = __lsx_vilvl_w(reg4, reg0);
    res9 = __lsx_vilvh_w(reg4, reg0);
    ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
    LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
    LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
    res8 = __lsx_vilvl_w(reg5, reg1);
    res9 = __lsx_vilvh_w(reg5, reg1);
    ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
    LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
    LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
    res8 = __lsx_vilvl_w(reg6, reg2);
    res9 = __lsx_vilvh_w(reg6, reg2);
    ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
    LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
    LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
    res8 = __lsx_vilvl_w(reg7, reg3);
    res9 = __lsx_vilvh_w(reg7, reg3);
    ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
    LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
    LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
    src += 16;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)