/*
 * Copyright 2022 The LibYuv Project Authors. All rights reserved.
 *
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
#include "libyuv/loongson_intrinsics.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#define ALPHA_VAL (-1)

// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
  {                                                      \
    ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]);      \
    vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]);      \
    ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]);      \
    vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]);      \
    yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]);     \
    yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
  }
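
// A rough sketch of the per-pixel math implemented by the YUVTORGB macros
// below (fixed point, 6 fractional bits; the READYUV422 loaders subtract 128
// from U and V before these macros run):
//   luma = ((Y * 0x0101) * yg >> 16) + yb
//   B = clamp255((luma + ub * U) >> 6)
//   G = clamp255((luma - (ug * U + vg * V)) >> 6)
//   R = clamp255((luma + vr * V) >> 6)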

// Load 32 YUV422 pixel data
#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
  {                                                             \
    __m256i temp0, temp1;                                       \
                                                                \
    DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
    temp1 = __lasx_xvld(psrc_v, 0);                             \
    temp0 = __lasx_xvsub_b(temp0, const_0x80);                  \
    temp1 = __lasx_xvsub_b(temp1, const_0x80);                  \
    temp0 = __lasx_vext2xv_h_b(temp0);                          \
    temp1 = __lasx_vext2xv_h_b(temp1);                          \
    uv_l = __lasx_xvilvl_h(temp0, temp1);                       \
    uv_h = __lasx_xvilvh_h(temp0, temp1);                       \
  }

// Load 16 YUV422 pixel data
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
  {                                                   \
    __m256i temp0, temp1;                             \
                                                      \
    out_y = __lasx_xvld(psrc_y, 0);                   \
    temp0 = __lasx_xvldrepl_d(psrc_u, 0);             \
    temp1 = __lasx_xvldrepl_d(psrc_v, 0);             \
    uv = __lasx_xvilvl_b(temp0, temp1);               \
    uv = __lasx_xvsub_b(uv, const_0x80);              \
    uv = __lasx_vext2xv_h_b(uv);                      \
  }

// Convert 16 pixels of YUV420 to RGB.
#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
                   g_h, r_l, r_h)                                           \
  {                                                                         \
    __m256i u_l, u_h, v_l, v_h;                                             \
    __m256i yl_ev, yl_od, yh_ev, yh_od;                                     \
    __m256i temp0, temp1, temp2, temp3;                                     \
                                                                            \
    temp0 = __lasx_xvilvl_b(in_y, in_y);                                    \
    temp1 = __lasx_xvilvh_b(in_y, in_y);                                    \
    yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg);                              \
    yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg);                              \
    yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg);                              \
    yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg);                              \
    DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16,  \
              yl_ev, yl_od, yh_ev, yh_od);                                  \
    yl_ev = __lasx_xvadd_w(yl_ev, yb);                                      \
    yl_od = __lasx_xvadd_w(yl_od, yb);                                      \
    yh_ev = __lasx_xvadd_w(yh_ev, yb);                                      \
    yh_od = __lasx_xvadd_w(yh_od, yb);                                      \
    v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr);                                \
    u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr);                                \
    v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr);                                \
    u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr);                                \
    temp0 = __lasx_xvadd_w(yl_ev, u_l);                                     \
    temp1 = __lasx_xvadd_w(yl_od, u_l);                                     \
    temp2 = __lasx_xvadd_w(yh_ev, u_h);                                     \
    temp3 = __lasx_xvadd_w(yh_od, u_h);                                     \
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6,      \
              temp0, temp1, temp2, temp3);                                  \
    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
              temp2, temp3);                                                \
    b_l = __lasx_xvpackev_h(temp1, temp0);                                  \
    b_h = __lasx_xvpackev_h(temp3, temp2);                                  \
    temp0 = __lasx_xvadd_w(yl_ev, v_l);                                     \
    temp1 = __lasx_xvadd_w(yl_od, v_l);                                     \
    temp2 = __lasx_xvadd_w(yh_ev, v_h);                                     \
    temp3 = __lasx_xvadd_w(yh_od, v_h);                                     \
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6,      \
              temp0, temp1, temp2, temp3);                                  \
    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
              temp2, temp3);                                                \
    r_l = __lasx_xvpackev_h(temp1, temp0);                                  \
    r_h = __lasx_xvpackev_h(temp3, temp2);                                  \
    DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h);      \
    temp0 = __lasx_xvsub_w(yl_ev, u_l);                                     \
    temp1 = __lasx_xvsub_w(yl_od, u_l);                                     \
    temp2 = __lasx_xvsub_w(yh_ev, u_h);                                     \
    temp3 = __lasx_xvsub_w(yh_od, u_h);                                     \
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6,      \
              temp0, temp1, temp2, temp3);                                  \
    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
              temp2, temp3);                                                \
    g_l = __lasx_xvpackev_h(temp1, temp0);                                  \
    g_h = __lasx_xvpackev_h(temp3, temp2);                                  \
  }

// Convert 8 pixels of YUV420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
  {                                                                    \
    __m256i u_l, v_l, yl_ev, yl_od;                                    \
    __m256i temp0, temp1;                                              \
                                                                       \
    in_y = __lasx_xvpermi_d(in_y, 0xD8);                               \
    temp0 = __lasx_xvilvl_b(in_y, in_y);                               \
    yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg);                         \
    yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg);                         \
    DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od);    \
    yl_ev = __lasx_xvadd_w(yl_ev, yb);                                 \
    yl_od = __lasx_xvadd_w(yl_od, yb);                                 \
    v_l = __lasx_xvmulwev_w_h(in_uv, ubvr);                            \
    u_l = __lasx_xvmulwod_w_h(in_uv, ubvr);                            \
    temp0 = __lasx_xvadd_w(yl_ev, u_l);                                \
    temp1 = __lasx_xvadd_w(yl_od, u_l);                                \
    DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1);      \
    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);         \
    out_b = __lasx_xvpackev_h(temp1, temp0);                           \
    temp0 = __lasx_xvadd_w(yl_ev, v_l);                                \
    temp1 = __lasx_xvadd_w(yl_od, v_l);                                \
    DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1);      \
    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);         \
    out_r = __lasx_xvpackev_h(temp1, temp0);                           \
    u_l = __lasx_xvdp2_w_h(in_uv, ugvg);                               \
    temp0 = __lasx_xvsub_w(yl_ev, u_l);                                \
    temp1 = __lasx_xvsub_w(yl_od, u_l);                                \
    DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1);      \
    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);         \
    out_g = __lasx_xvpackev_h(temp1, temp0);                           \
  }

// Pack and Store 16 ARGB values.
#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
  {                                                                    \
    __m256i temp0, temp1, temp2, temp3;                                \
                                                                       \
    temp0 = __lasx_xvpackev_b(g_l, b_l);                               \
    temp1 = __lasx_xvpackev_b(a_l, r_l);                               \
    temp2 = __lasx_xvpackev_b(g_h, b_h);                               \
    temp3 = __lasx_xvpackev_b(a_h, r_h);                               \
    r_l = __lasx_xvilvl_h(temp1, temp0);                               \
    r_h = __lasx_xvilvh_h(temp1, temp0);                               \
    g_l = __lasx_xvilvl_h(temp3, temp2);                               \
    g_h = __lasx_xvilvh_h(temp3, temp2);                               \
    temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20);                          \
    temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20);                          \
    temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31);                          \
    temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31);                          \
    __lasx_xvst(temp0, pdst_argb, 0);                                  \
    __lasx_xvst(temp1, pdst_argb, 32);                                 \
    __lasx_xvst(temp2, pdst_argb, 64);                                 \
    __lasx_xvst(temp3, pdst_argb, 96);                                 \
    pdst_argb += 128;                                                  \
  }

// Pack and Store 8 ARGB values.
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
  {                                                  \
    __m256i temp0, temp1, temp2, temp3;              \
                                                     \
    temp0 = __lasx_xvpackev_b(in_g, in_b);           \
    temp1 = __lasx_xvpackev_b(in_a, in_r);           \
    temp2 = __lasx_xvilvl_h(temp1, temp0);           \
    temp3 = __lasx_xvilvh_h(temp1, temp0);           \
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);    \
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);    \
    __lasx_xvst(temp0, pdst_argb, 0);                \
    __lasx_xvst(temp1, pdst_argb, 32);               \
    pdst_argb += 64;                                 \
  }
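
// With either STOREARGB variant the bytes are written to memory in B, G, R, A
// order per pixel, i.e. libyuv's little-endian ARGB layout.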

#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \
  {                                                                     \
    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                 \
    _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb);                         \
    _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb);                         \
    _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg);                         \
    _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg);                         \
    _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr);                         \
    _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr);                         \
    _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1);                             \
    _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3);                             \
    _tmpr = __lasx_xvavgr_hu(_reg0, _reg1);                             \
    _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb);              \
    _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr);              \
    _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg);                    \
    _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg);                    \
    _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr);                    \
    _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb);                    \
  }
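
// RGBTOUV averages each 2x2 block of B, G and R (current row and the _nex*
// row below it) and then accumulates scaled fixed-point versions of the
// standard BT.601 chroma expressions, roughly
//   U ~ 112 * B - 74 * G - 38 * R + 0x8080
//   V ~ 112 * R - 94 * G - 18 * B + 0x8080
// with the exact coefficient scaling coming from the const_* vectors that the
// call site defines.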

void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  int len = width / 64;
  __m256i src0, src1;
  __m256i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607,
                      0x08090A0B0C0D0E0F, 0x0001020304050607};
  src += width - 64;
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
              src1);
    src0 = __lasx_xvpermi_q(src0, src0, 0x01);
    src1 = __lasx_xvpermi_q(src1, src1, 0x01);
    __lasx_xvst(src1, dst, 0);
    __lasx_xvst(src0, dst, 32);
    dst += 64;
    src -= 64;
  }
}

void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  int len = width / 16;
  __m256i src, dst;
  __m256i shuffler = {0x0004000500060007, 0x0000000100020003,
                      0x0004000500060007, 0x0000000100020003};

  src_uv += (width - 16) << 1;
  for (x = 0; x < len; x++) {
    src = __lasx_xvld(src_uv, 0);
    dst = __lasx_xvshuf_h(shuffler, src, src);
    dst = __lasx_xvpermi_q(dst, dst, 0x01);
    __lasx_xvst(dst, dst_uv, 0);
    src_uv -= 32;
    dst_uv += 32;
  }
}

void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1;
  __m256i dst0, dst1;
  __m256i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504,
                      0x0B0A09080F0E0D0C, 0x0302010007060504};
  src += (width * 4) - 64;
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src, 0, src, 32, src0, src1);
    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
              src1);
    dst1 = __lasx_xvpermi_q(src0, src0, 0x01);
    dst0 = __lasx_xvpermi_q(src1, src1, 0x01);
    __lasx_xvst(dst0, dst, 0);
    __lasx_xvst(dst1, dst, 32);
    dst += 64;
    src -= 64;
  }
}

void I422ToYUY2Row_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  int x;
  int len = width / 32;
  __m256i src_u0, src_v0, src_y0, vec_uv0;
  __m256i vec_yuy2_0, vec_yuy2_1;
  __m256i dst_yuy2_0, dst_yuy2_1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
    src_y0 = __lasx_xvld(src_y, 0);
    src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
    src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
    vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
    vec_yuy2_0 = __lasx_xvilvl_b(vec_uv0, src_y0);
    vec_yuy2_1 = __lasx_xvilvh_b(vec_uv0, src_y0);
    dst_yuy2_0 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x20);
    dst_yuy2_1 = __lasx_xvpermi_q(vec_yuy2_1, vec_yuy2_0, 0x31);
    __lasx_xvst(dst_yuy2_0, dst_yuy2, 0);
    __lasx_xvst(dst_yuy2_1, dst_yuy2, 32);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_yuy2 += 64;
  }
}

void I422ToUYVYRow_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  int x;
  int len = width / 32;
  __m256i src_u0, src_v0, src_y0, vec_uv0;
  __m256i vec_uyvy0, vec_uyvy1;
  __m256i dst_uyvy0, dst_uyvy1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_u, 0, src_v, 0, src_u0, src_v0);
    src_y0 = __lasx_xvld(src_y, 0);
    src_u0 = __lasx_xvpermi_d(src_u0, 0xD8);
    src_v0 = __lasx_xvpermi_d(src_v0, 0xD8);
    vec_uv0 = __lasx_xvilvl_b(src_v0, src_u0);
    vec_uyvy0 = __lasx_xvilvl_b(src_y0, vec_uv0);
    vec_uyvy1 = __lasx_xvilvh_b(src_y0, vec_uv0);
    dst_uyvy0 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x20);
    dst_uyvy1 = __lasx_xvpermi_q(vec_uyvy1, vec_uyvy0, 0x31);
    __lasx_xvst(dst_uyvy0, dst_uyvy, 0);
    __lasx_xvst(dst_uyvy1, dst_uyvy, 32);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_uyvy += 64;
  }
}

void I422ToARGBRow_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  int x;
  int len = width / 32;
  __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
  __m256i vec_ubvr, vec_ugvg;
  __m256i alpha = __lasx_xvldi(0xFF);
  __m256i const_0x80 = __lasx_xvldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
  vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
    src_y += 32;
    src_u += 16;
    src_v += 16;
  }
}

void I422ToRGBARow_LASX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  int x;
  int len = width / 32;
  __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m256i vec_ubvr, vec_ugvg;
  __m256i alpha = __lasx_xvldi(0xFF);
  __m256i const_0x80 = __lasx_xvldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
  vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
    src_y += 32;
    src_u += 16;
    src_v += 16;
  }
}

void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  int x;
  int len = width / 32;
  int res = width & 31;
  __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m256i vec_ubvr, vec_ugvg;
  __m256i zero = __lasx_xvldi(0);
  __m256i const_0x80 = __lasx_xvldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
  vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;

    y = __lasx_xvld(src_a, 0);
    a_l = __lasx_xvilvl_b(zero, y);
    a_h = __lasx_xvilvh_b(zero, y);
    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
    src_y += 32;
    src_u += 16;
    src_v += 16;
    src_a += 32;
  }
  if (res) {
    __m256i y, uv, r, g, b, a;
    a = __lasx_xvld(src_a, 0);
    a = __lasx_vext2xv_hu_bu(a);
    READYUV422(src_y, src_u, src_v, y, uv);
    YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
    STOREARGB(a, r, g, b, dst_argb);
  }
}

void I422ToRGB24Row_LASX(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int32_t width) {
  int x;
  int len = width / 32;
  __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m256i vec_ubvr, vec_ugvg;
  __m256i const_0x80 = __lasx_xvldi(0x80);
  __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614,
                       0x0504120302100100, 0x0A18090816070614};
  __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B,
                       0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
  vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
    __m256i temp0, temp1, temp2, temp3;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    temp0 = __lasx_xvpackev_b(g_l, b_l);
    temp1 = __lasx_xvpackev_b(g_h, b_h);
    DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1,
              r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
              temp1);

    b_l = __lasx_xvilvl_d(temp1, temp2);
    b_h = __lasx_xvilvh_d(temp3, temp1);
    temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20);
    temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30);
    temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31);
    __lasx_xvst(temp1, dst_argb, 0);
    __lasx_xvst(temp2, dst_argb, 32);
    __lasx_xvst(temp3, dst_argb, 64);
    dst_argb += 96;
    src_y += 32;
    src_u += 16;
    src_v += 16;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
void I422ToRGB565Row_LASX(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int x;
  int len = width / 32;
  __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m256i vec_ubvr, vec_ugvg;
  __m256i const_0x80 = __lasx_xvldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
  vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
    __m256i dst_l, dst_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    b_l = __lasx_xvsrli_h(b_l, 3);
    b_h = __lasx_xvsrli_h(b_h, 3);
    g_l = __lasx_xvsrli_h(g_l, 2);
    g_h = __lasx_xvsrli_h(g_h, 2);
    r_l = __lasx_xvsrli_h(r_l, 3);
    r_h = __lasx_xvsrli_h(r_h, 3);
    r_l = __lasx_xvslli_h(r_l, 11);
    r_h = __lasx_xvslli_h(r_h, 11);
    g_l = __lasx_xvslli_h(g_l, 5);
    g_h = __lasx_xvslli_h(g_h, 5);
    r_l = __lasx_xvor_v(r_l, g_l);
    r_l = __lasx_xvor_v(r_l, b_l);
    r_h = __lasx_xvor_v(r_h, g_h);
    r_h = __lasx_xvor_v(r_h, b_h);
    dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
    dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
    __lasx_xvst(dst_l, dst_rgb565, 0);
    __lasx_xvst(dst_h, dst_rgb565, 32);
    dst_rgb565 += 64;
    src_y += 32;
    src_u += 16;
    src_v += 16;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
void I422ToARGB4444Row_LASX(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int len = width / 32;
  __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m256i vec_ubvr, vec_ugvg;
  __m256i const_0x80 = __lasx_xvldi(0x80);
  __m256i alpha = (__m256i)v4u64{0xF000F000F000F000, 0xF000F000F000F000,
                                 0xF000F000F000F000, 0xF000F000F000F000};
  __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0,
                  0x00F000F000F000F0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
  vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
    __m256i dst_l, dst_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    b_l = __lasx_xvsrli_h(b_l, 4);
    b_h = __lasx_xvsrli_h(b_h, 4);
    r_l = __lasx_xvsrli_h(r_l, 4);
    r_h = __lasx_xvsrli_h(r_h, 4);
    g_l = __lasx_xvand_v(g_l, mask);
    g_h = __lasx_xvand_v(g_h, mask);
    r_l = __lasx_xvslli_h(r_l, 8);
    r_h = __lasx_xvslli_h(r_h, 8);
    r_l = __lasx_xvor_v(r_l, alpha);
    r_h = __lasx_xvor_v(r_h, alpha);
    r_l = __lasx_xvor_v(r_l, g_l);
    r_h = __lasx_xvor_v(r_h, g_h);
    r_l = __lasx_xvor_v(r_l, b_l);
    r_h = __lasx_xvor_v(r_h, b_h);
    dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
    dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
    __lasx_xvst(dst_l, dst_argb4444, 0);
    __lasx_xvst(dst_h, dst_argb4444, 32);
    dst_argb4444 += 64;
    src_y += 32;
    src_u += 16;
    src_v += 16;
  }
}

void I422ToARGB1555Row_LASX(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int len = width / 32;
  __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m256i vec_ubvr, vec_ugvg;
  __m256i const_0x80 = __lasx_xvldi(0x80);
  __m256i alpha = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
                                 0x8000800080008000, 0x8000800080008000};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
  vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
    __m256i dst_l, dst_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    b_l = __lasx_xvsrli_h(b_l, 3);
    b_h = __lasx_xvsrli_h(b_h, 3);
    g_l = __lasx_xvsrli_h(g_l, 3);
    g_h = __lasx_xvsrli_h(g_h, 3);
    g_l = __lasx_xvslli_h(g_l, 5);
    g_h = __lasx_xvslli_h(g_h, 5);
    r_l = __lasx_xvsrli_h(r_l, 3);
    r_h = __lasx_xvsrli_h(r_h, 3);
    r_l = __lasx_xvslli_h(r_l, 10);
    r_h = __lasx_xvslli_h(r_h, 10);
    r_l = __lasx_xvor_v(r_l, alpha);
    r_h = __lasx_xvor_v(r_h, alpha);
    r_l = __lasx_xvor_v(r_l, g_l);
    r_h = __lasx_xvor_v(r_h, g_h);
    r_l = __lasx_xvor_v(r_l, b_l);
    r_h = __lasx_xvor_v(r_h, b_h);
    dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
    dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
    __lasx_xvst(dst_l, dst_argb1555, 0);
    __lasx_xvst(dst_h, dst_argb1555, 32);
    dst_argb1555 += 64;
    src_y += 32;
    src_u += 16;
    src_v += 16;
  }
}

void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
    dst0 = __lasx_xvpickev_b(src1, src0);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_y, 0);
    src_yuy2 += 64;
    dst_y += 32;
  }
}

void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
                      int src_stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
  int x;
  int len = width / 32;
  __m256i src0, src1, src2, src3;
  __m256i tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src_yuy2_next, 0,
              src_yuy2_next, 32, src0, src1, src2, src3);
    src0 = __lasx_xvpickod_b(src1, src0);
    src1 = __lasx_xvpickod_b(src3, src2);
    tmp0 = __lasx_xvavgr_bu(src1, src0);
    tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
    dst0 = __lasx_xvpickev_b(tmp0, tmp0);
    dst1 = __lasx_xvpickod_b(tmp0, tmp0);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_yuy2 += 64;
    src_yuy2_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_yuy2, 0, src_yuy2, 32, src0, src1);
    tmp0 = __lasx_xvpickod_b(src1, src0);
    tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
    dst0 = __lasx_xvpickev_b(tmp0, tmp0);
    dst1 = __lasx_xvpickod_b(tmp0, tmp0);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_yuy2 += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
    dst0 = __lasx_xvpickod_b(src1, src0);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_y, 0);
    src_uyvy += 64;
    dst_y += 32;
  }
}

void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
                      int src_stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
  int x;
  int len = width / 32;
  __m256i src0, src1, src2, src3, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src_uyvy_next, 0,
              src_uyvy_next, 32, src0, src1, src2, src3);
    src0 = __lasx_xvpickev_b(src1, src0);
    src1 = __lasx_xvpickev_b(src3, src2);
    tmp0 = __lasx_xvavgr_bu(src1, src0);
    tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
    dst0 = __lasx_xvpickev_b(tmp0, tmp0);
    dst1 = __lasx_xvpickod_b(tmp0, tmp0);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_uyvy += 64;
    src_uyvy_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_uyvy, 0, src_uyvy, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);
    dst0 = __lasx_xvpickev_b(tmp0, tmp0);
    dst1 = __lasx_xvpickod_b(tmp0, tmp0);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_uyvy += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void ARGBToUVRow_LASX(const uint8_t* src_argb0,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  int len = width / 32;
  const uint8_t* src_argb1 = src_argb0 + src_stride_argb;

  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  __m256i vec0, vec1, vec2, vec3;
  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
  __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
                        0x0038003800380038, 0x0038003800380038};
  __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
                        0x0025002500250025, 0x0025002500250025};
  __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
                        0x0013001300130013, 0x0013001300130013};
  __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
                        0x002f002f002f002f, 0x002f002f002f002f};
  __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
                        0x0009000900090009, 0x0009000900090009};
  __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
                     0x0000000700000003};
  __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
                                        0x8080808080808080, 0x8080808080808080};
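  // Note: the coefficient vectors above hold half of the nominal 112/74/38
  // (for U) and 112/94/18 (for V) weights; src0/src1/src2 computed below are
  // twice the 2x2 average (the two-row sums are only averaged across columns),
  // so the products still give the standard BT.601 chroma before the final
  // narrowing shift.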

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
              src_argb0, 96, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64,
              src_argb1, 96, src4, src5, src6, src7);
    vec0 = __lasx_xvaddwev_h_bu(src0, src4);
    vec1 = __lasx_xvaddwev_h_bu(src1, src5);
    vec2 = __lasx_xvaddwev_h_bu(src2, src6);
    vec3 = __lasx_xvaddwev_h_bu(src3, src7);
    tmp0 = __lasx_xvpickev_h(vec1, vec0);
    tmp1 = __lasx_xvpickev_h(vec3, vec2);
    tmp2 = __lasx_xvpickod_h(vec1, vec0);
    tmp3 = __lasx_xvpickod_h(vec3, vec2);
    vec0 = __lasx_xvaddwod_h_bu(src0, src4);
    vec1 = __lasx_xvaddwod_h_bu(src1, src5);
    vec2 = __lasx_xvaddwod_h_bu(src2, src6);
    vec3 = __lasx_xvaddwod_h_bu(src3, src7);
    tmp4 = __lasx_xvpickev_h(vec1, vec0);
    tmp5 = __lasx_xvpickev_h(vec3, vec2);
    vec0 = __lasx_xvpickev_h(tmp1, tmp0);
    vec1 = __lasx_xvpickod_h(tmp1, tmp0);
    src0 = __lasx_xvavgr_h(vec0, vec1);
    vec0 = __lasx_xvpickev_h(tmp3, tmp2);
    vec1 = __lasx_xvpickod_h(tmp3, tmp2);
    src1 = __lasx_xvavgr_h(vec0, vec1);
    vec0 = __lasx_xvpickev_h(tmp5, tmp4);
    vec1 = __lasx_xvpickod_h(tmp5, tmp4);
    src2 = __lasx_xvavgr_h(vec0, vec1);
    dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
    dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
    dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
    dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
    dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
    dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
    dst0 = __lasx_xvperm_w(dst0, control);
    dst1 = __lasx_xvperm_w(dst1, control);
    dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8);
    dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8);
    __lasx_xvstelm_d(dst0, dst_u, 0, 0);
    __lasx_xvstelm_d(dst0, dst_u, 8, 2);
    __lasx_xvstelm_d(dst1, dst_v, 0, 0);
    __lasx_xvstelm_d(dst1, dst_v, 8, 2);
    src_argb0 += 128;
    src_argb1 += 128;
    dst_u += 16;
    dst_v += 16;
  }
}

void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = (width / 32) - 1;
  __m256i src0, src1, src2, src3;
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100,
                  0x000000000E0D0C0A};
  __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
                     0x0000000700000003};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
              96, src0, src1, src2, src3);
    tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
    tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
    tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
    tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
    tmp0 = __lasx_xvperm_w(tmp0, control);
    tmp1 = __lasx_xvperm_w(tmp1, control);
    tmp2 = __lasx_xvperm_w(tmp2, control);
    tmp3 = __lasx_xvperm_w(tmp3, control);
    __lasx_xvst(tmp0, dst_rgb, 0);
    __lasx_xvst(tmp1, dst_rgb, 24);
    __lasx_xvst(tmp2, dst_rgb, 48);
    __lasx_xvst(tmp3, dst_rgb, 72);
    dst_rgb += 96;
    src_argb += 128;
  }
  DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
            src0, src1, src2, src3);
  tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
  tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
  tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
  tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
  tmp0 = __lasx_xvperm_w(tmp0, control);
  tmp1 = __lasx_xvperm_w(tmp1, control);
  tmp2 = __lasx_xvperm_w(tmp2, control);
  tmp3 = __lasx_xvperm_w(tmp3, control);
  __lasx_xvst(tmp0, dst_rgb, 0);
  __lasx_xvst(tmp1, dst_rgb, 24);
  __lasx_xvst(tmp2, dst_rgb, 48);
  dst_rgb += 72;
  __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
  __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
  __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
}

void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = (width / 32) - 1;
  __m256i src0, src1, src2, src3;
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102,
                  0x000000000C0D0E08};
  __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
                     0x0000000700000003};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
              96, src0, src1, src2, src3);
    tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
    tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
    tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
    tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
    tmp0 = __lasx_xvperm_w(tmp0, control);
    tmp1 = __lasx_xvperm_w(tmp1, control);
    tmp2 = __lasx_xvperm_w(tmp2, control);
    tmp3 = __lasx_xvperm_w(tmp3, control);
    __lasx_xvst(tmp0, dst_rgb, 0);
    __lasx_xvst(tmp1, dst_rgb, 24);
    __lasx_xvst(tmp2, dst_rgb, 48);
    __lasx_xvst(tmp3, dst_rgb, 72);
    dst_rgb += 96;
    src_argb += 128;
  }
  DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
            src0, src1, src2, src3);
  tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
  tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
  tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
  tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
  tmp0 = __lasx_xvperm_w(tmp0, control);
  tmp1 = __lasx_xvperm_w(tmp1, control);
  tmp2 = __lasx_xvperm_w(tmp2, control);
  tmp3 = __lasx_xvperm_w(tmp3, control);
  __lasx_xvst(tmp0, dst_rgb, 0);
  __lasx_xvst(tmp1, dst_rgb, 24);
  __lasx_xvst(tmp2, dst_rgb, 48);
  dst_rgb += 72;
  __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
  __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
  __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
}

void ARGBToRGB565Row_LASX(const uint8_t* src_argb,
                          uint8_t* dst_rgb,
                          int width) {
  int x;
  int len = width / 16;
  __m256i zero = __lasx_xvldi(0);
  __m256i src0, src1, tmp0, tmp1, dst0;
  __m256i shift = {0x0300030003000300, 0x0300030003000300, 0x0300030003000300,
                   0x0300030003000300};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    tmp0 = __lasx_xvsrli_b(tmp0, 3);
    tmp1 = __lasx_xvpackev_b(zero, tmp1);
    tmp1 = __lasx_xvsrli_h(tmp1, 2);
    tmp0 = __lasx_xvsll_b(tmp0, shift);
    tmp1 = __lasx_xvslli_h(tmp1, 5);
    dst0 = __lasx_xvor_v(tmp0, tmp1);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_rgb, 0);
    dst_rgb += 32;
    src_argb += 64;
  }
}

void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
                            uint8_t* dst_rgb,
                            int width) {
  int x;
  int len = width / 16;
  __m256i zero = __lasx_xvldi(0);
  __m256i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
  __m256i shift1 = {0x0703070307030703, 0x0703070307030703, 0x0703070307030703,
                    0x0703070307030703};
  __m256i shift2 = {0x0200020002000200, 0x0200020002000200, 0x0200020002000200,
                    0x0200020002000200};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    tmp0 = __lasx_xvsrli_b(tmp0, 3);
    tmp1 = __lasx_xvsrl_b(tmp1, shift1);
    tmp0 = __lasx_xvsll_b(tmp0, shift2);
    tmp2 = __lasx_xvpackev_b(zero, tmp1);
    tmp3 = __lasx_xvpackod_b(zero, tmp1);
    tmp2 = __lasx_xvslli_h(tmp2, 5);
    tmp3 = __lasx_xvslli_h(tmp3, 15);
    dst0 = __lasx_xvor_v(tmp0, tmp2);
    dst0 = __lasx_xvor_v(dst0, tmp3);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_rgb, 0);
    dst_rgb += 32;
    src_argb += 64;
  }
}

void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
                            uint8_t* dst_rgb,
                            int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    tmp1 = __lasx_xvandi_b(tmp1, 0xF0);
    tmp0 = __lasx_xvsrli_b(tmp0, 4);
    dst0 = __lasx_xvor_v(tmp1, tmp0);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_rgb, 0);
    dst_rgb += 32;
    src_argb += 64;
  }
}

void ARGBToUV444Row_LASX(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int32_t width) {
  int x;
  int len = width / 32;
  __m256i src0, src1, src2, src3;
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i reg0, reg1, reg2, reg3, dst0, dst1;
  __m256i const_112 = __lasx_xvldi(112);
  __m256i const_74 = __lasx_xvldi(74);
  __m256i const_38 = __lasx_xvldi(38);
  __m256i const_94 = __lasx_xvldi(94);
  __m256i const_18 = __lasx_xvldi(18);
  __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
                                        0x8080808080808080, 0x8080808080808080};
  __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
                     0x0000000700000003};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
              96, src0, src1, src2, src3);
    tmp0 = __lasx_xvpickev_h(src1, src0);
    tmp1 = __lasx_xvpickod_h(src1, src0);
    tmp2 = __lasx_xvpickev_h(src3, src2);
    tmp3 = __lasx_xvpickod_h(src3, src2);
    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);
    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
    reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);
    reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
    reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);
    reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38);
    reg0 = __lasx_xvsub_h(reg0, reg2);
    reg1 = __lasx_xvsub_h(reg1, reg3);
    dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
    dst0 = __lasx_xvperm_w(dst0, control);
    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);
    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
    reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);
    reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
    reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);
    reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94);
    reg0 = __lasx_xvsub_h(reg0, reg2);
    reg1 = __lasx_xvsub_h(reg1, reg3);
    dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8);
    dst1 = __lasx_xvperm_w(dst1, control);
    __lasx_xvst(dst0, dst_u, 0);
    __lasx_xvst(dst1, dst_v, 0);
    dst_u += 32;
    dst_v += 32;
    src_argb += 128;
  }
}

void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  int x;
  int len = width / 8;
  __m256i zero = __lasx_xvldi(0);
  __m256i src0, src1, dst0, dst1;
  __m256i tmp0, tmp1, tmp2, tmp3;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
    tmp0 = __lasx_xvilvl_b(src0, src0);
    tmp1 = __lasx_xvilvh_b(src0, src0);
    tmp2 = __lasx_xvilvl_b(zero, src1);
    tmp3 = __lasx_xvilvh_b(zero, src1);
    dst0 = __lasx_xvmuh_hu(tmp0, tmp2);
    dst1 = __lasx_xvmuh_hu(tmp1, tmp3);
    dst0 = __lasx_xvpickev_b(dst1, dst0);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

void ARGBAddRow_LASX(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  int x;
  int len = width / 8;
  __m256i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
    dst0 = __lasx_xvsadd_bu(src0, src1);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  int x;
  int len = width / 8;
  __m256i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb0, 0, src_argb1, 0, src0, src1);
    dst0 = __lasx_xvssub_bu(src0, src1);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1;
  __m256i reg0, reg1, reg2, reg3, reg4, reg5;
  __m256i b, g, r, a, dst0, dst1;
  __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000,
                     0x0007000300060002};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    b = __lasx_xvpackev_b(tmp0, tmp0);
    r = __lasx_xvpackod_b(tmp0, tmp0);
    g = __lasx_xvpackev_b(tmp1, tmp1);
    a = __lasx_xvpackod_b(tmp1, tmp1);
    reg0 = __lasx_xvmulwev_w_hu(b, a);
    reg1 = __lasx_xvmulwod_w_hu(b, a);
    reg2 = __lasx_xvmulwev_w_hu(r, a);
    reg3 = __lasx_xvmulwod_w_hu(r, a);
    reg4 = __lasx_xvmulwev_w_hu(g, a);
    reg5 = __lasx_xvmulwod_w_hu(g, a);
    reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
    reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
    reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
    reg0 = __lasx_xvshuf_h(control, reg0, reg0);
    reg2 = __lasx_xvshuf_h(control, reg2, reg2);
    reg4 = __lasx_xvshuf_h(control, reg4, reg4);
    tmp0 = __lasx_xvpackev_b(reg4, reg0);
    tmp1 = __lasx_xvpackev_b(a, reg2);
    dst0 = __lasx_xvilvl_h(tmp1, tmp0);
    dst1 = __lasx_xvilvh_h(tmp1, tmp0);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    dst_argb += 64;
    src_argb += 64;
  }
}

void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                uint32_t dither4,
                                int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1, dst0;
  __m256i b, g, r;
  __m256i zero = __lasx_xvldi(0);
  __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0);

  vec_dither = __lasx_xvilvl_b(zero, vec_dither);
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    b = __lasx_xvpackev_b(zero, tmp0);
    r = __lasx_xvpackod_b(zero, tmp0);
    g = __lasx_xvpackev_b(zero, tmp1);
    b = __lasx_xvadd_h(b, vec_dither);
    g = __lasx_xvadd_h(g, vec_dither);
    r = __lasx_xvadd_h(r, vec_dither);
    DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g);
    r = __lasx_xvclip255_h(r);
    b = __lasx_xvsrai_h(b, 3);
    g = __lasx_xvsrai_h(g, 2);
    r = __lasx_xvsrai_h(r, 3);
    g = __lasx_xvslli_h(g, 5);
    r = __lasx_xvslli_h(r, 11);
    dst0 = __lasx_xvor_v(b, g);
    dst0 = __lasx_xvor_v(dst0, r);
    dst0 = __lasx_xvpermi_d(dst0, 0xD8);
    __lasx_xvst(dst0, dst_rgb, 0);
    src_argb += 64;
    dst_rgb += 32;
  }
}

void ARGBShuffleRow_LASX(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, dst0, dst1;
  __m256i shuf = {0x0404040400000000, 0x0C0C0C0C08080808, 0x0404040400000000,
                  0x0C0C0C0C08080808};
  __m256i temp = __lasx_xvldrepl_w(shuffler, 0);

  shuf = __lasx_xvadd_b(shuf, temp);
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    dst0 = __lasx_xvshuf_b(src0, src0, shuf);
    dst1 = __lasx_xvshuf_b(src1, src1, shuf);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    src_argb += 64;
    dst_argb += 64;
  }
}

void ARGBShadeRow_LASX(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  int x;
  int len = width / 8;
  __m256i src0, dst0, tmp0, tmp1;
  __m256i vec_value = __lasx_xvreplgr2vr_w(value);

  vec_value = __lasx_xvilvl_b(vec_value, vec_value);
  for (x = 0; x < len; x++) {
    src0 = __lasx_xvld(src_argb, 0);
    tmp0 = __lasx_xvilvl_b(src0, src0);
    tmp1 = __lasx_xvilvh_b(src0, src0);
    tmp0 = __lasx_xvmuh_hu(tmp0, vec_value);
    tmp1 = __lasx_xvmuh_hu(tmp1, vec_value);
    dst0 = __lasx_xvpickod_b(tmp1, tmp0);
    __lasx_xvst(dst0, dst_argb, 0);
    src_argb += 32;
    dst_argb += 32;
  }
}

void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1;
  __m256i reg0, reg1, reg2, dst0, dst1;
  __m256i const_128 = __lasx_xvldi(0x480);
  __m256i const_150 = __lasx_xvldi(0x96);
  __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
                      0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    reg0 = __lasx_xvdp2_h_bu(tmp0, const_br);
    reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150);
    reg2 = __lasx_xvadd_h(reg0, reg1);
    tmp0 = __lasx_xvpackod_b(reg2, reg2);
    tmp1 = __lasx_xvpackod_b(tmp1, reg2);
    dst0 = __lasx_xvilvl_h(tmp1, tmp0);
    dst1 = __lasx_xvilvh_h(tmp1, tmp0);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    src_argb += 64;
    dst_argb += 64;
  }
}

void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) {
  int x;
  int len = width / 16;
  __m256i src0, src1, tmp0, tmp1;
  __m256i reg0, reg1, spb, spg, spr;
  __m256i dst0, dst1;
  __m256i spb_g = __lasx_xvldi(68);
  __m256i spg_g = __lasx_xvldi(88);
  __m256i spr_g = __lasx_xvldi(98);
  __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311,
                    0x2311231123112311};
  __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16,
                    0x2D162D162D162D16};
  __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218,
                    0x3218321832183218};
  __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100,
                   0x1F0E1D0C1B0A1908};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
    spr = __lasx_xvdp2_h_bu(tmp0, spr_br);
    spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g);
    spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g);
    spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g);
    spb = __lasx_xvsrli_h(spb, 7);
    spg = __lasx_xvsrli_h(spg, 7);
    spr = __lasx_xvsrli_h(spr, 7);
    spg = __lasx_xvsat_hu(spg, 7);
    spr = __lasx_xvsat_hu(spr, 7);
    reg0 = __lasx_xvpackev_b(spg, spb);
    reg1 = __lasx_xvshuf_b(tmp1, spr, shuff);
    dst0 = __lasx_xvilvl_h(reg1, reg0);
    dst1 = __lasx_xvilvh_h(reg1, reg0);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    dst_argb += 64;
  }
}

void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1;
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i reg0, reg1, reg2, reg3;
  __m256i dst0, dst1, dst2, dst3;

  for (x = 0; x < len; x++) {
    src0 = __lasx_xvld(src_argb4444, 0);
    src1 = __lasx_xvld(src_argb4444, 32);
    DUP4_ARG2(__lasx_xvandi_b, src0, 0x0F, src0, 0xF0, src1, 0x0F, src1, 0xF0,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvslli_b, tmp0, 4, tmp2, 4, reg0, reg2);
    DUP2_ARG2(__lasx_xvsrli_b, tmp1, 4, tmp3, 4, reg1, reg3);
    DUP4_ARG2(__lasx_xvor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_b, tmp1, tmp0, tmp3, tmp2, reg0, reg2);
    DUP2_ARG2(__lasx_xvilvh_b, tmp1, tmp0, tmp3, tmp2, reg1, reg3);
    DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg1, reg0, 0x31, reg3, reg2,
              0x20, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
    __lasx_xvst(dst0, dst_argb, 0);
    __lasx_xvst(dst1, dst_argb, 32);
    __lasx_xvst(dst2, dst_argb, 64);
    __lasx_xvst(dst3, dst_argb, 96);
    src_argb4444 += 64;
    dst_argb += 128;
  }
}

void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1;
  __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
  __m256i reg0, reg1, reg2, reg3;
  __m256i dst0, dst1, dst2, dst3;

  for (x = 0; x < len; x++) {
    src0 = __lasx_xvld(src_argb1555, 0);
    src1 = __lasx_xvld(src_argb1555, 32);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    tmpb = __lasx_xvandi_b(tmp0, 0x1F);
    tmpg = __lasx_xvsrli_b(tmp0, 5);
    reg0 = __lasx_xvandi_b(tmp1, 0x03);
    reg0 = __lasx_xvslli_b(reg0, 3);
    tmpg = __lasx_xvor_v(tmpg, reg0);
    reg1 = __lasx_xvandi_b(tmp1, 0x7C);
    tmpr = __lasx_xvsrli_b(reg1, 2);
    tmpa = __lasx_xvsrli_b(tmp1, 7);
    tmpa = __lasx_xvneg_b(tmpa);
    reg0 = __lasx_xvslli_b(tmpb, 3);
    reg1 = __lasx_xvslli_b(tmpg, 3);
    reg2 = __lasx_xvslli_b(tmpr, 3);
    tmpb = __lasx_xvsrli_b(tmpb, 2);
    tmpg = __lasx_xvsrli_b(tmpg, 2);
    tmpr = __lasx_xvsrli_b(tmpr, 2);
    tmpb = __lasx_xvor_v(reg0, tmpb);
    tmpg = __lasx_xvor_v(reg1, tmpg);
    tmpr = __lasx_xvor_v(reg2, tmpr);
    DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
    DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3);
    dst0 = __lasx_xvilvl_h(reg1, reg0);
    dst1 = __lasx_xvilvh_h(reg1, reg0);
    dst2 = __lasx_xvilvl_h(reg3, reg2);
    dst3 = __lasx_xvilvh_h(reg3, reg2);
    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
              0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
    __lasx_xvst(reg0, dst_argb, 0);
    __lasx_xvst(reg1, dst_argb, 32);
    __lasx_xvst(reg2, dst_argb, 64);
    __lasx_xvst(reg3, dst_argb, 96);
    src_argb1555 += 64;
    dst_argb += 128;
  }
}

void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  int x;
  int len = width / 32;
  __m256i src0, src1;
  __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
  __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3;
  __m256i alpha = __lasx_xvldi(0xFF);

  for (x = 0; x < len; x++) {
    src0 = __lasx_xvld(src_rgb565, 0);
    src1 = __lasx_xvld(src_rgb565, 32);
    tmp0 = __lasx_xvpickev_b(src1, src0);
    tmp1 = __lasx_xvpickod_b(src1, src0);
    tmpb = __lasx_xvandi_b(tmp0, 0x1F);
    tmpr = __lasx_xvandi_b(tmp1, 0xF8);
    reg1 = __lasx_xvandi_b(tmp1, 0x07);
    reg0 = __lasx_xvsrli_b(tmp0, 5);
    reg1 = __lasx_xvslli_b(reg1, 3);
    tmpg = __lasx_xvor_v(reg1, reg0);
    reg0 = __lasx_xvslli_b(tmpb, 3);
    reg1 = __lasx_xvsrli_b(tmpb, 2);
    tmpb = __lasx_xvor_v(reg1, reg0);
    reg0 = __lasx_xvslli_b(tmpg, 2);
    reg1 = __lasx_xvsrli_b(tmpg, 4);
    tmpg = __lasx_xvor_v(reg1, reg0);
    reg0 = __lasx_xvsrli_b(tmpr, 5);
    tmpr = __lasx_xvor_v(tmpr, reg0);
    DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
    dst0 = __lasx_xvilvl_h(reg1, reg0);
    dst1 = __lasx_xvilvh_h(reg1, reg0);
    DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
    dst2 = __lasx_xvilvl_h(reg1, reg0);
    dst3 = __lasx_xvilvh_h(reg1, reg0);
    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,
              0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
    __lasx_xvst(reg0, dst_argb, 0);
    __lasx_xvst(reg1, dst_argb, 32);
    __lasx_xvst(reg2, dst_argb, 64);
    __lasx_xvst(reg3, dst_argb, 96);
    src_rgb565 += 64;
    dst_argb += 128;
  }
}
1464
RGB24ToARGBRow_LASX(const uint8_t * src_rgb24,uint8_t * dst_argb,int width)1465 void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
1466 uint8_t* dst_argb,
1467 int width) {
1468 int x;
1469 int len = width / 32;
1470 __m256i src0, src1, src2;
1471 __m256i tmp0, tmp1, tmp2;
1472 __m256i dst0, dst1, dst2, dst3;
1473 __m256i reg0, reg1, reg2, reg3;
1474 __m256i alpha = __lasx_xvldi(0xFF);
1475 __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
1476 0x1B1A191817161514};
1477 __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
1478 0x0706050403020100};
1479 __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
1480 0x131211100F0E0D0C};
1481 __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100,
1482 0x100B0A0910080706};
1483
1484 for (x = 0; x < len; x++) {
1485 reg0 = __lasx_xvld(src_rgb24, 0);
1486 reg1 = __lasx_xvld(src_rgb24, 32);
1487 reg2 = __lasx_xvld(src_rgb24, 64);
1488 src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
1489 src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
1490 src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
1491 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
1492 tmp1);
1493 tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
1494 DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1495 tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
1496 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
1497 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
1498 __lasx_xvst(dst0, dst_argb, 0);
1499 __lasx_xvst(dst1, dst_argb, 32);
1500 __lasx_xvst(dst2, dst_argb, 64);
1501 __lasx_xvst(dst3, dst_argb, 96);
1502 src_rgb24 += 96;
1503 dst_argb += 128;
1504 }
1505 }
1506
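// Same as RGB24ToARGBRow_LASX but for RAW input; only the shuffle that
// inserts the alpha byte (shuf3) differs, swapping the channel order within
// each pixel.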
1507 void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
1508 int x;
1509 int len = width / 32;
1510 __m256i src0, src1, src2;
1511 __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3;
1512 __m256i dst0, dst1, dst2, dst3;
1513 __m256i alpha = __lasx_xvldi(0xFF);
1514 __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
1515 0x1B1A191817161514};
1516 __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
1517 0x0706050403020100};
1518 __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
1519 0x131211100F0E0D0C};
1520 __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102,
1521 0x10090A0B10060708};
1522
1523 for (x = 0; x < len; x++) {
1524 reg0 = __lasx_xvld(src_raw, 0);
1525 reg1 = __lasx_xvld(src_raw, 32);
1526 reg2 = __lasx_xvld(src_raw, 64);
1527 src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
1528 src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
1529 src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
1530 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
1531 tmp1);
1532 tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
1533 DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1534 tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
1535 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,
1536 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
1537 __lasx_xvst(dst0, dst_argb, 0);
1538 __lasx_xvst(dst1, dst_argb, 32);
1539 __lasx_xvst(dst2, dst_argb, 64);
1540 __lasx_xvst(dst3, dst_argb, 96);
1541 src_raw += 96;
1542 dst_argb += 128;
1543 }
1544 }
1545
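// Convert 32 ARGB1555 pixels to 32 Y values per loop iteration. Channels are
// first expanded from 5 to 8 bits; an equivalent per-pixel form of the vector
// math below is
//   Y = (25 * B + 129 * G + 66 * R + 0x1080) >> 8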
1546 void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
1547 uint8_t* dst_y,
1548 int width) {
1549 int x;
1550 int len = width / 32;
1551 __m256i src0, src1;
1552 __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
1553 __m256i reg0, reg1, reg2, dst0;
1554 __m256i const_66 = __lasx_xvldi(66);
1555 __m256i const_129 = __lasx_xvldi(129);
1556 __m256i const_25 = __lasx_xvldi(25);
1557 __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
1558 0x1080108010801080, 0x1080108010801080};
1559
1560 for (x = 0; x < len; x++) {
1561 src0 = __lasx_xvld(src_argb1555, 0);
1562 src1 = __lasx_xvld(src_argb1555, 32);
1563 tmp0 = __lasx_xvpickev_b(src1, src0);
1564 tmp1 = __lasx_xvpickod_b(src1, src0);
1565 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1566 tmpg = __lasx_xvsrli_b(tmp0, 5);
1567 reg0 = __lasx_xvandi_b(tmp1, 0x03);
1568 reg0 = __lasx_xvslli_b(reg0, 3);
1569 tmpg = __lasx_xvor_v(tmpg, reg0);
1570 reg1 = __lasx_xvandi_b(tmp1, 0x7C);
1571 tmpr = __lasx_xvsrli_b(reg1, 2);
1572 reg0 = __lasx_xvslli_b(tmpb, 3);
1573 reg1 = __lasx_xvslli_b(tmpg, 3);
1574 reg2 = __lasx_xvslli_b(tmpr, 3);
1575 tmpb = __lasx_xvsrli_b(tmpb, 2);
1576 tmpg = __lasx_xvsrli_b(tmpg, 2);
1577 tmpr = __lasx_xvsrli_b(tmpr, 2);
1578 tmpb = __lasx_xvor_v(reg0, tmpb);
1579 tmpg = __lasx_xvor_v(reg1, tmpg);
1580 tmpr = __lasx_xvor_v(reg2, tmpr);
1581 reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
1582 reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
1583 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
1584 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
1585 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
1586 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
1587 dst0 = __lasx_xvpackod_b(reg1, reg0);
1588 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1589 __lasx_xvst(dst0, dst_y, 0);
1590 src_argb1555 += 64;
1591 dst_y += 32;
1592 }
1593 }
1594
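// Compute one row of U and V from two rows of ARGB1555 pixels (32 per row)
// per loop iteration: channels are expanded to 8 bits and handed to the
// RGBTOUV helper together with the coefficients and 0x8080 bias declared in
// the function.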
1595 void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
1596 int src_stride_argb1555,
1597 uint8_t* dst_u,
1598 uint8_t* dst_v,
1599 int width) {
1600 int x;
1601 int len = width / 32;
1602 const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
1603 __m256i src0, src1, src2, src3;
1604 __m256i tmp0, tmp1, tmp2, tmp3;
1605 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1606 __m256i reg0, reg1, reg2, reg3, dst0;
1607 __m256i const_112 = __lasx_xvldi(0x438);
1608 __m256i const_74 = __lasx_xvldi(0x425);
1609 __m256i const_38 = __lasx_xvldi(0x413);
1610 __m256i const_94 = __lasx_xvldi(0x42F);
1611 __m256i const_18 = __lasx_xvldi(0x409);
1612 __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
1613 0x8080808080808080, 0x8080808080808080};
1614
1615 for (x = 0; x < len; x++) {
1616 DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
1617 next_argb1555, 32, src0, src1, src2, src3);
1618 DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
1619 DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
1620 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1621 nexb = __lasx_xvandi_b(tmp2, 0x1F);
1622 tmpg = __lasx_xvsrli_b(tmp0, 5);
1623 nexg = __lasx_xvsrli_b(tmp2, 5);
1624 reg0 = __lasx_xvandi_b(tmp1, 0x03);
1625 reg2 = __lasx_xvandi_b(tmp3, 0x03);
1626 reg0 = __lasx_xvslli_b(reg0, 3);
1627 reg2 = __lasx_xvslli_b(reg2, 3);
1628 tmpg = __lasx_xvor_v(tmpg, reg0);
1629 nexg = __lasx_xvor_v(nexg, reg2);
1630 reg1 = __lasx_xvandi_b(tmp1, 0x7C);
1631 reg3 = __lasx_xvandi_b(tmp3, 0x7C);
1632 tmpr = __lasx_xvsrli_b(reg1, 2);
1633 nexr = __lasx_xvsrli_b(reg3, 2);
1634 reg0 = __lasx_xvslli_b(tmpb, 3);
1635 reg1 = __lasx_xvslli_b(tmpg, 3);
1636 reg2 = __lasx_xvslli_b(tmpr, 3);
1637 tmpb = __lasx_xvsrli_b(tmpb, 2);
1638 tmpg = __lasx_xvsrli_b(tmpg, 2);
1639 tmpr = __lasx_xvsrli_b(tmpr, 2);
1640 tmpb = __lasx_xvor_v(reg0, tmpb);
1641 tmpg = __lasx_xvor_v(reg1, tmpg);
1642 tmpr = __lasx_xvor_v(reg2, tmpr);
1643 reg0 = __lasx_xvslli_b(nexb, 3);
1644 reg1 = __lasx_xvslli_b(nexg, 3);
1645 reg2 = __lasx_xvslli_b(nexr, 3);
1646 nexb = __lasx_xvsrli_b(nexb, 2);
1647 nexg = __lasx_xvsrli_b(nexg, 2);
1648 nexr = __lasx_xvsrli_b(nexr, 2);
1649 nexb = __lasx_xvor_v(reg0, nexb);
1650 nexg = __lasx_xvor_v(reg1, nexg);
1651 nexr = __lasx_xvor_v(reg2, nexr);
1652 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
1653 reg0 = __lasx_xvpermi_d(reg0, 0xD8);
1654 reg1 = __lasx_xvpermi_d(reg1, 0xD8);
1655 dst0 = __lasx_xvpickod_b(reg1, reg0);
1656 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
1657 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
1658 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
1659 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
1660 src_argb1555 += 64;
1661 next_argb1555 += 64;
1662 dst_u += 16;
1663 dst_v += 16;
1664 }
1665 }
1666
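// Convert 32 RGB565 pixels to 32 Y values per loop iteration: channels are
// expanded to 8 bits as in RGB565ToARGBRow_LASX, then (an equivalent
// per-pixel form of the vector math below)
//   Y = (25 * B + 129 * G + 66 * R + 0x1080) >> 8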
1667 void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
1668 int x;
1669 int len = width / 32;
1670 __m256i src0, src1;
1671 __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
1672 __m256i reg0, reg1, dst0;
1673 __m256i const_66 = __lasx_xvldi(66);
1674 __m256i const_129 = __lasx_xvldi(129);
1675 __m256i const_25 = __lasx_xvldi(25);
1676 __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
1677 0x1080108010801080, 0x1080108010801080};
1678
1679 for (x = 0; x < len; x++) {
1680 src0 = __lasx_xvld(src_rgb565, 0);
1681 src1 = __lasx_xvld(src_rgb565, 32);
1682 tmp0 = __lasx_xvpickev_b(src1, src0);
1683 tmp1 = __lasx_xvpickod_b(src1, src0);
1684 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1685 tmpr = __lasx_xvandi_b(tmp1, 0xF8);
1686 reg1 = __lasx_xvandi_b(tmp1, 0x07);
1687 reg0 = __lasx_xvsrli_b(tmp0, 5);
1688 reg1 = __lasx_xvslli_b(reg1, 3);
1689 tmpg = __lasx_xvor_v(reg1, reg0);
1690 reg0 = __lasx_xvslli_b(tmpb, 3);
1691 reg1 = __lasx_xvsrli_b(tmpb, 2);
1692 tmpb = __lasx_xvor_v(reg1, reg0);
1693 reg0 = __lasx_xvslli_b(tmpg, 2);
1694 reg1 = __lasx_xvsrli_b(tmpg, 4);
1695 tmpg = __lasx_xvor_v(reg1, reg0);
1696 reg0 = __lasx_xvsrli_b(tmpr, 5);
1697 tmpr = __lasx_xvor_v(tmpr, reg0);
1698 reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);
1699 reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
1700 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
1701 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
1702 reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
1703 reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
1704 dst0 = __lasx_xvpackod_b(reg1, reg0);
1705 dst0 = __lasx_xvpermi_d(dst0, 0xD8);
1706 __lasx_xvst(dst0, dst_y, 0);
1707 dst_y += 32;
1708 src_rgb565 += 64;
1709 }
1710 }
1711
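// Compute one row of U and V from two rows of RGB565 pixels (32 per row) per
// loop iteration, expanding the channels to 8 bits and combining the two rows
// via the RGBTOUV helper.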
1712 void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
1713 int src_stride_rgb565,
1714 uint8_t* dst_u,
1715 uint8_t* dst_v,
1716 int width) {
1717 int x;
1718 int len = width / 32;
1719 const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
1720 __m256i src0, src1, src2, src3;
1721 __m256i tmp0, tmp1, tmp2, tmp3;
1722 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1723 __m256i reg0, reg1, reg2, reg3, dst0;
1724 __m256i const_112 = __lasx_xvldi(0x438);
1725 __m256i const_74 = __lasx_xvldi(0x425);
1726 __m256i const_38 = __lasx_xvldi(0x413);
1727 __m256i const_94 = __lasx_xvldi(0x42F);
1728 __m256i const_18 = __lasx_xvldi(0x409);
1729 __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
1730 0x8080808080808080, 0x8080808080808080};
1731
1732 for (x = 0; x < len; x++) {
1733 DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0,
1734 next_rgb565, 32, src0, src1, src2, src3);
1735 DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
1736 DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
1737 tmpb = __lasx_xvandi_b(tmp0, 0x1F);
1738 tmpr = __lasx_xvandi_b(tmp1, 0xF8);
1739 nexb = __lasx_xvandi_b(tmp2, 0x1F);
1740 nexr = __lasx_xvandi_b(tmp3, 0xF8);
1741 reg1 = __lasx_xvandi_b(tmp1, 0x07);
1742 reg3 = __lasx_xvandi_b(tmp3, 0x07);
1743 reg0 = __lasx_xvsrli_b(tmp0, 5);
1744 reg1 = __lasx_xvslli_b(reg1, 3);
1745 reg2 = __lasx_xvsrli_b(tmp2, 5);
1746 reg3 = __lasx_xvslli_b(reg3, 3);
1747 tmpg = __lasx_xvor_v(reg1, reg0);
1748 nexg = __lasx_xvor_v(reg2, reg3);
1749 reg0 = __lasx_xvslli_b(tmpb, 3);
1750 reg1 = __lasx_xvsrli_b(tmpb, 2);
1751 reg2 = __lasx_xvslli_b(nexb, 3);
1752 reg3 = __lasx_xvsrli_b(nexb, 2);
1753 tmpb = __lasx_xvor_v(reg1, reg0);
1754 nexb = __lasx_xvor_v(reg2, reg3);
1755 reg0 = __lasx_xvslli_b(tmpg, 2);
1756 reg1 = __lasx_xvsrli_b(tmpg, 4);
1757 reg2 = __lasx_xvslli_b(nexg, 2);
1758 reg3 = __lasx_xvsrli_b(nexg, 4);
1759 tmpg = __lasx_xvor_v(reg1, reg0);
1760 nexg = __lasx_xvor_v(reg2, reg3);
1761 reg0 = __lasx_xvsrli_b(tmpr, 5);
1762 reg2 = __lasx_xvsrli_b(nexr, 5);
1763 tmpr = __lasx_xvor_v(tmpr, reg0);
1764 nexr = __lasx_xvor_v(nexr, reg2);
1765 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
1766 reg0 = __lasx_xvpermi_d(reg0, 0xD8);
1767 reg1 = __lasx_xvpermi_d(reg1, 0xD8);
1768 dst0 = __lasx_xvpickod_b(reg1, reg0);
1769 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
1770 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
1771 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
1772 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
1773 dst_u += 16;
1774 dst_v += 16;
1775 src_rgb565 += 64;
1776 next_rgb565 += 64;
1777 }
1778 }
1779
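// Compute one row of U and V from two rows of RGB24 pixels (32 per row) per
// loop iteration. The shuffle tables below gather the B, G and R bytes of
// each row before the channels are passed to the RGBTOUV helper.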
1780 void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
1781 int src_stride_rgb24,
1782 uint8_t* dst_u,
1783 uint8_t* dst_v,
1784 int width) {
1785 int x;
1786 const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
1787 int len = width / 32;
1788 __m256i src0, src1, src2, reg0, reg1, reg2;
1789 __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
1790 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1791 __m256i const_112 = __lasx_xvldi(0x438);
1792 __m256i const_74 = __lasx_xvldi(0x425);
1793 __m256i const_38 = __lasx_xvldi(0x413);
1794 __m256i const_94 = __lasx_xvldi(0x42F);
1795 __m256i const_18 = __lasx_xvldi(0x409);
1796 __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
1797 0x8080808080808080, 0x8080808080808080};
1798 __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18,
1799 0x15120F0C09060300, 0x00000000001E1B18};
1800 __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908,
1801 0x0706050403020100, 0x1D1A1714110A0908};
1802 __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
1803 0x1613100D0A070401, 0x00000000001F1C19};
1804 __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
1805 0x0706050403020100, 0x1E1B1815120A0908};
1806 __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A,
1807 0x1714110E0B080502, 0x0000000000001D1A};
1808 __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908,
1809 0x0706050403020100, 0x1F1C191613100908};
1810
1811 for (x = 0; x < len; x++) {
1812 DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64,
1813 next_rgb24, 0, reg0, reg1, reg2, tmp0);
1814 DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2);
1815 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
1816 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
1817 DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
1818 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
1819 nexb);
1820 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
1821 nexg);
1822 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
1823 nexr);
1824 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
1825 nexb);
1826 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
1827 nexg);
1828 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
1829 nexr);
1830 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
1831 dst0 = __lasx_xvpickod_b(reg1, reg0);
1832 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
1833 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
1834 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
1835 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
1836 src_rgb24 += 96;
1837 next_rgb24 += 96;
1838 dst_u += 16;
1839 dst_v += 16;
1840 }
1841 }
1842
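// Same as RGB24ToUVRow_LASX but for RAW input; the B and R gather tables are
// swapped to account for the reversed channel order.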
1843 void RAWToUVRow_LASX(const uint8_t* src_raw,
1844 int src_stride_raw,
1845 uint8_t* dst_u,
1846 uint8_t* dst_v,
1847 int width) {
1848 int x;
1849 const uint8_t* next_raw = src_raw + src_stride_raw;
1850 int len = width / 32;
1851 __m256i src0, src1, src2, reg0, reg1, reg2;
1852 __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
1853 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1854 __m256i const_112 = __lasx_xvldi(0x438);
1855 __m256i const_74 = __lasx_xvldi(0x425);
1856 __m256i const_38 = __lasx_xvldi(0x413);
1857 __m256i const_94 = __lasx_xvldi(0x42F);
1858 __m256i const_18 = __lasx_xvldi(0x409);
1859 __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
1860 0x8080808080808080, 0x8080808080808080};
1861 __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18,
1862 0x15120F0C09060300, 0x00000000001E1B18};
1863 __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908,
1864 0x0706050403020100, 0x1D1A1714110A0908};
1865 __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
1866 0x1613100D0A070401, 0x00000000001F1C19};
1867 __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
1868 0x0706050403020100, 0x1E1B1815120A0908};
1869 __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A,
1870 0x1714110E0B080502, 0x0000000000001D1A};
1871 __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908,
1872 0x0706050403020100, 0x1F1C191613100908};
1873
1874 for (x = 0; x < len; x++) {
1875 DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0,
1876 reg0, reg1, reg2, tmp0);
1877 DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2);
1878 DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
1879 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
1880 DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
1881 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
1882 nexb);
1883 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
1884 nexg);
1885 DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
1886 nexr);
1887 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
1888 nexb);
1889 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
1890 nexg);
1891 DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
1892 nexr);
1893 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
1894 dst0 = __lasx_xvpickod_b(reg1, reg0);
1895 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
1896 __lasx_xvstelm_d(dst0, dst_v, 0, 1);
1897 __lasx_xvstelm_d(dst0, dst_u, 8, 2);
1898 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
1899 src_raw += 96;
1900 next_raw += 96;
1901 dst_u += 16;
1902 dst_v += 16;
1903 }
1904 }
1905
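// Convert 16 pixels of NV12 (Y plane plus interleaved UV) to ARGB per loop
// iteration using the YUVTORGB helper and a constant 0xFF alpha.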
1906 void NV12ToARGBRow_LASX(const uint8_t* src_y,
1907 const uint8_t* src_uv,
1908 uint8_t* dst_argb,
1909 const struct YuvConstants* yuvconstants,
1910 int width) {
1911 int x;
1912 int len = width / 16;
1913 __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
1914 __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
1915 __m256i out_b, out_g, out_r;
1916 __m256i const_0x80 = __lasx_xvldi(0x80);
1917 __m256i alpha = __lasx_xvldi(0xFF);
1918
1919 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
1920 vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
1921 vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
1922
1923 for (x = 0; x < len; x++) {
1924 vec_y = __lasx_xvld(src_y, 0);
1925 vec_vu = __lasx_xvld(src_uv, 0);
1926 vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
1927 vec_vu = __lasx_vext2xv_h_b(vec_vu);
1928 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
1929 out_b);
1930 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1931 src_y += 16;
1932 src_uv += 16;
1933 }
1934 }
1935
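// Convert 16 pixels of NV12 to RGB565 per loop iteration: YUVTORGB produces
// 8-bit B/G/R, which are then truncated to 5/6/5 bits and packed as
//   rgb565 = ((R >> 3) << 11) | ((G >> 2) << 5) | (B >> 3)
// (an equivalent per-pixel form of the shifts below).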
1936 void NV12ToRGB565Row_LASX(const uint8_t* src_y,
1937 const uint8_t* src_uv,
1938 uint8_t* dst_rgb565,
1939 const struct YuvConstants* yuvconstants,
1940 int width) {
1941 int x;
1942 int len = width / 16;
1943 __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
1944 __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
1945 __m256i out_b, out_g, out_r;
1946 __m256i const_0x80 = __lasx_xvldi(0x80);
1947
1948 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
1949 vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);
1950 vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);
1951
1952 for (x = 0; x < len; x++) {
1953 vec_y = __lasx_xvld(src_y, 0);
1954 vec_vu = __lasx_xvld(src_uv, 0);
1955 vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
1956 vec_vu = __lasx_vext2xv_h_b(vec_vu);
1957 YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
1958 out_b);
1959 out_b = __lasx_xvsrli_h(out_b, 3);
1960 out_g = __lasx_xvsrli_h(out_g, 2);
1961 out_r = __lasx_xvsrli_h(out_r, 3);
1962 out_g = __lasx_xvslli_h(out_g, 5);
1963 out_r = __lasx_xvslli_h(out_r, 11);
1964 out_r = __lasx_xvor_v(out_r, out_g);
1965 out_r = __lasx_xvor_v(out_r, out_b);
1966 __lasx_xvst(out_r, dst_rgb565, 0);
1967 src_y += 16;
1968 src_uv += 16;
1969 dst_rgb565 += 32;
1970 }
1971 }
1972
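// Same as NV12ToARGBRow_LASX but for NV21, where V precedes U in the
// interleaved plane; the operand order passed to YUVTORGB is adjusted
// accordingly.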
1973 void NV21ToARGBRow_LASX(const uint8_t* src_y,
1974 const uint8_t* src_uv,
1975 uint8_t* dst_argb,
1976 const struct YuvConstants* yuvconstants,
1977 int width) {
1978 int x;
1979 int len = width / 16;
1980 __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
1981 __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv;
1982 __m256i out_b, out_g, out_r;
1983 __m256i const_0x80 = __lasx_xvldi(0x80);
1984 __m256i alpha = __lasx_xvldi(0xFF);
1985
1986 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
1987 vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
1988 vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
1989
1990 for (x = 0; x < len; x++) {
1991 vec_y = __lasx_xvld(src_y, 0);
1992 vec_uv = __lasx_xvld(src_uv, 0);
1993 vec_uv = __lasx_xvsub_b(vec_uv, const_0x80);
1994 vec_uv = __lasx_vext2xv_h_b(vec_uv);
1995 YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g,
1996 out_r);
1997 STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1998 src_y += 16;
1999 src_uv += 16;
2000 }
2001 }
2002
2003 struct RgbConstants {
2004 uint8_t kRGBToY[4];
2005 uint16_t kAddY;
2006 uint16_t pad;
2007 };
2008
2009 // RGB to JPEG coefficients
2010 // B * 0.1140 coefficient = 29
2011 // G * 0.5870 coefficient = 150
2012 // R * 0.2990 coefficient = 77
2013 // Add 0.5 (in 8.8 fixed point) = 0x80
2014 static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
2015 128,
2016 0};
2017
2018 static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
2019
2020 // RGB to BT.601 coefficients
2021 // B * 0.1016 coefficient = 25
2022 // G * 0.5078 coefficient = 129
2023 // R * 0.2578 coefficient = 66
2024 // Add 16.5 (in 8.8 fixed point) = 0x1080
2025
2026 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
2027 0x1080,
2028 0};
2029
2030 static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
2031 0x1080,
2032 0};
2033
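// The matrix row functions below compute, per pixel (an equivalent scalar
// form of the vector math):
//   Y = (kRGBToY[0] * c0 + kRGBToY[1] * c1 + kRGBToY[2] * c2 + kAddY) >> 8
// where c0..c2 are the pixel's three color channels in memory order (skipping
// alpha) and kAddY folds in the rounding term (plus the +16 offset for the
// BT.601 variants).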
2034 // ARGB expects the first 3 values to contain RGB; the 4th value is ignored.
2035 static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
2036 uint8_t* dst_y,
2037 int width,
2038 const struct RgbConstants* rgbconstants) {
2039 int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
2040 asm volatile(
2041 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
2042 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
2043 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
2044 "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
2045 "xvld $xr20, %4, 0 \n\t" // load shuff
2046 "1: \n\t"
2047 "xvld $xr4, %0, 0 \n\t"
2048 "xvld $xr5, %0, 32 \n\t"
2049 "xvld $xr6, %0, 64 \n\t"
2050 "xvld $xr7, %0, 96 \n\t" // load 32 pixels of
2051 // ARGB
2052 "xvor.v $xr12, $xr3, $xr3 \n\t"
2053 "xvor.v $xr13, $xr3, $xr3 \n\t"
2054 "addi.d %2, %2, -32 \n\t" // 32 processed per
2055 // loop.
2056 "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR
2057 "xvpickev.b $xr10, $xr7, $xr6 \n\t"
2058 "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA
2059 "xvpickod.b $xr11, $xr7, $xr6 \n\t"
2060 "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B
2061 "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t"
2062 "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G
2063 "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t"
2064 "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R
2065 "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t"
2066 "addi.d %0, %0, 128 \n\t"
2067 "xvpickod.b $xr10, $xr13, $xr12 \n\t"
2068 "xvperm.w $xr11, $xr10, $xr20 \n\t"
2069 "xvst $xr11, %1, 0 \n\t"
2070 "addi.d %1, %1, 32 \n\t"
2071 "bnez %2, 1b \n\t"
2072 : "+&r"(src_argb), // %0
2073 "+&r"(dst_y), // %1
2074 "+&r"(width) // %2
2075 : "r"(rgbconstants), "r"(shuff)
2076 : "memory");
2077 }
2078
2079 void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
2080 ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants);
2081 }
2082
2083 void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
2084 ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
2085 }
2086
2087 void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
2088 ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants);
2089 }
2090
2091 void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
2092 ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, &kRawJPEGConstants);
2093 }
2094
2095 // RGBA expects the first value to be A (ignored), then 3 values to contain RGB.
2096 // Same code as ARGB, except for the channel order after the byte de-interleave.
2097 static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
2098 uint8_t* dst_y,
2099 int width,
2100 const struct RgbConstants* rgbconstants) {
2101 int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
2102 asm volatile(
2103 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
2104 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
2105 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
2106 "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
2107 "xvld $xr20, %4, 0 \n\t" // load shuff
2108 "1: \n\t"
2109 "xvld $xr4, %0, 0 \n\t"
2110 "xvld $xr5, %0, 32 \n\t"
2111 "xvld $xr6, %0, 64 \n\t"
2112 "xvld $xr7, %0, 96 \n\t" // load 32 pixels of
2113 // RGBA
2114 "xvor.v $xr12, $xr3, $xr3 \n\t"
2115 "xvor.v $xr13, $xr3, $xr3 \n\t"
2116 "addi.d %2, %2, -32 \n\t" // 32 processed per
2117 // loop.
2118 "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG
2119 "xvpickev.b $xr10, $xr7, $xr6 \n\t"
2120 "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR
2121 "xvpickod.b $xr11, $xr7, $xr6 \n\t"
2122 "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B
2123 "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t"
2124 "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G
2125 "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t"
2126 "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R
2127 "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t"
2128 "addi.d %0, %0, 128 \n\t"
2129 "xvpickod.b $xr10, $xr13, $xr12 \n\t"
2130 "xvperm.w $xr11, $xr10, $xr20 \n\t"
2131 "xvst $xr11, %1, 0 \n\t"
2132 "addi.d %1, %1, 32 \n\t"
2133 "bnez %2, 1b \n\t"
2134 : "+&r"(src_rgba), // %0
2135 "+&r"(dst_y), // %1
2136 "+&r"(width) // %2
2137 : "r"(rgbconstants), "r"(shuff)
2138 : "memory");
2139 }
2140
2141 void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
2142 RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants);
2143 }
2144
2145 void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
2146 RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
2147 }
2148
2149 void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
2150 RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants);
2151 }
2152
2153 static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
2154 uint8_t* dst_y,
2155 int width,
2156 const struct RgbConstants* rgbconstants) {
2157 int8_t shuff[128] = {
2158 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
2159 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
2160 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
2161 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
2162 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
2163 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
2164 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
2165 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
2166 asm volatile(
2167 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
2168 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
2169 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
2170 "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants
2171 "xvld $xr4, %4, 0 \n\t" // load shuff
2172 "xvld $xr5, %4, 32 \n\t"
2173 "xvld $xr6, %4, 64 \n\t"
2174 "xvld $xr7, %4, 96 \n\t"
2175 "1: \n\t"
2176 "xvld $xr8, %0, 0 \n\t"
2177 "xvld $xr9, %0, 32 \n\t"
2178 "xvld $xr10, %0, 64 \n\t" // load 32 pixels of
2179 // RGB
2180 "xvor.v $xr12, $xr3, $xr3 \n\t"
2181 "xvor.v $xr13, $xr3, $xr3 \n\t"
2182 "xvor.v $xr11, $xr9, $xr9 \n\t"
2183 "addi.d %2, %2, -32 \n\t" // 32 processed per
2184 // loop.
2185 "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0
2186 "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1
2187 "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2
2188 "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t"
2189 "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t"
2190 "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t"
2191 "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t"
2192 "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G
2193 "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t"
2194 "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B
2195 "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t"
2196 "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R
2197 "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t"
2198 "addi.d %0, %0, 96 \n\t"
2199 "xvpickod.b $xr10, $xr13, $xr12 \n\t"
2200 "xvst $xr10, %1, 0 \n\t"
2201 "addi.d %1, %1, 32 \n\t"
2202 "bnez %2, 1b \n\t"
2203 : "+&r"(src_rgba), // %0
2204 "+&r"(dst_y), // %1
2205 "+&r"(width) // %2
2206 : "r"(rgbconstants), // %3
2207 "r"(shuff) // %4
2208 : "memory");
2209 }
2210
2211 void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
2212 RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
2213 }
2214
2215 void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
2216 RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants);
2217 }
2218
2219 void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
2220 RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants);
2221 }
2222
2223 void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
2224 RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants);
2225 }
2226
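// Compute one row of U and V from two rows of ARGB pixels (32 per row) per
// loop iteration using JPEG-style coefficients: each 2x2 block's channels are
// summed and halved, scaled by the constants (63, 42, 21, 53, 10) around the
// 0x8080 bias, and the high byte of each 16-bit result is taken as output.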
2227 void ARGBToUVJRow_LASX(const uint8_t* src_argb,
2228 int src_stride_argb,
2229 uint8_t* dst_u,
2230 uint8_t* dst_v,
2231 int width) {
2232 int x;
2233 const uint8_t* next_argb = src_argb + src_stride_argb;
2234 int len = width / 32;
2235 __m256i src0, src1, src2, src3;
2236 __m256i nex0, nex1, nex2, nex3;
2237 __m256i tmp0, tmp1, tmp2, tmp3;
2238 __m256i reg0, reg1, dst0;
2239 __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
2240 __m256i const_63 = __lasx_xvldi(0x43F);
2241 __m256i const_42 = __lasx_xvldi(0x42A);
2242 __m256i const_21 = __lasx_xvldi(0x415);
2243 __m256i const_53 = __lasx_xvldi(0x435);
2244 __m256i const_10 = __lasx_xvldi(0x40A);
2245 __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
2246 0x8080808080808080, 0x8080808080808080};
2247 __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
2248 0x1F1D0F0D1B190B09};
2249
2250 for (x = 0; x < len; x++) {
2251 DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
2252 96, src0, src1, src2, src3);
2253 DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64,
2254 next_argb, 96, nex0, nex1, nex2, nex3);
2255 tmp0 = __lasx_xvpickev_b(src1, src0);
2256 tmp1 = __lasx_xvpickod_b(src1, src0);
2257 tmp2 = __lasx_xvpickev_b(src3, src2);
2258 tmp3 = __lasx_xvpickod_b(src3, src2);
2259 tmpr = __lasx_xvpickod_b(tmp2, tmp0);
2260 tmpb = __lasx_xvpickev_b(tmp2, tmp0);
2261 tmpg = __lasx_xvpickev_b(tmp3, tmp1);
2262 tmp0 = __lasx_xvpickev_b(nex1, nex0);
2263 tmp1 = __lasx_xvpickod_b(nex1, nex0);
2264 tmp2 = __lasx_xvpickev_b(nex3, nex2);
2265 tmp3 = __lasx_xvpickod_b(nex3, nex2);
2266 nexr = __lasx_xvpickod_b(tmp2, tmp0);
2267 nexb = __lasx_xvpickev_b(tmp2, tmp0);
2268 nexg = __lasx_xvpickev_b(tmp3, tmp1);
2269 tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb);
2270 tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb);
2271 tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg);
2272 tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg);
2273 reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr);
2274 reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr);
2275 tmpb = __lasx_xvavgr_hu(tmp0, tmp1);
2276 tmpg = __lasx_xvavgr_hu(tmp2, tmp3);
2277 tmpr = __lasx_xvavgr_hu(reg0, reg1);
2278 reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb);
2279 reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr);
2280 reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg);
2281 reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg);
2282 reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr);
2283 reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb);
2284 dst0 = __lasx_xvpackod_b(reg1, reg0);
2285 tmp0 = __lasx_xvpermi_d(dst0, 0x44);
2286 tmp1 = __lasx_xvpermi_d(dst0, 0xEE);
2287 dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff);
2288 __lasx_xvstelm_d(dst0, dst_u, 0, 0);
2289 __lasx_xvstelm_d(dst0, dst_v, 0, 2);
2290 __lasx_xvstelm_d(dst0, dst_u, 8, 1);
2291 __lasx_xvstelm_d(dst0, dst_v, 8, 3);
2292 dst_u += 16;
2293 dst_v += 16;
2294 src_argb += 128;
2295 next_argb += 128;
2296 }
2297 }
2298
2299 #ifdef __cplusplus
2300 } // extern "C"
2301 } // namespace libyuv
2302 #endif
2303
2304 #endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
2305