/*
 *  Copyright 2022 The LibYuv Project Authors. All rights reserved.
 *
 *  Copyright (c) 2022 Loongson Technology Corporation Limited
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "libyuv/scale_row.h"

#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
#include "libyuv/loongson_intrinsics.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

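// Gathers four 32-bit elements: reads _src at the four word indices held in
// the lanes of _in and packs the loaded values into the four lanes of _out.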
#define LOAD_DATA(_src, _in, _out)                                       \
  {                                                                      \
    int _tmp1, _tmp2, _tmp3, _tmp4;                                      \
    DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, _tmp1, \
              _tmp2, _tmp3, _tmp4);                                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0);                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1);                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2);                      \
    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3);                      \
  }

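// Halves an ARGB row by keeping every second pixel (the odd-indexed ones);
// handles 4 destination pixels (16 bytes) per iteration.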
void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
                           ptrdiff_t src_stride,
                           uint8_t* dst_argb,
                           int dst_width) {
  int x;
  int len = dst_width / 4;
  (void)src_stride;
  __m128i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    dst0 = __lsx_vpickod_w(src1, src0);
    __lsx_vst(dst0, dst_argb, 0);
    src_argb += 32;
    dst_argb += 16;
  }
}

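// Halves an ARGB row by averaging each horizontal pair of pixels with
// rounding; handles 4 destination pixels per iteration.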
void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  int len = dst_width / 4;
  (void)src_stride;
  __m128i src0, src1, tmp0, tmp1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    tmp0 = __lsx_vpickev_w(src1, src0);
    tmp1 = __lsx_vpickod_w(src1, src0);
    dst0 = __lsx_vavgr_bu(tmp1, tmp0);
    __lsx_vst(dst0, dst_argb, 0);
    src_argb += 32;
    dst_argb += 16;
  }
}

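// Halves an ARGB row with a 2x2 box filter: each destination pixel is the
// rounded average of a 2x2 block spanning this row and the next.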
void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  int len = dst_width / 4;
  const uint8_t* s = src_argb;
  const uint8_t* t = src_argb + src_stride;
  __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i reg0, reg1, reg2, reg3;
  __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1);
    DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3);
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2,
              shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
              tmp3, reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1);
    dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
    __lsx_vst(dst0, dst_argb, 0);
    s += 32;
    t += 32;
    dst_argb += 16;
  }
}

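// Copies one ARGB pixel from every src_stepx source pixels; handles 4
// destination pixels per iteration.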
void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              int32_t src_stepx,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  int len = dst_width / 4;
  int32_t stepx = src_stepx << 2;
  (void)src_stride;
  __m128i dst0, dst1, dst2, dst3;

  for (x = 0; x < len; x++) {
    dst0 = __lsx_vldrepl_w(src_argb, 0);
    src_argb += stepx;
    dst1 = __lsx_vldrepl_w(src_argb, 0);
    src_argb += stepx;
    dst2 = __lsx_vldrepl_w(src_argb, 0);
    src_argb += stepx;
    dst3 = __lsx_vldrepl_w(src_argb, 0);
    src_argb += stepx;
    __lsx_vstelm_w(dst0, dst_argb, 0, 0);
    __lsx_vstelm_w(dst1, dst_argb, 4, 0);
    __lsx_vstelm_w(dst2, dst_argb, 8, 0);
    __lsx_vstelm_w(dst3, dst_argb, 12, 0);
    dst_argb += 16;
  }
}

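// Like ScaleARGBRowDownEven, but each destination pixel is the rounded
// average of a 2x2 box: a horizontal pixel pair from this row plus the
// matching pair from the next row.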
void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  int len = dst_width / 4;
  int32_t stepx = src_stepx * 4;
  const uint8_t* next_argb = src_argb + src_stride;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i reg0, reg1, dst0;

  for (x = 0; x < len; x++) {
    tmp0 = __lsx_vldrepl_d(src_argb, 0);
    src_argb += stepx;
    tmp1 = __lsx_vldrepl_d(src_argb, 0);
    src_argb += stepx;
    tmp2 = __lsx_vldrepl_d(src_argb, 0);
    src_argb += stepx;
    tmp3 = __lsx_vldrepl_d(src_argb, 0);
    src_argb += stepx;
    tmp4 = __lsx_vldrepl_d(next_argb, 0);
    next_argb += stepx;
    tmp5 = __lsx_vldrepl_d(next_argb, 0);
    next_argb += stepx;
    tmp6 = __lsx_vldrepl_d(next_argb, 0);
    next_argb += stepx;
    tmp7 = __lsx_vldrepl_d(next_argb, 0);
    next_argb += stepx;
    DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
    DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
    DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1);
    DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5);
    DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1);
    dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
    dst0 = __lsx_vshuf4i_b(dst0, 0xD8);
    __lsx_vst(dst0, dst_argb, 0);
    dst_argb += 16;
  }
}

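// Halves a plane row by keeping every second byte (the odd-indexed ones);
// handles 32 destination pixels per iteration.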
void ScaleRowDown2_LSX(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  int len = dst_width / 32;
  __m128i src0, src1, src2, src3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1);
    __lsx_vst(dst0, dst, 0);
    __lsx_vst(dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

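// Halves a plane row by averaging each adjacent byte pair with rounding.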
void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  int x;
  int len = dst_width / 32;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
    DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
    DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1);
    __lsx_vst(dst0, dst, 0);
    __lsx_vst(dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

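// Halves a plane row with a 2x2 box filter over this row and the next:
// each destination pixel is (sum of 4 source bytes + 2) >> 2.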
void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  int len = dst_width / 32;
  const uint8_t* src_nex = src_ptr + src_stride;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
              src4, src5, src6, src7);
    DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
              src7, tmp0, tmp2, tmp4, tmp6);
    DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
              src7, tmp1, tmp3, tmp5, tmp7);
    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1);
    __lsx_vst(dst0, dst, 0);
    __lsx_vst(dst1, dst, 16);
    src_ptr += 64;
    src_nex += 64;
    dst += 32;
  }
}

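// Quarters a plane row by keeping every fourth byte; handles 16 destination
// pixels per iteration.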
void ScaleRowDown4_LSX(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  int len = dst_width / 16;
  __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
  (void)src_stride;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1);
    dst0 = __lsx_vpickod_b(tmp1, tmp0);
    __lsx_vst(dst0, dst, 0);
    src_ptr += 64;
    dst += 16;
  }
}

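// Quarters a plane row with a 4x4 box filter: sums a 4x4 block of bytes from
// four rows and divides by 16 with rounding.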
void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  int len = dst_width / 16;
  const uint8_t* ptr1 = src_ptr + src_stride;
  const uint8_t* ptr2 = ptr1 + src_stride;
  const uint8_t* ptr3 = ptr2 + src_stride;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5,
              src6, src7);
    DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
              src7, tmp0, tmp2, tmp4, tmp6);
    DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
              src7, tmp1, tmp3, tmp5, tmp7);
    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
              reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1,
              src2, src3);
    DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5,
              src6, src7);
    DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
              src7, tmp0, tmp2, tmp4, tmp6);
    DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
              src7, tmp1, tmp3, tmp5, tmp7);
    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
              reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
              reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
              reg3, reg0, reg1, reg2, reg3);
    DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1);
    dst0 = __lsx_vpickev_b(tmp1, tmp0);
    __lsx_vst(dst0, dst, 0);
    src_ptr += 64;
    ptr1 += 64;
    ptr2 += 64;
    ptr3 += 64;
    dst += 16;
  }
}

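// Scales a plane row down to 3/8 width by point sampling: picks 12 of every
// 32 source bytes using a byte shuffle.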
void ScaleRowDown38_LSX(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  int x, len;
  __m128i src0, src1, tmp0;
  __m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816};

  assert(dst_width % 3 == 0);
  len = dst_width / 12;
  (void)src_stride;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
    tmp0 = __lsx_vshuf_b(src1, src0, shuff);
    __lsx_vstelm_d(tmp0, dst, 0, 0);
    __lsx_vstelm_w(tmp0, dst, 8, 2);
    src_ptr += 32;
    dst += 12;
  }
}

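// Scales two rows down to 3/8 width with a box filter: sums small source
// boxes spanning both rows, then forms averages with fixed-point reciprocal
// multiplies (0x2AAA ~= 65536/6 for the 3x2 boxes; the 2x2 sums are scaled
// by 0x4000 and the result bytes extracted by the final shuffle).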
void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, len;
  const uint8_t* src_nex = src_ptr + src_stride;
  __m128i src0, src1, src2, src3, dst0;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i reg0, reg1, reg2, reg3;
  __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
  __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA);
  __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  len = dst_width / 12;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
    DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
    DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
    DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
    tmp4 = __lsx_vpickev_w(reg3, reg2);
    tmp5 = __lsx_vadd_h(reg0, reg1);
    tmp6 = __lsx_vadd_h(tmp5, tmp4);
    tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA);
    tmp0 = __lsx_vpickod_w(reg3, reg2);
    tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
    tmp2 = __lsx_vmul_w(tmp1, const_0x4000);
    dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
    __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
    __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
    src_ptr += 32;
    src_nex += 32;
    dst_ptr += 12;
  }
}

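// Scales three rows down to 3/8 width with a box filter, averaging 3x3
// source boxes (3x2 for every third output pixel) via fixed-point
// reciprocal multiplies (0x1C71 ~= 65536/9, 0x2AAA ~= 65536/6).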
void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, len;
  const uint8_t* ptr1 = src_ptr + src_stride;
  const uint8_t* ptr2 = ptr1 + src_stride;
  __m128i src0, src1, src2, src3, src4, src5;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i reg0, reg1, reg2, reg3, dst0;
  __m128i zero = __lsx_vldi(0);
  __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
  __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71);
  __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  len = dst_width / 12;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0,
              src1, src2, src3);
    DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5);
    DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
    DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
    DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6);
    DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7);
    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
    DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
    tmp4 = __lsx_vpickev_w(reg3, reg2);
    tmp5 = __lsx_vadd_h(reg0, reg1);
    tmp6 = __lsx_vadd_h(tmp5, tmp4);
    tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71);
    tmp0 = __lsx_vpickod_w(reg3, reg2);
    tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
    tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA);
    dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
    __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
    __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
    src_ptr += 32;
    ptr1 += 32;
    ptr2 += 32;
    dst_ptr += 12;
  }
}

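// Widens a row of bytes to 16 bits and adds it to the destination
// accumulator row; handles 16 pixels per iteration.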
void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  int x;
  int len = src_width / 16;
  __m128i src0, tmp0, tmp1, dst0, dst1;
  __m128i zero = __lsx_vldi(0);

  assert(src_width > 0);

  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_ptr, 0);
    DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1);
    tmp0 = __lsx_vilvl_b(zero, src0);
    tmp1 = __lsx_vilvh_b(zero, src0);
    DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1);
    __lsx_vst(dst0, dst_ptr, 0);
    __lsx_vst(dst1, dst_ptr, 16);
    src_ptr += 16;
    dst_ptr += 16;
  }
}

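// Bilinearly filters a row horizontally: x and dx are 16.16 fixed-point
// source positions and steps; each destination pixel blends the two nearest
// source pixels by the 7-bit fractional weight. Handles 16 pixels per
// iteration.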
void ScaleFilterCols_LSX(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         int dst_width,
                         int x,
                         int dx) {
  int j;
  int len = dst_width / 16;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128i vec0, vec1, dst0;
  __m128i vec_x = __lsx_vreplgr2vr_w(x);
  __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
  __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF);
  __m128i const2 = __lsx_vreplgr2vr_w(0x40);
  __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};

  vec0 = __lsx_vmul_w(vec_dx, const_tmp);
  vec1 = __lsx_vslli_w(vec_dx, 2);
  vec_x = __lsx_vadd_w(vec_x, vec0);

  for (j = 0; j < len; j++) {
    tmp0 = __lsx_vsrai_w(vec_x, 16);
    tmp4 = __lsx_vand_v(vec_x, const1);
    vec_x = __lsx_vadd_w(vec_x, vec1);
    tmp1 = __lsx_vsrai_w(vec_x, 16);
    tmp5 = __lsx_vand_v(vec_x, const1);
    vec_x = __lsx_vadd_w(vec_x, vec1);
    tmp2 = __lsx_vsrai_w(vec_x, 16);
    tmp6 = __lsx_vand_v(vec_x, const1);
    vec_x = __lsx_vadd_w(vec_x, vec1);
    tmp3 = __lsx_vsrai_w(vec_x, 16);
    tmp7 = __lsx_vand_v(vec_x, const1);
    vec_x = __lsx_vadd_w(vec_x, vec1);
    DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5,
              tmp6, tmp7);
    LOAD_DATA(src_ptr, tmp0, reg0);
    LOAD_DATA(src_ptr, tmp1, reg1);
    LOAD_DATA(src_ptr, tmp2, reg2);
    LOAD_DATA(src_ptr, tmp3, reg3);
    DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1,
              tmp2, tmp3);
    LOAD_DATA(src_ptr, tmp0, reg4);
    LOAD_DATA(src_ptr, tmp1, reg5);
    LOAD_DATA(src_ptr, tmp2, reg6);
    LOAD_DATA(src_ptr, tmp3, reg7);
    DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3,
              reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7,
              reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7,
              const2, reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5,
              reg6, reg7);
    DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1);
    dst0 = __lsx_vpickev_b(tmp1, tmp0);
    __lsx_vst(dst0, dst_ptr, 0);
    dst_ptr += 16;
  }
}

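// Point-samples ARGB pixels at 16.16 fixed-point positions x + j * dx;
// gathers 4 destination pixels per iteration.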
void ScaleARGBCols_LSX(uint8_t* dst_argb,
                       const uint8_t* src_argb,
                       int dst_width,
                       int x,
                       int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  int len = dst_width / 4;
  __m128i tmp0, tmp1, tmp2, dst0;
  __m128i vec_x = __lsx_vreplgr2vr_w(x);
  __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
  __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};

  tmp0 = __lsx_vmul_w(vec_dx, const_tmp);
  tmp1 = __lsx_vslli_w(vec_dx, 2);
  vec_x = __lsx_vadd_w(vec_x, tmp0);

  for (j = 0; j < len; j++) {
    tmp2 = __lsx_vsrai_w(vec_x, 16);
    vec_x = __lsx_vadd_w(vec_x, tmp1);
    LOAD_DATA(src, tmp2, dst0);
    __lsx_vst(dst0, dst, 0);
    dst += 4;
  }
}

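// Bilinearly filters ARGB pixels at 16.16 fixed-point positions: each
// destination pixel is a 7-bit weighted blend of the two nearest source
// pixels. Handles 8 pixels per iteration.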
void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
                             const uint8_t* src_argb,
                             int dst_width,
                             int x,
                             int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  int j;
  int len = dst_width / 8;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128i vec0, vec1, dst0, dst1;
  __m128i vec_x = __lsx_vreplgr2vr_w(x);
  __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
  __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
  __m128i const_7f = __lsx_vldi(0x7F);

  vec0 = __lsx_vmul_w(vec_dx, const_tmp);
  vec1 = __lsx_vslli_w(vec_dx, 2);
  vec_x = __lsx_vadd_w(vec_x, vec0);

  for (j = 0; j < len; j++) {
    tmp0 = __lsx_vsrai_w(vec_x, 16);
    reg0 = __lsx_vsrai_w(vec_x, 9);
    vec_x = __lsx_vadd_w(vec_x, vec1);
    tmp1 = __lsx_vsrai_w(vec_x, 16);
    reg1 = __lsx_vsrai_w(vec_x, 9);
    vec_x = __lsx_vadd_w(vec_x, vec1);
    DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1);
    DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1);
    DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6);
    DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7);
    LOAD_DATA(src, tmp0, src0);
    LOAD_DATA(src, tmp1, src1);
    DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1);
    LOAD_DATA(src, tmp0, src2);
    LOAD_DATA(src, tmp1, src3);
    DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6);
    DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7);
    DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    dst_argb += 32;
  }
}

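// Scales a plane row down to 3/4 width by point sampling: keeps 48 of every
// 64 source bytes using byte shuffles.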
void ScaleRowDown34_LSX(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  int x;
  (void)src_stride;
  __m128i src0, src1, src2, src3;
  __m128i dst0, dst1, dst2;
  __m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B};
  __m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110};
  __m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715};

  assert((dst_width % 3 == 0) && (dst_width > 0));

  for (x = 0; x < dst_width; x += 48) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1, dst0,
              dst1);
    dst2 = __lsx_vshuf_b(src3, src2, shuff2);
    __lsx_vst(dst0, dst, 0);
    __lsx_vst(dst1, dst, 16);
    __lsx_vst(dst2, dst, 32);
    src_ptr += 64;
    dst += 48;
  }
}

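// Scales a row down to 3/4 width with filtering, blending this row with the
// next at a 3:1 weight: dst = (3 * filtered_row0 + filtered_row1 + 2) >> 2.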
void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* d,
                              int dst_width) {
  const uint8_t* src_nex = src_ptr + src_stride;
  int x;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
  __m128i tmp10, tmp11, dst0, dst1, dst2;
  __m128i const0 = {0x0103030101010103, 0x0101010303010101};
  __m128i const1 = {0x0301010101030301, 0x0103030101010103};
  __m128i const2 = {0x0101010303010101, 0x0301010101030301};
  __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
  __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
  __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
  __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
  __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
  __m128i shift2 = {0x0001000200020001, 0x0002000100020002};

  assert((dst_width % 3 == 0) && (dst_width > 0));

  for (x = 0; x < dst_width; x += 48) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
              src4, src5, src6, src7);
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
              shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
              shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
              shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
    DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
              const0, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
              const1, src4, src5, src6, src7);
    DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
              const2, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
              shift0, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
              shift1, src4, src5, src6, src7);
    DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
              shift2, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6,
              tmp7, tmp8);
    DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10);
    DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5);
    DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
    DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1);
    dst2 = __lsx_vsrarni_b_h(src5, src4, 2);
    __lsx_vst(dst0, d, 0);
    __lsx_vst(dst1, d, 16);
    __lsx_vst(dst2, d, 32);
    src_ptr += 64;
    src_nex += 64;
    d += 48;
  }
}

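// Scales a row down to 3/4 width with filtering, blending this row and the
// next with equal weight: dst = (filtered_row0 + filtered_row1 + 1) >> 1.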
void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* d,
                              int dst_width) {
  const uint8_t* src_nex = src_ptr + src_stride;
  int x;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
  __m128i tmp10, tmp11, dst0, dst1, dst2;
  __m128i const0 = {0x0103030101010103, 0x0101010303010101};
  __m128i const1 = {0x0301010101030301, 0x0103030101010103};
  __m128i const2 = {0x0101010303010101, 0x0301010101030301};
  __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
  __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
  __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
  __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
  __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
  __m128i shift2 = {0x0001000200020001, 0x0002000100020002};

  assert((dst_width % 3 == 0) && (dst_width > 0));

  for (x = 0; x < dst_width; x += 48) {
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
              src4, src5, src6, src7);
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
              shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
              shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
              shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
    DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
              const0, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
              const1, src4, src5, src6, src7);
    DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
              const2, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
              shift0, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
              shift1, src4, src5, src6, src7);
    DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
              shift2, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
    DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1);
    dst2 = __lsx_vsrarni_b_h(src5, src4, 1);
    __lsx_vst(dst0, d, 0);
    __lsx_vst(dst1, d, 16);
    __lsx_vst(dst2, d, 32);
    src_ptr += 64;
    src_nex += 64;
    d += 48;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)