/* filter_lsx_intrinsics.c - LSX optimized filter functions
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * All rights reserved.
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2016 Glenn Randers-Pehrson
 * Contributed by Jin Bo ([email protected])
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

#if PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <lsxintrin.h>

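/* Vector load/store and byte-add helpers.  Each __m128i holds 16 bytes, so
 * the *_2 and *_4 variants move 32 and 64 bytes per call respectively.
 */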
#define LSX_LD(psrc) __lsx_vld((psrc), 0)

#define LSX_LD_2(psrc, stride, out0, out1)          \
{                                                   \
   out0 = LSX_LD(psrc);                             \
   out1 = LSX_LD(psrc + stride);                    \
}

#define LSX_LD_4(psrc, stride, out0, out1, out2, out3)    \
{                                                         \
   LSX_LD_2(psrc, stride, out0, out1);                    \
   LSX_LD_2(psrc + stride * 2, stride, out2, out3);       \
}

#define LSX_ST(in, pdst) __lsx_vst(in, (pdst), 0)

#define LSX_ST_2(in0, in1, pdst, stride)            \
{                                                   \
   LSX_ST(in0, pdst);                               \
   LSX_ST(in1, pdst + stride);                      \
}

#define LSX_ST_4(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
   LSX_ST_2(in0, in1, pdst, stride);                      \
   LSX_ST_2(in2, in3, pdst + stride * 2, stride);         \
}

#define LSX_ADD_B(in0, in1, out0)                   \
{                                                   \
   out0 = __lsx_vadd_b(in0, in1);                   \
}

#define LSX_ADD_B_2(in0, in1, in2, in3, out0, out1)       \
{                                                         \
   LSX_ADD_B(in0, in1, out0);                             \
   LSX_ADD_B(in2, in3, out1);                             \
}

#define LSX_ADD_B_4(in0, in1, in2, in3, in4, in5,         \
                    in6, in7, out0, out1, out2, out3)     \
{                                                         \
   LSX_ADD_B_2(in0, in1, in2, in3, out0, out1);           \
   LSX_ADD_B_2(in4, in5, in6, in7, out2, out3);           \
}

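/* Helpers used by the Paeth filters: per-element absolute value (via
 * __lsx_vadda_h against a 'zero' vector taken from the enclosing scope),
 * low-half byte interleave, and widening horizontal subtraction within
 * each interleaved byte pair.
 */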
#define LSX_ABS_B_3(in0, in1, in2, out0, out1, out2)      \
{                                                         \
   out0 = __lsx_vadda_h(in0, zero);                       \
   out1 = __lsx_vadda_h(in1, zero);                       \
   out2 = __lsx_vadda_h(in2, zero);                       \
}

#define LSX_ILVL_B(in_h, in_l, out0)                \
{                                                   \
   out0 = __lsx_vilvl_b(in_h, in_l);                \
}

#define LSX_ILVL_B_2(in0_h, in0_l, in1_h, in1_l, out0, out1)    \
{                                                               \
   LSX_ILVL_B(in0_h, in0_l, out0);                              \
   LSX_ILVL_B(in1_h, in1_l, out1);                              \
}

#define LSX_HSUB_HU_BU_2(in0, in1, out0, out1)      \
{                                                   \
   out0 = __lsx_vhsubw_hu_bu(in0, in0);             \
   out1 = __lsx_vhsubw_hu_bu(in1, in1);             \
}

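/* Paeth predictor selection: in0/in1/in2 carry the distances pa/pb/pc (as
 * halfwords) and in3/in4/in5 the candidate bytes a/b/c.  The candidate with
 * the smallest distance (ties favouring a, then b, as the PNG specification
 * requires) is added to the raw byte already held in out0.
 */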
#define LSX_CMP_PICK_SMALLER(in0, in1, in2, in3, in4, in5, out0)    \
{                                                                   \
   __m128i _cmph, _cmpb, _in0, _in3;                                \
   _cmph = __lsx_vslt_h(in1, in0);                                  \
   _cmpb = __lsx_vpickev_b(_cmph, _cmph);                           \
   _in0  = __lsx_vmin_bu(in0, in1);                                 \
   _in3  = __lsx_vbitsel_v(in3, in4, _cmpb);                        \
   _cmph = __lsx_vslt_h(in2, _in0);                                 \
   _cmpb = __lsx_vpickev_b(_cmph, _cmph);                           \
   _in3  = __lsx_vbitsel_v(_in3, in5, _cmpb);                       \
   out0  = __lsx_vadd_b(out0, _in3);                                \
}

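/* 'Up' filter: each byte is the sum of the raw byte and the byte directly
 * above it.  There is no dependency between bytes of the same row, so the
 * row is processed in 64-, 32-, 16- and 8-byte vector chunks, with a scalar
 * loop for whatever remains.
 */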
void png_read_filter_row_up_lsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   __m128i vec_0, vec_1, vec_2, vec_3;
   __m128i vec_4, vec_5, vec_6, vec_7;

   while (n >= 64)
   {
      LSX_LD_4(rp, 16, vec_0, vec_1, vec_2, vec_3);
      LSX_LD_4(pp, 16, vec_4, vec_5, vec_6, vec_7);
      pp += 64;
      LSX_ADD_B_4(vec_0, vec_4, vec_1, vec_5, vec_2, vec_6,
                  vec_3, vec_7, vec_0, vec_1, vec_2, vec_3);
      LSX_ST_4(vec_0, vec_1, vec_2, vec_3, rp, 16);
      rp += 64;
      n -= 64;
   }
   if (n & 63)
   {
      if (n >= 32)
      {
         LSX_LD_2(rp, 16, vec_0, vec_1);
         LSX_LD_2(pp, 16, vec_2, vec_3);
         pp += 32;
         LSX_ADD_B_2(vec_0, vec_2, vec_1, vec_3, vec_0, vec_1);
         LSX_ST_2(vec_0, vec_1, rp, 16);
         rp += 32;
         n -= 32;
      }
      if (n & 31)
      {
         if (n >= 16)
         {
            vec_0 = LSX_LD(rp);
            vec_1 = LSX_LD(pp);
            pp += 16;
            LSX_ADD_B(vec_0, vec_1, vec_0);
            LSX_ST(vec_0, rp);
            rp += 16;
            n -= 16;
         }
         if (n >= 8)
         {
            vec_0 = __lsx_vldrepl_d(rp, 0);
            vec_1 = __lsx_vldrepl_d(pp, 0);
            vec_0 = __lsx_vadd_b(vec_0, vec_1);
            __lsx_vstelm_d(vec_0, rp, 0, 0);
            rp += 8;
            pp += 8;
            n -= 8;
         }
         while (n--)
         {
            *rp = *rp + *pp++;
            rp++;
         }
      }
   }
}

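/* 'Sub' filter, 3 bytes per pixel (RGB): each byte is the sum of the raw
 * byte and the corresponding byte of the previous pixel, so pixels must be
 * reconstructed sequentially.  Each 3-byte pixel is stored as a halfword
 * element followed by one byte element.
 */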
void png_read_filter_row_sub3_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_uint_32 tmp;
   png_bytep nxt = row;
   __m128i vec_0, vec_1;

   PNG_UNUSED(prev_row);

   vec_0 = __lsx_vldrepl_w(nxt, 0);
   nxt += 3;
   n -= 3;

   while (n >= 3)
   {
      vec_1 = __lsx_vldrepl_w(nxt, 0);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);
      __lsx_vstelm_h(vec_1, nxt, 0, 0);
      vec_0 = vec_1;
      nxt += 2;
      __lsx_vstelm_b(vec_1, nxt, 0, 2);
      nxt += 1;
      n -= 3;
   }

   row = nxt - 3;
   while (n--)
   {
      *nxt = *nxt + *row++;
      nxt++;
   }
}

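/* 'Sub' filter, 4 bytes per pixel (RGBA): same as above, but a whole pixel
 * fits in one word element, so no scalar tail loop is needed.
 */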
void png_read_filter_row_sub4_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   __m128i vec_0, vec_1;

   PNG_UNUSED(prev_row);

   vec_0 = __lsx_vldrepl_w(row, 0);
   row += 4;
   n -= 4;

   while (n >= 4)
   {
      vec_1 = __lsx_vldrepl_w(row, 0);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);
      __lsx_vstelm_w(vec_1, row, 0, 0);
      vec_0 = vec_1;
      row += 4;
      n -= 4;
   }
}

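/* 'Average' filter, 3 bytes per pixel: each byte is the sum of the raw byte
 * and the average (rounded down) of the byte to the left and the byte above.
 * The first pixel has no left neighbour, so only half of the byte above is
 * added.
 */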
void png_read_filter_row_avg3_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_bytep nxt = row;
   png_const_bytep prev_nxt = prev_row;
   __m128i vec_0, vec_1, vec_2;

   vec_0 = __lsx_vldrepl_w(nxt, 0);
   vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
   prev_nxt += 3;
   vec_1 = __lsx_vsrli_b(vec_1, 1);
   vec_1 = __lsx_vadd_b(vec_1, vec_0);
   __lsx_vstelm_h(vec_1, nxt, 0, 0);
   nxt += 2;
   __lsx_vstelm_b(vec_1, nxt, 0, 2);
   nxt += 1;
   n -= 3;

   while (n >= 3)
   {
      vec_2 = vec_1;
      vec_0 = __lsx_vldrepl_w(nxt, 0);
      vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
      prev_nxt += 3;

      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);

      __lsx_vstelm_h(vec_1, nxt, 0, 0);
      nxt += 2;
      __lsx_vstelm_b(vec_1, nxt, 0, 2);
      nxt += 1;
      n -= 3;
   }

   row = nxt - 3;
   while (n--)
   {
      vec_2 = __lsx_vldrepl_b(row, 0);
      row++;
      vec_0 = __lsx_vldrepl_b(nxt, 0);
      vec_1 = __lsx_vldrepl_b(prev_nxt, 0);
      prev_nxt++;

      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);

      __lsx_vstelm_b(vec_1, nxt, 0, 0);
      nxt++;
   }
}

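/* 'Average' filter, 4 bytes per pixel: as above, with one word element per
 * pixel and no scalar tail loop.
 */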
void png_read_filter_row_avg4_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   __m128i vec_0, vec_1, vec_2;

   vec_0 = __lsx_vldrepl_w(row, 0);
   vec_1 = __lsx_vldrepl_w(prev_row, 0);
   prev_row += 4;
   vec_1 = __lsx_vsrli_b(vec_1, 1);
   vec_1 = __lsx_vadd_b(vec_1, vec_0);
   __lsx_vstelm_w(vec_1, row, 0, 0);
   row += 4;
   n -= 4;

   while (n >= 4)
   {
      vec_2 = vec_1;
      vec_0 = __lsx_vldrepl_w(row, 0);
      vec_1 = __lsx_vldrepl_w(prev_row, 0);
      prev_row += 4;

      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);

      __lsx_vstelm_w(vec_1, row, 0, 0);
      row += 4;
      n -= 4;
   }
}

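/* 'Paeth' filter, 3 bytes per pixel: each byte is the sum of the raw byte
 * and whichever of the left (a), above (b) and upper-left (c) bytes lies
 * closest to the linear predictor p = a + b - c.  For the first pixel a and
 * c are zero, so the predictor reduces to the byte above.
 */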
void png_read_filter_row_paeth3_lsx(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_bytep nxt = row;
   png_const_bytep prev_nxt = prev_row;
   __m128i vec_a, vec_b, vec_c, vec_d;
   __m128i vec_pa, vec_pb, vec_pc;
   __m128i zero = {0};

   vec_a = __lsx_vldrepl_w(nxt, 0);
   vec_b = __lsx_vldrepl_w(prev_nxt, 0);
   prev_nxt += 3;
   vec_d = __lsx_vadd_b(vec_a, vec_b);
   __lsx_vstelm_h(vec_d, nxt, 0, 0);
   nxt += 2;
   __lsx_vstelm_b(vec_d, nxt, 0, 2);
   nxt += 1;
   n -= 3;

   while (n >= 3)
   {
      vec_a = vec_d;
      vec_c = vec_b;
      vec_b = __lsx_vldrepl_w(prev_nxt, 0);
      prev_nxt += 3;
      vec_d = __lsx_vldrepl_w(nxt, 0);

      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

      __lsx_vstelm_h(vec_d, nxt, 0, 0);
      nxt += 2;
      __lsx_vstelm_b(vec_d, nxt, 0, 2);
      nxt += 1;
      n -= 3;
   }

   prev_row = prev_nxt - 3;
   row = nxt - 3;
   while (n--)
   {
      vec_a = __lsx_vldrepl_b(row, 0);
      row++;
      vec_b = __lsx_vldrepl_b(prev_nxt, 0);
      prev_nxt++;
      vec_c = __lsx_vldrepl_b(prev_row, 0);
      prev_row++;
      vec_d = __lsx_vldrepl_b(nxt, 0);

      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

      __lsx_vstelm_b(vec_d, nxt, 0, 0);
      nxt++;
   }
}

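/* 'Paeth' filter, 4 bytes per pixel: identical to the 3-byte case, with one
 * word element per pixel.
 */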
void png_read_filter_row_paeth4_lsx(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   __m128i vec_a, vec_b, vec_c, vec_d;
   __m128i vec_pa, vec_pb, vec_pc;
   __m128i zero = {0};

   vec_a = __lsx_vldrepl_w(row, 0);
   vec_b = __lsx_vldrepl_w(prev_row, 0);
   prev_row += 4;
   vec_d = __lsx_vadd_b(vec_a, vec_b);
   __lsx_vstelm_w(vec_d, row, 0, 0);
   row += 4;
   n -= 4;

   while (n >= 4)
   {
      vec_a = vec_d;
      vec_c = vec_b;
      vec_b = __lsx_vldrepl_w(prev_row, 0);
      prev_row += 4;
      vec_d = __lsx_vldrepl_w(row, 0);

      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

      __lsx_vstelm_w(vec_d, row, 0, 0);
      row += 4;
      n -= 4;
   }
}

#endif /* PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* PNG_READ_SUPPORTED */