/* filter_lsx_intrinsics.c - LSX optimized filter functions
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * All rights reserved.
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2016 Glenn Randers-Pehrson
 * Contributed by Jin Bo ([email protected])
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

#if PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <lsxintrin.h>

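/* 128-bit vector load/store helpers; the _2/_4 variants move two or four
 * consecutive vectors spaced 'stride' bytes apart.
 */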
#define LSX_LD(psrc) __lsx_vld((psrc), 0)

#define LSX_LD_2(psrc, stride, out0, out1) \
{                                          \
   out0 = LSX_LD(psrc);                    \
   out1 = LSX_LD(psrc + stride);           \
}

#define LSX_LD_4(psrc, stride, out0, out1, out2, out3) \
{                                                      \
   LSX_LD_2(psrc, stride, out0, out1);                 \
   LSX_LD_2(psrc + stride * 2, stride, out2, out3);    \
}

#define LSX_ST(in, pdst) __lsx_vst(in, (pdst), 0)

#define LSX_ST_2(in0, in1, pdst, stride) \
{                                        \
   LSX_ST(in0, pdst);                    \
   LSX_ST(in1, pdst + stride);           \
}

#define LSX_ST_4(in0, in1, in2, in3, pdst, stride) \
{                                                  \
   LSX_ST_2(in0, in1, pdst, stride);               \
   LSX_ST_2(in2, in3, pdst + stride * 2, stride);  \
}

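/* Byte-wise vector adds (modulo 256, as PNG filter reconstruction
 * requires); the _2/_4 variants process two or four vector pairs.
 */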
#define LSX_ADD_B(in0, in1, out0) \
{                                 \
   out0 = __lsx_vadd_b(in0, in1); \
}

#define LSX_ADD_B_2(in0, in1, in2, in3, out0, out1) \
{                                                   \
   LSX_ADD_B(in0, in1, out0);                       \
   LSX_ADD_B(in2, in3, out1);                       \
}

#define LSX_ADD_B_4(in0, in1, in2, in3, in4, in5,     \
                    in6, in7, out0, out1, out2, out3) \
{                                                     \
   LSX_ADD_B_2(in0, in1, in2, in3, out0, out1);       \
   LSX_ADD_B_2(in4, in5, in6, in7, out2, out3);       \
}

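/* Halfword absolute values: vadda_h adds the absolute values of its two
 * operands, so pairing each input with zero yields |in| per 16-bit lane.
 * Relies on a 'zero' vector being visible at the expansion site.
 */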
#define LSX_ABS_B_3(in0, in1, in2, out0, out1, out2) \
{                                                    \
   out0 = __lsx_vadda_h(in0, zero);                  \
   out1 = __lsx_vadda_h(in1, zero);                  \
   out2 = __lsx_vadda_h(in2, zero);                  \
}

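/* Interleave the low halves of two byte vectors, pairing each in_l byte
 * with the corresponding in_h byte.
 */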
#define LSX_ILVL_B(in_h, in_l, out0)  \
{                                     \
   out0 = __lsx_vilvl_b(in_h, in_l);  \
}

#define LSX_ILVL_B_2(in0_h, in0_l, in1_h, in1_l, out0, out1) \
{                                                            \
   LSX_ILVL_B(in0_h, in0_l, out0);                           \
   LSX_ILVL_B(in1_h, in1_l, out1);                           \
}

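/* Horizontal widening subtract: in each 16-bit lane the two unsigned
 * bytes of the pair are subtracted and widened to a signed halfword.
 * Applied to the interleaved pairs above, this yields the signed
 * differences between b and c, and between a and c, for Paeth.
 */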
#define LSX_HSUB_HU_BU_2(in0, in1, out0, out1) \
{                                              \
   out0 = __lsx_vhsubw_hu_bu(in0, in0);        \
   out1 = __lsx_vhsubw_hu_bu(in1, in1);        \
}

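/* Branch-free Paeth selection.  The scalar predictor from the PNG
 * specification is
 *
 *    p = a + b - c;
 *    pa = abs(p - a);  pb = abs(p - b);  pc = abs(p - c);
 *    pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
 *
 * and since p - a = b - c, p - b = a - c and p - c = (b-c) + (a-c),
 * only the three differences are needed.  With (in0, in1, in2) =
 * (pa, pb, pc) as halfwords and (in3, in4, in5) = (a, b, c) as bytes,
 * this macro picks the predictor and adds it to the filtered bytes
 * already held in out0.
 */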
#define LSX_CMP_PICK_SMALLER(in0, in1, in2, in3, in4, in5, out0) \
{                                                                \
   __m128i _cmph, _cmpb, _in0, _in3;                             \
   _cmph = __lsx_vslt_h(in1, in0);                               \
   _cmpb = __lsx_vpickev_b(_cmph, _cmph);                        \
   _in0  = __lsx_vmin_bu(in0, in1);                              \
   _in3  = __lsx_vbitsel_v(in3, in4, _cmpb);                     \
   _cmph = __lsx_vslt_h(in2, _in0);                              \
   _cmpb = __lsx_vpickev_b(_cmph, _cmph);                        \
   _in3  = __lsx_vbitsel_v(_in3, in5, _cmpb);                    \
   out0  = __lsx_vadd_b(out0, _in3);                             \
}

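/* Undo the 'Up' filter: add the byte above to each byte of the row,
 * in 64/32/16/8-byte vector chunks, then a scalar tail.
 */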
void png_read_filter_row_up_lsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   __m128i vec_0, vec_1, vec_2, vec_3;
   __m128i vec_4, vec_5, vec_6, vec_7;

   while (n >= 64)
   {
      LSX_LD_4(rp, 16, vec_0, vec_1, vec_2, vec_3);
      LSX_LD_4(pp, 16, vec_4, vec_5, vec_6, vec_7);
      pp += 64;
      LSX_ADD_B_4(vec_0, vec_4, vec_1, vec_5, vec_2, vec_6,
                  vec_3, vec_7, vec_0, vec_1, vec_2, vec_3);
      LSX_ST_4(vec_0, vec_1, vec_2, vec_3, rp, 16);
      rp += 64;
      n -= 64;
   }
   if (n & 63)
   {
      if (n >= 32)
      {
         LSX_LD_2(rp, 16, vec_0, vec_1);
         LSX_LD_2(pp, 16, vec_2, vec_3);
         pp += 32;
         LSX_ADD_B_2(vec_0, vec_2, vec_1, vec_3, vec_0, vec_1);
         LSX_ST_2(vec_0, vec_1, rp, 16);
         rp += 32;
         n -= 32;
      }
      if (n & 31)
      {
         if (n >= 16)
         {
            vec_0 = LSX_LD(rp);
            vec_1 = LSX_LD(pp);
            pp += 16;
            LSX_ADD_B(vec_0, vec_1, vec_0);
            LSX_ST(vec_0, rp);
            rp += 16;
            n -= 16;
         }
         if (n >= 8)
         {
            vec_0 = __lsx_vldrepl_d(rp, 0);
            vec_1 = __lsx_vldrepl_d(pp, 0);
            vec_0 = __lsx_vadd_b(vec_0, vec_1);
            __lsx_vstelm_d(vec_0, rp, 0, 0);
            rp += 8;
            pp += 8;
            n -= 8;
         }
         while (n--)
         {
            *rp = *rp + *pp++;
            rp++;
         }
      }
   }
}

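/* Undo the 'Sub' filter for 3-byte pixels: each pixel is the filtered
 * bytes plus the reconstructed pixel to its left, so the loop carries
 * the previous result in vec_0.  A whole word (4 bytes) is loaded per
 * pixel, but only 3 bytes (a halfword plus a byte) are stored back.
 */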
void png_read_filter_row_sub3_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_bytep nxt = row;
   __m128i vec_0, vec_1;

   PNG_UNUSED(prev_row);

   vec_0 = __lsx_vldrepl_w(nxt, 0);
   nxt += 3;
   n -= 3;

   while (n >= 3)
   {
      vec_1 = __lsx_vldrepl_w(nxt, 0);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);
      __lsx_vstelm_h(vec_1, nxt, 0, 0);
      vec_0 = vec_1;
      nxt += 2;
      __lsx_vstelm_b(vec_1, nxt, 0, 2);
      nxt += 1;
      n -= 3;
   }

   row = nxt - 3;
   while (n--)
   {
      *nxt = *nxt + *row++;
      nxt++;
   }
}

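/* Undo the 'Sub' filter for 4-byte pixels: same carried-add scheme as
 * above, with exact word-sized loads and stores.  rowbytes is a
 * multiple of the 4-byte pixel size, so no scalar tail is needed.
 */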
void png_read_filter_row_sub4_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   __m128i vec_0, vec_1;

   PNG_UNUSED(prev_row);

   vec_0 = __lsx_vldrepl_w(row, 0);
   row += 4;
   n -= 4;

   while (n >= 4)
   {
      vec_1 = __lsx_vldrepl_w(row, 0);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);
      __lsx_vstelm_w(vec_1, row, 0, 0);
      vec_0 = vec_1;
      row += 4;
      n -= 4;
   }
}

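/* Undo the 'Average' filter for 3-byte pixels: each byte is the
 * filtered byte plus floor((left + up) / 2).  The first pixel has no
 * left neighbour, so its prediction is just up >> 1; after that,
 * vavg_bu takes the unsigned floor average of the carried result and
 * the byte above.
 */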
void png_read_filter_row_avg3_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_bytep nxt = row;
   png_const_bytep prev_nxt = prev_row;
   __m128i vec_0, vec_1, vec_2;

   vec_0 = __lsx_vldrepl_w(nxt, 0);
   vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
   prev_nxt += 3;
   vec_1 = __lsx_vsrli_b(vec_1, 1);
   vec_1 = __lsx_vadd_b(vec_1, vec_0);
   __lsx_vstelm_h(vec_1, nxt, 0, 0);
   nxt += 2;
   __lsx_vstelm_b(vec_1, nxt, 0, 2);
   nxt += 1;
   n -= 3;

   while (n >= 3)
   {
      vec_2 = vec_1;
      vec_0 = __lsx_vldrepl_w(nxt, 0);
      vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
      prev_nxt += 3;

      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);

      __lsx_vstelm_h(vec_1, nxt, 0, 0);
      nxt += 2;
      __lsx_vstelm_b(vec_1, nxt, 0, 2);
      nxt += 1;
      n -= 3;
   }

   row = nxt - 3;
   while (n--)
   {
      vec_2 = __lsx_vldrepl_b(row, 0);
      row++;
      vec_0 = __lsx_vldrepl_b(nxt, 0);
      vec_1 = __lsx_vldrepl_b(prev_nxt, 0);
      prev_nxt++;

      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);

      __lsx_vstelm_b(vec_1, nxt, 0, 0);
      nxt++;
   }
}

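/* Undo the 'Average' filter for 4-byte pixels; identical to the 3-byte
 * case but with word-sized loads and stores and no scalar tail.
 */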
void png_read_filter_row_avg4_lsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   __m128i vec_0, vec_1, vec_2;

   vec_0 = __lsx_vldrepl_w(row, 0);
   vec_1 = __lsx_vldrepl_w(prev_row, 0);
   prev_row += 4;
   vec_1 = __lsx_vsrli_b(vec_1, 1);
   vec_1 = __lsx_vadd_b(vec_1, vec_0);
   __lsx_vstelm_w(vec_1, row, 0, 0);
   row += 4;
   n -= 4;

   while (n >= 4)
   {
      vec_2 = vec_1;
      vec_0 = __lsx_vldrepl_w(row, 0);
      vec_1 = __lsx_vldrepl_w(prev_row, 0);
      prev_row += 4;

      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
      vec_1 = __lsx_vadd_b(vec_1, vec_0);

      __lsx_vstelm_w(vec_1, row, 0, 0);
      row += 4;
      n -= 4;
   }
}

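/* Undo the 'Paeth' filter for 3-byte pixels: a is the reconstructed
 * pixel to the left, b the pixel above, c the pixel above and to the
 * left, and the filtered input accumulates into vec_d.  For the first
 * pixel a and c are zero, so the predictor reduces to b and the pixel
 * is simply raw + up.
 */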
void png_read_filter_row_paeth3_lsx(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   png_bytep nxt = row;
   png_const_bytep prev_nxt = prev_row;
   __m128i vec_a, vec_b, vec_c, vec_d;
   __m128i vec_pa, vec_pb, vec_pc;
   __m128i zero = {0};

   vec_a = __lsx_vldrepl_w(nxt, 0);
   vec_b = __lsx_vldrepl_w(prev_nxt, 0);
   prev_nxt += 3;
   vec_d = __lsx_vadd_b(vec_a, vec_b);
   __lsx_vstelm_h(vec_d, nxt, 0, 0);
   nxt += 2;
   __lsx_vstelm_b(vec_d, nxt, 0, 2);
   nxt += 1;
   n -= 3;

   while (n >= 3)
   {
      vec_a = vec_d;
      vec_c = vec_b;
      vec_b = __lsx_vldrepl_w(prev_nxt, 0);
      prev_nxt += 3;
      vec_d = __lsx_vldrepl_w(nxt, 0);

      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

      __lsx_vstelm_h(vec_d, nxt, 0, 0);
      nxt += 2;
      __lsx_vstelm_b(vec_d, nxt, 0, 2);
      nxt += 1;
      n -= 3;
   }

   prev_row = prev_nxt - 3;
   row = nxt - 3;
   while (n--)
   {
      vec_a = __lsx_vldrepl_b(row, 0);
      row++;
      vec_b = __lsx_vldrepl_b(prev_nxt, 0);
      prev_nxt++;
      vec_c = __lsx_vldrepl_b(prev_row, 0);
      prev_row++;
      vec_d = __lsx_vldrepl_b(nxt, 0);

      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

      __lsx_vstelm_b(vec_d, nxt, 0, 0);
      nxt++;
   }
}

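/* Undo the 'Paeth' filter for 4-byte pixels; same scheme as the 3-byte
 * case, with word-sized loads and stores and no scalar tail.
 */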
void png_read_filter_row_paeth4_lsx(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
{
   size_t n = row_info->rowbytes;
   __m128i vec_a, vec_b, vec_c, vec_d;
   __m128i vec_pa, vec_pb, vec_pc;
   __m128i zero = {0};

   vec_a = __lsx_vldrepl_w(row, 0);
   vec_b = __lsx_vldrepl_w(prev_row, 0);
   prev_row += 4;
   vec_d = __lsx_vadd_b(vec_a, vec_b);
   __lsx_vstelm_w(vec_d, row, 0, 0);
   row += 4;
   n -= 4;

   while (n >= 4)
   {
      vec_a = vec_d;
      vec_c = vec_b;
      vec_b = __lsx_vldrepl_w(prev_row, 0);
      prev_row += 4;
      vec_d = __lsx_vldrepl_w(row, 0);

      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);

      __lsx_vstelm_w(vec_d, row, 0, 0);
      row += 4;
      n -= 4;
   }
}

#endif /* PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* PNG_READ_SUPPORTED */