/* Copyright (C) 1995-2011, 2016 Mark Adler
 * Copyright (C) 2017 ARM Holdings Inc.
 * Authors:
 *   Adenilson Cavalcanti <[email protected]>
 *   Adam Stylinski <[email protected]>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifdef ARM_NEON_ADLER32
#ifdef _M_ARM64
#  include <arm64_neon.h>
#else
#  include <arm_neon.h>
#endif
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../fallback_builtins.h"

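/* Computes the running sums over a sequence of 16-byte blocks. On entry s[0]
 * and s[1] hold the current Adler-32 sums A and B; on exit they hold the
 * updated, unreduced sums (the caller performs the modulo by BASE). Note that
 * len is a count of 16-byte blocks, not bytes. */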
static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
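    /* Weight of each byte position within a 64-byte block when folded into
     * the second sum: the first byte of a block is counted 64 times in B,
     * the last byte once. */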
    static const uint16_t ALIGNED_(16) taps[64] = {
        64, 63, 62, 61, 60, 59, 58, 57,
        56, 55, 54, 53, 52, 51, 50, 49,
        48, 47, 46, 45, 44, 43, 42, 41,
        40, 39, 38, 37, 36, 35, 34, 33,
        32, 31, 30, 29, 28, 27, 26, 25,
        24, 23, 22, 21, 20, 19, 18, 17,
        16, 15, 14, 13, 12, 11, 10, 9,
        8, 7, 6, 5, 4, 3, 2, 1 };

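    /* adacc accumulates the per-lane byte sums for A. s2acc and its three
     * siblings accumulate the weighted contributions to B; splitting that
     * across four registers keeps the final multiply-accumulate chains
     * independent of each other. */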
    uint32x4_t adacc = vdupq_n_u32(0);
    uint32x4_t s2acc = vdupq_n_u32(0);
    uint32x4_t s2acc_0 = vdupq_n_u32(0);
    uint32x4_t s2acc_1 = vdupq_n_u32(0);
    uint32x4_t s2acc_2 = vdupq_n_u32(0);

    adacc = vsetq_lane_u32(s[0], adacc, 0);
    s2acc = vsetq_lane_u32(s[1], s2acc, 0);

    uint32x4_t s3acc = vdupq_n_u32(0);
    uint32x4_t adacc_prev = adacc;

    uint16x8_t s2_0, s2_1, s2_2, s2_3;
    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);

    uint16x8_t s2_4, s2_5, s2_6, s2_7;
    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);

    int num_iter = len >> 2;
    int rem = len & 3;

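    /* Per 64-byte block with prior sums (A, B):
     *   A' = A + x[0] + x[1] + ... + x[63]
     *   B' = B + 64*A + 64*x[0] + 63*x[1] + ... + 1*x[63]
     * adacc tracks A, s3acc collects the per-block A terms (scaled by 64 once
     * the loop is done), and s2_0..s2_7 collect the raw byte sums that are
     * weighted by taps[] after the loop. */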
    for (int i = 0; i < num_iter; ++i) {
        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);

        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
         * bit instruction, so we'll have to make do with summing to 16 bits first */
        uint16x8x2_t hsum, hsum_fold;
        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);

        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);

        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
        s3acc = vaddq_u32(s3acc, adacc_prev);
        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);

        /* If we do straight widening additions to the 16 bit values, we don't incur
         * the usual penalties of a pairwise add. We can defer the multiplications
         * until the very end. These will not overflow because we run at most
         * NMAX / 64 = 86 iterations of this loop (plus up to 3 remainder blocks),
         * and a given lane is only summed into once per iteration. This means for
         * the maximum input size, the largest value a lane will hold is at most
         * 255 * 89 = 22695, safely under the uint16 max */
        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);

        adacc_prev = adacc;
        buf += 64;
    }

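    /* Each full block deposited its prior A sum into s3acc exactly once; scale
     * by 64 (shift left by 6) for the 64 bytes that passed over it. */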
    s3acc = vshlq_n_u32(s3acc, 6);

    if (rem) {
        uint32x4_t s3acc_0 = vdupq_n_u32(0);
        while (rem--) {
            uint8x16_t d0 = vld1q_u8(buf);
            uint16x8_t adler;
            adler = vpaddlq_u8(d0);
            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
            s2_7 = vaddw_high_u8(s2_7, d0);
            adacc = vpadalq_u16(adacc, adler);
            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
            adacc_prev = adacc;
            buf += 16;
        }

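        /* Same scaling as above, but each remainder block is only 16 bytes
         * wide; the remainder bytes were added into s2_6/s2_7 so they pick up
         * the 16..1 tail of taps[]. */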
        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
        s3acc = vaddq_u32(s3acc_0, s3acc);
    }

    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);

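    /* Apply the taps weights: every 16-bit byte-sum lane is multiplied by its
     * weight and widened into the 32-bit accumulators, with the low and high
     * halves of each vector handled by separate multiply-accumulates. */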
    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));

    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));

    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));

    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));

    s2acc = vaddq_u32(s2acc_0, s2acc);
    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
    s2acc = vaddq_u32(s2acc, s2acc_2);

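    /* Fold the scaled prior-A terms into the second sum, then reduce
     * horizontally: pairwise adds collapse adacc and s2acc to a single 32-bit
     * total each. */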
    uint32x2_t adacc2, s2acc2, as;
    s2acc = vaddq_u32(s2acc, s3acc);
    adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
    s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
    as = vpadd_u32(adacc2, s2acc2);
    s[0] = vget_lane_u32(as, 0);
    s[1] = vget_lane_u32(as, 1);
}

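/* Scalar fallback used for the unaligned prologue and for trailing bytes that
 * don't fill a 16-byte block; the caller is responsible for the modulo. */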
static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
    unsigned int i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

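/* NEON-accelerated Adler-32 with the same contract as zlib's adler32():
 * adler is the running checksum (1 for a fresh stream) and the return value
 * is the checksum updated over the next len bytes of buf. */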
uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1)
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (buf == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16)
        return adler32_len_16(adler, buf, len, sum2);

    uint32_t pair[2];
    int n = NMAX;
    unsigned int done = 0;

    /* Store the component sums; a running Adler-32 value can be supplied
     * by the call site (e.g. when checksumming a PNG file).
     */
    pair[0] = adler;
    pair[1] = sum2;

    /* If memory is not SIMD aligned, do scalar sums to an aligned
     * offset, provided that doing so doesn't completely eliminate
     * SIMD operation. Aligned loads are still faster on ARM, even
     * though there's no explicit aligned load instruction */
    unsigned int align_offset = ((uintptr_t)buf & 15);
    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;

    if (align_offset && len >= (16 + align_adj)) {
        NEON_handle_tail(pair, buf, align_adj);
        n -= align_adj;
        done += align_adj;

    } else {
        /* If we get here, the length criterion above wasn't met, so it
         * wouldn't be worthwhile to do scalar aligning sums */
        align_adj = 0;
    }

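    /* Sum in chunks of at most NMAX bytes: NMAX is the largest byte count for
     * which the unreduced sums cannot overflow 32 bits before the modulo
     * reduction below. */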
    while (done < len) {
        int remaining = (int)(len - done);
        n = MIN(remaining, (done == align_adj) ? n : NMAX);

        if (n < 16)
            break;

        NEON_accum32(pair, buf + done, n >> 4);
        pair[0] %= BASE;
        pair[1] %= BASE;

        int actual_nsums = (n >> 4) << 4;
        done += actual_nsums;
    }

    /* Handle the tail elements. */
    if (done < len) {
        NEON_handle_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}

#endif