/* Copyright (C) 1995-2011, 2016 Mark Adler
 * Copyright (C) 2017 ARM Holdings Inc.
 * Authors:
 *   Adenilson Cavalcanti <[email protected]>
 *   Adam Stylinski <[email protected]>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifdef ARM_NEON_ADLER32
#ifdef _M_ARM64
#  include <arm64_neon.h>
#else
#  include <arm_neon.h>
#endif
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../fallback_builtins.h"

static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
    static const uint16_t ALIGNED_(16) taps[64] = {
        64, 63, 62, 61, 60, 59, 58, 57,
        56, 55, 54, 53, 52, 51, 50, 49,
        48, 47, 46, 45, 44, 43, 42, 41,
        40, 39, 38, 37, 36, 35, 34, 33,
        32, 31, 30, 29, 28, 27, 26, 25,
        24, 23, 22, 21, 20, 19, 18, 17,
        16, 15, 14, 13, 12, 11, 10, 9,
        8, 7, 6, 5, 4, 3, 2, 1 };

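    /* adacc accumulates s1 (the running byte sum); s2acc, s2acc_0, s2acc_1 and
     * s2acc_2 accumulate pieces of s2. Lane 0 is seeded with the incoming sums
     * below, and all lanes are folded together at the end. */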
    uint32x4_t adacc = vdupq_n_u32(0);
    uint32x4_t s2acc = vdupq_n_u32(0);
    uint32x4_t s2acc_0 = vdupq_n_u32(0);
    uint32x4_t s2acc_1 = vdupq_n_u32(0);
    uint32x4_t s2acc_2 = vdupq_n_u32(0);

    adacc = vsetq_lane_u32(s[0], adacc, 0);
    s2acc = vsetq_lane_u32(s[1], s2acc, 0);

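    /* s3acc collects the value s1 had at the start of each chunk (adacc_prev);
     * scaling it by the chunk size after the loop accounts for the s2
     * contribution that the earlier bytes make across the whole chunk. */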
    uint32x4_t s3acc = vdupq_n_u32(0);
    uint32x4_t adacc_prev = adacc;

    uint16x8_t s2_0, s2_1, s2_2, s2_3;
    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);

    uint16x8_t s2_4, s2_5, s2_6, s2_7;
    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);

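    /* len is given in 16-byte blocks: the main loop consumes four blocks
     * (64 bytes) per iteration, and rem counts the leftover 16-byte blocks. */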
    int num_iter = len >> 2;
    int rem = len & 3;

    for (int i = 0; i < num_iter; ++i) {
        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);

        /* Unfortunately there's no direct sum from 8 bit to 32 bit instruction,
         * so we'll have to make do by summing to 16 bits first */
        uint16x8x2_t hsum, hsum_fold;
        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);

        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);

        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
        s3acc = vaddq_u32(s3acc, adacc_prev);
        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);

        /* If we do straight widening additions to the 16 bit values, we don't incur
         * the usual penalties of a pairwise add. We can defer the multiplications
         * until the very end. These will not overflow because we run at most
         * NMAX / 64 = 86 full iterations of this loop (plus up to 3 remainder
         * blocks), and a given lane is only summed into once per iteration. This
         * means for the maximum input size, the largest value a lane will see is
         * 255 * 89 = 22695, safely under the uint16 max */
        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);

        adacc_prev = adacc;
        buf += 64;
    }

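    /* Each value accumulated into s3acc is the s1 sum at the start of a 64-byte
     * chunk, so multiply by 64 (shift left by 6) to get its s2 contribution. */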
    s3acc = vshlq_n_u32(s3acc, 6);

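    /* Handle the remaining 1 to 3 blocks of 16 bytes. Their deferred s1
     * snapshots each cover 16 bytes, so they are scaled by 16 instead of 64. */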
    if (rem) {
        uint32x4_t s3acc_0 = vdupq_n_u32(0);
        while (rem--) {
            uint8x16_t d0 = vld1q_u8(buf);
            uint16x8_t adler;
            adler = vpaddlq_u8(d0);
            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
            s2_7 = vaddw_high_u8(s2_7, d0);
            adacc = vpadalq_u16(adacc, adler);
            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
            adacc_prev = adacc;
            buf += 16;
        }

        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
        s3acc = vaddq_u32(s3acc_0, s3acc);
    }

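    /* Multiply each deferred byte sum by its tap weight (64 down to 1, i.e. the
     * number of bytes remaining in the chunk from that position onward) and
     * accumulate into 32-bit lanes. Splitting the work across four accumulators
     * shortens the dependency chains. */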
    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);

    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));

    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));

    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));

    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));

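    /* Fold the four s2 accumulators together, add the scaled chunk-start s1
     * contributions, then horizontally reduce s1 and s2 down to one 32-bit sum each. */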
    s2acc = vaddq_u32(s2acc_0, s2acc);
    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
    s2acc = vaddq_u32(s2acc, s2acc_2);

    uint32x2_t adacc2, s2acc2, as;
    s2acc = vaddq_u32(s2acc, s3acc);
    adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
    s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
    as = vpadd_u32(adacc2, s2acc2);
    s[0] = vget_lane_u32(as, 0);
    s[1] = vget_lane_u32(as, 1);
}

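/* Scalar fallback, used both to walk up to a SIMD-aligned offset and to finish
 * any tail bytes that don't fill a 16-byte block. */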
static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
    unsigned int i;
    for (i = 0; i < len; ++i) {
        pair[0] += buf[i];
        pair[1] += pair[0];
    }
}

uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1)
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (buf == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16)
        return adler32_len_16(adler, buf, len, sum2);

    uint32_t pair[2];
    int n = NMAX;
    unsigned int done = 0;

    /* Seed the working sums with the components of the incoming checksum,
     * which may be supplied by the caller (e.g. from a PNG file). */
    pair[0] = adler;
    pair[1] = sum2;

    /* If memory is not SIMD aligned, do scalar sums to an aligned
     * offset, provided that doing so doesn't completely eliminate
     * SIMD operation. Aligned loads are still faster on ARM, even
     * though there's no explicit aligned load instruction */
    unsigned int align_offset = ((uintptr_t)buf & 15);
    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;

    if (align_offset && len >= (16 + align_adj)) {
        NEON_handle_tail(pair, buf, align_adj);
        n -= align_adj;
        done += align_adj;

    } else {
        /* If we're here, we failed the length criteria test; it wouldn't be
         * worthwhile to do scalar aligning sums */
        align_adj = 0;
    }

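    /* Consume the input in chunks of at most NMAX bytes, passed down as whole
     * 16-byte blocks, reducing modulo BASE after each chunk so the 32-bit
     * accumulators cannot overflow. */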
    while (done < len) {
        int remaining = (int)(len - done);
        n = MIN(remaining, (done == align_adj) ? n : NMAX);

        if (n < 16)
            break;

        NEON_accum32(pair, buf + done, n >> 4);
        pair[0] %= BASE;
        pair[1] %= BASE;

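        /* NEON_accum32 only consumed whole 16-byte blocks, so advance by n
         * rounded down to a multiple of 16; leftover bytes are handled below. */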
        int actual_nsums = (n >> 4) << 4;
        done += actual_nsums;
    }

    /* Handle the tail elements. */
    if (done < len) {
        NEON_handle_tail(pair, (buf + done), len - done);
        pair[0] %= BASE;
        pair[1] %= BASE;
    }

    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
    return (pair[1] << 16) | pair[0];
}

#endif