1 /* crc32_simd.c
2 *
3 * Copyright 2017 The Chromium Authors
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the Chromium source repository LICENSE file.
6 */
7
8 #include "crc32_simd.h"
9 #if defined(CRC32_SIMD_AVX512_PCLMUL)
10
11 /*
12 * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
13 * length must be at least 256, and a multiple of 64. Based on:
14 *
15 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
16 * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
17 */
18
19 #include <emmintrin.h>
20 #include <smmintrin.h>
21 #include <wmmintrin.h>
22 #include <immintrin.h>
23
crc32_avx512_simd_(const unsigned char * buf,z_size_t len,uint32_t crc)24 uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */
25 const unsigned char *buf,
26 z_size_t len,
27 uint32_t crc)
28 {
29 /*
30 * Definitions of the bit-reflected domain constants k1,k2,k3,k4
31 * are similar to those given at the end of the paper, and remaining
32 * constants and CRC32+Barrett polynomials remain unchanged.
33 *
34 * Replace the index of x from 128 to 512. As follows:
35 * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
36 * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
37 * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
38 * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
39 */
40 static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
41 0x011542778a, 0x01322d1430,
42 0x011542778a, 0x01322d1430,
43 0x011542778a, 0x01322d1430 };
44 static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
45 0x0154442bd4, 0x01c6e41596,
46 0x0154442bd4, 0x01c6e41596,
47 0x0154442bd4, 0x01c6e41596 };
48 static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
49 static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
50 static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
51 __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
52 __m128i a0, a1, a2, a3;
53
54 /*
55 * There's at least one block of 256.
56 */
57 x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
58 x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
59 x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
60 x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
61
62 x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
63
64 x0 = _mm512_load_si512((__m512i *)k1k2);
65
66 buf += 256;
67 len -= 256;
68
69 /*
70 * Parallel fold blocks of 256, if any.
71 */
72 while (len >= 256)
73 {
74 x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
75 x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
76 x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
77 x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
78
79
80 x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
81 x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
82 x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
83 x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
84
85 y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
86 y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
87 y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
88 y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
89
90 x1 = _mm512_xor_si512(x1, x5);
91 x2 = _mm512_xor_si512(x2, x6);
92 x3 = _mm512_xor_si512(x3, x7);
93 x4 = _mm512_xor_si512(x4, x8);
94
95 x1 = _mm512_xor_si512(x1, y5);
96 x2 = _mm512_xor_si512(x2, y6);
97 x3 = _mm512_xor_si512(x3, y7);
98 x4 = _mm512_xor_si512(x4, y8);
99
100 buf += 256;
101 len -= 256;
102 }
103
104 /*
105 * Fold into 512-bits.
106 */
107 x0 = _mm512_load_si512((__m512i *)k3k4);
108
109 x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
110 x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
111 x1 = _mm512_xor_si512(x1, x2);
112 x1 = _mm512_xor_si512(x1, x5);
113
114 x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
115 x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
116 x1 = _mm512_xor_si512(x1, x3);
117 x1 = _mm512_xor_si512(x1, x5);
118
119 x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
120 x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
121 x1 = _mm512_xor_si512(x1, x4);
122 x1 = _mm512_xor_si512(x1, x5);
123
124 /*
125 * Single fold blocks of 64, if any.
126 */
127 while (len >= 64)
128 {
129 x2 = _mm512_loadu_si512((__m512i *)buf);
130
131 x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
132 x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
133 x1 = _mm512_xor_si512(x1, x2);
134 x1 = _mm512_xor_si512(x1, x5);
135
136 buf += 64;
137 len -= 64;
138 }
139
140 /*
141 * Fold 512-bits to 384-bits.
142 */
143 a0 = _mm_load_si128((__m128i *)k5k6);
144
145 a1 = _mm512_extracti32x4_epi32(x1, 0);
146 a2 = _mm512_extracti32x4_epi32(x1, 1);
147
148 a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
149 a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
150
151 a1 = _mm_xor_si128(a1, a3);
152 a1 = _mm_xor_si128(a1, a2);
153
154 /*
155 * Fold 384-bits to 256-bits.
156 */
157 a2 = _mm512_extracti32x4_epi32(x1, 2);
158 a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
159 a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
160 a1 = _mm_xor_si128(a1, a3);
161 a1 = _mm_xor_si128(a1, a2);
162
163 /*
164 * Fold 256-bits to 128-bits.
165 */
166 a2 = _mm512_extracti32x4_epi32(x1, 3);
167 a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
168 a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
169 a1 = _mm_xor_si128(a1, a3);
170 a1 = _mm_xor_si128(a1, a2);
171
172 /*
173 * Fold 128-bits to 64-bits.
174 */
175 a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
176 a3 = _mm_setr_epi32(~0, 0, ~0, 0);
177 a1 = _mm_srli_si128(a1, 8);
178 a1 = _mm_xor_si128(a1, a2);
179
180 a0 = _mm_loadl_epi64((__m128i*)k7k8);
181 a2 = _mm_srli_si128(a1, 4);
182 a1 = _mm_and_si128(a1, a3);
183 a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
184 a1 = _mm_xor_si128(a1, a2);
185
186 /*
187 * Barret reduce to 32-bits.
188 */
189 a0 = _mm_load_si128((__m128i*)poly);
190
191 a2 = _mm_and_si128(a1, a3);
192 a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
193 a2 = _mm_and_si128(a2, a3);
194 a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
195 a1 = _mm_xor_si128(a1, a2);
196
197 /*
198 * Return the crc32.
199 */
200 return _mm_extract_epi32(a1, 1);
201 }
202
203 #elif defined(CRC32_SIMD_SSE42_PCLMUL)
204
205 /*
206 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
207 * length must be at least 64, and a multiple of 16.
208 */
209
210 #include <emmintrin.h>
211 #include <smmintrin.h>
212 #include <wmmintrin.h>
213
crc32_sse42_simd_(const unsigned char * buf,z_size_t len,uint32_t crc)214 uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
215 const unsigned char *buf,
216 z_size_t len,
217 uint32_t crc)
218 {
219 /*
220 * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
221 * the CRC32+Barrett polynomials given at the end of the paper.
222 */
223 static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
224 static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
225 static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
226 static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
227
228 __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
229
230 /*
231 * There's at least one block of 64.
232 */
233 x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
234 x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
235 x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
236 x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
237
238 x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
239
240 x0 = _mm_load_si128((__m128i *)k1k2);
241
242 buf += 64;
243 len -= 64;
244
245 /*
246 * Parallel fold blocks of 64, if any.
247 */
248 while (len >= 64)
249 {
250 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
251 x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
252 x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
253 x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
254
255 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
256 x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
257 x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
258 x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
259
260 y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
261 y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
262 y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
263 y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
264
265 x1 = _mm_xor_si128(x1, x5);
266 x2 = _mm_xor_si128(x2, x6);
267 x3 = _mm_xor_si128(x3, x7);
268 x4 = _mm_xor_si128(x4, x8);
269
270 x1 = _mm_xor_si128(x1, y5);
271 x2 = _mm_xor_si128(x2, y6);
272 x3 = _mm_xor_si128(x3, y7);
273 x4 = _mm_xor_si128(x4, y8);
274
275 buf += 64;
276 len -= 64;
277 }
278
279 /*
280 * Fold into 128-bits.
281 */
282 x0 = _mm_load_si128((__m128i *)k3k4);
283
284 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
285 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
286 x1 = _mm_xor_si128(x1, x2);
287 x1 = _mm_xor_si128(x1, x5);
288
289 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
290 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
291 x1 = _mm_xor_si128(x1, x3);
292 x1 = _mm_xor_si128(x1, x5);
293
294 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
295 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
296 x1 = _mm_xor_si128(x1, x4);
297 x1 = _mm_xor_si128(x1, x5);
298
299 /*
300 * Single fold blocks of 16, if any.
301 */
302 while (len >= 16)
303 {
304 x2 = _mm_loadu_si128((__m128i *)buf);
305
306 x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
307 x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
308 x1 = _mm_xor_si128(x1, x2);
309 x1 = _mm_xor_si128(x1, x5);
310
311 buf += 16;
312 len -= 16;
313 }
314
315 /*
316 * Fold 128-bits to 64-bits.
317 */
318 x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
319 x3 = _mm_setr_epi32(~0, 0, ~0, 0);
320 x1 = _mm_srli_si128(x1, 8);
321 x1 = _mm_xor_si128(x1, x2);
322
323 x0 = _mm_loadl_epi64((__m128i*)k5k0);
324
325 x2 = _mm_srli_si128(x1, 4);
326 x1 = _mm_and_si128(x1, x3);
327 x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
328 x1 = _mm_xor_si128(x1, x2);
329
330 /*
331 * Barret reduce to 32-bits.
332 */
333 x0 = _mm_load_si128((__m128i*)poly);
334
335 x2 = _mm_and_si128(x1, x3);
336 x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
337 x2 = _mm_and_si128(x2, x3);
338 x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
339 x1 = _mm_xor_si128(x1, x2);
340
341 /*
342 * Return the crc32.
343 */
344 return _mm_extract_epi32(x1, 1);
345 }
346
347 #elif defined(CRC32_ARMV8_CRC32)
348
349 /* CRC32 checksums using ARMv8-a crypto instructions.
350 */
351
352 #if defined(__clang__)
353 /* We need some extra types for using PMULL.
354 */
355 #if defined(__aarch64__)
356 #include <arm_neon.h>
357 #include <arm_acle.h>
358 #endif
359
360 /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
361 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
362 * (Namely, mixing and matching different module-level targets makes ThinLTO
363 * warn, and Android defaults to armv7-a. This restriction does not apply to
364 * function-level `target`s, however.)
365 *
366 * Since we only need four crc intrinsics, and since clang's implementation of
367 * those are just wrappers around compiler builtins, it's simplest to #define
368 * those builtins directly. If this #define list grows too much (or we depend on
369 * an intrinsic that isn't a trivial wrapper), we may have to find a better way
370 * to go about this.
371 *
372 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
373 * feature for this target (ignoring feature)." This appears to be a harmless
374 * bug in clang.
375 *
376 * These definitions must appear *after* including arm_acle.h otherwise that
377 * header may end up defining functions named __builtin_arm_crc32* that call
378 * themselves, creating an infinite loop when the intrinsic is called.
379 */
380 /* XXX: Cannot hook into builtins with XCode for arm64. */
381 #if !defined(ARMV8_OS_MACOS)
382 #define __crc32b __builtin_arm_crc32b
383 #define __crc32d __builtin_arm_crc32d
384 #define __crc32w __builtin_arm_crc32w
385 #define __crc32cw __builtin_arm_crc32cw
386 #endif
387
388 #if defined(__aarch64__)
389 #define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
390 #else // !defined(__aarch64__)
391 #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
392 #endif // defined(__aarch64__)
393
394 #elif defined(__GNUC__)
395 /* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
396 * allowed. We can just include arm_acle.h.
397 */
398 #include <arm_acle.h>
399 #include <arm_neon.h>
400 #define TARGET_ARMV8_WITH_CRC
#else // !defined(__clang__) && !defined(__GNUC__)
402 #error ARM CRC32 SIMD extensions only supported for Clang and GCC
403 #endif
404
/*
 * Compute the CRC-32 of `len` bytes at `buf` with the ARMv8 CRC32
 * instructions, continuing from `crc`. The state is bit-inverted on entry
 * and again on return. Any length, including zero, is accepted.
 */
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    const uint64_t *words;
    uint32_t state = (uint32_t) ~crc;

    /* Consume single bytes until buf is 8-byte aligned or input runs out. */
    for (; len != 0 && ((uintptr_t)buf & 7) != 0; --len)
        state = __crc32b(state, *buf++);

    words = (const uint64_t *)buf;

    /* Main loop: eight 64-bit words (64 bytes) per iteration. */
    for (; len >= 64; len -= 64) {
        state = __crc32d(state, words[0]);
        state = __crc32d(state, words[1]);
        state = __crc32d(state, words[2]);
        state = __crc32d(state, words[3]);

        state = __crc32d(state, words[4]);
        state = __crc32d(state, words[5]);
        state = __crc32d(state, words[6]);
        state = __crc32d(state, words[7]);
        words += 8;
    }

    /* Remaining whole 64-bit words. */
    for (; len >= 8; len -= 8)
        state = __crc32d(state, *words++);

    /* Trailing bytes (fewer than eight). */
    buf = (const unsigned char *)words;
    for (; len != 0; --len)
        state = __crc32b(state, *buf++);

    return ~state;
}
446
447 #if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
448
/*
 * armv8_crc32_pmull_little(): compute the crc32 of the buffer, where the
 * buffer length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */
/*
 * Polynomial (carry-less) multiply of the low 64-bit lane of `a` by the
 * low 64-bit lane of `b`, giving a 128-bit product (PMULL).
 */
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
{
    uint8x16_t product;

    __asm__ __volatile__ (
        "pmull %0.1q, %1.1d, %2.1d \n\t"
        : "=w" (product)
        : "w" (a), "w" (b));

    return product;
}
464
/*
 * Polynomial (carry-less) multiply of the low 64-bit lane of `a` by the
 * HIGH 64-bit lane of `b`, giving a 128-bit product. The high lane of `b`
 * is extracted first so that plain PMULL (low-lane form) can be used.
 */
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
{
    uint8x16_t product;

    __asm__ __volatile__ (
        "pmull %0.1q, %1.1d, %2.1d \n\t"
        : "=w" (product)
        : "w" (a), "w" (vgetq_lane_u64(b, 1)));

    return product;
}
473
/*
 * Polynomial (carry-less) multiply of the high 64-bit lane of `a` by the
 * high 64-bit lane of `b`, giving a 128-bit product (PMULL2).
 */
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
{
    uint8x16_t product;

    __asm__ __volatile__ (
        "pmull2 %0.1q, %1.2d, %2.2d \n\t"
        : "=w" (product)
        : "w" (a), "w" (b));

    return product;
}
482
/*
 * Fold the bytes at `buf` into the CRC state `crc` using 64x64 -> 128 bit
 * polynomial multiplication (PMULL/PMULL2).
 *
 *   buf  input bytes, read 16 at a time.
 *   len  number of bytes. The caller must guarantee len >= 64 and that len
 *        is a multiple of 16: the first 64 bytes are loaded unconditionally
 *        and any tail shorter than 16 bytes is ignored.
 *   crc  incoming 32-bit state, XORed into the first four input bytes.
 *
 * Returns the 32-bit state after folding and Barrett reduction. No bit
 * inversion is applied here on entry or exit.
 */
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
     * the CRC32+Barrett polynomials given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };

    /* Keeps 32-bit elements 0 and 2, clears elements 1 and 3. */
    static const uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };

    uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
    x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
    x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
    x4 = vld1q_u64((const uint64_t *)(buf + 0x30));

    /* XOR the incoming state into the first 32 bits only. */
    x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));

    x0 = vld1q_u64(k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        x5 = (uint64x2_t) pmull_lo(x1, x0);
        x6 = (uint64x2_t) pmull_lo(x2, x0);
        x7 = (uint64x2_t) pmull_lo(x3, x0);
        x8 = (uint64x2_t) pmull_lo(x4, x0);

        y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
        y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
        y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
        y8 = vld1q_u64((const uint64_t *)(buf + 0x30));

        x1 = (uint64x2_t) pmull_hi(x1, x0);
        x2 = (uint64x2_t) pmull_hi(x2, x0);
        x3 = (uint64x2_t) pmull_hi(x3, x0);
        x4 = (uint64x2_t) pmull_hi(x4, x0);

        x1 = veorq_u64(x1, x5);
        x2 = veorq_u64(x2, x6);
        x3 = veorq_u64(x3, x7);
        x4 = veorq_u64(x4, x8);

        x1 = veorq_u64(x1, y5);
        x2 = veorq_u64(x2, y6);
        x3 = veorq_u64(x3, y7);
        x4 = veorq_u64(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
    x0 = vld1q_u64(k3k4);

    x5 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = (uint64x2_t) pmull_hi(x1, x0);
    x1 = veorq_u64(x1, x2);
    x1 = veorq_u64(x1, x5);

    x5 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = (uint64x2_t) pmull_hi(x1, x0);
    x1 = veorq_u64(x1, x3);
    x1 = veorq_u64(x1, x5);

    x5 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = (uint64x2_t) pmull_hi(x1, x0);
    x1 = veorq_u64(x1, x4);
    x1 = veorq_u64(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     */
    while (len >= 16)
    {
        x2 = vld1q_u64((const uint64_t *)buf);

        x5 = (uint64x2_t) pmull_lo(x1, x0);
        x1 = (uint64x2_t) pmull_hi(x1, x0);
        x1 = veorq_u64(x1, x2);
        x1 = veorq_u64(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
    x2 = (uint64x2_t) pmull_01(x1, x0);       /* low(x1) * high(x0) */
    /* Byte-shift x1 right by 8, filling with zeros. */
    x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
    x3 = (uint64x2_t) vld1q_u32(mask);
    x1 = veorq_u64(x1, x2);

    x0 = vld1q_u64(k5k0);

    /*
     * A pmull_01(x2, x0) used to sit here; its result was overwritten by
     * the next statement before being read, so it has been dropped.
     */
    /* Byte-shift x1 right by 4, filling with zeros. */
    x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
    x1 = vandq_u64(x1, x3);
    x1 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = veorq_u64(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
    x0 = vld1q_u64(poly);

    x2 = vandq_u64(x1, x3);
    x2 = (uint64x2_t) pmull_01(x2, x0);
    x2 = vandq_u64(x2, x3);
    x2 = (uint64x2_t) pmull_lo(x2, x0);
    x1 = veorq_u64(x1, x2);

    /*
     * Return the crc32 (held in 32-bit element 1).
     */
    return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
}
619 #endif /* aarch64 specific code. */
620
621 #endif
622