/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "crc32_simd.h"
#if defined(CRC32_SIMD_AVX512_PCLMUL)

/*
 * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 256, and a multiple of 64. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */
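/*
 * Illustrative only: a minimal caller sketch honoring the length contract
 * above (at least 256 bytes and a multiple of 64). It assumes the usual
 * zlib convention of passing the pre-inverted CRC in and re-inverting the
 * result; `chunk' and `crc32_generic' are hypothetical names, the real
 * dispatch lives in crc32.c.
 *
 *   z_size_t chunk = len & ~(z_size_t)63;   // round down to a 64-byte multiple
 *   if (chunk >= 256) {
 *       crc = ~crc32_avx512_simd_(buf, chunk, ~crc);
 *       buf += chunk;
 *       len -= chunk;
 *   }
 *   crc = crc32_generic(crc, buf, len);     // handle any scalar tail
 */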

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>

uint32_t ZLIB_INTERNAL crc32_avx512_simd_(  /* AVX512+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3,k4
     * are similar to those given at the end of the paper; the remaining
     * constants and the CRC32+Barrett polynomials are unchanged.
     *
     * The exponents of x are scaled up from 128 to 512, as follows:
     * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
     * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
     * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
     * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
     */
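    /*
     * Worked check of the exponents above: 512 * 4 + 32 = 2080 and
     * 512 * 4 - 32 = 2016 for k1/k2, while 512 + 32 = 544 and 512 - 32 = 480
     * for k3/k4. Since 544 = 4 * 128 + 32 and 480 = 4 * 128 - 32, k3/k4 here
     * are the same constants as k1/k2 in the SSE4.2 path further below.
     */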
    static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
                                                0x011542778a, 0x01322d1430,
                                                0x011542778a, 0x01322d1430,
                                                0x011542778a, 0x01322d1430 };
    static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
                                                0x0154442bd4, 0x01c6e41596,
                                                0x0154442bd4, 0x01c6e41596,
                                                0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
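    /*
     * poly[] below holds the CRC-32 polynomial 0x104C11DB7 in bit-reflected
     * form ((0xEDB88320 << 1) | 1 = 0x01db710641) together with the Barrett
     * reduction constant used in the final reduction step.
     */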
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
    __m128i a0, a1, a2, a3;

    /*
     * There's at least one block of 256.
     */
    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));

    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));

    x0 = _mm512_load_si512((__m512i *)k1k2);

    buf += 256;
    len -= 256;

    /*
     * Parallel fold blocks of 256, if any.
     */
    while (len >= 256)
    {
        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);


        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);

        y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
        y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
        y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
        y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));

        x1 = _mm512_xor_si512(x1, x5);
        x2 = _mm512_xor_si512(x2, x6);
        x3 = _mm512_xor_si512(x3, x7);
        x4 = _mm512_xor_si512(x4, x8);

        x1 = _mm512_xor_si512(x1, y5);
        x2 = _mm512_xor_si512(x2, y6);
        x3 = _mm512_xor_si512(x3, y7);
        x4 = _mm512_xor_si512(x4, y8);

        buf += 256;
        len -= 256;
    }

    /*
     * Fold into 512-bits.
     */
    x0 = _mm512_load_si512((__m512i *)k3k4);

    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x2);
    x1 = _mm512_xor_si512(x1, x5);

    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x3);
    x1 = _mm512_xor_si512(x1, x5);

    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x4);
    x1 = _mm512_xor_si512(x1, x5);

    /*
     * Single fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        x2 = _mm512_loadu_si512((__m512i *)buf);

        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
        x1 = _mm512_xor_si512(x1, x2);
        x1 = _mm512_xor_si512(x1, x5);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold 512-bits to 384-bits.
     */
    a0 = _mm_load_si128((__m128i *)k5k6);

    a1 = _mm512_extracti32x4_epi32(x1, 0);
    a2 = _mm512_extracti32x4_epi32(x1, 1);

    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);

    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Fold 384-bits to 256-bits.
     */
    a2 = _mm512_extracti32x4_epi32(x1, 2);
    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Fold 256-bits to 128-bits.
     */
    a2 = _mm512_extracti32x4_epi32(x1, 3);
    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Fold 128-bits to 64-bits.
     */
    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
    a1 = _mm_srli_si128(a1, 8);
    a1 = _mm_xor_si128(a1, a2);

    a0 = _mm_loadl_epi64((__m128i*)k7k8);
    a2 = _mm_srli_si128(a1, 4);
    a1 = _mm_and_si128(a1, a3);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Barrett reduce to 32-bits.
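     *
     * The two carry-less multiplies below follow the Barrett reduction from
     * the paper: the low 32 bits of a1 (selected by the a3 mask) are first
     * multiplied by poly[1] (immediate 0x10 picks the high qword of a0), the
     * low 32 bits of that product are then multiplied by poly[0], the
     * bit-reflected polynomial, and the final XOR leaves the 32-bit CRC in
     * bits 32..63 of a1, which _mm_extract_epi32(a1, 1) returns.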
     */
    a0 = _mm_load_si128((__m128i*)poly);

    a2 = _mm_and_si128(a1, a3);
    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
    a2 = _mm_and_si128(a2, a3);
    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(a1, 1);
}

#elif defined(CRC32_SIMD_SSE42_PCLMUL)

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16.
 */
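/*
 * For reference, the folds below must agree with the plain bitwise CRC-32
 * (reflected polynomial 0xEDB88320). A hedged sketch of that model, with a
 * hypothetical name, is:
 *
 *   static uint32_t crc32_bitwise_ref(uint32_t crc, const unsigned char *p,
 *                                     size_t n)
 *   {
 *       while (n--) {
 *           crc ^= *p++;
 *           for (int k = 0; k < 8; k++)
 *               crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
 *       }
 *       return crc;
 *   }
 *
 * crc32_sse42_simd_(buf, len, crc0) should produce the same value as running
 * this loop from crc0 over the same bytes; the usual ~crc pre/post
 * conditioning is handled by the caller in crc32.c.
 */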

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc. and
     * the CRC32+Barrett polynomials are given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };

    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));

    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));

    x0 = _mm_load_si128((__m128i *)k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);

        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);

        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));

        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);

        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
    x0 = _mm_load_si128((__m128i *)k3k4);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     */
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((__m128i *)buf);

        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);

    x0 = _mm_loadl_epi64((__m128i*)k5k0);

    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
    x0 = _mm_load_si128((__m128i*)poly);

    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(x1, 1);
}

#elif defined(CRC32_ARMV8_CRC32)

/* CRC32 checksums using ARMv8-a crypto instructions.
 */

#if defined(__clang__)
/* We need some extra types for using PMULL.
 */
#if defined(__aarch64__)
#include <arm_neon.h>
#include <arm_acle.h>
#endif

/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need four crc intrinsics, and since clang's implementations of
 * those are just wrappers around compiler builtins, it's simplest to #define
 * those builtins directly. If this #define list grows too much (or we depend on
 * an intrinsic that isn't a trivial wrapper), we may have to find a better way
 * to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 *
 * These definitions must appear *after* including arm_acle.h, otherwise that
 * header may end up defining functions named __builtin_arm_crc32* that call
 * themselves, creating an infinite loop when the intrinsic is called.
 */
/* XXX: Cannot hook into builtins with Xcode for arm64. */
#if !defined(ARMV8_OS_MACOS)
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw
#endif

#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
#else  // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif  // defined(__aarch64__)

#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
#include <arm_neon.h>
#define TARGET_ARMV8_WITH_CRC
#else  // !defined(__GNUC__) && !defined(__clang__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif

TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    uint32_t c = (uint32_t) ~crc;

    while (len && ((uintptr_t)buf & 7)) {
        c = __crc32b(c, *buf++);
        --len;
    }

    const uint64_t *buf8 = (const uint64_t *)buf;

    while (len >= 64) {
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);

        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        len -= 64;
    }

    while (len >= 8) {
        c = __crc32d(c, *buf8++);
        len -= 8;
    }

    buf = (const unsigned char *)buf8;

    while (len--) {
        c = __crc32b(c, *buf++);
    }

    return ~c;
}

#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */

/*
 * armv8_crc32_pmull_little(): compute the crc32 of the buffer, where the
 * buffer
 * length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */
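/*
 * The pmull_lo(), pmull_01() and pmull_hi() helpers below wrap the AArch64
 * PMULL/PMULL2 instructions: each is a 64 x 64 -> 128 bit carry-less
 * multiply of, respectively, the low lanes of both operands, the low lane
 * of the first operand with the high lane of the second, and the high lanes
 * of both. They play the same role as _mm_clmulepi64_si128() with
 * immediates 0x00, 0x10 and 0x11 in the SSE4.2 path above.
 */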
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
{
    uint8x16_t r;
    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
        : "=w" (r) : "w" (a), "w" (b) );
    return r;
}

TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
{
    uint8x16_t r;
    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
        : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
    return r;
}

TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
{
    uint8x16_t r;
    __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
        : "=w" (r) : "w" (a), "w" (b) );
    return r;
}

TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc. and
     * the CRC32+Barrett polynomials are given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };

    uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
    x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
    x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
    x4 = vld1q_u64((const uint64_t *)(buf + 0x30));

    x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));

    x0 = vld1q_u64(k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        x5 = (uint64x2_t) pmull_lo(x1, x0);
        x6 = (uint64x2_t) pmull_lo(x2, x0);
        x7 = (uint64x2_t) pmull_lo(x3, x0);
        x8 = (uint64x2_t) pmull_lo(x4, x0);

        y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
        y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
        y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
        y8 = vld1q_u64((const uint64_t *)(buf + 0x30));

        x1 = (uint64x2_t) pmull_hi(x1, x0);
        x2 = (uint64x2_t) pmull_hi(x2, x0);
        x3 = (uint64x2_t) pmull_hi(x3, x0);
        x4 = (uint64x2_t) pmull_hi(x4, x0);

        x1 = veorq_u64(x1, x5);
        x2 = veorq_u64(x2, x6);
        x3 = veorq_u64(x3, x7);
        x4 = veorq_u64(x4, x8);

        x1 = veorq_u64(x1, y5);
        x2 = veorq_u64(x2, y6);
        x3 = veorq_u64(x3, y7);
        x4 = veorq_u64(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
    x0 = vld1q_u64(k3k4);

    x5 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = (uint64x2_t) pmull_hi(x1, x0);
    x1 = veorq_u64(x1, x2);
    x1 = veorq_u64(x1, x5);

    x5 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = (uint64x2_t) pmull_hi(x1, x0);
    x1 = veorq_u64(x1, x3);
    x1 = veorq_u64(x1, x5);

    x5 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = (uint64x2_t) pmull_hi(x1, x0);
    x1 = veorq_u64(x1, x4);
    x1 = veorq_u64(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     */
    while (len >= 16)
    {
        x2 = vld1q_u64((const uint64_t *)buf);

        x5 = (uint64x2_t) pmull_lo(x1, x0);
        x1 = (uint64x2_t) pmull_hi(x1, x0);
        x1 = veorq_u64(x1, x2);
        x1 = veorq_u64(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
    static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };

    x2 = (uint64x2_t) pmull_01(x1, x0);
    x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
    x3 = (uint64x2_t) vld1q_u32(mask);
    x1 = veorq_u64(x1, x2);

    x0 = vld1q_u64(k5k0);

    x2 = (uint64x2_t) pmull_01(x2, x0);
    x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
    x1 = vandq_u64(x1, x3);
    x1 = (uint64x2_t) pmull_lo(x1, x0);
    x1 = veorq_u64(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
    x0 = vld1q_u64(poly);

    x2 = vandq_u64(x1, x3);
    x2 = (uint64x2_t) pmull_01(x2, x0);
    x2 = vandq_u64(x2, x3);
    x2 = (uint64x2_t) pmull_lo(x2, x0);
    x1 = veorq_u64(x1, x2);

    /*
     * Return the crc32.
     */
    return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
}
#endif /* aarch64 specific code. */

#endif