1*86ee64e7SAndroid Build Coastguard Worker /* crc32_simd.c
2*86ee64e7SAndroid Build Coastguard Worker *
3*86ee64e7SAndroid Build Coastguard Worker * Copyright 2017 The Chromium Authors
4*86ee64e7SAndroid Build Coastguard Worker * Use of this source code is governed by a BSD-style license that can be
5*86ee64e7SAndroid Build Coastguard Worker * found in the Chromium source repository LICENSE file.
6*86ee64e7SAndroid Build Coastguard Worker */
7*86ee64e7SAndroid Build Coastguard Worker
8*86ee64e7SAndroid Build Coastguard Worker #include "crc32_simd.h"
9*86ee64e7SAndroid Build Coastguard Worker #if defined(CRC32_SIMD_AVX512_PCLMUL)
10*86ee64e7SAndroid Build Coastguard Worker
/*
 * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 256, and a multiple of 64. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */
18*86ee64e7SAndroid Build Coastguard Worker
19*86ee64e7SAndroid Build Coastguard Worker #include <emmintrin.h>
20*86ee64e7SAndroid Build Coastguard Worker #include <smmintrin.h>
21*86ee64e7SAndroid Build Coastguard Worker #include <wmmintrin.h>
22*86ee64e7SAndroid Build Coastguard Worker #include <immintrin.h>
23*86ee64e7SAndroid Build Coastguard Worker
crc32_avx512_simd_(const unsigned char * buf,z_size_t len,uint32_t crc)24*86ee64e7SAndroid Build Coastguard Worker uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */
25*86ee64e7SAndroid Build Coastguard Worker const unsigned char *buf,
26*86ee64e7SAndroid Build Coastguard Worker z_size_t len,
27*86ee64e7SAndroid Build Coastguard Worker uint32_t crc)
28*86ee64e7SAndroid Build Coastguard Worker {
29*86ee64e7SAndroid Build Coastguard Worker /*
30*86ee64e7SAndroid Build Coastguard Worker * Definitions of the bit-reflected domain constants k1,k2,k3,k4
31*86ee64e7SAndroid Build Coastguard Worker * are similar to those given at the end of the paper, and remaining
32*86ee64e7SAndroid Build Coastguard Worker * constants and CRC32+Barrett polynomials remain unchanged.
33*86ee64e7SAndroid Build Coastguard Worker *
34*86ee64e7SAndroid Build Coastguard Worker * Replace the index of x from 128 to 512. As follows:
35*86ee64e7SAndroid Build Coastguard Worker * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
36*86ee64e7SAndroid Build Coastguard Worker * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
37*86ee64e7SAndroid Build Coastguard Worker * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
38*86ee64e7SAndroid Build Coastguard Worker * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
39*86ee64e7SAndroid Build Coastguard Worker */
40*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
41*86ee64e7SAndroid Build Coastguard Worker 0x011542778a, 0x01322d1430,
42*86ee64e7SAndroid Build Coastguard Worker 0x011542778a, 0x01322d1430,
43*86ee64e7SAndroid Build Coastguard Worker 0x011542778a, 0x01322d1430 };
44*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
45*86ee64e7SAndroid Build Coastguard Worker 0x0154442bd4, 0x01c6e41596,
46*86ee64e7SAndroid Build Coastguard Worker 0x0154442bd4, 0x01c6e41596,
47*86ee64e7SAndroid Build Coastguard Worker 0x0154442bd4, 0x01c6e41596 };
48*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
49*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
50*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
51*86ee64e7SAndroid Build Coastguard Worker __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
52*86ee64e7SAndroid Build Coastguard Worker __m128i a0, a1, a2, a3;
53*86ee64e7SAndroid Build Coastguard Worker
54*86ee64e7SAndroid Build Coastguard Worker /*
55*86ee64e7SAndroid Build Coastguard Worker * There's at least one block of 256.
56*86ee64e7SAndroid Build Coastguard Worker */
57*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
58*86ee64e7SAndroid Build Coastguard Worker x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
59*86ee64e7SAndroid Build Coastguard Worker x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
60*86ee64e7SAndroid Build Coastguard Worker x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
61*86ee64e7SAndroid Build Coastguard Worker
62*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
63*86ee64e7SAndroid Build Coastguard Worker
64*86ee64e7SAndroid Build Coastguard Worker x0 = _mm512_load_si512((__m512i *)k1k2);
65*86ee64e7SAndroid Build Coastguard Worker
66*86ee64e7SAndroid Build Coastguard Worker buf += 256;
67*86ee64e7SAndroid Build Coastguard Worker len -= 256;
68*86ee64e7SAndroid Build Coastguard Worker
69*86ee64e7SAndroid Build Coastguard Worker /*
70*86ee64e7SAndroid Build Coastguard Worker * Parallel fold blocks of 256, if any.
71*86ee64e7SAndroid Build Coastguard Worker */
72*86ee64e7SAndroid Build Coastguard Worker while (len >= 256)
73*86ee64e7SAndroid Build Coastguard Worker {
74*86ee64e7SAndroid Build Coastguard Worker x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
75*86ee64e7SAndroid Build Coastguard Worker x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
76*86ee64e7SAndroid Build Coastguard Worker x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
77*86ee64e7SAndroid Build Coastguard Worker x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
78*86ee64e7SAndroid Build Coastguard Worker
79*86ee64e7SAndroid Build Coastguard Worker
80*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
81*86ee64e7SAndroid Build Coastguard Worker x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
82*86ee64e7SAndroid Build Coastguard Worker x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
83*86ee64e7SAndroid Build Coastguard Worker x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
84*86ee64e7SAndroid Build Coastguard Worker
85*86ee64e7SAndroid Build Coastguard Worker y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
86*86ee64e7SAndroid Build Coastguard Worker y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
87*86ee64e7SAndroid Build Coastguard Worker y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
88*86ee64e7SAndroid Build Coastguard Worker y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
89*86ee64e7SAndroid Build Coastguard Worker
90*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x5);
91*86ee64e7SAndroid Build Coastguard Worker x2 = _mm512_xor_si512(x2, x6);
92*86ee64e7SAndroid Build Coastguard Worker x3 = _mm512_xor_si512(x3, x7);
93*86ee64e7SAndroid Build Coastguard Worker x4 = _mm512_xor_si512(x4, x8);
94*86ee64e7SAndroid Build Coastguard Worker
95*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, y5);
96*86ee64e7SAndroid Build Coastguard Worker x2 = _mm512_xor_si512(x2, y6);
97*86ee64e7SAndroid Build Coastguard Worker x3 = _mm512_xor_si512(x3, y7);
98*86ee64e7SAndroid Build Coastguard Worker x4 = _mm512_xor_si512(x4, y8);
99*86ee64e7SAndroid Build Coastguard Worker
100*86ee64e7SAndroid Build Coastguard Worker buf += 256;
101*86ee64e7SAndroid Build Coastguard Worker len -= 256;
102*86ee64e7SAndroid Build Coastguard Worker }
103*86ee64e7SAndroid Build Coastguard Worker
104*86ee64e7SAndroid Build Coastguard Worker /*
105*86ee64e7SAndroid Build Coastguard Worker * Fold into 512-bits.
106*86ee64e7SAndroid Build Coastguard Worker */
107*86ee64e7SAndroid Build Coastguard Worker x0 = _mm512_load_si512((__m512i *)k3k4);
108*86ee64e7SAndroid Build Coastguard Worker
109*86ee64e7SAndroid Build Coastguard Worker x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
110*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
111*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x2);
112*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x5);
113*86ee64e7SAndroid Build Coastguard Worker
114*86ee64e7SAndroid Build Coastguard Worker x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
115*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
116*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x3);
117*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x5);
118*86ee64e7SAndroid Build Coastguard Worker
119*86ee64e7SAndroid Build Coastguard Worker x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
120*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
121*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x4);
122*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x5);
123*86ee64e7SAndroid Build Coastguard Worker
124*86ee64e7SAndroid Build Coastguard Worker /*
125*86ee64e7SAndroid Build Coastguard Worker * Single fold blocks of 64, if any.
126*86ee64e7SAndroid Build Coastguard Worker */
127*86ee64e7SAndroid Build Coastguard Worker while (len >= 64)
128*86ee64e7SAndroid Build Coastguard Worker {
129*86ee64e7SAndroid Build Coastguard Worker x2 = _mm512_loadu_si512((__m512i *)buf);
130*86ee64e7SAndroid Build Coastguard Worker
131*86ee64e7SAndroid Build Coastguard Worker x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
132*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
133*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x2);
134*86ee64e7SAndroid Build Coastguard Worker x1 = _mm512_xor_si512(x1, x5);
135*86ee64e7SAndroid Build Coastguard Worker
136*86ee64e7SAndroid Build Coastguard Worker buf += 64;
137*86ee64e7SAndroid Build Coastguard Worker len -= 64;
138*86ee64e7SAndroid Build Coastguard Worker }
139*86ee64e7SAndroid Build Coastguard Worker
140*86ee64e7SAndroid Build Coastguard Worker /*
141*86ee64e7SAndroid Build Coastguard Worker * Fold 512-bits to 384-bits.
142*86ee64e7SAndroid Build Coastguard Worker */
143*86ee64e7SAndroid Build Coastguard Worker a0 = _mm_load_si128((__m128i *)k5k6);
144*86ee64e7SAndroid Build Coastguard Worker
145*86ee64e7SAndroid Build Coastguard Worker a1 = _mm512_extracti32x4_epi32(x1, 0);
146*86ee64e7SAndroid Build Coastguard Worker a2 = _mm512_extracti32x4_epi32(x1, 1);
147*86ee64e7SAndroid Build Coastguard Worker
148*86ee64e7SAndroid Build Coastguard Worker a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
149*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
150*86ee64e7SAndroid Build Coastguard Worker
151*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a3);
152*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a2);
153*86ee64e7SAndroid Build Coastguard Worker
154*86ee64e7SAndroid Build Coastguard Worker /*
155*86ee64e7SAndroid Build Coastguard Worker * Fold 384-bits to 256-bits.
156*86ee64e7SAndroid Build Coastguard Worker */
157*86ee64e7SAndroid Build Coastguard Worker a2 = _mm512_extracti32x4_epi32(x1, 2);
158*86ee64e7SAndroid Build Coastguard Worker a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
159*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
160*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a3);
161*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a2);
162*86ee64e7SAndroid Build Coastguard Worker
163*86ee64e7SAndroid Build Coastguard Worker /*
164*86ee64e7SAndroid Build Coastguard Worker * Fold 256-bits to 128-bits.
165*86ee64e7SAndroid Build Coastguard Worker */
166*86ee64e7SAndroid Build Coastguard Worker a2 = _mm512_extracti32x4_epi32(x1, 3);
167*86ee64e7SAndroid Build Coastguard Worker a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
168*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
169*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a3);
170*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a2);
171*86ee64e7SAndroid Build Coastguard Worker
172*86ee64e7SAndroid Build Coastguard Worker /*
173*86ee64e7SAndroid Build Coastguard Worker * Fold 128-bits to 64-bits.
174*86ee64e7SAndroid Build Coastguard Worker */
175*86ee64e7SAndroid Build Coastguard Worker a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
176*86ee64e7SAndroid Build Coastguard Worker a3 = _mm_setr_epi32(~0, 0, ~0, 0);
177*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_srli_si128(a1, 8);
178*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a2);
179*86ee64e7SAndroid Build Coastguard Worker
180*86ee64e7SAndroid Build Coastguard Worker a0 = _mm_loadl_epi64((__m128i*)k7k8);
181*86ee64e7SAndroid Build Coastguard Worker a2 = _mm_srli_si128(a1, 4);
182*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_and_si128(a1, a3);
183*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
184*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a2);
185*86ee64e7SAndroid Build Coastguard Worker
186*86ee64e7SAndroid Build Coastguard Worker /*
187*86ee64e7SAndroid Build Coastguard Worker * Barret reduce to 32-bits.
188*86ee64e7SAndroid Build Coastguard Worker */
189*86ee64e7SAndroid Build Coastguard Worker a0 = _mm_load_si128((__m128i*)poly);
190*86ee64e7SAndroid Build Coastguard Worker
191*86ee64e7SAndroid Build Coastguard Worker a2 = _mm_and_si128(a1, a3);
192*86ee64e7SAndroid Build Coastguard Worker a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
193*86ee64e7SAndroid Build Coastguard Worker a2 = _mm_and_si128(a2, a3);
194*86ee64e7SAndroid Build Coastguard Worker a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
195*86ee64e7SAndroid Build Coastguard Worker a1 = _mm_xor_si128(a1, a2);
196*86ee64e7SAndroid Build Coastguard Worker
197*86ee64e7SAndroid Build Coastguard Worker /*
198*86ee64e7SAndroid Build Coastguard Worker * Return the crc32.
199*86ee64e7SAndroid Build Coastguard Worker */
200*86ee64e7SAndroid Build Coastguard Worker return _mm_extract_epi32(a1, 1);
201*86ee64e7SAndroid Build Coastguard Worker }
202*86ee64e7SAndroid Build Coastguard Worker
203*86ee64e7SAndroid Build Coastguard Worker #elif defined(CRC32_SIMD_SSE42_PCLMUL)
204*86ee64e7SAndroid Build Coastguard Worker
/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16.
 */
209*86ee64e7SAndroid Build Coastguard Worker
210*86ee64e7SAndroid Build Coastguard Worker #include <emmintrin.h>
211*86ee64e7SAndroid Build Coastguard Worker #include <smmintrin.h>
212*86ee64e7SAndroid Build Coastguard Worker #include <wmmintrin.h>
213*86ee64e7SAndroid Build Coastguard Worker
crc32_sse42_simd_(const unsigned char * buf,z_size_t len,uint32_t crc)214*86ee64e7SAndroid Build Coastguard Worker uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
215*86ee64e7SAndroid Build Coastguard Worker const unsigned char *buf,
216*86ee64e7SAndroid Build Coastguard Worker z_size_t len,
217*86ee64e7SAndroid Build Coastguard Worker uint32_t crc)
218*86ee64e7SAndroid Build Coastguard Worker {
219*86ee64e7SAndroid Build Coastguard Worker /*
220*86ee64e7SAndroid Build Coastguard Worker * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
221*86ee64e7SAndroid Build Coastguard Worker * the CRC32+Barrett polynomials given at the end of the paper.
222*86ee64e7SAndroid Build Coastguard Worker */
223*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
224*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
225*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
226*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
227*86ee64e7SAndroid Build Coastguard Worker
228*86ee64e7SAndroid Build Coastguard Worker __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
229*86ee64e7SAndroid Build Coastguard Worker
230*86ee64e7SAndroid Build Coastguard Worker /*
231*86ee64e7SAndroid Build Coastguard Worker * There's at least one block of 64.
232*86ee64e7SAndroid Build Coastguard Worker */
233*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
234*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
235*86ee64e7SAndroid Build Coastguard Worker x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
236*86ee64e7SAndroid Build Coastguard Worker x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
237*86ee64e7SAndroid Build Coastguard Worker
238*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
239*86ee64e7SAndroid Build Coastguard Worker
240*86ee64e7SAndroid Build Coastguard Worker x0 = _mm_load_si128((__m128i *)k1k2);
241*86ee64e7SAndroid Build Coastguard Worker
242*86ee64e7SAndroid Build Coastguard Worker buf += 64;
243*86ee64e7SAndroid Build Coastguard Worker len -= 64;
244*86ee64e7SAndroid Build Coastguard Worker
245*86ee64e7SAndroid Build Coastguard Worker /*
246*86ee64e7SAndroid Build Coastguard Worker * Parallel fold blocks of 64, if any.
247*86ee64e7SAndroid Build Coastguard Worker */
248*86ee64e7SAndroid Build Coastguard Worker while (len >= 64)
249*86ee64e7SAndroid Build Coastguard Worker {
250*86ee64e7SAndroid Build Coastguard Worker x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
251*86ee64e7SAndroid Build Coastguard Worker x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
252*86ee64e7SAndroid Build Coastguard Worker x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
253*86ee64e7SAndroid Build Coastguard Worker x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
254*86ee64e7SAndroid Build Coastguard Worker
255*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
256*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
257*86ee64e7SAndroid Build Coastguard Worker x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
258*86ee64e7SAndroid Build Coastguard Worker x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
259*86ee64e7SAndroid Build Coastguard Worker
260*86ee64e7SAndroid Build Coastguard Worker y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
261*86ee64e7SAndroid Build Coastguard Worker y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
262*86ee64e7SAndroid Build Coastguard Worker y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
263*86ee64e7SAndroid Build Coastguard Worker y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
264*86ee64e7SAndroid Build Coastguard Worker
265*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x5);
266*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_xor_si128(x2, x6);
267*86ee64e7SAndroid Build Coastguard Worker x3 = _mm_xor_si128(x3, x7);
268*86ee64e7SAndroid Build Coastguard Worker x4 = _mm_xor_si128(x4, x8);
269*86ee64e7SAndroid Build Coastguard Worker
270*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, y5);
271*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_xor_si128(x2, y6);
272*86ee64e7SAndroid Build Coastguard Worker x3 = _mm_xor_si128(x3, y7);
273*86ee64e7SAndroid Build Coastguard Worker x4 = _mm_xor_si128(x4, y8);
274*86ee64e7SAndroid Build Coastguard Worker
275*86ee64e7SAndroid Build Coastguard Worker buf += 64;
276*86ee64e7SAndroid Build Coastguard Worker len -= 64;
277*86ee64e7SAndroid Build Coastguard Worker }
278*86ee64e7SAndroid Build Coastguard Worker
279*86ee64e7SAndroid Build Coastguard Worker /*
280*86ee64e7SAndroid Build Coastguard Worker * Fold into 128-bits.
281*86ee64e7SAndroid Build Coastguard Worker */
282*86ee64e7SAndroid Build Coastguard Worker x0 = _mm_load_si128((__m128i *)k3k4);
283*86ee64e7SAndroid Build Coastguard Worker
284*86ee64e7SAndroid Build Coastguard Worker x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
285*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
286*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x2);
287*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x5);
288*86ee64e7SAndroid Build Coastguard Worker
289*86ee64e7SAndroid Build Coastguard Worker x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
290*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
291*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x3);
292*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x5);
293*86ee64e7SAndroid Build Coastguard Worker
294*86ee64e7SAndroid Build Coastguard Worker x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
295*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
296*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x4);
297*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x5);
298*86ee64e7SAndroid Build Coastguard Worker
299*86ee64e7SAndroid Build Coastguard Worker /*
300*86ee64e7SAndroid Build Coastguard Worker * Single fold blocks of 16, if any.
301*86ee64e7SAndroid Build Coastguard Worker */
302*86ee64e7SAndroid Build Coastguard Worker while (len >= 16)
303*86ee64e7SAndroid Build Coastguard Worker {
304*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_loadu_si128((__m128i *)buf);
305*86ee64e7SAndroid Build Coastguard Worker
306*86ee64e7SAndroid Build Coastguard Worker x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
307*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
308*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x2);
309*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x5);
310*86ee64e7SAndroid Build Coastguard Worker
311*86ee64e7SAndroid Build Coastguard Worker buf += 16;
312*86ee64e7SAndroid Build Coastguard Worker len -= 16;
313*86ee64e7SAndroid Build Coastguard Worker }
314*86ee64e7SAndroid Build Coastguard Worker
315*86ee64e7SAndroid Build Coastguard Worker /*
316*86ee64e7SAndroid Build Coastguard Worker * Fold 128-bits to 64-bits.
317*86ee64e7SAndroid Build Coastguard Worker */
318*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
319*86ee64e7SAndroid Build Coastguard Worker x3 = _mm_setr_epi32(~0, 0, ~0, 0);
320*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_srli_si128(x1, 8);
321*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x2);
322*86ee64e7SAndroid Build Coastguard Worker
323*86ee64e7SAndroid Build Coastguard Worker x0 = _mm_loadl_epi64((__m128i*)k5k0);
324*86ee64e7SAndroid Build Coastguard Worker
325*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_srli_si128(x1, 4);
326*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_and_si128(x1, x3);
327*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
328*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x2);
329*86ee64e7SAndroid Build Coastguard Worker
330*86ee64e7SAndroid Build Coastguard Worker /*
331*86ee64e7SAndroid Build Coastguard Worker * Barret reduce to 32-bits.
332*86ee64e7SAndroid Build Coastguard Worker */
333*86ee64e7SAndroid Build Coastguard Worker x0 = _mm_load_si128((__m128i*)poly);
334*86ee64e7SAndroid Build Coastguard Worker
335*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_and_si128(x1, x3);
336*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
337*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_and_si128(x2, x3);
338*86ee64e7SAndroid Build Coastguard Worker x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
339*86ee64e7SAndroid Build Coastguard Worker x1 = _mm_xor_si128(x1, x2);
340*86ee64e7SAndroid Build Coastguard Worker
341*86ee64e7SAndroid Build Coastguard Worker /*
342*86ee64e7SAndroid Build Coastguard Worker * Return the crc32.
343*86ee64e7SAndroid Build Coastguard Worker */
344*86ee64e7SAndroid Build Coastguard Worker return _mm_extract_epi32(x1, 1);
345*86ee64e7SAndroid Build Coastguard Worker }
346*86ee64e7SAndroid Build Coastguard Worker
347*86ee64e7SAndroid Build Coastguard Worker #elif defined(CRC32_ARMV8_CRC32)
348*86ee64e7SAndroid Build Coastguard Worker
/* CRC32 checksums using ARMv8-a crypto instructions.
 */
351*86ee64e7SAndroid Build Coastguard Worker
352*86ee64e7SAndroid Build Coastguard Worker #if defined(__clang__)
353*86ee64e7SAndroid Build Coastguard Worker /* We need some extra types for using PMULL.
354*86ee64e7SAndroid Build Coastguard Worker */
355*86ee64e7SAndroid Build Coastguard Worker #if defined(__aarch64__)
356*86ee64e7SAndroid Build Coastguard Worker #include <arm_neon.h>
357*86ee64e7SAndroid Build Coastguard Worker #include <arm_acle.h>
358*86ee64e7SAndroid Build Coastguard Worker #endif
359*86ee64e7SAndroid Build Coastguard Worker
360*86ee64e7SAndroid Build Coastguard Worker /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
361*86ee64e7SAndroid Build Coastguard Worker * armv8 target, which is incompatible with ThinLTO optimizations on Android.
362*86ee64e7SAndroid Build Coastguard Worker * (Namely, mixing and matching different module-level targets makes ThinLTO
363*86ee64e7SAndroid Build Coastguard Worker * warn, and Android defaults to armv7-a. This restriction does not apply to
364*86ee64e7SAndroid Build Coastguard Worker * function-level `target`s, however.)
365*86ee64e7SAndroid Build Coastguard Worker *
366*86ee64e7SAndroid Build Coastguard Worker * Since we only need four crc intrinsics, and since clang's implementation of
367*86ee64e7SAndroid Build Coastguard Worker * those are just wrappers around compiler builtins, it's simplest to #define
368*86ee64e7SAndroid Build Coastguard Worker * those builtins directly. If this #define list grows too much (or we depend on
369*86ee64e7SAndroid Build Coastguard Worker * an intrinsic that isn't a trivial wrapper), we may have to find a better way
370*86ee64e7SAndroid Build Coastguard Worker * to go about this.
371*86ee64e7SAndroid Build Coastguard Worker *
372*86ee64e7SAndroid Build Coastguard Worker * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
373*86ee64e7SAndroid Build Coastguard Worker * feature for this target (ignoring feature)." This appears to be a harmless
374*86ee64e7SAndroid Build Coastguard Worker * bug in clang.
375*86ee64e7SAndroid Build Coastguard Worker *
376*86ee64e7SAndroid Build Coastguard Worker * These definitions must appear *after* including arm_acle.h otherwise that
377*86ee64e7SAndroid Build Coastguard Worker * header may end up defining functions named __builtin_arm_crc32* that call
378*86ee64e7SAndroid Build Coastguard Worker * themselves, creating an infinite loop when the intrinsic is called.
379*86ee64e7SAndroid Build Coastguard Worker */
380*86ee64e7SAndroid Build Coastguard Worker /* XXX: Cannot hook into builtins with XCode for arm64. */
381*86ee64e7SAndroid Build Coastguard Worker #if !defined(ARMV8_OS_MACOS)
382*86ee64e7SAndroid Build Coastguard Worker #define __crc32b __builtin_arm_crc32b
383*86ee64e7SAndroid Build Coastguard Worker #define __crc32d __builtin_arm_crc32d
384*86ee64e7SAndroid Build Coastguard Worker #define __crc32w __builtin_arm_crc32w
385*86ee64e7SAndroid Build Coastguard Worker #define __crc32cw __builtin_arm_crc32cw
386*86ee64e7SAndroid Build Coastguard Worker #endif
387*86ee64e7SAndroid Build Coastguard Worker
388*86ee64e7SAndroid Build Coastguard Worker #if defined(__aarch64__)
389*86ee64e7SAndroid Build Coastguard Worker #define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
390*86ee64e7SAndroid Build Coastguard Worker #else // !defined(__aarch64__)
391*86ee64e7SAndroid Build Coastguard Worker #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
392*86ee64e7SAndroid Build Coastguard Worker #endif // defined(__aarch64__)
393*86ee64e7SAndroid Build Coastguard Worker
394*86ee64e7SAndroid Build Coastguard Worker #elif defined(__GNUC__)
395*86ee64e7SAndroid Build Coastguard Worker /* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
396*86ee64e7SAndroid Build Coastguard Worker * allowed. We can just include arm_acle.h.
397*86ee64e7SAndroid Build Coastguard Worker */
398*86ee64e7SAndroid Build Coastguard Worker #include <arm_acle.h>
399*86ee64e7SAndroid Build Coastguard Worker #include <arm_neon.h>
400*86ee64e7SAndroid Build Coastguard Worker #define TARGET_ARMV8_WITH_CRC
401*86ee64e7SAndroid Build Coastguard Worker #else // !defined(__GNUC__) && !defined(_aarch64__)
402*86ee64e7SAndroid Build Coastguard Worker #error ARM CRC32 SIMD extensions only supported for Clang and GCC
403*86ee64e7SAndroid Build Coastguard Worker #endif
404*86ee64e7SAndroid Build Coastguard Worker
405*86ee64e7SAndroid Build Coastguard Worker TARGET_ARMV8_WITH_CRC
armv8_crc32_little(const unsigned char * buf,z_size_t len,uint32_t crc)406*86ee64e7SAndroid Build Coastguard Worker uint32_t ZLIB_INTERNAL armv8_crc32_little(
407*86ee64e7SAndroid Build Coastguard Worker const unsigned char *buf,
408*86ee64e7SAndroid Build Coastguard Worker z_size_t len,
409*86ee64e7SAndroid Build Coastguard Worker uint32_t crc)
410*86ee64e7SAndroid Build Coastguard Worker {
411*86ee64e7SAndroid Build Coastguard Worker uint32_t c = (uint32_t) ~crc;
412*86ee64e7SAndroid Build Coastguard Worker
413*86ee64e7SAndroid Build Coastguard Worker while (len && ((uintptr_t)buf & 7)) {
414*86ee64e7SAndroid Build Coastguard Worker c = __crc32b(c, *buf++);
415*86ee64e7SAndroid Build Coastguard Worker --len;
416*86ee64e7SAndroid Build Coastguard Worker }
417*86ee64e7SAndroid Build Coastguard Worker
418*86ee64e7SAndroid Build Coastguard Worker const uint64_t *buf8 = (const uint64_t *)buf;
419*86ee64e7SAndroid Build Coastguard Worker
420*86ee64e7SAndroid Build Coastguard Worker while (len >= 64) {
421*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
422*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
423*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
424*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
425*86ee64e7SAndroid Build Coastguard Worker
426*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
427*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
428*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
429*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
430*86ee64e7SAndroid Build Coastguard Worker len -= 64;
431*86ee64e7SAndroid Build Coastguard Worker }
432*86ee64e7SAndroid Build Coastguard Worker
433*86ee64e7SAndroid Build Coastguard Worker while (len >= 8) {
434*86ee64e7SAndroid Build Coastguard Worker c = __crc32d(c, *buf8++);
435*86ee64e7SAndroid Build Coastguard Worker len -= 8;
436*86ee64e7SAndroid Build Coastguard Worker }
437*86ee64e7SAndroid Build Coastguard Worker
438*86ee64e7SAndroid Build Coastguard Worker buf = (const unsigned char *)buf8;
439*86ee64e7SAndroid Build Coastguard Worker
440*86ee64e7SAndroid Build Coastguard Worker while (len--) {
441*86ee64e7SAndroid Build Coastguard Worker c = __crc32b(c, *buf++);
442*86ee64e7SAndroid Build Coastguard Worker }
443*86ee64e7SAndroid Build Coastguard Worker
444*86ee64e7SAndroid Build Coastguard Worker return ~c;
445*86ee64e7SAndroid Build Coastguard Worker }
446*86ee64e7SAndroid Build Coastguard Worker
447*86ee64e7SAndroid Build Coastguard Worker #if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
448*86ee64e7SAndroid Build Coastguard Worker
/*
 * crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */
456*86ee64e7SAndroid Build Coastguard Worker TARGET_ARMV8_WITH_CRC
pmull_lo(const uint64x2_t a,const uint64x2_t b)457*86ee64e7SAndroid Build Coastguard Worker static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
458*86ee64e7SAndroid Build Coastguard Worker {
459*86ee64e7SAndroid Build Coastguard Worker uint8x16_t r;
460*86ee64e7SAndroid Build Coastguard Worker __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
461*86ee64e7SAndroid Build Coastguard Worker : "=w" (r) : "w" (a), "w" (b) );
462*86ee64e7SAndroid Build Coastguard Worker return r;
463*86ee64e7SAndroid Build Coastguard Worker }
464*86ee64e7SAndroid Build Coastguard Worker
465*86ee64e7SAndroid Build Coastguard Worker TARGET_ARMV8_WITH_CRC
pmull_01(const uint64x2_t a,const uint64x2_t b)466*86ee64e7SAndroid Build Coastguard Worker static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
467*86ee64e7SAndroid Build Coastguard Worker {
468*86ee64e7SAndroid Build Coastguard Worker uint8x16_t r;
469*86ee64e7SAndroid Build Coastguard Worker __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
470*86ee64e7SAndroid Build Coastguard Worker : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
471*86ee64e7SAndroid Build Coastguard Worker return r;
472*86ee64e7SAndroid Build Coastguard Worker }
473*86ee64e7SAndroid Build Coastguard Worker
474*86ee64e7SAndroid Build Coastguard Worker TARGET_ARMV8_WITH_CRC
pmull_hi(const uint64x2_t a,const uint64x2_t b)475*86ee64e7SAndroid Build Coastguard Worker static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
476*86ee64e7SAndroid Build Coastguard Worker {
477*86ee64e7SAndroid Build Coastguard Worker uint8x16_t r;
478*86ee64e7SAndroid Build Coastguard Worker __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
479*86ee64e7SAndroid Build Coastguard Worker : "=w" (r) : "w" (a), "w" (b) );
480*86ee64e7SAndroid Build Coastguard Worker return r;
481*86ee64e7SAndroid Build Coastguard Worker }
482*86ee64e7SAndroid Build Coastguard Worker
483*86ee64e7SAndroid Build Coastguard Worker TARGET_ARMV8_WITH_CRC
armv8_crc32_pmull_little(const unsigned char * buf,z_size_t len,uint32_t crc)484*86ee64e7SAndroid Build Coastguard Worker uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
485*86ee64e7SAndroid Build Coastguard Worker const unsigned char *buf,
486*86ee64e7SAndroid Build Coastguard Worker z_size_t len,
487*86ee64e7SAndroid Build Coastguard Worker uint32_t crc)
488*86ee64e7SAndroid Build Coastguard Worker {
489*86ee64e7SAndroid Build Coastguard Worker /*
490*86ee64e7SAndroid Build Coastguard Worker * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
491*86ee64e7SAndroid Build Coastguard Worker * the CRC32+Barrett polynomials given at the end of the paper.
492*86ee64e7SAndroid Build Coastguard Worker */
493*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
494*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
495*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
496*86ee64e7SAndroid Build Coastguard Worker static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
497*86ee64e7SAndroid Build Coastguard Worker
498*86ee64e7SAndroid Build Coastguard Worker uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
499*86ee64e7SAndroid Build Coastguard Worker
500*86ee64e7SAndroid Build Coastguard Worker /*
501*86ee64e7SAndroid Build Coastguard Worker * There's at least one block of 64.
502*86ee64e7SAndroid Build Coastguard Worker */
503*86ee64e7SAndroid Build Coastguard Worker x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
504*86ee64e7SAndroid Build Coastguard Worker x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
505*86ee64e7SAndroid Build Coastguard Worker x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
506*86ee64e7SAndroid Build Coastguard Worker x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
507*86ee64e7SAndroid Build Coastguard Worker
508*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
509*86ee64e7SAndroid Build Coastguard Worker
510*86ee64e7SAndroid Build Coastguard Worker x0 = vld1q_u64(k1k2);
511*86ee64e7SAndroid Build Coastguard Worker
512*86ee64e7SAndroid Build Coastguard Worker buf += 64;
513*86ee64e7SAndroid Build Coastguard Worker len -= 64;
514*86ee64e7SAndroid Build Coastguard Worker
515*86ee64e7SAndroid Build Coastguard Worker /*
516*86ee64e7SAndroid Build Coastguard Worker * Parallel fold blocks of 64, if any.
517*86ee64e7SAndroid Build Coastguard Worker */
518*86ee64e7SAndroid Build Coastguard Worker while (len >= 64)
519*86ee64e7SAndroid Build Coastguard Worker {
520*86ee64e7SAndroid Build Coastguard Worker x5 = (uint64x2_t) pmull_lo(x1, x0);
521*86ee64e7SAndroid Build Coastguard Worker x6 = (uint64x2_t) pmull_lo(x2, x0);
522*86ee64e7SAndroid Build Coastguard Worker x7 = (uint64x2_t) pmull_lo(x3, x0);
523*86ee64e7SAndroid Build Coastguard Worker x8 = (uint64x2_t) pmull_lo(x4, x0);
524*86ee64e7SAndroid Build Coastguard Worker
525*86ee64e7SAndroid Build Coastguard Worker y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
526*86ee64e7SAndroid Build Coastguard Worker y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
527*86ee64e7SAndroid Build Coastguard Worker y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
528*86ee64e7SAndroid Build Coastguard Worker y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
529*86ee64e7SAndroid Build Coastguard Worker
530*86ee64e7SAndroid Build Coastguard Worker x1 = (uint64x2_t) pmull_hi(x1, x0);
531*86ee64e7SAndroid Build Coastguard Worker x2 = (uint64x2_t) pmull_hi(x2, x0);
532*86ee64e7SAndroid Build Coastguard Worker x3 = (uint64x2_t) pmull_hi(x3, x0);
533*86ee64e7SAndroid Build Coastguard Worker x4 = (uint64x2_t) pmull_hi(x4, x0);
534*86ee64e7SAndroid Build Coastguard Worker
535*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x5);
536*86ee64e7SAndroid Build Coastguard Worker x2 = veorq_u64(x2, x6);
537*86ee64e7SAndroid Build Coastguard Worker x3 = veorq_u64(x3, x7);
538*86ee64e7SAndroid Build Coastguard Worker x4 = veorq_u64(x4, x8);
539*86ee64e7SAndroid Build Coastguard Worker
540*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, y5);
541*86ee64e7SAndroid Build Coastguard Worker x2 = veorq_u64(x2, y6);
542*86ee64e7SAndroid Build Coastguard Worker x3 = veorq_u64(x3, y7);
543*86ee64e7SAndroid Build Coastguard Worker x4 = veorq_u64(x4, y8);
544*86ee64e7SAndroid Build Coastguard Worker
545*86ee64e7SAndroid Build Coastguard Worker buf += 64;
546*86ee64e7SAndroid Build Coastguard Worker len -= 64;
547*86ee64e7SAndroid Build Coastguard Worker }
548*86ee64e7SAndroid Build Coastguard Worker
549*86ee64e7SAndroid Build Coastguard Worker /*
550*86ee64e7SAndroid Build Coastguard Worker * Fold into 128-bits.
551*86ee64e7SAndroid Build Coastguard Worker */
552*86ee64e7SAndroid Build Coastguard Worker x0 = vld1q_u64(k3k4);
553*86ee64e7SAndroid Build Coastguard Worker
554*86ee64e7SAndroid Build Coastguard Worker x5 = (uint64x2_t) pmull_lo(x1, x0);
555*86ee64e7SAndroid Build Coastguard Worker x1 = (uint64x2_t) pmull_hi(x1, x0);
556*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x2);
557*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x5);
558*86ee64e7SAndroid Build Coastguard Worker
559*86ee64e7SAndroid Build Coastguard Worker x5 = (uint64x2_t) pmull_lo(x1, x0);
560*86ee64e7SAndroid Build Coastguard Worker x1 = (uint64x2_t) pmull_hi(x1, x0);
561*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x3);
562*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x5);
563*86ee64e7SAndroid Build Coastguard Worker
564*86ee64e7SAndroid Build Coastguard Worker x5 = (uint64x2_t) pmull_lo(x1, x0);
565*86ee64e7SAndroid Build Coastguard Worker x1 = (uint64x2_t) pmull_hi(x1, x0);
566*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x4);
567*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x5);
568*86ee64e7SAndroid Build Coastguard Worker
569*86ee64e7SAndroid Build Coastguard Worker /*
570*86ee64e7SAndroid Build Coastguard Worker * Single fold blocks of 16, if any.
571*86ee64e7SAndroid Build Coastguard Worker */
572*86ee64e7SAndroid Build Coastguard Worker while (len >= 16)
573*86ee64e7SAndroid Build Coastguard Worker {
574*86ee64e7SAndroid Build Coastguard Worker x2 = vld1q_u64((const uint64_t *)buf);
575*86ee64e7SAndroid Build Coastguard Worker
576*86ee64e7SAndroid Build Coastguard Worker x5 = (uint64x2_t) pmull_lo(x1, x0);
577*86ee64e7SAndroid Build Coastguard Worker x1 = (uint64x2_t) pmull_hi(x1, x0);
578*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x2);
579*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x5);
580*86ee64e7SAndroid Build Coastguard Worker
581*86ee64e7SAndroid Build Coastguard Worker buf += 16;
582*86ee64e7SAndroid Build Coastguard Worker len -= 16;
583*86ee64e7SAndroid Build Coastguard Worker }
584*86ee64e7SAndroid Build Coastguard Worker
585*86ee64e7SAndroid Build Coastguard Worker /*
586*86ee64e7SAndroid Build Coastguard Worker * Fold 128-bits to 64-bits.
587*86ee64e7SAndroid Build Coastguard Worker */
588*86ee64e7SAndroid Build Coastguard Worker static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
589*86ee64e7SAndroid Build Coastguard Worker
590*86ee64e7SAndroid Build Coastguard Worker x2 = (uint64x2_t) pmull_01(x1, x0);
591*86ee64e7SAndroid Build Coastguard Worker x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
592*86ee64e7SAndroid Build Coastguard Worker x3 = (uint64x2_t) vld1q_u32(mask);
593*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x2);
594*86ee64e7SAndroid Build Coastguard Worker
595*86ee64e7SAndroid Build Coastguard Worker x0 = vld1q_u64(k5k0);
596*86ee64e7SAndroid Build Coastguard Worker
597*86ee64e7SAndroid Build Coastguard Worker x2 = (uint64x2_t) pmull_01(x2, x0);
598*86ee64e7SAndroid Build Coastguard Worker x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
599*86ee64e7SAndroid Build Coastguard Worker x1 = vandq_u64(x1, x3);
600*86ee64e7SAndroid Build Coastguard Worker x1 = (uint64x2_t) pmull_lo(x1, x0);
601*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x2);
602*86ee64e7SAndroid Build Coastguard Worker
603*86ee64e7SAndroid Build Coastguard Worker /*
604*86ee64e7SAndroid Build Coastguard Worker * Barret reduce to 32-bits.
605*86ee64e7SAndroid Build Coastguard Worker */
606*86ee64e7SAndroid Build Coastguard Worker x0 = vld1q_u64(poly);
607*86ee64e7SAndroid Build Coastguard Worker
608*86ee64e7SAndroid Build Coastguard Worker x2 = vandq_u64(x1, x3);
609*86ee64e7SAndroid Build Coastguard Worker x2 = (uint64x2_t) pmull_01(x2, x0);
610*86ee64e7SAndroid Build Coastguard Worker x2 = vandq_u64(x2, x3);
611*86ee64e7SAndroid Build Coastguard Worker x2 = (uint64x2_t) pmull_lo(x2, x0);
612*86ee64e7SAndroid Build Coastguard Worker x1 = veorq_u64(x1, x2);
613*86ee64e7SAndroid Build Coastguard Worker
614*86ee64e7SAndroid Build Coastguard Worker /*
615*86ee64e7SAndroid Build Coastguard Worker * Return the crc32.
616*86ee64e7SAndroid Build Coastguard Worker */
617*86ee64e7SAndroid Build Coastguard Worker return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
618*86ee64e7SAndroid Build Coastguard Worker }
619*86ee64e7SAndroid Build Coastguard Worker #endif /* aarch64 specific code. */
620*86ee64e7SAndroid Build Coastguard Worker
621*86ee64e7SAndroid Build Coastguard Worker #endif
622