xref: /aosp_15_r20/external/zlib/patches/0001-simd.patch (revision 86ee64e75fa5f8bce2c8c356138035642429cd05)
1*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc32.c b/crc32.c
2*86ee64e7SAndroid Build Coastguard Workerindex 9580440c0e6b..9162429cc7b4 100644
3*86ee64e7SAndroid Build Coastguard Worker--- a/crc32.c
4*86ee64e7SAndroid Build Coastguard Worker+++ b/crc32.c
5*86ee64e7SAndroid Build Coastguard Worker@@ -28,6 +28,8 @@
6*86ee64e7SAndroid Build Coastguard Worker #  endif /* !DYNAMIC_CRC_TABLE */
7*86ee64e7SAndroid Build Coastguard Worker #endif /* MAKECRCH */
8*86ee64e7SAndroid Build Coastguard Worker
9*86ee64e7SAndroid Build Coastguard Worker+#include "deflate.h"
10*86ee64e7SAndroid Build Coastguard Worker+#include "x86.h"
11*86ee64e7SAndroid Build Coastguard Worker #include "zutil.h"      /* for STDC and FAR definitions */
12*86ee64e7SAndroid Build Coastguard Worker
13*86ee64e7SAndroid Build Coastguard Worker /* Definitions for doing the crc four data bytes at a time. */
14*86ee64e7SAndroid Build Coastguard Worker@@ -440,3 +442,28 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
15*86ee64e7SAndroid Build Coastguard Worker {
16*86ee64e7SAndroid Build Coastguard Worker     return crc32_combine_(crc1, crc2, len2);
17*86ee64e7SAndroid Build Coastguard Worker }
18*86ee64e7SAndroid Build Coastguard Worker+
19*86ee64e7SAndroid Build Coastguard Worker+ZLIB_INTERNAL void crc_reset(deflate_state *const s)
20*86ee64e7SAndroid Build Coastguard Worker+{
21*86ee64e7SAndroid Build Coastguard Worker+    if (x86_cpu_enable_simd) {
22*86ee64e7SAndroid Build Coastguard Worker+        crc_fold_init(s);
23*86ee64e7SAndroid Build Coastguard Worker+        return;
24*86ee64e7SAndroid Build Coastguard Worker+    }
25*86ee64e7SAndroid Build Coastguard Worker+    s->strm->adler = crc32(0L, Z_NULL, 0);
26*86ee64e7SAndroid Build Coastguard Worker+}
27*86ee64e7SAndroid Build Coastguard Worker+
28*86ee64e7SAndroid Build Coastguard Worker+ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
29*86ee64e7SAndroid Build Coastguard Worker+{
30*86ee64e7SAndroid Build Coastguard Worker+    if (x86_cpu_enable_simd)
31*86ee64e7SAndroid Build Coastguard Worker+        s->strm->adler = crc_fold_512to32(s);
32*86ee64e7SAndroid Build Coastguard Worker+}
33*86ee64e7SAndroid Build Coastguard Worker+
34*86ee64e7SAndroid Build Coastguard Worker+ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
35*86ee64e7SAndroid Build Coastguard Worker+{
36*86ee64e7SAndroid Build Coastguard Worker+    if (x86_cpu_enable_simd) {
37*86ee64e7SAndroid Build Coastguard Worker+        crc_fold_copy(strm->state, dst, strm->next_in, size);
38*86ee64e7SAndroid Build Coastguard Worker+        return;
39*86ee64e7SAndroid Build Coastguard Worker+    }
40*86ee64e7SAndroid Build Coastguard Worker+    zmemcpy(dst, strm->next_in, size);
41*86ee64e7SAndroid Build Coastguard Worker+    strm->adler = crc32(strm->adler, dst, size);
42*86ee64e7SAndroid Build Coastguard Worker+}
43*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc_folding.c b/crc_folding.c
44*86ee64e7SAndroid Build Coastguard Workernew file mode 100644
45*86ee64e7SAndroid Build Coastguard Workerindex 000000000000..48d77744aaf4
46*86ee64e7SAndroid Build Coastguard Worker--- /dev/null
47*86ee64e7SAndroid Build Coastguard Worker+++ b/crc_folding.c
48*86ee64e7SAndroid Build Coastguard Worker@@ -0,0 +1,493 @@
49*86ee64e7SAndroid Build Coastguard Worker+/*
50*86ee64e7SAndroid Build Coastguard Worker+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
51*86ee64e7SAndroid Build Coastguard Worker+ * instruction.
52*86ee64e7SAndroid Build Coastguard Worker+ *
53*86ee64e7SAndroid Build Coastguard Worker+ * A white paper describing this algorithm can be found at:
54*86ee64e7SAndroid Build Coastguard Worker+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
55*86ee64e7SAndroid Build Coastguard Worker+ *
56*86ee64e7SAndroid Build Coastguard Worker+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
57*86ee64e7SAndroid Build Coastguard Worker+ * Authors:
58*86ee64e7SAndroid Build Coastguard Worker+ * 	Wajdi Feghali   <[email protected]>
59*86ee64e7SAndroid Build Coastguard Worker+ * 	Jim Guilford    <[email protected]>
60*86ee64e7SAndroid Build Coastguard Worker+ * 	Vinodh Gopal    <[email protected]>
61*86ee64e7SAndroid Build Coastguard Worker+ * 	Erdinc Ozturk   <[email protected]>
62*86ee64e7SAndroid Build Coastguard Worker+ * 	Jim Kukunas     <[email protected]>
63*86ee64e7SAndroid Build Coastguard Worker+ *
64*86ee64e7SAndroid Build Coastguard Worker+ * For conditions of distribution and use, see copyright notice in zlib.h
65*86ee64e7SAndroid Build Coastguard Worker+ */
66*86ee64e7SAndroid Build Coastguard Worker+
67*86ee64e7SAndroid Build Coastguard Worker+#include "deflate.h"
68*86ee64e7SAndroid Build Coastguard Worker+
69*86ee64e7SAndroid Build Coastguard Worker+#include <inttypes.h>
70*86ee64e7SAndroid Build Coastguard Worker+#include <emmintrin.h>
71*86ee64e7SAndroid Build Coastguard Worker+#include <immintrin.h>
72*86ee64e7SAndroid Build Coastguard Worker+#include <wmmintrin.h>
73*86ee64e7SAndroid Build Coastguard Worker+
74*86ee64e7SAndroid Build Coastguard Worker+#define CRC_LOAD(s) \
75*86ee64e7SAndroid Build Coastguard Worker+    do { \
76*86ee64e7SAndroid Build Coastguard Worker+        __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
77*86ee64e7SAndroid Build Coastguard Worker+        __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
78*86ee64e7SAndroid Build Coastguard Worker+        __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
79*86ee64e7SAndroid Build Coastguard Worker+        __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
80*86ee64e7SAndroid Build Coastguard Worker+        __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
81*86ee64e7SAndroid Build Coastguard Worker+
82*86ee64e7SAndroid Build Coastguard Worker+#define CRC_SAVE(s) \
83*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
84*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
85*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
86*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
87*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
88*86ee64e7SAndroid Build Coastguard Worker+    } while (0);
89*86ee64e7SAndroid Build Coastguard Worker+
90*86ee64e7SAndroid Build Coastguard Worker+ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
91*86ee64e7SAndroid Build Coastguard Worker+{
92*86ee64e7SAndroid Build Coastguard Worker+    CRC_LOAD(s)
93*86ee64e7SAndroid Build Coastguard Worker+
94*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
95*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc1 = _mm_setzero_si128();
96*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc2 = _mm_setzero_si128();
97*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_setzero_si128();
98*86ee64e7SAndroid Build Coastguard Worker+
99*86ee64e7SAndroid Build Coastguard Worker+    CRC_SAVE(s)
100*86ee64e7SAndroid Build Coastguard Worker+
101*86ee64e7SAndroid Build Coastguard Worker+    s->strm->adler = 0;
102*86ee64e7SAndroid Build Coastguard Worker+}
103*86ee64e7SAndroid Build Coastguard Worker+
104*86ee64e7SAndroid Build Coastguard Worker+local void fold_1(deflate_state *const s,
105*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc0, __m128i *xmm_crc1,
106*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc2, __m128i *xmm_crc3)
107*86ee64e7SAndroid Build Coastguard Worker+{
108*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_fold4 = _mm_set_epi32(
109*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0x54442bd4,
110*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0xc6e41596);
111*86ee64e7SAndroid Build Coastguard Worker+
112*86ee64e7SAndroid Build Coastguard Worker+    __m128i x_tmp3;
113*86ee64e7SAndroid Build Coastguard Worker+    __m128 ps_crc0, ps_crc3, ps_res;
114*86ee64e7SAndroid Build Coastguard Worker+
115*86ee64e7SAndroid Build Coastguard Worker+    x_tmp3 = *xmm_crc3;
116*86ee64e7SAndroid Build Coastguard Worker+
117*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = *xmm_crc0;
118*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
119*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
120*86ee64e7SAndroid Build Coastguard Worker+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
121*86ee64e7SAndroid Build Coastguard Worker+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
122*86ee64e7SAndroid Build Coastguard Worker+    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
123*86ee64e7SAndroid Build Coastguard Worker+
124*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = *xmm_crc1;
125*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = *xmm_crc2;
126*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = x_tmp3;
127*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_castps_si128(ps_res);
128*86ee64e7SAndroid Build Coastguard Worker+}
129*86ee64e7SAndroid Build Coastguard Worker+
130*86ee64e7SAndroid Build Coastguard Worker+local void fold_2(deflate_state *const s,
131*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc0, __m128i *xmm_crc1,
132*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc2, __m128i *xmm_crc3)
133*86ee64e7SAndroid Build Coastguard Worker+{
134*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_fold4 = _mm_set_epi32(
135*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0x54442bd4,
136*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0xc6e41596);
137*86ee64e7SAndroid Build Coastguard Worker+
138*86ee64e7SAndroid Build Coastguard Worker+    __m128i x_tmp3, x_tmp2;
139*86ee64e7SAndroid Build Coastguard Worker+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
140*86ee64e7SAndroid Build Coastguard Worker+
141*86ee64e7SAndroid Build Coastguard Worker+    x_tmp3 = *xmm_crc3;
142*86ee64e7SAndroid Build Coastguard Worker+    x_tmp2 = *xmm_crc2;
143*86ee64e7SAndroid Build Coastguard Worker+
144*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = *xmm_crc1;
145*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
146*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
147*86ee64e7SAndroid Build Coastguard Worker+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
148*86ee64e7SAndroid Build Coastguard Worker+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
149*86ee64e7SAndroid Build Coastguard Worker+    ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
150*86ee64e7SAndroid Build Coastguard Worker+
151*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = *xmm_crc0;
152*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
153*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
154*86ee64e7SAndroid Build Coastguard Worker+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
155*86ee64e7SAndroid Build Coastguard Worker+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
156*86ee64e7SAndroid Build Coastguard Worker+    ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
157*86ee64e7SAndroid Build Coastguard Worker+
158*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = x_tmp2;
159*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = x_tmp3;
160*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_castps_si128(ps_res20);
161*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_castps_si128(ps_res31);
162*86ee64e7SAndroid Build Coastguard Worker+}
163*86ee64e7SAndroid Build Coastguard Worker+
164*86ee64e7SAndroid Build Coastguard Worker+local void fold_3(deflate_state *const s,
165*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc0, __m128i *xmm_crc1,
166*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc2, __m128i *xmm_crc3)
167*86ee64e7SAndroid Build Coastguard Worker+{
168*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_fold4 = _mm_set_epi32(
169*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0x54442bd4,
170*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0xc6e41596);
171*86ee64e7SAndroid Build Coastguard Worker+
172*86ee64e7SAndroid Build Coastguard Worker+    __m128i x_tmp3;
173*86ee64e7SAndroid Build Coastguard Worker+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
174*86ee64e7SAndroid Build Coastguard Worker+
175*86ee64e7SAndroid Build Coastguard Worker+    x_tmp3 = *xmm_crc3;
176*86ee64e7SAndroid Build Coastguard Worker+
177*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = *xmm_crc2;
178*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
179*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
180*86ee64e7SAndroid Build Coastguard Worker+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
181*86ee64e7SAndroid Build Coastguard Worker+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
182*86ee64e7SAndroid Build Coastguard Worker+    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
183*86ee64e7SAndroid Build Coastguard Worker+
184*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = *xmm_crc1;
185*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
186*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
187*86ee64e7SAndroid Build Coastguard Worker+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
188*86ee64e7SAndroid Build Coastguard Worker+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
189*86ee64e7SAndroid Build Coastguard Worker+    ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
190*86ee64e7SAndroid Build Coastguard Worker+
191*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = *xmm_crc0;
192*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
193*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
194*86ee64e7SAndroid Build Coastguard Worker+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
195*86ee64e7SAndroid Build Coastguard Worker+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
196*86ee64e7SAndroid Build Coastguard Worker+    ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
197*86ee64e7SAndroid Build Coastguard Worker+
198*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = x_tmp3;
199*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_castps_si128(ps_res10);
200*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_castps_si128(ps_res21);
201*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_castps_si128(ps_res32);
202*86ee64e7SAndroid Build Coastguard Worker+}
203*86ee64e7SAndroid Build Coastguard Worker+
204*86ee64e7SAndroid Build Coastguard Worker+local void fold_4(deflate_state *const s,
205*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc0, __m128i *xmm_crc1,
206*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc2, __m128i *xmm_crc3)
207*86ee64e7SAndroid Build Coastguard Worker+{
208*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_fold4 = _mm_set_epi32(
209*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0x54442bd4,
210*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0xc6e41596);
211*86ee64e7SAndroid Build Coastguard Worker+
212*86ee64e7SAndroid Build Coastguard Worker+    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
213*86ee64e7SAndroid Build Coastguard Worker+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
214*86ee64e7SAndroid Build Coastguard Worker+    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
215*86ee64e7SAndroid Build Coastguard Worker+    __m128 ps_res0, ps_res1, ps_res2, ps_res3;
216*86ee64e7SAndroid Build Coastguard Worker+
217*86ee64e7SAndroid Build Coastguard Worker+    x_tmp0 = *xmm_crc0;
218*86ee64e7SAndroid Build Coastguard Worker+    x_tmp1 = *xmm_crc1;
219*86ee64e7SAndroid Build Coastguard Worker+    x_tmp2 = *xmm_crc2;
220*86ee64e7SAndroid Build Coastguard Worker+    x_tmp3 = *xmm_crc3;
221*86ee64e7SAndroid Build Coastguard Worker+
222*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
223*86ee64e7SAndroid Build Coastguard Worker+    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
224*86ee64e7SAndroid Build Coastguard Worker+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
225*86ee64e7SAndroid Build Coastguard Worker+    ps_t0 = _mm_castsi128_ps(x_tmp0);
226*86ee64e7SAndroid Build Coastguard Worker+    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
227*86ee64e7SAndroid Build Coastguard Worker+
228*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
229*86ee64e7SAndroid Build Coastguard Worker+    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
230*86ee64e7SAndroid Build Coastguard Worker+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
231*86ee64e7SAndroid Build Coastguard Worker+    ps_t1 = _mm_castsi128_ps(x_tmp1);
232*86ee64e7SAndroid Build Coastguard Worker+    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
233*86ee64e7SAndroid Build Coastguard Worker+
234*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
235*86ee64e7SAndroid Build Coastguard Worker+    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
236*86ee64e7SAndroid Build Coastguard Worker+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
237*86ee64e7SAndroid Build Coastguard Worker+    ps_t2 = _mm_castsi128_ps(x_tmp2);
238*86ee64e7SAndroid Build Coastguard Worker+    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
239*86ee64e7SAndroid Build Coastguard Worker+
240*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
241*86ee64e7SAndroid Build Coastguard Worker+    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
242*86ee64e7SAndroid Build Coastguard Worker+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
243*86ee64e7SAndroid Build Coastguard Worker+    ps_t3 = _mm_castsi128_ps(x_tmp3);
244*86ee64e7SAndroid Build Coastguard Worker+    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
245*86ee64e7SAndroid Build Coastguard Worker+
246*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = _mm_castps_si128(ps_res0);
247*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_castps_si128(ps_res1);
248*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_castps_si128(ps_res2);
249*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_castps_si128(ps_res3);
250*86ee64e7SAndroid Build Coastguard Worker+}
251*86ee64e7SAndroid Build Coastguard Worker+
252*86ee64e7SAndroid Build Coastguard Worker+local const unsigned zalign(32) pshufb_shf_table[60] = {
253*86ee64e7SAndroid Build Coastguard Worker+	0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
254*86ee64e7SAndroid Build Coastguard Worker+	0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */
255*86ee64e7SAndroid Build Coastguard Worker+	0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */
256*86ee64e7SAndroid Build Coastguard Worker+	0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
257*86ee64e7SAndroid Build Coastguard Worker+	0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
258*86ee64e7SAndroid Build Coastguard Worker+	0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
259*86ee64e7SAndroid Build Coastguard Worker+	0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl  9 (16 - 7)/shr7 */
260*86ee64e7SAndroid Build Coastguard Worker+	0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl  8 (16 - 8)/shr8 */
261*86ee64e7SAndroid Build Coastguard Worker+	0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl  7 (16 - 9)/shr9 */
262*86ee64e7SAndroid Build Coastguard Worker+	0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl  6 (16 -10)/shr10*/
263*86ee64e7SAndroid Build Coastguard Worker+	0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl  5 (16 -11)/shr11*/
264*86ee64e7SAndroid Build Coastguard Worker+	0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl  4 (16 -12)/shr12*/
265*86ee64e7SAndroid Build Coastguard Worker+	0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
266*86ee64e7SAndroid Build Coastguard Worker+	0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
267*86ee64e7SAndroid Build Coastguard Worker+	0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
268*86ee64e7SAndroid Build Coastguard Worker+};
269*86ee64e7SAndroid Build Coastguard Worker+
270*86ee64e7SAndroid Build Coastguard Worker+local void partial_fold(deflate_state *const s, const size_t len,
271*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc0, __m128i *xmm_crc1,
272*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc2, __m128i *xmm_crc3,
273*86ee64e7SAndroid Build Coastguard Worker+        __m128i *xmm_crc_part)
274*86ee64e7SAndroid Build Coastguard Worker+{
275*86ee64e7SAndroid Build Coastguard Worker+
276*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_fold4 = _mm_set_epi32(
277*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0x54442bd4,
278*86ee64e7SAndroid Build Coastguard Worker+            0x00000001, 0xc6e41596);
279*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
280*86ee64e7SAndroid Build Coastguard Worker+
281*86ee64e7SAndroid Build Coastguard Worker+    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
282*86ee64e7SAndroid Build Coastguard Worker+    __m128i xmm_a0_0, xmm_a0_1;
283*86ee64e7SAndroid Build Coastguard Worker+    __m128 ps_crc3, psa0_0, psa0_1, ps_res;
284*86ee64e7SAndroid Build Coastguard Worker+
285*86ee64e7SAndroid Build Coastguard Worker+    xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
286*86ee64e7SAndroid Build Coastguard Worker+    xmm_shr = xmm_shl;
287*86ee64e7SAndroid Build Coastguard Worker+    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
288*86ee64e7SAndroid Build Coastguard Worker+
289*86ee64e7SAndroid Build Coastguard Worker+    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
290*86ee64e7SAndroid Build Coastguard Worker+
291*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
292*86ee64e7SAndroid Build Coastguard Worker+    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
293*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
294*86ee64e7SAndroid Build Coastguard Worker+
295*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
296*86ee64e7SAndroid Build Coastguard Worker+    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
297*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
298*86ee64e7SAndroid Build Coastguard Worker+
299*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
300*86ee64e7SAndroid Build Coastguard Worker+    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
301*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
302*86ee64e7SAndroid Build Coastguard Worker+
303*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
304*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
305*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
306*86ee64e7SAndroid Build Coastguard Worker+
307*86ee64e7SAndroid Build Coastguard Worker+    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
308*86ee64e7SAndroid Build Coastguard Worker+    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
309*86ee64e7SAndroid Build Coastguard Worker+
310*86ee64e7SAndroid Build Coastguard Worker+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
311*86ee64e7SAndroid Build Coastguard Worker+    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
312*86ee64e7SAndroid Build Coastguard Worker+    psa0_1 = _mm_castsi128_ps(xmm_a0_1);
313*86ee64e7SAndroid Build Coastguard Worker+
314*86ee64e7SAndroid Build Coastguard Worker+    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
315*86ee64e7SAndroid Build Coastguard Worker+    ps_res = _mm_xor_ps(ps_res, psa0_1);
316*86ee64e7SAndroid Build Coastguard Worker+
317*86ee64e7SAndroid Build Coastguard Worker+    *xmm_crc3 = _mm_castps_si128(ps_res);
318*86ee64e7SAndroid Build Coastguard Worker+}
319*86ee64e7SAndroid Build Coastguard Worker+
320*86ee64e7SAndroid Build Coastguard Worker+ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
321*86ee64e7SAndroid Build Coastguard Worker+        unsigned char *dst, const unsigned char *src, long len)
322*86ee64e7SAndroid Build Coastguard Worker+{
323*86ee64e7SAndroid Build Coastguard Worker+    unsigned long algn_diff;
324*86ee64e7SAndroid Build Coastguard Worker+    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
325*86ee64e7SAndroid Build Coastguard Worker+
326*86ee64e7SAndroid Build Coastguard Worker+    CRC_LOAD(s)
327*86ee64e7SAndroid Build Coastguard Worker+
328*86ee64e7SAndroid Build Coastguard Worker+    if (len < 16) {
329*86ee64e7SAndroid Build Coastguard Worker+        if (len == 0)
330*86ee64e7SAndroid Build Coastguard Worker+            return;
331*86ee64e7SAndroid Build Coastguard Worker+        goto partial;
332*86ee64e7SAndroid Build Coastguard Worker+    }
333*86ee64e7SAndroid Build Coastguard Worker+
334*86ee64e7SAndroid Build Coastguard Worker+    algn_diff = 0 - (uintptr_t)src & 0xF;
335*86ee64e7SAndroid Build Coastguard Worker+    if (algn_diff) {
336*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
337*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
338*86ee64e7SAndroid Build Coastguard Worker+
339*86ee64e7SAndroid Build Coastguard Worker+        dst += algn_diff;
340*86ee64e7SAndroid Build Coastguard Worker+        src += algn_diff;
341*86ee64e7SAndroid Build Coastguard Worker+        len -= algn_diff;
342*86ee64e7SAndroid Build Coastguard Worker+
343*86ee64e7SAndroid Build Coastguard Worker+        partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
344*86ee64e7SAndroid Build Coastguard Worker+            &xmm_crc_part);
345*86ee64e7SAndroid Build Coastguard Worker+    }
346*86ee64e7SAndroid Build Coastguard Worker+
347*86ee64e7SAndroid Build Coastguard Worker+    while ((len -= 64) >= 0) {
348*86ee64e7SAndroid Build Coastguard Worker+        xmm_t0 = _mm_load_si128((__m128i *)src);
349*86ee64e7SAndroid Build Coastguard Worker+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
350*86ee64e7SAndroid Build Coastguard Worker+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
351*86ee64e7SAndroid Build Coastguard Worker+        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
352*86ee64e7SAndroid Build Coastguard Worker+
353*86ee64e7SAndroid Build Coastguard Worker+        fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
354*86ee64e7SAndroid Build Coastguard Worker+
355*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
356*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
357*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
358*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
359*86ee64e7SAndroid Build Coastguard Worker+
360*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
361*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
362*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
363*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
364*86ee64e7SAndroid Build Coastguard Worker+
365*86ee64e7SAndroid Build Coastguard Worker+        src += 64;
366*86ee64e7SAndroid Build Coastguard Worker+        dst += 64;
367*86ee64e7SAndroid Build Coastguard Worker+    }
368*86ee64e7SAndroid Build Coastguard Worker+
369*86ee64e7SAndroid Build Coastguard Worker+    /*
370*86ee64e7SAndroid Build Coastguard Worker+     * len = num bytes left - 64
371*86ee64e7SAndroid Build Coastguard Worker+     */
372*86ee64e7SAndroid Build Coastguard Worker+    if (len + 16 >= 0) {
373*86ee64e7SAndroid Build Coastguard Worker+        len += 16;
374*86ee64e7SAndroid Build Coastguard Worker+
375*86ee64e7SAndroid Build Coastguard Worker+        xmm_t0 = _mm_load_si128((__m128i *)src);
376*86ee64e7SAndroid Build Coastguard Worker+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
377*86ee64e7SAndroid Build Coastguard Worker+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
378*86ee64e7SAndroid Build Coastguard Worker+
379*86ee64e7SAndroid Build Coastguard Worker+        fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
380*86ee64e7SAndroid Build Coastguard Worker+
381*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
382*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
383*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
384*86ee64e7SAndroid Build Coastguard Worker+
385*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
386*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
387*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
388*86ee64e7SAndroid Build Coastguard Worker+
389*86ee64e7SAndroid Build Coastguard Worker+        if (len == 0)
390*86ee64e7SAndroid Build Coastguard Worker+            goto done;
391*86ee64e7SAndroid Build Coastguard Worker+
392*86ee64e7SAndroid Build Coastguard Worker+        dst += 48;
393*86ee64e7SAndroid Build Coastguard Worker+        src += 48;
394*86ee64e7SAndroid Build Coastguard Worker+    } else if (len + 32 >= 0) {
395*86ee64e7SAndroid Build Coastguard Worker+        len += 32;
396*86ee64e7SAndroid Build Coastguard Worker+
397*86ee64e7SAndroid Build Coastguard Worker+        xmm_t0 = _mm_load_si128((__m128i *)src);
398*86ee64e7SAndroid Build Coastguard Worker+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
399*86ee64e7SAndroid Build Coastguard Worker+
400*86ee64e7SAndroid Build Coastguard Worker+        fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
401*86ee64e7SAndroid Build Coastguard Worker+
402*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
403*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
404*86ee64e7SAndroid Build Coastguard Worker+
405*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
406*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
407*86ee64e7SAndroid Build Coastguard Worker+
408*86ee64e7SAndroid Build Coastguard Worker+        if (len == 0)
409*86ee64e7SAndroid Build Coastguard Worker+            goto done;
410*86ee64e7SAndroid Build Coastguard Worker+
411*86ee64e7SAndroid Build Coastguard Worker+        dst += 32;
412*86ee64e7SAndroid Build Coastguard Worker+        src += 32;
413*86ee64e7SAndroid Build Coastguard Worker+    } else if (len + 48 >= 0) {
414*86ee64e7SAndroid Build Coastguard Worker+        len += 48;
415*86ee64e7SAndroid Build Coastguard Worker+
416*86ee64e7SAndroid Build Coastguard Worker+        xmm_t0 = _mm_load_si128((__m128i *)src);
417*86ee64e7SAndroid Build Coastguard Worker+
418*86ee64e7SAndroid Build Coastguard Worker+        fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419*86ee64e7SAndroid Build Coastguard Worker+
420*86ee64e7SAndroid Build Coastguard Worker+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
421*86ee64e7SAndroid Build Coastguard Worker+
422*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
423*86ee64e7SAndroid Build Coastguard Worker+
424*86ee64e7SAndroid Build Coastguard Worker+        if (len == 0)
425*86ee64e7SAndroid Build Coastguard Worker+            goto done;
426*86ee64e7SAndroid Build Coastguard Worker+
427*86ee64e7SAndroid Build Coastguard Worker+        dst += 16;
428*86ee64e7SAndroid Build Coastguard Worker+        src += 16;
429*86ee64e7SAndroid Build Coastguard Worker+    } else {
430*86ee64e7SAndroid Build Coastguard Worker+        len += 64;
431*86ee64e7SAndroid Build Coastguard Worker+        if (len == 0)
432*86ee64e7SAndroid Build Coastguard Worker+            goto done;
433*86ee64e7SAndroid Build Coastguard Worker+    }
434*86ee64e7SAndroid Build Coastguard Worker+
435*86ee64e7SAndroid Build Coastguard Worker+partial:
436*86ee64e7SAndroid Build Coastguard Worker+
437*86ee64e7SAndroid Build Coastguard Worker+#if defined(_MSC_VER)
438*86ee64e7SAndroid Build Coastguard Worker+    /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
439*86ee64e7SAndroid Build Coastguard Worker+    {
440*86ee64e7SAndroid Build Coastguard Worker+        int32_t parts[4] = {0, 0, 0, 0};
441*86ee64e7SAndroid Build Coastguard Worker+        memcpy(&parts, src, len);
442*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
443*86ee64e7SAndroid Build Coastguard Worker+    }
444*86ee64e7SAndroid Build Coastguard Worker+#else
445*86ee64e7SAndroid Build Coastguard Worker+    {
446*86ee64e7SAndroid Build Coastguard Worker+        int64_t parts[2] = {0, 0};
447*86ee64e7SAndroid Build Coastguard Worker+        memcpy(&parts, src, len);
448*86ee64e7SAndroid Build Coastguard Worker+        xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
449*86ee64e7SAndroid Build Coastguard Worker+    }
450*86ee64e7SAndroid Build Coastguard Worker+#endif
451*86ee64e7SAndroid Build Coastguard Worker+
452*86ee64e7SAndroid Build Coastguard Worker+    zmemcpy(dst, src, len);  /* TODO: Possibly generate more efficient code. */
453*86ee64e7SAndroid Build Coastguard Worker+    partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
454*86ee64e7SAndroid Build Coastguard Worker+        &xmm_crc_part);
455*86ee64e7SAndroid Build Coastguard Worker+done:
456*86ee64e7SAndroid Build Coastguard Worker+    CRC_SAVE(s)
457*86ee64e7SAndroid Build Coastguard Worker+}
458*86ee64e7SAndroid Build Coastguard Worker+
459*86ee64e7SAndroid Build Coastguard Worker+local const unsigned zalign(16) crc_k[] = {
460*86ee64e7SAndroid Build Coastguard Worker+    0xccaa009e, 0x00000000, /* rk1 */
461*86ee64e7SAndroid Build Coastguard Worker+    0x751997d0, 0x00000001, /* rk2 */
462*86ee64e7SAndroid Build Coastguard Worker+    0xccaa009e, 0x00000000, /* rk5 */
463*86ee64e7SAndroid Build Coastguard Worker+    0x63cd6124, 0x00000001, /* rk6 */
464*86ee64e7SAndroid Build Coastguard Worker+    0xf7011640, 0x00000001, /* rk7 */
465*86ee64e7SAndroid Build Coastguard Worker+    0xdb710640, 0x00000001  /* rk8 */
466*86ee64e7SAndroid Build Coastguard Worker+};
467*86ee64e7SAndroid Build Coastguard Worker+
468*86ee64e7SAndroid Build Coastguard Worker+local const unsigned zalign(16) crc_mask[4] = {
469*86ee64e7SAndroid Build Coastguard Worker+    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
470*86ee64e7SAndroid Build Coastguard Worker+};
471*86ee64e7SAndroid Build Coastguard Worker+
472*86ee64e7SAndroid Build Coastguard Worker+local const unsigned zalign(16) crc_mask2[4] = {
473*86ee64e7SAndroid Build Coastguard Worker+    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
474*86ee64e7SAndroid Build Coastguard Worker+};
475*86ee64e7SAndroid Build Coastguard Worker+
476*86ee64e7SAndroid Build Coastguard Worker+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
477*86ee64e7SAndroid Build Coastguard Worker+{
478*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
479*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
480*86ee64e7SAndroid Build Coastguard Worker+
481*86ee64e7SAndroid Build Coastguard Worker+    unsigned crc;
482*86ee64e7SAndroid Build Coastguard Worker+    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
483*86ee64e7SAndroid Build Coastguard Worker+
484*86ee64e7SAndroid Build Coastguard Worker+    CRC_LOAD(s)
485*86ee64e7SAndroid Build Coastguard Worker+
486*86ee64e7SAndroid Build Coastguard Worker+    /*
487*86ee64e7SAndroid Build Coastguard Worker+     * k1
488*86ee64e7SAndroid Build Coastguard Worker+     */
489*86ee64e7SAndroid Build Coastguard Worker+    crc_fold = _mm_load_si128((__m128i *)crc_k);
490*86ee64e7SAndroid Build Coastguard Worker+
491*86ee64e7SAndroid Build Coastguard Worker+    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
492*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
493*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
494*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
495*86ee64e7SAndroid Build Coastguard Worker+
496*86ee64e7SAndroid Build Coastguard Worker+    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
497*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
498*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
499*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
500*86ee64e7SAndroid Build Coastguard Worker+
501*86ee64e7SAndroid Build Coastguard Worker+    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
502*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
503*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
504*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
505*86ee64e7SAndroid Build Coastguard Worker+
506*86ee64e7SAndroid Build Coastguard Worker+    /*
507*86ee64e7SAndroid Build Coastguard Worker+     * k5
508*86ee64e7SAndroid Build Coastguard Worker+     */
509*86ee64e7SAndroid Build Coastguard Worker+    crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
510*86ee64e7SAndroid Build Coastguard Worker+
511*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc0 = xmm_crc3;
512*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
513*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
514*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
515*86ee64e7SAndroid Build Coastguard Worker+
516*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc0 = xmm_crc3;
517*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
518*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
519*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
520*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
521*86ee64e7SAndroid Build Coastguard Worker+
522*86ee64e7SAndroid Build Coastguard Worker+    /*
523*86ee64e7SAndroid Build Coastguard Worker+     * k7
524*86ee64e7SAndroid Build Coastguard Worker+     */
525*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc1 = xmm_crc3;
526*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc2 = xmm_crc3;
527*86ee64e7SAndroid Build Coastguard Worker+    crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
528*86ee64e7SAndroid Build Coastguard Worker+
529*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
530*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
531*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
532*86ee64e7SAndroid Build Coastguard Worker+
533*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc2 = xmm_crc3;
534*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
535*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
536*86ee64e7SAndroid Build Coastguard Worker+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
537*86ee64e7SAndroid Build Coastguard Worker+
538*86ee64e7SAndroid Build Coastguard Worker+    crc = _mm_extract_epi32(xmm_crc3, 2);
539*86ee64e7SAndroid Build Coastguard Worker+    return ~crc;
540*86ee64e7SAndroid Build Coastguard Worker+    CRC_SAVE(s)
541*86ee64e7SAndroid Build Coastguard Worker+}
542*86ee64e7SAndroid Build Coastguard Workerdiff --git a/deflate.c b/deflate.c
543*86ee64e7SAndroid Build Coastguard Workerindex 1ec761448de9..aa0c9c67a6dc 100644
544*86ee64e7SAndroid Build Coastguard Worker--- a/deflate.c
545*86ee64e7SAndroid Build Coastguard Worker+++ b/deflate.c
546*86ee64e7SAndroid Build Coastguard Worker@@ -48,8 +48,9 @@
547*86ee64e7SAndroid Build Coastguard Worker  */
548*86ee64e7SAndroid Build Coastguard Worker
549*86ee64e7SAndroid Build Coastguard Worker /* @(#) $Id$ */
550*86ee64e7SAndroid Build Coastguard Worker-
551*86ee64e7SAndroid Build Coastguard Worker+#include <assert.h>
552*86ee64e7SAndroid Build Coastguard Worker #include "deflate.h"
553*86ee64e7SAndroid Build Coastguard Worker+#include "x86.h"
554*86ee64e7SAndroid Build Coastguard Worker
555*86ee64e7SAndroid Build Coastguard Worker const char deflate_copyright[] =
556*86ee64e7SAndroid Build Coastguard Worker    " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
557*86ee64e7SAndroid Build Coastguard Worker@@ -86,7 +87,7 @@ local block_state deflate_huff   OF((deflate_state *s, int flush));
558*86ee64e7SAndroid Build Coastguard Worker local void lm_init        OF((deflate_state *s));
559*86ee64e7SAndroid Build Coastguard Worker local void putShortMSB    OF((deflate_state *s, uInt b));
560*86ee64e7SAndroid Build Coastguard Worker local void flush_pending  OF((z_streamp strm));
561*86ee64e7SAndroid Build Coastguard Worker-local unsigned read_buf   OF((z_streamp strm, Bytef *buf, unsigned size));
562*86ee64e7SAndroid Build Coastguard Worker+unsigned ZLIB_INTERNAL deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
563*86ee64e7SAndroid Build Coastguard Worker #ifdef ASMV
564*86ee64e7SAndroid Build Coastguard Worker #  pragma message("Assembler code may have bugs -- use at your own risk")
565*86ee64e7SAndroid Build Coastguard Worker       void match_init OF((void)); /* asm code initialization */
566*86ee64e7SAndroid Build Coastguard Worker@@ -100,6 +101,20 @@ local  void check_match OF((deflate_state *s, IPos start, IPos match,
567*86ee64e7SAndroid Build Coastguard Worker                             int length));
568*86ee64e7SAndroid Build Coastguard Worker #endif
569*86ee64e7SAndroid Build Coastguard Worker
570*86ee64e7SAndroid Build Coastguard Worker+/* From crc32.c */
571*86ee64e7SAndroid Build Coastguard Worker+extern void ZLIB_INTERNAL crc_reset(deflate_state *const s);
572*86ee64e7SAndroid Build Coastguard Worker+extern void ZLIB_INTERNAL crc_finalize(deflate_state *const s);
573*86ee64e7SAndroid Build Coastguard Worker+extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size);
574*86ee64e7SAndroid Build Coastguard Worker+
575*86ee64e7SAndroid Build Coastguard Worker+#ifdef _MSC_VER
576*86ee64e7SAndroid Build Coastguard Worker+#define INLINE __inline
577*86ee64e7SAndroid Build Coastguard Worker+#else
578*86ee64e7SAndroid Build Coastguard Worker+#define INLINE inline
579*86ee64e7SAndroid Build Coastguard Worker+#endif
580*86ee64e7SAndroid Build Coastguard Worker+
581*86ee64e7SAndroid Build Coastguard Worker+/* Inline optimisation */
582*86ee64e7SAndroid Build Coastguard Worker+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str);
583*86ee64e7SAndroid Build Coastguard Worker+
584*86ee64e7SAndroid Build Coastguard Worker /* ===========================================================================
585*86ee64e7SAndroid Build Coastguard Worker  * Local data
586*86ee64e7SAndroid Build Coastguard Worker  */
587*86ee64e7SAndroid Build Coastguard Worker@@ -162,7 +177,6 @@ local const config configuration_table[10] = {
588*86ee64e7SAndroid Build Coastguard Worker  */
589*86ee64e7SAndroid Build Coastguard Worker #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
590*86ee64e7SAndroid Build Coastguard Worker
591*86ee64e7SAndroid Build Coastguard Worker-
592*86ee64e7SAndroid Build Coastguard Worker /* ===========================================================================
593*86ee64e7SAndroid Build Coastguard Worker  * Insert string str in the dictionary and set match_head to the previous head
594*86ee64e7SAndroid Build Coastguard Worker  * of the hash chain (the most recent string with same hash key). Return
595*86ee64e7SAndroid Build Coastguard Worker@@ -173,17 +187,28 @@ local const config configuration_table[10] = {
596*86ee64e7SAndroid Build Coastguard Worker  *    characters and the first MIN_MATCH bytes of str are valid (except for
597*86ee64e7SAndroid Build Coastguard Worker  *    the last MIN_MATCH-1 bytes of the input file).
598*86ee64e7SAndroid Build Coastguard Worker  */
599*86ee64e7SAndroid Build Coastguard Worker+local INLINE Pos insert_string_c(deflate_state *const s, const Pos str)
600*86ee64e7SAndroid Build Coastguard Worker+{
601*86ee64e7SAndroid Build Coastguard Worker+    Pos ret;
602*86ee64e7SAndroid Build Coastguard Worker+
603*86ee64e7SAndroid Build Coastguard Worker+    UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]);
604*86ee64e7SAndroid Build Coastguard Worker #ifdef FASTEST
605*86ee64e7SAndroid Build Coastguard Worker-#define INSERT_STRING(s, str, match_head) \
606*86ee64e7SAndroid Build Coastguard Worker-   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
607*86ee64e7SAndroid Build Coastguard Worker-    match_head = s->head[s->ins_h], \
608*86ee64e7SAndroid Build Coastguard Worker-    s->head[s->ins_h] = (Pos)(str))
609*86ee64e7SAndroid Build Coastguard Worker+    ret = s->head[s->ins_h];
610*86ee64e7SAndroid Build Coastguard Worker #else
611*86ee64e7SAndroid Build Coastguard Worker-#define INSERT_STRING(s, str, match_head) \
612*86ee64e7SAndroid Build Coastguard Worker-   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
613*86ee64e7SAndroid Build Coastguard Worker-    match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
614*86ee64e7SAndroid Build Coastguard Worker-    s->head[s->ins_h] = (Pos)(str))
615*86ee64e7SAndroid Build Coastguard Worker+    ret = s->prev[str & s->w_mask] = s->head[s->ins_h];
616*86ee64e7SAndroid Build Coastguard Worker #endif
617*86ee64e7SAndroid Build Coastguard Worker+    s->head[s->ins_h] = str;
618*86ee64e7SAndroid Build Coastguard Worker+
619*86ee64e7SAndroid Build Coastguard Worker+    return ret;
620*86ee64e7SAndroid Build Coastguard Worker+}
621*86ee64e7SAndroid Build Coastguard Worker+
622*86ee64e7SAndroid Build Coastguard Worker+local INLINE Pos insert_string(deflate_state *const s, const Pos str)
623*86ee64e7SAndroid Build Coastguard Worker+{
624*86ee64e7SAndroid Build Coastguard Worker+    if (x86_cpu_enable_simd)
625*86ee64e7SAndroid Build Coastguard Worker+        return insert_string_sse(s, str);
626*86ee64e7SAndroid Build Coastguard Worker+    return insert_string_c(s, str);
627*86ee64e7SAndroid Build Coastguard Worker+}
628*86ee64e7SAndroid Build Coastguard Worker+
629*86ee64e7SAndroid Build Coastguard Worker
630*86ee64e7SAndroid Build Coastguard Worker /* ===========================================================================
631*86ee64e7SAndroid Build Coastguard Worker  * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
632*86ee64e7SAndroid Build Coastguard Worker@@ -248,6 +273,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
633*86ee64e7SAndroid Build Coastguard Worker     const char *version;
634*86ee64e7SAndroid Build Coastguard Worker     int stream_size;
635*86ee64e7SAndroid Build Coastguard Worker {
636*86ee64e7SAndroid Build Coastguard Worker+    unsigned window_padding = 8;
637*86ee64e7SAndroid Build Coastguard Worker     deflate_state *s;
638*86ee64e7SAndroid Build Coastguard Worker     int wrap = 1;
639*86ee64e7SAndroid Build Coastguard Worker     static const char my_version[] = ZLIB_VERSION;
640*86ee64e7SAndroid Build Coastguard Worker@@ -257,6 +283,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
641*86ee64e7SAndroid Build Coastguard Worker      * output size for (length,distance) codes is <= 24 bits.
642*86ee64e7SAndroid Build Coastguard Worker      */
643*86ee64e7SAndroid Build Coastguard Worker
644*86ee64e7SAndroid Build Coastguard Worker+    x86_check_features();
645*86ee64e7SAndroid Build Coastguard Worker+
646*86ee64e7SAndroid Build Coastguard Worker     if (version == Z_NULL || version[0] != my_version[0] ||
647*86ee64e7SAndroid Build Coastguard Worker         stream_size != sizeof(z_stream)) {
648*86ee64e7SAndroid Build Coastguard Worker         return Z_VERSION_ERROR;
649*86ee64e7SAndroid Build Coastguard Worker@@ -313,12 +341,19 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
650*86ee64e7SAndroid Build Coastguard Worker     s->w_size = 1 << s->w_bits;
651*86ee64e7SAndroid Build Coastguard Worker     s->w_mask = s->w_size - 1;
652*86ee64e7SAndroid Build Coastguard Worker
653*86ee64e7SAndroid Build Coastguard Worker-    s->hash_bits = (uInt)memLevel + 7;
654*86ee64e7SAndroid Build Coastguard Worker+    if (x86_cpu_enable_simd) {
655*86ee64e7SAndroid Build Coastguard Worker+        s->hash_bits = 15;
656*86ee64e7SAndroid Build Coastguard Worker+    } else {
657*86ee64e7SAndroid Build Coastguard Worker+        s->hash_bits = memLevel + 7;
658*86ee64e7SAndroid Build Coastguard Worker+    }
659*86ee64e7SAndroid Build Coastguard Worker+
660*86ee64e7SAndroid Build Coastguard Worker     s->hash_size = 1 << s->hash_bits;
661*86ee64e7SAndroid Build Coastguard Worker     s->hash_mask = s->hash_size - 1;
662*86ee64e7SAndroid Build Coastguard Worker     s->hash_shift =  ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
663*86ee64e7SAndroid Build Coastguard Worker
664*86ee64e7SAndroid Build Coastguard Worker-    s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
665*86ee64e7SAndroid Build Coastguard Worker+    s->window = (Bytef *) ZALLOC(strm,
666*86ee64e7SAndroid Build Coastguard Worker+                                 s->w_size + window_padding,
667*86ee64e7SAndroid Build Coastguard Worker+                                 2*sizeof(Byte));
668*86ee64e7SAndroid Build Coastguard Worker     s->prev   = (Posf *)  ZALLOC(strm, s->w_size, sizeof(Pos));
669*86ee64e7SAndroid Build Coastguard Worker     s->head   = (Posf *)  ZALLOC(strm, s->hash_size, sizeof(Pos));
670*86ee64e7SAndroid Build Coastguard Worker
671*86ee64e7SAndroid Build Coastguard Worker@@ -418,11 +453,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
672*86ee64e7SAndroid Build Coastguard Worker         str = s->strstart;
673*86ee64e7SAndroid Build Coastguard Worker         n = s->lookahead - (MIN_MATCH-1);
674*86ee64e7SAndroid Build Coastguard Worker         do {
675*86ee64e7SAndroid Build Coastguard Worker-            UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
676*86ee64e7SAndroid Build Coastguard Worker-#ifndef FASTEST
677*86ee64e7SAndroid Build Coastguard Worker-            s->prev[str & s->w_mask] = s->head[s->ins_h];
678*86ee64e7SAndroid Build Coastguard Worker-#endif
679*86ee64e7SAndroid Build Coastguard Worker-            s->head[s->ins_h] = (Pos)str;
680*86ee64e7SAndroid Build Coastguard Worker+            insert_string(s, str);
681*86ee64e7SAndroid Build Coastguard Worker             str++;
682*86ee64e7SAndroid Build Coastguard Worker         } while (--n);
683*86ee64e7SAndroid Build Coastguard Worker         s->strstart = str;
684*86ee64e7SAndroid Build Coastguard Worker@@ -848,7 +879,7 @@ int ZEXPORT deflate (strm, flush)
685*86ee64e7SAndroid Build Coastguard Worker #ifdef GZIP
686*86ee64e7SAndroid Build Coastguard Worker     if (s->status == GZIP_STATE) {
687*86ee64e7SAndroid Build Coastguard Worker         /* gzip header */
688*86ee64e7SAndroid Build Coastguard Worker-        strm->adler = crc32(0L, Z_NULL, 0);
689*86ee64e7SAndroid Build Coastguard Worker+        crc_reset(s);
690*86ee64e7SAndroid Build Coastguard Worker         put_byte(s, 31);
691*86ee64e7SAndroid Build Coastguard Worker         put_byte(s, 139);
692*86ee64e7SAndroid Build Coastguard Worker         put_byte(s, 8);
693*86ee64e7SAndroid Build Coastguard Worker@@ -1049,6 +1080,7 @@ int ZEXPORT deflate (strm, flush)
694*86ee64e7SAndroid Build Coastguard Worker     /* Write the trailer */
695*86ee64e7SAndroid Build Coastguard Worker #ifdef GZIP
696*86ee64e7SAndroid Build Coastguard Worker     if (s->wrap == 2) {
697*86ee64e7SAndroid Build Coastguard Worker+        crc_finalize(s);
698*86ee64e7SAndroid Build Coastguard Worker         put_byte(s, (Byte)(strm->adler & 0xff));
699*86ee64e7SAndroid Build Coastguard Worker         put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
700*86ee64e7SAndroid Build Coastguard Worker         put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
701*86ee64e7SAndroid Build Coastguard Worker@@ -1161,7 +1193,7 @@ int ZEXPORT deflateCopy (dest, source)
702*86ee64e7SAndroid Build Coastguard Worker  * allocating a large strm->next_in buffer and copying from it.
703*86ee64e7SAndroid Build Coastguard Worker  * (See also flush_pending()).
704*86ee64e7SAndroid Build Coastguard Worker  */
705*86ee64e7SAndroid Build Coastguard Worker-local unsigned read_buf(strm, buf, size)
706*86ee64e7SAndroid Build Coastguard Worker+ZLIB_INTERNAL unsigned deflate_read_buf(strm, buf, size)
707*86ee64e7SAndroid Build Coastguard Worker     z_streamp strm;
708*86ee64e7SAndroid Build Coastguard Worker     Bytef *buf;
709*86ee64e7SAndroid Build Coastguard Worker     unsigned size;
710*86ee64e7SAndroid Build Coastguard Worker@@ -1173,15 +1205,16 @@ local unsigned read_buf(strm, buf, size)
711*86ee64e7SAndroid Build Coastguard Worker
712*86ee64e7SAndroid Build Coastguard Worker     strm->avail_in  -= len;
713*86ee64e7SAndroid Build Coastguard Worker
714*86ee64e7SAndroid Build Coastguard Worker-    zmemcpy(buf, strm->next_in, len);
715*86ee64e7SAndroid Build Coastguard Worker-    if (strm->state->wrap == 1) {
716*86ee64e7SAndroid Build Coastguard Worker-        strm->adler = adler32(strm->adler, buf, len);
717*86ee64e7SAndroid Build Coastguard Worker-    }
718*86ee64e7SAndroid Build Coastguard Worker #ifdef GZIP
719*86ee64e7SAndroid Build Coastguard Worker-    else if (strm->state->wrap == 2) {
720*86ee64e7SAndroid Build Coastguard Worker-        strm->adler = crc32(strm->adler, buf, len);
721*86ee64e7SAndroid Build Coastguard Worker-    }
722*86ee64e7SAndroid Build Coastguard Worker+    if (strm->state->wrap == 2)
723*86ee64e7SAndroid Build Coastguard Worker+        copy_with_crc(strm, buf, len);
724*86ee64e7SAndroid Build Coastguard Worker+    else
725*86ee64e7SAndroid Build Coastguard Worker #endif
726*86ee64e7SAndroid Build Coastguard Worker+    {
727*86ee64e7SAndroid Build Coastguard Worker+        zmemcpy(buf, strm->next_in, len);
728*86ee64e7SAndroid Build Coastguard Worker+        if (strm->state->wrap == 1)
729*86ee64e7SAndroid Build Coastguard Worker+            strm->adler = adler32(strm->adler, buf, len);
730*86ee64e7SAndroid Build Coastguard Worker+    }
731*86ee64e7SAndroid Build Coastguard Worker     strm->next_in  += len;
732*86ee64e7SAndroid Build Coastguard Worker     strm->total_in += len;
733*86ee64e7SAndroid Build Coastguard Worker
734*86ee64e7SAndroid Build Coastguard Worker@@ -1479,7 +1512,19 @@ local void check_match(s, start, match, length)
735*86ee64e7SAndroid Build Coastguard Worker  *    performed for at least two bytes (required for the zip translate_eol
736*86ee64e7SAndroid Build Coastguard Worker  *    option -- not supported here).
737*86ee64e7SAndroid Build Coastguard Worker  */
738*86ee64e7SAndroid Build Coastguard Worker-local void fill_window(s)
739*86ee64e7SAndroid Build Coastguard Worker+local void fill_window_c(deflate_state *s);
740*86ee64e7SAndroid Build Coastguard Worker+
741*86ee64e7SAndroid Build Coastguard Worker+local void fill_window(deflate_state *s)
742*86ee64e7SAndroid Build Coastguard Worker+{
743*86ee64e7SAndroid Build Coastguard Worker+    if (x86_cpu_enable_simd) {
744*86ee64e7SAndroid Build Coastguard Worker+        fill_window_sse(s);
745*86ee64e7SAndroid Build Coastguard Worker+        return;
746*86ee64e7SAndroid Build Coastguard Worker+    }
747*86ee64e7SAndroid Build Coastguard Worker+
748*86ee64e7SAndroid Build Coastguard Worker+    fill_window_c(s);
749*86ee64e7SAndroid Build Coastguard Worker+}
750*86ee64e7SAndroid Build Coastguard Worker+
751*86ee64e7SAndroid Build Coastguard Worker+local void fill_window_c(s)
752*86ee64e7SAndroid Build Coastguard Worker     deflate_state *s;
753*86ee64e7SAndroid Build Coastguard Worker {
754*86ee64e7SAndroid Build Coastguard Worker     unsigned n;
755*86ee64e7SAndroid Build Coastguard Worker@@ -1847,7 +1892,7 @@ local block_state deflate_fast(s, flush)
756*86ee64e7SAndroid Build Coastguard Worker          */
757*86ee64e7SAndroid Build Coastguard Worker         hash_head = NIL;
758*86ee64e7SAndroid Build Coastguard Worker         if (s->lookahead >= MIN_MATCH) {
759*86ee64e7SAndroid Build Coastguard Worker-            INSERT_STRING(s, s->strstart, hash_head);
760*86ee64e7SAndroid Build Coastguard Worker+            hash_head = insert_string(s, s->strstart);
761*86ee64e7SAndroid Build Coastguard Worker         }
762*86ee64e7SAndroid Build Coastguard Worker
763*86ee64e7SAndroid Build Coastguard Worker         /* Find the longest match, discarding those <= prev_length.
764*86ee64e7SAndroid Build Coastguard Worker@@ -1878,7 +1923,7 @@ local block_state deflate_fast(s, flush)
765*86ee64e7SAndroid Build Coastguard Worker                 s->match_length--; /* string at strstart already in table */
766*86ee64e7SAndroid Build Coastguard Worker                 do {
767*86ee64e7SAndroid Build Coastguard Worker                     s->strstart++;
768*86ee64e7SAndroid Build Coastguard Worker-                    INSERT_STRING(s, s->strstart, hash_head);
769*86ee64e7SAndroid Build Coastguard Worker+                    hash_head = insert_string(s, s->strstart);
770*86ee64e7SAndroid Build Coastguard Worker                     /* strstart never exceeds WSIZE-MAX_MATCH, so there are
771*86ee64e7SAndroid Build Coastguard Worker                      * always MIN_MATCH bytes ahead.
772*86ee64e7SAndroid Build Coastguard Worker                      */
773*86ee64e7SAndroid Build Coastguard Worker@@ -1950,7 +1995,7 @@ local block_state deflate_slow(s, flush)
774*86ee64e7SAndroid Build Coastguard Worker          */
775*86ee64e7SAndroid Build Coastguard Worker         hash_head = NIL;
776*86ee64e7SAndroid Build Coastguard Worker         if (s->lookahead >= MIN_MATCH) {
777*86ee64e7SAndroid Build Coastguard Worker-            INSERT_STRING(s, s->strstart, hash_head);
778*86ee64e7SAndroid Build Coastguard Worker+            hash_head = insert_string(s, s->strstart);
779*86ee64e7SAndroid Build Coastguard Worker         }
780*86ee64e7SAndroid Build Coastguard Worker
781*86ee64e7SAndroid Build Coastguard Worker         /* Find the longest match, discarding those <= prev_length.
782*86ee64e7SAndroid Build Coastguard Worker@@ -2001,7 +2046,7 @@ local block_state deflate_slow(s, flush)
783*86ee64e7SAndroid Build Coastguard Worker             s->prev_length -= 2;
784*86ee64e7SAndroid Build Coastguard Worker             do {
785*86ee64e7SAndroid Build Coastguard Worker                 if (++s->strstart <= max_insert) {
786*86ee64e7SAndroid Build Coastguard Worker-                    INSERT_STRING(s, s->strstart, hash_head);
787*86ee64e7SAndroid Build Coastguard Worker+                    hash_head = insert_string(s, s->strstart);
788*86ee64e7SAndroid Build Coastguard Worker                 }
789*86ee64e7SAndroid Build Coastguard Worker             } while (--s->prev_length != 0);
790*86ee64e7SAndroid Build Coastguard Worker             s->match_available = 0;
791*86ee64e7SAndroid Build Coastguard Worker@@ -2161,3 +2206,37 @@ local block_state deflate_huff(s, flush)
792*86ee64e7SAndroid Build Coastguard Worker         FLUSH_BLOCK(s, 0);
793*86ee64e7SAndroid Build Coastguard Worker     return block_done;
794*86ee64e7SAndroid Build Coastguard Worker }
795*86ee64e7SAndroid Build Coastguard Worker+
796*86ee64e7SAndroid Build Coastguard Worker+/* Safe to inline this as GCC/clang will use inline asm and Visual Studio will
797*86ee64e7SAndroid Build Coastguard Worker+ * use intrinsic without extra params
798*86ee64e7SAndroid Build Coastguard Worker+ */
799*86ee64e7SAndroid Build Coastguard Worker+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str)
800*86ee64e7SAndroid Build Coastguard Worker+{
801*86ee64e7SAndroid Build Coastguard Worker+    Pos ret;
802*86ee64e7SAndroid Build Coastguard Worker+    unsigned *ip, val, h = 0;
803*86ee64e7SAndroid Build Coastguard Worker+
804*86ee64e7SAndroid Build Coastguard Worker+    ip = (unsigned *)&s->window[str];
805*86ee64e7SAndroid Build Coastguard Worker+    val = *ip;
806*86ee64e7SAndroid Build Coastguard Worker+
807*86ee64e7SAndroid Build Coastguard Worker+    if (s->level >= 6)
808*86ee64e7SAndroid Build Coastguard Worker+        val &= 0xFFFFFF;
809*86ee64e7SAndroid Build Coastguard Worker+
810*86ee64e7SAndroid Build Coastguard Worker+/* Windows clang should use inline asm */
811*86ee64e7SAndroid Build Coastguard Worker+#if defined(_MSC_VER) && !defined(__clang__)
812*86ee64e7SAndroid Build Coastguard Worker+    h = _mm_crc32_u32(h, val);
813*86ee64e7SAndroid Build Coastguard Worker+#elif defined(__i386__) || defined(__amd64__)
814*86ee64e7SAndroid Build Coastguard Worker+    __asm__ __volatile__ (
815*86ee64e7SAndroid Build Coastguard Worker+        "crc32 %1,%0\n\t"
816*86ee64e7SAndroid Build Coastguard Worker+    : "+r" (h)
817*86ee64e7SAndroid Build Coastguard Worker+    : "r" (val)
818*86ee64e7SAndroid Build Coastguard Worker+    );
819*86ee64e7SAndroid Build Coastguard Worker+#else
820*86ee64e7SAndroid Build Coastguard Worker+    /* This should never happen */
821*86ee64e7SAndroid Build Coastguard Worker+    assert(0);
822*86ee64e7SAndroid Build Coastguard Worker+#endif
823*86ee64e7SAndroid Build Coastguard Worker+
824*86ee64e7SAndroid Build Coastguard Worker+    ret = s->head[h & s->hash_mask];
825*86ee64e7SAndroid Build Coastguard Worker+    s->head[h & s->hash_mask] = str;
826*86ee64e7SAndroid Build Coastguard Worker+    s->prev[str & s->w_mask] = ret;
827*86ee64e7SAndroid Build Coastguard Worker+    return ret;
828*86ee64e7SAndroid Build Coastguard Worker+}
829*86ee64e7SAndroid Build Coastguard Workerdiff --git a/deflate.h b/deflate.h
830*86ee64e7SAndroid Build Coastguard Workerindex 23ecdd312bc0..ab56df7663b6 100644
831*86ee64e7SAndroid Build Coastguard Worker--- a/deflate.h
832*86ee64e7SAndroid Build Coastguard Worker+++ b/deflate.h
833*86ee64e7SAndroid Build Coastguard Worker@@ -109,7 +109,7 @@ typedef struct internal_state {
834*86ee64e7SAndroid Build Coastguard Worker     ulg   gzindex;       /* where in extra, name, or comment */
835*86ee64e7SAndroid Build Coastguard Worker     Byte  method;        /* can only be DEFLATED */
836*86ee64e7SAndroid Build Coastguard Worker     int   last_flush;    /* value of flush param for previous deflate call */
837*86ee64e7SAndroid Build Coastguard Worker-
838*86ee64e7SAndroid Build Coastguard Worker+    unsigned zalign(16) crc0[4 * 5];
839*86ee64e7SAndroid Build Coastguard Worker                 /* used by deflate.c: */
840*86ee64e7SAndroid Build Coastguard Worker
841*86ee64e7SAndroid Build Coastguard Worker     uInt  w_size;        /* LZ77 window size (32K by default) */
842*86ee64e7SAndroid Build Coastguard Worker@@ -346,4 +346,14 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
843*86ee64e7SAndroid Build Coastguard Worker               flush = _tr_tally(s, distance, length)
844*86ee64e7SAndroid Build Coastguard Worker #endif
845*86ee64e7SAndroid Build Coastguard Worker
846*86ee64e7SAndroid Build Coastguard Worker+/* Functions that are SIMD optimised on x86 */
847*86ee64e7SAndroid Build Coastguard Worker+void ZLIB_INTERNAL crc_fold_init(deflate_state* const s);
848*86ee64e7SAndroid Build Coastguard Worker+void ZLIB_INTERNAL crc_fold_copy(deflate_state* const s,
849*86ee64e7SAndroid Build Coastguard Worker+                                 unsigned char* dst,
850*86ee64e7SAndroid Build Coastguard Worker+                                 const unsigned char* src,
851*86ee64e7SAndroid Build Coastguard Worker+                                 long len);
852*86ee64e7SAndroid Build Coastguard Worker+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state* const s);
853*86ee64e7SAndroid Build Coastguard Worker+
854*86ee64e7SAndroid Build Coastguard Worker+void ZLIB_INTERNAL fill_window_sse(deflate_state* s);
855*86ee64e7SAndroid Build Coastguard Worker+
856*86ee64e7SAndroid Build Coastguard Worker #endif /* DEFLATE_H */
857*86ee64e7SAndroid Build Coastguard Workerdiff --git a/fill_window_sse.c b/fill_window_sse.c
858*86ee64e7SAndroid Build Coastguard Workernew file mode 100644
859*86ee64e7SAndroid Build Coastguard Workerindex 000000000000..949ccce1ba9c
860*86ee64e7SAndroid Build Coastguard Worker--- /dev/null
861*86ee64e7SAndroid Build Coastguard Worker+++ b/fill_window_sse.c
862*86ee64e7SAndroid Build Coastguard Worker@@ -0,0 +1,177 @@
863*86ee64e7SAndroid Build Coastguard Worker+/*
864*86ee64e7SAndroid Build Coastguard Worker+ * Fill Window with SSE2-optimized hash shifting
865*86ee64e7SAndroid Build Coastguard Worker+ *
866*86ee64e7SAndroid Build Coastguard Worker+ * Copyright (C) 2013 Intel Corporation
867*86ee64e7SAndroid Build Coastguard Worker+ * Authors:
868*86ee64e7SAndroid Build Coastguard Worker+ *  Arjan van de Ven    <[email protected]>
869*86ee64e7SAndroid Build Coastguard Worker+ *  Jim Kukunas         <[email protected]>
870*86ee64e7SAndroid Build Coastguard Worker+ *
871*86ee64e7SAndroid Build Coastguard Worker+ * For conditions of distribution and use, see copyright notice in zlib.h
872*86ee64e7SAndroid Build Coastguard Worker+ */
873*86ee64e7SAndroid Build Coastguard Worker+
874*86ee64e7SAndroid Build Coastguard Worker+#include <immintrin.h>
875*86ee64e7SAndroid Build Coastguard Worker+#include "deflate.h"
876*86ee64e7SAndroid Build Coastguard Worker+
877*86ee64e7SAndroid Build Coastguard Worker+#define UPDATE_HASH(s,h,i) \
878*86ee64e7SAndroid Build Coastguard Worker+    {\
879*86ee64e7SAndroid Build Coastguard Worker+        if (s->level < 6) { \
880*86ee64e7SAndroid Build Coastguard Worker+            h = (3483 * (s->window[i]) +\
881*86ee64e7SAndroid Build Coastguard Worker+                 23081* (s->window[i+1]) +\
882*86ee64e7SAndroid Build Coastguard Worker+                 6954 * (s->window[i+2]) +\
883*86ee64e7SAndroid Build Coastguard Worker+                 20947* (s->window[i+3])) & s->hash_mask;\
884*86ee64e7SAndroid Build Coastguard Worker+        } else {\
885*86ee64e7SAndroid Build Coastguard Worker+            h = (25881* (s->window[i]) +\
886*86ee64e7SAndroid Build Coastguard Worker+                 24674* (s->window[i+1]) +\
887*86ee64e7SAndroid Build Coastguard Worker+                 25811* (s->window[i+2])) & s->hash_mask;\
888*86ee64e7SAndroid Build Coastguard Worker+        }\
889*86ee64e7SAndroid Build Coastguard Worker+    }\
890*86ee64e7SAndroid Build Coastguard Worker+
891*86ee64e7SAndroid Build Coastguard Worker+extern int deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
892*86ee64e7SAndroid Build Coastguard Worker+
893*86ee64e7SAndroid Build Coastguard Worker+void fill_window_sse(deflate_state *s)
894*86ee64e7SAndroid Build Coastguard Worker+{
895*86ee64e7SAndroid Build Coastguard Worker+    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
896*86ee64e7SAndroid Build Coastguard Worker+
897*86ee64e7SAndroid Build Coastguard Worker+    register unsigned n;
898*86ee64e7SAndroid Build Coastguard Worker+    register Posf *p;
899*86ee64e7SAndroid Build Coastguard Worker+    unsigned more;    /* Amount of free space at the end of the window. */
900*86ee64e7SAndroid Build Coastguard Worker+    uInt wsize = s->w_size;
901*86ee64e7SAndroid Build Coastguard Worker+
902*86ee64e7SAndroid Build Coastguard Worker+    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
903*86ee64e7SAndroid Build Coastguard Worker+
904*86ee64e7SAndroid Build Coastguard Worker+    do {
905*86ee64e7SAndroid Build Coastguard Worker+        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
906*86ee64e7SAndroid Build Coastguard Worker+
907*86ee64e7SAndroid Build Coastguard Worker+        /* Deal with !@#$% 64K limit: */
908*86ee64e7SAndroid Build Coastguard Worker+        if (sizeof(int) <= 2) {
909*86ee64e7SAndroid Build Coastguard Worker+            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
910*86ee64e7SAndroid Build Coastguard Worker+                more = wsize;
911*86ee64e7SAndroid Build Coastguard Worker+
912*86ee64e7SAndroid Build Coastguard Worker+            } else if (more == (unsigned)(-1)) {
913*86ee64e7SAndroid Build Coastguard Worker+                /* Very unlikely, but possible on 16 bit machine if
914*86ee64e7SAndroid Build Coastguard Worker+                 * strstart == 0 && lookahead == 1 (input done a byte at time)
915*86ee64e7SAndroid Build Coastguard Worker+                 */
916*86ee64e7SAndroid Build Coastguard Worker+                more--;
917*86ee64e7SAndroid Build Coastguard Worker+            }
918*86ee64e7SAndroid Build Coastguard Worker+        }
919*86ee64e7SAndroid Build Coastguard Worker+
920*86ee64e7SAndroid Build Coastguard Worker+        /* If the window is almost full and there is insufficient lookahead,
921*86ee64e7SAndroid Build Coastguard Worker+         * move the upper half to the lower one to make room in the upper half.
922*86ee64e7SAndroid Build Coastguard Worker+         */
923*86ee64e7SAndroid Build Coastguard Worker+        if (s->strstart >= wsize+MAX_DIST(s)) {
924*86ee64e7SAndroid Build Coastguard Worker+
925*86ee64e7SAndroid Build Coastguard Worker+            zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
926*86ee64e7SAndroid Build Coastguard Worker+            s->match_start -= wsize;
927*86ee64e7SAndroid Build Coastguard Worker+            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
928*86ee64e7SAndroid Build Coastguard Worker+            s->block_start -= (long) wsize;
929*86ee64e7SAndroid Build Coastguard Worker+
930*86ee64e7SAndroid Build Coastguard Worker+            /* Slide the hash table (could be avoided with 32 bit values
931*86ee64e7SAndroid Build Coastguard Worker+               at the expense of memory usage). We slide even when level == 0
932*86ee64e7SAndroid Build Coastguard Worker+               to keep the hash table consistent if we switch back to level > 0
933*86ee64e7SAndroid Build Coastguard Worker+               later. (Using level 0 permanently is not an optimal usage of
934*86ee64e7SAndroid Build Coastguard Worker+               zlib, so we don't care about this pathological case.)
935*86ee64e7SAndroid Build Coastguard Worker+             */
936*86ee64e7SAndroid Build Coastguard Worker+            n = s->hash_size;
937*86ee64e7SAndroid Build Coastguard Worker+            p = &s->head[n];
938*86ee64e7SAndroid Build Coastguard Worker+            p -= 8;
939*86ee64e7SAndroid Build Coastguard Worker+            do {
940*86ee64e7SAndroid Build Coastguard Worker+                __m128i value, result;
941*86ee64e7SAndroid Build Coastguard Worker+
942*86ee64e7SAndroid Build Coastguard Worker+                value = _mm_loadu_si128((__m128i *)p);
943*86ee64e7SAndroid Build Coastguard Worker+                result = _mm_subs_epu16(value, xmm_wsize);
944*86ee64e7SAndroid Build Coastguard Worker+                _mm_storeu_si128((__m128i *)p, result);
945*86ee64e7SAndroid Build Coastguard Worker+
946*86ee64e7SAndroid Build Coastguard Worker+                p -= 8;
947*86ee64e7SAndroid Build Coastguard Worker+                n -= 8;
948*86ee64e7SAndroid Build Coastguard Worker+            } while (n > 0);
949*86ee64e7SAndroid Build Coastguard Worker+
950*86ee64e7SAndroid Build Coastguard Worker+            n = wsize;
951*86ee64e7SAndroid Build Coastguard Worker+#ifndef FASTEST
952*86ee64e7SAndroid Build Coastguard Worker+            p = &s->prev[n];
953*86ee64e7SAndroid Build Coastguard Worker+            p -= 8;
954*86ee64e7SAndroid Build Coastguard Worker+            do {
955*86ee64e7SAndroid Build Coastguard Worker+                __m128i value, result;
956*86ee64e7SAndroid Build Coastguard Worker+
957*86ee64e7SAndroid Build Coastguard Worker+                value = _mm_loadu_si128((__m128i *)p);
958*86ee64e7SAndroid Build Coastguard Worker+                result = _mm_subs_epu16(value, xmm_wsize);
959*86ee64e7SAndroid Build Coastguard Worker+                _mm_storeu_si128((__m128i *)p, result);
960*86ee64e7SAndroid Build Coastguard Worker+
961*86ee64e7SAndroid Build Coastguard Worker+                p -= 8;
962*86ee64e7SAndroid Build Coastguard Worker+                n -= 8;
963*86ee64e7SAndroid Build Coastguard Worker+            } while (n > 0);
964*86ee64e7SAndroid Build Coastguard Worker+#endif
965*86ee64e7SAndroid Build Coastguard Worker+            more += wsize;
966*86ee64e7SAndroid Build Coastguard Worker+        }
967*86ee64e7SAndroid Build Coastguard Worker+        if (s->strm->avail_in == 0) break;
968*86ee64e7SAndroid Build Coastguard Worker+
969*86ee64e7SAndroid Build Coastguard Worker+        /* If there was no sliding:
970*86ee64e7SAndroid Build Coastguard Worker+         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
971*86ee64e7SAndroid Build Coastguard Worker+         *    more == window_size - lookahead - strstart
972*86ee64e7SAndroid Build Coastguard Worker+         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
973*86ee64e7SAndroid Build Coastguard Worker+         * => more >= window_size - 2*WSIZE + 2
974*86ee64e7SAndroid Build Coastguard Worker+         * In the BIG_MEM or MMAP case (not yet supported),
975*86ee64e7SAndroid Build Coastguard Worker+         *   window_size == input_size + MIN_LOOKAHEAD  &&
976*86ee64e7SAndroid Build Coastguard Worker+         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
977*86ee64e7SAndroid Build Coastguard Worker+         * Otherwise, window_size == 2*WSIZE so more >= 2.
978*86ee64e7SAndroid Build Coastguard Worker+         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
979*86ee64e7SAndroid Build Coastguard Worker+         */
980*86ee64e7SAndroid Build Coastguard Worker+        Assert(more >= 2, "more < 2");
981*86ee64e7SAndroid Build Coastguard Worker+
982*86ee64e7SAndroid Build Coastguard Worker+        n = deflate_read_buf(s->strm,
983*86ee64e7SAndroid Build Coastguard Worker+                             s->window + s->strstart + s->lookahead,
984*86ee64e7SAndroid Build Coastguard Worker+                             more);
985*86ee64e7SAndroid Build Coastguard Worker+        s->lookahead += n;
986*86ee64e7SAndroid Build Coastguard Worker+
987*86ee64e7SAndroid Build Coastguard Worker+        /* Initialize the hash value now that we have some input: */
988*86ee64e7SAndroid Build Coastguard Worker+        if (s->lookahead >= MIN_MATCH) {
989*86ee64e7SAndroid Build Coastguard Worker+            uInt str = s->strstart;
990*86ee64e7SAndroid Build Coastguard Worker+            s->ins_h = s->window[str];
991*86ee64e7SAndroid Build Coastguard Worker+            if (str >= 1)
992*86ee64e7SAndroid Build Coastguard Worker+                UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
993*86ee64e7SAndroid Build Coastguard Worker+#if MIN_MATCH != 3
994*86ee64e7SAndroid Build Coastguard Worker+            Call UPDATE_HASH() MIN_MATCH-3 more times
995*86ee64e7SAndroid Build Coastguard Worker+#endif
996*86ee64e7SAndroid Build Coastguard Worker+        }
997*86ee64e7SAndroid Build Coastguard Worker+        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
998*86ee64e7SAndroid Build Coastguard Worker+         * but this is not important since only literal bytes will be emitted.
999*86ee64e7SAndroid Build Coastguard Worker+         */
1000*86ee64e7SAndroid Build Coastguard Worker+
1001*86ee64e7SAndroid Build Coastguard Worker+    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
1002*86ee64e7SAndroid Build Coastguard Worker+
1003*86ee64e7SAndroid Build Coastguard Worker+    /* If the WIN_INIT bytes after the end of the current data have never been
1004*86ee64e7SAndroid Build Coastguard Worker+     * written, then zero those bytes in order to avoid memory check reports of
1005*86ee64e7SAndroid Build Coastguard Worker+     * the use of uninitialized (or uninitialised as Julian writes) bytes by
1006*86ee64e7SAndroid Build Coastguard Worker+     * the longest match routines.  Update the high water mark for the next
1007*86ee64e7SAndroid Build Coastguard Worker+     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
1008*86ee64e7SAndroid Build Coastguard Worker+     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
1009*86ee64e7SAndroid Build Coastguard Worker+     */
1010*86ee64e7SAndroid Build Coastguard Worker+    if (s->high_water < s->window_size) {
1011*86ee64e7SAndroid Build Coastguard Worker+        ulg curr = s->strstart + (ulg)(s->lookahead);
1012*86ee64e7SAndroid Build Coastguard Worker+        ulg init;
1013*86ee64e7SAndroid Build Coastguard Worker+
1014*86ee64e7SAndroid Build Coastguard Worker+        if (s->high_water < curr) {
1015*86ee64e7SAndroid Build Coastguard Worker+            /* Previous high water mark below current data -- zero WIN_INIT
1016*86ee64e7SAndroid Build Coastguard Worker+             * bytes or up to end of window, whichever is less.
1017*86ee64e7SAndroid Build Coastguard Worker+             */
1018*86ee64e7SAndroid Build Coastguard Worker+            init = s->window_size - curr;
1019*86ee64e7SAndroid Build Coastguard Worker+            if (init > WIN_INIT)
1020*86ee64e7SAndroid Build Coastguard Worker+                init = WIN_INIT;
1021*86ee64e7SAndroid Build Coastguard Worker+            zmemzero(s->window + curr, (unsigned)init);
1022*86ee64e7SAndroid Build Coastguard Worker+            s->high_water = curr + init;
1023*86ee64e7SAndroid Build Coastguard Worker+        }
1024*86ee64e7SAndroid Build Coastguard Worker+        else if (s->high_water < (ulg)curr + WIN_INIT) {
1025*86ee64e7SAndroid Build Coastguard Worker+            /* High water mark at or above current data, but below current data
1026*86ee64e7SAndroid Build Coastguard Worker+             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
1027*86ee64e7SAndroid Build Coastguard Worker+             * to end of window, whichever is less.
1028*86ee64e7SAndroid Build Coastguard Worker+             */
1029*86ee64e7SAndroid Build Coastguard Worker+            init = (ulg)curr + WIN_INIT - s->high_water;
1030*86ee64e7SAndroid Build Coastguard Worker+            if (init > s->window_size - s->high_water)
1031*86ee64e7SAndroid Build Coastguard Worker+                init = s->window_size - s->high_water;
1032*86ee64e7SAndroid Build Coastguard Worker+            zmemzero(s->window + s->high_water, (unsigned)init);
1033*86ee64e7SAndroid Build Coastguard Worker+            s->high_water += init;
1034*86ee64e7SAndroid Build Coastguard Worker+        }
1035*86ee64e7SAndroid Build Coastguard Worker+    }
1036*86ee64e7SAndroid Build Coastguard Worker+
1037*86ee64e7SAndroid Build Coastguard Worker+    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
1038*86ee64e7SAndroid Build Coastguard Worker+           "not enough room for search");
1039*86ee64e7SAndroid Build Coastguard Worker+}
1040*86ee64e7SAndroid Build Coastguard Workerdiff --git a/simd_stub.c b/simd_stub.c
1041*86ee64e7SAndroid Build Coastguard Workernew file mode 100644
1042*86ee64e7SAndroid Build Coastguard Workerindex 000000000000..c6d46051498f
1043*86ee64e7SAndroid Build Coastguard Worker--- /dev/null
1044*86ee64e7SAndroid Build Coastguard Worker+++ b/simd_stub.c
1045*86ee64e7SAndroid Build Coastguard Worker@@ -0,0 +1,35 @@
1046*86ee64e7SAndroid Build Coastguard Worker+/* simd_stub.c -- stub implementations
1047*86ee64e7SAndroid Build Coastguard Worker+* Copyright (C) 2014 Intel Corporation
1048*86ee64e7SAndroid Build Coastguard Worker+* For conditions of distribution and use, see copyright notice in zlib.h
1049*86ee64e7SAndroid Build Coastguard Worker+*/
1050*86ee64e7SAndroid Build Coastguard Worker+#include <assert.h>
1051*86ee64e7SAndroid Build Coastguard Worker+
1052*86ee64e7SAndroid Build Coastguard Worker+#include "deflate.h"
1053*86ee64e7SAndroid Build Coastguard Worker+#include "x86.h"
1054*86ee64e7SAndroid Build Coastguard Worker+
1055*86ee64e7SAndroid Build Coastguard Worker+int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
1056*86ee64e7SAndroid Build Coastguard Worker+
1057*86ee64e7SAndroid Build Coastguard Worker+void ZLIB_INTERNAL crc_fold_init(deflate_state *const s) {
1058*86ee64e7SAndroid Build Coastguard Worker+    assert(0);
1059*86ee64e7SAndroid Build Coastguard Worker+}
1060*86ee64e7SAndroid Build Coastguard Worker+
1061*86ee64e7SAndroid Build Coastguard Worker+void ZLIB_INTERNAL crc_fold_copy(deflate_state *const s,
1062*86ee64e7SAndroid Build Coastguard Worker+                                 unsigned char *dst,
1063*86ee64e7SAndroid Build Coastguard Worker+                                 const unsigned char *src,
1064*86ee64e7SAndroid Build Coastguard Worker+                                 long len) {
1065*86ee64e7SAndroid Build Coastguard Worker+    assert(0);
1066*86ee64e7SAndroid Build Coastguard Worker+}
1067*86ee64e7SAndroid Build Coastguard Worker+
1068*86ee64e7SAndroid Build Coastguard Worker+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
1069*86ee64e7SAndroid Build Coastguard Worker+    assert(0);
1070*86ee64e7SAndroid Build Coastguard Worker+    return 0;
1071*86ee64e7SAndroid Build Coastguard Worker+}
1072*86ee64e7SAndroid Build Coastguard Worker+
1073*86ee64e7SAndroid Build Coastguard Worker+void ZLIB_INTERNAL fill_window_sse(deflate_state *s)
1074*86ee64e7SAndroid Build Coastguard Worker+{
1075*86ee64e7SAndroid Build Coastguard Worker+    assert(0);
1076*86ee64e7SAndroid Build Coastguard Worker+}
1077*86ee64e7SAndroid Build Coastguard Worker+
1078*86ee64e7SAndroid Build Coastguard Worker+void x86_check_features(void)
1079*86ee64e7SAndroid Build Coastguard Worker+{
1080*86ee64e7SAndroid Build Coastguard Worker+}
1081*86ee64e7SAndroid Build Coastguard Workerdiff --git a/x86.c b/x86.c
1082*86ee64e7SAndroid Build Coastguard Workernew file mode 100644
1083*86ee64e7SAndroid Build Coastguard Workerindex 000000000000..e56fe8b85a39
1084*86ee64e7SAndroid Build Coastguard Worker--- /dev/null
1085*86ee64e7SAndroid Build Coastguard Worker+++ b/x86.c
1086*86ee64e7SAndroid Build Coastguard Worker@@ -0,0 +1,92 @@
1087*86ee64e7SAndroid Build Coastguard Worker+/*
1088*86ee64e7SAndroid Build Coastguard Worker+ * x86 feature check
1089*86ee64e7SAndroid Build Coastguard Worker+ *
1090*86ee64e7SAndroid Build Coastguard Worker+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
1091*86ee64e7SAndroid Build Coastguard Worker+ * Author:
1092*86ee64e7SAndroid Build Coastguard Worker+ *  Jim Kukunas
1093*86ee64e7SAndroid Build Coastguard Worker+ *
1094*86ee64e7SAndroid Build Coastguard Worker+ * For conditions of distribution and use, see copyright notice in zlib.h
1095*86ee64e7SAndroid Build Coastguard Worker+ */
1096*86ee64e7SAndroid Build Coastguard Worker+
1097*86ee64e7SAndroid Build Coastguard Worker+#include "x86.h"
1098*86ee64e7SAndroid Build Coastguard Worker+#include "zutil.h"
1099*86ee64e7SAndroid Build Coastguard Worker+
1100*86ee64e7SAndroid Build Coastguard Worker+int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
1101*86ee64e7SAndroid Build Coastguard Worker+
1102*86ee64e7SAndroid Build Coastguard Worker+#ifndef _MSC_VER
1103*86ee64e7SAndroid Build Coastguard Worker+#include <pthread.h>
1104*86ee64e7SAndroid Build Coastguard Worker+
1105*86ee64e7SAndroid Build Coastguard Worker+pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
1106*86ee64e7SAndroid Build Coastguard Worker+static void _x86_check_features(void);
1107*86ee64e7SAndroid Build Coastguard Worker+
1108*86ee64e7SAndroid Build Coastguard Worker+void x86_check_features(void)
1109*86ee64e7SAndroid Build Coastguard Worker+{
1110*86ee64e7SAndroid Build Coastguard Worker+  pthread_once(&cpu_check_inited_once, _x86_check_features);
1111*86ee64e7SAndroid Build Coastguard Worker+}
1112*86ee64e7SAndroid Build Coastguard Worker+
1113*86ee64e7SAndroid Build Coastguard Worker+static void _x86_check_features(void)
1114*86ee64e7SAndroid Build Coastguard Worker+{
1115*86ee64e7SAndroid Build Coastguard Worker+    int x86_cpu_has_sse2;
1116*86ee64e7SAndroid Build Coastguard Worker+    int x86_cpu_has_sse42;
1117*86ee64e7SAndroid Build Coastguard Worker+    int x86_cpu_has_pclmulqdq;
1118*86ee64e7SAndroid Build Coastguard Worker+    unsigned eax, ebx, ecx, edx;
1119*86ee64e7SAndroid Build Coastguard Worker+
1120*86ee64e7SAndroid Build Coastguard Worker+    eax = 1;
1121*86ee64e7SAndroid Build Coastguard Worker+#ifdef __i386__
1122*86ee64e7SAndroid Build Coastguard Worker+    __asm__ __volatile__ (
1123*86ee64e7SAndroid Build Coastguard Worker+        "xchg %%ebx, %1\n\t"
1124*86ee64e7SAndroid Build Coastguard Worker+        "cpuid\n\t"
1125*86ee64e7SAndroid Build Coastguard Worker+        "xchg %1, %%ebx\n\t"
1126*86ee64e7SAndroid Build Coastguard Worker+    : "+a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
1127*86ee64e7SAndroid Build Coastguard Worker+    );
1128*86ee64e7SAndroid Build Coastguard Worker+#else
1129*86ee64e7SAndroid Build Coastguard Worker+    __asm__ __volatile__ (
1130*86ee64e7SAndroid Build Coastguard Worker+        "cpuid\n\t"
1131*86ee64e7SAndroid Build Coastguard Worker+    : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
1132*86ee64e7SAndroid Build Coastguard Worker+    );
1133*86ee64e7SAndroid Build Coastguard Worker+#endif  /* (__i386__) */
1134*86ee64e7SAndroid Build Coastguard Worker+
1135*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_has_sse2 = edx & 0x4000000;
1136*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_has_sse42 = ecx & 0x100000;
1137*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_has_pclmulqdq = ecx & 0x2;
1138*86ee64e7SAndroid Build Coastguard Worker+
1139*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1140*86ee64e7SAndroid Build Coastguard Worker+                          x86_cpu_has_sse42 &&
1141*86ee64e7SAndroid Build Coastguard Worker+                          x86_cpu_has_pclmulqdq;
1142*86ee64e7SAndroid Build Coastguard Worker+}
1143*86ee64e7SAndroid Build Coastguard Worker+#else
1144*86ee64e7SAndroid Build Coastguard Worker+#include <intrin.h>
1145*86ee64e7SAndroid Build Coastguard Worker+#include <windows.h>
1146*86ee64e7SAndroid Build Coastguard Worker+
1147*86ee64e7SAndroid Build Coastguard Worker+static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1148*86ee64e7SAndroid Build Coastguard Worker+                                         PVOID param,
1149*86ee64e7SAndroid Build Coastguard Worker+                                         PVOID *context);
1150*86ee64e7SAndroid Build Coastguard Worker+static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT;
1151*86ee64e7SAndroid Build Coastguard Worker+
1152*86ee64e7SAndroid Build Coastguard Worker+void x86_check_features(void)
1153*86ee64e7SAndroid Build Coastguard Worker+{
1154*86ee64e7SAndroid Build Coastguard Worker+    InitOnceExecuteOnce(&cpu_check_inited_once, _x86_check_features,
1155*86ee64e7SAndroid Build Coastguard Worker+                        NULL, NULL);
1156*86ee64e7SAndroid Build Coastguard Worker+}
1157*86ee64e7SAndroid Build Coastguard Worker+
1158*86ee64e7SAndroid Build Coastguard Worker+static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1159*86ee64e7SAndroid Build Coastguard Worker+                                         PVOID param,
1160*86ee64e7SAndroid Build Coastguard Worker+                                         PVOID *context)
1161*86ee64e7SAndroid Build Coastguard Worker+{
1162*86ee64e7SAndroid Build Coastguard Worker+    int x86_cpu_has_sse2;
1163*86ee64e7SAndroid Build Coastguard Worker+    int x86_cpu_has_sse42;
1164*86ee64e7SAndroid Build Coastguard Worker+    int x86_cpu_has_pclmulqdq;
1165*86ee64e7SAndroid Build Coastguard Worker+    int regs[4];
1166*86ee64e7SAndroid Build Coastguard Worker+
1167*86ee64e7SAndroid Build Coastguard Worker+    __cpuid(regs, 1);
1168*86ee64e7SAndroid Build Coastguard Worker+
1169*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_has_sse2 = regs[3] & 0x4000000;
1170*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_has_sse42= regs[2] & 0x100000;
1171*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_has_pclmulqdq = regs[2] & 0x2;
1172*86ee64e7SAndroid Build Coastguard Worker+
1173*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1174*86ee64e7SAndroid Build Coastguard Worker+                          x86_cpu_has_sse42 &&
1175*86ee64e7SAndroid Build Coastguard Worker+                          x86_cpu_has_pclmulqdq;
1176*86ee64e7SAndroid Build Coastguard Worker+    return TRUE;
1177*86ee64e7SAndroid Build Coastguard Worker+}
1178*86ee64e7SAndroid Build Coastguard Worker+#endif  /* _MSC_VER */
1179*86ee64e7SAndroid Build Coastguard Workerdiff --git a/x86.h b/x86.h
1180*86ee64e7SAndroid Build Coastguard Workernew file mode 100644
1181*86ee64e7SAndroid Build Coastguard Workerindex 000000000000..ebcf10ab09d2
1182*86ee64e7SAndroid Build Coastguard Worker--- /dev/null
1183*86ee64e7SAndroid Build Coastguard Worker+++ b/x86.h
1184*86ee64e7SAndroid Build Coastguard Worker@@ -0,0 +1,15 @@
1185*86ee64e7SAndroid Build Coastguard Worker+/* x86.h -- check for x86 CPU features
1186*86ee64e7SAndroid Build Coastguard Worker+* Copyright (C) 2013 Intel Corporation Jim Kukunas
1187*86ee64e7SAndroid Build Coastguard Worker+* For conditions of distribution and use, see copyright notice in zlib.h
1188*86ee64e7SAndroid Build Coastguard Worker+*/
1189*86ee64e7SAndroid Build Coastguard Worker+
1190*86ee64e7SAndroid Build Coastguard Worker+#ifndef X86_H
1191*86ee64e7SAndroid Build Coastguard Worker+#define X86_H
1192*86ee64e7SAndroid Build Coastguard Worker+
1193*86ee64e7SAndroid Build Coastguard Worker+#include "zlib.h"
1194*86ee64e7SAndroid Build Coastguard Worker+
1195*86ee64e7SAndroid Build Coastguard Worker+extern int x86_cpu_enable_simd;
1196*86ee64e7SAndroid Build Coastguard Worker+
1197*86ee64e7SAndroid Build Coastguard Worker+void x86_check_features(void);
1198*86ee64e7SAndroid Build Coastguard Worker+
1199*86ee64e7SAndroid Build Coastguard Worker+#endif  /* X86_H */
1200*86ee64e7SAndroid Build Coastguard Workerdiff --git a/zutil.h b/zutil.h
1201*86ee64e7SAndroid Build Coastguard Workerindex 80375b8b6109..4425bcf75eb3 100644
1202*86ee64e7SAndroid Build Coastguard Worker--- a/zutil.h
1203*86ee64e7SAndroid Build Coastguard Worker+++ b/zutil.h
1204*86ee64e7SAndroid Build Coastguard Worker@@ -283,4 +283,10 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
1205*86ee64e7SAndroid Build Coastguard Worker #define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
1206*86ee64e7SAndroid Build Coastguard Worker                     (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
1207*86ee64e7SAndroid Build Coastguard Worker
1208*86ee64e7SAndroid Build Coastguard Worker+#ifdef _MSC_VER
1209*86ee64e7SAndroid Build Coastguard Worker+#define zalign(x) __declspec(align(x))
1210*86ee64e7SAndroid Build Coastguard Worker+#else
1211*86ee64e7SAndroid Build Coastguard Worker+#define zalign(x) __attribute__((aligned((x))))
1212*86ee64e7SAndroid Build Coastguard Worker+#endif
1213*86ee64e7SAndroid Build Coastguard Worker+
1214*86ee64e7SAndroid Build Coastguard Worker #endif /* ZUTIL_H */
1215