xref: /aosp_15_r20/external/zlib/patches/0011-avx512.patch (revision 86ee64e75fa5f8bce2c8c356138035642429cd05)
1From 87fc8e3e38323cfdabf8da3927488e3e57073b02 Mon Sep 17 00:00:00 2001
2From: Jia Liu <[email protected]>
3Date: Thu, 30 Mar 2023 11:13:16 +0800
4Subject: [PATCH] Enabled AVX512 for CRC32
5
6Enabled AVX512 for CRC32, which provides the best known performance
7beyond the current SSE SIMD optimization. It enables multiple folding
8operations and new AVX512 instructions, providing ~3.5X CRC32
9performance and a ~3.7% gain on Zlib_bench gzip performance.
10---
11 CMakeLists.txt |   8 +-
12 cpu_features.c |   9 +++
13 cpu_features.h |   1 +
14 crc32.c        |  14 +++-
15 crc32_simd.c   | 198 ++++++++++++++++++++++++++++++++++++++++++++++++-
16 crc32_simd.h   |   6 ++
17 6 files changed, 230 insertions(+), 6 deletions(-)
18
19diff --git a/CMakeLists.txt b/CMakeLists.txt
20index f06e193..d45b902 100644
21--- a/CMakeLists.txt
22+++ b/CMakeLists.txt
23@@ -22,6 +22,7 @@ check_include_file(stdint.h    HAVE_STDINT_H)
24 check_include_file(stddef.h    HAVE_STDDEF_H)
25
26 option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF)
27+option(ENABLE_SIMD_AVX512 "Enable SIMD AVX512 optimizations" OFF)
28
29 # TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx)
30 # and architectures (e.g. Arm).
31@@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
32    add_definitions(-DADLER32_SIMD_SSSE3)
33    add_definitions(-DINFLATE_CHUNK_READ_64LE)
34    add_definitions(-DCRC32_SIMD_SSE42_PCLMUL)
35+   if (ENABLE_SIMD_AVX512)
36+    add_definitions(-DCRC32_SIMD_AVX512_PCLMUL)
37+    add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul)
38+   else()
39+    add_compile_options(-msse4.2 -mpclmul)
40+   endif()
41    add_definitions(-DDEFLATE_SLIDE_HASH_SSE2)
42-   add_compile_options(-msse4.2 -mpclmul)
43    # Required by CPU features detection code.
44    add_definitions(-DX86_NOT_WINDOWS)
45    # Apparently some environments (e.g. CentOS) require to explicitly link
46diff --git a/cpu_features.c b/cpu_features.c
47index 877d5f2..ac6ee88 100644
48--- a/cpu_features.c
49+++ b/cpu_features.c
50@@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
51 int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0;
52 int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
53 int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
54+int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
55
56 #ifndef CPU_NO_SIMD
57
58@@ -138,6 +139,10 @@ static void _cpu_check_features(void)
59 /* On x86 we simply use a instruction to check the CPU features.
60  * (i.e. CPUID).
61  */
62+#ifdef CRC32_SIMD_AVX512_PCLMUL
63+#include <immintrin.h>
64+#include <xsaveintrin.h>
65+#endif
66 static void _cpu_check_features(void)
67 {
68     int x86_cpu_has_sse2;
69@@ -164,6 +169,10 @@ static void _cpu_check_features(void)
70     x86_cpu_enable_simd = x86_cpu_has_sse2 &&
71                           x86_cpu_has_sse42 &&
72                           x86_cpu_has_pclmulqdq;
73+
74+#ifdef CRC32_SIMD_AVX512_PCLMUL
75+    x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
76+#endif
77 }
78 #endif
79 #endif
80diff --git a/cpu_features.h b/cpu_features.h
81index 279246c..aed3e83 100644
82--- a/cpu_features.h
83+++ b/cpu_features.h
84@@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull;
85 extern int x86_cpu_enable_sse2;
86 extern int x86_cpu_enable_ssse3;
87 extern int x86_cpu_enable_simd;
88+extern int x86_cpu_enable_avx512;
89
90 void cpu_check_features(void);
91diff --git a/crc32.c b/crc32.c
92index 4486098..acb6972 100644
93--- a/crc32.c
94+++ b/crc32.c
95@@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
96     }
97
98 #endif
99-#if defined(CRC32_SIMD_SSE42_PCLMUL)
100+#if defined(CRC32_SIMD_AVX512_PCLMUL)
101+    if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
102+        /* crc32 64-byte chunks */
103+        z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
104+        crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
105+        /* check remaining data */
106+        len -= chunk_size;
107+        if (!len)
108+            return crc;
109+        /* Fall into the default crc32 for the remaining data. */
110+        buf += chunk_size;
111+    }
112+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
113     if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
114         /* crc32 16-byte chunks */
115         z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
116diff --git a/crc32_simd.c b/crc32_simd.c
117index d80beba..7428270 100644
118--- a/crc32_simd.c
119+++ b/crc32_simd.c
120@@ -6,17 +6,207 @@
121  */
122
123 #include "crc32_simd.h"
124-
125-#if defined(CRC32_SIMD_SSE42_PCLMUL)
126+#if defined(CRC32_SIMD_AVX512_PCLMUL)
127
128 /*
129- * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
130- * length must be at least 64, and a multiple of 16. Based on:
131+ * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
132+ * length must be at least 256, and a multiple of 64. Based on:
133  *
134  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
135  *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
136  */
137
138+#include <emmintrin.h>
139+#include <smmintrin.h>
140+#include <wmmintrin.h>
141+#include <immintrin.h>
142+
143+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(  /* AVX512+PCLMUL */
144+    const unsigned char *buf,
145+    z_size_t len,
146+    uint32_t crc)
147+{
148+    /*
149+     * Definitions of the bit-reflected domain constants k1,k2,k3,k4
150+     * are similar to those given at the end of the paper, and remaining
151+     * constants and CRC32+Barrett polynomials remain unchanged.
152+     *
153+     * Replace the index of x from 128 to 512. As follows:
154+     * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
155+     * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
156+     * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
157+     * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
158+     */
159+    static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
160+                                                0x011542778a, 0x01322d1430,
161+                                                0x011542778a, 0x01322d1430,
162+                                                0x011542778a, 0x01322d1430 };
163+    static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
164+                                                0x0154442bd4, 0x01c6e41596,
165+                                                0x0154442bd4, 0x01c6e41596,
166+                                                0x0154442bd4, 0x01c6e41596 };
167+    static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
168+    static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
169+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
170+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
171+    __m128i a0, a1, a2, a3;
172+
173+    /*
174+     * There's at least one block of 256.
175+     */
176+    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
177+    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
178+    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
179+    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
180+
181+    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
182+
183+    x0 = _mm512_load_si512((__m512i *)k1k2);
184+
185+    buf += 256;
186+    len -= 256;
187+
188+    /*
189+     * Parallel fold blocks of 256, if any.
190+     */
191+    while (len >= 256)
192+    {
193+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
194+        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
195+        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
196+        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
197+
198+
199+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
200+        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
201+        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
202+        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
203+
204+        y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
205+        y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
206+        y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
207+        y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
208+
209+        x1 = _mm512_xor_si512(x1, x5);
210+        x2 = _mm512_xor_si512(x2, x6);
211+        x3 = _mm512_xor_si512(x3, x7);
212+        x4 = _mm512_xor_si512(x4, x8);
213+
214+        x1 = _mm512_xor_si512(x1, y5);
215+        x2 = _mm512_xor_si512(x2, y6);
216+        x3 = _mm512_xor_si512(x3, y7);
217+        x4 = _mm512_xor_si512(x4, y8);
218+
219+        buf += 256;
220+        len -= 256;
221+    }
222+
223+    /*
224+     * Fold into 512-bits.
225+     */
226+    x0 = _mm512_load_si512((__m512i *)k3k4);
227+
228+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
229+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
230+    x1 = _mm512_xor_si512(x1, x2);
231+    x1 = _mm512_xor_si512(x1, x5);
232+
233+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
234+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
235+    x1 = _mm512_xor_si512(x1, x3);
236+    x1 = _mm512_xor_si512(x1, x5);
237+
238+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
239+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
240+    x1 = _mm512_xor_si512(x1, x4);
241+    x1 = _mm512_xor_si512(x1, x5);
242+
243+    /*
244+     * Single fold blocks of 64, if any.
245+     */
246+    while (len >= 64)
247+    {
248+        x2 = _mm512_loadu_si512((__m512i *)buf);
249+
250+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
251+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
252+        x1 = _mm512_xor_si512(x1, x2);
253+        x1 = _mm512_xor_si512(x1, x5);
254+
255+        buf += 64;
256+        len -= 64;
257+    }
258+
259+    /*
260+     * Fold 512-bits to 384-bits.
261+     */
262+    a0 = _mm_load_si128((__m128i *)k5k6);
263+
264+    a1 = _mm512_extracti32x4_epi32(x1, 0);
265+    a2 = _mm512_extracti32x4_epi32(x1, 1);
266+
267+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
268+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
269+
270+    a1 = _mm_xor_si128(a1, a3);
271+    a1 = _mm_xor_si128(a1, a2);
272+
273+    /*
274+     * Fold 384-bits to 256-bits.
275+     */
276+    a2 = _mm512_extracti32x4_epi32(x1, 2);
277+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
278+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
279+    a1 = _mm_xor_si128(a1, a3);
280+    a1 = _mm_xor_si128(a1, a2);
281+
282+    /*
283+     * Fold 256-bits to 128-bits.
284+     */
285+    a2 = _mm512_extracti32x4_epi32(x1, 3);
286+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
287+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
288+    a1 = _mm_xor_si128(a1, a3);
289+    a1 = _mm_xor_si128(a1, a2);
290+
291+    /*
292+     * Fold 128-bits to 64-bits.
293+     */
294+    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
295+    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
296+    a1 = _mm_srli_si128(a1, 8);
297+    a1 = _mm_xor_si128(a1, a2);
298+
299+    a0 = _mm_loadl_epi64((__m128i*)k7k8);
300+    a2 = _mm_srli_si128(a1, 4);
301+    a1 = _mm_and_si128(a1, a3);
302+    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
303+    a1 = _mm_xor_si128(a1, a2);
304+
305+    /*
306+     * Barrett reduce to 32-bits.
307+     */
308+    a0 = _mm_load_si128((__m128i*)poly);
309+
310+    a2 = _mm_and_si128(a1, a3);
311+    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
312+    a2 = _mm_and_si128(a2, a3);
313+    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
314+    a1 = _mm_xor_si128(a1, a2);
315+
316+    /*
317+     * Return the crc32.
318+     */
319+    return _mm_extract_epi32(a1, 1);
320+}
321+
322+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
323+
324+/*
325+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
326+ * length must be at least 64, and a multiple of 16.
327+ */
328+
329 #include <emmintrin.h>
330 #include <smmintrin.h>
331 #include <wmmintrin.h>
332diff --git a/crc32_simd.h b/crc32_simd.h
333index c0346dc..8462464 100644
334--- a/crc32_simd.h
335+++ b/crc32_simd.h
336@@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
337                                          z_size_t len,
338                                          uint32_t crc);
339
340+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
341+                                          z_size_t len,
342+                                          uint32_t crc);
343+
344 /*
345  * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
346  * for computing the crc32 of an arbitrary length buffer.
347  */
348 #define Z_CRC32_SSE42_MINIMUM_LENGTH 64
349 #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
350+#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
351+#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63
352
353 /*
354  * CRC32 checksums using ARMv8-a crypto instructions.
355--
3562.34.1
357
358