From 87fc8e3e38323cfdabf8da3927488e3e57073b02 Mon Sep 17 00:00:00 2001
From: Jia Liu <[email protected]>
Date: Thu, 30 Mar 2023 11:13:16 +0800
Subject: [PATCH] Enabled AVX512 for CRC32

Enable AVX512 for CRC32, giving the best known performance, beyond the
current SSE SIMD optimization. It uses multiple parallel folding
operations and new AVX512 instructions, providing ~3.5x CRC32
performance and a ~3.7% gain on Zlib_bench gzip performance.
---
 CMakeLists.txt |   8 +-
 cpu_features.c |   9 +++
 cpu_features.h |   1 +
 crc32.c        |  14 +++-
 crc32_simd.c   | 198 ++++++++++++++++++++++++++++++++++++++++++++++++-
 crc32_simd.h   |   6 ++
 6 files changed, 230 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f06e193..d45b902 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,7 @@ check_include_file(stdint.h HAVE_STDINT_H)
 check_include_file(stddef.h HAVE_STDDEF_H)
 
 option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF)
+option(ENABLE_SIMD_AVX512 "Enable SIMD AVX512 optimizations" OFF)
 
 # TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx)
 # and architectures (e.g. Arm).
@@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
   add_definitions(-DADLER32_SIMD_SSSE3)
   add_definitions(-DINFLATE_CHUNK_READ_64LE)
   add_definitions(-DCRC32_SIMD_SSE42_PCLMUL)
+  if (ENABLE_SIMD_AVX512)
+    add_definitions(-DCRC32_SIMD_AVX512_PCLMUL)
+    add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul)
+  else()
+    add_compile_options(-msse4.2 -mpclmul)
+  endif()
   add_definitions(-DDEFLATE_SLIDE_HASH_SSE2)
-  add_compile_options(-msse4.2 -mpclmul)
   # Required by CPU features detection code.
   add_definitions(-DX86_NOT_WINDOWS)
   # Apparently some environments (e.g. CentOS) require to explicitly link
diff --git a/cpu_features.c b/cpu_features.c
index 877d5f2..ac6ee88 100644
--- a/cpu_features.c
+++ b/cpu_features.c
@@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
 int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0;
 int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
 int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
+int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
 
 #ifndef CPU_NO_SIMD
 
@@ -138,6 +139,10 @@ static void _cpu_check_features(void)
 /* On x86 we simply use an instruction to check the CPU features.
  * (i.e. CPUID).
  */
+#ifdef CRC32_SIMD_AVX512_PCLMUL
+#include <immintrin.h>
+#include <xsaveintrin.h>
+#endif
 static void _cpu_check_features(void)
 {
     int x86_cpu_has_sse2;
@@ -164,6 +169,10 @@ static void _cpu_check_features(void)
     x86_cpu_enable_simd = x86_cpu_has_sse2 &&
                           x86_cpu_has_sse42 &&
                           x86_cpu_has_pclmulqdq;
+
+#ifdef CRC32_SIMD_AVX512_PCLMUL
+    x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
+#endif
 }
 #endif
 #endif
diff --git a/cpu_features.h b/cpu_features.h
index 279246c..aed3e83 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull;
 extern int x86_cpu_enable_sse2;
 extern int x86_cpu_enable_ssse3;
 extern int x86_cpu_enable_simd;
+extern int x86_cpu_enable_avx512;
 
 void cpu_check_features(void);
diff --git a/crc32.c b/crc32.c
index 4486098..acb6972 100644
--- a/crc32.c
+++ b/crc32.c
@@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     }
 
 #endif
-#if defined(CRC32_SIMD_SSE42_PCLMUL)
+#if defined(CRC32_SIMD_AVX512_PCLMUL)
+    if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
+        /* crc32 64-byte chunks */
+        z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
+        crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
+        /* check remaining data */
+        len -= chunk_size;
+        if (!len)
+            return crc;
+        /* Fall into the default crc32 for the remaining data. */
+        buf += chunk_size;
+    }
+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
     if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
         /* crc32 16-byte chunks */
         z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
diff --git a/crc32_simd.c b/crc32_simd.c
index d80beba..7428270 100644
--- a/crc32_simd.c
+++ b/crc32_simd.c
@@ -6,17 +6,207 @@
  */
 
 #include "crc32_simd.h"
-
-#if defined(CRC32_SIMD_SSE42_PCLMUL)
+#if defined(CRC32_SIMD_AVX512_PCLMUL)
 
 /*
- * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
- * length must be at least 64, and a multiple of 16. Based on:
+ * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 256, and a multiple of 64. Based on:
  *
  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
  */
 
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+#include <immintrin.h>
+
+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(  /* AVX512+PCLMUL */
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc)
+{
+    /*
+     * Definitions of the bit-reflected domain constants k1,k2,k3,k4
+     * are similar to those given at the end of the paper, and the
+     * remaining constants and CRC32+Barrett polynomials remain unchanged.
+     *
+     * Replace the index of x from 128 to 512, as follows:
+     * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
+     * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
+     * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
+     * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
+     */
+    static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
+                                                0x011542778a, 0x01322d1430,
+                                                0x011542778a, 0x01322d1430,
+                                                0x011542778a, 0x01322d1430 };
+    static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
+                                                0x0154442bd4, 0x01c6e41596,
+                                                0x0154442bd4, 0x01c6e41596,
+                                                0x0154442bd4, 0x01c6e41596 };
+    static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
+    static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+    __m128i a0, a1, a2, a3;
+
+    /*
+     * There's at least one block of 256.
+     */
+    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
+    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
+    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
+    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
+
+    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+
+    x0 = _mm512_load_si512((__m512i *)k1k2);
+
+    buf += 256;
+    len -= 256;
+
+    /*
+     * Parallel fold blocks of 256, if any.
+     */
+    while (len >= 256)
+    {
+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
+
+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
+
+        y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
+        y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
+        y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
+        y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
+
+        x1 = _mm512_xor_si512(x1, x5);
+        x2 = _mm512_xor_si512(x2, x6);
+        x3 = _mm512_xor_si512(x3, x7);
+        x4 = _mm512_xor_si512(x4, x8);
+
+        x1 = _mm512_xor_si512(x1, y5);
+        x2 = _mm512_xor_si512(x2, y6);
+        x3 = _mm512_xor_si512(x3, y7);
+        x4 = _mm512_xor_si512(x4, y8);
+
+        buf += 256;
+        len -= 256;
+    }
+
+    /*
+     * Fold into 512-bits.
+     */
+    x0 = _mm512_load_si512((__m512i *)k3k4);
+
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_xor_si512(x1, x2);
+    x1 = _mm512_xor_si512(x1, x5);
+
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_xor_si512(x1, x3);
+    x1 = _mm512_xor_si512(x1, x5);
+
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_xor_si512(x1, x4);
+    x1 = _mm512_xor_si512(x1, x5);
+
+    /*
+     * Single fold blocks of 64, if any.
+     */
+    while (len >= 64)
+    {
+        x2 = _mm512_loadu_si512((__m512i *)buf);
+
+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+        x1 = _mm512_xor_si512(x1, x2);
+        x1 = _mm512_xor_si512(x1, x5);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold 512-bits to 384-bits.
+     */
+    a0 = _mm_load_si128((__m128i *)k5k6);
+
+    a1 = _mm512_extracti32x4_epi32(x1, 0);
+    a2 = _mm512_extracti32x4_epi32(x1, 1);
+
+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
+
+    a1 = _mm_xor_si128(a1, a3);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Fold 384-bits to 256-bits.
+     */
+    a2 = _mm512_extracti32x4_epi32(x1, 2);
+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
+    a1 = _mm_xor_si128(a1, a3);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Fold 256-bits to 128-bits.
+     */
+    a2 = _mm512_extracti32x4_epi32(x1, 3);
+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
+    a1 = _mm_xor_si128(a1, a3);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
+    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
+    a1 = _mm_srli_si128(a1, 8);
+    a1 = _mm_xor_si128(a1, a2);
+
+    a0 = _mm_loadl_epi64((__m128i *)k7k8);
+    a2 = _mm_srli_si128(a1, 4);
+    a1 = _mm_and_si128(a1, a3);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Barrett reduce to 32-bits.
+     */
+    a0 = _mm_load_si128((__m128i *)poly);
+
+    a2 = _mm_and_si128(a1, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
+    a2 = _mm_and_si128(a2, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Return the crc32.
+     */
+    return _mm_extract_epi32(a1, 1);
+}
+
+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
+
+/*
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16.
+ */
+
 #include <emmintrin.h>
 #include <smmintrin.h>
 #include <wmmintrin.h>
diff --git a/crc32_simd.h b/crc32_simd.h
index c0346dc..8462464 100644
--- a/crc32_simd.h
+++ b/crc32_simd.h
@@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
                                          z_size_t len,
                                          uint32_t crc);
 
+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
+                                          z_size_t len,
+                                          uint32_t crc);
+
 /*
  * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
  * for computing the crc32 of an arbitrary length buffer.
  */
 #define Z_CRC32_SSE42_MINIMUM_LENGTH 64
 #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
+#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
+#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63
 
 /*
  * CRC32 checksums using ARMv8-a crypto instructions.
-- 
2.34.1
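
Usage note (not part of the patch): a minimal smoke test, assuming zlib was
configured with -DENABLE_SIMD_OPTIMIZATIONS=ON and -DENABLE_SIMD_AVX512=ON
and runs on an AVX512-capable CPU. No new API is involved: crc32()/crc32_z()
route buffers of at least Z_CRC32_AVX512_MINIMUM_LENGTH (256) bytes through
crc32_avx512_simd_() when x86_cpu_enable_avx512 is set, and finish any
sub-64-byte tail with the generic table code. The file name test_crc32.c is
illustrative.

    /* test_crc32.c: build with `cc test_crc32.c -lz -o test_crc32` */
    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    int main(void)
    {
        unsigned char buf[1024];
        memset(buf, 'a', sizeof(buf));

        unsigned long crc = crc32(0L, Z_NULL, 0);  /* initial CRC value */
        /* len = 1024 >= 256, so this call is eligible for the AVX512 path. */
        crc = crc32(crc, buf, (uInt)sizeof(buf));

        printf("crc32 = 0x%08lx\n", crc);
        return 0;
    }

The same call produces the same CRC on machines without AVX512; the runtime
check in cpu_features.c only selects a faster kernel, so results are
identical across the table-based, SSE4.2, and AVX512 paths.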