// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Hardware accelerated CRC32 computation on Intel and ARM architecture.

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/internal/endian.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)

// Implementation details not exported outside of file
namespace {

// Some machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
 public:
  CRC32AcceleratedX86ARMCombined() {}
  ~CRC32AcceleratedX86ARMCombined() override {}
  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
  uint32_t ComputeZeroConstant(size_t length) const;

 private:
  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
      delete;
  CRC32AcceleratedX86ARMCombined& operator=(
      const CRC32AcceleratedX86ARMCombined&) = delete;
};

// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;
#define ABSL_INTERNAL_STEP1(crc)                      \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, data)                  \
  do {                                                  \
    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
                    absl::little_endian::Load64(data)); \
    data += 8;                                          \
  } while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)

namespace {

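// multiply() below combines CRC values: in effect it returns the product of
// a and b modulo the CRC32C polynomial (in bit-reflected form), so that
// extending a CRC by n zero bytes can be done as
// multiply(crc, ComputeZeroConstant(n)); see ExtendByZeroes further down.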
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 power = V128_From64WithZeroFill(a);
  V128 crc = V128_From64WithZeroFill(b);
  V128 res = V128_PMulLow(power, crc);

  // Combine crc values.
  //
  // Adding res to itself is equivalent to multiplying by 2,
  // or shifting left by 1. Addition is used as not all compilers
  // are able to generate optimal code without this hint.
  // https://godbolt.org/z/rr3fMnf39
  res = V128_Add64(res, res);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

// Powers of the crc32c polynomial, for faster ExtendByZeroes.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};

}  // namespace

// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
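// For example (illustrative sketch): 40 zero bytes is 10 four-byte words,
// binary 1010, so the constant is multiply(kCRC32CPowers[1], kCRC32CPowers[3]).
// Per the loop below, each set bit i of the word count contributes
// kCRC32CPowers[i], the constant for 2**i words of zeros; the length % 4
// remainder is handled separately in ExtendByZeroes.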
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
  // Lowest 2 bits are handled separately in ExtendByZeroes
  length >>= 2;

  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
  length &= length - 1;

  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
    prev = multiply(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
}

void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
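  // (CRC32_u8(val, 0) advances the CRC by one zero byte and CRC32_u16(val, 0)
  // by two, so the switch handles length % 4 directly; the remaining multiple
  // of four bytes goes through the multiply() path below.)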
  switch (length & 3) {
    case 0:
      break;
    case 1:
      val = CRC32_u8(val, 0);
      break;
    case 2:
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
}

// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll the loop by 4.
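// Each adjacent pair of entries is read as one 128-bit constant at run time;
// the Fold3 path in Extend() selects the pair with index bs - 1, where bs is
// the per-stream block count.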
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};

enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};

// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
  // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
  // high latency, so we run 4 128-bit partial checksums that can be reduced to
  // a single value by FinalizePclmulStream later. Computing crc for arbitrary
  // polynomials with PCLMULQDQ is described in the Intel paper "Fast CRC
  // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to the CRC32C polynomial.
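  // In short: each of the four 128-bit accumulators is carry-less multiplied
  // by the folding constants in k1k2 (low and high 64-bit halves separately),
  // and the two products are XORed together with the next 16 bytes of input.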
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce partialCRC produced by Process64BytesPclmul into a single value
  // that represents the crc checksum of all the processed bytes.
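  // The four accumulators are first folded into one 128-bit value using the
  // k5k6 and k3k4 constants, which is then reduced to a 32-bit value using
  // the k7k0 and kPoly constants, following the Intel PCLMULQDQ paper cited
  // above.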
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);

    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
    high = V128_PMulHi(reductionMultiplicands, partialCRC3);

    partialCRC3 = V128_Xor(low, high);
    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC3);

    // Reduce fullCRC into scalar value.
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));

    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
    fullCRC = V128_ShiftRight<8>(fullCRC);
    fullCRC = V128_Xor(fullCRC, tmp);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));

    tmp = V128_ShiftRight<4>(fullCRC);
    fullCRC = V128_And(fullCRC, mask);
    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
    fullCRC = V128_Xor(tmp, fullCRC);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));

    tmp = V128_And(fullCRC, mask);
    tmp = V128_PMul01(reductionMultiplicands, tmp);
    tmp = V128_And(tmp, mask);
    tmp = V128_PMulLow(reductionMultiplicands, tmp);

    fullCRC = V128_Xor(tmp, fullCRC);

    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
  }

  // Update crc with 64 bytes of data from p.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Generated by crc32c_x86_test --crc32c_generate_constants=true
  // and verified against the constants in the Linux kernel for S390:
  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
  // same size. Each group is CRCed in parallel, then combined at the end of
  // the block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with the CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined at the end.
  static constexpr size_t kMaxStreams = 3;
};

#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
alignas(16) constexpr uint32_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
constexpr size_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL

template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;

    // We have dedicated instructions for 1, 2, 4 and 8 bytes.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now a multiple of 16.

    // For small blocks just run a simple loop, because the cost of combining
    // multiple streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // the Intel paper above. Running a 4th stream doesn't help, because the
    // crc instruction has latency 3 and throughput 1.
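    // In the Fold3 path below, the remaining bytes are split into kGroupsSmall
    // consecutive chunks of bs 32-byte blocks each; every chunk is CRCed by
    // its own stream, and the partial results are merged with carry-less
    // multiplies by a kClmulConstants entry selected by the block count bs.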
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on the last 8 bytes.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);

        V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From64WithZeroFill(l64);

        V128 res1 = V128_PMulLow(tmp, magic);

        tmp = V128_From64WithZeroFill(l641);

        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data, so we can ignore combine costs and run all
      // requested streams (num_crc_streams + num_pclmul_streams),
      // using prefetch. CRC and PCLMULQDQ use different cpu execution units,
      // so on some cpus it makes sense to execute both of them for different
      // streams.
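      // Layout: the buffer is divided into (num_crc_streams +
      // num_pclmul_streams) chunks of bs 64-byte blocks; CRC streams come
      // first, followed by the PCLMULQDQ streams, and any leftover bytes are
      // handled by the tail loops after the combine step.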

      // Point x at the first 8-byte aligned byte in the string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      while (p != x) {
        ABSL_INTERNAL_STEP1(l);
      }

      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }

      // Per stream crc sums.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel the first iteration, because the PCLMULQDQ streams need setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                             kPrefetchHorizon));
        }

        // We process each stream in 64-byte blocks. This can be written as
        //   for (int i = 0; i < num_pclmul_streams; i++) {
        //     Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //     pclmul_streams[i] += 16 * 4;
        //   }
        //   for (int i = 0; i < num_crc_streams; i++) {
        //     l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //     crc_streams[i] += 16 * 4;
        //   }
        // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
        // gives a ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ-based streams require a special final step;
      // CRC-based ones don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into a single result.
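      // Each stream has covered bs * 64 bytes, so before XORing in the next
      // stream's value the running result is advanced past that many zero
      // bytes by multiplying with ComputeZeroConstant(bs * 64).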
      uint32_t magic = ComputeZeroConstant(bs * 64);
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes
    while (p != e) {
      ABSL_INTERNAL_STEP1(l);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};

}  // namespace

// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial: crc32c
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
  CpuType type = GetCpuType();
  switch (type) {
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 1, CutoffStrategy::Fold3>();
    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelSkylake:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
    case CpuType::kArmNeoverseN2:
    case CpuType::kArmNeoverseV1:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
    case CpuType::kAmpereSiryn:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseV2:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 2, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
            1, 1, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
#else
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
#endif
  }
}

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 3, CutoffStrategy::Unroll64CRC>>());

  return ret;
}

#else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  return std::vector<std::unique_ptr<CRCImpl>>();
}

// no hardware acceleration available
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl