// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Hardware-accelerated CRC32 computation on Intel and ARM architectures.

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/internal/endian.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)

// Implementation details not exported outside of file
namespace {

// Some machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
 public:
  CRC32AcceleratedX86ARMCombined() {}
  ~CRC32AcceleratedX86ARMCombined() override {}
  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
  uint32_t ComputeZeroConstant(size_t length) const;

 private:
  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
      delete;
  CRC32AcceleratedX86ARMCombined& operator=(
      const CRC32AcceleratedX86ARMCombined&) = delete;
};

// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;

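// Helper macros: each issues one hardware CRC32 instruction over 1, 2, 4 or 8
// bytes of input and advances the data pointer. The BY2/BY3 variants advance
// several independent streams within the same iteration.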
#define ABSL_INTERNAL_STEP1(crc)                      \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, data)                  \
  do {                                                  \
    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
                    absl::little_endian::Load64(data)); \
    data += 8;                                          \
  } while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)

namespace {

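// Multiplies two 32-bit CRC values as polynomials over GF(2), reduced modulo
// the CRC32C polynomial (in the bit-reflected form used by the CRC32
// instructions). Multiplying a CRC by a precomputed power of x is equivalent
// to extending it by the corresponding run of zeroes, which is how the CRCs
// of independently processed streams are combined below.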
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 power = V128_From64WithZeroFill(a);
  V128 crc = V128_From64WithZeroFill(b);
  V128 res = V128_PMulLow(power, crc);

  // Combine crc values.
  //
  // Adding res to itself is equivalent to multiplying by 2,
  // or shifting left by 1. Addition is used as not all compilers
  // are able to generate optimal code without this hint.
  // https://godbolt.org/z/rr3fMnf39
  res = V128_Add64(res, res);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

// Powers of crc32c polynomial, for faster ExtendByZeroes.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
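// Entry i is used when bit i of (length / 4) is set: multiplying a CRC by
// kCRC32CPowers[i] via multiply() above extends it by 2^i * 4 zero bytes. The
// powers repeat with period 31, so the second half of the table duplicates
// the first.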
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};

}  // namespace

// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
  // Lowest 2 bits are handled separately in ExtendByZeroes
  length >>= 2;

  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
  length &= length - 1;

  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
    prev = multiply(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
}

void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
  switch (length & 3) {
    case 0:
      break;
    case 1:
      val = CRC32_u8(val, 0);
      break;
    case 2:
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
}

// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll the loop by 4.
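// Adjacent pairs of entries form one 128-bit multiplicand; the Fold3 branch
// of Extend() below selects the pair indexed by the per-stream block count.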
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};

enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};

// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
  // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
  // high latency, so we run 4 128-bit partial checksums that can be reduced to
  // a single value by FinalizePclmulStream later. Computing crc for arbitrary
  // polynomials with PCLMULQDQ is described in Intel paper "Fast CRC
  // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to CRC32C polynomial.
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

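    // Fold each 128-bit accumulator forward over the next 64 bytes: multiply
    // its high and low halves by the folding constants in k1k2, XOR the two
    // products, then XOR in 16 fresh bytes of input per accumulator.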
    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce partialCRC produced by Process64BytesPclmul into a single value
  // that represents the crc checksum of all the processed bytes.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);

    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
    high = V128_PMulHi(reductionMultiplicands, partialCRC3);

    partialCRC3 = V128_Xor(low, high);
    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC3);

    // Reduce fullCRC into scalar value.
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));

    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
    fullCRC = V128_ShiftRight<8>(fullCRC);
    fullCRC = V128_Xor(fullCRC, tmp);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));

    tmp = V128_ShiftRight<4>(fullCRC);
    fullCRC = V128_And(fullCRC, mask);
    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
    fullCRC = V128_Xor(tmp, fullCRC);

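    // Finally, a Barrett reduction with the constants in kPoly (as in the
    // Intel paper above) maps the remaining value down to a 32-bit CRC.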
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));

    tmp = V128_And(fullCRC, mask);
    tmp = V128_PMul01(reductionMultiplicands, tmp);
    tmp = V128_And(tmp, mask);
    tmp = V128_PMulLow(reductionMultiplicands, tmp);

    fullCRC = V128_Xor(tmp, fullCRC);

    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
  }

  // Update crc with 64 bytes of data from p.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Generated by crc32c_x86_test --crc32c_generate_constants=true
  // and verified against constants in linux kernel for S390:
  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
  // same size. Each group is CRCed in parallel, then the groups are combined
  // at the end of the block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined in the end.
  static constexpr size_t kMaxStreams = 3;
};

#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
alignas(16) constexpr uint32_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
constexpr size_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL

template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;

    // We have dedicated instructions for 1, 2, 4 and 8 bytes.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now a multiple of 16.

    // For small blocks just run a simple loop, because the cost of combining
    // multiple streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // the Intel paper above. Running a 4th stream doesn't help, because the
    // crc instruction has latency 3 and throughput 1.
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on last 8 bytes.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);

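        // Combine the three streams: multiply the CRCs of the first two
        // streams by constants (selected by bs) that account for the bytes
        // that follow them, then fold them into the last 8 bytes of the
        // third stream with a final CRC32 step.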
        V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From64WithZeroFill(l64);

        V128 res1 = V128_PMulLow(tmp, magic);

        tmp = V128_From64WithZeroFill(l641);

        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data, so we can ignore the combine costs and run all
      // requested streams (num_crc_streams + num_pclmul_streams), using
      // prefetch. CRC and PCLMULQDQ use different cpu execution units, so on
      // some cpus it makes sense to execute both of them for different
      // streams.

      // Point x at first 8-byte aligned byte in string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      while (p != x) {
        ABSL_INTERNAL_STEP1(l);
      }

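      // Each stream will process bs blocks of 64 bytes.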
      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }

      // Per stream crc sums.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel the first iteration, because the PCLMULQDQ streams need setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                             kPrefetchHorizon));
        }

        // We process each stream in 64 byte blocks. This can be written as
        // for (int i = 0; i < num_pclmul_streams; i++) {
        //   Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //   pclmul_streams[i] += 16 * 4;
        // }
        // for (int i = 0; i < num_crc_streams; i++) {
        //   l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //   crc_streams[i] += 16*4;
        // }
        // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
        // gives ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ-based streams require a special final step;
      // CRC-based ones don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into a single result.
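      // Each stream covered bs * 64 bytes, so advancing the accumulated value
      // past one stream is equivalent to extending it by bs * 64 zero bytes,
      // i.e. multiplying it by ComputeZeroConstant(bs * 64).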
      uint32_t magic = ComputeZeroConstant(bs * 64);
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes
    while (p != e) {
      ABSL_INTERNAL_STEP1(l);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};

}  // namespace

// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial:  crc32c
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
  CpuType type = GetCpuType();
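  // Each specialization below is <num_crc_streams, num_pclmul_streams,
  // cutoff strategy>, tuned per microarchitecture.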
  switch (type) {
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 1, CutoffStrategy::Fold3>();
    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelSkylake:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
    case CpuType::kArmNeoverseN2:
    case CpuType::kArmNeoverseV1:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
    case CpuType::kAmpereSiryn:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseV2:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 2, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
            1, 1, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
#else
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
#endif
  }
}

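// Returns one instance of every supported <num_crc_streams,
// num_pclmul_streams, strategy> combination, presumably so that tests and
// benchmarks can exercise and compare all of the variants.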
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Unroll64CRC>>());

  return ret;
}

#else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  return std::vector<std::unique_ptr<CRCImpl>>();
}

// no hardware acceleration available
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl