1 // Copyright 2022 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Implementation of CRCs (aka Rabin Fingerprints).
16 // Treats the input as a polynomial with coefficients in Z(2),
17 // and finds the remainder when divided by an irreducible polynomial
18 // of the appropriate length.
19 // It handles all CRC sizes from 8 to 128 bits.
20 // It's somewhat complicated by having separate implementations optimized for
21 // CRC's <=32 bits, <= 64 bits, and <= 128 bits.
22 // The input string is prefixed with a "1" bit, and has "degree" "0" bits
23 // appended to it before the remainder is found.   This ensures that
24 // short strings are scrambled somewhat and that strings consisting
25 // of all nulls have a non-zero CRC.
26 //
27 // Uses the "interleaved word-by-word" method from
28 // "Everything we know about CRC but afraid to forget" by Andrew Kadatch
29 // and Bob Jenkins,
30 // http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
31 //
32 // The idea is to compute kStride CRCs simultaneously, allowing the
33 // processor to more effectively use multiple execution units. Each of
34 // the CRCs is calculated on one word of data followed by kStride - 1
35 // words of zeroes; the CRC starting points are staggered by one word.
36 // Assuming a stride of 4 with data words "ABCDABCDABCD", the first
37 // CRC is over A000A000A, the second over 0B000B000B, and so on.
38 // The CRC of the whole data is then calculated by properly aligning the
39 // CRCs by appending zeroes until the data lengths agree then XORing
40 // the CRCs.
41 
42 #include "absl/crc/internal/crc.h"
43 
44 #include <cstdint>
45 
46 #include "absl/base/internal/endian.h"
47 #include "absl/base/internal/prefetch.h"
48 #include "absl/base/internal/raw_logging.h"
49 #include "absl/crc/internal/crc_internal.h"
50 
51 namespace absl {
52 ABSL_NAMESPACE_BEGIN
53 namespace crc_internal {
54 
55 namespace {
56 
57 // Constants
58 #if defined(__i386__) || defined(__x86_64__)
59 constexpr bool kNeedAlignedLoads = false;
60 #else
61 constexpr bool kNeedAlignedLoads = true;
62 #endif
63 
64 // We express the number of zeroes as a number in base ZEROES_BASE. By
65 // pre-computing the zero extensions for all possible components of such an
66 // expression (numbers in a form a*ZEROES_BASE**b), we can calculate the
67 // resulting extension by multiplying the extensions for individual components
68 // using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables of
69 // zero extensions contain (ZEROES_BASE - 1) * (log_{ZEROES_BASE}(64)) entries.
70 constexpr int ZEROES_BASE_LG = 4;                   // log_2(ZEROES_BASE)
71 constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG);  // must be a power of 2
72 
73 constexpr uint32_t kCrc32cPoly = 0x82f63b78;
74 
ReverseBits(uint32_t bits)75 uint32_t ReverseBits(uint32_t bits) {
76   bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1;
77   bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2;
78   bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4;
79   return absl::gbswap_32(bits);
80 }
81 
82 // Polynomial long multiplication mod the polynomial of degree 32.
PolyMultiply(uint32_t * val,uint32_t m,uint32_t poly)83 void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) {
84   uint32_t l = *val;
85   uint32_t result = 0;
86   auto onebit = uint32_t{0x80000000u};
87   for (uint32_t one = onebit; one != 0; one >>= 1) {
88     if ((l & one) != 0) {
89       result ^= m;
90     }
91     if (m & 1) {
92       m = (m >> 1) ^ poly;
93     } else {
94       m >>= 1;
95     }
96   }
97   *val = result;
98 }
99 }  // namespace
100 
FillWordTable(uint32_t poly,uint32_t last,int word_size,Uint32By256 * t)101 void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size,
102                             Uint32By256* t) {
103   for (int j = 0; j != word_size; j++) {  // for each byte of extension....
104     t[j][0] = 0;                          // a zero has no effect
105     for (int i = 128; i != 0; i >>= 1) {  // fill in entries for powers of 2
106       if (j == 0 && i == 128) {
107         t[j][i] = last;  // top bit in last byte is given
108       } else {
109         // each successive power of two is derived from the previous
110         // one, either in this table, or the last table
111         uint32_t pred;
112         if (i == 128) {
113           pred = t[j - 1][1];
114         } else {
115           pred = t[j][i << 1];
116         }
117         // Advance the CRC by one bit (multiply by X, and take remainder
118         // through one step of polynomial long division)
119         if (pred & 1) {
120           t[j][i] = (pred >> 1) ^ poly;
121         } else {
122           t[j][i] = pred >> 1;
123         }
124       }
125     }
126     // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b)
127     // so we can make all the tables for non-powers of two by
128     // xoring previously created entries.
129     for (int i = 2; i != 256; i <<= 1) {
130       for (int k = i + 1; k != (i << 1); k++) {
131         t[j][k] = t[j][i] ^ t[j][k - i];
132       }
133     }
134   }
135 }
136 
FillZeroesTable(uint32_t poly,Uint32By256 * t)137 int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) {
138   uint32_t inc = 1;
139   inc <<= 31;
140 
141   // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0.
142   inc >>= 1;
143 
144   // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte.
145   for (int i = 0; i < 3; ++i) {
146     PolyMultiply(&inc, inc, poly);
147   }
148 
149   int j = 0;
150   for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) {
151     // Every entry in the table adds an additional inc_len zeroes.
152     uint32_t v = inc;
153     for (int a = 1; a != ZEROES_BASE; a++) {
154       t[0][j] = v;
155       PolyMultiply(&v, inc, poly);
156       j++;
157     }
158     inc = v;
159   }
160   ABSL_RAW_CHECK(j <= 256, "");
161   return j;
162 }
163 
164 // Internal version of the "constructor".
NewInternal()165 CRCImpl* CRCImpl::NewInternal() {
166   // Find an accelearated implementation first.
167   CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined();
168 
169   // Fall back to generic implementions if no acceleration is available.
170   if (result == nullptr) {
171     result = new CRC32();
172   }
173 
174   result->InitTables();
175 
176   return result;
177 }
178 
179 // The CRC of the empty string is always the CRC polynomial itself.
Empty(uint32_t * crc) const180 void CRCImpl::Empty(uint32_t* crc) const { *crc = kCrc32cPoly; }
181 
182 //  The 32-bit implementation
183 
InitTables()184 void CRC32::InitTables() {
185   // Compute the table for extending a CRC by one byte.
186   Uint32By256* t = new Uint32By256[4];
187   FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t);
188   for (int i = 0; i != 256; i++) {
189     this->table0_[i] = t[0][i];
190   }
191 
192   // Construct a table for updating the CRC by 4 bytes data followed by
193   // 12 bytes of zeroes.
194   //
195   // Note: the data word size could be larger than the CRC size; it might
196   // be slightly faster to use a 64-bit data word, but doing so doubles the
197   // table size.
198   uint32_t last = kCrc32cPoly;
199   const size_t size = 12;
200   for (size_t i = 0; i < size; ++i) {
201     last = (last >> 8) ^ this->table0_[last & 0xff];
202   }
203   FillWordTable(kCrc32cPoly, last, 4, t);
204   for (size_t b = 0; b < 4; ++b) {
205     for (int i = 0; i < 256; ++i) {
206       this->table_[b][i] = t[b][i];
207     }
208   }
209 
210   int j = FillZeroesTable(kCrc32cPoly, t);
211   ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->zeroes_)), "");
212   for (int i = 0; i < j; i++) {
213     this->zeroes_[i] = t[0][i];
214   }
215 
216   delete[] t;
217 
218   // Build up tables for _reversing_ the operation of doing CRC operations on
219   // zero bytes.
220 
221   // In C++, extending `crc` by a single zero bit is done by the following:
222   // (A)  bool low_bit_set = (crc & 1);
223   //      crc >>= 1;
224   //      if (low_bit_set) crc ^= kCrc32cPoly;
225   //
226   // In particular note that the high bit of `crc` after this operation will be
227   // set if and only if the low bit of `crc` was set before it.  This means that
228   // no information is lost, and the operation can be reversed, as follows:
229   // (B)  bool high_bit_set = (crc & 0x80000000u);
230   //      if (high_bit_set) crc ^= kCrc32cPoly;
231   //      crc <<= 1;
232   //      if (high_bit_set) crc ^= 1;
233   //
234   // Or, equivalently:
235   // (C)  bool high_bit_set = (crc & 0x80000000u);
236   //      crc <<= 1;
237   //      if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1);
238   //
239   // The last observation is, if we store our checksums in variable `rcrc`,
240   // with order of the bits reversed, the inverse operation becomes:
241   // (D)  bool low_bit_set = (rcrc & 1);
242   //      rcrc >>= 1;
243   //      if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1)
244   //
245   // This is the same algorithm (A) that we started with, only with a different
246   // polynomial bit pattern.  This means that by building up our tables with
247   // this alternate polynomial, we can apply the CRC algorithms to a
248   // bit-reversed CRC checksum to perform inverse zero-extension.
249 
250   const uint32_t kCrc32cUnextendPoly =
251       ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1));
252   FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_);
253 
254   j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_);
255   ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->reverse_zeroes_)),
256                  "");
257 }
258 
Extend(uint32_t * crc,const void * bytes,size_t length) const259 void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
260   const uint8_t* p = static_cast<const uint8_t*>(bytes);
261   const uint8_t* e = p + length;
262   uint32_t l = *crc;
263 
264   auto step_one_byte = [this, &p, &l] () {
265     int c = (l & 0xff) ^ *p++;
266     l = this->table0_[c] ^ (l >> 8);
267   };
268 
269   if (kNeedAlignedLoads) {
270     // point x at first 4-byte aligned byte in string. this might be past the
271     // end of the string.
272     const uint8_t* x = RoundUp<4>(p);
273     if (x <= e) {
274       // Process bytes until finished or p is 4-byte aligned
275       while (p != x) {
276         step_one_byte();
277       }
278     }
279   }
280 
281   const size_t kSwathSize = 16;
282   if (static_cast<size_t>(e - p) >= kSwathSize) {
283     // Load one swath of data into the operating buffers.
284     uint32_t buf0 = absl::little_endian::Load32(p) ^ l;
285     uint32_t buf1 = absl::little_endian::Load32(p + 4);
286     uint32_t buf2 = absl::little_endian::Load32(p + 8);
287     uint32_t buf3 = absl::little_endian::Load32(p + 12);
288     p += kSwathSize;
289 
290     // Increment a CRC value by a "swath"; this combines the four bytes
291     // starting at `ptr` and twelve zero bytes, so that four CRCs can be
292     // built incrementally and combined at the end.
293     const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) {
294       return absl::little_endian::Load32(ptr) ^
295              this->table_[3][crc_in & 0xff] ^
296              this->table_[2][(crc_in >> 8) & 0xff] ^
297              this->table_[1][(crc_in >> 16) & 0xff] ^
298              this->table_[0][crc_in >> 24];
299     };
300 
301     // Run one CRC calculation step over all swaths in one 16-byte stride
302     const auto step_stride = [&]() {
303       buf0 = step_swath(buf0, p);
304       buf1 = step_swath(buf1, p + 4);
305       buf2 = step_swath(buf2, p + 8);
306       buf3 = step_swath(buf3, p + 12);
307       p += 16;
308     };
309 
310     // Process kStride interleaved swaths through the data in parallel.
311     while ((e - p) > kPrefetchHorizon) {
312       base_internal::PrefetchNta(
313           reinterpret_cast<const void*>(p + kPrefetchHorizon));
314       // Process 64 bytes at a time
315       step_stride();
316       step_stride();
317       step_stride();
318       step_stride();
319     }
320     while (static_cast<size_t>(e - p) >= kSwathSize) {
321       step_stride();
322     }
323 
324     // Now advance one word at a time as far as possible. This isn't worth
325     // doing if we have word-advance tables.
326     while (static_cast<size_t>(e - p) >= 4) {
327       buf0 = step_swath(buf0, p);
328       uint32_t tmp = buf0;
329       buf0 = buf1;
330       buf1 = buf2;
331       buf2 = buf3;
332       buf3 = tmp;
333       p += 4;
334     }
335 
336     // Combine the results from the different swaths. This is just a CRC
337     // on the data values in the bufX words.
338     auto combine_one_word = [this](uint32_t crc_in, uint32_t w) {
339       w ^= crc_in;
340       for (size_t i = 0; i < 4; ++i) {
341         w = (w >> 8) ^ this->table0_[w & 0xff];
342       }
343       return w;
344     };
345 
346     l = combine_one_word(0, buf0);
347     l = combine_one_word(l, buf1);
348     l = combine_one_word(l, buf2);
349     l = combine_one_word(l, buf3);
350   }
351 
352   // Process the last few bytes
353   while (p != e) {
354     step_one_byte();
355   }
356 
357   *crc = l;
358 }
359 
ExtendByZeroesImpl(uint32_t * crc,size_t length,const uint32_t zeroes_table[256],const uint32_t poly_table[256]) const360 void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length,
361                                const uint32_t zeroes_table[256],
362                                const uint32_t poly_table[256]) const {
363   if (length != 0) {
364     uint32_t l = *crc;
365     // For each ZEROES_BASE_LG bits in length
366     // (after the low-order bits have been removed)
367     // we lookup the appropriate polynomial in the zeroes_ array
368     // and do a polynomial long multiplication (mod the CRC polynomial)
369     // to extend the CRC by the appropriate number of bits.
370     for (int i = 0; length != 0;
371          i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) {
372       int c = length & (ZEROES_BASE - 1);  // pick next ZEROES_BASE_LG bits
373       if (c != 0) {                        // if they are not zero,
374                                            // multiply by entry in table
375         // Build a table to aid in multiplying 2 bits at a time.
376         // It takes too long to build tables for more bits.
377         uint64_t m = zeroes_table[c + i - 1];
378         m <<= 1;
379         uint64_t m2 = m << 1;
380         uint64_t mtab[4] = {0, m, m2, m2 ^ m};
381 
382         // Do the multiply one byte at a time.
383         uint64_t result = 0;
384         for (int x = 0; x < 32; x += 8) {
385           // The carry-less multiply.
386           result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^
387                     (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6);
388           l >>= 8;
389 
390           // Reduce modulo the polynomial
391           result = (result >> 8) ^ poly_table[result & 0xff];
392         }
393         l = static_cast<uint32_t>(result);
394       }
395     }
396     *crc = l;
397   }
398 }
399 
ExtendByZeroes(uint32_t * crc,size_t length) const400 void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const {
401   return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_);
402 }
403 
UnextendByZeroes(uint32_t * crc,size_t length) const404 void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const {
405   // See the comment in CRC32::InitTables() for an explanation of the algorithm
406   // below.
407   *crc = ReverseBits(*crc);
408   ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_);
409   *crc = ReverseBits(*crc);
410 }
411 
Scramble(uint32_t * crc) const412 void CRC32::Scramble(uint32_t* crc) const {
413   // Rotate by near half the word size plus 1.  See the scramble comment in
414   // crc_internal.h for an explanation.
415   constexpr int scramble_rotate = (32 / 2) + 1;
416   *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo),
417                                32, scramble_rotate) &
418          MaskOfLength<uint32_t>(32);
419 }
420 
Unscramble(uint32_t * crc) const421 void CRC32::Unscramble(uint32_t* crc) const {
422   constexpr int scramble_rotate = (32 / 2) + 1;
423   uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32,
424                                            32 - scramble_rotate);
425   *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32);
426 }
427 
428 // Constructor and destructor for base class CRC.
~CRC()429 CRC::~CRC() {}
CRC()430 CRC::CRC() {}
431 
432 // The "constructor" for a CRC32C with a standard polynomial.
Crc32c()433 CRC* CRC::Crc32c() {
434   static CRC* singleton = CRCImpl::NewInternal();
435   return singleton;
436 }
437 
438 // This Concat implementation works for arbitrary polynomials.
Concat(uint32_t * px,uint32_t y,size_t ylen)439 void CRC::Concat(uint32_t* px, uint32_t y, size_t ylen) {
440   // https://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks
441   // The CRC of a message M is the remainder of polynomial divison modulo G,
442   // where the coefficient arithmetic is performed modulo 2 (so +/- are XOR):
443   //   R(x) = M(x) x**n (mod G)
444   // (n is the degree of G)
445   // In practice, we use an initial value A and a bitmask B to get
446   //   R = (A ^ B)x**|M| ^ Mx**n ^ B (mod G)
447   // If M is the concatenation of two strings S and T, and Z is the string of
448   // len(T) 0s, then the remainder CRC(ST) can be expressed as:
449   //   R = (A ^ B)x**|ST| ^ STx**n ^ B
450   //     = (A ^ B)x**|SZ| ^ SZx**n ^ B ^ Tx**n
451   //     = CRC(SZ) ^ Tx**n
452   // CRC(Z) = (A ^ B)x**|T| ^ B
453   // CRC(T) = (A ^ B)x**|T| ^ Tx**n ^ B
454   // So R = CRC(SZ) ^ CRC(Z) ^ CRC(T)
455   //
456   // And further, since CRC(SZ) = Extend(CRC(S), Z),
457   //  CRC(SZ) ^ CRC(Z) = Extend(CRC(S) ^ CRC(''), Z).
458   uint32_t z;
459   uint32_t t;
460   Empty(&z);
461   t = *px ^ z;
462   ExtendByZeroes(&t, ylen);
463   *px = t ^ y;
464 }
465 
466 }  // namespace crc_internal
467 ABSL_NAMESPACE_END
468 }  // namespace absl
469