xref: /aosp_15_r20/external/toybox/lib/deflate.c (revision cf5a6c84e2b8763fc1a7db14496fd4742913b199)
1 /* deflate.c - deflate/inflate code for gzip and friends
2  *
3  * Copyright 2014 Rob Landley <[email protected]>
4  *
5  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
6  * LSB 4.1 has gzip, gunzip, and zcat
7  *
8  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
9  */
10 
11 #include "toys.h"
12 
13 struct deflate {
14   // Huffman codes: base offset and extra bits tables (length and distance)
15   char lenbits[29], distbits[30];
16   unsigned short lenbase[29], distbase[30];
17   void *fixdisthuff, *fixlithuff;
18 
19   // CRC
20   void (*crcfunc)(struct deflate *dd, char *data, unsigned len);
21   unsigned crctable[256], crc;
22 
23 
24   // Tables only used for deflation
25   unsigned short *hashhead, *hashchain;
26 
27   // Compressed data buffer (extra space malloced at end)
28   unsigned pos, len;
29   int infd, outfd, outbuflen;
30   char *outbuf, data[];
31 };
32 
33 // little endian bit buffer
34 struct bitbuf {
35   int fd, bitpos, len, max;
36   char *buf, data[];
37 };
38 
39 // malloc a struct bitbuf
bitbuf_init(int fd,int size)40 static struct bitbuf *bitbuf_init(int fd, int size)
41 {
42   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
43 
44   bb->max = size;
45   bb->fd = fd;
46   bb->buf = bb->data;
47 
48   return bb;
49 }
50 
51 // Advance bitpos without the overhead of recording bits
52 // Loads more data when input buffer empty
53 // call with 0 to just load data, returns 0 at EOF
bitbuf_skip(struct bitbuf * bb,int bits)54 static int bitbuf_skip(struct bitbuf *bb, int bits)
55 {
56   int pos = bb->bitpos + bits + (bits<0), len;
57 
58   while (pos >= (len = bb->len<<3)) {
59     pos -= len;
60     if (1 > (bb->len = read(bb->fd, bb->buf, bb->max))) {
61       if (!bits) break;
62       error_exit("inflate EOF");
63     }
64   }
65   bb->bitpos = pos;
66 
67   return pos<len;
68 }
69 
70 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)71 static inline int bitbuf_bit(struct bitbuf *bb)
72 {
73   int bufpos = bb->bitpos>>3;
74 
75   if (bufpos == bb->len) {
76     bitbuf_skip(bb, -1);
77     bufpos = 0;
78   }
79 
80   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
81 }
82 
83 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)84 static unsigned bitbuf_get(struct bitbuf *bb, int bits)
85 {
86   int result = 0, offset = 0;
87 
88   while (bits) {
89     int click = bb->bitpos >> 3, blow, blen;
90 
91     // Load more data if buffer empty
92     if (click == bb->len) {
93       bitbuf_skip(bb, -1);
94       click = 0;
95     }
96 
97     // grab bits from next byte
98     blow = bb->bitpos & 7;
99     blen = 8-blow;
100     if (blen > bits) blen = bits;
101     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
102     offset += blen;
103     bits -= blen;
104     bb->bitpos += blen;
105   }
106 
107   return result;
108 }
109 
bitbuf_flush(struct bitbuf * bb)110 static void bitbuf_flush(struct bitbuf *bb)
111 {
112   if (!bb->bitpos) return;
113 
114   xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3);
115   memset(bb->buf, 0, bb->max);
116   bb->bitpos = 0;
117 }
118 
bitbuf_put(struct bitbuf * bb,int data,int len)119 static void bitbuf_put(struct bitbuf *bb, int data, int len)
120 {
121   while (len) {
122     int click = bb->bitpos >> 3, blow, blen;
123 
124     // Flush buffer if necessary
125     if (click == bb->max) {
126       bitbuf_flush(bb);
127       click = 0;
128     }
129     blow = bb->bitpos & 7;
130     blen = 8-blow;
131     if (blen > len) blen = len;
132     bb->buf[click] |= data << blow;
133     bb->bitpos += blen;
134     data >>= blen;
135     len -= blen;
136   }
137 }
138 
139 // Output inflated data
inflate_out(struct deflate * dd,int len)140 static void inflate_out(struct deflate *dd, int len)
141 {
142   if (!len) return;
143   if (dd->outfd!=-1) xwrite(dd->outfd, dd->data, len);
144   else if (len>dd->outbuflen) error_exit("inflate too big");
145   else {
146     memcpy(dd->outbuf, dd->data, len);
147     dd->outbuf += len;
148     dd->outbuflen -= len;
149   }
150   if (dd->crcfunc) dd->crcfunc(dd, dd->data, len);
151 }
152 
output_byte(struct deflate * dd,char sym)153 static void output_byte(struct deflate *dd, char sym)
154 {
155   int pos = dd->pos++ & 32767;
156 
157   dd->data[pos] = sym;
158   if (pos == 32767) inflate_out(dd, 32768);
159 }
160 
161 // Huffman coding uses bits to traverse a binary tree to a leaf node,
162 // By placing frequently occurring symbols at shorter paths, frequently
163 // used symbols may be represented in fewer bits than uncommon symbols.
164 // (length[0] isn't used but code's clearer if it's there.)
165 
166 struct huff {
167   unsigned short length[16];  // How many symbols have this bit length?
168   unsigned short symbol[288]; // sorted by bit length, then ascending order
169 };
170 
171 // Create simple huffman tree from array of bit lengths.
172 
173 // The symbols in the huffman trees are sorted (first by bit length
174 // of the code to reach them, then by symbol number). This means that given
175 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)176 static void len2huff(struct huff *huff, char bitlen[], int len)
177 {
178   int offset[16];
179   int i;
180 
181   // Count number of codes at each bit length
182   memset(huff, 0, sizeof(struct huff));
183   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
184 
185   // Sort symbols by bit length, then symbol. Get list of starting positions
186   // for each group, then write each symbol to next position within its group.
187   *huff->length = *offset = 0;
188   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
189   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
190 }
191 
192 // Fetch and decode next huffman coded symbol from bitbuf.
193 // This takes advantage of the sorting to navigate the tree as an array:
194 // each time we fetch a bit we have all the codes at that bit level in
195 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)196 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
197 {
198   unsigned short *length = huff->length;
199   int start = 0, offset = 0;
200 
201   // Traverse through the bit lengths until our code is in this range
202   for (;;) {
203     offset = (offset << 1) | bitbuf_bit(bb);
204     start += *++length;
205     if ((offset -= *length) < 0) break;
206     if ((length - huff->length) & 16) error_exit("bad symbol");
207   }
208 
209   return huff->symbol[start + offset];
210 }
211 
212 // Decompress deflated data from bitbuf to dd->outfd.
inflate(struct deflate * dd,struct bitbuf * bb)213 static void inflate(struct deflate *dd, struct bitbuf *bb)
214 {
215   dd->crc = ~0;
216 
217   // repeat until spanked
218   for (;;) {
219     int final, type;
220 
221     final = bitbuf_get(bb, 1);
222     type = bitbuf_get(bb, 2);
223 
224     if (type == 3) error_exit("bad type");
225 
226     // Uncompressed block?
227     if (!type) {
228       int len, nlen;
229 
230       // Align to byte, read length
231       bitbuf_skip(bb, (8-bb->bitpos)&7);
232       len = bitbuf_get(bb, 16);
233       nlen = bitbuf_get(bb, 16);
234       if (len != (0xffff & ~nlen)) error_exit("bad len");
235 
236       // Dump literal output data
237       while (len) {
238         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
239         char *p = bb->buf+pos;
240 
241         // dump bytes until done or end of current bitbuf contents
242         if (bblen > len) bblen = len;
243         pos = bblen;
244         while (pos--) output_byte(dd, *(p++));
245         bitbuf_skip(bb, bblen << 3);
246         len -= bblen;
247       }
248 
249     // Compressed block
250     } else {
251       struct huff *disthuff, *lithuff;
252 
253       // Dynamic huffman codes?
254       if (type == 2) {
255         struct huff *h2 = ((struct huff *)libbuf)+1;
256         int i, litlen, distlen, hufflen;
257         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
258                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
259 
260         // The huffman trees are stored as a series of bit lengths
261         litlen = bitbuf_get(bb, 5)+257;  // max 288
262         distlen = bitbuf_get(bb, 5)+1;   // max 32
263         hufflen = bitbuf_get(bb, 4)+4;   // max 19
264 
265         // The literal and distance codes are themselves compressed, in
266         // a complicated way: an array of bit lengths (hufflen many
267         // entries, each 3 bits) is used to fill out an array of 19 entries
268         // in a magic order, leaving the rest 0. Then make a tree out of it:
269         memset(bits = libbuf+1, 0, 19);
270         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
271         len2huff(h2, bits, 19);
272 
273         // Use that tree to read in the literal and distance bit lengths
274         for (i = 0; i < litlen + distlen;) {
275           int sym = huff_and_puff(bb, h2);
276 
277           // 0-15 are literals, 16 = repeat previous code 3-6 times,
278           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
279           if (sym < 16) bits[i++] = sym;
280           else {
281             int len = sym & 2;
282 
283             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
284             memset(bits+i, bits[i-1] * !(sym&3), len);
285             i += len;
286           }
287         }
288         if (i > litlen+distlen) error_exit("bad tree");
289 
290         len2huff(lithuff = h2, bits, litlen);
291         len2huff(disthuff = ((struct huff *)libbuf)+2, bits+litlen, distlen);
292 
293       // Static huffman codes
294       } else {
295         lithuff = dd->fixlithuff;
296         disthuff = dd->fixdisthuff;
297       }
298 
299       // Use huffman tables to decode block of compressed symbols
300       for (;;) {
301         int sym = huff_and_puff(bb, lithuff);
302 
303         // Literal?
304         if (sym < 256) output_byte(dd, sym);
305 
306         // Copy range?
307         else if (sym > 256) {
308           int len, dist;
309 
310           sym -= 257;
311           len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]);
312           sym = huff_and_puff(bb, disthuff);
313           dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]);
314           sym = dd->pos & 32767;
315 
316           while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]);
317 
318         // End of block
319         } else break;
320       }
321     }
322 
323     // Was that the last block?
324     if (final) break;
325   }
326 
327   if (dd->pos & 32767) inflate_out(dd, dd->pos&32767);
328 }
329 
330 // Deflate from dd->infd to bitbuf
331 // For deflate, dd->len = input read, dd->pos = input consumed
deflate(struct deflate * dd,struct bitbuf * bb)332 static void deflate(struct deflate *dd, struct bitbuf *bb)
333 {
334   char *data = dd->data;
335   int len, final = 0;
336 
337   dd->crc = ~0;
338 
339   while (!final) {
340     // Read next half-window of data if we haven't hit EOF yet.
341     len = readall(dd->infd, data+(dd->len&32768), 32768);
342     if (len < 0) perror_exit("read"); // TODO: add filename
343     if (len != 32768) final++;
344     if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len);
345     // dd->len += len;  crcfunc advances len TODO
346 
347     // store block as literal
348 // TODO: actually compress!
349     bitbuf_put(bb, final, 1);
350     bitbuf_put(bb, 0, 1);
351 
352     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
353     bitbuf_put(bb, len, 16);
354     bitbuf_put(bb, 0xffff & ~len, 16);
355 
356     // repeat until spanked
357     while (dd->pos != dd->len) {
358       unsigned pos = dd->pos&65535;
359 
360       bitbuf_put(bb, data[pos], 8);
361 
362       // need to refill buffer?
363       if (!(32767 & ++dd->pos) && !final) break;
364     }
365   }
366   bitbuf_flush(bb);
367 }
368 
369 // Allocate memory for deflate/inflate.
init_deflate(int compress)370 static struct deflate *init_deflate(int compress)
371 {
372   int i, n = 1;
373   struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1));
374 
375   memset(dd, 0, sizeof(struct deflate));
376   // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain
377   if (compress) {
378     dd->hashhead = (unsigned short *)(dd->data+65536);
379     dd->hashchain = (unsigned short *)(dd->data+65536+32768);
380   }
381 
382   // Calculate lenbits, lenbase, distbits, distbase
383   *dd->lenbase = 3;
384   for (i = 0; i<sizeof(dd->lenbits)-1; i++) {
385     if (i>4) {
386       if (!(i&3)) {
387         dd->lenbits[i]++;
388         n <<= 1;
389       }
390       if (i == 27) n--;
391       else dd->lenbits[i+1] = dd->lenbits[i];
392     }
393     dd->lenbase[i+1] = n + dd->lenbase[i];
394   }
395   n = 0;
396   for (i = 0; i<sizeof(dd->distbits); i++) {
397     dd->distbase[i] = 1<<n;
398     if (i) dd->distbase[i] += dd->distbase[i-1];
399     if (i>3 && !(i&1)) n++;
400     dd->distbits[i] = n;
401   }
402 
403 // TODO layout and lifetime of this?
404   // Init fixed huffman tables
405   for (i=0; i<288; i++) libbuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
406   len2huff(dd->fixlithuff = ((struct huff *)libbuf)+3, libbuf, 288);
407   memset(libbuf, 5, 30);
408   len2huff(dd->fixdisthuff = ((struct huff *)libbuf)+4, libbuf, 30);
409 
410   return dd;
411 }
412 
413 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)414 static int is_gzip(struct bitbuf *bb)
415 {
416   int flags;
417 
418   // Confirm signature
419   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
420     return 0;
421   bitbuf_skip(bb, 6*8);
422 
423   // Skip extra, name, comment, header CRC fields
424   if (flags & 4) bitbuf_skip(bb, bitbuf_get(bb, 16) * 8);
425   if (flags & 8) while (bitbuf_get(bb, 8));
426   if (flags & 16) while (bitbuf_get(bb, 8));
427   if (flags & 2) bitbuf_skip(bb, 16);
428 
429   return 1;
430 }
431 
gzip_crc(struct deflate * dd,char * data,unsigned len)432 static void gzip_crc(struct deflate *dd, char *data, unsigned len)
433 {
434   int i;
435   unsigned crc, *crc_table = dd->crctable;
436 
437   crc = dd->crc;
438   for (i = 0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
439   dd->crc = crc;
440   dd->len += len;
441 }
442 
443 /*
444 // Start with crc = 1, or pass in last crc to append more data
445 // Deferred modulus good for paged size inputs (can't overflow for ~5500 bytes)
446 unsigned adler32(char *buf, unsigned len, unsigned crc)
447 {
448   unsigned aa = crc&((1<<16)-1), bb = crc>>16;
449 
450   while (len--) {
451     aa += *buf++;
452     bb += aa;
453   }
454 
455   return ((bb%65521)<<16)+aa%65521;
456 }
457 */
458 
gzip_fd(int infd,int outfd)459 long long gzip_fd(int infd, int outfd)
460 {
461   struct bitbuf *bb = bitbuf_init(outfd, 4096);
462   struct deflate *dd = init_deflate(1);
463   long long rc;
464 
465   // Header from RFC 1952 section 2.2:
466   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
467   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
468   // Operating System (FF=unknown)
469 
470   dd->infd = infd;
471   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
472 
473   // Little endian crc table
474   crc_init(dd->crctable, 1);
475   dd->crcfunc = gzip_crc;
476 
477   deflate(dd, bb);
478 
479   // tail: crc32, len32
480 
481   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
482   bitbuf_put(bb, ~dd->crc, 32);
483   bitbuf_put(bb, dd->len, 32);
484   rc = dd->len;
485 
486   bitbuf_flush(bb);
487   free(bb);
488   free(dd);
489 
490   return rc;
491 }
492 
gunzip_common(struct bitbuf * bb,struct deflate * dd)493 long long gunzip_common(struct bitbuf *bb, struct deflate *dd)
494 {
495   long long rc = 0;
496 
497   // Little endian crc table
498   crc_init(dd->crctable, 1);
499   dd->crcfunc = gzip_crc;
500 
501   do {
502     if (!is_gzip(bb)) error_exit("not gzip");
503 
504     inflate(dd, bb);
505     // tail: crc32, len32
506     bitbuf_skip(bb, (8-bb->bitpos)&7);
507     if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32))
508       error_exit("bad crc");
509     rc += dd->len;
510     bitbuf_skip(bb, (8-bb->bitpos)&7);
511     dd->pos = dd->len = 0;
512   } while (bitbuf_skip(bb, 0));
513   free(bb);
514   free(dd);
515 
516   return rc;
517 }
518 
gunzip_mem(char * inbuf,int inlen,char * outbuf,int outlen)519 long long gunzip_mem(char *inbuf, int inlen, char *outbuf, int outlen)
520 {
521   struct bitbuf *bb = bitbuf_init(-1, 0);
522   struct deflate *dd = init_deflate(0);
523 
524   bb->buf = inbuf;
525   bb->max = bb->len = inlen;
526   dd->outfd = -1;
527   dd->outbuf = outbuf;
528   dd->outbuflen = outlen;
529 
530   return gunzip_common(bb, dd);
531 }
532 
gunzip_fd(int infd,int outfd)533 long long gunzip_fd(int infd, int outfd)
534 {
535   struct bitbuf *bb = bitbuf_init(infd, 4096);
536   struct deflate *dd = init_deflate(0);
537 
538   dd->outfd = outfd;
539   return gunzip_common(bb, dd);
540 }
541