1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package bzip2 implements bzip2 decompression.
6package bzip2
7
8import "io"
9
10// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
11// of guessing: https://en.wikipedia.org/wiki/Bzip2
12// The source code to pyflate was useful for debugging:
13// http://www.paul.sladen.org/projects/pyflate
14
15// A StructuralError is returned when the bzip2 data is found to be
16// syntactically invalid.
17type StructuralError string
18
19func (s StructuralError) Error() string {
20	return "bzip2 data invalid: " + string(s)
21}
22
23// A reader decompresses bzip2 compressed data.
24type reader struct {
25	br           bitReader
26	fileCRC      uint32
27	blockCRC     uint32
28	wantBlockCRC uint32
29	setupDone    bool // true if we have parsed the bzip2 header.
30	eof          bool
31	blockSize    int       // blockSize in bytes, i.e. 900 * 1000.
32	c            [256]uint // the ``C'' array for the inverse BWT.
33	tt           []uint32  // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
34	tPos         uint32    // Index of the next output byte in tt.
35
36	preRLE      []uint32 // contains the RLE data still to be processed.
37	preRLEUsed  int      // number of entries of preRLE used.
38	lastByte    int      // the last byte value seen.
39	byteRepeats uint     // the number of repeats of lastByte seen.
40	repeats     uint     // the number of copies of lastByte to output.
41}
42
43// NewReader returns an io.Reader which decompresses bzip2 data from r.
44// If r does not also implement [io.ByteReader],
45// the decompressor may read more data than necessary from r.
46func NewReader(r io.Reader) io.Reader {
47	bz2 := new(reader)
48	bz2.br = newBitReader(r)
49	return bz2
50}
51
52const bzip2FileMagic = 0x425a // "BZ"
53const bzip2BlockMagic = 0x314159265359
54const bzip2FinalMagic = 0x177245385090
55
56// setup parses the bzip2 header.
57func (bz2 *reader) setup(needMagic bool) error {
58	br := &bz2.br
59
60	if needMagic {
61		magic := br.ReadBits(16)
62		if magic != bzip2FileMagic {
63			return StructuralError("bad magic value")
64		}
65	}
66
67	t := br.ReadBits(8)
68	if t != 'h' {
69		return StructuralError("non-Huffman entropy encoding")
70	}
71
72	level := br.ReadBits(8)
73	if level < '1' || level > '9' {
74		return StructuralError("invalid compression level")
75	}
76
77	bz2.fileCRC = 0
78	bz2.blockSize = 100 * 1000 * (level - '0')
79	if bz2.blockSize > len(bz2.tt) {
80		bz2.tt = make([]uint32, bz2.blockSize)
81	}
82	return nil
83}
84
85func (bz2 *reader) Read(buf []byte) (n int, err error) {
86	if bz2.eof {
87		return 0, io.EOF
88	}
89
90	if !bz2.setupDone {
91		err = bz2.setup(true)
92		brErr := bz2.br.Err()
93		if brErr != nil {
94			err = brErr
95		}
96		if err != nil {
97			return 0, err
98		}
99		bz2.setupDone = true
100	}
101
102	n, err = bz2.read(buf)
103	brErr := bz2.br.Err()
104	if brErr != nil {
105		err = brErr
106	}
107	return
108}
109
110func (bz2 *reader) readFromBlock(buf []byte) int {
111	// bzip2 is a block based compressor, except that it has a run-length
112	// preprocessing step. The block based nature means that we can
113	// preallocate fixed-size buffers and reuse them. However, the RLE
114	// preprocessing would require allocating huge buffers to store the
115	// maximum expansion. Thus we process blocks all at once, except for
116	// the RLE which we decompress as required.
117	n := 0
118	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
119		// We have RLE data pending.
120
121		// The run-length encoding works like this:
122		// Any sequence of four equal bytes is followed by a length
123		// byte which contains the number of repeats of that byte to
124		// include. (The number of repeats can be zero.) Because we are
125		// decompressing on-demand our state is kept in the reader
126		// object.
127
128		if bz2.repeats > 0 {
129			buf[n] = byte(bz2.lastByte)
130			n++
131			bz2.repeats--
132			if bz2.repeats == 0 {
133				bz2.lastByte = -1
134			}
135			continue
136		}
137
138		bz2.tPos = bz2.preRLE[bz2.tPos]
139		b := byte(bz2.tPos)
140		bz2.tPos >>= 8
141		bz2.preRLEUsed++
142
143		if bz2.byteRepeats == 3 {
144			bz2.repeats = uint(b)
145			bz2.byteRepeats = 0
146			continue
147		}
148
149		if bz2.lastByte == int(b) {
150			bz2.byteRepeats++
151		} else {
152			bz2.byteRepeats = 0
153		}
154		bz2.lastByte = int(b)
155
156		buf[n] = b
157		n++
158	}
159
160	return n
161}
162
163func (bz2 *reader) read(buf []byte) (int, error) {
164	for {
165		n := bz2.readFromBlock(buf)
166		if n > 0 || len(buf) == 0 {
167			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
168			return n, nil
169		}
170
171		// End of block. Check CRC.
172		if bz2.blockCRC != bz2.wantBlockCRC {
173			bz2.br.err = StructuralError("block checksum mismatch")
174			return 0, bz2.br.err
175		}
176
177		// Find next block.
178		br := &bz2.br
179		switch br.ReadBits64(48) {
180		default:
181			return 0, StructuralError("bad magic value found")
182
183		case bzip2BlockMagic:
184			// Start of block.
185			err := bz2.readBlock()
186			if err != nil {
187				return 0, err
188			}
189
190		case bzip2FinalMagic:
191			// Check end-of-file CRC.
192			wantFileCRC := uint32(br.ReadBits64(32))
193			if br.err != nil {
194				return 0, br.err
195			}
196			if bz2.fileCRC != wantFileCRC {
197				br.err = StructuralError("file checksum mismatch")
198				return 0, br.err
199			}
200
201			// Skip ahead to byte boundary.
202			// Is there a file concatenated to this one?
203			// It would start with BZ.
204			if br.bits%8 != 0 {
205				br.ReadBits(br.bits % 8)
206			}
207			b, err := br.r.ReadByte()
208			if err == io.EOF {
209				br.err = io.EOF
210				bz2.eof = true
211				return 0, io.EOF
212			}
213			if err != nil {
214				br.err = err
215				return 0, err
216			}
217			z, err := br.r.ReadByte()
218			if err != nil {
219				if err == io.EOF {
220					err = io.ErrUnexpectedEOF
221				}
222				br.err = err
223				return 0, err
224			}
225			if b != 'B' || z != 'Z' {
226				return 0, StructuralError("bad magic value in continuation file")
227			}
228			if err := bz2.setup(false); err != nil {
229				return 0, err
230			}
231		}
232	}
233}
234
235// readBlock reads a bzip2 block. The magic number should already have been consumed.
236func (bz2 *reader) readBlock() (err error) {
237	br := &bz2.br
238	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
239	bz2.blockCRC = 0
240	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
241	randomized := br.ReadBits(1)
242	if randomized != 0 {
243		return StructuralError("deprecated randomized files")
244	}
245	origPtr := uint(br.ReadBits(24))
246
247	// If not every byte value is used in the block (i.e., it's text) then
248	// the symbol set is reduced. The symbols used are stored as a
249	// two-level, 16x16 bitmap.
250	symbolRangeUsedBitmap := br.ReadBits(16)
251	symbolPresent := make([]bool, 256)
252	numSymbols := 0
253	for symRange := uint(0); symRange < 16; symRange++ {
254		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
255			bits := br.ReadBits(16)
256			for symbol := uint(0); symbol < 16; symbol++ {
257				if bits&(1<<(15-symbol)) != 0 {
258					symbolPresent[16*symRange+symbol] = true
259					numSymbols++
260				}
261			}
262		}
263	}
264
265	if numSymbols == 0 {
266		// There must be an EOF symbol.
267		return StructuralError("no symbols in input")
268	}
269
270	// A block uses between two and six different Huffman trees.
271	numHuffmanTrees := br.ReadBits(3)
272	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
273		return StructuralError("invalid number of Huffman trees")
274	}
275
276	// The Huffman tree can switch every 50 symbols so there's a list of
277	// tree indexes telling us which tree to use for each 50 symbol block.
278	numSelectors := br.ReadBits(15)
279	treeIndexes := make([]uint8, numSelectors)
280
281	// The tree indexes are move-to-front transformed and stored as unary
282	// numbers.
283	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
284	for i := range treeIndexes {
285		c := 0
286		for {
287			inc := br.ReadBits(1)
288			if inc == 0 {
289				break
290			}
291			c++
292		}
293		if c >= numHuffmanTrees {
294			return StructuralError("tree index too large")
295		}
296		treeIndexes[i] = mtfTreeDecoder.Decode(c)
297	}
298
299	// The list of symbols for the move-to-front transform is taken from
300	// the previously decoded symbol bitmap.
301	symbols := make([]byte, numSymbols)
302	nextSymbol := 0
303	for i := 0; i < 256; i++ {
304		if symbolPresent[i] {
305			symbols[nextSymbol] = byte(i)
306			nextSymbol++
307		}
308	}
309	mtf := newMTFDecoder(symbols)
310
311	numSymbols += 2 // to account for RUNA and RUNB symbols
312	huffmanTrees := make([]huffmanTree, numHuffmanTrees)
313
314	// Now we decode the arrays of code-lengths for each tree.
315	lengths := make([]uint8, numSymbols)
316	for i := range huffmanTrees {
317		// The code lengths are delta encoded from a 5-bit base value.
318		length := br.ReadBits(5)
319		for j := range lengths {
320			for {
321				if length < 1 || length > 20 {
322					return StructuralError("Huffman length out of range")
323				}
324				if !br.ReadBit() {
325					break
326				}
327				if br.ReadBit() {
328					length--
329				} else {
330					length++
331				}
332			}
333			lengths[j] = uint8(length)
334		}
335		huffmanTrees[i], err = newHuffmanTree(lengths)
336		if err != nil {
337			return err
338		}
339	}
340
341	selectorIndex := 1 // the next tree index to use
342	if len(treeIndexes) == 0 {
343		return StructuralError("no tree selectors given")
344	}
345	if int(treeIndexes[0]) >= len(huffmanTrees) {
346		return StructuralError("tree selector out of range")
347	}
348	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
349	bufIndex := 0 // indexes bz2.buf, the output buffer.
350	// The output of the move-to-front transform is run-length encoded and
351	// we merge the decoding into the Huffman parsing loop. These two
352	// variables accumulate the repeat count. See the Wikipedia page for
353	// details.
354	repeat := 0
355	repeatPower := 0
356
357	// The `C' array (used by the inverse BWT) needs to be zero initialized.
358	clear(bz2.c[:])
359
360	decoded := 0 // counts the number of symbols decoded by the current tree.
361	for {
362		if decoded == 50 {
363			if selectorIndex >= numSelectors {
364				return StructuralError("insufficient selector indices for number of symbols")
365			}
366			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
367				return StructuralError("tree selector out of range")
368			}
369			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
370			selectorIndex++
371			decoded = 0
372		}
373
374		v := currentHuffmanTree.Decode(br)
375		decoded++
376
377		if v < 2 {
378			// This is either the RUNA or RUNB symbol.
379			if repeat == 0 {
380				repeatPower = 1
381			}
382			repeat += repeatPower << v
383			repeatPower <<= 1
384
385			// This limit of 2 million comes from the bzip2 source
386			// code. It prevents repeat from overflowing.
387			if repeat > 2*1024*1024 {
388				return StructuralError("repeat count too large")
389			}
390			continue
391		}
392
393		if repeat > 0 {
394			// We have decoded a complete run-length so we need to
395			// replicate the last output symbol.
396			if repeat > bz2.blockSize-bufIndex {
397				return StructuralError("repeats past end of block")
398			}
399			for i := 0; i < repeat; i++ {
400				b := mtf.First()
401				bz2.tt[bufIndex] = uint32(b)
402				bz2.c[b]++
403				bufIndex++
404			}
405			repeat = 0
406		}
407
408		if int(v) == numSymbols-1 {
409			// This is the EOF symbol. Because it's always at the
410			// end of the move-to-front list, and never gets moved
411			// to the front, it has this unique value.
412			break
413		}
414
415		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
416		// one would expect |v-2| to be passed to the MTF decoder.
417		// However, the front of the MTF list is never referenced as 0,
418		// it's always referenced with a run-length of 1. Thus 0
419		// doesn't need to be encoded and we have |v-1| in the next
420		// line.
421		b := mtf.Decode(int(v - 1))
422		if bufIndex >= bz2.blockSize {
423			return StructuralError("data exceeds block size")
424		}
425		bz2.tt[bufIndex] = uint32(b)
426		bz2.c[b]++
427		bufIndex++
428	}
429
430	if origPtr >= uint(bufIndex) {
431		return StructuralError("origPtr out of bounds")
432	}
433
434	// We have completed the entropy decoding. Now we can perform the
435	// inverse BWT and setup the RLE buffer.
436	bz2.preRLE = bz2.tt[:bufIndex]
437	bz2.preRLEUsed = 0
438	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
439	bz2.lastByte = -1
440	bz2.byteRepeats = 0
441	bz2.repeats = 0
442
443	return nil
444}
445
446// inverseBWT implements the inverse Burrows-Wheeler transform as described in
447// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
448// In that document, origPtr is called “I” and c is the “C” array after the
449// first pass over the data. It's an argument here because we merge the first
450// pass with the Huffman decoding.
451//
452// This also implements the “single array” method from the bzip2 source code
453// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
454// index of the next byte in the top 24-bits. The index of the first byte is
455// returned.
456func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
457	sum := uint(0)
458	for i := 0; i < 256; i++ {
459		sum += c[i]
460		c[i] = sum - c[i]
461	}
462
463	for i := range tt {
464		b := tt[i] & 0xff
465		tt[c[b]] |= uint32(i) << 8
466		c[b]++
467	}
468
469	return tt[origPtr] >> 8
470}
471
472// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
473// causing the bits in the input to be processed in the reverse of the usual order.
474
475var crctab [256]uint32
476
477func init() {
478	const poly = 0x04C11DB7
479	for i := range crctab {
480		crc := uint32(i) << 24
481		for j := 0; j < 8; j++ {
482			if crc&0x80000000 != 0 {
483				crc = (crc << 1) ^ poly
484			} else {
485				crc <<= 1
486			}
487		}
488		crctab[i] = crc
489	}
490}
491
492// updateCRC updates the crc value to incorporate the data in b.
493// The initial value is 0.
494func updateCRC(val uint32, b []byte) uint32 {
495	crc := ^val
496	for _, v := range b {
497		crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
498	}
499	return ^crc
500}
501