1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package bzip2 implements bzip2 decompression. 6package bzip2 7 8import "io" 9 10// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot 11// of guessing: https://en.wikipedia.org/wiki/Bzip2 12// The source code to pyflate was useful for debugging: 13// http://www.paul.sladen.org/projects/pyflate 14 15// A StructuralError is returned when the bzip2 data is found to be 16// syntactically invalid. 17type StructuralError string 18 19func (s StructuralError) Error() string { 20 return "bzip2 data invalid: " + string(s) 21} 22 23// A reader decompresses bzip2 compressed data. 24type reader struct { 25 br bitReader 26 fileCRC uint32 27 blockCRC uint32 28 wantBlockCRC uint32 29 setupDone bool // true if we have parsed the bzip2 header. 30 eof bool 31 blockSize int // blockSize in bytes, i.e. 900 * 1000. 32 c [256]uint // the ``C'' array for the inverse BWT. 33 tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits. 34 tPos uint32 // Index of the next output byte in tt. 35 36 preRLE []uint32 // contains the RLE data still to be processed. 37 preRLEUsed int // number of entries of preRLE used. 38 lastByte int // the last byte value seen. 39 byteRepeats uint // the number of repeats of lastByte seen. 40 repeats uint // the number of copies of lastByte to output. 41} 42 43// NewReader returns an io.Reader which decompresses bzip2 data from r. 44// If r does not also implement [io.ByteReader], 45// the decompressor may read more data than necessary from r. 46func NewReader(r io.Reader) io.Reader { 47 bz2 := new(reader) 48 bz2.br = newBitReader(r) 49 return bz2 50} 51 52const bzip2FileMagic = 0x425a // "BZ" 53const bzip2BlockMagic = 0x314159265359 54const bzip2FinalMagic = 0x177245385090 55 56// setup parses the bzip2 header. 57func (bz2 *reader) setup(needMagic bool) error { 58 br := &bz2.br 59 60 if needMagic { 61 magic := br.ReadBits(16) 62 if magic != bzip2FileMagic { 63 return StructuralError("bad magic value") 64 } 65 } 66 67 t := br.ReadBits(8) 68 if t != 'h' { 69 return StructuralError("non-Huffman entropy encoding") 70 } 71 72 level := br.ReadBits(8) 73 if level < '1' || level > '9' { 74 return StructuralError("invalid compression level") 75 } 76 77 bz2.fileCRC = 0 78 bz2.blockSize = 100 * 1000 * (level - '0') 79 if bz2.blockSize > len(bz2.tt) { 80 bz2.tt = make([]uint32, bz2.blockSize) 81 } 82 return nil 83} 84 85func (bz2 *reader) Read(buf []byte) (n int, err error) { 86 if bz2.eof { 87 return 0, io.EOF 88 } 89 90 if !bz2.setupDone { 91 err = bz2.setup(true) 92 brErr := bz2.br.Err() 93 if brErr != nil { 94 err = brErr 95 } 96 if err != nil { 97 return 0, err 98 } 99 bz2.setupDone = true 100 } 101 102 n, err = bz2.read(buf) 103 brErr := bz2.br.Err() 104 if brErr != nil { 105 err = brErr 106 } 107 return 108} 109 110func (bz2 *reader) readFromBlock(buf []byte) int { 111 // bzip2 is a block based compressor, except that it has a run-length 112 // preprocessing step. The block based nature means that we can 113 // preallocate fixed-size buffers and reuse them. However, the RLE 114 // preprocessing would require allocating huge buffers to store the 115 // maximum expansion. Thus we process blocks all at once, except for 116 // the RLE which we decompress as required. 117 n := 0 118 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { 119 // We have RLE data pending. 120 121 // The run-length encoding works like this: 122 // Any sequence of four equal bytes is followed by a length 123 // byte which contains the number of repeats of that byte to 124 // include. (The number of repeats can be zero.) Because we are 125 // decompressing on-demand our state is kept in the reader 126 // object. 127 128 if bz2.repeats > 0 { 129 buf[n] = byte(bz2.lastByte) 130 n++ 131 bz2.repeats-- 132 if bz2.repeats == 0 { 133 bz2.lastByte = -1 134 } 135 continue 136 } 137 138 bz2.tPos = bz2.preRLE[bz2.tPos] 139 b := byte(bz2.tPos) 140 bz2.tPos >>= 8 141 bz2.preRLEUsed++ 142 143 if bz2.byteRepeats == 3 { 144 bz2.repeats = uint(b) 145 bz2.byteRepeats = 0 146 continue 147 } 148 149 if bz2.lastByte == int(b) { 150 bz2.byteRepeats++ 151 } else { 152 bz2.byteRepeats = 0 153 } 154 bz2.lastByte = int(b) 155 156 buf[n] = b 157 n++ 158 } 159 160 return n 161} 162 163func (bz2 *reader) read(buf []byte) (int, error) { 164 for { 165 n := bz2.readFromBlock(buf) 166 if n > 0 || len(buf) == 0 { 167 bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) 168 return n, nil 169 } 170 171 // End of block. Check CRC. 172 if bz2.blockCRC != bz2.wantBlockCRC { 173 bz2.br.err = StructuralError("block checksum mismatch") 174 return 0, bz2.br.err 175 } 176 177 // Find next block. 178 br := &bz2.br 179 switch br.ReadBits64(48) { 180 default: 181 return 0, StructuralError("bad magic value found") 182 183 case bzip2BlockMagic: 184 // Start of block. 185 err := bz2.readBlock() 186 if err != nil { 187 return 0, err 188 } 189 190 case bzip2FinalMagic: 191 // Check end-of-file CRC. 192 wantFileCRC := uint32(br.ReadBits64(32)) 193 if br.err != nil { 194 return 0, br.err 195 } 196 if bz2.fileCRC != wantFileCRC { 197 br.err = StructuralError("file checksum mismatch") 198 return 0, br.err 199 } 200 201 // Skip ahead to byte boundary. 202 // Is there a file concatenated to this one? 203 // It would start with BZ. 204 if br.bits%8 != 0 { 205 br.ReadBits(br.bits % 8) 206 } 207 b, err := br.r.ReadByte() 208 if err == io.EOF { 209 br.err = io.EOF 210 bz2.eof = true 211 return 0, io.EOF 212 } 213 if err != nil { 214 br.err = err 215 return 0, err 216 } 217 z, err := br.r.ReadByte() 218 if err != nil { 219 if err == io.EOF { 220 err = io.ErrUnexpectedEOF 221 } 222 br.err = err 223 return 0, err 224 } 225 if b != 'B' || z != 'Z' { 226 return 0, StructuralError("bad magic value in continuation file") 227 } 228 if err := bz2.setup(false); err != nil { 229 return 0, err 230 } 231 } 232 } 233} 234 235// readBlock reads a bzip2 block. The magic number should already have been consumed. 236func (bz2 *reader) readBlock() (err error) { 237 br := &bz2.br 238 bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. 239 bz2.blockCRC = 0 240 bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC 241 randomized := br.ReadBits(1) 242 if randomized != 0 { 243 return StructuralError("deprecated randomized files") 244 } 245 origPtr := uint(br.ReadBits(24)) 246 247 // If not every byte value is used in the block (i.e., it's text) then 248 // the symbol set is reduced. The symbols used are stored as a 249 // two-level, 16x16 bitmap. 250 symbolRangeUsedBitmap := br.ReadBits(16) 251 symbolPresent := make([]bool, 256) 252 numSymbols := 0 253 for symRange := uint(0); symRange < 16; symRange++ { 254 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { 255 bits := br.ReadBits(16) 256 for symbol := uint(0); symbol < 16; symbol++ { 257 if bits&(1<<(15-symbol)) != 0 { 258 symbolPresent[16*symRange+symbol] = true 259 numSymbols++ 260 } 261 } 262 } 263 } 264 265 if numSymbols == 0 { 266 // There must be an EOF symbol. 267 return StructuralError("no symbols in input") 268 } 269 270 // A block uses between two and six different Huffman trees. 271 numHuffmanTrees := br.ReadBits(3) 272 if numHuffmanTrees < 2 || numHuffmanTrees > 6 { 273 return StructuralError("invalid number of Huffman trees") 274 } 275 276 // The Huffman tree can switch every 50 symbols so there's a list of 277 // tree indexes telling us which tree to use for each 50 symbol block. 278 numSelectors := br.ReadBits(15) 279 treeIndexes := make([]uint8, numSelectors) 280 281 // The tree indexes are move-to-front transformed and stored as unary 282 // numbers. 283 mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) 284 for i := range treeIndexes { 285 c := 0 286 for { 287 inc := br.ReadBits(1) 288 if inc == 0 { 289 break 290 } 291 c++ 292 } 293 if c >= numHuffmanTrees { 294 return StructuralError("tree index too large") 295 } 296 treeIndexes[i] = mtfTreeDecoder.Decode(c) 297 } 298 299 // The list of symbols for the move-to-front transform is taken from 300 // the previously decoded symbol bitmap. 301 symbols := make([]byte, numSymbols) 302 nextSymbol := 0 303 for i := 0; i < 256; i++ { 304 if symbolPresent[i] { 305 symbols[nextSymbol] = byte(i) 306 nextSymbol++ 307 } 308 } 309 mtf := newMTFDecoder(symbols) 310 311 numSymbols += 2 // to account for RUNA and RUNB symbols 312 huffmanTrees := make([]huffmanTree, numHuffmanTrees) 313 314 // Now we decode the arrays of code-lengths for each tree. 315 lengths := make([]uint8, numSymbols) 316 for i := range huffmanTrees { 317 // The code lengths are delta encoded from a 5-bit base value. 318 length := br.ReadBits(5) 319 for j := range lengths { 320 for { 321 if length < 1 || length > 20 { 322 return StructuralError("Huffman length out of range") 323 } 324 if !br.ReadBit() { 325 break 326 } 327 if br.ReadBit() { 328 length-- 329 } else { 330 length++ 331 } 332 } 333 lengths[j] = uint8(length) 334 } 335 huffmanTrees[i], err = newHuffmanTree(lengths) 336 if err != nil { 337 return err 338 } 339 } 340 341 selectorIndex := 1 // the next tree index to use 342 if len(treeIndexes) == 0 { 343 return StructuralError("no tree selectors given") 344 } 345 if int(treeIndexes[0]) >= len(huffmanTrees) { 346 return StructuralError("tree selector out of range") 347 } 348 currentHuffmanTree := huffmanTrees[treeIndexes[0]] 349 bufIndex := 0 // indexes bz2.buf, the output buffer. 350 // The output of the move-to-front transform is run-length encoded and 351 // we merge the decoding into the Huffman parsing loop. These two 352 // variables accumulate the repeat count. See the Wikipedia page for 353 // details. 354 repeat := 0 355 repeatPower := 0 356 357 // The `C' array (used by the inverse BWT) needs to be zero initialized. 358 clear(bz2.c[:]) 359 360 decoded := 0 // counts the number of symbols decoded by the current tree. 361 for { 362 if decoded == 50 { 363 if selectorIndex >= numSelectors { 364 return StructuralError("insufficient selector indices for number of symbols") 365 } 366 if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) { 367 return StructuralError("tree selector out of range") 368 } 369 currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] 370 selectorIndex++ 371 decoded = 0 372 } 373 374 v := currentHuffmanTree.Decode(br) 375 decoded++ 376 377 if v < 2 { 378 // This is either the RUNA or RUNB symbol. 379 if repeat == 0 { 380 repeatPower = 1 381 } 382 repeat += repeatPower << v 383 repeatPower <<= 1 384 385 // This limit of 2 million comes from the bzip2 source 386 // code. It prevents repeat from overflowing. 387 if repeat > 2*1024*1024 { 388 return StructuralError("repeat count too large") 389 } 390 continue 391 } 392 393 if repeat > 0 { 394 // We have decoded a complete run-length so we need to 395 // replicate the last output symbol. 396 if repeat > bz2.blockSize-bufIndex { 397 return StructuralError("repeats past end of block") 398 } 399 for i := 0; i < repeat; i++ { 400 b := mtf.First() 401 bz2.tt[bufIndex] = uint32(b) 402 bz2.c[b]++ 403 bufIndex++ 404 } 405 repeat = 0 406 } 407 408 if int(v) == numSymbols-1 { 409 // This is the EOF symbol. Because it's always at the 410 // end of the move-to-front list, and never gets moved 411 // to the front, it has this unique value. 412 break 413 } 414 415 // Since two metasymbols (RUNA and RUNB) have values 0 and 1, 416 // one would expect |v-2| to be passed to the MTF decoder. 417 // However, the front of the MTF list is never referenced as 0, 418 // it's always referenced with a run-length of 1. Thus 0 419 // doesn't need to be encoded and we have |v-1| in the next 420 // line. 421 b := mtf.Decode(int(v - 1)) 422 if bufIndex >= bz2.blockSize { 423 return StructuralError("data exceeds block size") 424 } 425 bz2.tt[bufIndex] = uint32(b) 426 bz2.c[b]++ 427 bufIndex++ 428 } 429 430 if origPtr >= uint(bufIndex) { 431 return StructuralError("origPtr out of bounds") 432 } 433 434 // We have completed the entropy decoding. Now we can perform the 435 // inverse BWT and setup the RLE buffer. 436 bz2.preRLE = bz2.tt[:bufIndex] 437 bz2.preRLEUsed = 0 438 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) 439 bz2.lastByte = -1 440 bz2.byteRepeats = 0 441 bz2.repeats = 0 442 443 return nil 444} 445 446// inverseBWT implements the inverse Burrows-Wheeler transform as described in 447// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. 448// In that document, origPtr is called “I” and c is the “C” array after the 449// first pass over the data. It's an argument here because we merge the first 450// pass with the Huffman decoding. 451// 452// This also implements the “single array” method from the bzip2 source code 453// which leaves the output, still shuffled, in the bottom 8 bits of tt with the 454// index of the next byte in the top 24-bits. The index of the first byte is 455// returned. 456func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 { 457 sum := uint(0) 458 for i := 0; i < 256; i++ { 459 sum += c[i] 460 c[i] = sum - c[i] 461 } 462 463 for i := range tt { 464 b := tt[i] & 0xff 465 tt[c[b]] |= uint32(i) << 8 466 c[b]++ 467 } 468 469 return tt[origPtr] >> 8 470} 471 472// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, 473// causing the bits in the input to be processed in the reverse of the usual order. 474 475var crctab [256]uint32 476 477func init() { 478 const poly = 0x04C11DB7 479 for i := range crctab { 480 crc := uint32(i) << 24 481 for j := 0; j < 8; j++ { 482 if crc&0x80000000 != 0 { 483 crc = (crc << 1) ^ poly 484 } else { 485 crc <<= 1 486 } 487 } 488 crctab[i] = crc 489 } 490} 491 492// updateCRC updates the crc value to incorporate the data in b. 493// The initial value is 0. 494func updateCRC(val uint32, b []byte) uint32 { 495 crc := ^val 496 for _, v := range b { 497 crc = crctab[byte(crc>>24)^v] ^ (crc << 8) 498 } 499 return ^crc 500} 501