/*
 * Copyright (c) 2011 Matthew Francis
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package org.xbib.io.compress.bzip2;

import java.io.IOException;

/*
 * Block decoding consists of the following stages:
 * 1. Read block header - BZip2BlockDecompressor()
 * 2. Read Huffman tables - readHuffmanTables()
 * 3. Read and decode Huffman encoded data - decodeHuffmanData()
 * 4. Run-Length Decoding[2] - decodeHuffmanData()
 * 5. Inverse Move To Front Transform - decodeHuffmanData()
 * 6. Inverse Burrows Wheeler Transform - initialiseInverseBWT()
 * 7. Run-Length Decoding[1] - read()
 * 8. Optional Block De-Randomisation - read() (through decodeNextBWTByte())
 */

/**
 * Reads and decompresses a single BZip2 block
 */
public class BZip2BlockDecompressor {

    /**
     * The BZip2 specification originally included the optional addition of a slight pseudo-random
     * perturbation to the input data, in order to work around the block sorting algorithm's non-
     * optimal performance on some types of input. The current mainline bzip2 does not require this
     * and will not create randomised blocks, but compatibility is still required for old data (and
     * third party compressors that haven't caught up). When decompressing a randomised block, for
     * each value N in this array, a 1 will be XOR'd onto the output of the Burrows-Wheeler
     * transform stage after N bytes, then the next N taken from the following entry.
     */
    private static final int[] RNUMS = {
        619, 720, 127, 481, 931, 816, 813, 233, 566, 247, 985, 724, 205, 454, 863, 491,
        741, 242, 949, 214, 733, 859, 335, 708, 621, 574, 73, 654, 730, 472, 419, 436,
        278, 496, 867, 210, 399, 680, 480, 51, 878, 465, 811, 169, 869, 675, 611, 697,
        867, 561, 862, 687, 507, 283, 482, 129, 807, 591, 733, 623, 150, 238, 59, 379,
        684, 877, 625, 169, 643, 105, 170, 607, 520, 932, 727, 476, 693, 425, 174, 647,
        73, 122, 335, 530, 442, 853, 695, 249, 445, 515, 909, 545, 703, 919, 874, 474,
        882, 500, 594, 612, 641, 801, 220, 162, 819, 984, 589, 513, 495, 799, 161, 604,
        958, 533, 221, 400, 386, 867, 600, 782, 382, 596, 414, 171, 516, 375, 682, 485,
        911, 276, 98, 553, 163, 354, 666, 933, 424, 341, 533, 870, 227, 730, 475, 186,
        263, 647, 537, 686, 600, 224, 469, 68, 770, 919, 190, 373, 294, 822, 808, 206,
        184, 943, 795, 384, 383, 461, 404, 758, 839, 887, 715, 67, 618, 276, 204, 918,
        873, 777, 604, 560, 951, 160, 578, 722, 79, 804, 96, 409, 713, 940, 652, 934,
        970, 447, 318, 353, 859, 672, 112, 785, 645, 863, 803, 350, 139, 93, 354, 99,
        820, 908, 609, 772, 154, 274, 580, 184, 79, 626, 630, 742, 653, 282, 762, 623,
        680, 81, 927, 626, 789, 125, 411, 521, 938, 300, 821, 78, 343, 175, 128, 250,
        170, 774, 972, 275, 999, 639, 495, 78, 352, 126, 857, 956, 358, 619, 580, 124,
        737, 594, 701, 612, 669, 112, 134, 694, 363, 992, 809, 743, 168, 974, 944, 375,
        748, 52, 600, 747, 642, 182, 862, 81, 344, 805, 988, 739, 511, 655, 814, 334,
        249, 515, 897, 955, 664, 981, 649, 113, 974, 459, 893, 228, 433, 837, 553, 268,
        926, 240, 102, 654, 459, 51, 686, 754, 806, 760, 493, 403, 415, 394, 687, 700,
        946, 670, 656, 610, 738, 392, 760, 799, 887, 653, 978, 321, 576, 617, 626, 502,
        894, 679, 243, 440, 680, 879, 194, 572, 640, 724, 926, 56, 204, 700, 707, 151,
        457, 449, 797, 195, 791, 558, 945, 679, 297, 59, 87, 824, 713, 663, 412, 693,
        342, 606, 134, 108, 571, 364, 631, 212, 174, 643, 304, 329, 343, 97, 430, 751,
        497, 314, 983, 374, 822, 928, 140, 206, 73, 263, 980, 736, 876, 478, 430, 305,
        170, 514, 364, 692, 829, 82, 855, 953, 676, 246, 369, 970, 294, 750, 807, 827,
        150, 790, 288, 923, 804, 378, 215, 828, 592, 281, 565, 555, 710, 82, 896, 831,
        547, 261, 524, 462, 293, 465, 502, 56, 661, 821, 976, 991, 658, 869, 905, 758,
        745, 193, 768, 550, 608, 933, 378, 286, 215, 979, 792, 961, 61, 688, 793, 644,
        986, 403, 106, 366, 905, 644, 372, 567, 466, 434, 645, 210, 389, 550, 919, 135,
        780, 773, 635, 389, 707, 100, 626, 958, 165, 504, 920, 176, 193, 713, 857, 265,
        203, 50, 668, 108, 645, 990, 626, 197, 510, 357, 358, 850, 858, 364, 936, 638
    };

    /**
     * Provides bits of input to decode
     */
    private final BZip2BitInputStream bitInputStream;

    /**
     * Calculates the block CRC from the fully decoded bytes of the block
     */
    private final CRC32 crc = new CRC32();

    /**
     * The CRC of the current block as read from the block header
     */
    private final int blockCRC;

    /**
     * {@code true} if the current block is randomised, otherwise {@code false}
     */
    private final boolean blockRandomised;

    /* Huffman Decoding stage */

    /**
     * The end-of-block Huffman symbol. Decoding of the block ends when this is encountered
     */
    private int huffmanEndOfBlockSymbol;

    /**
     * A map from Huffman symbol index to output character. Some types of data (e.g. ASCII text)
     * may contain only a limited number of byte values; Huffman symbols are only allocated to
     * those values that actually occur in the uncompressed data.
     */
    private final byte[] huffmanSymbolMap = new byte[256];

    /* Move To Front stage */

    /**
     * Counts of each byte value within the {@link #bwtBlock} data. Collected at the Move
     * To Front stage, consumed by the Inverse Burrows Wheeler Transform stage
     */
    private final int[] bwtByteCounts = new int[256];

    /**
     * The Burrows-Wheeler Transform processed data. Read at the Move To Front stage, consumed by the
     * Inverse Burrows Wheeler Transform stage
     */
    private byte[] bwtBlock;

    /* Inverse Burrows-Wheeler Transform stage */

    /**
     * At each position contains the union of :-
     * An output character (8 bits)
     * A pointer from each position to its successor (24 bits, left shifted 8 bits)
     * As the pointer cannot exceed the maximum block size of 900k, 24 bits is more than enough to
     * hold it; Folding the character data into the spare bits while performing the inverse BWT,
     * when both pieces of information are available, saves a large number of memory accesses in
     * the final decoding stages.
     */
    private int[] bwtMergedPointers;

    /**
     * The current merged pointer into the Burrows-Wheeler Transform array
     */
    private int bwtCurrentMergedPointer;

    /**
     * The actual length in bytes of the current block at the Inverse Burrows Wheeler Transform
     * stage (before final Run-Length Decoding)
     */
    private int bwtBlockLength;

    /**
     * The number of output bytes that have been decoded up to the Inverse Burrows Wheeler Transform
     * stage
     */
    private int bwtBytesDecoded;

    /* Run-Length Encoding and Random Perturbation stage */

    /**
     * The most recently RLE decoded byte
     */
    private int rleLastDecodedByte = -1;

    /**
     * The number of previous identical output bytes decoded. After 4 identical bytes, the next byte
     * decoded is an RLE repeat count
     */
    private int rleAccumulator;

    /**
     * The RLE repeat count of the current decoded byte. When this reaches zero, a new byte is
     * decoded
     */
    private int rleRepeat;

    /**
     * If the current block is randomised, the position within the RNUMS randomisation array
     */
    private int randomIndex = 0;

    /**
     * If the current block is randomised, the remaining count at the current RNUMS position
     */
    private int randomCount = RNUMS[0] - 1;

    /**
     * Read and decode the block's Huffman tables
     *
     * @return A decoder for the Huffman stage that uses the decoded tables
     * @throws java.io.IOException if the input stream reaches EOF before all table data has been
     *                             read, or if the table or selector data is invalid
     */
    private BZip2HuffmanStageDecoder readHuffmanTables() throws IOException {
        final BZip2BitInputStream bitInputStream = this.bitInputStream;
        final byte[] huffmanSymbolMap = this.huffmanSymbolMap;
        final byte[][] tableCodeLengths =
                new byte[BZip2Constants.HUFFMAN_MAXIMUM_TABLES][BZip2Constants.HUFFMAN_MAXIMUM_ALPHABET_SIZE];

        /* Read Huffman symbol to output byte map */
        // A 16 bit mask flags which 16-value ranges of the byte space are in use; each flagged
        // range is followed by a further 16 bits flagging the individual byte values
        int huffmanUsedRanges = bitInputStream.readBits(16);
        int huffmanSymbolCount = 0;

        for (int i = 0; i < 16; i++) {
            if ((huffmanUsedRanges & ((1 << 15) >>> i)) != 0) {
                for (int j = 0, k = i << 4; j < 16; j++, k++) {
                    if (bitInputStream.readBoolean()) {
                        huffmanSymbolMap[huffmanSymbolCount++] = (byte) k;
                    }
                }
            }
        }

        int endOfBlockSymbol = huffmanSymbolCount + 1;
        this.huffmanEndOfBlockSymbol = endOfBlockSymbol;

        /* Read total number of tables and selectors */
        final int totalTables = bitInputStream.readBits(3);
        final int totalSelectors = bitInputStream.readBits(15);
        if ((totalTables < BZip2Constants.HUFFMAN_MINIMUM_TABLES)
                || (totalTables > BZip2Constants.HUFFMAN_MAXIMUM_TABLES)
                || (totalSelectors < 1)
                || (totalSelectors > BZip2Constants.HUFFMAN_MAXIMUM_SELECTORS)) {
            throw new BZip2Exception("BZip2 block Huffman tables invalid");
        }

        /* Read and decode MTFed Huffman selector list */
        final MoveToFront tableMTF = new MoveToFront();
        final byte[] selectors = new byte[totalSelectors];
        for (int selector = 0; selector < totalSelectors; selector++) {
            final int selectorIndex = bitInputStream.readUnary();
            // A selector is a Move To Front index into the list of tables, so it can never
            // legitimately reach the number of tables declared for this block
            if (selectorIndex >= totalTables) {
                throw new BZip2Exception("BZip2 block Huffman tables invalid");
            }
            selectors[selector] = tableMTF.indexToFront(selectorIndex);
        }

        /* Read the Canonical Huffman code lengths for each table */
        // Lengths are delta coded: a 5 bit starting length, then for each symbol a sequence of
        // (continue, direction) bit pairs adjusting the running length by one
        for (int table = 0; table < totalTables; table++) {
            int currentLength = bitInputStream.readBits(5);
            for (int i = 0; i <= endOfBlockSymbol; i++) {
                while (bitInputStream.readBoolean()) {
                    currentLength += bitInputStream.readBoolean() ? -1 : 1;
                }
                tableCodeLengths[table][i] = (byte) currentLength;
            }
        }

        return new BZip2HuffmanStageDecoder(bitInputStream, endOfBlockSymbol + 1, tableCodeLengths, selectors);
    }

    /**
     * Reads the Huffman encoded data from the input stream, performs Run-Length Decoding and
     * applies the Move To Front transform to reconstruct the Burrows-Wheeler Transform array
     *
     * @param huffmanDecoder The Huffman decoder through which symbols are read
     * @throws java.io.IOException if an end-of-block symbol was not decoded within the declared block size
     */
    private void decodeHuffmanData(final BZip2HuffmanStageDecoder huffmanDecoder) throws IOException {
        final byte[] bwtBlock = this.bwtBlock;
        final byte[] huffmanSymbolMap = this.huffmanSymbolMap;
        final int streamBlockSize = this.bwtBlock.length;
        final int huffmanEndOfBlockSymbol = this.huffmanEndOfBlockSymbol;
        final int[] bwtByteCounts = this.bwtByteCounts;
        final MoveToFront symbolMTF = new MoveToFront();

        int bwtBlockLength = 0;
        int repeatCount = 0;
        // Held as a long so that doubling it can never wrap, however long the run sequence is
        long repeatIncrement = 1;
        int mtfValue = 0;

        for (; ; ) {
            final int nextSymbol = huffmanDecoder.nextSymbol();

            final boolean isRunA = nextSymbol == BZip2Constants.HUFFMAN_SYMBOL_RUNA;
            if (isRunA || (nextSymbol == BZip2Constants.HUFFMAN_SYMBOL_RUNB)) {
                // RUNA contributes one unit and RUNB two units at the current bit position
                final long runLength = isRunA ? repeatIncrement : (repeatIncrement << 1);
                // Validate on every run symbol rather than only when the run ends: an
                // overlong run is rejected before the accumulated count can overflow
                if (runLength > (streamBlockSize - bwtBlockLength - repeatCount)) {
                    throw new BZip2Exception("BZip2 block exceeds declared block size");
                }
                repeatCount += (int) runLength;
                repeatIncrement <<= 1;
            } else {
                if (repeatCount > 0) {
                    // The run is already known to fit within the block (checked above);
                    // it repeats the byte currently at the front of the MTF list
                    final byte nextByte = huffmanSymbolMap[mtfValue];
                    bwtByteCounts[nextByte & 0xff] += repeatCount;
                    while (--repeatCount >= 0) {
                        bwtBlock[bwtBlockLength++] = nextByte;
                    }

                    repeatCount = 0;
                    repeatIncrement = 1;
                }

                if (nextSymbol == huffmanEndOfBlockSymbol) {
                    break;
                }

                if (bwtBlockLength >= streamBlockSize) {
                    throw new BZip2Exception("BZip2 block exceeds declared block size");
                }

                // Symbols 0 and 1 are RUNA/RUNB, so the MTF index is the symbol value less one
                mtfValue = symbolMTF.indexToFront(nextSymbol - 1) & 0xff;

                final byte nextByte = huffmanSymbolMap[mtfValue];
                bwtByteCounts[nextByte & 0xff]++;
                bwtBlock[bwtBlockLength++] = nextByte;
            }
        }

        this.bwtBlockLength = bwtBlockLength;
    }

    /**
     * Set up the Inverse Burrows-Wheeler Transform merged pointer array
     *
     * @param bwtStartPointer The start pointer into the BWT array
     * @throws java.io.IOException if the given start pointer is invalid
     */
    private void initialiseInverseBWT(final int bwtStartPointer) throws IOException {
        final byte[] bwtBlock = this.bwtBlock;
        final int[] bwtMergedPointers = new int[this.bwtBlockLength];
        final int[] characterBase = new int[256];

        if ((bwtStartPointer < 0) || (bwtStartPointer >= this.bwtBlockLength)) {
            throw new BZip2Exception("BZip2 start pointer invalid");
        }

        // Cumulatise character counts: characterBase[c] becomes the number of bytes in the
        // block with a value less than c
        System.arraycopy(this.bwtByteCounts, 0, characterBase, 1, 255);
        for (int i = 2; i <= 255; i++) {
            characterBase[i] += characterBase[i - 1];
        }

        // Merged-Array Inverse Burrows-Wheeler Transform
        // Combining the output characters and forward pointers into a single array here, where we
        // have already read both of the corresponding values, cuts down on memory accesses in the
        // final walk through the array
        for (int i = 0; i < this.bwtBlockLength; i++) {
            int value = bwtBlock[i] & 0xff;
            bwtMergedPointers[characterBase[value]++] = (i << 8) + value;
        }

        // The raw BWT data is no longer needed once the merged array has been built
        this.bwtBlock = null;
        this.bwtMergedPointers = bwtMergedPointers;
        this.bwtCurrentMergedPointer = bwtMergedPointers[bwtStartPointer];
    }

    /**
     * Decodes a byte from the Burrows-Wheeler Transform stage. If the block has randomisation
     * applied, reverses the randomisation
     *
     * @return The decoded byte
     */
    private int decodeNextBWTByte() {
        // Low 8 bits hold the output byte, the upper 24 bits the index of the successor entry
        int mergedPointer = this.bwtCurrentMergedPointer;
        int nextDecodedByte = mergedPointer & 0xff;
        this.bwtCurrentMergedPointer = this.bwtMergedPointers[mergedPointer >>> 8];

        if (this.blockRandomised) {
            if (--this.randomCount == 0) {
                nextDecodedByte ^= 1;
                this.randomIndex = (this.randomIndex + 1) % 512;
                this.randomCount = RNUMS[this.randomIndex];
            }
        }

        this.bwtBytesDecoded++;

        return nextDecodedByte;
    }

    /**
     * Decodes a byte from the final Run-Length Encoding stage, pulling a new byte from the
     * Burrows-Wheeler Transform stage when required
     *
     * @return The decoded byte, or -1 if there are no more bytes
     */
    public int read() {
        while (this.rleRepeat < 1) {
            // ">=" rather than "==" so that decoding always terminates, even if the block
            // data is malformed
            if (this.bwtBytesDecoded >= this.bwtBlockLength) {
                return -1;
            }

            int nextByte = decodeNextBWTByte();

            if (nextByte != this.rleLastDecodedByte) {
                // New byte, restart accumulation
                this.rleLastDecodedByte = nextByte;
                this.rleRepeat = 1;
                this.rleAccumulator = 1;
                this.crc.updateCRC(nextByte);
            } else {
                if (++this.rleAccumulator == 4) {
                    // Accumulation complete, start repetition. The following byte is the count
                    // of additional repeats; if a malformed block ends before it, emit only the
                    // fourth byte instead of decoding past the end of the block
                    final int repeat = (this.bwtBytesDecoded < this.bwtBlockLength)
                            ? decodeNextBWTByte() + 1
                            : 1;
                    this.rleRepeat = repeat;
                    this.rleAccumulator = 0;
                    this.crc.updateCRC(nextByte, repeat);
                } else {
                    this.rleRepeat = 1;
                    this.crc.updateCRC(nextByte);
                }
            }
        }

        this.rleRepeat--;

        return this.rleLastDecodedByte;
    }

    /**
     * Decodes multiple bytes from the final Run-Length Encoding stage, pulling new bytes from the
     * Burrows-Wheeler Transform stage when required
     *
     * @param destination The array to write to
     * @param offset      The starting position within the array
     * @param length      The number of bytes to read
     * @return The number of bytes actually read, or -1 if there are no bytes left in the block
     */
    public int read(final byte[] destination, int offset, final int length) {
        int i;
        for (i = 0; i < length; i++, offset++) {
            int decoded = read();
            if (decoded == -1) {
                return (i == 0) ? -1 : i;
            }
            destination[offset] = (byte) decoded;
        }
        return i;
    }

    /**
     * Verify and return the block CRC. This method may only be called after all of the block's
     * bytes have been read
     *
     * @return The block CRC
     * @throws java.io.IOException if the CRC verification failed
     */
    public int checkCRC() throws IOException {
        if (this.blockCRC != this.crc.getCRC()) {
            throw new BZip2Exception("BZip2 block CRC error");
        }
        return this.crc.getCRC();
    }

    /**
     * Reads the block header and decodes the block through to the Inverse Burrows Wheeler
     * Transform stage, ready for the decoded bytes to be pulled through {@code read()}
     *
     * @param bitInputStream The BZip2BitInputStream to read from
     * @param blockSize      The maximum decoded size of the block
     * @throws java.io.IOException If the block could not be decoded
     */
    public BZip2BlockDecompressor(final BZip2BitInputStream bitInputStream, final int blockSize) throws IOException {
        this.bitInputStream = bitInputStream;
        this.bwtBlock = new byte[blockSize];

        final int bwtStartPointer;

        // Read block header
        this.blockCRC = this.bitInputStream.readInteger();
        this.blockRandomised = this.bitInputStream.readBoolean();
        bwtStartPointer = this.bitInputStream.readBits(24);

        // Read block data and decode through to the Inverse Burrows Wheeler Transform stage
        BZip2HuffmanStageDecoder huffmanDecoder = readHuffmanTables();
        decodeHuffmanData(huffmanDecoder);
        initialiseInverseBWT(bwtStartPointer);
    }
}