/*
 * Copyright (c) 2011 Matthew Francis
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package org.xbib.io.compress.bzip2;

import java.io.IOException;
import java.util.Arrays;

/**
 * An encoder for the BZip2 Huffman encoding stage.
 * <p>
 * An instance is built around one MTF block and, when {@link #encode()} is called, chooses a
 * Huffman table for every group of {@code BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH} symbols,
 * derives code lengths and canonical codes for each table, and writes the selectors, the
 * tables and the encoded symbols to the bit output stream.
 */
class BZip2HuffmanStageEncoder {

    /**
     * Code length given to symbols outside a table's "low cost" range during initial
     * Huffman table generation
     */
    private static final int HUFFMAN_HIGH_SYMBOL_COST = 15;

    /**
     * The BZip2BitOutputStream to which the Huffman tables and data is written
     */
    private final BZip2BitOutputStream bitOutputStream;

    /**
     * The output of the Move To Front Transform and Run Length Encoding[2] stages
     */
    private final char[] mtfBlock;

    /**
     * The actual number of values contained in the mtfBlock array
     */
    private final int mtfLength;

    /**
     * The number of unique values in the mtfBlock array
     */
    private final int mtfAlphabetSize;

    /**
     * The global frequencies of values within the mtfBlock array
     */
    private final int[] mtfSymbolFrequencies;

    /**
     * The Canonical Huffman code lengths for each table, indexed as [table][symbol]
     */
    private final int[][] huffmanCodeLengths;

    /**
     * Merged code symbols for each table, indexed as [table][symbol]. The value at each
     * position is ((code length << 24) | code)
     */
    private final int[][] huffmanMergedCodeSymbols;

    /**
     * The selectors for each segment: one table index per group of
     * {@code BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH} symbols
     */
    private final byte[] selectors;

    /**
     * Creates an encoder for one MTF block. The number of Huffman tables is derived from
     * {@code mtfLength}, and the selector array is sized to hold one entry per (possibly
     * partial) group of {@code BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH} symbols.
     *
     * @param bitOutputStream The BZip2BitOutputStream to write to
     * @param mtfBlock The MTF block data
     * @param mtfLength The actual length of the MTF block
     * @param mtfAlphabetSize The size of the MTF block's alphabet
     * @param mtfSymbolFrequencies The frequencies of the MTF block's symbols
     */
    public BZip2HuffmanStageEncoder(final BZip2BitOutputStream bitOutputStream, final char[] mtfBlock,
                                    final int mtfLength, final int mtfAlphabetSize,
                                    final int[] mtfSymbolFrequencies) {
        this.bitOutputStream = bitOutputStream;
        this.mtfBlock = mtfBlock;
        this.mtfLength = mtfLength;
        this.mtfAlphabetSize = mtfAlphabetSize;
        this.mtfSymbolFrequencies = mtfSymbolFrequencies;

        final int totalTables = selectTableCount(mtfLength);

        this.huffmanCodeLengths = new int[totalTables][mtfAlphabetSize];
        this.huffmanMergedCodeSymbols = new int[totalTables][mtfAlphabetSize];

        // Round up so that a trailing partial group still gets a selector
        this.selectors = new byte[(mtfLength + BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH - 1)
                / BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH];
    }

    /**
     * Selects an appropriate table count for a given MTF length. Longer blocks get more
     * tables, from 2 (fewer than 200 symbols) up to 6 (2400 symbols or more).
     *
     * @param mtfLength The length to select a table count for
     * @return The selected table count
     */
    private static int selectTableCount(final int mtfLength) {
        if (mtfLength >= 2400) {
            return 6;
        }
        if (mtfLength >= 1200) {
            return 5;
        }
        if (mtfLength >= 600) {
            return 4;
        }
        if (mtfLength >= 200) {
            return 3;
        }
        return 2;
    }

    /**
     * Generate a Huffman code length table for a given list of symbol frequencies
     *
     * @param alphabetSize The total number of symbols
     * @param symbolFrequencies The frequencies of the symbols
     * @param codeLengths The array to which the generated code lengths should be written
     */
    private static void generateHuffmanCodeLengths(final int alphabetSize, final int[] symbolFrequencies,
                                                   final int[] codeLengths) {
        final int[] mergedFrequenciesAndIndices = new int[alphabetSize];
        final int[] sortedFrequencies = new int[alphabetSize];

        // The Huffman allocator needs its input symbol frequencies to be sorted, but we need to return code lengths in the same order as the
        // corresponding frequencies are passed in

        // The symbol frequency and index are merged into a single array of integers - frequency in the high 23 bits, index in the low 9 bits.
        // 2^23 = 8,388,608 which is higher than the maximum possible frequency for one symbol in a block
        // 2^9 = 512 which is higher than the maximum possible alphabet size (== 258)
        // Sorting this array simultaneously sorts the frequencies and leaves a lookup that can be used to cheaply invert the sort
        for (int i = 0; i < alphabetSize; i++) {
            mergedFrequenciesAndIndices[i] = (symbolFrequencies[i] << 9) | i;
        }
        Arrays.sort(mergedFrequenciesAndIndices);
        for (int i = 0; i < alphabetSize; i++) {
            sortedFrequencies[i] = mergedFrequenciesAndIndices[i] >>> 9;
        }

        // Allocate code lengths - the allocation is in place, so the code lengths will be in the sortedFrequencies array afterwards
        HuffmanAllocator.allocateHuffmanCodeLengths(sortedFrequencies, BZip2Constants.HUFFMAN_ENCODE_MAXIMUM_CODE_LENGTH);

        // Reverse the sort to place the code lengths in the same order as the symbols whose frequencies were passed in
        for (int i = 0; i < alphabetSize; i++) {
            codeLengths[mergedFrequenciesAndIndices[i] & 0x1ff] = sortedFrequencies[i];
        }
    }

    /**
     * Generate initial Huffman code length tables, giving each table a different low cost section
     * of the alphabet that is roughly equal in overall cumulative frequency. Note that the initial
     * tables are invalid for actual Huffman code generation, and only serve as the seed for later
     * iterative optimisation in {@link #optimiseSelectorsAndHuffmanTables(boolean)}.
     */
    private void generateHuffmanOptimisationSeeds() {
        final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
        final int[] mtfSymbolFrequencies = this.mtfSymbolFrequencies;
        final int mtfAlphabetSize = this.mtfAlphabetSize;

        final int totalTables = huffmanCodeLengths.length;

        int remainingLength = this.mtfLength;
        int lowCostEnd = -1;

        for (int i = 0; i < totalTables; i++) {
            // Aim to give each remaining table an equal share of the symbols not yet covered
            final int targetCumulativeFrequency = remainingLength / (totalTables - i);
            final int lowCostStart = lowCostEnd + 1;
            int actualCumulativeFrequency = 0;

            // Extend the low cost range until the target is reached or the alphabet is exhausted
            while ((actualCumulativeFrequency < targetCumulativeFrequency) && (lowCostEnd < (mtfAlphabetSize - 1))) {
                actualCumulativeFrequency += mtfSymbolFrequencies[++lowCostEnd];
            }

            // For some interior tables (neither first nor last, with an even number of tables
            // remaining), give the last symbol of a multi-symbol range back to the next table
            if ((lowCostEnd > lowCostStart) && (i != 0) && (i != (totalTables - 1)) && (((totalTables - i) & 1) == 0)) {
                actualCumulativeFrequency -= mtfSymbolFrequencies[lowCostEnd--];
            }

            // Symbols inside [lowCostStart, lowCostEnd] keep the array's existing value;
            // everything else is assigned the high cost
            final int[] tableCodeLengths = huffmanCodeLengths[i];
            for (int j = 0; j < mtfAlphabetSize; j++) {
                if ((j < lowCostStart) || (j > lowCostEnd)) {
                    tableCodeLengths[j] = HUFFMAN_HIGH_SYMBOL_COST;
                }
            }

            remainingLength -= actualCumulativeFrequency;
        }
    }

    /**
     * Co-optimise the selector list and the alternative Huffman table code lengths. This method is
     * called repeatedly in the hope that the total encoded size of the selectors, the Huffman code
     * lengths and the block data encoded with them will converge towards a minimum.<br>
     * If the data is highly incompressible, it is possible that the total encoded size will
     * instead diverge (increase) slightly.<br>
     *
     * @param storeSelectors If {@code true}, write out the (final) chosen selectors
     */
    private void optimiseSelectorsAndHuffmanTables(final boolean storeSelectors) {
        final char[] mtfBlock = this.mtfBlock;
        final byte[] selectors = this.selectors;
        final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
        final int mtfLength = this.mtfLength;
        final int mtfAlphabetSize = this.mtfAlphabetSize;

        final int totalTables = huffmanCodeLengths.length;
        final int[][] tableFrequencies = new int[totalTables][mtfAlphabetSize];

        // One scratch array reused for every group; int accumulation avoids the silent
        // narrowing that a compound assignment on a smaller type would perform
        final int[] cost = new int[totalTables];

        int selectorIndex = 0;

        // Find the best table for each group of HUFFMAN_GROUP_RUN_LENGTH symbols based on the current Huffman code lengths
        for (int groupStart = 0; groupStart < mtfLength; ) {
            final int groupEnd = Math.min(groupStart + BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;

            // Calculate the cost of this group when encoded by each table
            Arrays.fill(cost, 0);
            for (int i = groupStart; i <= groupEnd; i++) {
                final int value = mtfBlock[i];
                for (int j = 0; j < totalTables; j++) {
                    cost[j] += huffmanCodeLengths[j][value];
                }
            }

            // Find the table with the least cost for this group (ties go to the lowest index)
            byte bestTable = 0;
            int bestCost = cost[0];
            for (byte i = 1; i < totalTables; i++) {
                final int tableCost = cost[i];
                if (tableCost < bestCost) {
                    bestCost = tableCost;
                    bestTable = i;
                }
            }

            // Accumulate symbol frequencies for the table chosen for this block
            final int[] bestGroupFrequencies = tableFrequencies[bestTable];
            for (int i = groupStart; i <= groupEnd; i++) {
                bestGroupFrequencies[mtfBlock[i]]++;
            }

            // Store a selector indicating the table chosen for this block
            if (storeSelectors) {
                selectors[selectorIndex++] = bestTable;
            }

            groupStart = groupEnd + 1;
        }

        // Generate new Huffman code lengths based on the frequencies for each table accumulated in this iteration
        for (int i = 0; i < totalTables; i++) {
            generateHuffmanCodeLengths(mtfAlphabetSize, tableFrequencies[i], huffmanCodeLengths[i]);
        }
    }

    /**
     * Assigns Canonical Huffman codes based on the calculated lengths. For each table, codes are
     * handed out in order of increasing code length and, within one length, in order of increasing
     * symbol index; the results are stored in merged form as ((code length << 24) | code).
     */
    private void assignHuffmanCodeSymbols() {
        final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
        final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
        final int mtfAlphabetSize = this.mtfAlphabetSize;

        final int totalTables = huffmanCodeLengths.length;

        for (int i = 0; i < totalTables; i++) {
            final int[] tableLengths = huffmanCodeLengths[i];
            final int[] tableMergedCodeSymbols = huffmanMergedCodeSymbols[i];

            // Determine the range of code lengths present in this table
            int minimumLength = 32;
            int maximumLength = 0;
            for (int j = 0; j < mtfAlphabetSize; j++) {
                final int length = tableLengths[j];
                if (length > maximumLength) {
                    maximumLength = length;
                }
                if (length < minimumLength) {
                    minimumLength = length;
                }
            }

            // Hand out consecutive codes per length; moving to the next length appends a zero bit
            int code = 0;
            for (int j = minimumLength; j <= maximumLength; j++) {
                for (int k = 0; k < mtfAlphabetSize; k++) {
                    if ((tableLengths[k] & 0xff) == j) {
                        tableMergedCodeSymbols[k] = (j << 24) | code;
                        code++;
                    }
                }
                code <<= 1;
            }
        }
    }

    /**
     * Write out the selector list and Huffman tables. The output consists of the table count
     * (3 bits), the selector count (15 bits), each selector as the unary-coded result of a
     * move-to-front transform, and then each table's code lengths delta-coded against the
     * previous symbol's length.
     *
     * @throws java.io.IOException on any I/O error writing the data
     */
    private void writeSelectorsAndHuffmanTables() throws IOException {
        final BZip2BitOutputStream bitOutputStream = this.bitOutputStream;
        final byte[] selectors = this.selectors;
        final int totalSelectors = selectors.length;
        final int[][] huffmanCodeLengths = this.huffmanCodeLengths;
        final int mtfAlphabetSize = this.mtfAlphabetSize;

        final int totalTables = huffmanCodeLengths.length;

        bitOutputStream.writeBits(3, totalTables);
        bitOutputStream.writeBits(15, totalSelectors);

        // Write the selectors
        final MoveToFront selectorMTF = new MoveToFront();
        for (int i = 0; i < totalSelectors; i++) {
            bitOutputStream.writeUnary(selectorMTF.valueToFront(selectors[i]));
        }

        // Write the Huffman tables
        for (int i = 0; i < totalTables; i++) {
            final int[] tableLengths = huffmanCodeLengths[i];

            // Each table starts with the first symbol's length as a 5-bit value
            int currentLength = tableLengths[0];
            bitOutputStream.writeBits(5, currentLength);

            for (int j = 0; j < mtfAlphabetSize; j++) {
                final int codeLength = tableLengths[j];

                // One 2-bit step per unit of difference: 2 when the length rises, 3 when it falls
                final int value = (currentLength < codeLength) ? 2 : 3;
                int delta = Math.abs(codeLength - currentLength);
                while (delta-- > 0) {
                    bitOutputStream.writeBits(2, value);
                }

                // A single false bit terminates this symbol's adjustments
                bitOutputStream.writeBoolean(false);
                currentLength = codeLength;
            }
        }
    }

    /**
     * Writes out the encoded block data. Symbols are processed in groups of
     * {@code BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH}; each group is encoded with the table
     * named by the next entry of the selector list.
     *
     * @throws java.io.IOException on any I/O error writing the data
     */
    private void writeBlockData() throws IOException {
        final BZip2BitOutputStream bitOutputStream = this.bitOutputStream;
        final int[][] huffmanMergedCodeSymbols = this.huffmanMergedCodeSymbols;
        final byte[] selectors = this.selectors;
        final char[] mtf = this.mtfBlock;
        final int mtfLength = this.mtfLength;

        int selectorIndex = 0;
        for (int mtfIndex = 0; mtfIndex < mtfLength; ) {
            final int groupEnd = Math.min(mtfIndex + BZip2Constants.HUFFMAN_GROUP_RUN_LENGTH, mtfLength) - 1;
            final int[] tableMergedCodeSymbols = huffmanMergedCodeSymbols[selectors[selectorIndex++]];

            while (mtfIndex <= groupEnd) {
                // The top 8 bits carry the code length; the full merged value is passed as the
                // data (presumably writeBits only emits the low "length" bits - confirm against
                // BZip2BitOutputStream)
                final int mergedCodeSymbol = tableMergedCodeSymbols[mtf[mtfIndex++]];
                bitOutputStream.writeBits(mergedCodeSymbol >>> 24, mergedCodeSymbol);
            }
        }
    }

    /**
     * Encodes and writes the block data. The tables are seeded once and then refined over four
     * optimisation passes, with the selectors recorded only on the last pass, before the
     * canonical codes are assigned and everything is written out.
     *
     * @throws java.io.IOException on any I/O error writing the data
     */
    public void encode() throws IOException {
        // Create optimised selector list and Huffman tables
        generateHuffmanOptimisationSeeds();
        for (int i = 3; i >= 0; i--) {
            optimiseSelectorsAndHuffmanTables(i == 0);
        }
        assignHuffmanCodeSymbols();

        // Write out the tables and the block data encoded with them
        writeSelectorsAndHuffmanTables();
        writeBlockData();
    }
}