HuffmanCodec.java example

Explorer
blazegraph-master
- database-master
package it.unimi.dsi.compression;

/*		 
 * DSI utilities
 *
 * Copyright (C) 2005-2009 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.LongArrayBitVector;

import java.io.Serializable;
import java.util.Arrays;

import cern.colt.Sorting;
import cern.colt.function.IntComparator;

/** An implementation of Huffman optimal prefix-free coding.
 * 
 * <p>A Huffman coder is built starting from an array of frequencies corresponding to each
 * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code.
 * 
 * <p>Instances of this class compute a <em>canonical</em> Huffman code 
 * (Eugene S. Schwartz and Bruce Kallick, “Generating a Canonical Prefix Encoding”, <i>Commun. ACM</i> 7(3), pages 166−169, 1964), which can 
 * by {@linkplain CanonicalFast64CodeWordDecoder quickly decoded using table lookups}. 
 * The construction uses the most efficient one-pass in-place codelength computation procedure
 * described by Alistair Moffat and Jyrki Katajainen in “In-Place Calculation of Minimum-Redundancy Codes”,
 * <i>Algorithms and Data Structures, 4th International Workshop</i>, 
 * number 955 in Lecture Notes in Computer Science, pages 393−402, Springer-Verlag, 1995.
 * 
 * <p>We note by passing that this coded uses a {@link CanonicalFast64CodeWordDecoder}, which does not support codelengths above 64.
 * However, since the worst case for codelengths is given by Fibonacci numbers, and frequencies are to be provided as integers,
 * no codeword longer than the base-[(5<sup>1/2</sup> + 1)/2] logarithm of 5<sup>1/2</sup> · 2<sup>31</sup> (less than 47) will ever be generated. 
 * <p>
 * <h3>Modifications</h3>
 * <ol><li>
 * This class has been modified to define an alternative ctor which exposes the
 * symbol[] in correlated order with the codeWord bitLength[] and the shortest
 * code word in the generated canonical.</li>
 * <li>
 * A method has been added to recreate the {@link PrefixCoder} from the 
 * shortest code word, the code word length[], and the symbol[].
 * </li>
 * </ol>
 * */

public class HuffmanCodec implements PrefixCodec, Serializable {
    
	private static final boolean DEBUG = false;
	private static final boolean ASSERTS = false;
	private static final long serialVersionUID = 2L;

	/** The number of symbols of this coder. */
	public final int size;
	/** The codewords for this coder. */
	private final BitVector[] codeWord;
	/** A cached singleton instance of the coder of this codec. */
	private final Fast64CodeWordCoder coder;
	/** A cached singleton instance of the decoder of this codec. */
	private final CanonicalFast64CodeWordDecoder decoder;

    /**
     * Class encapsulates the data necessary to reconstruct a
     * {@link CanonicalFast64CodeWordDecoder} or recreate the code.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id: HuffmanCodec.java 2265 2009-10-26 12:51:06Z thompsonbry $
     */
	public static class DecoderInputs {
	    
	    private BitVector shortestCodeWord;
	    private int symbol[];
	    private int length[];

        /**
         * Ctor may be passed to {@link HuffmanCodec} to obtain the assigned
         * length[] and symbol[] data and the shortest code word.  
         */
        public DecoderInputs() {
            
        }

        /**
         * Ctor may be used to explicitly populate an instance with the caller's
         * data.
         * 
         * @param shortestCodeWord
         * @param length
         * @param symbol
         */
        public DecoderInputs(final BitVector shortestCodeWord,
                final int[] length, final int[] symbol) {
            assert shortestCodeWord!=null;
            assert length!=null;
            assert symbol!=null;
            assert length.length==symbol.length;
            assert shortestCodeWord.size()==length[0];
            this.shortestCodeWord = shortestCodeWord;
            this.length = length;
            this.symbol = symbol;
	    }

        /**
         * The shortest code word. Note that canonical huffman codes can be
         * recreated from just length[0] and the shortest code word.
         */
        public BitVector getShortestCodeWord() {
            return shortestCodeWord;
        }
        
	    /**
	     * Return the symbol[] in the permuted order used to construct the
	     * {@link CanonicalFast64CodeWordDecoder}. This information is
	     * <em>transient</em>.
	     */
	    public int[] getSymbols() {
	        return symbol;
	    }

	    /**
	     * Return the codeWord bit lengths in the non-decreasing order used to
	     * construct the {@link CanonicalFast64CodeWordDecoder}. This information is
	     * <em>transient</em>.
	     */
	    public int[] getLengths() {
	        return length;
	    }

	}
	
    /** Creates a new Huffman codec using the given vector of frequencies.
     * 
     * @param frequency a vector of nonnnegative frequencies.
     */
    public HuffmanCodec( final int[] frequency ) {
    
        this(frequency, new DecoderInputs());
        
    }

    /**
     * Creates a new Huffman codec using the given vector of frequencies.
     * 
     * @param frequency
     *            a vector of non-negative frequencies.
     * @param decoderInputs
     *            The inputs necessary to reconstruct a
     *            {@link CanonicalFast64CodeWordDecoder} will be set on this
     *            object.
     */
	public HuffmanCodec( final int[] frequency, final DecoderInputs decoderInputs ) {
	    if(decoderInputs==null)
	        throw new IllegalArgumentException();
		size = frequency.length;
		
		if ( size == 0 || size == 1 ) {
			codeWord = new BitVector[ size ];
			if ( size == 1 ) codeWord[ 0 ] = LongArrayBitVector.getInstance();
			coder = new Fast64CodeWordCoder( codeWord, new long[ size ] );
			// Modified BBT 8/11/2009
//            decoder = new CanonicalFast64CodeWordDecoder( new int[ size ], new int[ size ] );
			decoderInputs.shortestCodeWord = LongArrayBitVector.getInstance().length( 0 );
			decoderInputs.length = new int[size];
			decoderInputs.symbol = new int[size];
			decoder = new CanonicalFast64CodeWordDecoder( decoderInputs.length, decoderInputs.symbol );
			return;
        }
        
        final long[] a = new long[ size ];
        for( int i = size; i-- != 0; ) a[ i ] = frequency[ i ];
        // Sort frequencies (this is the only n log n step).
        Arrays.sort( a );
        
        // The following lines are from Moffat & Katajainen sample code. Please refer to their paper.
        
        // First pass, left to right, setting parent pointers.
		a[ 0 ] += a[ 1 ];
		int root = 0;
		int leaf = 2;
		for ( int next = 1; next < size - 1; next++ ) {
			// Select first item for a pairing.
			if ( leaf >= size || a[ root ] < a[ leaf ] ) {
				a[ next ] = a[ root ];
				a[ root++ ] = next;
			}
			else a[ next ] = a[ leaf++ ];

			// Add on the second item.
			if ( leaf >= size || ( root < next && a[ root ] < a[ leaf ] ) ) {
				a[ next ] += a[ root ];
				a[ root++ ] = next;
			}
			else a[ next ] += a[ leaf++ ];
		}

		// Second pass, right to left, setting internal depths.
		a[ size - 2 ] = 0;
		for ( int next = size - 3; next >= 0; next-- ) a[ next ] = a[ (int)a[ next ] ] + 1;

		// Third pass, right to left, setting leaf depths.
		int available = 1, used = 0, depth = 0;
		root = size - 2;
		int next = size - 1;
		while ( available > 0 ) {
			while ( root >= 0 && a[ root ] == depth ) {
				used++;
				root--;
			}
			while ( available > used ) {
				a[ next-- ] = depth;
				available--;
			}
			available = 2 * used;
			depth++;
			used = 0;
		}
		
		// Reverse the order of symbol lengths, and store them into an int array.
		final int[] length = new int[ size ];
		for( int i = size; i-- != 0; ) length[ i ] = (int)a[ size - 1 - i ];

		// Sort symbols indices by decreasing frequencies (so symbols correspond to lengths).
		final int[] symbol = new int[ size ];
		for( int i = size; i-- != 0; ) symbol[ i ] = i;
		Sorting.quickSort( symbol, 0, size, new IntComparator() {
			public int compare( int x, int y ) {
				return frequency[ y ] - frequency[ x ];
			}
		});
		
		// Assign codewords (just for the coder--the decoder needs just the lengths).
		int s = symbol[ 0 ];
		int l = length[ 0 ];
		long value = 0;
		BitVector v;
		codeWord = new BitVector[ size ];
		final long[] longCodeWord = new long[ size ];
		codeWord[ s ] = LongArrayBitVector.getInstance().length( l );
		
		for( int i = 1; i < size; i++ ) {
			s = symbol[ i ];
			if ( length[ i ] == l ) value++;
			else {
				value++;
				value <<= length[ i ] - l;
				if ( ASSERTS ) assert length[ i ] > l;
				l = length[ i ];
			}
			v = LongArrayBitVector.getInstance().length( l );
			for( int j = l; j-- != 0; ) if ( ( 1L << j & value ) != 0 ) v.set( l - 1 - j );
			codeWord[ s ] = v;
			longCodeWord[ s ] = value;
		}
		
		coder = new Fast64CodeWordCoder( codeWord, longCodeWord );
		// Modified BBT 8/11/2009
//        decoder = new CanonicalFast64CodeWordDecoder( length, symbol );
        decoderInputs.shortestCodeWord = codeWord[symbol[0]];
		decoderInputs.length = length;
        decoderInputs.symbol = symbol;
        assert decoderInputs.shortestCodeWord.size() == length[0] : "shortestCodeWord="
                + decoderInputs.shortestCodeWord
                + ", but length[0]="
                + length[0]; 
        decoder = new CanonicalFast64CodeWordDecoder( decoderInputs.length, decoderInputs.symbol);
		
		if ( DEBUG ) {
			final BitVector[] codeWord = codeWords();
			System.err.println( "Codes: " );
			for( int i = 0; i < size; i++ ) 
				System.err.println( i + " (" + codeWord[ i ].size() + " bits): " + codeWord[ i ] );
	
			long totFreq = 0;
			for( int i = size; i-- != 0; ) totFreq += frequency[ i ];
			long totBits = 0;
			for( int i = size; i-- != 0; ) totBits += frequency[ i ] * codeWord[ i ].size();
			System.err.println( "Compression: " + totBits + " / " + totFreq * Character.SIZE + " = " + (double)totBits/(totFreq * Character.SIZE) );
		}
}

	public CodeWordCoder coder() {
		return coder;
	}
	
	public Decoder decoder() {
		return decoder;
	}
	
	public int size() {
		return size;
	}

	public BitVector[] codeWords() {
		return coder.codeWords();
	}

    /**
     * (Re-)constructs the canonical huffman code from the shortest code word,
     * the non-decreasing bit lengths of each code word, and the permutation of
     * the symbols corresponding to those bit lengths. This information is
     * necessary and sufficient to reconstruct a canonical huffman code.
     * 
     * @param decoderInputs
     *            This contains the necessary and sufficient information to
     *            recreate the {@link PrefixCoder}.
     * 
     * @return A new {@link PrefixCoder} instance for the corresponding
     *         canonical huffman code.
     */
    static public PrefixCoder newCoder(final DecoderInputs decoderInputs) {

        return newCoder(decoderInputs.getShortestCodeWord(), decoderInputs
                .getLengths(), decoderInputs.getSymbols());

	}

    /**
     * (Re-)constructs the canonical huffman code from the shortest code word,
     * the non-decreasing bit lengths of each code word, and the permutation of
     * the symbols corresponding to those bit lengths. This information is
     * necessary and sufficient to reconstruct a canonical huffman code.
     * 
     * @param shortestCodeWord
     *            The code word with the shortest bit length.
     * @param length
     *            The bit length of each code word in the non-decreasing order
     *            assigned when the code was generated. The length of this array
     *            is the #of symbols in the code.
     * @param symbol
     *            The permutation of the symbols in the assigned when the
     *            canonical huffman code was generated. The length of this array
     *            is the #of symbols in the code.
     * 
     * @return A new {@link PrefixCoder} instance for the corresponding
     *         canonical huffman code.
     * 
     * @see DecoderInputs
     */
    static public PrefixCoder newCoder(final BitVector shortestCodeWord,
            final int[] length, final int[] symbol) {

        if (shortestCodeWord == null)
            throw new IllegalArgumentException();
        if (shortestCodeWord.size() == 0)
            throw new IllegalArgumentException();
        if (length == null)
            throw new IllegalArgumentException();
        if (length.length == 0)
            throw new IllegalArgumentException();
        if (symbol == null)
            throw new IllegalArgumentException();
        if (symbol.length == 0)
            throw new IllegalArgumentException();

        final int size = length.length;
        int s = symbol[ 0 ];
        int l = length[ 0 ];
        long value = 0;
        BitVector v;
        final BitVector[] codeWord = new BitVector[ size ];
        final long[] longCodeWord = new long[ size ];
        codeWord[ s ] = LongArrayBitVector.getInstance().length( l );
        
        for( int i = 1; i < size; i++ ) {
            s = symbol[ i ];
            if ( length[ i ] == l ) value++;
            else {
                value++;
                value <<= length[ i ] - l;
                if ( ASSERTS ) assert length[ i ] > l;
                l = length[ i ];
            }
            v = LongArrayBitVector.getInstance().length( l );
            for( int j = l; j-- != 0; ) if ( ( 1L << j & value ) != 0 ) v.set( l - 1 - j );
            codeWord[ s ] = v;
            longCodeWord[ s ] = value;
        }

        return new Fast64CodeWordCoder(codeWord, longCodeWord);

    }
    
}