/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Aug 6, 2009
*/
package com.bigdata.btree.raba.codec;
import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.compression.CanonicalFast64CodeWordDecoder;
import it.unimi.dsi.compression.Coder;
import it.unimi.dsi.compression.Fast64CodeWordCoder;
import it.unimi.dsi.compression.HuffmanCodec;
import it.unimi.dsi.compression.PrefixCoder;
import it.unimi.dsi.compression.HuffmanCodec.DecoderInputs;
import it.unimi.dsi.fastutil.booleans.BooleanIterator;
import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.ReadOnlyKeysRaba;
import com.bigdata.btree.raba.ReadOnlyValuesRaba;
import com.bigdata.btree.raba.codec.CanonicalHuffmanRabaCoder.AbstractCodingSetup;
import com.bigdata.btree.raba.codec.CanonicalHuffmanRabaCoder.RabaCodingSetup;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
/**
* Test suite for the {@link CanonicalHuffmanRabaCoder}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class TestCanonicalHuffmanRabaCoder extends AbstractRabaCoderTestCase {
/**
 * Creates the test case without assigning it a name.
 */
public TestCanonicalHuffmanRabaCoder() {
    super();
}
/**
 * Creates the test case with the given name.
 *
 * @param name
 *            The name, which is handed to the superclass constructor.
 */
public TestCanonicalHuffmanRabaCoder(final String name) {
    super(name);
}
/**
 * Assigns {@link CanonicalHuffmanRabaCoder#INSTANCE} to the
 * <code>rabaCoder</code> field. That field is declared outside of this
 * file; presumably it is the coder exercised by the inherited tests.
 */
protected void setUp() throws Exception {
    this.rabaCoder = CanonicalHuffmanRabaCoder.INSTANCE;
}
/**
 * Renders a code book as text, one line per code word.
 * <p>
 * Each line reports the code word itself, its length in bits, and a long
 * value obtained by shifting the bits of the code word to the top of a
 * 64-bit word and then reversing the bit order of that word.
 *
 * @param codeWords
 *            The code words.
 *
 * @return A multi-line representation of the code book.
 */
protected static String printCodeBook(final BitVector[] codeWords) {
    final StringBuilder buf = new StringBuilder();
    for (int i = 0; i < codeWords.length; i++) {
        final BitVector codeWord = codeWords[i];
        final int nbits = codeWord.size();
        final long raw = codeWord.getLong(0, nbits);
        // left-align the bits in the word, then mirror the word.
        final long mirrored = Long.reverse(raw << (64 - nbits));
        buf.append("codeWord: ").append(codeWord);
        buf.append(", bitLength=").append(nbits);
        buf.append(", longValue=").append(mirrored);
        buf.append("\n");
    }
    return buf.toString();
}
/*
* Bootstrapping unit tests for various assumptions about the
* {@link HuffmanCodec} implementation class.
*/
/**
 * Round-trips a small, hand-picked frequency[] through
 * {@link #doRoundTripTest(int[])}.
 */
public void test_huffmanCodec01() {
    doRoundTripTest(new int[] { 1, 2, 3, 3, 4, 5 });
}
/**
 * Round-trip test for a frequency[] without any symbols. This test was
 * written to demonstrate a bug in {@link HuffmanCodec}, which has since
 * been fixed. The failure originally observed was:
 *
 * <pre>
 * java.lang.ArrayIndexOutOfBoundsException: -2
 *     at it.unimi.dsi.compression.CanonicalFast64CodeWordDecoder.&lt;init&gt;(CanonicalFast64CodeWordDecoder.java:62)
 *     at it.unimi.dsi.compression.HuffmanCodec.&lt;init&gt;(HuffmanCodec.java:107)
 *     at com.bigdata.btree.raba.codec.TestCanonicalHuffmanRabaCoder.doRoundTripTest(TestCanonicalHuffmanRabaCoder.java:166)
 *     at com.bigdata.btree.raba.codec.TestCanonicalHuffmanRabaCoder.test_huffmanCodec_noSymbols(TestCanonicalHuffmanRabaCoder.java:121)
 * </pre>
 */
public void test_huffmanCodec_noSymbols() {
    final int[] noSymbols = new int[0];
    doRoundTripTest(noSymbols);
}
/**
 * Round-trip test for a frequency[] with exactly one symbol. This test was
 * written to demonstrate a bug in {@link HuffmanCodec}, which has since
 * been fixed. The failure originally observed was:
 *
 * <pre>
 * java.lang.ArrayIndexOutOfBoundsException: -1
 *     at it.unimi.dsi.compression.CanonicalFast64CodeWordDecoder.&lt;init&gt;(CanonicalFast64CodeWordDecoder.java:89)
 *     at it.unimi.dsi.compression.HuffmanCodec.&lt;init&gt;(HuffmanCodec.java:107)
 *     at com.bigdata.btree.raba.codec.TestCanonicalHuffmanRabaCoder.doRoundTripTest(TestCanonicalHuffmanRabaCoder.java:166)
 *     at com.bigdata.btree.raba.codec.TestCanonicalHuffmanRabaCoder.test_huffmanCodec_oneSymbols(TestCanonicalHuffmanRabaCoder.java:132)
 * </pre>
 */
public void test_huffmanCodec_oneSymbols() {
    final int[] oneSymbol = { 1 };
    doRoundTripTest(oneSymbol);
}
/**
 * Stress test which round-trips many randomly generated frequency
 * distributions having between 2 and 256 distinct symbols. A symbol MAY
 * (rarely) be given a zero frequency. The zero-symbol and one-symbol cases
 * are not generated here; they are covered by
 * {@link #test_huffmanCodec_noSymbols()} and
 * {@link #test_huffmanCodec_oneSymbols()}.
 */
public void test_huffmanCodecStress() {
    final Random rnd = new Random();
    final int ntrials = 10000;
    for (int remaining = ntrials; remaining > 0; remaining--) {
        // #of distinct symbols in [2:256].
        final int nsymbols = rnd.nextInt(255) + 2;
        final int[] frequency = new int[nsymbols];
        for (int sym = 0; sym < nsymbols; sym++) {
            // a zero frequency is allowed, but rare.
            final boolean zeroFreq = rnd.nextFloat() < 0.001;
            frequency[sym] = zeroFreq ? 0 : rnd.nextInt(4000);
        }
        doRoundTripTest(frequency);
    }
}
/**
 * Verifies that a code book constructed from a given set of frequencies
 * may be reconstructed from the code word bit lengths and the correlated
 * symbol[] which the {@link HuffmanCodec} reports through its
 * {@link DecoderInputs}. Every symbol index is encoded by the codec and
 * must decode to itself using a decoder built from those two arrays.
 *
 * @param frequency
 *            The frequency of each symbol. The array index is the symbol.
 */
public void doRoundTripTest(final int[] frequency) {
    final DecoderInputs inputs = new DecoderInputs();
    final HuffmanCodec huffman = new HuffmanCodec(frequency, inputs);
    if (log.isDebugEnabled()) {
        final String codeBook = printCodeBook(huffman.codeWords());
        final String lengthText = Arrays.toString(inputs.getLengths());
        final String symbolText = Arrays.toString(inputs.getSymbols());
        log.debug(codeBook + "\nlength[]=" + lengthText + "\nsymbol[]="
                + symbolText);
    }
    // The decoder is rebuilt from nothing but the length[] and symbol[].
    final CanonicalFast64CodeWordDecoder decoder = new CanonicalFast64CodeWordDecoder(
            inputs.getLengths(), inputs.getSymbols());
    int symbol = 0;
    while (symbol < frequency.length) {
        final BooleanIterator bits = huffman.coder().encode(symbol);
        assertEquals(symbol, decoder.decode(bits));
        symbol++;
    }
}
/**
 * Stress test with 256 distinct symbols (corresponding to byte values in
 * the application). Roughly <code>percentZero</code> percent of the
 * symbols are given a zero frequency, which models the expected patterns
 * of B+Tree keys. Each generated frequency[] is handed to
 * {@link #doRecoderRoundTripTest(int[])}.
 *
 * @throws IOException
 */
public void test_huffmanRecoderStress() throws IOException {
    final int ntrials = 10000;
    // percentage of the symbols which are assigned a zero frequency.
    final int percentZero = 40;
    final Random r = new Random();
    for (int trial = 0; trial < ntrials; trial++) {
        final int[] frequency = new int[256];
        for (int i = 0; i < frequency.length; i++) {
            /*
             * Draw from [0:99] so the comparison really is a percentage.
             * An unbounded nextInt() is negative about half of the time,
             * which made ~50% of the symbols zero regardless of
             * percentZero.
             */
            if (r.nextInt(100) < percentZero) {
                frequency[i] = 0;
            } else {
                frequency[i] = r.nextInt(4000);
            }
        }
        doRecoderRoundTripTest(frequency);
    }
}
/**
 * Runs {@link #doRecoderRoundTripTest(int[])} against a small, fixed
 * symbol frequency distribution in which some symbols have a zero
 * frequency.
 *
 * @throws IOException
 */
public void test_huffmanRecoder01() throws IOException {
    doRecoderRoundTripTest(new int[] { 1, 0, 3, 5, 0, 0, 9 });
}
/**
 * Verify that the {@link Fast64CodeWordCoder} can be regenerated from the
 * codeWord[] alone. A random symbol sequence is coded with both the
 * codec's own coder and the regenerated coder, and the coded bytes must be
 * identical. The coded bytes are then decoded with a
 * {@link CanonicalFast64CodeWordDecoder} rebuilt from the bit length and
 * symbol arrays of the codec, and must yield the original sequence.
 *
 * @param frequency
 *            The frequency[] should include a reasonable proportion of
 *            symbols with a zero frequency in order to replicate the
 *            expected conditions when coding non-random data such as are
 *            found in the keys of a B+Tree.
 *
 * @throws IOException
 */
public void doRecoderRoundTripTest(final int[] frequency) throws IOException {
    final DecoderInputs inputs = new DecoderInputs();
    final HuffmanCodec huffman = new HuffmanCodec(frequency, inputs);
    // the codec's coder vs. a coder rebuilt from the code words only.
    final PrefixCoder expected = huffman.coder();
    final PrefixCoder actual = new Fast64CodeWordCoder(huffman.codeWords());
    if (log.isDebugEnabled()) {
        log.debug(printCodeBook(huffman.codeWords()));
    }

    /*
     * Generate a sequence of between 1 and nsymbols values, each of which
     * is any of the symbols in [0:nsymbols-1]. Note: [r] is not declared in
     * this file (presumably an inherited random number generator).
     */
    final int nsymbols = frequency.length;
    final int[] sequence = new int[r.nextInt(nsymbols) + 1];
    for (int i = 0; i < sequence.length; i++) {
        sequence[i] = r.nextInt(nsymbols);
    }

    /*
     * Code the sequence using both coders. The coded bytes should be the
     * same.
     */
    final FastByteArrayOutputStream expectedBytes = new FastByteArrayOutputStream();
    final FastByteArrayOutputStream actualBytes = new FastByteArrayOutputStream();
    final OutputBitStream expectedBits = new OutputBitStream(expectedBytes);
    final OutputBitStream actualBits = new OutputBitStream(actualBytes);
    for (final int symbol : sequence) {
        expected.encode(symbol, expectedBits);
        actual.encode(symbol, actualBits);
    }
    expectedBits.flush();
    actualBits.flush();
    final int cmp = BytesUtil.compareBytesWithLenAndOffset(0/* aoff */,
            expectedBytes.length, expectedBytes.array, 0/* boff */,
            actualBytes.length, actualBytes.array);
    assertEquals(0, cmp);
    // just the bytes which were actually written.
    final byte[] codedValue = Arrays.copyOf(actualBytes.array,
            actualBytes.length);

    /*
     * Decode using a decoder reconstructed from the bit length and symbol
     * arrays and verify that the original sequence is recovered.
     */
    final CanonicalFast64CodeWordDecoder decoder = new CanonicalFast64CodeWordDecoder(
            inputs.getLengths(), inputs.getSymbols());
    final InputBitStream ibs = new InputBitStream(codedValue);
    for (final int symbol : sequence) {
        assertEquals(symbol, decoder.decode(ibs));
    }
}
/**
 * Unit test for processing an empty {@link IRaba} representing B+Tree keys.
 * <p>
 * For an empty {@link IRaba}, {@link RabaCodingSetup} actually assigns
 * <code>null</code> for the {@link DecoderInputs} due to a bug in the
 * {@link HuffmanCodec} when nsymbols == 0. Therefore, this verifies that
 * the {@link Coder} and {@link DecoderInputs} are <code>null</code> and
 * that the symbol count is zero.
 *
 * @throws IOException
 */
public void test_emptyKeyRabaSetup() throws IOException {
    final IRaba emptyKeys = new ReadOnlyKeysRaba(new byte[0][]);
    final AbstractCodingSetup setup = new RabaCodingSetup(emptyKeys);
    assertEquals(0, setup.getSymbolCount());
    assertNull(setup.codec());
    assertNull(setup.decoderInputs());
    /*
     * Note: The decoder input and coder round-trip checks are not run
     * here since both the codec and the decoder inputs are null.
     */
}
/**
 * Unit test which sets up the coding of an {@link IRaba} representing
 * B+Tree keys and verifies that both the decoder inputs and the coder can
 * be re-created from that setup.
 *
 * @throws IOException
 */
public void test_keyRabaSetup() throws IOException {
    final byte[][] keys = {
            { 1, 2 },
            { 1, 2, 3 },
            { 1, 3 },
            { 1, 3, 1 },
            { 1, 3, 3 },
            { 1, 3, 7 },
            { 1, 5 },
            { 1, 6, 0 },
    };
    final IRaba raba = new ReadOnlyKeysRaba(keys);
    final AbstractCodingSetup setup = new RabaCodingSetup(raba);

    // verify that we can re-create the decoder.
    final int nsymbols = setup.getSymbolCount();
    doDecoderInputRoundTripTest(nsymbols, setup.decoderInputs());

    // verify that we can re-create the coder.
    final BitVector[] codeWords = setup.codec().codeWords();
    final BitVector shortest = setup.decoderInputs().getShortestCodeWord();
    final int[] lengths = setup.decoderInputs().getLengths();
    final int[] symbols = setup.decoderInputs().getSymbols();
    doCoderRoundTripTest(codeWords, shortest, lengths, symbols);
}
/**
 * Unit test which sets up the coding of an {@link IRaba} representing
 * B+Tree values and verifies that both the decoder inputs and the coder
 * can be re-created from that setup.
 *
 * @throws IOException
 *
 * @todo test w/ nulls.
 */
public void test_valueRabaSetup() throws IOException {
    final byte[][] values = {
            { 2, 3 },
            { 3, 5 },
            { 'm', 'i', 'k', 'e' },
    };
    final IRaba raba = new ReadOnlyValuesRaba(values);
    final RabaCodingSetup setup = new RabaCodingSetup(raba);

    // verify that we can re-create the decoder.
    final int nsymbols = setup.getSymbolCount();
    doDecoderInputRoundTripTest(nsymbols, setup.decoderInputs());

    // verify that we can re-create the coder.
    final BitVector[] codeWords = setup.codec().codeWords();
    final BitVector shortest = setup.decoderInputs().getShortestCodeWord();
    final int[] lengths = setup.decoderInputs().getLengths();
    final int[] symbols = setup.decoderInputs().getSymbols();
    doCoderRoundTripTest(codeWords, shortest, lengths, symbols);
}
/**
 * Unit test for processing an empty {@link IRaba} representing B+Tree
 * values.
 * <p>
 * For an empty {@link IRaba}, {@link RabaCodingSetup} actually assigns
 * <code>null</code> for the {@link DecoderInputs} due to a bug in the
 * {@link HuffmanCodec} when nsymbols == 0. Therefore, this verifies that
 * the {@link Coder} and {@link DecoderInputs} are <code>null</code> and
 * that the symbol count is zero.
 *
 * @throws IOException
 */
public void test_emptyValueRabaSetup() throws IOException {
    final IRaba emptyValues = new ReadOnlyValuesRaba(new byte[0][]);
    final RabaCodingSetup setup = new RabaCodingSetup(emptyValues);
    assertEquals(0, setup.getSymbolCount());
    assertNull(setup.codec());
    assertNull(setup.decoderInputs());
    /*
     * Note: The decoder input and coder round-trip checks are not run
     * here since both the codec and the decoder inputs are null.
     */
}
/**
 * Verify that we can round-trip the data required to reconstruct the
 * decoder. The {@link DecoderInputs} are written onto a bit stream, read
 * back, and the shortest code word, length[] and symbol[] which were read
 * are compared with the originals.
 *
 * @param nsymbols
 *            The symbol count, which is passed to
 *            <code>CanonicalHuffmanRabaCoder.readDecoderInputs(...)</code>.
 * @param decoderInputs
 *            The decoder inputs to be round-tripped.
 *
 * @throws IOException
 */
private void doDecoderInputRoundTripTest(final int nsymbols,
        final DecoderInputs decoderInputs) throws IOException {

    // Write the decoder inputs.
    final FastByteArrayOutputStream sink = new FastByteArrayOutputStream();
    final OutputBitStream obs = new OutputBitStream(sink);
    StringBuilder writeTrace = null;
    if (CanonicalHuffmanRabaCoder.log.isDebugEnabled()) {
        writeTrace = new StringBuilder();
    }
    CanonicalHuffmanRabaCoder.writeDecoderInputs(decoderInputs, obs,
            writeTrace);
    if (writeTrace != null) {
        CanonicalHuffmanRabaCoder.log.debug(writeTrace.toString());
    }
    obs.flush();
    obs.close();
    // just the bytes written.
    final byte[] in = Arrays.copyOf(sink.array, sink.length);

    // Read the decoder inputs back.
    final InputBitStream ibs = new InputBitStream(in);
    StringBuilder readTrace = null;
    if (CanonicalHuffmanRabaCoder.log.isDebugEnabled()) {
        readTrace = new StringBuilder();
    }
    final DecoderInputs actualInputs = CanonicalHuffmanRabaCoder
            .readDecoderInputs(nsymbols, ibs, readTrace);
    if (readTrace != null) {
        CanonicalHuffmanRabaCoder.log.debug(readTrace.toString());
    }

    // Compare.
    assertEquals("shortestCodeWord",
            decoderInputs.getShortestCodeWord(),
            actualInputs.getShortestCodeWord());
    assertEquals("length[]",
            decoderInputs.getLengths(),
            actualInputs.getLengths());
    assertEquals("symbol[]",
            decoderInputs.getSymbols(),
            actualInputs.getSymbols());
}
/**
 * Verify that the coder can be re-created using
 * {@link HuffmanCodec#newCoder} and that the re-created coder reports the
 * expected code words.
 *
 * @param expected
 *            The code words which the re-created coder must report.
 * @param shortestCodeWord
 *            The shortest code word (passed to <code>newCoder</code>).
 * @param length
 *            The code word length[] (passed to <code>newCoder</code>).
 * @param symbol
 *            The symbol[] (passed to <code>newCoder</code>).
 */
private void doCoderRoundTripTest(final BitVector[] expected,
        final BitVector shortestCodeWord, final int[] length,
        final int[] symbol) {
    final PrefixCoder newCoder = HuffmanCodec.newCoder(shortestCodeWord,
            length, symbol);
    final BitVector[] actual = newCoder.codeWords();
    /*
     * Log before asserting so that the expected and actual code words are
     * available in the debug output when the comparison fails.
     */
    if (log.isDebugEnabled()) {
        log.debug("\nexpected: " + Arrays.toString(expected)
                + "\nactual : " + Arrays.toString(actual));
    }
    assertEquals("codeWord[]", expected, actual);
}
/**
 * A stress test for compatibility with {@link InputBitStream}. An array is
 * filled with random bits and the behavior of {@link InputBitStream} and
 * {@link BytesUtil#getBits(byte[], int, int)} is compared on a number of
 * randomly selected bit slices.
 *
 * TODO Could be a performance comparison.
 *
 * @throws IOException
 */
public void test_stress_InputBitStream_compatible() throws IOException {
    final Random rnd = new Random();
    // #of random bit slices which are compared.
    final int ntrials = 1000;
    // Note: length is guaranteed to be LT int32 bits so [int] index is Ok.
    final int nbytes = rnd.nextInt(Bytes.kilobyte32 * 8) + 1;
    final int nbits = nbytes << 3;
    // Fill array with random data.
    final byte[] data = new byte[nbytes];
    rnd.nextBytes(data);
    // wrap with InputBitStream.
    final InputBitStream ibs = new InputBitStream(data);
    for (int trial = 0; trial < ntrials; trial++) {
        /*
         * Start of the bit slice.
         *
         * Note: The max(x,1) was added after one CI run failed here with
         * "java.lang.IllegalArgumentException: n must be positive" thrown
         * by Random.nextInt(int).
         */
        final int off = rnd.nextInt(Math.max(nbits - 32, 1));
        // allow any slice of between 1 and 32 bits length, but never more
        // bits than remain after the start of the slice.
        final int len = rnd.nextInt(Math.min(32, nbits - off)) + 1;
        assert len >= 1 && len <= 32;
        // position the stream.
        ibs.position(off);
        final int fromStream = ibs.readInt(len);
        final int fromBytesUtil = BytesUtil.getBits(data, off, len);
        if (fromStream == fromBytesUtil) {
            continue;
        }
        fail("Expected=" + fromStream + ", actual=" + fromBytesUtil
                + ", trial=" + trial + ", bitSlice(off=" + off + ", len="
                + len + ")" + ", arrayLen=" + data.length);
    }
}
/**
 * Confirms that {@link InputBitStream} and {@link BytesUtil} extract the
 * same, known values for a handful of fixed bit slices taken from a buffer
 * filled with the alternating bit pattern <code>1010...</code>.
 * <p>
 * Note: <code>assertEquals</code> is used rather than
 * <code>assertTrue(a == b)</code> so that a failure reports both the
 * expected and the actual value.
 *
 * @throws IOException
 */
public void test_confirm_InputBitStream_compatible() throws IOException {
    final byte[] tbuf = new byte[] {
            (byte) 0xAA,
            (byte) 0xAA,
            (byte) 0xAA,
            (byte) 0xAA,
            (byte) 0xAA,
            (byte) 0xAA,
            (byte) 0xAA,
            (byte) 0xAA
    };
    // wrap with InputBitStream.
    final InputBitStream ibs = new InputBitStream(tbuf);
    // 1010
    assertEquals(0xA, compare(ibs, tbuf, 0, 4));
    // 1010 1010
    assertEquals(0xAA, compare(ibs, tbuf, 0, 8));
    // 0101
    assertEquals(0x5, compare(ibs, tbuf, 1, 4));
    // 01 0101
    assertEquals(0x15, compare(ibs, tbuf, 1, 6));
    // 32 bits starting on a 1 bit, then 32 bits starting on a 0 bit.
    assertEquals(0xAAAAAAAA, compare(ibs, tbuf, 0, 32));
    assertEquals(0x55555555, compare(ibs, tbuf, 1, 32));
    // Now try some 64bit comparisons
    assertEquals(0xAAAAAAAAAAAAL, compare64(ibs, tbuf, 0, 48));
    assertEquals(0x555555555555L, compare64(ibs, tbuf, 1, 48));
}
/**
 * Reads a bit slice of up to 32 bits using both the {@link InputBitStream}
 * and {@link BytesUtil#getBits(byte[], int, int)} and asserts that the two
 * values agree. The failure message identifies the bit slice and reports
 * both values.
 *
 * @param ibs
 *            The bit stream, which is repositioned to <i>offset</i>.
 * @param buf
 *            The byte[] handed to {@link BytesUtil}.
 * @param offset
 *            The bit offset of the start of the slice.
 * @param bits
 *            The #of bits in the slice.
 *
 * @return The value read from the {@link InputBitStream}.
 *
 * @throws IOException
 */
int compare(final InputBitStream ibs, final byte[] buf, final int offset,
        final int bits) throws IOException {
    ibs.position(offset);
    final int v1 = ibs.readInt(bits);
    final int v2 = BytesUtil.getBits(buf, offset, bits);
    assertEquals("bitSlice(off=" + offset + ", len=" + bits + ")", v1, v2);
    return v1;
}
/**
 * Reads a bit slice as a long using both the {@link InputBitStream} and
 * <code>BytesUtil.getBits64(...)</code> and asserts that the two values
 * agree. The failure message identifies the bit slice and reports both
 * values.
 *
 * @param ibs
 *            The bit stream, which is repositioned to <i>offset</i>.
 * @param buf
 *            The byte[] handed to {@link BytesUtil}.
 * @param offset
 *            The bit offset of the start of the slice.
 * @param bits
 *            The #of bits in the slice.
 *
 * @return The value read from the {@link InputBitStream}.
 *
 * @throws IOException
 */
long compare64(final InputBitStream ibs, final byte[] buf, final int offset,
        final int bits) throws IOException {
    ibs.position(offset);
    final long v1 = ibs.readLong(bits);
    final long v2 = BytesUtil.getBits64(buf, offset, bits);
    assertEquals("bitSlice(off=" + offset + ", len=" + bits + ")", v1, v2);
    return v1;
}
}