package com.yahoo.glimmer.util; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.Map; import java.util.TreeMap; import org.apache.hadoop.io.compress.SplittableCompressionCodec; import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream; import org.junit.Test; public class Bz2BlockIndexedOutputStreamTest { private CBZip2InputStream cbZip2InputStream; private Map<Long, BlockInfo> firstRecordToBlockMap = new TreeMap<Long, BlockInfo>(); private Long firstRecordInBlock; @Test public void test() throws IOException { File dataFile = File.createTempFile("temp", ".bz2"); dataFile.deleteOnExit(); BitSequenceMonitor.Callback callback = new BitSequenceMonitor.Callback() { private int blockIndex = -1; private long startOffset = 4; private BlockInfo lastBlockInfo; private long compressTime; @Override public void sequenceStart(long byteOffset, int bitInByte) { if (blockIndex >= 0) { // End last block compressTime = System.currentTimeMillis() - compressTime; System.out.println("blockEnd(" + blockIndex + ", " + startOffset + ", " + byteOffset + ") first:" + firstRecordInBlock + " t:" + compressTime); if (lastBlockInfo != null) { lastBlockInfo.end = byteOffset; } } blockIndex++; startOffset = byteOffset; System.out.println("blockStart(" + blockIndex + ", " + startOffset + ") first:" + firstRecordInBlock); if (firstRecordInBlock != null) { lastBlockInfo = new BlockInfo(blockIndex, startOffset, 0); firstRecordToBlockMap.put(firstRecordInBlock, lastBlockInfo); firstRecordInBlock = null; } compressTime = System.currentTimeMillis(); } @Override public void close(long byteOffset) { lastBlockInfo.end = byteOffset; } }; Bz2BlockIndexedOutputStream compressedDataOut = Bz2BlockIndexedOutputStream.newInstance(new FileOutputStream(dataFile), 1, callback); for (long l = 100000000; l < 100200000; l++) { if (firstRecordInBlock == null) { firstRecordInBlock = l; } compressedDataOut.write(Long.toString(l).getBytes("ASCII")); compressedDataOut.write('\n'); } compressedDataOut.flush(); compressedDataOut.close(); long lastRangeEnd = 4; for (long firstRecord : firstRecordToBlockMap.keySet()) { System.out.print("Testing"); BlockInfo blockInfo = firstRecordToBlockMap.get(firstRecord); FileInputStream dataIn = new FileInputStream(dataFile); assertEquals(blockInfo.start, dataIn.skip(blockInfo.start)); cbZip2InputStream = new CBZip2InputStream(dataIn, SplittableCompressionCodec.READ_MODE.BYBLOCK); BufferedReader reader = new BufferedReader(new InputStreamReader(cbZip2InputStream, "ASCII")); long uncompressTime = System.currentTimeMillis(); int uncompressedByteCount = 0; String s = reader.readLine(); if (s.length() != 9) { s = reader.readLine(); } System.out.print("\texpected:" + firstRecord); System.out.print("\tfirst:" + s); long firstLong = Long.parseLong(s); // the first or second record in the block should match the key. assertTrue(firstLong == firstRecord || firstLong + 1 == firstRecord); String lastS = null; while (s != null) { lastS = s; uncompressedByteCount += s.length() + 1; s = reader.readLine(); } uncompressTime = System.currentTimeMillis() - uncompressTime; // The un-compressor seems to ignore some of the block header and // read following blocks too.. // The uncompressed contents of a block shouldn't be much more than // the blockSize100K used when creating the compressor. // System.out.println("\tlast:" + lastS + "\tuncompressedByteCount:" + uncompressedByteCount + "\tblockRange:" + blockInfo + " t:" + uncompressTime); assertEquals(lastRangeEnd, blockInfo.start); lastRangeEnd = blockInfo.end; } assertEquals(21, firstRecordToBlockMap.size()); assertEquals(dataFile.length(), lastRangeEnd); } private static class BlockInfo { final int index; final long start; long end; public BlockInfo(int index, long start, long end) { this.index = index; this.start = start; this.end = end; } @Override public String toString() { return Integer.toString(index) + ':' + start + ',' + end; } } }