package com.yahoo.glimmer.util;

import it.unimi.di.big.mg4j.document.AbstractDocumentCollection;
import it.unimi.di.big.mg4j.document.Document;
import it.unimi.di.big.mg4j.document.DocumentCollection;
import it.unimi.di.big.mg4j.document.DocumentFactory;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.ByteBufferInputStream;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.itadaki.bzip2.BZip2BitInputStream;
import org.itadaki.bzip2.BZip2BlockDecompressor;
import org.itadaki.bzip2.BZip2Constants;

/**
 * A DocumentCollection using a bzip2 file and a list of 'first docId in block' to block offsets.
 *
 * Would be nice to make retrieval of docs that are not in the collection more efficient.
 *
 * @author tep
 */
public class BlockCompressedDocumentCollection extends AbstractDocumentCollection implements Serializable {
    private final static Logger LOGGER = Logger.getLogger(BlockCompressedDocumentCollection.class);
    private static final long serialVersionUID = -7943857364950329249L;

    private static final byte[] ZERO_BYTE_BUFFER = new byte[0];

    public static final String COMPRESSED_FILE_EXTENSION = ".bz2";
    public static final String BLOCK_OFFSETS_EXTENSION = ".blockOffsets";

    private final String name;
    private final DocumentFactory documentFactory;
    private BlockOffsets blockOffsets;
    private FileInputStream bz2InputStream;
    private FileChannel bz2FileChannel;
    private int uncompressedBlockSize;
    private BlockCache blockCache;

    public BlockCompressedDocumentCollection(String name, DocumentFactory documentFactory, final int cacheSize) {
        this.name = new File(name).getName();
        this.documentFactory = documentFactory;
    }

    @Override
    public void filename(CharSequence absolutePathToAFileInTheCollection) throws IOException {
        File absolutePathToCollection = new File(absolutePathToAFileInTheCollection.toString()).getParentFile();

        File bz2File = new File(absolutePathToCollection, name + COMPRESSED_FILE_EXTENSION);
        bz2InputStream = new FileInputStream(bz2File);
        if (bz2InputStream.read() != 'B' || bz2InputStream.read() != 'Z' || bz2InputStream.read() != 'h') {
            bz2InputStream.close();
            throw new IllegalArgumentException(bz2File.getAbsolutePath() + " doesn't have a bzip2 header!");
        }
        uncompressedBlockSize = bz2InputStream.read() - '0';
        if (uncompressedBlockSize < 0 || uncompressedBlockSize > 9) {
            bz2InputStream.close();
            throw new IllegalArgumentException(bz2File.getAbsolutePath() + " has an invalid block size byte.");
        }
        uncompressedBlockSize *= 100 * 1024; // This is weird. The uncompressed blocks can be bigger than multiples of 100000.
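        // The compressed file is accessed through its FileChannel so that individual blocks can be
        // memory-mapped on demand; the companion .blockOffsets file holds the serialized BlockOffsets
        // object used to map docIds to blocks and blocks to bit offsets within the compressed file.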
        FileChannel bz2FileChannel = bz2InputStream.getChannel();

        File blockOffsetsFile = new File(absolutePathToCollection, name + BLOCK_OFFSETS_EXTENSION);
        InputStream blockOffsetsInputStream = new FileInputStream(blockOffsetsFile);

        init(bz2FileChannel, blockOffsetsInputStream, uncompressedBlockSize);

        blockOffsetsInputStream.close();
    }

    public void init(FileChannel bz2FileChannel, InputStream blockOffsetsInputStream, int uncompressedBlockSize) throws IOException {
        this.bz2FileChannel = bz2FileChannel;

        DataInputStream blockOffsetsDataInput = new DataInputStream(blockOffsetsInputStream);
        try {
            blockOffsets = (BlockOffsets) BinIO.loadObject(blockOffsetsDataInput);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("BinIO.loadObject() threw:" + e);
        }

        this.uncompressedBlockSize = uncompressedBlockSize;

        blockCache = new BlockCache(new BlockCache.BlockReader() {
            @Override
            public int readBlock(long blockIndex, byte[] buffer) throws IOException {
                return uncompressBlock(blockIndex, buffer);
            }
        }, blockOffsets.getBlockCount() - 1, uncompressedBlockSize, 1024);
    }

    @Override
    public long size() {
        return blockOffsets.getLastDocId();
    }

    @Override
    public Document document(long index) throws IOException {
        InputStream stream = stream(index);
        Reference2ObjectMap<Enum<?>, Object> metadata = getMetadata(stream);
        return documentFactory.getDocument(stream, metadata);
    }

    @Override
    public InputStream stream(long index) throws IOException {
        InputStream inputStream;
        long time = System.currentTimeMillis();
        try {
            inputStream = getInputStreamStartingAtDocStart(index);
        } catch (IOException e) {
            LOGGER.error("Got IOException getting doc at index:" + index);
            throw e;
        }
        time = System.currentTimeMillis() - time;
        LOGGER.debug("stream(" + index + ") took " + time + "ms.");

        if (inputStream == null) {
            inputStream = new ByteArrayInputStream(ZERO_BYTE_BUFFER);
        }
        return inputStream;
    }

    @Override
    public Reference2ObjectMap<Enum<?>, Object> metadata(long index) throws IOException {
        return getMetadata(stream(index));
    }

    private Reference2ObjectMap<Enum<?>, Object> getMetadata(InputStream stream) throws IOException {
        Reference2ObjectOpenHashMap<Enum<?>, Object> metadata = new Reference2ObjectOpenHashMap<Enum<?>, Object>();
        // TODO Why is this not picked up from the factory's metadata?
        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
        return metadata;
    }

    @Override
    public DocumentCollection copy() {
        throw new UnsupportedOperationException();
    }

    @Override
    public DocumentFactory factory() {
        return documentFactory;
    }

    private InputStream getInputStreamStartingAtDocStart(long requiredDocId) throws IOException {
        if (requiredDocId > blockOffsets.getLastDocId()) {
            return null;
        }

        final long blockIndex = blockOffsets.getBlockIndex(requiredDocId);
        if (blockIndex == -1 || blockIndex >= blockOffsets.getBlockCount()) {
            return null;
        }

        int[] recordStartOffset = { -1 };
        long currentDocId = getNextDocId(blockIndex, recordStartOffset);
        if (currentDocId > requiredDocId) {
            // The first doc id in the block is bigger than the requiredDocId.
            LOGGER.warn("The first doc id(" + currentDocId + ") in the block(" + blockIndex + ") is bigger than the requiredDocId(" + requiredDocId
                    + "). This may be an error in the bySubject.blockOffsets file.");
            return null;
        }
        while (currentDocId != -1 && currentDocId < requiredDocId) {
            currentDocId = getNextDocId(blockIndex, recordStartOffset);
        }
        if (currentDocId == requiredDocId) {
            // Found.
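            // Serve the document from the block cache, starting at the byte offset where its record begins.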
            return blockCache.getInputStream(blockIndex, recordStartOffset[0]);
        }
        if (currentDocId == -1 && recordStartOffset[0] == -1) {
            // There are no docIds in this block.
            LOGGER.warn("There are no docIds in this block(" + blockIndex + "). RequiredDocId(" + requiredDocId
                    + "). This may be an error in the bySubject.blockOffsets file.");
        }
        // Not found.
        return null;
    }

    private int uncompressBlock(long blockIndex, byte[] uncompressedBuffer) throws IOException {
        final long blockStartBitOffset = blockOffsets.getBlockStartBitOffset(blockIndex);
        final long blockEndBitOffset = blockOffsets.getBlockStartBitOffset(blockIndex + 1);

        final long blockStartByteOffset = blockStartBitOffset / 8;
        final int blockStartSkipBits = (int) (blockStartBitOffset % 8);
        final long blockEndByteOffset = blockEndBitOffset / 8;

        final MappedByteBuffer blockMappedByteBuffer = bz2FileChannel.map(MapMode.READ_ONLY, blockStartByteOffset,
                (blockEndByteOffset - blockStartByteOffset) + 1);
        final ByteBufferInputStream blockInputStream = new ByteBufferInputStream(blockMappedByteBuffer);
        final BZip2BitInputStream blockBitInputStream = new BZip2BitInputStream(blockInputStream);

        // Block boundaries are bit offsets, so skip the bits between the mapped byte boundary and the block start.
        blockBitInputStream.readBits(blockStartSkipBits);

        /* Read block-header or end-of-stream marker */
        final int marker1 = blockBitInputStream.readBits(24);
        final int marker2 = blockBitInputStream.readBits(24);

        if (marker1 == BZip2Constants.BLOCK_HEADER_MARKER_1 && marker2 == BZip2Constants.BLOCK_HEADER_MARKER_2) {
            // System.err.println("Decompressing block " + blockIndex + " S:" + blockStartBitOffset + " E:" + blockEndBitOffset);
            // System.err.flush();
            final BZip2BlockDecompressor blockDecompressor = new BZip2BlockDecompressor(blockBitInputStream, uncompressedBlockSize);
            return blockDecompressor.read(uncompressedBuffer, 0, uncompressedBlockSize);
        } else if (marker1 == BZip2Constants.STREAM_END_MARKER_1 && marker2 == BZip2Constants.STREAM_END_MARKER_2) {
            throw new IllegalArgumentException("End of BZip2 stream marker at bit " + blockStartBitOffset);
        } else {
            throw new IllegalStateException("Not a BZip2 block header at bit " + blockStartBitOffset);
        }
    }

    /**
     * @param blockIndex
     *            The index of the block we are looking for the next DocId in.
     * @param startAtByteIndex
     *            The current byte index in the block.
     * @return The next docId, or -1 if there are no more doc starts.
     * @throws IllegalStateException
     *             On corrupt record starts.
     * @throws IOException
     *             On failing to read blocks.
     */
    private long getNextDocId(long blockIndex, int[] startAtByteIndex) throws IllegalStateException, IOException {
        BlockCache.Block block = blockCache.getBlock(blockIndex);

        int recordDelimiterIndex = startAtByteIndex[0];
        byte[] blockBytes = block.getBytes();
        int blockLength = block.getLength();

        if (blockIndex != 0 || recordDelimiterIndex != -1) { // First record in first block check.
            if (recordDelimiterIndex == -1) {
                recordDelimiterIndex++;
            }
            while (recordDelimiterIndex < blockLength && blockBytes[recordDelimiterIndex] != BySubjectRecord.RECORD_DELIMITER) {
                recordDelimiterIndex++;
            }
        }

        if (recordDelimiterIndex == blockLength) {
            // No RECORD_DELIMITER found after startAtByteIndex.
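            // The delimiter for the next record lies beyond this block (or there are no further records),
            // so there are no more document starts to report here.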
            return -1;
        }

        // recordDelimiterIndex points to the RECORD_DELIMITER before the next record
        // (or is -1 for the first record of the first block).
        int docIdDigitIndex = recordDelimiterIndex + 1;

        long currentDocId = 0;
        while (docIdDigitIndex < blockLength && blockBytes[docIdDigitIndex] >= '0' && blockBytes[docIdDigitIndex] <= '9') {
            currentDocId *= 10;
            currentDocId += blockBytes[docIdDigitIndex] - '0';
            docIdDigitIndex++;
        }

        int docIdLength = docIdDigitIndex - recordDelimiterIndex - 1;

        if (docIdDigitIndex == blockLength) {
            // The docId spans blocks, or this was the last RECORD_DELIMITER.
            block = blockCache.getBlock(blockIndex + 1);
            if (block == null) {
                // Last RECORD_DELIMITER.
                return -1;
            }
            blockBytes = block.getBytes();
            int nextBlockLength = block.getLength();

            // Continue parsing the docId digits from the start of the next block.
            docIdDigitIndex = 0;
            while (docIdDigitIndex < nextBlockLength && blockBytes[docIdDigitIndex] >= '0' && blockBytes[docIdDigitIndex] <= '9') {
                currentDocId *= 10;
                currentDocId += blockBytes[docIdDigitIndex] - '0';
                docIdDigitIndex++;
            }
            docIdLength += docIdDigitIndex;
        }

        if (blockBytes[docIdDigitIndex] != BySubjectRecord.FIELD_DELIMITER) {
            throw new IllegalStateException("Expecting field delimiter but found byte>" + blockBytes[docIdDigitIndex] + "<. Record started with "
                    + new String(blockBytes, recordDelimiterIndex + 1, docIdDigitIndex));
        }
        if (docIdLength == 0) {
            throw new IllegalStateException("Zero length docId found!");
        }

        // Success.
        startAtByteIndex[0] = recordDelimiterIndex + 1;

        return currentDocId;
    }

    @Override
    public void close() throws IOException {
        super.close();
        if (bz2FileChannel != null) {
            bz2FileChannel.close();
        }
        if (bz2InputStream != null) {
            bz2InputStream.close();
        }
    }

    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Args are: <full path to collection bz2 file> [docId...]");
            return;
        }

        String collectionBase = args[0];

        File collectionBaseFile = new File(collectionBase);
        String collectionName = collectionBaseFile.getName();
        int collectionNamePostfixStart = collectionName.lastIndexOf('.');
        if (collectionNamePostfixStart > 0) {
            collectionName = collectionName.substring(0, collectionNamePostfixStart);
        }

        BlockCompressedDocumentCollection collection = new BlockCompressedDocumentCollection(collectionName, null, 10);
        collection.filename(collectionBase);

        if (args.length >= 2) {
            // Dump the given docs.
            for (int i = 1; i < args.length; i++) {
                long docId = Long.parseLong(args[i]);

                long time = System.currentTimeMillis();
                InputStream docStream = collection.stream(docId);
                time = System.currentTimeMillis() - time;
                System.out.println(time + " milliseconds.");
                IOUtils.copy(docStream, System.out);
                System.out.println();
            }
        } else {
            // Print offsets and size.
            collection.blockOffsets.printTo(System.out);
        }

        collection.close();
    }
}