package com.yahoo.glimmer.util;

import it.unimi.di.big.mg4j.document.AbstractDocumentCollection;
import it.unimi.di.big.mg4j.document.Document;
import it.unimi.di.big.mg4j.document.DocumentCollection;
import it.unimi.di.big.mg4j.document.DocumentFactory;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.ByteBufferInputStream;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.itadaki.bzip2.BZip2BitInputStream;
import org.itadaki.bzip2.BZip2BlockDecompressor;
import org.itadaki.bzip2.BZip2Constants;

/**
 * A DocumentCollection using a bzip2 file and a list of 'first docId in block' to block offsets.
 *
 * Would be nice to make retrieval of docs that are not in the collection more efficient.
 *
 * @author tep
 */
public class BlockCompressedDocumentCollection extends AbstractDocumentCollection implements Serializable {
    private final static Logger LOGGER = Logger.getLogger(BlockCompressedDocumentCollection.class);
    private static final long serialVersionUID = -7943857364950329249L;

    private static final byte[] ZERO_BYTE_BUFFER = new byte[0];

    public static final String COMPRESSED_FILE_EXTENSION = ".bz2";
    public static final String BLOCK_OFFSETS_EXTENSION = ".blockOffsets";

    private final String name;
    private final DocumentFactory documentFactory;
    private BlockOffsets blockOffsets;
    private FileInputStream bz2InputStream;
    private FileChannel bz2FileChannel;
    private int uncompressedBlockSize;
    private BlockCache blockCache;

    public BlockCompressedDocumentCollection(String name, DocumentFactory documentFactory, final int cacheSize) {
        this.name = new File(name).getName();
        this.documentFactory = documentFactory;
    }

    @Override
    public void filename(CharSequence absolutePathToAFileInTheCollection) throws IOException {
        File absolutePathToCollection = new File(absolutePathToAFileInTheCollection.toString()).getParentFile();

        File bz2File = new File(absolutePathToCollection, name + COMPRESSED_FILE_EXTENSION);
        bz2InputStream = new FileInputStream(bz2File);
        if (bz2InputStream.read() != 'B' || bz2InputStream.read() != 'Z' || bz2InputStream.read() != 'h') {
            bz2InputStream.close();
            throw new IllegalArgumentException(bz2File.getAbsolutePath() + " doesn't have a bzip2 header!");
        }
        uncompressedBlockSize = bz2InputStream.read() - '0';
        if (uncompressedBlockSize < 0 || uncompressedBlockSize > 9) {
            bz2InputStream.close();
            throw new IllegalArgumentException(bz2File.getAbsolutePath() + " has an invalid block size byte.");
        }
        uncompressedBlockSize *= 100 * 1024; // This is weird. The uncompressed blocks can be bigger than multiples of 100000.
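        // The compressed file is accessed through its FileChannel so that individual blocks can be
        // memory-mapped on demand; the companion .blockOffsets file holds the serialized BlockOffsets
        // object used to map docIds to blocks and blocks to bit offsets within the compressed file.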
        FileChannel bz2FileChannel = bz2InputStream.getChannel();

        File blockOffsetsFile = new File(absolutePathToCollection, name + BLOCK_OFFSETS_EXTENSION);
        InputStream blockOffsetsInputStream = new FileInputStream(blockOffsetsFile);

        init(bz2FileChannel, blockOffsetsInputStream, uncompressedBlockSize);

        blockOffsetsInputStream.close();
    }

    public void init(FileChannel bz2FileChannel, InputStream blockOffsetsInputStream, int uncompressedBlockSize) throws IOException {
        this.bz2FileChannel = bz2FileChannel;

        DataInputStream blockOffsetsDataInput = new DataInputStream(blockOffsetsInputStream);
        try {
            blockOffsets = (BlockOffsets) BinIO.loadObject(blockOffsetsDataInput);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("BinIO.loadObject() threw:" + e);
        }

        this.uncompressedBlockSize = uncompressedBlockSize;

        blockCache = new BlockCache(new BlockCache.BlockReader() {
            @Override
            public int readBlock(long blockIndex, byte[] buffer) throws IOException {
                return uncompressBlock(blockIndex, buffer);
            }
        }, blockOffsets.getBlockCount() - 1, uncompressedBlockSize, 1024);
    }

    @Override
    public long size() {
        return blockOffsets.getLastDocId();
    }

    @Override
    public Document document(long index) throws IOException {
        InputStream stream = stream(index);
        Reference2ObjectMap<Enum<?>, Object> metadata = getMetadata(stream);
        return documentFactory.getDocument(stream, metadata);
    }

    @Override
    public InputStream stream(long index) throws IOException {
        InputStream inputStream;
        long time = System.currentTimeMillis();
        try {
            inputStream = getInputStreamStartingAtDocStart(index);
        } catch (IOException e) {
            LOGGER.error("Got IOException getting doc at index:" + index);
            throw e;
        }
        time = System.currentTimeMillis() - time;
        LOGGER.debug("stream(" + index + ") took " + time + "ms.");

        if (inputStream == null) {
            inputStream = new ByteArrayInputStream(ZERO_BYTE_BUFFER);
        }
        return inputStream;
    }

    @Override
    public Reference2ObjectMap<Enum<?>, Object> metadata(long index) throws IOException {
        return getMetadata(stream(index));
    }

    private Reference2ObjectMap<Enum<?>, Object> getMetadata(InputStream stream) throws IOException {
        Reference2ObjectOpenHashMap<Enum<?>, Object> metadata = new Reference2ObjectOpenHashMap<Enum<?>, Object>();
        // TODO Why is this not picked up from the factory's metadata?
        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
        return metadata;
    }

    @Override
    public DocumentCollection copy() {
        throw new UnsupportedOperationException();
    }

    @Override
    public DocumentFactory factory() {
        return documentFactory;
    }

    private InputStream getInputStreamStartingAtDocStart(long requiredDocId) throws IOException {
        if (requiredDocId > blockOffsets.getLastDocId()) {
            return null;
        }

        final long blockIndex = blockOffsets.getBlockIndex(requiredDocId);
        if (blockIndex == -1 || blockIndex >= blockOffsets.getBlockCount()) {
            return null;
        }

        int[] recordStartOffset = { -1 };
        long currentDocId = getNextDocId(blockIndex, recordStartOffset);
        if (currentDocId > requiredDocId) {
            // The first doc id in the block is bigger than the requiredDocId.
            LOGGER.warn("The first doc id(" + currentDocId + ") in the block(" + blockIndex + ") is bigger than the requiredDocId(" + requiredDocId
                    + "). This may be an error in the bySubject.blockOffsets file.");
            return null;
        }
        while (currentDocId != -1 && currentDocId < requiredDocId) {
            currentDocId = getNextDocId(blockIndex, recordStartOffset);
        }
        if (currentDocId == requiredDocId) {
            // Found.
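            // Serve the document from the block cache, starting at the byte offset where its record begins.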
            return blockCache.getInputStream(blockIndex, recordStartOffset[0]);
        }
        if (currentDocId == -1 && recordStartOffset[0] == -1) {
            // There are no docIds in this block.
            LOGGER.warn("There are no docIds in this block(" + blockIndex + "). RequiredDocId(" + requiredDocId
                    + "). This may be an error in the bySubject.blockOffsets file.");
        }
        // Not found.
        return null;
    }

    private int uncompressBlock(long blockIndex, byte[] uncompressedBuffer) throws IOException {
        final long blockStartBitOffset = blockOffsets.getBlockStartBitOffset(blockIndex);
        final long blockEndBitOffset = blockOffsets.getBlockStartBitOffset(blockIndex + 1);

        final long blockStartByteOffset = blockStartBitOffset / 8;
        final int blockStartSkipBits = (int) (blockStartBitOffset % 8);
        final long blockEndByteOffset = blockEndBitOffset / 8;

        final MappedByteBuffer blockMappedByteBuffer = bz2FileChannel.map(MapMode.READ_ONLY, blockStartByteOffset,
                (blockEndByteOffset - blockStartByteOffset) + 1);
        final ByteBufferInputStream blockInputStream = new ByteBufferInputStream(blockMappedByteBuffer);
        final BZip2BitInputStream blockBitInputStream = new BZip2BitInputStream(blockInputStream);

        // Block boundaries are bit offsets, so skip the bits between the mapped byte boundary and the block start.
        blockBitInputStream.readBits(blockStartSkipBits);

        /* Read block-header or end-of-stream marker */
        final int marker1 = blockBitInputStream.readBits(24);
        final int marker2 = blockBitInputStream.readBits(24);

        if (marker1 == BZip2Constants.BLOCK_HEADER_MARKER_1 && marker2 == BZip2Constants.BLOCK_HEADER_MARKER_2) {
            // System.err.println("Decompressing block " + blockIndex + " S:" + blockStartBitOffset + " E:" + blockEndBitOffset);
            // System.err.flush();
            final BZip2BlockDecompressor blockDecompressor = new BZip2BlockDecompressor(blockBitInputStream, uncompressedBlockSize);
            return blockDecompressor.read(uncompressedBuffer, 0, uncompressedBlockSize);
        } else if (marker1 == BZip2Constants.STREAM_END_MARKER_1 && marker2 == BZip2Constants.STREAM_END_MARKER_2) {
            throw new IllegalArgumentException("End of BZip2 stream marker at bit " + blockStartBitOffset);
        } else {
            throw new IllegalStateException("Not a BZip2 block header at bit " + blockStartBitOffset);
        }
    }

    /**
     * @param blockIndex
     *            The index of the block we are looking for the next DocId in.
     * @param startAtByteIndex
     *            The current byte index in the block.
     * @return The next docId, or -1 if there are no more doc starts.
     * @throws IllegalStateException
     *             On corrupt record starts.
     * @throws IOException
     *             On failing to read blocks.
     */
    private long getNextDocId(long blockIndex, int[] startAtByteIndex) throws IllegalStateException, IOException {
        BlockCache.Block block = blockCache.getBlock(blockIndex);

        int recordDelimiterIndex = startAtByteIndex[0];
        byte[] blockBytes = block.getBytes();
        int blockLength = block.getLength();

        if (blockIndex != 0 || recordDelimiterIndex != -1) { // First record in first block check.
            if (recordDelimiterIndex == -1) {
                recordDelimiterIndex++;
            }
            while (recordDelimiterIndex < blockLength && blockBytes[recordDelimiterIndex] != BySubjectRecord.RECORD_DELIMITER) {
                recordDelimiterIndex++;
            }
        }

        if (recordDelimiterIndex == blockLength) {
            // No RECORD_DELIMITER found after startAtByteIndex.
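            // The delimiter for the next record lies beyond this block (or there are no further records),
            // so there are no more document starts to report here.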
            return -1;
        }

        // recordDelimiterIndex points to the RECORD_DELIMITER before the next record
        // (or is -1 for the first record of the first block).
        int docIdDigitIndex = recordDelimiterIndex + 1;

        long currentDocId = 0;
        while (docIdDigitIndex < blockLength && blockBytes[docIdDigitIndex] >= '0' && blockBytes[docIdDigitIndex] <= '9') {
            currentDocId *= 10;
            currentDocId += blockBytes[docIdDigitIndex] - '0';
            docIdDigitIndex++;
        }

        int docIdLength = docIdDigitIndex - recordDelimiterIndex - 1;

        if (docIdDigitIndex == blockLength) {
            // The docId spans blocks, or this was the last RECORD_DELIMITER.
            block = blockCache.getBlock(blockIndex + 1);
            if (block == null) {
                // Last RECORD_DELIMITER.
                return -1;
            }
            blockBytes = block.getBytes();
            int nextBlockLength = block.getLength();

            // Continue parsing the docId digits from the start of the next block.
            docIdDigitIndex = 0;
            while (docIdDigitIndex < nextBlockLength && blockBytes[docIdDigitIndex] >= '0' && blockBytes[docIdDigitIndex] <= '9') {
                currentDocId *= 10;
                currentDocId += blockBytes[docIdDigitIndex] - '0';
                docIdDigitIndex++;
            }
            docIdLength += docIdDigitIndex;
        }

        if (blockBytes[docIdDigitIndex] != BySubjectRecord.FIELD_DELIMITER) {
            throw new IllegalStateException("Expecting field delimiter but found byte>" + blockBytes[docIdDigitIndex] + "<. Record started with "
                    + new String(blockBytes, recordDelimiterIndex + 1, docIdDigitIndex));
        }
        if (docIdLength == 0) {
            throw new IllegalStateException("Zero length docId found!");
        }

        // Success.
        startAtByteIndex[0] = recordDelimiterIndex + 1;

        return currentDocId;
    }

    @Override
    public void close() throws IOException {
        super.close();
        if (bz2FileChannel != null) {
            bz2FileChannel.close();
        }
        if (bz2InputStream != null) {
            bz2InputStream.close();
        }
    }

    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Args are: <full path to collection bz2 file> [docId...]");
            return;
        }

        String collectionBase = args[0];

        File collectionBaseFile = new File(collectionBase);
        String collectionName = collectionBaseFile.getName();
        int collectionNamePostfixStart = collectionName.lastIndexOf('.');
        if (collectionNamePostfixStart > 0) {
            collectionName = collectionName.substring(0, collectionNamePostfixStart);
        }

        BlockCompressedDocumentCollection collection = new BlockCompressedDocumentCollection(collectionName, null, 10);
        collection.filename(collectionBase);

        if (args.length >= 2) {
            // Dump the given docs.
            for (int i = 1; i < args.length; i++) {
                long docId = Long.parseLong(args[i]);

                long time = System.currentTimeMillis();
                InputStream docStream = collection.stream(docId);
                time = System.currentTimeMillis() - time;
                System.out.println(time + " milliseconds.");
                IOUtils.copy(docStream, System.out);
                System.out.println();
            }
        } else {
            // Print offsets and size.
            collection.blockOffsets.printTo(System.out);
        }

        collection.close();
    }
}