/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.index.codecs.siren10;

import java.io.IOException;

import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.index.codecs.block.BlockDecompressor;
import org.sindice.siren.index.codecs.block.BlockIndexInput;
import org.sindice.siren.util.ArrayUtils;

/**
 * Implementation of the {@link BlockIndexInput} for the .nod file of the SIREn
 * postings format.
 */
public class NodBlockIndexInput extends BlockIndexInput {

  protected BlockDecompressor nodDecompressor;

  public NodBlockIndexInput(final IndexInput in,
                            final BlockDecompressor nodDecompressor)
  throws IOException {
    super(in);
    this.nodDecompressor = nodDecompressor;
  }

  @Override
  public NodBlockReader getBlockReader() {
    // Clone the index input. A cloned index input does not need to be closed
    // by the block reader, as the underlying stream will be closed by the
    // input it was cloned from.
    return new NodBlockReader(in.clone());
  }

  /**
   * Implementation of the {@link BlockReader} for the .nod file.
   *
   * <p>
   *
   * Read and decode blocks containing the node labels and term frequencies.
   */
  protected class NodBlockReader extends BlockReader {

    int nodLenBlockSize;
    IntsRef nodLenBuffer = new IntsRef();
    int nodBlockSize;
    IntsRef nodBuffer = new IntsRef();
    int termFreqBlockSize;
    IntsRef termFreqBuffer = new IntsRef();

    /**
     * Used to slice the nodBuffer and disclose only the subset containing
     * information about the current node.
     */
    private final IntsRef currentNode = new IntsRef();

    boolean nodLenReadPending = true;
    boolean nodReadPending = true;
    boolean termFreqReadPending = true;

    int nodLenCompressedBufferLength;
    BytesRef nodLenCompressedBuffer = new BytesRef();
    int nodCompressedBufferLength;
    BytesRef nodCompressedBuffer = new BytesRef();
    int termFreqCompressedBufferLength;
    BytesRef termFreqCompressedBuffer = new BytesRef();

    private NodBlockReader(final IndexInput in) {
      super(in);
      // ensure that the output buffers have the minimum size required
      nodLenBuffer = ArrayUtils.grow(nodLenBuffer, nodDecompressor.getWindowSize());
      nodBuffer = ArrayUtils.grow(nodBuffer, nodDecompressor.getWindowSize());
      termFreqBuffer = ArrayUtils.grow(termFreqBuffer, nodDecompressor.getWindowSize());
    }
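
    /*
     * On-disk layout of a .nod block, as read by readHeader() and the
     * decode* methods below:
     *
     *   header: nodLenBlockSize (VInt), nodBlockSize (VInt),
     *           termFreqBlockSize (VInt), followed by the byte length of
     *           each of the three compressed regions (one VInt each)
     *   data:   node lengths region, node labels region, term frequencies
     *           region, each independently block-compressed
     */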
    @Override
    protected void readHeader() throws IOException {
      // logger.debug("Read Nod header: {}", this.hashCode());
      // logger.debug("Nod header start at {}", in.getFilePointer());

      // read blockSize and check buffer size
      nodLenBlockSize = in.readVInt();
      // ensure that the output buffer has the minimum size required
      final int nodLenBufferLength = this.getMinimumBufferSize(nodLenBlockSize, nodDecompressor.getWindowSize());
      nodLenBuffer = ArrayUtils.grow(nodLenBuffer, nodLenBufferLength);
      // logger.debug("Read Nod length block size: {}", nodLenBlockSize);

      nodBlockSize = in.readVInt();
      // ensure that the output buffer has the minimum size required
      final int nodBufferLength = this.getMinimumBufferSize(nodBlockSize, nodDecompressor.getWindowSize());
      nodBuffer = ArrayUtils.grow(nodBuffer, nodBufferLength);
      // logger.debug("Read Nod block size: {}", nodBlockSize);

      termFreqBlockSize = in.readVInt();
      // ensure that the output buffer has the minimum size required
      final int termFreqBufferLength = this.getMinimumBufferSize(termFreqBlockSize, nodDecompressor.getWindowSize());
      termFreqBuffer = ArrayUtils.grow(termFreqBuffer, termFreqBufferLength);
      // logger.debug("Read Term Freq In Node block size: {}", termFreqBlockSize);

      // read size of each compressed data block and check buffer size
      nodLenCompressedBufferLength = in.readVInt();
      nodLenCompressedBuffer = ArrayUtils.grow(nodLenCompressedBuffer, nodLenCompressedBufferLength);
      nodLenReadPending = true;

      nodCompressedBufferLength = in.readVInt();
      nodCompressedBuffer = ArrayUtils.grow(nodCompressedBuffer, nodCompressedBufferLength);
      nodReadPending = true;

      termFreqCompressedBufferLength = in.readVInt();
      termFreqCompressedBuffer = ArrayUtils.grow(termFreqCompressedBuffer, termFreqCompressedBufferLength);
      termFreqReadPending = true;

      // decode node lengths
      this.decodeNodeLengths();

      // copy reference of node buffer
      currentNode.ints = nodBuffer.ints;
    }

    @Override
    protected void skipData() throws IOException {
      long size = 0;
      if (nodLenReadPending) {
        size += nodLenCompressedBufferLength;
      }
      if (nodReadPending) {
        size += nodCompressedBufferLength;
      }
      if (termFreqReadPending) {
        size += termFreqCompressedBufferLength;
      }
      this.seek(in.getFilePointer() + size);
      // logger.debug("Skip Nod data: {}", in.getFilePointer() + size);
    }
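
    /*
     * The three decode* methods below follow the same pattern: read the
     * compressed region into a byte buffer, decompress it into the matching
     * int buffer, then clamp the buffer length to the block size, since a
     * decompressor with a large window (e.g., AFor) may decode more integers
     * than the block actually contains. The *ReadPending flags ensure that
     * each region is decoded at most once per block.
     */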
    private void decodeNodeLengths() throws IOException {
      // logger.debug("Decode Nodes Length: {}", this.hashCode());
      // logger.debug("Decode Nodes Length at {}", in.getFilePointer());
      in.readBytes(nodLenCompressedBuffer.bytes, 0, nodLenCompressedBufferLength);
      nodLenCompressedBuffer.offset = 0;
      nodLenCompressedBuffer.length = nodLenCompressedBufferLength;
      nodDecompressor.decompress(nodLenCompressedBuffer, nodLenBuffer);
      // set length limit based on block size, as certain decompressors with
      // large window size can set it larger than the blockSize, e.g., AFor
      nodLenBuffer.length = nodLenBlockSize;
      nodLenReadPending = false;
    }

    private void decodeNodes() throws IOException {
      // logger.debug("Decode Nodes: {}", this.hashCode());
      // logger.debug("Decode Nodes at {}", in.getFilePointer());
      in.readBytes(nodCompressedBuffer.bytes, 0, nodCompressedBufferLength);
      nodCompressedBuffer.offset = 0;
      nodCompressedBuffer.length = nodCompressedBufferLength;
      nodDecompressor.decompress(nodCompressedBuffer, nodBuffer);
      // set length limit based on block size, as certain decompressors with
      // large window size can set it larger than the blockSize, e.g., AFor
      nodBuffer.length = nodBlockSize;
      nodReadPending = false;
    }

    private void decodeTermFreqs() throws IOException {
      // logger.debug("Decode Term Freq in Node: {}", this.hashCode());
      // logger.debug("Decode Term Freq in Node at {}", in.getFilePointer());
      in.readBytes(termFreqCompressedBuffer.bytes, 0, termFreqCompressedBufferLength);
      termFreqCompressedBuffer.offset = 0;
      termFreqCompressedBuffer.length = termFreqCompressedBufferLength;
      nodDecompressor.decompress(termFreqCompressedBuffer, termFreqBuffer);
      // set length limit based on block size, as certain decompressors with
      // large window size can set it larger than the blockSize, e.g., AFor
      termFreqBuffer.length = termFreqBlockSize;
      termFreqReadPending = false;
    }

    /**
     * Decode and return the next node label of the current block.
     *
     * <p>
     *
     * The {@link IntsRef} returned is a slice of the uncompressed node block.
     */
    public IntsRef nextNode() throws IOException {
      if (nodReadPending) {
        this.decodeNodes();
      }
      // decode delta
      this.deltaDecoding();
      return currentNode;
    }

    /**
     * Delta-decode the next node against the previous one.
     * <p>
     * If a new document has just been read (currentNode.length == 0), the
     * node is taken verbatim and only the currentNode offset and length are
     * updated. Otherwise, the components are delta-decoded against the
     * previous node, up to and including the first component that differs;
     * the remaining components are kept as stored. For example, if the
     * previous node is [1, 3, 2] and the stored values are [0, 2, 5],
     * decoding yields [1, 5, 5]: 0 + 1 = 1 equals the previous first
     * component, 2 + 3 = 5 differs, so decoding stops and the trailing 5 is
     * kept as-is.
     */
    private final void deltaDecoding() {
      final int[] nodBufferInts = nodBuffer.ints;
      // node lengths are stored decremented by one
      final int nodLength = nodLenBuffer.ints[nodLenBuffer.offset++] + 1;
      final int nodOffset = nodBuffer.offset;
      final int nodEnd = nodOffset + nodLength;

      final int currentNodeOffset = currentNode.offset;
      final int currentNodeEnd = currentNodeOffset + currentNode.length;

      for (int i = nodOffset, j = currentNodeOffset;
           i < nodEnd && j < currentNodeEnd; i++, j++) {
        nodBufferInts[i] += nodBufferInts[j];
        // if node ids are different, then stop decoding
        if (nodBufferInts[i] != nodBufferInts[j]) {
          break;
        }
      }

      // increment node buffer offset
      nodBuffer.offset += nodLength;
      // update last node offset and length
      currentNode.offset = nodOffset;
      currentNode.length = nodLength;
    }

    /**
     * Decode and return the next term frequency of the current block.
     */
    public int nextTermFreqInNode() throws IOException {
      if (termFreqReadPending) {
        this.decodeTermFreqs();
      }
      // frequencies are stored decremented by one
      return termFreqBuffer.ints[termFreqBuffer.offset++] + 1;
    }

    @Override
    public boolean isExhausted() {
      return nodLenBuffer.offset >= nodLenBuffer.length;
    }

    @Override
    public void initBlock() {
      nodLenBuffer.offset = nodLenBuffer.length = 0;
      nodBuffer.offset = nodBuffer.length = 0;
      termFreqBuffer.offset = termFreqBuffer.length = 0;
      this.resetCurrentNode();

      nodLenReadPending = true;
      nodReadPending = true;
      termFreqReadPending = true;

      nodLenCompressedBufferLength = 0;
      nodCompressedBufferLength = 0;
      termFreqCompressedBufferLength = 0;
    }

    public void resetCurrentNode() {
      currentNode.offset = currentNode.length = 0;
    }

  }

}