/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.index.codecs.siren10;

import java.io.IOException;

import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.index.codecs.block.BlockDecompressor;
import org.sindice.siren.index.codecs.block.BlockIndexInput;
import org.sindice.siren.util.ArrayUtils;

/**
 * Implementation of the {@link BlockIndexInput} for the .nod file of the SIREn
 * postings format.
 */
public class NodBlockIndexInput extends BlockIndexInput {

  protected BlockDecompressor nodDecompressor;

  public NodBlockIndexInput(final IndexInput in,
                            final BlockDecompressor nodDecompressor)
  throws IOException {
    super(in);
    this.nodDecompressor = nodDecompressor;
  }

  @Override
  public NodBlockReader getBlockReader() {
    // Clone the index input. A cloned index input does not need to be closed
    // by the block reader, as the underlying stream will be closed by the
    // input it was cloned from.
    return new NodBlockReader(in.clone());
  }

  /**
   * Implementation of the {@link BlockReader} for the .nod file.
   *
   * <p>
   *
   * Read and decode blocks containing the node labels and term frequencies.
   */
  protected class NodBlockReader extends BlockReader {

    int nodLenBlockSize;
    IntsRef nodLenBuffer = new IntsRef();
    int nodBlockSize;
    IntsRef nodBuffer = new IntsRef();
    int termFreqBlockSize;
    IntsRef termFreqBuffer = new IntsRef();

    /**
     * Used to slice the nodBuffer and disclose only the subset containing
     * information about the current node.
     */
    private final IntsRef currentNode = new IntsRef();

    boolean nodLenReadPending = true;
    boolean nodReadPending = true;
    boolean termFreqReadPending = true;

    int nodLenCompressedBufferLength;
    BytesRef nodLenCompressedBuffer = new BytesRef();
    int nodCompressedBufferLength;
    BytesRef nodCompressedBuffer = new BytesRef();
    int termFreqCompressedBufferLength;
    BytesRef termFreqCompressedBuffer = new BytesRef();

    private NodBlockReader(final IndexInput in) {
      super(in);
      // ensure that the output buffers have the minimum size required
      nodLenBuffer = ArrayUtils.grow(nodLenBuffer, nodDecompressor.getWindowSize());
      nodBuffer = ArrayUtils.grow(nodBuffer, nodDecompressor.getWindowSize());
      termFreqBuffer = ArrayUtils.grow(termFreqBuffer, nodDecompressor.getWindowSize());
    }
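
    /*
     * On-disk layout of a .nod block, as read by readHeader() and the
     * decode* methods below:
     *
     *   header: nodLenBlockSize (VInt), nodBlockSize (VInt),
     *           termFreqBlockSize (VInt), followed by the byte length of
     *           each of the three compressed regions (one VInt each)
     *   data:   node lengths region, node labels region, term frequencies
     *           region, each independently block-compressed
     */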
    @Override
    protected void readHeader() throws IOException {
      // logger.debug("Read Nod header: {}", this.hashCode());
      // logger.debug("Nod header start at {}", in.getFilePointer());

      // read blockSize and check buffer size
      nodLenBlockSize = in.readVInt();
      // ensure that the output buffer has the minimum size required
      final int nodLenBufferLength = this.getMinimumBufferSize(nodLenBlockSize, nodDecompressor.getWindowSize());
      nodLenBuffer = ArrayUtils.grow(nodLenBuffer, nodLenBufferLength);
      // logger.debug("Read Nod length block size: {}", nodLenBlockSize);

      nodBlockSize = in.readVInt();
      // ensure that the output buffer has the minimum size required
      final int nodBufferLength = this.getMinimumBufferSize(nodBlockSize, nodDecompressor.getWindowSize());
      nodBuffer = ArrayUtils.grow(nodBuffer, nodBufferLength);
      // logger.debug("Read Nod block size: {}", nodBlockSize);

      termFreqBlockSize = in.readVInt();
      // ensure that the output buffer has the minimum size required
      final int termFreqBufferLength = this.getMinimumBufferSize(termFreqBlockSize, nodDecompressor.getWindowSize());
      termFreqBuffer = ArrayUtils.grow(termFreqBuffer, termFreqBufferLength);
      // logger.debug("Read Term Freq In Node block size: {}", termFreqBlockSize);

      // read size of each compressed data block and check buffer size
      nodLenCompressedBufferLength = in.readVInt();
      nodLenCompressedBuffer = ArrayUtils.grow(nodLenCompressedBuffer, nodLenCompressedBufferLength);
      nodLenReadPending = true;

      nodCompressedBufferLength = in.readVInt();
      nodCompressedBuffer = ArrayUtils.grow(nodCompressedBuffer, nodCompressedBufferLength);
      nodReadPending = true;

      termFreqCompressedBufferLength = in.readVInt();
      termFreqCompressedBuffer = ArrayUtils.grow(termFreqCompressedBuffer, termFreqCompressedBufferLength);
      termFreqReadPending = true;

      // decode node lengths
      this.decodeNodeLengths();

      // copy reference of node buffer
      currentNode.ints = nodBuffer.ints;
    }

    @Override
    protected void skipData() throws IOException {
      long size = 0;
      if (nodLenReadPending) {
        size += nodLenCompressedBufferLength;
      }
      if (nodReadPending) {
        size += nodCompressedBufferLength;
      }
      if (termFreqReadPending) {
        size += termFreqCompressedBufferLength;
      }
      this.seek(in.getFilePointer() + size);
      // logger.debug("Skip Nod data: {}", in.getFilePointer() + size);
    }
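
    /*
     * The three decode* methods below follow the same pattern: read the
     * compressed region into a byte buffer, decompress it into the matching
     * int buffer, then clamp the buffer length to the block size, since a
     * decompressor with a large window (e.g., AFor) may decode more integers
     * than the block actually contains. The *ReadPending flags ensure that
     * each region is decoded at most once per block.
     */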
    private void decodeNodeLengths() throws IOException {
      // logger.debug("Decode Nodes Length: {}", this.hashCode());
      // logger.debug("Decode Nodes Length at {}", in.getFilePointer());
      in.readBytes(nodLenCompressedBuffer.bytes, 0, nodLenCompressedBufferLength);
      nodLenCompressedBuffer.offset = 0;
      nodLenCompressedBuffer.length = nodLenCompressedBufferLength;
      nodDecompressor.decompress(nodLenCompressedBuffer, nodLenBuffer);
      // set length limit based on block size, as certain decompressors with
      // large window size can set it larger than the blockSize, e.g., AFor
      nodLenBuffer.length = nodLenBlockSize;
      nodLenReadPending = false;
    }

    private void decodeNodes() throws IOException {
      // logger.debug("Decode Nodes: {}", this.hashCode());
      // logger.debug("Decode Nodes at {}", in.getFilePointer());
      in.readBytes(nodCompressedBuffer.bytes, 0, nodCompressedBufferLength);
      nodCompressedBuffer.offset = 0;
      nodCompressedBuffer.length = nodCompressedBufferLength;
      nodDecompressor.decompress(nodCompressedBuffer, nodBuffer);
      // set length limit based on block size, as certain decompressors with
      // large window size can set it larger than the blockSize, e.g., AFor
      nodBuffer.length = nodBlockSize;
      nodReadPending = false;
    }

    private void decodeTermFreqs() throws IOException {
      // logger.debug("Decode Term Freq in Node: {}", this.hashCode());
      // logger.debug("Decode Term Freq in Node at {}", in.getFilePointer());
      in.readBytes(termFreqCompressedBuffer.bytes, 0, termFreqCompressedBufferLength);
      termFreqCompressedBuffer.offset = 0;
      termFreqCompressedBuffer.length = termFreqCompressedBufferLength;
      nodDecompressor.decompress(termFreqCompressedBuffer, termFreqBuffer);
      // set length limit based on block size, as certain decompressors with
      // large window size can set it larger than the blockSize, e.g., AFor
      termFreqBuffer.length = termFreqBlockSize;
      termFreqReadPending = false;
    }

    /**
     * Decode and return the next node label of the current block.
     *
     * <p>
     *
     * The {@link IntsRef} returned is a slice of the uncompressed node block.
     */
    public IntsRef nextNode() throws IOException {
      if (nodReadPending) {
        this.decodeNodes();
      }
      // decode delta
      this.deltaDecoding();
      return currentNode;
    }

    /**
     * Delta-decode the next node against the previous one.
     * <p>
     * If a new document has just been read (currentNode.length == 0), the
     * node is taken verbatim and only the currentNode offset and length are
     * updated. Otherwise, the components are delta-decoded against the
     * previous node, up to and including the first component that differs;
     * the remaining components are kept as stored. For example, if the
     * previous node is [1, 3, 2] and the stored values are [0, 2, 5],
     * decoding yields [1, 5, 5]: 0 + 1 = 1 equals the previous first
     * component, 2 + 3 = 5 differs, so decoding stops and the trailing 5 is
     * kept as-is.
     */
    private final void deltaDecoding() {
      final int[] nodBufferInts = nodBuffer.ints;
      // node lengths are stored decremented by one
      final int nodLength = nodLenBuffer.ints[nodLenBuffer.offset++] + 1;
      final int nodOffset = nodBuffer.offset;
      final int nodEnd = nodOffset + nodLength;

      final int currentNodeOffset = currentNode.offset;
      final int currentNodeEnd = currentNodeOffset + currentNode.length;

      for (int i = nodOffset, j = currentNodeOffset;
           i < nodEnd && j < currentNodeEnd; i++, j++) {
        nodBufferInts[i] += nodBufferInts[j];
        // if node ids are different, then stop decoding
        if (nodBufferInts[i] != nodBufferInts[j]) {
          break;
        }
      }

      // increment node buffer offset
      nodBuffer.offset += nodLength;
      // update last node offset and length
      currentNode.offset = nodOffset;
      currentNode.length = nodLength;
    }

    /**
     * Decode and return the next term frequency of the current block.
     */
    public int nextTermFreqInNode() throws IOException {
      if (termFreqReadPending) {
        this.decodeTermFreqs();
      }
      // frequencies are stored decremented by one
      return termFreqBuffer.ints[termFreqBuffer.offset++] + 1;
    }

    @Override
    public boolean isExhausted() {
      return nodLenBuffer.offset >= nodLenBuffer.length;
    }

    @Override
    public void initBlock() {
      nodLenBuffer.offset = nodLenBuffer.length = 0;
      nodBuffer.offset = nodBuffer.length = 0;
      termFreqBuffer.offset = termFreqBuffer.length = 0;
      this.resetCurrentNode();

      nodLenReadPending = true;
      nodReadPending = true;
      termFreqReadPending = true;

      nodLenCompressedBufferLength = 0;
      nodCompressedBufferLength = 0;
      termFreqCompressedBufferLength = 0;
    }

    public void resetCurrentNode() {
      currentNode.offset = currentNode.length = 0;
    }

  }

}