Siren10PostingsWriter.java example

Explorer
siren-master
/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.sindice.siren.index.codecs.siren10;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.MappingMultiDocsAndPositionsEnum;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.analysis.filter.VIntPayloadCodec;
import org.sindice.siren.index.MappingMultiDocsNodesAndPositionsEnum;
import org.sindice.siren.index.codecs.block.BlockIndexOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Writes the document identifiers, node frequencies, node labels, term
 * frequencies, term positions and block skip data.
 */
public class Siren10PostingsWriter extends PostingsWriterBase {

  final static String                          CODEC                       = "Siren10PostingsWriter";

  final static String                          DOC_EXTENSION               = "doc";
  final static String                          SKIP_EXTENSION              = "skp";
  final static String                          NOD_EXTENSION               = "nod";
  final static String                          POS_EXTENSION               = "pos";

  // Increment version to change it:
  final static int                             VERSION_START               = 0;
  final static int                             VERSION_CURRENT             = VERSION_START;

  DocsFreqBlockIndexOutput                     docOut;
  DocsFreqBlockIndexOutput.DocsFreqBlockWriter docWriter;
  DocsFreqBlockIndexOutput.Index               docIndex;

  NodBlockIndexOutput                          nodOut;
  NodBlockIndexOutput.NodBlockWriter           nodWriter;
  NodBlockIndexOutput.Index                    nodIndex;

  PosBlockIndexOutput                          posOut;
  PosBlockIndexOutput.PosBlockWriter           posWriter;
  PosBlockIndexOutput.Index                    posIndex;

  IndexOutput                                  skipOut;
  IndexOutput                                  termsOut;

  final Siren10SkipListWriter skipWriter;

  /**
   * Expert: The fraction of blocks stored in skip tables,
   * used to accelerate {@link DocsEnum#advance(int)}.  Larger values result in
   * smaller indexes, greater acceleration, but fewer accelerable cases, while
   * smaller values result in bigger indexes, less acceleration and more
   * accelerable cases.
   */
  final int blockSkipInterval;
  static final int DEFAULT_BLOCK_SKIP_INTERVAL = 2;

  /**
   * Expert: minimum block to write any skip data at all
   */
  final int blockSkipMinimum;

  /**
   * Expert: maximum block size allowed.
   */
  final int maxBlockSize;

  /**
   * Expert: The maximum number of skip levels. Smaller values result in
   * slightly smaller indexes, but slower skipping in big posting lists.
   */
  final int maxSkipLevels = 10;

  final int totalNumDocs;

  IndexOptions indexOptions;

  FieldInfo fieldInfo;

  int blockCount;

  // Holds pending byte[] blob for the current terms block
  private final RAMOutputStream indexBytesWriter = new RAMOutputStream();

  protected static final Logger logger = LoggerFactory.getLogger(Siren10PostingsWriter.class);

  public Siren10PostingsWriter(final SegmentWriteState state,
                               final Siren10BlockStreamFactory factory)
  throws IOException {
    this(state, DEFAULT_BLOCK_SKIP_INTERVAL, factory);
  }

  public Siren10PostingsWriter(final SegmentWriteState state,
                               final int blockSkipInterval,
                               final Siren10BlockStreamFactory factory)
  throws IOException {
    nodOut = null;
    nodIndex = null;
    posOut = null;
    posIndex = null;
    boolean success = false;

    try {
      this.blockSkipInterval = blockSkipInterval;
      this.blockSkipMinimum = blockSkipInterval; /* set to the same for now */

      final String docFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
        state.segmentSuffix, DOC_EXTENSION);
      docOut = factory.createDocsFreqOutput(state.directory, docFileName, state.context);
      docWriter = docOut.getBlockWriter();
      docIndex = docOut.index();

      this.maxBlockSize = docWriter.getMaxBlockSize();

      final String nodFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
        state.segmentSuffix, NOD_EXTENSION);
      nodOut = factory.createNodOutput(state.directory, nodFileName, state.context);
      nodWriter = nodOut.getBlockWriter();
      nodIndex = nodOut.index();

      final String posFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
        state.segmentSuffix, POS_EXTENSION);
      posOut = factory.createPosOutput(state.directory, posFileName, state.context);
      posWriter = posOut.getBlockWriter();
      posIndex = posOut.index();

      final String skipFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
        state.segmentSuffix, SKIP_EXTENSION);
      skipOut = state.directory.createOutput(skipFileName, state.context);

      totalNumDocs = state.segmentInfo.getDocCount();

      // EStimate number of blocks that will be written
      final int numBlocks = (int) Math.ceil(totalNumDocs / (double) docWriter.getMaxBlockSize());
      skipWriter = new Siren10SkipListWriter(blockSkipInterval, maxSkipLevels,
        numBlocks, docOut);
      docWriter.setNodeBlockIndex(nodIndex);
      docWriter.setPosBlockIndex(posIndex);

      success = true;
    }
    finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(docOut, skipOut, nodOut, posOut);
      }
    }
  }

  @Override
  public void start(final IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.writeInt(blockSkipInterval);                // write skipInterval
    termsOut.writeInt(maxSkipLevels);               // write maxSkipLevels
    termsOut.writeInt(blockSkipMinimum);                 // write skipMinimum
    termsOut.writeInt(maxBlockSize);                 // write maxBlockSize
  }

  @Override
  public void startTerm() throws IOException {
    docIndex.mark();
    nodIndex.mark();
    posIndex.mark();

    skipWriter.resetSkip(docIndex);
  }

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public void setField(final FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    this.indexOptions = fieldInfo.getIndexOptions();
    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
      throw new UnsupportedOperationException("this codec cannot index offsets");
    }
    skipWriter.setIndexOptions(indexOptions);
  }

  /**
   * Adds a new doc in this term. If this returns null
   * then we just skip consuming positions.
   * <p>
   * {@code termDocFreq} parameter is ignored as term frequency in document is
   * not used.
   */
  @Override
  public void startDoc(final int docID, final int termDocFreq)
  throws IOException {
    if (docID < 0) {
      throw new CorruptIndexException("docs out of order (" + docID + ") (docOut: " + docOut + ")");
    }

    if (docWriter.isFull()) {
      if ((++blockCount % blockSkipInterval) == 0) {
        skipWriter.setSkipData(docWriter.getFirstDocId());
        skipWriter.bufferSkip(blockCount);
      }
      docWriter.flush();
      nodWriter.flush(); // flush node block to synchronise it with doc block
      posWriter.flush(); // flush pos block to synchronise it with doc block
    }

    docWriter.write(docID);

    // reset current node for delta computation
    nodWriter.resetCurrentNode();

    // reset payload hash to sentinel value
    lastNodeHash = Long.MAX_VALUE;
  }

  /**
   * Sentinel value {@link Long.MAX_VALUE} is necessary in order to avoid
   * equality with nodes composed of '0' values.
   * <p>
   * Use long to avoid collision between sentinel value and payload hashcode.
   * <p>
   * Using payload hashcode seems to be the fastest way for testing node
   * equality. See micro-benchmark {@link NodeEqualityBenchmark}.
   */
  private long lastNodeHash = Long.MAX_VALUE;

  private final VIntPayloadCodec sirenPayload = new VIntPayloadCodec();

  private int nodeFreqInDoc = 0;
  private int termFreqInNode = 0;

  @Override
  public void addPosition(final int position, final BytesRef payload,
                          final int startOffset, final int endOffset)
  throws IOException {
    assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    // we always receive node ids in the payload
    assert payload != null;

    // decode payload
    sirenPayload.decode(payload);
    final IntsRef node = sirenPayload.getNode();

    // check if we received the same node
    // TODO: we pay the cost of decoding the node before testing the equality
    // we could instead directly compute the node hash based on the byte array
    final int nodeHash = node.hashCode();
    if (lastNodeHash != nodeHash) { // if different node
      // add term freq for previous node if not first payload.
      if (lastNodeHash != Long.MAX_VALUE) {
        this.addTermFreqInNode();
      }
      // add new node
      this.addNode(node);
    }
    lastNodeHash = nodeHash;

    // add position
    this.addPosition(sirenPayload.getPosition());
  }

  private void addNode(final IntsRef node) {
    nodWriter.write(node);
    nodeFreqInDoc++;
    // reset current position for delta computation
    posWriter.resetCurrentPosition();
  }

  private void addPosition(final int position) {
    posWriter.write(position);
    termFreqInNode++;
  }

  private void addNodeFreqInDoc() {
    docWriter.writeNodeFreq(nodeFreqInDoc);
    nodeFreqInDoc = 0;
  }

  private void addTermFreqInNode() {
    nodWriter.writeTermFreq(termFreqInNode);
    termFreqInNode = 0;
  }

  @Override
  public void finishDoc() {
    this.addNodeFreqInDoc();
    this.addTermFreqInNode();
  }

  private static class PendingTerm {

    public final BlockIndexOutput.Index docIndex;
    public final long skipFP;
    public final int blockCount;

    public PendingTerm(final BlockIndexOutput.Index docIndex,
                       final long skipFP, final int blockCount) {
      this.docIndex = docIndex;
      this.skipFP = skipFP;
      this.blockCount = blockCount;
    }
  }

  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();

  /**
   * Called when we are done adding docs to this term
   */
  @Override
  public void finishTerm(final TermStats stats) throws IOException {
    assert stats.docFreq > 0;

    // if block flush pending, write last skip data
    if (!docWriter.isEmpty() && (++blockCount % blockSkipInterval) == 0) {
      skipWriter.setSkipData(docWriter.getFirstDocId());
      skipWriter.bufferSkip(blockCount);
    }

    // flush doc block
    docWriter.flush();
    final BlockIndexOutput.Index docIndexCopy = docOut.index();
    docIndexCopy.copyFrom(docIndex, false);

    // flush node block
    nodWriter.flush();
    final BlockIndexOutput.Index nodIndexCopy = nodOut.index();
    nodIndexCopy.copyFrom(nodIndex, false);

    // flush pos block
    posWriter.flush();
    final BlockIndexOutput.Index posIndexCopy = posOut.index();
    posIndexCopy.copyFrom(posIndex, false);

    // Write skip data to the output file
    final long skipFP;
    if (blockCount >= blockSkipMinimum) {
      skipFP = skipOut.getFilePointer();
      skipWriter.writeSkip(skipOut);
    }
    else {
      skipFP = -1;
    }

    pendingTerms.add(new PendingTerm(docIndexCopy, skipFP, blockCount));

    // reset block counter
    blockCount = 0;
  }

  @Override
  public void flushTermsBlock(final int start, final int count) throws IOException {
    // logger.debug("flushTermsBlock: {}", this.hashCode());
    assert indexBytesWriter.getFilePointer() == 0;
    final int absStart = pendingTerms.size() - start;
    final List<PendingTerm> slice = pendingTerms.subList(absStart, absStart+count);

    long lastSkipFP = 0;

    if (count == 0) {
      termsOut.writeByte((byte) 0);
      return;
    }

    final PendingTerm firstTerm = slice.get(0);
    final BlockIndexOutput.Index docIndexFlush = firstTerm.docIndex;

    for (int idx = 0; idx < slice.size(); idx++) {
      final boolean isFirstTerm = idx == 0;
      final PendingTerm t = slice.get(idx);

      // write block count stat
      // logger.debug("Write blockCount: {}", t.blockCount);
      indexBytesWriter.writeVInt(t.blockCount);

      docIndexFlush.copyFrom(t.docIndex, false);
      // logger.debug("Write docIndex: {}", docIndexFlush);
      docIndexFlush.write(indexBytesWriter, isFirstTerm);

      if (t.skipFP != -1) {
        if (isFirstTerm) {
          indexBytesWriter.writeVLong(t.skipFP);
        }
        else {
          indexBytesWriter.writeVLong(t.skipFP - lastSkipFP);
        }
        lastSkipFP = t.skipFP;
      }
    }

    termsOut.writeVLong((int) indexBytesWriter.getFilePointer());
    indexBytesWriter.writeTo(termsOut);
    indexBytesWriter.reset();
    slice.clear();
  }

  @Override
  public void close() throws IOException {
    IOUtils.close(docOut, skipOut, nodOut, posOut);
  }

  private final MappingMultiDocsNodesAndPositionsEnum postingsEnum = new MappingMultiDocsNodesAndPositionsEnum();

  /**
   * Default merge impl: append documents, nodes and positions, mapping around
   * deletes.
   * <p>
   * Bypass the {@link Siren10PostingsWriter} methods and work directly with
   * the BlockWriters for maximum efficiency.
   * <p>
   * TODO - Optimisation: If document blocks match the block size, and no
   * document deleted, then it would be possible to copy block directly as byte
   * array, avoiding decoding and encoding.
   **/
  @Override
  public TermStats merge(final MergeState mergeState, final DocsEnum postings,
                         final FixedBitSet visitedDocs)
  throws IOException {
    int df = 0;
    long totTF = 0;

    postingsEnum.setMergeState(mergeState);
    postingsEnum.reset((MappingMultiDocsAndPositionsEnum) postings);

    while (postingsEnum.nextDocument()) {
      final int doc = postingsEnum.doc();
      visitedDocs.set(doc);

      this.startDoc(doc, -1);

      final int nodeFreq = postingsEnum.nodeFreqInDoc();
      docWriter.writeNodeFreq(nodeFreq);

      while (postingsEnum.nextNode()) {
        final IntsRef node = postingsEnum.node();
        nodWriter.write(node);

        final int termFreqInNode = postingsEnum.termFreqInNode();
        nodWriter.writeTermFreq(termFreqInNode);

        // reset current position for delta computation
        posWriter.resetCurrentPosition();

        while (postingsEnum.nextPosition()) {
          final int position = postingsEnum.pos();
          posWriter.write(position);
          totTF++;
        }
      }
      df++;
    }

    return new TermStats(df, totTF);
  }

}