package org.apache.lucene.codecs.lucene40;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** Consumes doc & freq, writing them using the current
 *  index file format */

import java.io.IOException;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * Concrete class that writes the 4.0 frq/prx postings format.
 *
 * @see Lucene40PostingsFormat
 * @lucene.experimental
 */
public final class Lucene40PostingsWriter extends PushPostingsWriterBase {

  final IndexOutput freqOut;
  final IndexOutput proxOut;
  final Lucene40SkipListWriter skipListWriter;

  /** Expert: The fraction of TermDocs entries stored in skip tables,
   *  used to accelerate {@link DocsEnum#advance(int)}.  Larger values result in
   *  smaller indexes, greater acceleration, but fewer accelerable cases, while
   *  smaller values result in bigger indexes, less acceleration and more
   *  accelerable cases. More detailed experiments would be useful here. */
  static final int DEFAULT_SKIP_INTERVAL = 16;

  final int skipInterval;
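
  // For example: with the default skipInterval of 16, startDoc() buffers a
  // skip entry each time another 16 docs have been written for the current
  // term (df = 16, 32, 48, ...), so a term matching 4096 docs carries 256
  // lowest-level skip entries that DocsEnum.advance(int) can use to jump 16
  // postings at a time instead of decoding every VInt.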

  /**
   * Expert: minimum docFreq to write any skip data at all
   */
  final int skipMinimum;

  /** Expert: The maximum number of skip levels. Smaller values result in
   *  slightly smaller indexes, but slower skipping in big posting lists. */
  final int maxSkipLevels = 10;

  final int totalNumDocs;

  // Starts a new term
  long freqStart;
  long proxStart;
  int lastPayloadLength;
  int lastOffsetLength;
  int lastPosition;
  int lastOffset;

  final static StandardTermState emptyState = new StandardTermState();
  StandardTermState lastState;

  // private String segment;

  /** Creates a {@link Lucene40PostingsWriter}, with the
   *  {@link #DEFAULT_SKIP_INTERVAL}. */
  public Lucene40PostingsWriter(SegmentWriteState state) throws IOException {
    this(state, DEFAULT_SKIP_INTERVAL);
  }

  /** Creates a {@link Lucene40PostingsWriter}, with the
   *  specified {@code skipInterval}. */
  public Lucene40PostingsWriter(SegmentWriteState state, int skipInterval) throws IOException {
    super();
    this.skipInterval = skipInterval;
    this.skipMinimum = skipInterval; /* set to the same for now */
    // this.segment = state.segmentName;
    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
    freqOut = state.directory.createOutput(fileName, state.context);
    boolean success = false;
    IndexOutput proxOut = null;
    try {
      CodecUtil.writeHeader(freqOut, Lucene40PostingsReader.FRQ_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
      // TODO: this is a best effort, if one of these fields has no postings
      // then we make an empty prx file, same as if we are wrapped in
      // per-field postingsformat. maybe... we shouldn't
      // bother w/ this opto?  just create empty prx file...?
      if (state.fieldInfos.hasProx()) {
        // At least one field does not omit TF, so create the
        // prox file
        fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
        proxOut = state.directory.createOutput(fileName, state.context);
        CodecUtil.writeHeader(proxOut, Lucene40PostingsReader.PRX_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
      } else {
        // Every field omits TF so we will write no prox file
        proxOut = null;
      }
      this.proxOut = proxOut;
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(freqOut, proxOut);
      }
    }

    totalNumDocs = state.segmentInfo.getDocCount();

    skipListWriter = new Lucene40SkipListWriter(skipInterval,
                                                maxSkipLevels,
                                                totalNumDocs,
                                                freqOut,
                                                proxOut);
  }

  @Override
  public void init(IndexOutput termsOut) throws IOException {
    CodecUtil.writeHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
    termsOut.writeInt(skipInterval);    // write skipInterval
    termsOut.writeInt(maxSkipLevels);   // write maxSkipLevels
    termsOut.writeInt(skipMinimum);     // write skipMinimum
  }

  @Override
  public BlockTermState newTermState() {
    return new StandardTermState();
  }

  @Override
  public void startTerm() {
    freqStart = freqOut.getFilePointer();
    //if (DEBUG) System.out.println("SPW: startTerm freqOut.fp=" + freqStart);
    if (proxOut != null) {
      proxStart = proxOut.getFilePointer();
    }
    // force first payload to write its length
    lastPayloadLength = -1;
    // force first offset to write its length
    lastOffsetLength = -1;
    skipListWriter.resetSkip();
  }

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public int setField(FieldInfo fieldInfo) {
    super.setField(fieldInfo);
    //System.out.println("SPW: setField");
    /*
    if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) {
      DEBUG = true;
    } else {
      DEBUG = false;
    }
    */
    lastState = emptyState;
    //System.out.println("  set init blockFreqStart=" + freqStart);
    //System.out.println("  set init blockProxStart=" + proxStart);
    return 0;
  }

  int lastDocID;
  int df;

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    // if (DEBUG) System.out.println("SPW:   startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());

    final int delta = docID - lastDocID;

    if (docID < 0 || (df > 0 && delta <= 0)) {
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + ") (freqOut: " + freqOut + ")");
    }

    if ((++df % skipInterval) == 0) {
      skipListWriter.setSkipData(lastDocID, writePayloads, lastPayloadLength, writeOffsets, lastOffsetLength);
      skipListWriter.bufferSkip(df);
    }
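
    // The freq encoding written below packs the common tf==1 case into the
    // doc delta's low bit: e.g. delta=5, tf=1 is a single VInt (5<<1)|1 = 11,
    // while delta=2, tf=3 is written as 2<<1 = 4 followed by the freq 3.
    // DOCS_ONLY fields store the plain delta with no freq at all.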
    assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;

    lastDocID = docID;
    if (indexOptions == IndexOptions.DOCS_ONLY) {
      freqOut.writeVInt(delta);
    } else if (1 == termDocFreq) {
      freqOut.writeVInt((delta<<1) | 1);
    } else {
      freqOut.writeVInt(delta<<1);
      freqOut.writeVInt(termDocFreq);
    }

    lastPosition = 0;
    lastOffset = 0;
  }

  /** Add a new position & payload */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    //if (DEBUG) System.out.println("SPW:     addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
    assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0: "invalid indexOptions: " + indexOptions;
    assert proxOut != null;

    final int delta = position - lastPosition;

    assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)

    lastPosition = position;

    int payloadLength = 0;

    if (writePayloads) {
      payloadLength = payload == null ? 0 : payload.length;

      if (payloadLength != lastPayloadLength) {
        // payload length changed: set the low bit and write the new length
        lastPayloadLength = payloadLength;
        proxOut.writeVInt((delta<<1)|1);
        proxOut.writeVInt(payloadLength);
      } else {
        // same payload length as before: the length is implicit
        proxOut.writeVInt(delta << 1);
      }
    } else {
      proxOut.writeVInt(delta);
    }

    if (writeOffsets) {
      // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
      // and the numbers aren't that much smaller anyways.
      int offsetDelta = startOffset - lastOffset;
      int offsetLength = endOffset - startOffset;
      assert offsetDelta >= 0 && offsetLength >= 0 : "startOffset=" + startOffset + ",lastOffset=" + lastOffset + ",endOffset=" + endOffset;
      if (offsetLength != lastOffsetLength) {
        proxOut.writeVInt(offsetDelta << 1 | 1);
        proxOut.writeVInt(offsetLength);
      } else {
        proxOut.writeVInt(offsetDelta << 1);
      }
      lastOffset = startOffset;
      lastOffsetLength = offsetLength;
    }

    if (payloadLength > 0) {
      proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
    }
  }

  @Override
  public void finishDoc() {
  }

  private static class StandardTermState extends BlockTermState {
    public long freqStart;
    public long proxStart;
    public long skipOffset;
  }

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(BlockTermState _state) throws IOException {
    StandardTermState state = (StandardTermState) _state;
    // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
    assert state.docFreq > 0;

    // TODO: wasteful we are counting this (counting # docs
    // for this term) in two places?
    assert state.docFreq == df;
    state.freqStart = freqStart;
    state.proxStart = proxStart;
    if (df >= skipMinimum) {
      state.skipOffset = skipListWriter.writeSkip(freqOut) - freqStart;
    } else {
      state.skipOffset = -1;
    }
    lastDocID = 0;
    df = 0;
  }

  @Override
  public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
    StandardTermState state = (StandardTermState) _state;
    if (absolute) {
      // encode this term's file pointers as absolute values rather than
      // deltas against the previous term
      lastState = emptyState;
    }
    out.writeVLong(state.freqStart - lastState.freqStart);
    if (state.skipOffset != -1) {
      assert state.skipOffset > 0;
      out.writeVLong(state.skipOffset);
    }
    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
      out.writeVLong(state.proxStart - lastState.proxStart);
    }
    lastState = state;
  }

  @Override
  public void close() throws IOException {
    try {
      freqOut.close();
    } finally {
      if (proxOut != null) {
        proxOut.close();
      }
    }
  }
}
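
// Worked example of the .prx encoding in addPosition(), for illustration:
// with payloads enabled, a position delta of 4 whose payload length differs
// from the previous one is written as (4<<1)|1 = 9 followed by the new
// length, while a delta of 4 reusing the previous length is written as
// 4<<1 = 8 with no length. Offsets apply the same low-bit trick to
// startOffset deltas, and startTerm() resets lastPayloadLength and
// lastOffsetLength to -1 so the first position of each term always writes
// explicit lengths.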