package org.apache.lucene.index.codecs.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;

import org.apache.lucene.store.Directory;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;

/** Concrete class that reads the current doc/freq/skip
 *  postings format.
 *
 *  @lucene.experimental */
public class StandardPostingsReaderImpl extends StandardPostingsReader {

  // Shared inputs for the segment's .frq and .prx files; each enum
  // clones these so concurrent readers do not share file positions.
  private final IndexInput freqIn;
  private final IndexInput proxIn;     // null when the segment has no prox data

  // Read from the terms dict header in init(): how often skip data was
  // written, and how many skip levels exist.
  int skipInterval;
  int maxSkipLevels;

  /**
   * Opens the frequency (and, if present, proximity) postings files for
   * the given segment.
   *
   * @param dir            directory holding the segment files
   * @param segmentInfo    segment to open
   * @param readBufferSize buffer size passed to {@link Directory#openInput}
   * @throws IOException if either file cannot be opened
   */
  public StandardPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize) throws IOException {
    freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.FREQ_EXTENSION),
                           readBufferSize);
    if (segmentInfo.getHasProx()) {
      boolean success = false;
      try {
        proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.PROX_EXTENSION),
                               readBufferSize);
        success = true;
      } finally {
        // Don't leak the already-open freq file if the prox open fails
        if (!success) {
          freqIn.close();
        }
      }
    } else {
      proxIn = null;
    }
  }

  /** Adds the postings files this codec owns for the segment to {@code files}. */
  public static void files(Directory dir, SegmentInfo segmentInfo, Collection<String> files) throws IOException {
    files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.FREQ_EXTENSION));
    if (segmentInfo.getHasProx()) {
      files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.PROX_EXTENSION));
    }
  }

  /**
   * Verifies the terms dict was written by a matching postings writer and
   * reads the skip parameters it recorded.
   */
  @Override
  public void init(IndexInput termsIn) throws IOException {
    // Make sure we are talking to the matching past writer
    CodecUtil.checkHeader(termsIn, StandardPostingsWriterImpl.CODEC,
      StandardPostingsWriterImpl.VERSION_START, StandardPostingsWriterImpl.VERSION_START);
    skipInterval = termsIn.readInt();
    maxSkipLevels = termsIn.readInt();
  }

  /**
   * Per-term state: file pointers into the freq/prox streams plus the
   * offset of this term's skip data (relative to freqOffset).
   */
  private static class DocTermState extends TermState {
    long freqOffset;   // absolute file pointer into the .frq stream
    long proxOffset;   // absolute file pointer into the .prx stream
    int skipOffset;    // offset of skip data from freqOffset; 0 if none

    public Object clone() {
      DocTermState other = (DocTermState) super.clone();
      other.freqOffset = freqOffset;
      other.proxOffset = proxOffset;
      other.skipOffset = skipOffset;
      return other;
    }

    public void copy(TermState _other) {
      super.copy(_other);
      DocTermState other = (DocTermState) _other;
      freqOffset = other.freqOffset;
      proxOffset = other.proxOffset;
      skipOffset = other.skipOffset;
    }

    public String toString() {
      return super.toString() + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " skipOffset=" + skipOffset;
    }
  }

  @Override
  public TermState newTermState() {
    return new DocTermState();
  }

  /** Closes both postings files; proxIn is closed even if freqIn.close() throws. */
  @Override
  public void close() throws IOException {
    try {
      if (freqIn != null) {
        freqIn.close();
      }
    } finally {
      if (proxIn != null) {
        proxIn.close();
      }
    }
  }

  /**
   * Decodes the metadata for one term from the terms dict into
   * {@code termState}.  Index terms store absolute file pointers; other
   * terms store deltas from the previously read term, so this must be
   * called sequentially.
   */
  @Override
  public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm)
    throws IOException {

    final DocTermState docTermState = (DocTermState) termState;

    if (isIndexTerm) {
      docTermState.freqOffset = termsIn.readVLong();   // absolute
    } else {
      docTermState.freqOffset += termsIn.readVLong();  // delta from previous term
    }

    // Skip data is only written for terms with >= skipInterval docs
    if (docTermState.docFreq >= skipInterval) {
      docTermState.skipOffset = termsIn.readVInt();
    } else {
      docTermState.skipOffset = 0;
    }

    if (!fieldInfo.omitTermFreqAndPositions) {
      if (isIndexTerm) {
        docTermState.proxOffset = termsIn.readVLong();
      } else {
        docTermState.proxOffset += termsIn.readVLong();
      }
    }
  }

  /** Returns a (possibly reused) docs enum positioned at the term in {@code termState}. */
  @Override
  public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
    SegmentDocsEnum docsEnum;
    if (reuse == null || !(reuse instanceof SegmentDocsEnum)) {
      docsEnum = new SegmentDocsEnum(freqIn);
    } else {
      docsEnum = (SegmentDocsEnum) reuse;
      if (docsEnum.startFreqIn != freqIn) {
        // If you are using ParallelReader, and pass in a
        // reused DocsEnum, it could have come from another
        // reader also using standard codec
        docsEnum = new SegmentDocsEnum(freqIn);
      }
    }
    return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
  }

  /**
   * Returns a (possibly reused) docs-and-positions enum, or null if the
   * field omits term freq/positions.
   */
  @Override
  public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs,
                                               DocsAndPositionsEnum reuse) throws IOException {
    if (fieldInfo.omitTermFreqAndPositions) {
      return null;
    }
    SegmentDocsAndPositionsEnum docsEnum;
    if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) {
      docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
    } else {
      docsEnum = (SegmentDocsAndPositionsEnum) reuse;
      if (docsEnum.startFreqIn != freqIn) {
        // If you are using ParallelReader, and pass in a
        // reused DocsEnum, it could have come from another
        // reader also using standard codec
        docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
      }
    }
    return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
  }

  // Decodes only docs
  private class SegmentDocsEnum extends DocsEnum {
    final IndexInput freqIn;        // private clone; owns its own file position
    final IndexInput startFreqIn;   // identity of the shared input, used to validate reuse

    boolean omitTF;                 // does current field omit term freq?
    boolean storePayloads;          // does current field store payloads?
    int limit;                      // number of docs in this posting
    int ord;                        // how many docs we've read
    int doc;                        // doc we last read
    int freq;                       // freq we last read

    Bits skipDocs;

    long freqOffset;
    int skipOffset;

    boolean skipped;                // true once skip data has been loaded for this term
    DefaultSkipListReader skipper;  // lazily created on first advance()

    public SegmentDocsEnum(IndexInput freqIn) throws IOException {
      startFreqIn = freqIn;
      this.freqIn = (IndexInput) freqIn.clone();
    }

    /** Re-targets this enum at a new term; cheap, so enums are freely reused. */
    public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
      omitTF = fieldInfo.omitTermFreqAndPositions;
      if (omitTF) {
        freq = 1;   // freq is implicitly 1 when the field omits term freqs
      }
      storePayloads = fieldInfo.storePayloads;
      this.skipDocs = skipDocs;
      freqOffset = termState.freqOffset;
      skipOffset = termState.skipOffset;

      // TODO: for full enum case (eg segment merging) this
      // seek is unnecessary; maybe we can avoid in such
      // cases
      freqIn.seek(termState.freqOffset);
      limit = termState.docFreq;
      ord = 0;
      doc = 0;

      skipped = false;

      return this;
    }

    @Override
    public int nextDoc() throws IOException {
      while(true) {
        if (ord == limit) {
          return doc = NO_MORE_DOCS;
        }

        ord++;

        // Decode next doc/freq pair
        final int code = freqIn.readVInt();
        if (omitTF) {
          doc += code;          // doc deltas only, no freq encoded
        } else {
          doc += code >>> 1;              // shift off low bit
          if ((code & 1) != 0) {          // if low bit is set
            freq = 1;                     // freq is one
          } else {
            freq = freqIn.readVInt();     // else read freq
          }
        }

        if (skipDocs == null || !skipDocs.get(doc)) {
          break;
        }
      }

      return doc;
    }

    /** Bulk-read variant: fills the shared bulkResult arrays, returns count read. */
    @Override
    public int read() throws IOException {
      final int[] docs = bulkResult.docs.ints;
      final int[] freqs = bulkResult.freqs.ints;
      int i = 0;
      final int length = docs.length;
      while (i < length && ord < limit) {
        ord++;
        // manually inlined call to next() for speed
        final int code = freqIn.readVInt();
        if (omitTF) {
          doc += code;
        } else {
          doc += code >>> 1;              // shift off low bit
          if ((code & 1) != 0) {          // if low bit is set
            freq = 1;                     // freq is one
          } else {
            freq = freqIn.readVInt();     // else read freq
          }
        }

        if (skipDocs == null || !skipDocs.get(doc)) {
          docs[i] = doc;
          freqs[i] = freq;
          ++i;
        }
      }

      return i;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int freq() {
      return freq;
    }

    /** Uses the term's skip list (when present) to jump near target, then scans. */
    @Override
    public int advance(int target) throws IOException {

      // TODO: jump right to next() if target is < X away
      // from where we are now?

      if (skipOffset > 0) {

        // There are enough docs in the posting to have
        // skip data

        if (skipper == null) {
          // This is the first time this enum has ever been used for skipping -- do lazy init
          skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval);
        }

        if (!skipped) {

          // This is the first time this posting has
          // skipped since reset() was called, so now we
          // load the skip data for this posting

          skipper.init(freqOffset + skipOffset,
                       freqOffset, 0,
                       limit, storePayloads);

          skipped = true;
        }

        final int newOrd = skipper.skipTo(target);

        if (newOrd > ord) {
          // Skipper moved
          ord = newOrd;
          doc = skipper.getDoc();
          freqIn.seek(skipper.getFreqPointer());
        }
      }

      // scan for the rest:
      do {
        nextDoc();
      } while (target > doc);

      return doc;
    }
  }

  // Decodes docs & positions
  private class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
    final IndexInput startFreqIn;     // identity of the shared input, used to validate reuse
    private final IndexInput freqIn;  // private clones; own their own file positions
    private final IndexInput proxIn;

    boolean storePayloads;            // does current field store payloads?

    int limit;                        // number of docs in this posting
    int ord;                          // how many docs we've read
    int doc;                          // doc we last read
    int freq;                         // freq we last read
    int position;

    Bits skipDocs;

    long freqOffset;
    int skipOffset;
    long proxOffset;

    int posPendingCount;              // positions not yet consumed for docs read so far
    int payloadLength;
    boolean payloadPending;           // last decoded position has an unread payload

    boolean skipped;
    DefaultSkipListReader skipper;
    private BytesRef payload;
    private long lazyProxPointer;     // prox seek target, deferred until nextPosition()

    public SegmentDocsAndPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
      startFreqIn = freqIn;
      this.freqIn = (IndexInput) freqIn.clone();
      this.proxIn = (IndexInput) proxIn.clone();
    }

    /** Re-targets this enum at a new term; cheap, so enums are freely reused. */
    public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs)
      throws IOException {
      assert !fieldInfo.omitTermFreqAndPositions;
      storePayloads = fieldInfo.storePayloads;
      if (storePayloads && payload == null) {
        payload = new BytesRef();
        payload.bytes = new byte[1];
      }

      this.skipDocs = skipDocs;

      // TODO: for full enum case (eg segment merging) this
      // seek is unnecessary; maybe we can avoid in such
      // cases
      freqIn.seek(termState.freqOffset);
      lazyProxPointer = termState.proxOffset;

      limit = termState.docFreq;
      ord = 0;
      doc = 0;
      position = 0;

      skipped = false;
      posPendingCount = 0;
      payloadPending = false;

      freqOffset = termState.freqOffset;
      proxOffset = termState.proxOffset;
      skipOffset = termState.skipOffset;

      return this;
    }

    @Override
    public int nextDoc() throws IOException {
      while(true) {
        if (ord == limit) {
          return doc = NO_MORE_DOCS;
        }

        ord++;

        // Decode next doc/freq pair
        final int code = freqIn.readVInt();

        doc += code >>> 1;              // shift off low bit
        if ((code & 1) != 0) {          // if low bit is set
          freq = 1;                     // freq is one
        } else {
          freq = freqIn.readVInt();     // else read freq
        }
        // Positions for deleted docs are skipped lazily in nextPosition()
        posPendingCount += freq;

        if (skipDocs == null || !skipDocs.get(doc)) {
          break;
        }
      }

      position = 0;

      return doc;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int freq() {
      return freq;
    }

    /** Uses the term's skip list (when present) to jump near target, then scans. */
    @Override
    public int advance(int target) throws IOException {

      // TODO: jump right to next() if target is < X away
      // from where we are now?

      if (skipOffset > 0) {

        // There are enough docs in the posting to have
        // skip data

        if (skipper == null) {
          // This is the first time this enum has ever been used for skipping -- do lazy init
          skipper = new DefaultSkipListReader((IndexInput) freqIn.clone(), maxSkipLevels, skipInterval);
        }

        if (!skipped) {

          // This is the first time this posting has
          // skipped, since reset() was called, so now we
          // load the skip data for this posting

          skipper.init(freqOffset+skipOffset,
                       freqOffset, proxOffset,
                       limit, storePayloads);

          skipped = true;
        }

        final int newOrd = skipper.skipTo(target);

        if (newOrd > ord) {
          // Skipper moved
          ord = newOrd;
          doc = skipper.getDoc();
          freqIn.seek(skipper.getFreqPointer());
          lazyProxPointer = skipper.getProxPointer();
          // Skip data records positions up to the skip point, so nothing is pending
          posPendingCount = 0;
          position = 0;
          payloadPending = false;
          payloadLength = skipper.getPayloadLength();
        }
      }

      // Now, linear scan for the rest:
      do {
        nextDoc();
      } while (target > doc);

      return doc;
    }

    /**
     * Returns the next position for the current doc, decoding (and
     * discarding) any positions still pending from docs that were
     * iterated without their positions being read.
     */
    public int nextPosition() throws IOException {

      if (lazyProxPointer != -1) {
        proxIn.seek(lazyProxPointer);
        lazyProxPointer = -1;
      }

      if (payloadPending && payloadLength > 0) {
        // payload of last position was never retrieved -- skip it
        proxIn.seek(proxIn.getFilePointer() + payloadLength);
        payloadPending = false;
      }

      // scan over any docs that were iterated without their positions
      while(posPendingCount > freq) {

        final int code = proxIn.readVInt();

        if (storePayloads) {
          if ((code & 1) != 0) {
            // new payload length
            payloadLength = proxIn.readVInt();
            assert payloadLength >= 0;
          }
          assert payloadLength != -1;
          proxIn.seek(proxIn.getFilePointer() + payloadLength);
        }

        posPendingCount--;
        position = 0;
        payloadPending = false;
      }

      // read next position
      if (storePayloads) {

        if (payloadPending && payloadLength > 0) {
          // payload wasn't retrieved for last position
          // NOTE(review): payloadPending was already cleared above before this
          // point, so this branch looks unreachable -- confirm before removing
          proxIn.seek(proxIn.getFilePointer()+payloadLength);
        }

        final int code = proxIn.readVInt();
        if ((code & 1) != 0) {
          // new payload length
          payloadLength = proxIn.readVInt();
          assert payloadLength >= 0;
        }
        assert payloadLength != -1;

        payloadPending = true;
        position += code >>> 1;   // low bit flags a payload-length change
      } else {
        position += proxIn.readVInt();
      }

      posPendingCount--;

      assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount;

      return position;
    }

    /** Returns length of payload at current position */
    public int getPayloadLength() {
      assert lazyProxPointer == -1;
      assert posPendingCount < freq;
      return payloadLength;
    }

    /** Returns the payload at this position, or null if no
     *  payload was indexed. */
    public BytesRef getPayload() throws IOException {
      assert lazyProxPointer == -1;
      assert posPendingCount < freq;
      if (!payloadPending) {
        throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
      }
      if (payloadLength > payload.bytes.length) {
        payload.grow(payloadLength);
      }
      proxIn.readBytes(payload.bytes, 0, payloadLength);
      payload.length = payloadLength;
      payloadPending = false;   // a payload may only be fetched once per position
      return payload;
    }

    public boolean hasPayload() {
      return payloadPending && payloadLength > 0;
    }
  }
}