PulsingPostingsWriter.java example

Explorer
solr-analytics-master
- lucene
- solr
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;

// TODO: we now inline based on total TF of the term,
// but it might be better to inline by "net bytes used"
// so that a term that has only 1 posting but a huge
// payload would not be inlined.  Though this is
// presumably rare in practice...

/** 
 * Writer for the pulsing format. 
 * <p>
 * Wraps another postings implementation and decides 
 * (based on total number of occurrences), whether a terms 
 * postings should be inlined into the term dictionary,
 * or passed through to the wrapped writer.
 *
 * @lucene.experimental */
public final class PulsingPostingsWriter extends PostingsWriterBase {

  final static String CODEC = "PulsedPostingsWriter";

  // To add a new version, increment from the last one, and
  // change VERSION_CURRENT to point to your new version:
  final static int VERSION_START = 0;

  final static int VERSION_CURRENT = VERSION_START;

  private IndexOutput termsOut;

  private IndexOptions indexOptions;
  private boolean storePayloads;

  private static class PendingTerm {
    private final byte[] bytes;
    public PendingTerm(byte[] bytes) {
      this.bytes = bytes;
    }
  }

  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();

  // one entry per position
  private final Position[] pending;
  private int pendingCount = 0;                           // -1 once we've hit too many positions
  private Position currentDoc;                    // first Position entry of current doc

  private static final class Position {
    BytesRef payload;
    int termFreq;                                 // only incremented on first position for a given doc
    int pos;
    int docID;
    int startOffset;
    int endOffset;
  }

  // TODO: -- lazy init this?  ie, if every single term
  // was inlined (eg for a "primary key" field) then we
  // never need to use this fallback?  Fallback writer for
  // non-inlined terms:
  final PostingsWriterBase wrappedPostingsWriter;

  /** If the total number of positions (summed across all docs
   *  for this term) is <= maxPositions, then the postings are
   *  inlined into terms dict */
  public PulsingPostingsWriter(int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
    pending = new Position[maxPositions];
    for(int i=0;i<maxPositions;i++) {
      pending[i] = new Position();
    }

    // We simply wrap another postings writer, but only call
    // on it when tot positions is >= the cutoff:
    this.wrappedPostingsWriter = wrappedPostingsWriter;
  }

  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.writeVInt(pending.length); // encode maxPositions in header
    wrappedPostingsWriter.start(termsOut);
  }

  @Override
  public void startTerm() {
    //if (DEBUG) System.out.println("PW   startTerm");
    assert pendingCount == 0;
  }

  // TODO: -- should we NOT reuse across fields?  would
  // be cleaner

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public void setField(FieldInfo fieldInfo) {
    this.indexOptions = fieldInfo.getIndexOptions();
    //if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
    storePayloads = fieldInfo.hasPayloads();
    wrappedPostingsWriter.setField(fieldInfo);
    //DEBUG = BlockTreeTermsWriter.DEBUG;
  }

  private boolean DEBUG;

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    assert docID >= 0: "got docID=" + docID;

    /*
    if (termID != -1) {
      if (docID == 0) {
        baseDocID = termID;
      } else if (baseDocID + docID != termID) {
        throw new RuntimeException("WRITE: baseDocID=" + baseDocID + " docID=" + docID + " termID=" + termID);
      }
    }
    */

    //if (DEBUG) System.out.println("PW     doc=" + docID);

    if (pendingCount == pending.length) {
      push();
      //if (DEBUG) System.out.println("PW: wrapped.finishDoc");
      wrappedPostingsWriter.finishDoc();
    }

    if (pendingCount != -1) {
      assert pendingCount < pending.length;
      currentDoc = pending[pendingCount];
      currentDoc.docID = docID;
      if (indexOptions == IndexOptions.DOCS_ONLY) {
        pendingCount++;
      } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { 
        pendingCount++;
        currentDoc.termFreq = termDocFreq;
      } else {
        currentDoc.termFreq = termDocFreq;
      }
    } else {
      // We've already seen too many docs for this term --
      // just forward to our fallback writer
      wrappedPostingsWriter.startDoc(docID, termDocFreq);
    }
  }

  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {

    //if (DEBUG) System.out.println("PW       pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));
    if (pendingCount == pending.length) {
      push();
    }

    if (pendingCount == -1) {
      // We've already seen too many docs for this term --
      // just forward to our fallback writer
      wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
    } else {
      // buffer up
      final Position pos = pending[pendingCount++];
      pos.pos = position;
      pos.startOffset = startOffset;
      pos.endOffset = endOffset;
      pos.docID = currentDoc.docID;
      if (payload != null && payload.length > 0) {
        if (pos.payload == null) {
          pos.payload = BytesRef.deepCopyOf(payload);
        } else {
          pos.payload.copyBytes(payload);
        }
      } else if (pos.payload != null) {
        pos.payload.length = 0;
      }
    }
  }

  @Override
  public void finishDoc() throws IOException {
    // if (DEBUG) System.out.println("PW     finishDoc");
    if (pendingCount == -1) {
      wrappedPostingsWriter.finishDoc();
    }
  }

  private final RAMOutputStream buffer = new RAMOutputStream();

  // private int baseDocID;

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(TermStats stats) throws IOException {
    // if (DEBUG) System.out.println("PW   finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size());

    assert pendingCount > 0 || pendingCount == -1;

    if (pendingCount == -1) {
      wrappedPostingsWriter.finishTerm(stats);
      // Must add null entry to record terms that our
      // wrapped postings impl added
      pendingTerms.add(null);
    } else {

      // There were few enough total occurrences for this
      // term, so we fully inline our postings data into
      // terms dict, now:

      // TODO: it'd be better to share this encoding logic
      // in some inner codec that knows how to write a
      // single doc / single position, etc.  This way if a
      // given codec wants to store other interesting
      // stuff, it could use this pulsing codec to do so

      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
        int lastDocID = 0;
        int pendingIDX = 0;
        int lastPayloadLength = -1;
        int lastOffsetLength = -1;
        while(pendingIDX < pendingCount) {
          final Position doc = pending[pendingIDX];

          final int delta = doc.docID - lastDocID;
          lastDocID = doc.docID;

          // if (DEBUG) System.out.println("  write doc=" + doc.docID + " freq=" + doc.termFreq);

          if (doc.termFreq == 1) {
            buffer.writeVInt((delta<<1)|1);
          } else {
            buffer.writeVInt(delta<<1);
            buffer.writeVInt(doc.termFreq);
          }

          int lastPos = 0;
          int lastOffset = 0;
          for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
            final Position pos = pending[pendingIDX++];
            assert pos.docID == doc.docID;
            final int posDelta = pos.pos - lastPos;
            lastPos = pos.pos;
            // if (DEBUG) System.out.println("    write pos=" + pos.pos);
            final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
            if (storePayloads) {
              if (payloadLength != lastPayloadLength) {
                buffer.writeVInt((posDelta << 1)|1);
                buffer.writeVInt(payloadLength);
                lastPayloadLength = payloadLength;
              } else {
                buffer.writeVInt(posDelta << 1);
              }
            } else {
              buffer.writeVInt(posDelta);
            }
            
            if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
              //System.out.println("write=" + pos.startOffset + "," + pos.endOffset);
              int offsetDelta = pos.startOffset - lastOffset;
              int offsetLength = pos.endOffset - pos.startOffset;
              if (offsetLength != lastOffsetLength) {
                buffer.writeVInt(offsetDelta << 1 | 1);
                buffer.writeVInt(offsetLength);
              } else {
                buffer.writeVInt(offsetDelta << 1);
              }
              lastOffset = pos.startOffset;
              lastOffsetLength = offsetLength;             
            }
            
            if (payloadLength > 0) {
              assert storePayloads;
              buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
            }
          }
        }
      } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
        int lastDocID = 0;
        for(int posIDX=0;posIDX<pendingCount;posIDX++) {
          final Position doc = pending[posIDX];
          final int delta = doc.docID - lastDocID;
          assert doc.termFreq != 0;
          if (doc.termFreq == 1) {
            buffer.writeVInt((delta<<1)|1);
          } else {
            buffer.writeVInt(delta<<1);
            buffer.writeVInt(doc.termFreq);
          }
          lastDocID = doc.docID;
        }
      } else if (indexOptions == IndexOptions.DOCS_ONLY) {
        int lastDocID = 0;
        for(int posIDX=0;posIDX<pendingCount;posIDX++) {
          final Position doc = pending[posIDX];
          buffer.writeVInt(doc.docID - lastDocID);
          lastDocID = doc.docID;
        }
      }

      final byte[] bytes = new byte[(int) buffer.getFilePointer()];
      buffer.writeTo(bytes, 0);
      pendingTerms.add(new PendingTerm(bytes));
      buffer.reset();
    }

    pendingCount = 0;
  }

  @Override
  public void close() throws IOException {
    wrappedPostingsWriter.close();
  }

  @Override
  public void flushTermsBlock(int start, int count) throws IOException {
    // if (DEBUG) System.out.println("PW: flushTermsBlock start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size());
    int wrappedCount = 0;
    assert buffer.getFilePointer() == 0;
    assert start >= count;

    final int limit = pendingTerms.size() - start + count;

    for(int idx=pendingTerms.size()-start; idx<limit; idx++) {
      final PendingTerm term = pendingTerms.get(idx);
      if (term == null) {
        wrappedCount++;
      } else {
        buffer.writeVInt(term.bytes.length);
        buffer.writeBytes(term.bytes, 0, term.bytes.length);
      }
    }

    termsOut.writeVInt((int) buffer.getFilePointer());
    buffer.writeTo(termsOut);
    buffer.reset();

    // TDOO: this could be somewhat costly since
    // pendingTerms.size() could be biggish?
    int futureWrappedCount = 0;
    final int limit2 = pendingTerms.size();
    for(int idx=limit;idx<limit2;idx++) {
      if (pendingTerms.get(idx) == null) {
        futureWrappedCount++;
      }
    }

    // Remove the terms we just wrote:
    pendingTerms.subList(pendingTerms.size()-start, limit).clear();

    // if (DEBUG) System.out.println("PW:   len=" + buffer.getFilePointer() + " fp=" + termsOut.getFilePointer() + " futureWrappedCount=" + futureWrappedCount + " wrappedCount=" + wrappedCount);
    // TODO: can we avoid calling this if all terms
    // were inlined...?  Eg for a "primary key" field, the
    // wrapped codec is never invoked...
    wrappedPostingsWriter.flushTermsBlock(futureWrappedCount+wrappedCount, wrappedCount);
  }

  // Pushes pending positions to the wrapped codec
  private void push() throws IOException {
    // if (DEBUG) System.out.println("PW now push @ " + pendingCount + " wrapped=" + wrappedPostingsWriter);
    assert pendingCount == pending.length;
      
    wrappedPostingsWriter.startTerm();
      
    // Flush all buffered docs
    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
      Position doc = null;
      for(Position pos : pending) {
        if (doc == null) {
          doc = pos;
          // if (DEBUG) System.out.println("PW: wrapped.startDoc docID=" + doc.docID + " tf=" + doc.termFreq);
          wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
        } else if (doc.docID != pos.docID) {
          assert pos.docID > doc.docID;
          // if (DEBUG) System.out.println("PW: wrapped.finishDoc");
          wrappedPostingsWriter.finishDoc();
          doc = pos;
          // if (DEBUG) System.out.println("PW: wrapped.startDoc docID=" + doc.docID + " tf=" + doc.termFreq);
          wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
        }
        // if (DEBUG) System.out.println("PW:   wrapped.addPos pos=" + pos.pos);
        wrappedPostingsWriter.addPosition(pos.pos, pos.payload, pos.startOffset, pos.endOffset);
      }
      //wrappedPostingsWriter.finishDoc();
    } else {
      for(Position doc : pending) {
        wrappedPostingsWriter.startDoc(doc.docID, indexOptions == IndexOptions.DOCS_ONLY ? 0 : doc.termFreq);
      }
    }
    pendingCount = -1;
  }
}