package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Comparator;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;

// TODO: break into separate freq and prox writers as
// codecs; make separate container (tii/tis/skip/*) that can
// be configured as any number of files 1..N
final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implements Comparable<FreqProxTermsWriterPerField> {

  final FreqProxTermsWriter parent;
  final TermsHashPerField termsHashPerField;
  final FieldInfo fieldInfo;
  final DocumentsWriterPerThread.DocState docState;
  final FieldInvertState fieldState;
  private boolean hasFreq;
  private boolean hasProx;
  private boolean hasOffsets;
  PayloadAttribute payloadAttribute;
  OffsetAttribute offsetAttribute;

  public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) {
    this.termsHashPerField = termsHashPerField;
    this.parent = parent;
    this.fieldInfo = fieldInfo;
    docState = termsHashPerField.docState;
    fieldState = termsHashPerField.fieldState;
    setIndexOptions(fieldInfo.getIndexOptions());
  }

  @Override
  int getStreamCount() {
    if (!hasProx) {
      return 1;
    } else {
      return 2;
    }
  }

  @Override
  void finish() {
    if (hasPayloads) {
      fieldInfo.setStorePayloads();
    }
  }

  boolean hasPayloads;

  @Override
  void skippingLongTerm() {}

  public int compareTo(FreqProxTermsWriterPerField other) {
    return fieldInfo.name.compareTo(other.fieldInfo.name);
  }

  // Called after flush
  void reset() {
    // Record, up front, whether our in-RAM format will be
    // with or without term freqs:
    setIndexOptions(fieldInfo.getIndexOptions());
    payloadAttribute = null;
  }

  private void setIndexOptions(IndexOptions indexOptions) {
    if (indexOptions == null) {
      // field could later be updated with indexed=true, so set everything on
      hasFreq = hasProx = hasOffsets = true;
    } else {
      hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
      hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
      hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }
  }

  @Override
  boolean start(IndexableField[] fields, int count) {
    for(int i=0;i<count;i++) {
      if (fields[i].fieldType().indexed()) {
        return true;
      }
    }
    return false;
  }

  @Override
  void start(IndexableField f) {
    if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) {
      payloadAttribute = fieldState.attributeSource.getAttribute(PayloadAttribute.class);
    } else {
      payloadAttribute = null;
    }
    if (hasOffsets) {
      offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
    } else {
      offsetAttribute = null;
    }
  }
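
  // In-RAM postings for each term are written as VInts into per-term byte-slice
  // streams: stream 0 holds doc deltas and term freqs, and stream 1 (only
  // allocated when positions are indexed, see getStreamCount) holds position
  // deltas plus optional payloads and offsets.  writeProx below encodes each
  // position as (positionDelta << 1), setting the low bit when a payload
  // follows (payload length, then the payload bytes).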
  void writeProx(final int termID, int proxCode) {
    //System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
    assert hasProx;
    final BytesRef payload;
    if (payloadAttribute == null) {
      payload = null;
    } else {
      payload = payloadAttribute.getPayload();
    }

    if (payload != null && payload.length > 0) {
      termsHashPerField.writeVInt(1, (proxCode<<1)|1);
      termsHashPerField.writeVInt(1, payload.length);
      termsHashPerField.writeBytes(1, payload.bytes, payload.offset, payload.length);
      hasPayloads = true;
    } else {
      termsHashPerField.writeVInt(1, proxCode<<1);
    }

    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
    postings.lastPositions[termID] = fieldState.position;
  }

  void writeOffsets(final int termID, int offsetAccum) {
    assert hasOffsets;
    final int startOffset = offsetAccum + offsetAttribute.startOffset();
    final int endOffset = offsetAccum + offsetAttribute.endOffset();
    //System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
    assert startOffset - postings.lastOffsets[termID] >= 0;
    termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
    termsHashPerField.writeVInt(1, endOffset - startOffset);

    postings.lastOffsets[termID] = startOffset;
  }

  @Override
  void newTerm(final int termID) {
    // First time we're seeing this term since the last
    // flush
    assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");

    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
    postings.lastDocIDs[termID] = docState.docID;
    if (!hasFreq) {
      postings.lastDocCodes[termID] = docState.docID;
    } else {
      postings.lastDocCodes[termID] = docState.docID << 1;
      postings.termFreqs[termID] = 1;
      if (hasProx) {
        writeProx(termID, fieldState.position);
        if (hasOffsets) {
          writeOffsets(termID, fieldState.offset);
        }
      } else {
        assert !hasOffsets;
      }
    }
    fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
    fieldState.uniqueTermCount++;
  }
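
  // The doc stream is delta-coded lazily: lastDocCodes[termID] buffers the code
  // for the term's current doc and is only written to stream 0 once the term
  // shows up again in a later doc (or at flush time).  With freqs enabled the
  // code is (docDelta << 1); the low bit is set to mean "freq == 1" so the freq
  // VInt can be omitted, otherwise the freq is written as a second VInt.
  // Without freqs the code is just the plain doc delta.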
  @Override
  void addTerm(final int termID) {

    assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");

    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;

    assert !hasFreq || postings.termFreqs[termID] > 0;

    if (!hasFreq) {
      assert postings.termFreqs == null;
      if (docState.docID != postings.lastDocIDs[termID]) {
        assert docState.docID > postings.lastDocIDs[termID];
        termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
        postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
        postings.lastDocIDs[termID] = docState.docID;
        fieldState.uniqueTermCount++;
      }
    } else if (docState.docID != postings.lastDocIDs[termID]) {
      assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
      // Term not yet seen in the current doc but previously
      // seen in other doc(s) since the last flush

      // Now that we know doc freq for previous doc,
      // write it & lastDocCode
      if (1 == postings.termFreqs[termID]) {
        termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
      } else {
        termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
        termsHashPerField.writeVInt(0, postings.termFreqs[termID]);
      }
      postings.termFreqs[termID] = 1;
      fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
      postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
      postings.lastDocIDs[termID] = docState.docID;
      if (hasProx) {
        writeProx(termID, fieldState.position);
        if (hasOffsets) {
          postings.lastOffsets[termID] = 0;
          writeOffsets(termID, fieldState.offset);
        }
      } else {
        assert !hasOffsets;
      }
      fieldState.uniqueTermCount++;
    } else {
      fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
      if (hasProx) {
        writeProx(termID, fieldState.position-postings.lastPositions[termID]);
      }
      if (hasOffsets) {
        writeOffsets(termID, fieldState.offset);
      }
    }
  }

  @Override
  ParallelPostingsArray createPostingsArray(int size) {
    return new FreqProxPostingsArray(size, hasFreq, hasProx, hasOffsets);
  }
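
  // Per-term state is kept in parallel int arrays indexed by termID; the extra
  // arrays are only allocated for the features (freqs, positions, offsets) this
  // field actually indexes, which keeps bytesPerPosting() as small as possible.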
  static final class FreqProxPostingsArray extends ParallelPostingsArray {
    public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
      super(size);
      if (writeFreqs) {
        termFreqs = new int[size];
      }
      lastDocIDs = new int[size];
      lastDocCodes = new int[size];
      if (writeProx) {
        lastPositions = new int[size];
        if (writeOffsets) {
          lastOffsets = new int[size];
        }
      } else {
        assert !writeOffsets;
      }
      //System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
    }

    int termFreqs[];      // # times this term occurs in the current doc
    int lastDocIDs[];     // Last docID where this term occurred
    int lastDocCodes[];   // Code for prior doc
    int lastPositions[];  // Last position where this term occurred
    int lastOffsets[];    // Last startOffset where this term occurred

    @Override
    ParallelPostingsArray newInstance(int size) {
      return new FreqProxPostingsArray(size, termFreqs != null, lastPositions != null, lastOffsets != null);
    }

    @Override
    void copyTo(ParallelPostingsArray toArray, int numToCopy) {
      assert toArray instanceof FreqProxPostingsArray;
      FreqProxPostingsArray to = (FreqProxPostingsArray) toArray;

      super.copyTo(toArray, numToCopy);

      System.arraycopy(lastDocIDs, 0, to.lastDocIDs, 0, numToCopy);
      System.arraycopy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy);
      if (lastPositions != null) {
        assert to.lastPositions != null;
        System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
      }
      if (lastOffsets != null) {
        assert to.lastOffsets != null;
        System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
      }
      if (termFreqs != null) {
        assert to.termFreqs != null;
        System.arraycopy(termFreqs, 0, to.termFreqs, 0, numToCopy);
      }
    }

    @Override
    int bytesPerPosting() {
      int bytes = ParallelPostingsArray.BYTES_PER_POSTING + 2 * RamUsageEstimator.NUM_BYTES_INT;
      if (lastPositions != null) {
        bytes += RamUsageEstimator.NUM_BYTES_INT;
      }
      if (lastOffsets != null) {
        bytes += RamUsageEstimator.NUM_BYTES_INT;
      }
      if (termFreqs != null) {
        bytes += RamUsageEstimator.NUM_BYTES_INT;
      }
      return bytes;
    }
  }

  public void abort() {}

  BytesRef payload;

  /* Walk through all unique text tokens (Posting
   * instances) found in this field and serialize them
   * into a single RAM segment. */
  void flush(String fieldName, FieldsConsumer consumer, final SegmentWriteState state) throws IOException {

    if (!fieldInfo.isIndexed()) {
      return; // nothing to flush, don't bother the codec with the unindexed field
    }

    final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
    final Comparator<BytesRef> termComp = termsConsumer.getComparator();

    // CONFUSING: this.indexOptions holds the index options
    // that were current when we first saw this field.  But
    // it's possible this has changed, eg when other
    // documents are indexed that cause a "downgrade" of the
    // IndexOptions.  So we must decode the in-RAM buffer
    // according to this.indexOptions, but then write the
    // new segment to the directory according to
    // currentFieldIndexOptions:
    final IndexOptions currentFieldIndexOptions = fieldInfo.getIndexOptions();
    assert currentFieldIndexOptions != null;

    final boolean writeTermFreq = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    final boolean writePositions = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    final boolean writeOffsets = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;

    final boolean readTermFreq = this.hasFreq;
    final boolean readPositions = this.hasProx;
    final boolean readOffsets = this.hasOffsets;

    //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

    // Make sure FieldInfo.update is working correctly!:
    assert !writeTermFreq || readTermFreq;
    assert !writePositions || readPositions;
    assert !writeOffsets || readOffsets;
    assert !writeOffsets || writePositions;

    final Map<Term,Integer> segDeletes;
    if (state.segDeletes != null && state.segDeletes.terms.size() > 0) {
      segDeletes = state.segDeletes.terms;
    } else {
      segDeletes = null;
    }

    final int[] termIDs = termsHashPerField.sortPostings(termComp);
    final int numTerms = termsHashPerField.bytesHash.size();
    final BytesRef text = new BytesRef();
    final FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
    final ByteSliceReader freq = new ByteSliceReader();
    final ByteSliceReader prox = new ByteSliceReader();

    FixedBitSet visitedDocs = new FixedBitSet(state.segmentInfo.getDocCount());
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;

    for (int i = 0; i < numTerms; i++) {
      final int termID = termIDs[i];
      //System.out.println("term=" + termID);
      // Get BytesRef
      final int textStart = postings.textStarts[termID];
      termsHashPerField.bytePool.setBytesRef(text, textStart);

      termsHashPerField.initReader(freq, termID, 0);
      if (readPositions || readOffsets) {
        termsHashPerField.initReader(prox, termID, 1);
      }

      // TODO: really TermsHashPerField should take over most
      // of this loop, including merge sort of terms from
      // multiple threads and interacting with the
      // TermsConsumer, only calling out to us (passing us the
      // DocsConsumer) to handle delivery of docs/positions

      final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);

      final int delDocLimit;
      if (segDeletes != null) {
        final Integer docIDUpto = segDeletes.get(new Term(fieldName, text));
        if (docIDUpto != null) {
          delDocLimit = docIDUpto;
        } else {
          delDocLimit = 0;
        }
      } else {
        delDocLimit = 0;
      }

      // Now termStates has numToMerge FieldMergeStates
      // which all share the same term.  Now we must
      // interleave the docID streams.
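
      // Note that the term's last document is still buffered in
      // lastDocIDs/lastDocCodes/termFreqs (addTerm only writes a doc's entry to
      // the byte slices once the next doc for that term arrives), so the loop
      // below first drains the freq slice and then emits that trailing doc when
      // the reader hits EOF, using lastDocCodes[termID] = -1 as the
      // "already emitted" flag.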
      int docFreq = 0;
      long totTF = 0;
      int docID = 0;

      while(true) {
        //System.out.println(" cycle");
        final int termFreq;
        if (freq.eof()) {
          if (postings.lastDocCodes[termID] != -1) {
            // Return last doc
            docID = postings.lastDocIDs[termID];
            if (readTermFreq) {
              termFreq = postings.termFreqs[termID];
            } else {
              termFreq = -1;
            }
            postings.lastDocCodes[termID] = -1;
          } else {
            // EOF
            break;
          }
        } else {
          final int code = freq.readVInt();
          if (!readTermFreq) {
            docID += code;
            termFreq = -1;
          } else {
            docID += code >>> 1;
            if ((code & 1) != 0) {
              termFreq = 1;
            } else {
              termFreq = freq.readVInt();
            }
          }

          assert docID != postings.lastDocIDs[termID];
        }

        docFreq++;
        assert docID < state.segmentInfo.getDocCount(): "doc=" + docID + " maxDoc=" + state.segmentInfo.getDocCount();

        // NOTE: we could check here if the docID was
        // deleted, and skip it.  However, this is somewhat
        // dangerous because it can yield non-deterministic
        // behavior since we may see the docID before we see
        // the term that caused it to be deleted.  This
        // would mean some (but not all) of its postings may
        // make it into the index, which'd alter the docFreq
        // for those terms.  We could fix this by doing two
        // passes, ie first sweep marks all del docs, and
        // 2nd sweep does the real flush, but I suspect
        // that'd add too much time to flush.
        visitedDocs.set(docID);
        postingsConsumer.startDoc(docID, writeTermFreq ? termFreq : -1);
        if (docID < delDocLimit) {
          // Mark it deleted.  TODO: we could also skip
          // writing its postings; this would be
          // deterministic (just for this Term's docs).

          // TODO: can we do this reach-around in a cleaner way????
          if (state.liveDocs == null) {
            state.liveDocs = docState.docWriter.codec.liveDocsFormat().newLiveDocs(state.segmentInfo.getDocCount());
          }
          if (state.liveDocs.get(docID)) {
            state.delCountOnFlush++;
            state.liveDocs.clear(docID);
          }
        }

        totTF += termFreq;

        // Carefully copy over the prox + payload info,
        // changing the format to match Lucene's segment
        // format.

        if (readPositions || readOffsets) {
          // we did record positions (& maybe payload) and/or offsets
          int position = 0;
          int offset = 0;
          for(int j=0;j<termFreq;j++) {
            final BytesRef thisPayload;

            if (readPositions) {
              final int code = prox.readVInt();
              position += code >>> 1;

              if ((code & 1) != 0) {
                // This position has a payload
                final int payloadLength = prox.readVInt();

                if (payload == null) {
                  payload = new BytesRef();
                  payload.bytes = new byte[payloadLength];
                } else if (payload.bytes.length < payloadLength) {
                  payload.grow(payloadLength);
                }

                prox.readBytes(payload.bytes, 0, payloadLength);
                payload.length = payloadLength;
                thisPayload = payload;

              } else {
                thisPayload = null;
              }

              if (readOffsets) {
                final int startOffset = offset + prox.readVInt();
                final int endOffset = startOffset + prox.readVInt();
                if (writePositions) {
                  if (writeOffsets) {
                    assert startOffset >=0 && endOffset >= startOffset : "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset;
                    postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
                  } else {
                    postingsConsumer.addPosition(position, thisPayload, -1, -1);
                  }
                }
                offset = startOffset;
              } else if (writePositions) {
                postingsConsumer.addPosition(position, thisPayload, -1, -1);
              }
            }
          }
        }
        postingsConsumer.finishDoc();
      }
      termsConsumer.finishTerm(text, new TermStats(docFreq, writeTermFreq ? totTF : -1));
      sumTotalTermFreq += totTF;
      sumDocFreq += docFreq;
    }

    termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality());
  }
}