/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;

/**
 * TokenStream created from a term vector field. The term vector must have positions and/or
 * offsets (either one suffices). If you want payloads, add a PayloadAttribute yourself (as you
 * normally would), but don't assume the attribute is already present just because you know the
 * term vector has payloads: the first call to incrementToken() checks whether you asked for them,
 * and if you didn't, you won't get them. This TokenStream supports an efficient {@link #reset()},
 * so there's no need to wrap it with a caching impl.
*
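 * <p>A minimal consumption sketch; {@code reader}, {@code docId}, {@code field} and {@code text}
 * are illustrative stand-ins, not part of this class:
 * <pre>{@code
 * Terms vector = reader.getTermVector(docId, field);
 * TokenStream ts = new TokenStreamFromTermVector(vector, text.length());
 * CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 * ts.reset();
 * while (ts.incrementToken()) {
 *   // termAtt (and the position/offset attributes) now describe the current token
 * }
 * ts.end();
 * ts.close();
 * }</pre>
 *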
* @lucene.internal
*/
final class TokenStreamFromTermVector extends TokenStream {
  // note: differs from the similar class in the standard highlighter. This one is optimized for sparse cases.

  /**
   * Content length divided by the number of distinct positions; an average for dense text.
   */
private static final double AVG_CHARS_PER_POSITION = 6;
private static final int INSERTION_SORT_THRESHOLD = 16;
private final Terms vector;
private final int filteredDocId;
private final CharTermAttribute termAttribute;
private final PositionIncrementAttribute positionIncrementAttribute;
private final int offsetLength;
private final float loadFactor;
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;//maybe null
private CharsRefBuilder termCharsBuilder;//term data here
private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
private TokenLL firstToken = null; // the head of a linked-list
private TokenLL incrementToken = null;
  private boolean initialized = false;//lazy

  public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
    this(vector, 0, offsetLength, 1f);
  }

/**
* Constructor.
*
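 * <p>For example (illustrative values), uninverting a whole document while expecting only about
 * half of the vector's tokens to survive filtering might look like
 * {@code new TokenStreamFromTermVector(vector, docId, text.length(), 0.5f)}.
 *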
 * @param vector Terms that contain the data for creating the TokenStream. Must have positions
 *               and/or offsets.
 * @param filteredDocId The docID we will process.
 * @param offsetLength The character length of the text being uninverted, or a lower value if you
 *                     don't want to invert text beyond that offset (in which case it acts as a
 *                     filter). If you don't know the length, pass -1. In conjunction with
 *                     {@code loadFactor}, it's used to determine how many buckets to create during
 *                     uninversion. It's also used to filter out tokens whose start offset exceeds
 *                     this value.
 * @param loadFactor The fraction of tokens from the original terms (by position count) that are
 *                   expected to be inverted. If tokens are filtered out (e.g. by
 *                   {@link org.apache.lucene.index.FilterLeafReader.FilterTerms}), consider using
 *                   less than 1.0 to avoid wasting space. 1.0 means all tokens; 1/64 means only
 *                   1/64th of the vector's tokens are expected.
*/
TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
super();
this.filteredDocId = filteredDocId;
this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
if (loadFactor <= 0f || loadFactor > 1f) {
throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
}
this.loadFactor = loadFactor;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
assert vector.hasFreqs();
this.vector = vector;
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
  }

public Terms getTermVectorTerms() {
return vector;
  }

@Override
public void reset() throws IOException {
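    // Resetting is just a pointer rewind; the uninverted token linked-list built by init() is
    // retained, which is why no caching wrapper is needed.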
incrementToken = null;
super.reset();
  }

  // We delay initialization so that we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
int dpEnumFlags = 0;
if (vector.hasOffsets()) {
offsetAttribute = addAttribute(OffsetAttribute.class);
dpEnumFlags |= PostingsEnum.OFFSETS;
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
dpEnumFlags |= PostingsEnum.PAYLOADS;
}
// We put term data here
termCharsBuilder = new CharsRefBuilder();
termCharsBuilder.grow(initTotalTermCharLen());
// Step 1: iterate termsEnum and create a token, placing into a bucketed array (given a load factor)
final TokenLL[] tokenBuckets = initTokenBucketsArray();
final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
final double POSITION_TO_BUCKET_IDX = loadFactor;
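    // e.g. with loadFactor=1.0, a token starting at char offset 600 maps to bucket 600/6 = 100;
    // with loadFactor=0.5 it maps to bucket 50, matching the proportionally smaller bucket array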
final TermsEnum termsEnum = vector.iterator();
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
TERM_LOOP:
while ((termBytesRef = termsEnum.next()) != null) {
      //Grab the term (in the same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
tempCharsRefBuilder.grow(termBytesRef.length);
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
final int termCharsOff = termCharsBuilder.length();
termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
int currentDocId = dpEnum.advance(filteredDocId);
if (currentDocId != filteredDocId) {
continue; //Not expected
}
final int freq = dpEnum.freq();
for (int j = 0; j < freq; j++) {
TokenLL token = new TokenLL();
token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
token.termCharsOff = termCharsOff;
token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
// copy offset (if it's there) and compute bucketIdx
int bucketIdx;
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
if (offsetLength >= 0 && token.startOffset > offsetLength) {
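            // skip the rest of this term's positions too: start offsets never go backwards
            // as positions increase, so they would all exceed the threshold as well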
continue TERM_LOOP;//filter this token out; exceeds threshold
}
token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
} else {
bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
}
if (bucketIdx >= tokenBuckets.length) {
bucketIdx = tokenBuckets.length - 1;
}
if (payloadAttribute != null) {
final BytesRef payload = dpEnum.getPayload();
token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
}
//Add token to the head of the bucket linked list
token.next = tokenBuckets[bucketIdx];
tokenBuckets[bucketIdx] = token;
}
}
// Step 2: Link all Tokens into a linked-list and sort all tokens at the same position
firstToken = initLinkAndSortTokens(tokenBuckets);
// If the term vector didn't have positions, synthesize them
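    // (tokens arrive here in offset order, and tokens sharing a start offset are treated as
    // same-position synonyms, so they receive the same synthesized position)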
if (!vector.hasPositions() && firstToken != null) {
TokenLL prevToken = firstToken;
prevToken.position = 0;
for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
if (prevToken.startOffset == token.startOffset) {
token.position = prevToken.position;
} else {
token.position = prevToken.position + 1;
}
}
}
initialized = true;
  }

private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
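    // Walk the buckets in order; sort each bucket's tokens (insertion sort into a linked list
    // for small buckets, Collections.sort for larger ones), then concatenate the bucket lists.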
TokenLL firstToken = null;
List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use. TODO use native array
TokenLL prevToken = null;
for (TokenLL tokenHead : tokenBuckets) {
if (tokenHead == null) {
continue;
}
//sort tokens at this position and link them; return the first
TokenLL tokenTail;
// just one token
if (tokenHead.next == null) {
tokenTail = tokenHead;
} else {
// add the linked list to a temporary array
for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
scratchTokenArray.add(cur);
}
// sort; and set tokenHead & tokenTail
if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
// insertion sort by creating a linked list (leave scratchTokenArray alone)
tokenHead = tokenTail = scratchTokenArray.get(0);
tokenHead.next = null;
for (int i = 1; i < scratchTokenArray.size(); i++) {
TokenLL insertToken = scratchTokenArray.get(i);
if (insertToken.compareTo(tokenHead) <= 0) {
// takes the place of tokenHead
insertToken.next = tokenHead;
tokenHead = insertToken;
} else {
// goes somewhere after tokenHead
for (TokenLL prev = tokenHead; true; prev = prev.next) {
if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
if (prev.next == null) {
tokenTail = insertToken;
}
insertToken.next = prev.next;
prev.next = insertToken;
break;
}
}
}
}
} else {
Collections.sort(scratchTokenArray);
// take back out and create a linked list
TokenLL prev = tokenHead = scratchTokenArray.get(0);
for (int i = 1; i < scratchTokenArray.size(); i++) {
prev.next = scratchTokenArray.get(i);
prev = prev.next;
}
tokenTail = prev;
tokenTail.next = null;
}
scratchTokenArray.clear();//too bad ArrayList nulls it out; we don't actually need that
}
//link to previous
if (prevToken != null) {
assert prevToken.next == null;
prevToken.next = tokenHead; //concatenate linked-list
assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
} else {
assert firstToken == null;
firstToken = tokenHead;
}
prevToken = tokenTail;
}
return firstToken;
  }

private int initTotalTermCharLen() throws IOException {
int guessNumTerms;
if (vector.size() != -1) {
guessNumTerms = (int) vector.size();
} else if (offsetLength != -1) {
guessNumTerms = (int) (offsetLength * 0.33);//guess 1/3rd
} else {
return 128;
}
return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0));//7 is over-estimate of average term len
  }

private TokenLL[] initTokenBucketsArray() throws IOException {
// Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
int positionsEstimate;
if (offsetLength == -1) { // no clue what the char length is.
// Estimate the number of position slots we need from term stats based on Wikipedia.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
      if (sumTotalTermFreq == -1) {//unfortunately term vectors seem not to have this stat
        int size = (int) vector.size();
        if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default anyway
size = 128;
}
sumTotalTermFreq = (int) (size * 2.4);
}
positionsEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
} else {
// guess number of token positions by this factor.
positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
}
// apply the load factor.
return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
  }

@Override
public boolean incrementToken() throws IOException {
int posInc;
if (incrementToken == null) {
if (!initialized) {
init();
assert initialized;
}
incrementToken = firstToken;
if (incrementToken == null) {
return false;
}
posInc = incrementToken.position + 1;//first token normally has pos 0; add 1 to get posInc
} else if (incrementToken.next != null) {
int lastPosition = incrementToken.position;
incrementToken = incrementToken.next;
posInc = incrementToken.position - lastPosition;
} else {
return false;
}
clearAttributes();
termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(posInc);
if (offsetAttribute != null) {
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
}
return true;
  }

private static class TokenLL implements Comparable<TokenLL> {
// This class should weigh 32 bytes, including object header
int termCharsOff; // see termCharsBuilder
short termCharsLen;
int position;
int startOffset;
short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;
    TokenLL next;

@Override
public int compareTo(TokenLL tokenB) {
int cmp = Integer.compare(this.position, tokenB.position);
if (cmp == 0) {
cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
}
return cmp;
}
}
}