// CommonGramsFilter.java example
//
// Explorer
// solrcene-master
/*
 * Licensed under the Apache License, 
 * Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License 
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and limitations under the License. 
 */

package org.apache.lucene.analysis.commongrams;

import java.io.IOException;
import java.util.Arrays;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/*
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors 
 */

/*
 * Constructors and makeCommonSet based on similar code in StopFilter
 */

/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
 * of {@link #GRAM_TYPE} and join their two terms with an underscore. Example, with
 * "the" as the only common word:
 * <ul>
 * <li>input: "the quick brown fox"</li>
 * <li>output: |"the","the_quick"|"quick"|"brown"|"fox"|</li>
 * <li>"the_quick" has a position increment of 0 so it is in the same position
 * as "the"; "the_quick" has a type of "gram"</li>
 * </ul>
 */
public final class CommonGramsFilter extends TokenFilter {

  /** Token type assigned to every bigram emitted by this filter. */
  static final String GRAM_TYPE = "gram";

  /** Character placed between the two terms of a bigram. */
  private static final char SEPARATOR = '_';

  /** Common-word lookup; {@code null} means that no word is treated as common. */
  private final CharArraySet commonWords;

  /**
   * Left half of a potential bigram: the previous term followed by {@link #SEPARATOR}.
   * Empty after {@link #reset()} and right after a bigram has been built.
   */
  private final StringBuilder buffer = new StringBuilder();

  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

  /** Start offset of the term currently held in {@link #buffer}. */
  private int lastStartOffset;

  /** Whether the term currently held in {@link #buffer} is a common word. */
  private boolean lastWasCommon;

  /** Unigram captured while its bigram is being emitted; returned by the next call. */
  private State savedState;

  /**
   * Equivalent to {@link #CommonGramsFilter(Version, TokenStream, Set)} called with
   * {@link Version#LUCENE_29}.
   *
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead
   */
  @Deprecated
  public CommonGramsFilter(TokenStream input, Set<?> commonWords) {
    this(Version.LUCENE_29, input, commonWords);
  }

  /**
   * Equivalent to {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} called with
   * {@link Version#LUCENE_29}.
   *
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   * @param ignoreCase Ignore case when constructing bigrams for common words.
   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead
   */
  @Deprecated
  public CommonGramsFilter(TokenStream input, Set<?> commonWords, boolean ignoreCase) {
    this(Version.LUCENE_29, input, commonWords, ignoreCase);
  }

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams. Outputs both unigrams with position increment and
   * bigrams with position increment 0 type=gram where one or both of the words
   * in a potential bigram are in the set of common words.
   *
   * @param matchVersion version passed on to
   *        {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)}
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   */
  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
    this(matchVersion, input, commonWords, false);
  }

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams, case-sensitive if ignoreCase is false (unless Set
   * is CharArraySet). If <code>commonWords</code> is an instance of
   * {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
   * construct the set) it will be directly used and <code>ignoreCase</code>
   * will be ignored since <code>CharArraySet</code> directly controls case
   * sensitivity.
   * <p>
   * If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
   * new CharArraySet will be constructed and <code>ignoreCase</code> will be
   * used to specify the case sensitivity of that set.
   * <p>
   * If <code>commonWords</code> is <code>null</code>, no word is treated as
   * common and no bigrams are produced.
   *
   * @param matchVersion version handed to the {@link CharArraySet} constructor
   *        when a new set has to be built
   * @param input TokenStream input in filter chain.
   * @param commonWords The set of common words; may be <code>null</code>.
   * @param ignoreCase Ignore case when constructing bigrams for common words.
   */
  public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
    super(input);
    if (commonWords == null) {
      // isCommon() already treats a null set as "nothing is common".
      this.commonWords = null;
    } else if (commonWords instanceof CharArraySet) {
      this.commonWords = (CharArraySet) commonWords;
    } else {
      CharArraySet copy = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
      copy.addAll(commonWords);
      this.commonWords = copy;
    }
  }

  /**
   * Construct a token stream filtering the given input using an Array of common
   * words to create bigrams.
   *
   * @param input Tokenstream in filter chain
   * @param commonWords words to be used in constructing bigrams
   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead.
   */
  @Deprecated
  public CommonGramsFilter(TokenStream input, String[] commonWords) {
    this(input, commonWords, false);
  }

  /**
   * Construct a token stream filtering the given input using an Array of common
   * words to create bigrams and is case-sensitive if ignoreCase is false.
   * If <code>commonWords</code> is <code>null</code>, no word is treated as
   * common and no bigrams are produced.
   *
   * @param input Tokenstream in filter chain
   * @param commonWords words to be used in constructing bigrams; may be <code>null</code>
   * @param ignoreCase Ignore case when constructing bigrams for common words.
   * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead.
   */
  @Deprecated
  public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
    super(input);
    this.commonWords = (commonWords == null) ? null : makeCommonSet(commonWords, ignoreCase);
  }

  /**
   * Build a CharArraySet from an array of common words, appropriate for passing
   * into the CommonGramsFilter constructor. This permits this commonWords
   * construction to be cached once when an Analyzer is constructed.
   *
   * @param commonWords Array of common words which will be converted into the CharArraySet
   * @return CharArraySet of the given words, appropriate for passing into the CommonGramsFilter constructor
   * @throws NullPointerException if <code>commonWords</code> is <code>null</code>
   * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
   * @deprecated construct a {@link CharArraySet} directly instead
   */
  @Deprecated
  public static CharArraySet makeCommonSet(String[] commonWords) {
    return makeCommonSet(commonWords, false);
  }

  /**
   * Build a CharArraySet from an array of common words, appropriate for passing
   * into the CommonGramsFilter constructor, case-sensitive if ignoreCase is
   * false.
   *
   * @param commonWords Array of common words which will be converted into the CharArraySet
   * @param ignoreCase case-sensitivity flag passed to the {@link CharArraySet} constructor
   * @return a Set containing the words
   * @throws NullPointerException if <code>commonWords</code> is <code>null</code>
   * @deprecated construct a {@link CharArraySet} directly instead
   */
  @Deprecated
  public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
    commonSet.addAll(Arrays.asList(commonWords));
    return commonSet;
  }

  /**
   * Inserts bigrams for common words into a token stream. For each input token,
   * output the token. If the token and/or the preceding token are in the list
   * of common words, a bigram of the two with position increment 0 and
   * type="gram" is output first, and the token itself follows on the next call.
   *
   * TODO:Consider adding an option to not emit unigram stopwords
   * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
   * changed to work with this.
   *
   * TODO: Consider optimizing for the case of three
   * commongrams i.e "man of the year" normally produces 3 bigrams: "man_of",
   * "of_the", "the_year" but with proper management of positions we could
   * eliminate the middle bigram "of_the" and save a disk seek and a whole set of
   * position lookups.
   *
   * @return {@code true} if a token was produced, {@code false} once the input is exhausted
   * @throws IOException if the wrapped input stream throws it
   */
  @Override
  public boolean incrementToken() throws IOException {
    // A unigram was held back while its bigram went out; release it now.
    if (savedState != null) {
      restoreState(savedState);
      savedState = null;
      saveTermBuffer();
      return true;
    }

    // get the next piece of input
    if (!input.incrementToken()) {
      return false;
    }

    /* We build n-grams before and after stopwords.
     * When valid, the buffer always contains at least the separator.
     * If it is empty, there is nothing before this stopword.
     */
    if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
      // Keep the unigram for the next call; gramToken() overwrites the attributes.
      savedState = captureState();
      gramToken();
      return true;
    }

    saveTermBuffer();
    return true;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    lastWasCommon = false;
    savedState = null;
    buffer.setLength(0);
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines if the current token is a common term
   *
   * @return {@code true} if the current token is a common term, {@code false} otherwise
   *         (always {@code false} when no common-word set was supplied)
   */
  private boolean isCommon() {
    return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
  }

  /**
   * Saves this information to form the left part of a gram: the current term
   * plus separator goes into the buffer, and the term's start offset and
   * common-word status are recorded.
   */
  private void saveTermBuffer() {
    buffer.setLength(0);
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    buffer.append(SEPARATOR);
    lastStartOffset = offsetAttribute.startOffset();
    lastWasCommon = isCommon();
  }

  /**
   * Constructs a compound token: the buffered left part plus the current term
   * becomes the term text, spanning from the saved start offset to the current
   * end offset, with position increment 0 and type {@link #GRAM_TYPE}. The
   * buffer is emptied afterwards.
   */
  private void gramToken() {
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    // Read the end offset before clearAttributes() wipes it.
    int endOffset = offsetAttribute.endOffset();

    clearAttributes();

    int length = buffer.length();
    char[] termText = termAttribute.buffer();
    if (length > termText.length) {
      termText = termAttribute.resizeBuffer(length);
    }

    buffer.getChars(0, length, termText, 0);
    termAttribute.setLength(length);
    posIncAttribute.setPositionIncrement(0);
    offsetAttribute.setOffset(lastStartOffset, endOffset);
    typeAttribute.setType(GRAM_TYPE);
    buffer.setLength(0);
  }
}