CommonGramsFilter.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.commongrams;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/*
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors 
 */

/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
 * of {@link #GRAM_TYPE}. Example (with "the" as the only common word):
 * <ul>
 * <li>input:"the quick brown fox"</li>
 * <li>output:|"the","the_quick"|"quick"|"brown"|"fox"|</li>
 * <li>"the_quick" has a position increment of 0, so it is in the same position
 * as "the". "the_quick" has a type of "gram".</li>
 * 
 * </ul>
 */

/*
 * Constructors and makeCommonSet based on similar code in StopFilter
 */
public final class CommonGramsFilter extends TokenFilter {

  /** Token type assigned to every bigram emitted by this filter. */
  public static final String GRAM_TYPE = "gram";

  /** Character placed between the two terms of a bigram, e.g. {@code the_quick}. */
  private static final char SEPARATOR = '_';

  /** Words that trigger bigram creation; when {@code null}, no token is treated as common. */
  private final CharArraySet commonWords;

  /**
   * Left half of a potential bigram: the previous token's text followed by
   * {@link #SEPARATOR}. Empty when there is no previous token to pair with
   * (after {@link #reset()} or right after a bigram has been built).
   */
  private final StringBuilder buffer = new StringBuilder();
  
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);

  /** Start offset of the token currently held in {@link #buffer}. */
  private int lastStartOffset;

  /** Whether the token currently held in {@link #buffer} is a common word. */
  private boolean lastWasCommon;

  /**
   * The input token that was displaced by a bigram; it is restored and emitted
   * on the next call to {@link #incrementToken()}. {@code null} when nothing is pending.
   */
  private State savedState;

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams. Outputs both unigrams with position increment and
   * bigrams with position increment 0 type=gram where one or both of the words
   * in a potential bigram are in the set of common words.
   * 
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   */
  public CommonGramsFilter(TokenStream input, CharArraySet commonWords) {
    super(input);
    this.commonWords = commonWords;
  }

  /**
   * Inserts bigrams for common words into a token stream. For each input token,
   * output the token. If the token and/or the preceding token are in the list
   * of common words, a bigram joining the two with {@code '_'} is output between
   * them, with position increment 0, position length 2 and type="gram".
   *
   * TODO: Consider adding an option to not emit unigram stopwords
   * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
   * changed to work with this.
   *
   * TODO: Consider optimizing for the case of three
   * commongrams i.e "man of the year" normally produces 3 bigrams: "man_of",
   * "of_the", "the_year" but with proper management of positions we could
   * eliminate the middle bigram "of_the" and save a disk seek and a whole set of
   * position lookups.
   *
   * @return {@code true} if a token (unigram or bigram) was produced,
   *         {@code false} once the input is exhausted
   * @throws IOException if the wrapped input stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    // A bigram was emitted on the previous call: now emit the real token that
    // it displaced, and remember it as the left half of the next bigram.
    if (savedState != null) {
      restoreState(savedState);
      savedState = null;
      saveTermBuffer();
      return true;
    } else if (!input.incrementToken()) {
      return false;
    }
    
    /* We build n-grams before and after stopwords. 
     * When valid, the buffer always contains at least the separator.
     * If it's empty, there is nothing before this stopword.
     */
    if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
      // Stash the current token first; gramToken() overwrites the attributes.
      savedState = captureState();
      gramToken();
      return true;      
    }

    saveTermBuffer();
    return true;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    lastWasCommon = false;
    savedState = null;
    buffer.setLength(0);
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines if the current token is a common term
   *
   * @return {@code true} if the current token is a common term, {@code false} otherwise
   *         (always {@code false} when no common-word set was supplied)
   */
  private boolean isCommon() {
    return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
  }

  /**
   * Saves the current token's text (followed by the separator), start offset
   * and common-word status so it can form the left part of a gram.
   */
  private void saveTermBuffer() {
    buffer.setLength(0);
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    buffer.append(SEPARATOR);
    lastStartOffset = offsetAttribute.startOffset();
    lastWasCommon = isCommon();
  }

  /**
   * Constructs a compound token: replaces the current attributes with a bigram
   * made of the saved left term, the separator and the current term. The bigram
   * spans from the saved start offset to the current token's end offset.
   */
  private void gramToken() {
    // buffer already holds "<previous term><SEPARATOR>"; append the right half.
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    // Must be read before clearAttributes() resets the offset attribute.
    final int endOffset = offsetAttribute.endOffset();

    clearAttributes();

    // CharTermAttribute grows its own backing array as needed.
    termAttribute.setEmpty().append(buffer);
    posIncAttribute.setPositionIncrement(0);
    posLenAttribute.setPositionLength(2); // bigram
    offsetAttribute.setOffset(lastStartOffset, endOffset);
    typeAttribute.setType(GRAM_TYPE);
    // Emptied here; refilled by saveTermBuffer() when the displaced token is emitted.
    buffer.setLength(0);
  }
}