/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.commongrams; import java.io.IOException; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /* * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors */ /** * Construct bigrams for frequently occurring terms while indexing. Single terms * are still indexed too, with bigrams overlaid. This is achieved through the * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type * of {@link #GRAM_TYPE} Example: * <ul> * <li>input:"the quick brown fox"</li> * <li>output:|"the","the-quick"|"brown"|"fox"|</li> * <li>"the-quick" has a position increment of 0 so it is in the same position * as "the" "the-quick" has a term.type() of "gram"</li> * * </ul> */ /* * Constructors and makeCommonSet based on similar code in StopFilter */ public final class CommonGramsFilter extends TokenFilter { public static final String GRAM_TYPE = "gram"; private static final char SEPARATOR = '_'; private final CharArraySet commonWords; private final StringBuilder buffer = new StringBuilder(); private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class); private int lastStartOffset; private boolean lastWasCommon; private State savedState; /** * Construct a token stream filtering the given input using a Set of common * words to create bigrams. Outputs both unigrams with position increment and * bigrams with position increment 0 type=gram where one or both of the words * in a potential bigram are in the set of common words . * * @param input TokenStream input in filter chain * @param commonWords The set of common words. */ public CommonGramsFilter(TokenStream input, CharArraySet commonWords) { super(input); this.commonWords = commonWords; } /** * Inserts bigrams for common words into a token stream. For each input token, * output the token. If the token and/or the following token are in the list * of common words also output a bigram with position increment 0 and * type="gram" * * TODO:Consider adding an option to not emit unigram stopwords * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be * changed to work with this. * * TODO: Consider optimizing for the case of three * commongrams i.e "man of the year" normally produces 3 bigrams: "man-of", * "of-the", "the-year" but with proper management of positions we could * eliminate the middle bigram "of-the"and save a disk seek and a whole set of * position lookups. */ @Override public boolean incrementToken() throws IOException { // get the next piece of input if (savedState != null) { restoreState(savedState); savedState = null; saveTermBuffer(); return true; } else if (!input.incrementToken()) { return false; } /* We build n-grams before and after stopwords. * When valid, the buffer always contains at least the separator. * If it's empty, there is nothing before this stopword. */ if (lastWasCommon || (isCommon() && buffer.length() > 0)) { savedState = captureState(); gramToken(); return true; } saveTermBuffer(); return true; } /** * {@inheritDoc} */ @Override public void reset() throws IOException { super.reset(); lastWasCommon = false; savedState = null; buffer.setLength(0); } // ================================================= Helper Methods ================================================ /** * Determines if the current token is a common term * * @return {@code true} if the current token is a common term, {@code false} otherwise */ private boolean isCommon() { return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length()); } /** * Saves this information to form the left part of a gram */ private void saveTermBuffer() { buffer.setLength(0); buffer.append(termAttribute.buffer(), 0, termAttribute.length()); buffer.append(SEPARATOR); lastStartOffset = offsetAttribute.startOffset(); lastWasCommon = isCommon(); } /** * Constructs a compound token. */ private void gramToken() { buffer.append(termAttribute.buffer(), 0, termAttribute.length()); int endOffset = offsetAttribute.endOffset(); clearAttributes(); int length = buffer.length(); char termText[] = termAttribute.buffer(); if (length > termText.length) { termText = termAttribute.resizeBuffer(length); } buffer.getChars(0, length, termText, 0); termAttribute.setLength(length); posIncAttribute.setPositionIncrement(0); posLenAttribute.setPositionLength(2); // bigram offsetAttribute.setOffset(lastStartOffset, endOffset); typeAttribute.setType(GRAM_TYPE); buffer.setLength(0); } }