/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import java.io.IOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Like {@link StopFilter} except it will not remove the last token if that token was not followed
 * by some token separator. For example, the query 'find the' would preserve the 'the', since it
 * was not followed by a space, punctuation, or any other separator, and would mark it KEYWORD so
 * that later stemmers won't touch it either. A query like 'find the popsicle', on the other hand,
 * would remove 'the' as a stopword.
 *
 * <p>Normally you'd use the ordinary {@link StopFilter} in your indexAnalyzer and then this class
 * in your queryAnalyzer, when using one of the analyzing suggesters.
*/ public final class SuggestStopFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final CharArraySet stopWords; private State endState; /** Sole constructor. */ public SuggestStopFilter(TokenStream input, CharArraySet stopWords) { super(input); this.stopWords = stopWords; } @Override public void reset() throws IOException { super.reset(); endState = null; } @Override public void end() throws IOException { if (endState == null) { super.end(); } else { // NOTE: we already called .end() from our .next() when // the stream was complete, so we do not call // super.end() here restoreState(endState); } } @Override public boolean incrementToken() throws IOException { if (endState != null) { return false; } if (!input.incrementToken()) { return false; } int skippedPositions = 0; while (true) { if (stopWords.contains(termAtt.buffer(), 0, termAtt.length())) { int posInc = posIncAtt.getPositionIncrement(); int endOffset = offsetAtt.endOffset(); // This token may be a stopword, if it's not end: State sav = captureState(); if (input.incrementToken()) { // It was a stopword; skip it skippedPositions += posInc; } else { clearAttributes(); input.end(); endState = captureState(); int finalEndOffset = offsetAtt.endOffset(); assert finalEndOffset >= endOffset; if (finalEndOffset > endOffset) { // OK there was a token separator after the // stopword, so it was a stopword return false; } else { // No token separator after final token that // looked like a stop-word; don't filter it: restoreState(sav); posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement()); keywordAtt.setKeyword(true); return true; } } } else { // Not a stopword; 
return the current token: posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement()); return true; } } } }