/*
* Copyright (c) 2010 Lockheed Martin Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.eurekastreams.commons.search.analysis;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
 * Token filter that rewrites every occurrence of a marker string inside a token with a replacement string, records the
 * rewritten tokens that begin with the replacement in a caller-supplied list, and only passes on the tokens that are
 * non-empty and do not begin with the replacement.
 */
public class PrefixedTokenRemoverAndExtractorTokenizer extends TokenFilter
{
    /**
     * Caller-supplied collection that receives the extracted keywords.
     */
    private final List<String> extractedKeywords;

    /**
     * The string to look for inside each token.
     */
    private final String oldPrefix;

    /**
     * The string that replaces every occurrence of the old prefix.
     */
    private final String newPrefix;

    /**
     * Constructor.
     *
     * @param inInput
     *            the input
     * @param inOldPrefix
     *            the prefix to look for
     * @param inNewPrefix
     *            the new prefix to apply to the words added into the extracted keywords
     * @param inExtractedKeywords
     *            list to store the extracted keywords
     */
    public PrefixedTokenRemoverAndExtractorTokenizer(final TokenStream inInput, final String inOldPrefix,
            final String inNewPrefix, final List<String> inExtractedKeywords)
    {
        super(inInput);
        oldPrefix = inOldPrefix;
        newPrefix = inNewPrefix;
        extractedKeywords = inExtractedKeywords;
    }

    /**
     * Get the next token that should continue down the chain. Every occurrence of the old prefix inside a token is
     * replaced with the new prefix; if the rewritten token then begins with the new prefix it is stored in the
     * extracted keywords list (once, and only if something other than the prefix is left). Empty tokens and tokens
     * that begin with the new prefix are never returned - the next input token is tried instead. Note that a token
     * which already begins with the new prefix without containing the old prefix is dropped as well, but is not
     * stored in the list.
     *
     * Example - if we're replacing "foo" with "#":
     *
     * - "foobar" --> stores "#bar" in extracted keywords list, and tries to return the next token
     *
     * - "foobarfoobar" --> stores "#bar#bar" in extracted keywords list, and tries to return the next token
     *
     * - "barfoobaz" --> returned as "bar#baz", nothing is stored
     *
     * @param reusableToken
     *            the token to reuse if possible
     * @return the next token to pass on, with all occurrences of the old prefix replaced, or null when the input has
     *         no tokens remaining
     * @throws IOException
     *             on error
     */
    @Override
    public final Token next(final Token reusableToken) throws IOException
    {
        assert reusableToken != null;

        while (true)
        {
            Token nextToken = input.next(reusableToken);
            if (nextToken == null)
            {
                // no terms left
                return null;
            }

            String termText = nextToken.term();
            if (termText.contains(oldPrefix))
            {
                // the term contains the string we're looking for - replace all occurrences
                termText = termText.replace(oldPrefix, newPrefix);

                // swap the term text only, so the token keeps the offsets (and other attributes) that tie it to
                // its place in the original input
                nextToken.setTermBuffer(termText);

                recordKeyword(termText);
            }

            // termText mirrors the token's current term, so there's no need to rebuild it from the token
            if (termText.length() > 0 && !termText.startsWith(newPrefix))
            {
                return nextToken;
            }
        }
    }

    /**
     * Store a rewritten term in the extracted keywords list if it begins with the new prefix, consists of more than
     * just the prefix, and is not in the list already.
     *
     * @param keyword
     *            the term text after the prefix replacement
     */
    private void recordKeyword(final String keyword)
    {
        if (!keyword.startsWith(newPrefix))
        {
            return;
        }

        // special case - the term is made up of nothing but the prefix, so there's no keyword to keep
        if (keyword.replace(newPrefix, "").length() == 0)
        {
            return;
        }

        if (!extractedKeywords.contains(keyword))
        {
            extractedKeywords.add(keyword);
        }
    }
}