PrefixedTokenRemoverAndExtractorTokenizer.java example

Explorer
eurekastreams-master
/*
 * Copyright (c) 2010 Lockheed Martin Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.eurekastreams.commons.search.analysis;

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * Token filter that rewrites a marker string inside tokens and harvests the tokens that begin with it.
 *
 * Every occurrence of the old prefix inside a token is replaced with the new prefix. If the rewritten token starts with
 * the new prefix it is recorded in the caller-supplied keyword list and is NOT passed down the chain; any other token
 * (rewritten or not) is passed through.
 */
public class PrefixedTokenRemoverAndExtractorTokenizer extends TokenFilter
{
    /**
     * Caller-supplied collection that receives the extracted keywords.
     */
    private final List<String> extractedKeywords;

    /**
     * The marker string to look for in incoming tokens.
     */
    private final String oldPrefix;

    /**
     * The string that replaces every occurrence of the old prefix.
     */
    private final String newPrefix;

    /**
     * Constructor.
     *
     * @param inInput
     *            the upstream token stream to read from
     * @param inOldPrefix
     *            the prefix to look for
     * @param inNewPrefix
     *            the new prefix to apply to the words added into the extracted keywords; note that an empty string
     *            makes every token count as "starting with the prefix", so nothing would be passed through
     * @param inExtractedKeywords
     *            list to store the extracted keywords; it is modified in place as tokens are consumed
     */
    public PrefixedTokenRemoverAndExtractorTokenizer(final TokenStream inInput, final String inOldPrefix,
            final String inNewPrefix, final List<String> inExtractedKeywords)
    {
        super(inInput);
        oldPrefix = inOldPrefix;
        newPrefix = inNewPrefix;
        extractedKeywords = inExtractedKeywords;
    }

    /**
     * Get the next token that should continue down the chain. Every occurrence of the old prefix in a token is replaced
     * with the new prefix; if the rewritten token begins with the new prefix it is stored in the extracted keywords
     * list (once, and only if something other than the prefix remains). Empty tokens and tokens beginning with the new
     * prefix are skipped, and the following token is tried instead.
     *
     * Example - if we're replacing "foo" with "#":
     *
     * - "foobar" --> stores "#bar" in extracted keywords list, and tries to return the next token
     *
     * - "foobarfoobar" --> stores "#bar#bar" in keywords list, and tries to return the next token
     *
     * - "barfoo" --> returns "bar#" and stores nothing
     *
     * @param reusableToken
     *            the token to reuse if possible
     * @return the next token that is non-empty and does not start with the new prefix, with all occurrences of the old
     *         prefix replaced, or null when no tokens remain
     * @throws IOException
     *             on error
     */
    @Override
    public final Token next(final Token reusableToken) throws IOException
    {
        assert reusableToken != null;
        Token nextToken;
        do
        {
            nextToken = input.next(reusableToken);
            if (nextToken == null)
            {
                // no terms left
                return null;
            }

            // there's a term - see if we need to operate on it
            String termText = nextToken.term();
            if (termText.contains(oldPrefix))
            {
                // the term contains the string we're looking for - replace all occurrences
                String keyword = termText.replace(oldPrefix, newPrefix);

                // Replace only the term text. reinit(String, int, int) would treat the two ints as the token's
                // start/end offsets in the source text and reset its other attributes, so a rewritten token that is
                // passed through would no longer point at its original location.
                nextToken.setTermBuffer(keyword);

                if (keyword.startsWith(newPrefix))
                {
                    recordKeyword(keyword);
                }
            }
        }
        while (nextToken.termLength() == 0 || nextToken.term().startsWith(newPrefix));
        return nextToken;
    }

    /**
     * Add a keyword to the extracted keywords list unless it is already present or consists of nothing but the new
     * prefix.
     *
     * @param keyword
     *            the rewritten term, already known to start with the new prefix
     */
    private void recordKeyword(final String keyword)
    {
        // special case: the term only contains the prefix, so there is no actual keyword to keep
        boolean hasTextBesidesPrefix = keyword.replace(newPrefix, "").length() > 0;
        if (hasTextBesidesPrefix && !extractedKeywords.contains(keyword))
        {
            extractedKeywords.add(keyword);
        }
    }
}