/*
 * Copyright (c) 2010 Lockheed Martin Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.eurekastreams.commons.search.analysis;

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.eurekastreams.server.domain.strategies.HashTagExtractor;
import org.eurekastreams.server.domain.strategies.Substring;

/**
 * Special filter that helps preserve # and _ for hashtags. There are two modes - literal and non-literal. Literal mode
 * is meant for searching while non-literal is for indexing. Non-literal mode adds extra versions of the text to allow
 * searching in different ways. For example, in non-literal (indexing) mode: #my_hats -> my, hat, #my_hats
 */
public class HashTagTokenizer extends TokenFilter
{
    /**
     * Collection to store the extracted hashtags.
     */
    private final List<String> extractedHashtags;

    /**
     * Collection to store the extracted non-hashtags that will be subject to more parsing.
     */
    private final List<String> extractedNonHashTags;

    /**
     * The string used to replace hashtags during tokenizing.
     */
    public static final String HASHTAG_TEMPORARY_REPLACEMENT = "xxxhashtagreplacementxxx";

    /**
     * The string used to replace underscores during tokenizing.
     */
    public static final String UNDERSCORE_TEMPORARY_REPLACEMENT = "xxxunderscorereplacementxxx";

    /**
     * Work in literal mode - just restore the #'s and _'s.
     */
    private final boolean literalMode;

    /**
     * Hashtag extractor - same one that's used on the client.
     */
    private final HashTagExtractor hashTagExtractor;

    /**
     * Constructor.
     *
     * @param inInput
     *            the input
     * @param inExtractedHashtags
     *            list to store the extracted hashtags
     * @param inExtractedNonHashTags
     *            list to store non-hashtags that had to be extracted - these will be further processed
     * @param inLiteralMode
     *            whether to use literal mode, which just passes the tokens back through after putting back the #'s
     *            and _'s
     */
    public HashTagTokenizer(final TokenStream inInput, final List<String> inExtractedHashtags,
            final List<String> inExtractedNonHashTags, final boolean inLiteralMode)
    {
        super(inInput);
        extractedHashtags = inExtractedHashtags;
        extractedNonHashTags = inExtractedNonHashTags;
        hashTagExtractor = new HashTagExtractor();
        literalMode = inLiteralMode;
    }

    /**
     * Get the next token. Restores the #'s and _'s hidden by the temporary replacement strings, records any hashtags
     * in the extracted hashtags list, and records the parts produced by splitting on # and _ in the non-hashtag list
     * for further processing. Tokens fully captured by those lists are not passed through.
     *
     * For example, in non-literal (indexing) mode:
     *
     * - "#bar" stores "#bar" in the extracted hashtags list and "bar" in the non-hashtag list; no token is emitted.
     *
     * - "#bar#snuts" stores "#bar" and "#bar#snuts" in the extracted hashtags list and "bar" and "snuts" in the
     * non-hashtag list; no token is emitted.
     *
     * @param reusableToken
     *            the token to reuse if possible
     * @return the reusable token holding the next non-empty token (with the replacement strings restored to # and _),
     *         or null when no terms remain
     * @throws IOException
     *             on error
     */
    @Override
    public final Token next(final Token reusableToken) throws IOException
    {
        Token nextToken = null;
        assert reusableToken != null;
        do
        {
            nextToken = input.next(reusableToken);
            if (nextToken == null)
            {
                // no terms left
                return null;
            }

            // there's a term - see if we need to operate on it
            String termText = nextToken.term();

            // put underscores & hashtags back - they've made it past the tokenizing, time to deal with them
            termText = termText.replaceAll(UNDERSCORE_TEMPORARY_REPLACEMENT, "_");
            termText = termText.replaceAll(HASHTAG_TEMPORARY_REPLACEMENT, "#");

            String newTokenText = "";
            if (literalMode)
            {
                // this is searching, not indexing, so don't explode the content out
                if (termText.contains("#"))
                {
                    if (!extractedHashtags.contains(termText))
                    {
                        // found a word with a hash - pass it through the list to avoid being munged by later
                        // filters - this lets people search #foo#bar if they really want.
                        extractedHashtags.add(termText);
                    }
                }
                else
                {
                    // doesn't contain a # - just pass it through
                    newTokenText = termText;
                }
            }
            else
            {
                // this is indexing, not searching, so expand the text into whatever we might want later

                // use the HashTagExtractor to find a hashtag
                Substring hashTag = hashTagExtractor.extract(termText, 0);
                if (hashTag != null)
                {
                    String hashTagText = hashTag.getContent();
                    if (!extractedHashtags.contains(hashTagText))
                    {
                        // add the parsed hashtag into the list
                        extractedHashtags.add(hashTagText);
                    }
                }

                if (termText.contains("#") || termText.contains("_"))
                {
                    // pass it through the list to avoid being munged by later filters - this lets people do
                    // #foo#bar or hi_there if they really want.
                    if (!extractedHashtags.contains(termText))
                    {
                        extractedHashtags.add(termText);
                    }

                    // also add literal and plural-parsed versions, omitting any leading hashes, for text containing _
                    if (termText.contains("_"))
                    {
                        // remove all the leading hashes
                        String temp = termText;
                        while (temp.length() > 1 && temp.startsWith("#"))
                        {
                            temp = temp.substring(1);
                        }

                        // add it as a literal
                        if (!extractedHashtags.contains(temp))
                        {
                            extractedHashtags.add(temp);
                        }

                        // add it for plural removal
                        if (!extractedNonHashTags.contains(temp))
                        {
                            extractedNonHashTags.add(temp);
                        }
                    }

                    // split the text on # and _ for indexing
                    String[] parts = termText.split("[#_]");
                    for (int i = 0; i < parts.length; i++)
                    {
                        if (parts[i].length() > 0 && !extractedNonHashTags.contains(parts[i]))
                        {
                            extractedNonHashTags.add(parts[i]);
                        }
                    }
                }
                else
                {
                    // simple word - just pass it through
                    newTokenText = termText;
                }
            }
            nextToken.reinit(newTokenText, 0, newTokenText.length());
        }
        while (nextToken.termLength() == 0);
        return nextToken;
    }
}
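
/**
 * Illustrative sketch only - not part of the original source. It shows one way the filter above might be exercised in
 * isolation during indexing (literal mode = false). The use of WhitespaceTokenizer as the upstream tokenizer, the
 * manual #/_ pre-replacement, the sample text, and this class name are assumptions made for the example; the real
 * analyzer chain may differ.
 */
class HashTagTokenizerUsageSketch
{
    /**
     * Run the sketch against a single hashtag term.
     *
     * @param args
     *            unused
     * @throws IOException
     *             on error reading the token stream
     */
    public static void main(final String[] args) throws IOException
    {
        List<String> hashtags = new java.util.ArrayList<String>();
        List<String> nonHashtags = new java.util.ArrayList<String>();

        // assumption: callers protect # and _ with the temporary replacement strings before tokenizing so the
        // upstream tokenizer does not strip them
        String prepared = "#my_hats".replace("#", HashTagTokenizer.HASHTAG_TEMPORARY_REPLACEMENT).replace("_",
                HashTagTokenizer.UNDERSCORE_TEMPORARY_REPLACEMENT);

        TokenStream stream = new HashTagTokenizer(new org.apache.lucene.analysis.WhitespaceTokenizer(
                new java.io.StringReader(prepared)), hashtags, nonHashtags, false);

        // drain the stream; plain words come through as tokens, hashtag/underscore terms land in the two lists
        Token reusableToken = new Token();
        for (Token token = stream.next(reusableToken); token != null; token = stream.next(reusableToken))
        {
            System.out.println("token: " + token.term());
        }

        // per the class javadoc, the lists should now hold the hashtag and the underscore-split parts,
        // e.g. "#my_hats" / "my_hats" in the hashtags list and "my_hats" / "my" / "hats" in the non-hashtag list
        System.out.println("hashtags: " + hashtags);
        System.out.println("non-hashtags: " + nonHashtags);
    }
}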