package com.ringtone.server;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
public class SearchJanitorUtils {
private static final Logger log = Logger.getLogger(SearchJanitorUtils.class.getName());
/** From StopAnalyzer Lucene 2.9.1 */
public final static String[] stopWords = new String[]{
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
/**
* Uses english stemming (snowball + lucene) + stopwords for getting the words.
*
* @param index
* @return
*/
public static Set<String> getTokensForIndexingOrQuery(
String index_raw,
int maximumNumberOfTokensToReturn) {
String indexCleanedOfHTMLTags = index_raw.replaceAll("\\<.*?>"," ");
Set<String> returnSet = new HashSet<String>();
try {
Analyzer analyzer = new SnowballAnalyzer(
org.apache.lucene.util.Version.LUCENE_CURRENT,
"English",
stopWords);
TokenStream tokenStream = analyzer.tokenStream(
"content",
new StringReader(indexCleanedOfHTMLTags));
Token token = new Token();
while (((token = tokenStream.next()) != null)
&& (returnSet.size() < maximumNumberOfTokensToReturn)) {
returnSet.add(token.term());
}
} catch (IOException e) {
log.severe(e.getMessage());
}
return returnSet;
}
}