package org.karmaexchange.util;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.EnumSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

import com.google.common.collect.ImmutableSet;
public class SearchUtil {
private static final Analyzer ANALYZER = new KStemEnglishAnalyzer();
/**
 * Search tokens whose prefixes are reserved for structured queries (e.g. filtering by
 * organization or cause type). A reserved token has the form {@code <prefix>:<suffix>}.
 */
public enum ReservedToken {
  CAUSE_TYPE("cause-type"),
  ORG("org"),
  PRIMARY_ORG("org-primary");

  /** Separator between the reserved prefix and the token suffix. */
  public static final String RESERVED_TOKEN_SEPERATOR = ":";

  // Matches every character that is not searchable. Compiled once instead of on every
  // parseTokenSuffix() call.
  private static final Pattern NON_SEARCHABLE_CHARS = Pattern.compile("[^a-zA-Z0-9]");

  // Full prefix including the trailing separator, e.g. "org:".
  private final String prefix;

  private ReservedToken(String prefix) {
    this.prefix = prefix + RESERVED_TOKEN_SEPERATOR;
  }

  /**
   * Creates a reserved token from the given suffix, e.g. {@code ORG.create("Acme Inc.")}
   * yields {@code "org:acmeinc"}.
   *
   * @throws IllegalArgumentException if the suffix has no searchable characters
   */
  public String create(String tokenSuffix) {
    return prefix + parseTokenSuffix(tokenSuffix);
  }

  /**
   * Normalizes a token suffix by stripping all non-alphanumeric characters and lowercasing.
   *
   * @throws IllegalArgumentException if nothing searchable remains after normalization
   */
  public static String parseTokenSuffix(String tokenSuffix) {
    // Locale.ROOT keeps the result stable regardless of the JVM default locale
    // (e.g. the Turkish dotless-i problem with toLowerCase()).
    String parsedTokenSuffix =
        NON_SEARCHABLE_CHARS.matcher(tokenSuffix).replaceAll("").toLowerCase(Locale.ROOT);
    if (parsedTokenSuffix.isEmpty()) {
      throw new IllegalArgumentException(
          "token suffix has no searchable characters: '" + tokenSuffix + "'");
    }
    return parsedTokenSuffix;
  }

  /**
   * Returns true if {@code token} starts (case-insensitively) with any reserved prefix.
   * A null token never conflicts.
   */
  public static boolean conflictsWithAnyReservedToken(String token) {
    if (token == null) {
      return false;
    }
    for (ReservedToken reservedToken : values()) {
      // regionMatches(true, ...) is a case-insensitive prefix check that avoids allocating
      // lowercased copies of the inputs.
      if (token.regionMatches(true, 0, reservedToken.prefix, 0, reservedToken.prefix.length())) {
        return true;
      }
    }
    return false;
  }
}
/** Options controlling how text is parsed into searchable tokens. */
public enum ParseOptions {
  // When set, tokens that collide with a ReservedToken prefix are dropped entirely instead
  // of being added to the searchable token set.
  EXCLUDE_RESERVED_TOKENS
}
/**
 * Tokenizes {@code textToParse} into a set of searchable tokens, capped at
 * {@code maxTokens} entries. Reserved tokens are included (no parse options applied).
 */
public static Set<String> getSearchableTokens(String textToParse, int maxTokens) {
  BoundedHashSet<String> tokens = BoundedHashSet.create(maxTokens);
  addSearchableTokens(tokens, textToParse, EnumSet.noneOf(ParseOptions.class));
  return tokens;
}
/**
 * Analyzes {@code textToParse} and adds each resulting token to {@code searchableTokens}
 * until the set's limit is reached. Reserved and tag tokens are extracted verbatim first;
 * only the remaining text is run through the analyzer.
 */
public static void addSearchableTokens(BoundedHashSet<String> searchableTokens,
    String textToParse, EnumSet<ParseOptions> parseOptions) {
  String remainingText = extractSpecialTokens(searchableTokens, textToParse, parseOptions);
  try {
    TokenStream stream = ANALYZER.tokenStream(null, new StringReader(remainingText));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    try {
      // Lucene consume protocol: reset(), incrementToken() until exhausted, then end().
      stream.reset();
      while (!searchableTokens.limitReached() && stream.incrementToken()) {
        searchableTokens.add(term.toString());
      }
      stream.end();
    } finally {
      stream.close();
    }
  } catch (IOException e) {
    // A StringReader backs the stream, so an IOException should be impossible here.
    throw new RuntimeException(e);
  }
}
/**
 * Pulls reserved tokens and tag tokens out of {@code textToParse}, adding them directly to
 * {@code searchableTokens} (subject to {@code parseOptions}), and returns the leftover text
 * to be run through the analyzer.
 */
private static String extractSpecialTokens(BoundedHashSet<String> searchableTokens,
    String textToParse, EnumSet<ParseOptions> parseOptions) {
  String[] tokens = textToParse.split("\\s+");
  StringBuilder remainingText = new StringBuilder();
  for (int tokIdx = 0; (tokIdx < tokens.length) && !searchableTokens.limitReached();
      tokIdx++) {
    // split() yields an empty first element when the input starts with whitespace;
    // skip it instead of feeding it to the matchers / remaining text.
    if (tokens[tokIdx].isEmpty()) {
      continue;
    }
    // Locale.ROOT keeps matching stable regardless of the JVM default locale.
    String token = tokens[tokIdx].toLowerCase(Locale.ROOT);
    if (ReservedToken.conflictsWithAnyReservedToken(token)) {
      // Doing this prioritizes conflicting tokens in the text being parsed. Not doing this
      // causes us to deal with reserved token parsing nuances. Revisit if this becomes an issue.
      if (!parseOptions.contains(ParseOptions.EXCLUDE_RESERVED_TOKENS)) {
        searchableTokens.add(token);
      }
      // else remove this token from the searchable token stream by not adding it to
      // remainingText.
    } else if (TagUtil.TAG_PREFIX_PATTERN.matcher(token).find()) {
      // Tags are indexed verbatim (lowercased) rather than run through the analyzer.
      searchableTokens.add(token);
    } else {
      remainingText.append(token);
      remainingText.append(' ');
    }
  }
  return remainingText.toString();
}
/**
 * This class is a combination of the StandardAnalyzer and the EnglishAnalyzer modified to
 * use the KStemFilter and a larger stop word list.
 */
// TODO(avaliani): Consider adding a filter to handle contractions like isn't.
private static final class KStemEnglishAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  private static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  private static final Version LUCENE_VERSION = Version.LUCENE_43;

  private static final CharArraySet STOP_WORDS;
  static {
    // This list is larger than the included stop word list in lucene.
    // Source: http://www.textfixer.com/resources/common-english-words.txt
    String stopWords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your";
    STOP_WORDS =
        new CharArraySet(LUCENE_VERSION, ImmutableSet.copyOf(stopWords.split(",")), true);
  }

  public KStemEnglishAnalyzer() {
    super(LUCENE_VERSION, STOP_WORDS);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    tokenizer.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    // Filter chain: standard -> strip possessives -> lowercase -> drop stop words -> KStem.
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new EnglishPossessiveFilter(matchVersion, chain);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    chain = new KStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain) {
      @Override
      protected void setReader(final Reader reader) throws IOException {
        // Re-apply the max token length whenever the reusable components get a new reader.
        tokenizer.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
        super.setReader(reader);
      }
    };
  }
}
}