package com.twitter.common.text.combiner;
import com.twitter.common.text.extractor.RegexExtractor;
import com.twitter.common.text.token.TokenStream;
import com.twitter.common.text.token.attribute.TokenType;
import java.util.regex.Pattern;
/**
 * Token combiner whose extractor matches runs of "punctuation exception" characters, i.e.
 * characters that should not be considered punctuation. The default exception set is the
 * single character {@code ♥}; more characters can be added through {@link Builder}.
 *
 * <p>Matches are assigned {@link TokenType#TOKEN} via {@code setType}.
 */
public class PunctuationExceptionCombiner extends ExtractorBasedTokenCombiner {
  /** Characters that are always part of the exception set. */
  private static final String PUNCTUATION_EXCEPTIONS_CHARS = "♥";

  /** Regex matching one or more consecutive default exception characters. */
  private static final String PUNCTUATION_EXCEPTION_REGEX =
      "[" + PUNCTUATION_EXCEPTIONS_CHARS + "]+";

  /** Pre-compiled default pattern, reused when no additional characters are configured. */
  private static final Pattern PUNCTUATION_EXCEPTIONS_PATTERN =
      Pattern.compile(PUNCTUATION_EXCEPTION_REGEX);

  /**
   * Creates a combiner over {@code inputStream} that extracts spans matching
   * {@code exceptionsPattern}.
   *
   * @param inputStream token stream passed to the superclass constructor
   * @param exceptionsPattern pattern handed to the {@link RegexExtractor}
   */
  protected PunctuationExceptionCombiner(TokenStream inputStream, Pattern exceptionsPattern) {
    super(inputStream);
    // NOTE(review): the two 0 arguments are forwarded unchanged from the original code;
    // presumably they select the regex group(s) to extract -- confirm against RegexExtractor.
    setExtractor(new RegexExtractor.Builder().setRegexPattern(exceptionsPattern, 0, 0)
        .build());
    setType(TokenType.TOKEN);
  }

  /**
   * Escapes every character that has a special meaning inside a regex character class
   * (backslash, {@code [}, {@code ]}, {@code ^}, {@code -} and {@code &}) so that the
   * result can be embedded between {@code [} and {@code ]} and match each input character
   * literally.
   *
   * @param chars literal characters; must not be null
   * @return {@code chars} with character-class metacharacters backslash-escaped
   */
  private static String escapeForCharacterClass(String chars) {
    StringBuilder escaped = new StringBuilder(chars.length() + 8);
    for (int i = 0; i < chars.length(); i++) {
      char c = chars.charAt(i);
      switch (c) {
        case '\\':
        case '[':
        case ']':
        case '^':
        case '-':
        case '&':
          // A backslash before a non-alphabetic character always denotes that literal character.
          escaped.append('\\');
          break;
        default:
          break;
      }
      escaped.append(c);
    }
    return escaped.toString();
  }

  /** Builder for {@link PunctuationExceptionCombiner}. */
  public static class Builder {
    /** Default plus additional exception characters (unescaped); null until any are added. */
    private String exceptionChars = null;
    private final TokenStream inputStream;

    /**
     * @param inputStream token stream the built combiner will read from
     */
    public Builder(TokenStream inputStream) {
      this.inputStream = inputStream;
    }

    /**
     * Add additional exception chars. For example, to add . and ! to the list of
     * non-punctuation chars, additionalChars should be ".!"
     *
     * <p>Every character is taken literally: characters such as {@code -}, {@code ]} or
     * {@code ^} are escaped when the pattern is built and must not be pre-escaped by the
     * caller. A null or empty argument is ignored.
     *
     * @param additionalChars Additional characters that should not be considered punctuation
     * @return PunctuationExceptionCombiner builder instance
     */
    public Builder addExceptionChars(String additionalChars) {
      // Guard against null: string concatenation would otherwise append the text "null".
      if (additionalChars == null || additionalChars.isEmpty()) {
        return this;
      }
      if (exceptionChars == null) {
        exceptionChars = PUNCTUATION_EXCEPTIONS_CHARS + additionalChars;
      } else {
        exceptionChars += additionalChars;
      }
      return this;
    }

    /**
     * Builds the combiner. Uses the shared pre-compiled default pattern unless additional
     * exception characters were configured, in which case a new pattern covering the
     * default and additional characters is compiled.
     *
     * @return a new PunctuationExceptionCombiner reading from this builder's input stream
     */
    public PunctuationExceptionCombiner build() {
      Pattern exceptionsPattern = PUNCTUATION_EXCEPTIONS_PATTERN;
      if (exceptionChars != null) {
        // Escape so that caller-supplied characters cannot alter or break the character class.
        exceptionsPattern =
            Pattern.compile("[" + escapeForCharacterClass(exceptionChars) + "]+");
      }
      return new PunctuationExceptionCombiner(inputStream, exceptionsPattern);
    }
  }
}