// ================================================================================================= // Copyright 2011 Twitter, Inc. // ------------------------------------------------------------------------------------------------- // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this work except in compliance with the License. // You may obtain a copy of the License in the LICENSE file, or at: // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ================================================================================================= package com.twitter.common.text.tokenizer; import java.nio.CharBuffer; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.annotation.Nullable; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.lucene.util.AttributeSource; import com.twitter.common.text.token.TwitterTokenStream; import com.twitter.common.text.token.attribute.TokenType; /** * Tokenizes text based on regular expressions of word delimiters and punctuation characters. */ public class RegexTokenizer extends TwitterTokenStream { private Pattern delimiterPattern; private int punctuationGroup = 0; private boolean keepPunctuation = false; private List<CharBuffer> tokens; private List<TokenType> tokenTypes; private int tokenIndex = 0; // please use Builder instead. protected RegexTokenizer() { } protected RegexTokenizer(AttributeSource attributeSource) { super(attributeSource); } protected void setDelimiterPattern(Pattern delimiterPattern) { this.delimiterPattern = delimiterPattern; } protected void setPunctuationGroupInDelimiterPattern(int group) { this.punctuationGroup = group; } protected void setKeepPunctuation(boolean keepPunctuation) { this.keepPunctuation = keepPunctuation; } @Override public final boolean incrementToken() { if (tokenIndex >= tokens.size()) { return false; } CharBuffer token = tokens.get(tokenIndex); updateOffsetAndLength(token.position(), token.limit() - token.position()); updateType(tokenTypes.get(tokenIndex)); tokenIndex++; return true; } @Override public void reset() { CharSequence input = inputCharSequence(); // reset termAttr clearAttributes(); // reset tokens tokens = Lists.newArrayList(); tokenTypes = Lists.newArrayList(); // reset tokenIndex tokenIndex = 0; if (input.length() == 0) { return; } else if (input.length() == 1) { char c = input.charAt(0); if (isSpace(c)) { return; } else if (isLetter(c)) { tokens.add(CharBuffer.wrap(input)); tokenTypes.add(TokenType.TOKEN); return; } } Matcher matcher = delimiterPattern.matcher(input); int lastMatch = 0; while (matcher.find()) { if (matcher.start() != lastMatch) { tokens.add(CharBuffer.wrap(input, lastMatch, matcher.start())); tokenTypes.add(TokenType.TOKEN); } if (keepPunctuation && matcher.start(punctuationGroup) >= 0) { tokens.add(CharBuffer.wrap(input, matcher.start(punctuationGroup), matcher.end(punctuationGroup))); tokenTypes.add(TokenType.PUNCTUATION); } lastMatch = matcher.end(); } if (lastMatch < input.length()) { tokens.add(CharBuffer.wrap(input, lastMatch, input.length())); tokenTypes.add(TokenType.TOKEN); } } /** * Checks if a given character is a space or not. * A subclass can override this method to skip applying Regex * for an input with single space character. * * @param c a character to examine * @return true if a given character is a space. */ protected boolean isSpace(char c) { return false; } /** * Checks if a given character is a letter or not. * A subclass can override this method to skip applying Regex * for an input with single letter character. * * @param c a character to examine * @return true if a given character is a letter. */ protected boolean isLetter(char c) { return false; } /** * Builder for RegexTokenizer. * * @author Keita Fujii */ public static final class Builder extends AbstractBuilder<RegexTokenizer, Builder> { @Override protected RegexTokenizer buildTokenizer(@Nullable AttributeSource attributeSource) { if (attributeSource == null) { return new RegexTokenizer(); } else { return new RegexTokenizer(attributeSource); } } } public abstract static class AbstractBuilder<N extends RegexTokenizer, T extends AbstractBuilder<N, T>> { private Pattern delimiterPattern; private int punctuationGroup = 0; private boolean keepPunctuation = false; @SuppressWarnings("unchecked") protected T self() { return (T) this; } /** * Sets the Regex pattern of the delimiter. * * An input text is tokenized by the CharSequence * specified by this pattern. * * @param delimiterPattern Regex pattern of delimiter. * @return this Builder object */ public T setDelimiterPattern(Pattern delimiterPattern) { this.delimiterPattern = delimiterPattern; return self(); } /** * Sets the ID of the group in delimiterPattern that should * be handled as punctuation. * For example, you can set delimiterPattern as "([.,])\\s+" * and punctuationGroup as 1 in order to detect comma * and period as punctuations. * * @param group group ID of punctuation in delimiterPattern. * @return this Builder object */ public T setPunctuationGroupInDelimiterPattern(int group) { this.punctuationGroup = group; return self(); } /** * Specifies whether to keep punctuations (which is specified * by delimiterPattern and punctuationGroupInDelimiterPattern) * in the output token stream. * * @param keepPunctuation true to keep delimiters. false otherwise. * @return this Builder object. */ public T setKeepPunctuation(boolean keepPunctuation) { this.keepPunctuation = keepPunctuation; return self(); } protected abstract N buildTokenizer(@Nullable AttributeSource attributeSource); private void initialize(N tokenizer) { Preconditions.checkNotNull(delimiterPattern); Preconditions.checkArgument(punctuationGroup >= 0); tokenizer.setDelimiterPattern(delimiterPattern); tokenizer.setPunctuationGroupInDelimiterPattern(punctuationGroup); tokenizer.setKeepPunctuation(keepPunctuation); } public N build() { N tokenizer = buildTokenizer(null); initialize(tokenizer); return tokenizer; } public N build(AttributeSource attributeSource) { N tokenizer = buildTokenizer(attributeSource); initialize(tokenizer); return tokenizer; } } }