// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.tokenizer;
import java.nio.CharBuffer;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.twitter.common.text.token.TokenStream;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
 * Tokenizes text based on regular expressions of word delimiters and punctuation characters.
 *
 * <p>The input handed to {@link #reset(CharSequence)} is scanned once with the configured
 * delimiter pattern. Every non-empty stretch of text between two delimiter matches (and after
 * the last one) becomes a {@link TokenType#TOKEN}. When punctuation is kept, the non-empty text
 * captured by the configured punctuation group of each delimiter match becomes a
 * {@link TokenType#PUNCTUATION} token. {@link #incrementToken()} then replays the collected
 * tokens one by one.
 *
 * <p>Instances are meant to be created through {@link Builder}.
 */
public class RegexTokenizer extends TokenStream {
  private Pattern delimiterPattern;
  // Group 0 is the whole delimiter match.
  private int punctuationGroup = 0;
  private boolean keepPunctuation = false;

  // Start out empty (rather than null) so that incrementToken() called before the first
  // reset() simply reports "no more tokens" instead of throwing a NullPointerException.
  // Each CharBuffer is only used as a (position, limit) pair describing a span of the input.
  private List<CharBuffer> tokens = Lists.newArrayList();
  private List<TokenType> tokenTypes = Lists.newArrayList();
  private int tokenIndex = 0;

  private final CharSequenceTermAttribute termAttr;
  private final TokenTypeAttribute typeAttr;

  /**
   * Creates an unconfigured tokenizer and registers the attributes it populates.
   * Please use {@link Builder} instead of calling this directly.
   */
  protected RegexTokenizer() {
    termAttr = addAttribute(CharSequenceTermAttribute.class);
    typeAttr = addAttribute(TokenTypeAttribute.class);
  }

  /**
   * Sets the pattern whose matches separate tokens.
   *
   * @param delimiterPattern regex pattern of the delimiter
   */
  protected void setDelimiterPattern(Pattern delimiterPattern) {
    this.delimiterPattern = delimiterPattern;
  }

  /**
   * Sets which capturing group of the delimiter pattern holds punctuation.
   *
   * @param group group index inside the delimiter pattern; 0 denotes the whole match
   */
  protected void setPunctuationGroupInDelimiterPattern(int group) {
    this.punctuationGroup = group;
  }

  /**
   * Sets whether punctuation found inside delimiters is emitted as tokens.
   *
   * @param keepPunctuation true to emit punctuation tokens, false to drop them
   */
  protected void setKeepPunctuation(boolean keepPunctuation) {
    this.keepPunctuation = keepPunctuation;
  }

  /**
   * Advances to the next token collected by the last {@link #reset(CharSequence)} and
   * publishes its offset, length and type through the registered attributes.
   *
   * @return true if a token was published, false if no tokens are left (which is also the
   *     case when {@code reset} has not been called yet)
   */
  @Override
  public boolean incrementToken() {
    if (tokenIndex >= tokens.size()) {
      return false;
    }

    CharBuffer token = tokens.get(tokenIndex);
    // CharBuffer.wrap(seq, start, end) yields position == start and limit == end.
    termAttr.setOffset(token.position());
    termAttr.setLength(token.limit() - token.position());
    typeAttr.setType(tokenTypes.get(tokenIndex));

    tokenIndex++;
    return true;
  }

  /**
   * Tokenizes {@code input} and rewinds this stream to the first resulting token.
   *
   * @param input text to tokenize
   * @throws NullPointerException if {@code input} is null or no delimiter pattern has been set
   */
  @Override
  public void reset(CharSequence input) {
    // Validate before touching any state so a failed call leaves the stream as it was.
    Preconditions.checkNotNull(input, "input must not be null");
    Preconditions.checkNotNull(delimiterPattern,
        "delimiterPattern must be set before calling reset()");

    List<CharBuffer> newTokens = Lists.newArrayList();
    List<TokenType> newTokenTypes = Lists.newArrayList();

    Matcher matcher = delimiterPattern.matcher(input);
    int lastMatch = 0;
    while (matcher.find()) {
      // Text between the previous delimiter and this one, if any, is a regular token.
      if (matcher.start() > lastMatch) {
        newTokens.add(CharBuffer.wrap(input, lastMatch, matcher.start()));
        newTokenTypes.add(TokenType.TOKEN);
      }

      if (keepPunctuation) {
        int punctuationStart = matcher.start(punctuationGroup);
        int punctuationEnd = matcher.end(punctuationGroup);
        // start is -1 when the group did not take part in the match; start == end when it
        // matched the empty string. Neither case carries any punctuation to emit.
        if (punctuationStart >= 0 && punctuationEnd > punctuationStart) {
          newTokens.add(CharBuffer.wrap(input, punctuationStart, punctuationEnd));
          newTokenTypes.add(TokenType.PUNCTUATION);
        }
      }

      lastMatch = matcher.end();
    }

    // Trailing text after the last delimiter.
    if (lastMatch < input.length()) {
      newTokens.add(CharBuffer.wrap(input, lastMatch, input.length()));
      newTokenTypes.add(TokenType.TOKEN);
    }

    // Publish the new state only after the scan has completed.
    termAttr.setCharSequence(input);
    tokens = newTokens;
    tokenTypes = newTokenTypes;
    tokenIndex = 0;
  }

  /**
   * Builder for RegexTokenizer.
   *
   * @author Keita Fujii
   */
  public static final class Builder extends AbstractBuilder<RegexTokenizer, Builder> {
    public Builder() {
      super(new RegexTokenizer());
    }
  }

  /**
   * Base builder that configures a tokenizer instance supplied by the concrete builder.
   *
   * @param <N> type of tokenizer being configured
   * @param <T> concrete builder type, returned from the setters to allow chaining
   */
  public abstract static class
      AbstractBuilder<N extends RegexTokenizer, T extends AbstractBuilder<N, T>> {
    private final N tokenizer;

    /**
     * @param tokenizer tokenizer instance to configure; must not be null
     */
    protected AbstractBuilder(N tokenizer) {
      this.tokenizer = Preconditions.checkNotNull(tokenizer);
    }

    /**
     * Returns this builder typed as the concrete builder class.
     */
    @SuppressWarnings("unchecked")
    protected T self() {
      return (T) this;
    }

    /**
     * Sets the Regex pattern of the delimiter.
     *
     * An input text is split into tokens at every match of this pattern.
     *
     * @param delimiterPattern Regex pattern of delimiter.
     * @return this Builder object
     */
    public T setDelimiterPattern(Pattern delimiterPattern) {
      tokenizer.setDelimiterPattern(delimiterPattern);
      return self();
    }

    /**
     * Sets the ID of the group in delimiterPattern that should
     * be handled as punctuation.
     * For example, you can set delimiterPattern as "([.,])\\s+"
     * and punctuationGroup as 1 in order to detect comma
     * and period as punctuations.
     *
     * @param group group ID of punctuation in delimiterPattern.
     * @return this Builder object
     */
    public T setPunctuationGroupInDelimiterPattern(int group) {
      tokenizer.setPunctuationGroupInDelimiterPattern(group);
      return self();
    }

    /**
     * Specifies whether to keep punctuations (which is specified
     * by delimiterPattern and punctuationGroupInDelimiterPattern)
     * in the output token stream.
     *
     * @param keepPunctuation true to keep punctuations. false otherwise.
     * @return this Builder object.
     */
    public T setKeepPunctuation(boolean keepPunctuation) {
      tokenizer.setKeepPunctuation(keepPunctuation);
      return self();
    }

    /**
     * Returns the configured tokenizer. This is the instance handed to the constructor, so
     * repeated calls return the same object.
     *
     * @return the configured tokenizer
     */
    public N build() {
      return tokenizer;
    }
  }
}