// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.detector;
import java.util.regex.Pattern;
import com.google.common.base.Preconditions;
import com.twitter.common.text.token.TokenProcessor;
import com.twitter.common.text.token.TokenStream;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
* Updates {@code TypeAttribute} of a token if the term matches a given regular expression.
*/
public class RegexDetector extends TokenProcessor {
private CharSequenceTermAttribute inputCharSeqTermAttr;
private TokenTypeAttribute typeAttr;
private Pattern regexPattern;
private TokenType type;
protected RegexDetector(TokenStream inputStream) {
super(inputStream);
inputCharSeqTermAttr = inputStream.getAttribute(CharSequenceTermAttribute.class);
typeAttr = addAttribute(TokenTypeAttribute.class);
}
protected void setRegexPattern(Pattern regex) {
this.regexPattern = regex;
}
protected void setType(TokenType type) {
this.type = type;
}
@Override
public boolean incrementToken() {
TokenStream inputStream = getInputStream();
if (!inputStream.incrementToken()) {
return false;
}
clearAttributes();
restoreState(inputStream.captureState());
CharSequence term = inputCharSeqTermAttr.getTermCharSequence();
if (regexPattern.matcher(term).matches()) {
typeAttr.setType(type);
}
return true;
}
public static class Builder extends AbstractBuilder<RegexDetector, Builder> {
public Builder(TokenStream inputStream) {
super(new RegexDetector(inputStream));
}
}
public abstract static class
AbstractBuilder<N extends RegexDetector, T extends AbstractBuilder<N, T>> {
private final N detector;
protected AbstractBuilder(N detector) {
this.detector = Preconditions.checkNotNull(detector);
}
@SuppressWarnings("unchecked")
protected T self() {
return (T) this;
}
protected N detector() {
return detector;
}
public T setRegexPattern(Pattern regex) {
Preconditions.checkNotNull(regex);
detector.setRegexPattern(regex);
return self();
}
public T setType(TokenType type) {
Preconditions.checkNotNull(type);
detector.setType(type);
return self();
}
public N build() {
return detector;
}
}
}