// ================================================================================================= // Copyright 2011 Twitter, Inc. // ------------------------------------------------------------------------------------------------- // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this work except in compliance with the License. // You may obtain a copy of the License in the LICENSE file, or at: // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ================================================================================================= package com.twitter.common.text.extractor; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.base.Preconditions; import com.twitter.common.text.token.TokenStream; import com.twitter.common.text.token.attribute.CharSequenceTermAttribute; /** * Extracts entities from text according to a given regular expression. */ public class RegexExtractor extends TokenStream { private final CharSequenceTermAttribute charSeqTermAtt = addAttribute(CharSequenceTermAttribute.class); private Pattern regexPattern; private int startGroup = 0; private int endGroup = 0; private char triggeringChar = 0; private Matcher matcher = null; /** * Protected constructor for subclass builders, clients should use a builder to create an * instance. */ protected RegexExtractor() { } /** * Sets the regular expression used in this {@code RegexExtractor}. * * @param pattern regular expression defining the entities to be extracted */ protected void setRegexPattern(Pattern pattern) { this.regexPattern = pattern; } /** * Sets the regular expression and start/end group ID used in this {@code RegexExtractor}. * * @param pattern Regex pattern of a substring to be replaced. * @param startGroup ID of the group in the pattern that matches the beginning * of the substring being replaced. Set to 0 to match the entire pattern. * @param endGroup ID of the group in the pattern that matches the end * of the substring being replace. Set to 0 to match the entire pattern. */ protected void setRegexPattern(Pattern pattern, int startGroup, int endGroup) { this.regexPattern = pattern; this.startGroup = startGroup; this.endGroup = endGroup; } /** * Sets a character that must appear in the input text. If a specified character does not appear * in the input text, this {@code RegexExtractor} does not extract entities from the text. * Specifying a {@code triggeringChar} may improve the performance by skipping unnecessary pattern * matching. * * @param triggeringChar a character that must appear in the text */ protected void setTriggeringChar(char triggeringChar) { Preconditions.checkNotNull(triggeringChar); this.triggeringChar = triggeringChar; } /** * Reset the extractor to use a new {@code CharSequence} as input. * * @param input {@code CharSequence} from which to extract the entities. */ public void reset(CharSequence input) { Preconditions.checkNotNull(input); charSeqTermAtt.setTermBuffer(input); if (triggeringChar > 0) { // triggeringChar is specified. boolean foundTriggeringChar = false; for (int i = 0; i < input.length(); i++) { if (triggeringChar == input.charAt(i)) { foundTriggeringChar = true; break; } } if (!foundTriggeringChar) { // No triggering char found. No extraction performed. matcher = null; return; } } if (regexPattern != null) { matcher = regexPattern.matcher(input); } } @Override public boolean incrementToken() { if (matcher != null && matcher.find()) { int start = matcher.start(startGroup); int end = matcher.end(endGroup); clearAttributes(); charSeqTermAtt.setOffset(start); charSeqTermAtt.setLength(end - start); return true; } else { return false; } } public static class Builder extends AbstractBuilder<RegexExtractor, Builder> { public Builder() { super(new RegexExtractor()); } } public abstract static class AbstractBuilder<N extends RegexExtractor, T extends AbstractBuilder<N, T>> { private final N extractor; protected AbstractBuilder(N transformer) { this.extractor = Preconditions.checkNotNull(transformer); } @SuppressWarnings("unchecked") protected T self() { return (T) this; } public T setRegexPattern(Pattern pattern) { Preconditions.checkNotNull(pattern); extractor.regexPattern = pattern; return self(); } public T setRegexPattern(Pattern pattern, int startGroup, int endGroup) { Preconditions.checkNotNull(pattern); Preconditions.checkArgument(startGroup >= 0); Preconditions.checkArgument(endGroup >= 0); extractor.setRegexPattern(pattern, startGroup, endGroup); return self(); } public T setTriggeringChar(char triggeringChar) { Preconditions.checkArgument(triggeringChar > 0); extractor.setTriggeringChar(triggeringChar); return self(); } public N build() { return extractor; } } }