// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.detector;
import java.util.regex.Pattern;
import com.google.common.base.Preconditions;
import com.twitter.common.text.token.TokenStream;
import com.twitter.common.text.token.attribute.TokenType;
/**
* Updates {@code TokenTypeAttribute} of a token to {@code TokenType.PUNCTUATION}
* if the token is identified as punctuation.
*/
public class PunctuationDetector extends RegexDetector {
// Newlines in tweets function as punctuation
private static final String SPACE_EXCEPTIONS = "\\n\\r";
public static final String SPACE_CHAR_CLASS = "\\p{C}\\p{Z}&&[^" + SPACE_EXCEPTIONS + "]";
public static final String SPACE_REGEX = "[" + SPACE_CHAR_CLASS + "]";
public static final String PUNCTUATION_CHAR_CLASS = "\\p{P}\\p{M}\\p{S}" + SPACE_EXCEPTIONS;
public static final String PUNCTUATION_REGEX = "[" + PUNCTUATION_CHAR_CLASS + "]";
private static final Pattern DEFAULT_PUNCTUATION_PATTERN = Pattern.compile(PUNCTUATION_REGEX);
protected PunctuationDetector(TokenStream inputStream) {
super(inputStream);
setRegexPattern(DEFAULT_PUNCTUATION_PATTERN);
setType(TokenType.PUNCTUATION);
}
public static class Builder extends AbstractBuilder<PunctuationDetector, Builder> {
public Builder(TokenStream inputStream) {
super(new PunctuationDetector(inputStream));
}
}
public abstract static class
AbstractBuilder<N extends PunctuationDetector, T extends AbstractBuilder<N, T>> {
private final N detector;
protected AbstractBuilder(N detector) {
this.detector = Preconditions.checkNotNull(detector);
}
@SuppressWarnings("unchecked")
protected T self() {
return (T) this;
}
protected N detector() {
return detector;
}
public N build() {
return detector;
}
}
}