package com.twitter.common.text;
import com.google.common.base.Preconditions;
import com.twitter.common.text.combiner.PunctuationExceptionCombiner;
import com.twitter.common.text.token.TokenStream;
import com.twitter.common.text.token.TokenizedCharSequence;
import com.twitter.common.text.token.TokenizedCharSequenceStream;
import com.twitter.common.text.tokenizer.LatinTokenizer;
import java.util.List;
/**
 * Base class for tokenizers that build one {@code TokenStream} per instance and reuse it
 * for every tokenization request.
 *
 * <p>The stream is assembled from a {@code LatinTokenizer} wrapped in a
 * {@code PunctuationExceptionCombiner}, passed through {@link #applyDefaultChain(TokenStream)},
 * and finally wrapped in a {@code TokenizedCharSequenceStream}.
 *
 * <p>NOTE(review): every call on an instance goes through the same stream object, and
 * {@link #tokenizeToStrings(CharSequence)} resets it in place. This suggests an instance should
 * not be used from several threads at once; confirm against the {@code TokenStream} contract.
 */
public abstract class TextTokenizer {
  /**
   * Stream returned by {@link #getDefaultTokenStream()}.
   *
   * <p>It is built by a field initializer, so {@link #applyDefaultChain(TokenStream)} is invoked
   * while this superclass is still being constructed.
   */
  protected TokenStream tokenizationStream =
      new TokenizedCharSequenceStream(applyDefaultChain(newBaseTokenizer()));

  /**
   * Wraps the base tokenizer with whatever additional processing the subclass wants.
   *
   * <p>This is called once from this class's field initializer, which runs before the
   * subclass's own field initializers and constructor body. Implementations therefore must not
   * depend on subclass instance state.
   *
   * @param tokenizer the base stream (Latin tokenizer combined with punctuation exceptions)
   * @return the stream to wrap in a {@code TokenizedCharSequenceStream}
   */
  public abstract TokenStream applyDefaultChain(TokenStream tokenizer);

  /**
   * Gives access to the stream this tokenizer uses.
   *
   * @return the {@code TokenStream} held by this instance; the same object on every call unless
   *     the field has been reassigned
   */
  public TokenStream getDefaultTokenStream() {
    return tokenizationStream;
  }

  /**
   * Tokenizes the given text and returns the result as a {@code TokenizedCharSequence}.
   *
   * @param input text to be tokenized; must not be null
   * @return {@code TokenizedCharSequence} created from {@code input} and the default stream
   * @throws NullPointerException if {@code input} is null
   */
  public TokenizedCharSequence tokenize(CharSequence input) {
    Preconditions.checkNotNull(input);
    TokenStream stream = getDefaultTokenStream();
    return TokenizedCharSequence.createFrom(input, stream);
  }

  /**
   * Tokenizes the given text and returns the tokens as plain strings.
   *
   * @param input text to be tokenized; must not be null
   * @return the list produced by the stream's {@code toStringList()} after it is reset to
   *     {@code input}
   * @throws NullPointerException if {@code input} is null
   */
  public List<String> tokenizeToStrings(CharSequence input) {
    Preconditions.checkNotNull(input);
    TokenStream stream = getDefaultTokenStream();
    // Point the shared stream at the new text before reading tokens from it.
    stream.reset(input);
    List<String> tokens = stream.toStringList();
    return tokens;
  }

  /** Builds the stream handed to {@link #applyDefaultChain(TokenStream)}. */
  private static TokenStream newBaseTokenizer() {
    return new PunctuationExceptionCombiner.Builder(
        new LatinTokenizer.Builder().build()).build();
  }
}