// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.token;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.lucene.util.AttributeSource;
import com.twitter.common.text.example.TokenizerUsageExample;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
/**
* Abstraction to enumerate a sequence of tokens. This class represents the central abstraction in
* Twitter's text processing library, and is similar to Lucene's TokenStream, with the following
* exceptions:
*
* <ul>
* <li>This class assumes that the input text is a {@link CharSequence}.
* <li>Calls support chaining.
* <li>Instances are reusable.
* </ul>
*
* For an annotated example of how this class is used in practice, refer to
* {@link TokenizerUsageExample}.
*/
public abstract class TokenStream extends AttributeSource {
  /**
   * Creates a {@code TokenStream} backed by the default attribute factory of
   * {@link AttributeSource}.
   */
  public TokenStream() {
    // The no-argument superclass constructor is invoked implicitly.
  }

  /**
   * Creates a {@code TokenStream} whose {@code Attribute} instances are produced by the given
   * {@code AttributeFactory}.
   *
   * @param factory attribute factory handed to the superclass constructor
   */
  protected TokenStream(AttributeSource.AttributeFactory factory) {
    super(factory);
  }

  /**
   * Creates a {@code TokenStream} that shares its attributes with the given source.
   *
   * @param input attribute source handed to the superclass constructor
   */
  protected TokenStream(AttributeSource input) {
    super(input);
  }

  /**
   * Advances the stream to the next token. Consumers call this repeatedly to walk the stream.
   *
   * @return {@code true} if the stream moved to another token; {@code false} at end of stream
   */
  public abstract boolean incrementToken();

  /**
   * Resets this {@code TokenStream} (and also downstream tokens if they exist) so that it parses
   * a new input.
   *
   * @param input new text to parse.
   */
  public abstract void reset(CharSequence input);

  /**
   * Collects the term string of every remaining token into a list.
   *
   * <p>The stream is advanced with {@link #incrementToken()} until it reports end of stream, so
   * the tokens are consumed by this call.
   *
   * @return the contents of the token stream as a list of {@code Strings}, in stream order.
   * @throws UnsupportedOperationException if this instance has no
   *     {@link CharSequenceTermAttribute}
   */
  public List<String> toStringList() {
    // Fail up front: without the term attribute there is nothing to read term strings from.
    if (!hasAttribute(CharSequenceTermAttribute.class)) {
      throw new UnsupportedOperationException("This instance does not support toStringList()"
          + " because it does not support CharSequenceTermAttribute.");
    }

    CharSequenceTermAttribute term = getAttribute(CharSequenceTermAttribute.class);
    List<String> result = Lists.newArrayList();
    while (incrementToken()) {
      result.add(term.getTermString());
    }
    return result;
  }

  /**
   * Looks up an instance of the requested class in this TokenStream chain.
   *
   * <p>This implementation only inspects this instance itself.
   *
   * @param cls class to search for; must not be {@code null}
   * @return this object cast to {@code cls} when it is an instance of that class, otherwise
   *     {@code null}
   */
  public <T extends TokenStream> T getInstanceOf(Class<T> cls) {
    Preconditions.checkNotNull(cls);
    return cls.isInstance(this) ? cls.cast(this) : null;
  }
}