// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.token;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeSource;
import com.twitter.common.text.example.TokenizerUsageExample;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
* Abstraction to enumerate a sequence of tokens. This class represents the central abstraction in
* Twitter's text processing library, and is similar to Lucene's TokenStream, with the following
* exceptions:
*
* <ul>
* <li>This class assumes that the input text is a {@link CharSequence}.
* <li>Calls support chaining.
* <li>Instances are reusable.
* </ul>
*
* For an annotated example of how this class is used in practice, refer to
* {@link TokenizerUsageExample}.
*/
public abstract class TwitterTokenStream extends TokenStream {
private final CharSequenceTermAttribute termAttribute = addAttribute(CharSequenceTermAttribute.class);
private final TokenTypeAttribute typeAttribute = addAttribute(TokenTypeAttribute.class);
/**
* Constructs a {@code TwitterTokenStream} using the default attribute factory.
*/
public TwitterTokenStream() {
super();
}
/**
* Constructs a {@code TwitterTokenStream} using the supplied {@code AttributeFactory} for creating new
* {@code Attribute} instances.
*
* @param factory attribute factory
*/
protected TwitterTokenStream(AttributeSource.AttributeFactory factory) {
super(factory);
}
/**
* Constructs a {@code TwitterTokenStream} that uses the same attributes as the supplied one.
*
* @param input attribute source
*/
protected TwitterTokenStream(AttributeSource input) {
super(input);
}
/**
* Consumers call this method to advance the stream to the next token.
*
* @return false for end of stream; true otherwise
*/
public abstract boolean incrementToken();
/**
* Resets this {@code TwitterTokenStream} (and also downstream tokens if they exist) to parse a new
* input.
*/
public void reset(CharSequence input) {
updateInputCharSequence(input);
reset();
};
/**
* Subclasses should implement reset() to reinitiate the processing.
* Input CharSequence is available as inputCharSequence().
*/
public abstract void reset();
/**
* Converts this token stream into a list of {@code Strings}.
*
* @return the contents of the token stream as a list of {@code Strings}.
*/
public List<String> toStringList() {
List<String> tokens = Lists.newArrayList();
while (incrementToken()) {
tokens.add(term().toString());
}
return tokens;
}
/**
* Searches and returns an instance of a specified class in this TwitterTokenStream chain.
*
* @param cls class to search for
* @return instance of the class {@code cls} if found or {@code null} if not found
*/
public <T extends TwitterTokenStream> T getInstanceOf(Class<T> cls) {
Preconditions.checkNotNull(cls);
if (cls.isInstance(this)) {
return cls.cast(this);
}
return null;
}
/**
* Returns the offset of the current token.
*
* @return offset of the current token.
*/
public int offset() {
return termAttribute.getOffset();
}
/**
* Returns the length of the current token.
*
* @return length of the current token.
*/
public int length() {
return termAttribute.getLength();
}
/**
* Returns the {@code CharSequence} of the current token.
*
* @return {@code CharSequence} of the current token
*/
public CharSequence term() {
return termAttribute.getTermCharSequence();
}
/**
* Returns the input {@code CharSequence}.
*
* @return input {@code CharSequence}
*/
public CharSequence inputCharSequence() {
return termAttribute.getCharSequence();
}
/**
* Returns the type of the current token.
*
* @return type of the current token.
*/
public TokenType type() {
return typeAttribute.getType();
}
/**
* Sets the input {@code CharSequence}.
*
* @param inputCharSequence {@code CharSequence} analyzed by this
* {@code TwitterTokenStream}
*/
protected void updateInputCharSequence(CharSequence inputCharSequence) {
termAttribute.setCharSequence(inputCharSequence);
}
/**
* Updates the offset and length of the current token.
*
* @param offset new offset
* @param length new length
*/
protected void updateOffsetAndLength(int offset, int length) {
termAttribute.setOffset(offset);
termAttribute.setLength(length);
}
/**
* Updates the type of the current token.
*
* @param type new type
*/
protected void updateType(TokenType type) {
typeAttribute.setType(type);
}
@Override
public boolean equals(Object target) {
// Lucene's AttributeSource.equals() returns true if this has the same
// set of attributes as the target one. Let's make it more strict.
return this == target;
}
@Override
public int hashCode() {
return System.identityHashCode(this);
}
}