// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.token;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.twitter.common.text.token.attribute.PartOfSpeechAttribute;
import com.twitter.common.text.token.attribute.TokenGroupAttribute;
import com.twitter.common.text.token.attribute.TokenGroupAttributeImpl;
/**
 * Reproduces the result of tokenization if an input text is an instance of
 * TokenizedCharSequence. Otherwise, passes the input text to the downstream
 * TwitterTokenStream.
 *
 * <p>The mode is chosen on every call to {@link #reset(CharSequence)}: a
 * TokenizedCharSequence switches the stream into replay mode, any other input
 * switches it into pass-through mode.
 */
public class TokenizedCharSequenceStream extends TokenProcessor {
  // Optional attributes. With the one-argument constructor each of these is null when
  // hasAttribute() reports that the attribute is not available; incrementToken() then
  // skips the corresponding field of the replayed token.
  private final PartOfSpeechAttribute posAttr;
  private final PositionIncrementAttribute incAttr;
  private final TokenGroupAttributeImpl groupAttr;

  // Non-null only while replaying an already-tokenized input.
  private TokenizedCharSequence tokenized = null;
  // Index of the next token of 'tokenized' to replay.
  private int currentIndex = 0;

  /**
   * Constructor.
   * If an input text is not tokenized (is not an instance of TokenizedCharSequence),
   * this uses inputStream to tokenize it.
   *
   * @param inputStream a token stream to tokenize a text if it's not tokenized yet.
   */
  public TokenizedCharSequenceStream(TwitterTokenStream inputStream) {
    super(inputStream);

    // Only pick up attributes that are already present; nothing new is registered here.
    posAttr = hasAttribute(PartOfSpeechAttribute.class)
        ? getAttribute(PartOfSpeechAttribute.class)
        : null;
    incAttr = hasAttribute(PositionIncrementAttribute.class)
        ? getAttribute(PositionIncrementAttribute.class)
        : null;
    // The implementation type is required because incrementToken() calls setSequence() on it.
    groupAttr = hasAttribute(TokenGroupAttribute.class)
        ? (TokenGroupAttributeImpl) getAttribute(TokenGroupAttribute.class)
        : null;
  }

  /**
   * Constructor.
   * This can only accept an already-tokenized text (TokenizedCharSequence) as input.
   * The wrapped stream is a placeholder that never produces tokens and whose
   * {@code reset()} throws IllegalArgumentException.
   */
  public TokenizedCharSequenceStream() {
    super(new TwitterTokenStream() {
      @Override
      public final boolean incrementToken() {
        return false;
      }

      @Override
      public void reset() {
        // If no inputStream is provided, throw an exception.
        throw new IllegalArgumentException("Input must be an instance of TokenizedCharSequence"
            + " because there is no TwitterTokenStream in the downstream to tokenized a text.");
      }
    });

    // There is no real upstream to supply attributes, so register them on this stream.
    posAttr = addAttribute(PartOfSpeechAttribute.class);
    incAttr = addAttribute(PositionIncrementAttribute.class);
    groupAttr = (TokenGroupAttributeImpl) addAttribute(TokenGroupAttribute.class);
  }

  /**
   * Advances to the next token.
   *
   * <p>In replay mode the next stored token's offset, length and type are copied into
   * this stream, together with part of speech, position increment and token group for
   * each of those attributes that is available. In pass-through mode the call is
   * delegated via {@code incrementInputStream()}.
   *
   * @return true if a token was produced; false if the replayed sequence is exhausted,
   *         or whatever {@code incrementInputStream()} returns in pass-through mode.
   */
  @Override
  public final boolean incrementToken() {
    // If input is already tokenized, reproduce the TwitterTokenStream;
    // otherwise, simply pass it onto the downstream TwitterTokenStream.
    if (tokenized == null) {
      // Input is not tokenized; let inputStream tokenize it.
      return incrementInputStream();
    }

    if (currentIndex >= tokenized.getTokens().size()) {
      // No more tokens.
      return false;
    }

    TokenizedCharSequence.Token token = tokenized.getTokens().get(currentIndex);
    currentIndex++;

    updateOffsetAndLength(token.getOffset(), token.getLength());
    updateType(token.getType());
    if (posAttr != null) {
      posAttr.setPOS(token.getPartOfSpeech());
    }
    if (incAttr != null) {
      incAttr.setPositionIncrement(token.getPositionIncrement());
    }
    if (groupAttr != null) {
      groupAttr.setSequence(token.getGroup());
    }
    return true;
  }

  /**
   * Prepares this stream for a new input.
   *
   * <p>A TokenizedCharSequence switches the stream into replay mode starting at its first
   * token. Any other input (including null) switches it into pass-through mode and is
   * forwarded to {@code super.reset(input)}.
   *
   * @param input the text to process.
   */
  @Override
  public void reset(CharSequence input) {
    // Check if input is already tokenized or not.
    if (input instanceof TokenizedCharSequence) {
      clearAttributes();
      tokenized = (TokenizedCharSequence) input;
      currentIndex = 0;
      updateInputCharSequence(tokenized);
      return;
    }

    // Leave replay mode BEFORE delegating. The delegation may throw (the placeholder
    // stream of the no-argument constructor throws from reset(), which this call is
    // expected to reach); clearing first guarantees that a failed reset can never leave
    // the stream replaying tokens that belong to the previous input.
    tokenized = null;
    currentIndex = 0;

    // Otherwise, let inputStream tokenize the input.
    super.reset(input);
  }
}