// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.token;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.PartOfSpeechAttribute;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
* Reproduces the result of tokenization if an input text is an instance of
* TokenizedCharSequence. Otherwise, passes the input text to downstream
* TokenStream.
*/
public class TokenizedCharSequenceStream extends TokenStream {
private final TokenStream inputStream;
private final CharSequenceTermAttribute termAttr;
private final TokenTypeAttribute typeAttr;
private final PartOfSpeechAttribute posAttr;
private TokenizedCharSequence tokenized = null;
private int currentIndex = 0;
/**
* Constructor.
* If an input text is not tokenized (is not an instance of TokenizedCharSequence),
* this uses inputStream to tokenize it.
*
* @param inputStream a token stream to tokenize a text if it's not tokenized yet.
*/
public TokenizedCharSequenceStream(TokenStream inputStream) {
super(inputStream.cloneAttributes());
this.inputStream = inputStream;
termAttr = addAttribute(CharSequenceTermAttribute.class);
typeAttr = addAttribute(TokenTypeAttribute.class);
if (hasAttribute(PartOfSpeechAttribute.class)) {
posAttr = getAttribute(PartOfSpeechAttribute.class);
} else {
posAttr = null;
}
}
/**
* Constructor.
* This can only accept an already-tokenized text (TokenzedCharSequence) as input.
*/
public TokenizedCharSequenceStream() {
this.inputStream = null;
termAttr = addAttribute(CharSequenceTermAttribute.class);
typeAttr = addAttribute(TokenTypeAttribute.class);
posAttr = addAttribute(PartOfSpeechAttribute.class);
}
@Override
public boolean incrementToken() {
// If input is already tokenized, reproduce the TokenStream;
// otherwise, simply pass it onto the downstream TokenStream.
if (tokenized == null) {
// Input is not tokenized; let inputStream tokenize it.
if (!inputStream.incrementToken()) {
return false;
}
restoreState(inputStream.captureState());
return true;
}
if (currentIndex >= tokenized.getTokens().size()) {
// No more tokens.
return false;
}
TokenizedCharSequence.Token token = tokenized.getTokens().get(currentIndex);
termAttr.setOffset(token.getOffset());
termAttr.setLength(token.getLength());
typeAttr.setType(token.getType());
if (posAttr != null) {
posAttr.setPOS(token.getPartOfSpeech());
}
currentIndex++;
return true;
}
@Override
public void reset(CharSequence input) {
// Check if input is already tokenized or not.
if (input instanceof TokenizedCharSequence) {
tokenized = (TokenizedCharSequence) input;
currentIndex = 0;
termAttr.setCharSequence(tokenized);
} else if (inputStream == null) {
// If no inputStream is provided, throw an exception.
throw new IllegalArgumentException("Input must be an instance of TokenizedCharSequence"
+ " because there is no TokenStream in the downstream to tokenized a text.");
} else {
// Otherwise, let inputStream tokenize the input.
inputStream.reset(input);
tokenized = null;
}
}
}