// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.token;
import java.nio.CharBuffer;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.PartOfSpeechAttribute;
import com.twitter.common.text.token.attribute.TokenGroupAttribute;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
 * Keeps the original text as well as its tokenized tokens.
 *
 * <p>As a {@link CharSequence}, an instance simply delegates to the original text. Equality and
 * hash code are based on the original text only; the tokens are not taken into account.
 *
 * <p>Derived values (string form, token strings, per-type index) are computed lazily and cached
 * in plain fields without any synchronization.
 */
public class TokenizedCharSequence implements CharSequence {
  /**
   * A single token: a slice of the original text together with its type and a part-of-speech tag.
   */
  public static final class Token {
    /** Part-of-speech value used when none is supplied. */
    public static final int DEFAULT_PART_OF_SPEECH = -1;

    // Window over the original text: position() is the token offset, limit() its end.
    private final CharBuffer term;
    private final TokenType type;
    private final int pos;

    /**
     * Creates a token with {@link #DEFAULT_PART_OF_SPEECH}.
     *
     * @param term buffer whose position/limit delimit the token within the original text
     * @param type type of the token
     */
    protected Token(CharBuffer term, TokenType type) {
      this(term, type, DEFAULT_PART_OF_SPEECH);
    }

    /**
     * Creates a token.
     *
     * @param term buffer whose position/limit delimit the token within the original text
     * @param type type of the token
     * @param pos part-of-speech tag of the token
     */
    protected Token(CharBuffer term, TokenType type, int pos) {
      this.term = term;
      this.type = type;
      this.pos = pos;
    }

    /**
     * Returns the token text, i.e. the characters between the buffer's position and limit.
     */
    @Override
    public String toString() {
      return term.toString();
    }

    /**
     * @return the token text as a character sequence (the underlying buffer itself)
     */
    public CharSequence getTerm() {
      return term;
    }

    /**
     * @return the position of the underlying buffer, i.e. where the token starts
     */
    public int getOffset() {
      return term.position();
    }

    /**
     * @return the number of characters between the buffer's position and its limit
     */
    public int getLength() {
      return term.remaining();
    }

    /**
     * @return the type of this token
     */
    public TokenType getType() {
      return type;
    }

    /**
     * @return the part-of-speech tag of this token
     */
    public int getPartOfSpeech() {
      return pos;
    }
  }

  private final CharSequence term;
  private final List<Token> tokens;

  // Lazily computed caches; null until first requested.
  private List<String> strTokens = null;
  private Map<TokenType, List<Token>> typeToTokensMap = null;
  private String strValue = null;

  /**
   * Creates a tokenized sequence over {@code text}.
   *
   * @param text the original text
   * @param tokens tokens of the text; the list is copied, so later changes to it are not seen
   */
  protected TokenizedCharSequence(CharSequence text, List<Token> tokens) {
    // Defensive copy: Collections.unmodifiableList alone is only a view, so a Builder that keeps
    // adding tokens after build() would otherwise alter this instance and stale its caches.
    this.tokens = Collections.unmodifiableList(Lists.newArrayList(tokens));
    this.term = text;
  }

  @Override
  public char charAt(int index) {
    return term.charAt(index);
  }

  @Override
  public int length() {
    return term.length();
  }

  @Override
  public CharSequence subSequence(int fromIndex, int toIndex) {
    return term.subSequence(fromIndex, toIndex);
  }

  /**
   * Returns the original text as a String; the value is cached after the first call.
   */
  @Override
  public String toString() {
    return termString();
  }

  /**
   * Compares by original text only: two instances are equal when their texts have the same
   * string form, regardless of their tokens.
   */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    // instanceof is false for null, so no separate null check is needed.
    if (!(obj instanceof TokenizedCharSequence)) {
      return false;
    }
    return termString().equals(((TokenizedCharSequence) obj).termString());
  }

  /**
   * Returns the hash code of the original text's string form, consistent with
   * {@link #equals(Object)}.
   */
  @Override
  public int hashCode() {
    return termString().hashCode();
  }

  /**
   * Returns all tokens.
   *
   * @return an unmodifiable list of tokens as {@link Token} objects
   */
  public List<Token> getTokens() {
    return tokens;
  }

  /**
   * Returns all tokens as String.
   *
   * <p>The list is built on first use and the same instance is returned on every call, so
   * callers should treat it as read-only.
   *
   * @return a list of tokens as String objects
   */
  public List<String> getTokenStrings() {
    if (strTokens == null) {
      // lazy initialization
      List<String> strings = Lists.newArrayListWithCapacity(tokens.size());
      for (Token token : tokens) {
        strings.add(token.getTerm().toString());
      }
      strTokens = strings;
    }
    return strTokens;
  }

  /**
   * Returns tokens of one or more specified types.
   *
   * <p>Tokens are grouped in the order the types are given; within one type they keep their
   * original order. A type with no tokens contributes nothing. The returned list is a new list
   * owned by the caller.
   *
   * @param types token type(s)
   * @return tokens of the specified type(s); empty (never null) if there are none
   */
  public List<Token> getTokensOf(TokenType... types) {
    Map<TokenType, List<Token>> index = tokensByType();
    List<Token> matched = Lists.newArrayList();
    for (TokenType type : types) {
      List<Token> sameType = index.get(type);
      // The index has no entry for a type that never occurred in this text.
      if (sameType != null) {
        matched.addAll(sameType);
      }
    }
    return matched;
  }

  /**
   * Returns tokens of one or more specified types as Strings.
   *
   * @param types token type(s)
   * @return list of tokens of specified type(s) as String objects; empty if there are none
   */
  public List<String> getTokenStringsOf(TokenType... types) {
    List<Token> matched = getTokensOf(types);
    List<String> strSubtokens = Lists.newArrayListWithCapacity(matched.size());
    for (Token token : matched) {
      strSubtokens.add(token.getTerm().toString());
    }
    return strSubtokens;
  }

  /** Returns the string form of the original text, computing and caching it on first use. */
  private String termString() {
    if (strValue == null) {
      strValue = term.toString();
    }
    return strValue;
  }

  /** Returns the type-to-tokens index, building it on first use. */
  private Map<TokenType, List<Token>> tokensByType() {
    if (typeToTokensMap == null) {
      // lazy initialization
      Map<TokenType, List<Token>> index = Maps.newHashMap();
      for (Token token : tokens) {
        List<Token> sameType = index.get(token.getType());
        if (sameType == null) {
          sameType = Lists.newArrayList();
          index.put(token.getType(), sameType);
        }
        sameType.add(token);
      }
      typeToTokensMap = index;
    }
    return typeToTokensMap;
  }

  /**
   * Incrementally collects tokens over a fixed original text and builds a
   * {@link TokenizedCharSequence} from them.
   */
  public static final class Builder {
    private final CharSequence origText;
    private final List<Token> tokens;

    /**
     * @param originalText the text the tokens refer to; must not be null
     */
    public Builder(CharSequence originalText) {
      this.origText = Preconditions.checkNotNull(originalText, "originalText");
      this.tokens = Lists.newArrayList();
    }

    /**
     * Adds a token of type {@link TokenType#TOKEN}.
     *
     * @param offset start of the token within the original text
     * @param length number of characters in the token
     * @return this builder
     */
    public Builder addToken(int offset, int length) {
      return addToken(offset, length, TokenType.TOKEN);
    }

    /**
     * Adds a token of the given type with part of speech {@link PartOfSpeechAttribute#UNKNOWN}.
     *
     * @param offset start of the token within the original text
     * @param length number of characters in the token
     * @param type type of the token; must not be null
     * @return this builder
     */
    public Builder addToken(int offset, int length, TokenType type) {
      // NOTE(review): this default is PartOfSpeechAttribute.UNKNOWN while createFrom() falls back
      // to Token.DEFAULT_PART_OF_SPEECH -- confirm the two constants are meant to agree.
      return addToken(offset, length, type, PartOfSpeechAttribute.UNKNOWN);
    }

    /**
     * Adds a token.
     *
     * @param offset start of the token within the original text; must be non-negative
     * @param length number of characters in the token; must be non-negative
     * @param type type of the token; must not be null
     * @param pos part-of-speech tag of the token
     * @return this builder
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative
     * @throws IndexOutOfBoundsException if the token extends past the end of the original text
     */
    public Builder addToken(int offset, int length, TokenType type, int pos) {
      Preconditions.checkArgument(offset >= 0, "offset must be non-negative: %s", offset);
      Preconditions.checkArgument(length >= 0, "length must be non-negative: %s", length);
      Preconditions.checkNotNull(type, "type");
      // CharBuffer.wrap performs the upper-bound check against the original text.
      tokens.add(new Token(CharBuffer.wrap(origText, offset, offset + length), type, pos));
      return this;
    }

    /**
     * @return a new sequence over the original text holding the tokens added so far
     */
    public TokenizedCharSequence build() {
      return new TokenizedCharSequence(origText, tokens);
    }
  }

  /**
   * Tokenizes {@code text} with {@code tokenizer} and collects the resulting tokens.
   *
   * <p>The tokenizer is reset with {@code text} first. Each token's part of speech is read from
   * the tokenizer's {@link PartOfSpeechAttribute} when it has one; otherwise
   * {@link Token#DEFAULT_PART_OF_SPEECH} is used.
   *
   * @param text the text to tokenize
   * @param tokenizer the token stream used to tokenize the text
   * @return the text together with its tokens
   */
  public static final TokenizedCharSequence createFrom(CharSequence text,
      TokenStream tokenizer) {
    tokenizer.reset(text);

    CharSequenceTermAttribute termAttr = tokenizer.getAttribute(CharSequenceTermAttribute.class);
    TokenTypeAttribute typeAttr = tokenizer.getAttribute(TokenTypeAttribute.class);
    PartOfSpeechAttribute posAttr = tokenizer.hasAttribute(PartOfSpeechAttribute.class)
        ? tokenizer.getAttribute(PartOfSpeechAttribute.class)
        : null;

    Builder builder = new Builder(text);
    while (tokenizer.incrementToken()) {
      int pos = (posAttr == null) ? Token.DEFAULT_PART_OF_SPEECH : posAttr.getPOS();
      builder.addToken(termAttr.getOffset(), termAttr.getLength(), typeAttr.getType(), pos);
    }
    return builder.build();
  }

  /**
   * Builds one {@link TokenizedCharSequence} per token of {@code stream}, using the token's term
   * as the text and the tokens of its token group stream as the tokens.
   *
   * <p>Group token offsets are rebased by subtracting the enclosing token's offset. The stream
   * is not reset here; it is consumed from its current state.
   *
   * @param stream a token stream providing {@link TokenGroupAttribute}
   * @return one tokenized sequence per token of the stream, in stream order
   */
  public static final List<TokenizedCharSequence> createFromTokenGroupsIn(
      TokenStream stream) {
    CharSequenceTermAttribute termAttr = stream.getAttribute(CharSequenceTermAttribute.class);
    TokenGroupAttribute groupAttr = stream.getAttribute(TokenGroupAttribute.class);

    List<TokenizedCharSequence> groups = Lists.newArrayList();
    while (stream.incrementToken()) {
      Builder builder = new Builder(termAttr.getTermCharSequence());

      TokenStream groupStream = groupAttr.getTokenGroupStream();
      CharSequenceTermAttribute groupTermAttr =
          groupStream.getAttribute(CharSequenceTermAttribute.class);
      TokenTypeAttribute typeAttr = groupStream.getAttribute(TokenTypeAttribute.class);

      int baseOffset = termAttr.getOffset();
      while (groupStream.incrementToken()) {
        builder.addToken(groupTermAttr.getOffset() - baseOffset,
            groupTermAttr.getLength(),
            typeAttr.getType());
      }
      groups.add(builder.build());
    }
    return groups;
  }
}