// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.token;
import java.nio.CharBuffer;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.PartOfSpeechAttribute;
import com.twitter.common.text.token.attribute.TokenGroupAttribute;
import com.twitter.common.text.token.attribute.TokenGroupAttributeImpl;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
 * Keeps the original text as well as its tokenized tokens.
 *
 * <p>This is a {@link CharSequence} view over the original text paired with an immutable list of
 * {@link Token}s; each token is a zero-copy {@link CharBuffer} slice into that same text.
 */
public class TokenizedCharSequence implements CharSequence {
  /**
   * A single token: a slice of the original text together with its type, part-of-speech tag,
   * position increment and, optionally, the token group it represents.
   */
  public static final class Token implements CharSequence {
    /** Sentinel part-of-speech value meaning "no tag assigned". */
    public static final int DEFAULT_PART_OF_SPEECH = -1;

    // Slice of the original text; position()/limit() delimit this token's span.
    private final CharBuffer term;
    private final TokenType type;
    private final int pos;  // part-of-speech tag, or DEFAULT_PART_OF_SPEECH
    private final int inc;  // position increment (Lucene semantics), normally 1
    @Nullable private final TokenizedCharSequence group;  // nested token group, if any

    protected Token(CharBuffer term, TokenType type) {
      this(term, type, DEFAULT_PART_OF_SPEECH);
    }

    protected Token(CharBuffer term, TokenType type, int pos) {
      this(term, type, pos, 1, null);
    }

    protected Token(CharBuffer term, TokenType type, int pos, int inc,
        @Nullable TokenizedCharSequence group) {
      this.term = term;
      this.type = type;
      this.pos = pos;
      this.inc = inc;
      this.group = group;
    }

    /**
     * Returns a new Token object which represents a term starting with {@code offset} and with
     * {@code length}.
     *
     * <p>The sub-token keeps this token's type and part of speech, but its position increment
     * resets to 1 and any token group is dropped (it delegates to the 3-arg constructor).
     *
     * @param offset offset of the sub-token, relative to the start of this token
     * @param length length of the sub-token
     * @return a new Token object representing a sub-token
     * @throws IllegalArgumentException if the requested range does not lie within this token
     */
    public Token tokenize(int offset, int length) {
      Preconditions.checkArgument(offset >= 0 && offset < getLength());
      // Validate the full span: checking only "length <= getLength()" allowed
      // offset + length to overrun the token and surface as an IndexOutOfBoundsException
      // from CharBuffer.subSequence instead of a precondition failure.
      Preconditions.checkArgument(length > 0 && offset + length <= getLength());
      return new Token((CharBuffer) term.subSequence(offset, offset + length), type, pos);
    }

    @Override
    public String toString() {
      return term.toString();
    }

    @Override
    public int length() {
      return term.limit() - term.position();
    }

    @Override
    public char charAt(int index) {
      return term.charAt(index);
    }

    @Override
    public CharSequence subSequence(int start, int end) {
      return term.subSequence(start, end);
    }

    /** Returns the token text as a view into the original text. */
    public CharSequence getTerm() {
      return term;
    }

    /** Returns the start offset of this token within the original text. */
    public int getOffset() {
      return term.position();
    }

    /** Returns the end offset (exclusive) of this token within the original text. */
    public int getEndOffset() {
      return term.limit();
    }

    /** Returns the number of characters in this token; same value as {@link #length()}. */
    public int getLength() {
      return term.limit() - term.position();
    }

    public TokenType getType() {
      return type;
    }

    /** Returns the part-of-speech tag, or {@link #DEFAULT_PART_OF_SPEECH} if none was set. */
    public int getPartOfSpeech() {
      return pos;
    }

    public int getPositionIncrement() {
      return inc;
    }

    /** Returns the token group represented by this token, or {@code null} if there is none. */
    @Nullable
    public TokenizedCharSequence getGroup() {
      return group;
    }

    public boolean hasGroup() {
      return group != null;
    }
  }

  private final CharSequence term;
  private final List<Token> tokens;
  // Lazily-computed caches; fine for single-threaded use. NOTE(review): the two-field
  // hashCode cache is not safely published to concurrent readers — confirm instances are
  // not shared across threads while these caches are cold.
  private String strValue = null;
  private int hashCode;
  private boolean hashCodeCalced = false;

  protected TokenizedCharSequence(CharSequence text, List<Token> tokens) {
    this.tokens = Collections.unmodifiableList(tokens);
    this.term = text;
  }

  @Override
  public char charAt(int index) {
    return term.charAt(index);
  }

  @Override
  public int length() {
    return term.length();
  }

  @Override
  public CharSequence subSequence(int fromIndex, int toIndex) {
    return term.subSequence(fromIndex, toIndex);
  }

  @Override
  public String toString() {
    if (strValue == null) {
      strValue = term.toString();
    }
    return strValue;
  }

  /**
   * Equality is based solely on the underlying text; the tokenization is ignored
   * (consistent with {@link #hashCode()}).
   */
  @Override
  public boolean equals(Object obj) {
    // instanceof is false for null, so no separate null check is needed.
    return (obj instanceof TokenizedCharSequence)
        && ((TokenizedCharSequence) obj).term.toString().equals(this.term.toString());
  }

  @Override
  public int hashCode() {
    if (!hashCodeCalced) {
      hashCode = term.toString().hashCode();
      hashCodeCalced = true;
    }
    return hashCode;
  }

  /**
   * Returns all tokens.
   *
   * @return an unmodifiable list of tokens
   */
  public List<Token> getTokens() {
    return tokens;
  }

  private static final Function<Token, String> TOKEN_TO_STRING_CONVERTER =
      new Function<Token, String>() {
        @Override
        public String apply(Token token) {
          return token.getTerm().toString();
        }
      };

  /**
   * Returns all tokens as String.
   *
   * @return a list of tokens as String objects (a lazily-transformed view)
   */
  public List<String> getTokenStrings() {
    return Lists.transform(tokens, TOKEN_TO_STRING_CONVERTER);
  }

  /**
   * Returns tokens of one or more specified types.
   *
   * @param types token type(s); when empty, an empty list is returned
   * @return tokens of the specified type(s)
   */
  public List<Token> getTokensOf(TokenType... types) {
    if (types.length == 0) {
      return Collections.emptyList();
    }
    // EnumSet.of(first, rest...) tolerates types[0] appearing twice.
    final Set<TokenType> tokenTypeSet = EnumSet.of(types[0], types);
    return ImmutableList.copyOf(
        Iterables.filter(tokens, new Predicate<Token>() {
          @Override
          public boolean apply(Token token) {
            return tokenTypeSet.contains(token.getType());
          }
        })
    );
  }

  /**
   * Returns tokens of one or more specified types as Strings.
   *
   * @param types token type(s)
   * @return list of tokens of specified type(s) as String objects
   */
  public List<String> getTokenStringsOf(TokenType... types) {
    return Lists.transform(getTokensOf(types), TOKEN_TO_STRING_CONVERTER);
  }

  /** Incrementally builds a {@link TokenizedCharSequence} over a fixed original text. */
  public static final class Builder {
    private final CharSequence origText;
    private final List<Token> tokens;

    public Builder(CharSequence originalText) {
      Preconditions.checkNotNull(originalText);
      this.origText = originalText;
      tokens = Lists.newArrayList();
    }

    /** Adds a token of type {@code TokenType.TOKEN} covering {@code [offset, offset + length)}. */
    public Builder addToken(int offset, int length) {
      addToken(offset, length, TokenType.TOKEN);
      return this;
    }

    public Builder addToken(int offset, int length, TokenType type) {
      addToken(offset, length, type, PartOfSpeechAttribute.UNKNOWN);
      return this;
    }

    public Builder addToken(int offset, int length, TokenType type, int pos) {
      addToken(offset, length, type, pos, 1, null);
      return this;
    }

    /**
     * Adds a token covering {@code [offset, offset + length)} of the original text.
     *
     * @param offset start offset within the original text; must be non-negative
     * @param length token length in chars; must be non-negative
     * @param type token type; must not be null
     * @param pos part-of-speech tag, or {@link Token#DEFAULT_PART_OF_SPEECH}
     * @param inc position increment; must be non-negative
     * @param group token group this token represents, or {@code null}
     * @return this builder, for chaining
     * @throws IllegalArgumentException if a range or increment argument is invalid
     */
    public Builder addToken(
        int offset,
        int length,
        TokenType type,
        int pos,
        int inc,
        @Nullable TokenizedCharSequence group) {
      Preconditions.checkArgument(offset >= 0);
      Preconditions.checkArgument(length >= 0);
      Preconditions.checkNotNull(type);
      Preconditions.checkArgument(inc >= 0);
      tokens.add(
          new Token(CharBuffer.wrap(origText, offset, offset + length), type, pos, inc, group));
      return this;
    }

    public TokenizedCharSequence build() {
      return new TokenizedCharSequence(origText, tokens);
    }
  }

  /**
   * Creates a {@link TokenizedCharSequence} by exhausting the given token stream. Optional
   * attributes (part of speech, position increment, token group) are used only when the
   * stream provides them.
   *
   * @param tokenizer token stream to drain; assumed to be already reset
   * @return the tokenized sequence; built over an empty string if the stream yields no tokens
   */
  public static final TokenizedCharSequence createFrom(TwitterTokenStream tokenizer) {
    CharSequenceTermAttribute termAttr = tokenizer.getAttribute(CharSequenceTermAttribute.class);
    TokenTypeAttribute typeAttr = tokenizer.getAttribute(TokenTypeAttribute.class);
    PartOfSpeechAttribute posAttr = null;
    if (tokenizer.hasAttribute(PartOfSpeechAttribute.class)) {
      posAttr = tokenizer.getAttribute(PartOfSpeechAttribute.class);
    }
    PositionIncrementAttribute incAttr = null;
    if (tokenizer.hasAttribute(PositionIncrementAttribute.class)) {
      incAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
    }
    TokenGroupAttributeImpl groupAttr = null;
    if (tokenizer.hasAttribute(TokenGroupAttribute.class)) {
      groupAttr = (TokenGroupAttributeImpl) tokenizer.getAttribute(TokenGroupAttribute.class);
    }
    // Need to wait for the first incrementToken() for termAttr to have its CharSequence set.
    TokenizedCharSequence.Builder builder = null;
    while (tokenizer.incrementToken()) {
      if (builder == null) {
        // Now we can set the term sequence for the builder.
        builder = new TokenizedCharSequence.Builder(termAttr.getCharSequence());
      }
      builder.addToken(termAttr.getOffset(), termAttr.getLength(),
          typeAttr.getType(),
          posAttr == null ? Token.DEFAULT_PART_OF_SPEECH : posAttr.getPOS(),
          incAttr == null ? 1 : incAttr.getPositionIncrement(),
          // Recurse into the nested group only when no prebuilt sequence is available.
          groupAttr == null || groupAttr.isEmpty() ? null :
              (groupAttr.getSequence() == null ? createFrom(groupAttr.getTokenGroupStream()) :
                  groupAttr.getSequence())
      );
    }
    if (builder == null) { // Never entered tokenizer loop, build an empty string
      builder = new TokenizedCharSequence.Builder("");
    }
    return builder.build();
  }

  /**
   * Resets the tokenizer with {@code text} and tokenizes it into a
   * {@link TokenizedCharSequence}.
   *
   * @param text text to tokenize
   * @param tokenizer stream used to tokenize the text
   * @return the tokenized sequence
   */
  public static final TokenizedCharSequence createFrom(CharSequence text,
      TwitterTokenStream tokenizer) {
    tokenizer.reset(text);
    return createFrom(tokenizer);
  }

  /**
   * Builds one {@link TokenizedCharSequence} per token group found in the stream. Offsets of
   * inner tokens are rebased so they are relative to the enclosing group token's own text.
   *
   * @param stream token stream whose token groups are materialized
   * @return one sequence per top-level token of the stream
   */
  public static final List<TokenizedCharSequence> createFromTokenGroupsIn(
      TwitterTokenStream stream) {
    TokenGroupAttribute groupAttr = stream.getAttribute(TokenGroupAttribute.class);
    List<TokenizedCharSequence> groups = Lists.newArrayList();
    while (stream.incrementToken()) {
      Builder builder = new Builder(stream.term());
      TwitterTokenStream groupStream = groupAttr.getTokenGroupStream();
      PartOfSpeechAttribute posAttr = null;
      if (groupStream.hasAttribute(PartOfSpeechAttribute.class)) {
        posAttr = groupStream.getAttribute(PartOfSpeechAttribute.class);
      }
      PositionIncrementAttribute incAttr = null;
      if (groupStream.hasAttribute(PositionIncrementAttribute.class)) {
        incAttr = groupStream.getAttribute(PositionIncrementAttribute.class);
      }
      TokenGroupAttributeImpl innerGroupAttr = null;
      if (groupStream.hasAttribute(TokenGroupAttribute.class)) {
        innerGroupAttr =
            (TokenGroupAttributeImpl) groupStream.getAttribute(TokenGroupAttribute.class);
      }
      while (groupStream.incrementToken()) {
        // Rebase inner-token offsets to the start of this group's text.
        builder.addToken(groupStream.offset() - stream.offset(),
            groupStream.length(),
            groupStream.type(),
            posAttr == null ? Token.DEFAULT_PART_OF_SPEECH : posAttr.getPOS(),
            incAttr == null ? 1 : incAttr.getPositionIncrement(),
            innerGroupAttr == null || innerGroupAttr.isEmpty() ? null :
                (innerGroupAttr.getSequence() == null
                    ? createFrom(innerGroupAttr.getTokenGroupStream())
                    : innerGroupAttr.getSequence()));
      }
      groups.add(builder.build());
    }
    return groups;
  }
}