// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.example;
import java.util.Iterator;
import org.apache.lucene.util.Attribute;
import com.twitter.common.text.DefaultTextTokenizer;
import com.twitter.common.text.token.TokenStream;
import com.twitter.common.text.token.TokenizedCharSequence;
import com.twitter.common.text.token.TokenizedCharSequence.Token;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
 * Annotated example illustrating major features of {@link DefaultTextTokenizer}.
 *
 * <p>{@link #main(String[])} first lists the attribute classes exposed by the tokenizer's default
 * token stream, then tokenizes each sample tweet twice: once by pulling tokens from the
 * {@link TokenStream} one at a time, and once through the {@link TokenizedCharSequence} API.
 * Both passes print, for every token, its index, start offset, end offset (offset + length),
 * type name and text, using the same line layout so the two outputs can be compared directly.
 */
public class TokenizerUsageExample {
  /** Line layout shared by both tokenization passes: index, (start, end), type name, text. */
  private static final String TOKEN_FORMAT = "token %2d (%3d, %3d) type: %12s, token: '%s'";

  /** Sample tweets fed to the tokenizer; the source URL of each is noted above it. */
  private static final String[] FAMOUS_TWEETS = {
    // http://twitter.com/#!/BarackObama/status/992176676
    "We just made history. All of this happened because you gave your time, talent and passion."
        // The leading space keeps the two sentences from running together when concatenated.
        + " All of this happened because of you. Thanks",
    // http://twitter.com/#!/jkrums/status/1121915133
    "http://twitpic.com/135xa - There's a plane in the Hudson."
        + " I'm on the ferry going to pick up the people. Crazy.",
    // http://twitter.com/#!/carlbildt/status/73498110629904384
    "@khalidalkhalifa Trying to get in touch with you on an issue.",
    // http://twitter.com/#!/SHAQ/status/75996821360615425
    "im retiring Video: http://bit.ly/kvLtE3 #ShaqRetires"
  };

  /**
   * Runs the demonstration and writes all results to standard output.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    // This is the canonical way to create a token stream.
    DefaultTextTokenizer tokenizer =
        new DefaultTextTokenizer.Builder().setKeepPunctuation(true).build();
    TokenStream stream = tokenizer.getDefaultTokenStream();

    // We're going to ask the token stream what type of attributes it makes available. "Attributes"
    // can be understood as "annotations" on the original text.
    System.out.println("Attributes available:");
    Iterator<Class<? extends Attribute>> iter = stream.getAttributeClassesIterator();
    while (iter.hasNext()) {
      Class<? extends Attribute> c = iter.next();
      System.out.println(" - " + c.getCanonicalName());
    }
    System.out.println();

    // We're now going to iterate through a few tweets and tokenize each in turn.
    for (String tweet : FAMOUS_TWEETS) {
      // We're first going to demonstrate the "token-by-token" method of consuming tweets.
      System.out.println("Processing: " + tweet);

      // Reset the token stream to process new input.
      stream.reset(tweet);

      // Now we're going to consume tokens from the stream.
      int tokenCnt = 0;
      while (stream.incrementToken()) {
        // CharSequenceTermAttribute holds the actual token text. This is preferred over
        // TermAttribute because it avoids creating new String objects.
        CharSequenceTermAttribute termAttribute =
            stream.getAttribute(CharSequenceTermAttribute.class);
        // TokenTypeAttribute holds, as you'd expect, the type of the token.
        TokenTypeAttribute typeAttribute = stream.getAttribute(TokenTypeAttribute.class);

        // The second number is the end offset (offset + length), computed the same way as in
        // the TokenizedCharSequence pass below so that both passes print identical lines.
        System.out.println(String.format(TOKEN_FORMAT,
            tokenCnt,
            termAttribute.getOffset(),
            termAttribute.getOffset() + termAttribute.getLength(),
            typeAttribute.getType().name,
            termAttribute.getTermCharSequence()));
        tokenCnt++;
      }
      System.out.println();

      // We're now going to demonstrate the TokenizedCharSequence API.
      // This should produce exactly the same result as above.
      tokenCnt = 0;
      System.out.println("Processing: " + tweet);
      TokenizedCharSequence tokSeq = tokenizer.tokenize(tweet);
      for (Token tok : tokSeq.getTokens()) {
        System.out.println(String.format(TOKEN_FORMAT,
            tokenCnt,
            tok.getOffset(),
            tok.getOffset() + tok.getLength(),
            tok.getType().name,
            tok.getTerm()));
        tokenCnt++;
      }
      System.out.println();
    }
  }
}