/**
*
*/
package com.maalaang.omtwitter.text;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Splits tweet text into typed tokens: hashtags, user mentions, URLs and
 * everything else ("normal" tokens).
 *
 * <p>Token boundaries come from a single compiled regular expression, and
 * token types are assigned by one shared classifier, so {@link #tokenize(String)}
 * and {@link #omtToken(String)} always agree on the type of a given text.
 *
 * <p>Instances keep no state between calls; each call to
 * {@link #tokenize(String)} works on its own local buffer.
 *
 * @author Sangwon Park
 */
public class OMTweetTokenizer {

    // Alternatives, tried in this order at every scan position:
    //   #\w+                      hashtag
    //   @\w+                      user mention
    //   https?://\S+              URL, up to the next whitespace
    //   [\w-'$%]+                 word characters plus - ' $ %
    //   [\p{Punct}oOtT&&[^#@]]+   run of punctuation (or the letters o, O, t, T)
    //                             excluding # and @; presumably the letters are
    //                             there to keep emoticons such as ":o" in one
    //                             token -- TODO confirm with the author
    private static final String REGEX_TOKEN = "#\\w+|@\\w+|https?://\\S+|[\\w-'$%]+|[\\p{Punct}oOtT&&[^#@]]+";

    // Compiled once and shared by every tokenizer instance.
    private static final Pattern TOKEN_PATTERN = Pattern.compile(REGEX_TOKEN);

    // Initial capacity of the per-call token buffer.
    private static final int INITIAL_CAPACITY = 128;

    /**
     * Creates a tokenizer.
     */
    public OMTweetTokenizer() {
    }

    /**
     * Scans {@code tweet} from left to right and returns every token found,
     * in order of appearance.
     *
     * <p>Each token records its type, the start and end offsets reported by
     * the matcher, and the matched text. Characters that match none of the
     * token alternatives (for example whitespace) are skipped.
     *
     * @param tweet the text to tokenize
     * @return the tokens found; an empty array when nothing matches
     * @throws NullPointerException if {@code tweet} is {@code null}
     */
    public OMTweetToken[] tokenize(String tweet) {
        // A local buffer instead of a shared field: overlapping or failed
        // calls can no longer leak tokens into one another's results.
        ArrayList<OMTweetToken> tokens = new ArrayList<OMTweetToken>(INITIAL_CAPACITY);
        Matcher matcher = TOKEN_PATTERN.matcher(tweet);
        while (matcher.find()) {
            String text = matcher.group();
            tokens.add(new OMTweetToken_Impl(classify(text), matcher.start(), matcher.end(), text));
        }
        return tokens.toArray(new OMTweetToken[tokens.size()]);
    }

    /**
     * Wraps an already-separated token text in an {@link OMTweetToken},
     * assigning its type with the same rules as {@link #tokenize(String)}.
     * Both offsets of the returned token are {@code -1}.
     *
     * @param token the token text; an empty string yields a normal token
     * @return a token of the detected type with offsets {@code -1, -1}
     * @throws NullPointerException if {@code token} is {@code null}
     */
    public static OMTweetToken omtToken(String token) {
        return new OMTweetToken_Impl(classify(token), -1, -1, token);
    }

    /**
     * Determines the token type from the token text: a leading {@code #}
     * means hashtag, a leading {@code @} means user, a leading
     * {@code http://} or {@code https://} means URL, anything else
     * (including the empty string) is normal.
     */
    private static int classify(String token) {
        if (token.isEmpty()) {
            return OMTweetToken.TOKEN_TYPE_NORMAL;
        }
        switch (token.charAt(0)) {
            case '#':
                return OMTweetToken.TOKEN_TYPE_HASHTAG;
            case '@':
                return OMTweetToken.TOKEN_TYPE_USER;
            default:
                return isUrl(token) ? OMTweetToken.TOKEN_TYPE_URL : OMTweetToken.TOKEN_TYPE_NORMAL;
        }
    }

    /**
     * Tells whether the token starts with an http or https scheme prefix.
     * Requiring the {@code ://} part keeps plain words such as "https" or
     * "httpd" from being typed as URLs.
     */
    private static boolean isUrl(String token) {
        return token.startsWith("http://") || token.startsWith("https://");
    }
}