package edu.harvard.wcfia.yoshikoder.document.tokenizer;
import java.text.BreakIterator;
import java.util.Locale;
/**
* @author will
*/
public class BITokenizerImpl implements Tokenizer{
private Locale locale;
private BreakIterator wordIterator;
public BITokenizerImpl(Locale loc){
locale = loc;
}
public BITokenizerImpl() {
this(Locale.getDefault());
}
public Locale[] getLocales(){
return new Locale[]{locale};
}
/**
* @return Returns the locale.
*/
public Locale getLocale() {
return locale;
}
/**
* @param defaultLocale The locale to set.
*/
public void setLocale(Locale defaultLocale) {
locale = defaultLocale;
}
public TokenList getTokens(String txt){
TokenList tokens = new TokenListImpl();
wordIterator = BreakIterator.getWordInstance(locale);
//wordIterator = BreakIterator.getWordInstance();
wordIterator.setText(txt);
int start = wordIterator.first();
int end = wordIterator.next();
String word;
while (end != BreakIterator.DONE) {
int startoffset = start; // keep hold of this
word = txt.substring(start, end);
int endoffset = startoffset + word.length();
start = end;
// throws a runtime very rarely for strange characters.
try {
end = wordIterator.next();
} catch (Exception e) {
e.printStackTrace();
continue;
}
char c = word.charAt(0);
if (Character.isLetterOrDigit(c) ||
Character.getType(c)==Character.CURRENCY_SYMBOL)
tokens.add(new TokenImpl(word.intern(), startoffset, endoffset));
}
return tokens;
}
}