package org.wikibrain.core.nlp;
import org.apache.commons.lang3.Range;
import org.wikibrain.core.lang.Language;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * Splits text into words and sentences using the {@link BreakIterator}
 * rules for a language's locale.
 *
 * <p>Each operation is offered in two flavours: one returning plain
 * substrings and one returning {@link Token}s built from the begin/end
 * offsets of each piece.
 *
 * @author Shilad Sen
 */
public class StringTokenizer {

    /**
     * Returns the words in {@code text}, in order of appearance.
     *
     * <p>Only pieces whose first code point is a letter or digit are kept, so
     * runs of whitespace and punctuation reported by the word iterator are
     * skipped.
     *
     * @param language language whose locale selects the word-break rules
     * @param text     text to split
     * @return the words as substrings of {@code text}; empty if there are none
     */
    public List<String> getWords(Language language, String text) {
        BreakIterator wordIterator = BreakIterator.getWordInstance(language.getLocale());
        List<String> words = new ArrayList<String>();
        for (int[] span : findSpans(wordIterator, text, true)) {
            words.add(text.substring(span[0], span[1]));
        }
        return words;
    }

    /**
     * Returns the sentences in {@code text}, in order of appearance.
     *
     * <p>Every span between two consecutive sentence boundaries is returned
     * unmodified (no trimming or filtering is applied).
     *
     * @param language language whose locale selects the sentence-break rules
     * @param text     text to split
     * @return the sentences as substrings of {@code text}; empty if there are none
     */
    public List<String> getSentences(Language language, String text) {
        BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(language.getLocale());
        List<String> sentences = new ArrayList<String>();
        for (int[] span : findSpans(sentenceIterator, text, false)) {
            sentences.add(text.substring(span[0], span[1]));
        }
        return sentences;
    }

    /**
     * Returns one {@link Token} per sentence in {@code text}.
     *
     * @param language language whose locale selects the sentence-break rules
     * @param text     text to split
     * @return tokens created from each sentence's begin offset, end offset
     *         and {@code text} itself
     */
    public List<Token> getSentenceTokens(Language language, String text) {
        BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(language.getLocale());
        List<Token> sentences = new ArrayList<Token>();
        for (int[] span : findSpans(sentenceIterator, text, false)) {
            sentences.add(new Token(span[0], span[1], text));
        }
        return sentences;
    }

    /**
     * Returns one {@link Token} per word in {@code text}.
     *
     * <p>The same letter-or-digit filter as {@link #getWords(Language, String)}
     * is applied.
     *
     * @param language language whose locale selects the word-break rules
     * @param text     text to split
     * @return tokens created from each word's begin offset, end offset and
     *         {@code text} itself
     */
    public List<Token> getWordTokens(Language language, String text) {
        BreakIterator wordIterator = BreakIterator.getWordInstance(language.getLocale());
        List<Token> words = new ArrayList<Token>();
        for (int[] span : findSpans(wordIterator, text, true)) {
            words.add(new Token(span[0], span[1], text));
        }
        return words;
    }

    /**
     * Returns one {@link Token} per word found inside an existing token.
     *
     * <p>Words are located within {@code text.getToken()}; the offsets of each
     * resulting token are shifted by {@code text.getBegin()} and the token is
     * created over {@code text.getFullText()}.
     * NOTE(review): this presumes {@code getBegin()} is the offset of
     * {@code getToken()} inside {@code getFullText()} - confirm against Token.
     *
     * @param language language whose locale selects the word-break rules
     * @param text     token (for example a sentence) to split into words
     * @return word tokens with offsets shifted by {@code text.getBegin()}
     */
    public List<Token> getWordTokens(Language language, Token text) {
        BreakIterator wordIterator = BreakIterator.getWordInstance(language.getLocale());
        String tokenText = text.getToken();
        List<Token> words = new ArrayList<Token>();
        for (int[] span : findSpans(wordIterator, tokenText, true)) {
            words.add(new Token(
                    span[0] + text.getBegin(),
                    span[1] + text.getBegin(),
                    text.getFullText()));
        }
        return words;
    }

    /**
     * Walks {@code iterator} over {@code text} and collects the
     * {@code {begin, end}} offsets of every span between consecutive
     * boundaries.
     *
     * @param iterator  break iterator to use; its text is replaced by {@code text}
     * @param text      text to scan
     * @param wordsOnly if true, keep only spans that start with a letter or digit
     * @return the retained spans, in order, as two-element arrays
     */
    private static List<int[]> findSpans(BreakIterator iterator, String text, boolean wordsOnly) {
        List<int[]> spans = new ArrayList<int[]>();
        iterator.setText(text);
        int begin = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; begin = end, end = iterator.next()) {
            if (!wordsOnly || startsWithLetterOrDigit(text, begin, end)) {
                spans.add(new int[] {begin, end});
            }
        }
        return spans;
    }

    /**
     * Tells whether the non-empty span {@code [begin, end)} of {@code text}
     * starts with a letter or digit.
     *
     * <p>The check reads a full code point rather than a single {@code char}:
     * a supplementary-plane letter is stored as a surrogate pair, and testing
     * only its leading surrogate would wrongly reject the word.
     */
    private static boolean startsWithLetterOrDigit(String text, int begin, int end) {
        return end > begin && Character.isLetterOrDigit(text.codePointAt(begin));
    }
}