package tathya.text.tokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Rule-based sentence splitter.
 *
 * <p>The input is scanned character by character. Every {@code '?'} and
 * {@code '!'} ends a sentence; a {@code '.'} ends a sentence only when
 * {@link #isDelimiter(String, int)} accepts it (numbers, abbreviations, URLs
 * and similar dotted tokens are rejected).
 *
 * <p>Behaviour worth knowing before relying on the output:
 * <ul>
 *   <li>A sentence closed by {@code '.'} keeps the period; a sentence closed by
 *       {@code '?'} or {@code '!'} is emitted without that character.</li>
 *   <li>Text after the last accepted delimiter is emitted only when the very
 *       last character of the input is {@code '.'}, {@code '?'} or {@code '!'};
 *       otherwise it is dropped.</li>
 *   <li>{@code null} and empty input yield an empty list.</li>
 * </ul>
 */
public class SentenceTokenizer implements ITokenizer{

    /** Rule 1: a decimal number such as {@code 3.14}, optionally padded by spaces/commas. */
    private static final Pattern DECIMAL_NUMBER =
            Pattern.compile("[ ]*[0-9]+\\.[0-9]+[ ,]*");

    /** Rule 2a: dotted initials such as {@code U.S.A.} or {@code e.g.}. */
    private static final Pattern DOTTED_INITIALS =
            Pattern.compile("[ ]*[A-Za-z]\\.([A-Za-z0-9]\\.)*[ ,]*");

    /** Rule 2b: a capital letter followed only by non-vowel characters and period(s), e.g. {@code Mr.}. */
    private static final Pattern CAPITALISED_ABBREVIATION =
            Pattern.compile("[ ]*[A-Z][^aeiou]+\\.+[ ,]*");

    /** Rule 4: a token that starts with a lower-case scheme followed by {@code ://}. */
    private static final Pattern URL = Pattern.compile("^[a-z]+://.+");

    /** Rule 4: a token that looks like an e-mail address. */
    private static final Pattern EMAIL = Pattern.compile("^[a-zA-Z0-9_.-]+@.+");

    /** Rule 5: a token with a period between alphanumeric runs, e.g. {@code www.yahoo.com}. */
    private static final Pattern DOTTED_TOKEN =
            Pattern.compile(".*[a-zA-Z0-9-\\.]+\\.[a-zA-Z0-9-\\.]+.*");

    /**
     * Splits {@code text} into sentences.
     *
     * @param text the text to split; may be {@code null} or empty
     * @return a new mutable list with the sentences found, in order of
     *         appearance; empty when {@code text} is {@code null} or empty
     */
    public List<String> tokenize(String text) {
        List<String> sentences = new ArrayList<String>();
        // Guard: the trailing-character check below would otherwise index -1.
        if (text == null || text.length() == 0) {
            return sentences;
        }

        // Index of the most recent delimiter (or 0 before any was seen).
        int startIndex = 0;
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            if (ch == '?' || ch == '!') {
                // Exclusive end == i, so the '?' / '!' itself is not emitted.
                addSentence(sentences, text, startIndex, i);
                startIndex = i;
            } else if (ch == '.' && isDelimiter(text, i)) {
                // Exclusive end == i + 1, so the period is kept.
                addSentence(sentences, text, startIndex, i + 1);
                startIndex = i;
            }
        }

        // Flush the remainder only when the input ends with a terminator; this
        // picks up a final sentence whose closing '.' was rejected above.
        char last = text.charAt(text.length() - 1);
        if (last == '?' || last == '!' || last == '.') {
            addSentence(sentences, text, startIndex, text.length());
        }
        return sentences;
    }

    /**
     * Appends the sentence lying between the previous delimiter and
     * {@code endIndex} to {@code sentences}.
     *
     * <p>Callers always pass {@code 0 <= startIndex < endIndex <= text.length()},
     * so none of the index operations here can go out of bounds.
     *
     * @param sentences  list receiving the sentence
     * @param text       the full input text
     * @param startIndex index of the previous delimiter, or 0 when there was none
     * @param endIndex   exclusive end index of the sentence
     */
    private static void addSentence(List<String> sentences, String text, int startIndex, int endIndex) {
        // Nothing lies between the previous delimiter and this end.
        if (startIndex == endIndex - 1) {
            return;
        }
        if (startIndex == 0) {
            // No previous delimiter to skip: take the text from its beginning.
            sentences.add(text.substring(0, endIndex));
        } else if ((endIndex - 1) != (startIndex + 2) && text.charAt(startIndex + 1) == ' ') {
            // Skip the previous delimiter and the single space following it.
            sentences.add(text.substring(startIndex + 2, endIndex));
        } else {
            // Skip only the previous delimiter.
            sentences.add(text.substring(startIndex + 1, endIndex));
        }
    }

    /**
     * Decides whether the period at {@code index} terminates a sentence.
     *
     * <p>The space-delimited chunk containing the period and the word before it
     * are extracted, then the rules are applied in order; the first rule that
     * fires determines the result. Any index error raised while scanning is
     * treated as "not a delimiter".
     *
     * @param text  the full input text (non-null)
     * @param index position of a {@code '.'} in {@code text}
     * @return {@code true} if the period ends a sentence
     */
    private static boolean isDelimiter(String text, int index) {
        try {
            int startIndex = index;
            int endIndex = index;
            char ch;

            // Walk backward to the space preceding the chunk; startIndex ends
            // one position before that space, or at -1 if index 0 was consumed.
            do {
                ch = text.charAt(startIndex);
                startIndex--;
            } while (ch != ' ' && startIndex != -1);

            // Walk backward again over the previous word; -1 afterwards means
            // the scan ran off the beginning of the text.
            int prevWordStartIndex = startIndex;
            if (prevWordStartIndex >= 0) {
                do {
                    ch = text.charAt(prevWordStartIndex);
                    prevWordStartIndex--;
                } while (ch != ' ' && prevWordStartIndex != -1);
            }

            // Walk forward to just past the space that follows the chunk (or to
            // the end of the text).
            do {
                ch = text.charAt(endIndex);
                endIndex++;
            } while (ch != ' ' && endIndex < text.length());

            // Walk forward over the next word. The word itself is not used by
            // any rule, but the scan is kept because it affects the outcome.
            // NOTE(review): the scan starts at endIndex + 1, so when exactly one
            // character follows the space, charAt throws and the period is
            // reported as non-delimiter - confirm whether this is intended.
            if (endIndex < text.length()) {
                int nextWordEndIndex = endIndex + 1;
                do {
                    ch = text.charAt(nextWordEndIndex);
                    nextWordEndIndex++;
                } while (ch != ' ' && nextWordEndIndex < text.length());
            }

            // The chunk keeps its trailing space (if any), which is why the
            // rule patterns end in "[ ,]*".
            String textChunk = text.substring(startIndex + 2, endIndex);
            String prevWord = null;
            if (prevWordStartIndex != -1) {
                prevWord = text.substring(prevWordStartIndex + 2, startIndex + 1);
            }

            // Rule 0: no previous word was found (a backward scan reached the
            // start of the text), or the previous word ends with a period.
            if (prevWord == null || prevWord.charAt(prevWord.length() - 1) == '.') {
                return false;
            }
            // Rule 1: numbers.
            if (DECIMAL_NUMBER.matcher(textChunk).matches()) {
                return false;
            }
            // Rule 2: abbreviations.
            if (DOTTED_INITIALS.matcher(textChunk).matches()
                    || CAPITALISED_ABBREVIATION.matcher(textChunk).matches()) {
                return false;
            }
            // Rule 3: a space directly after the period marks a sentence end.
            if ((index != text.length() - 1) && text.charAt(index + 1) == ' ') {
                return true;
            }
            // Rule 4: URLs and e-mail addresses.
            if (URL.matcher(textChunk).matches() || EMAIL.matcher(textChunk).matches()) {
                return false;
            }
            // Rule 5: consecutive string with periods and no spaces (e.g. www.yahoo.com).
            if (DOTTED_TOKEN.matcher(textChunk).matches()) {
                return false;
            }
            return true;
        } catch (IndexOutOfBoundsException ignored) {
            // A scan or substring stepped outside the text (for example an empty
            // previous word caused by consecutive spaces): not a delimiter.
            return false;
        }
    }

    /**
     * Small manual demo: tokenizes a sample paragraph and prints each sentence.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        SentenceTokenizer st = new SentenceTokenizer();
        // Alternative sample exercising the URL / e-mail rules:
        // "please visit http://www.yahoo.com or email me at anand@semanticvoid.com. This is a test for www.google.com. Do you yahoo!."
        List<String> sentences = st.tokenize("Gov. Arnold os coming tonite. my name is anand kishore. I work at Yahoo. Sen. Palin what is up?");
        for (String s : sentences) {
            System.out.println("Sentence:\t" + s);
        }
    }
}