package tathya.text.tokenizer; import java.io.BufferedReader; import java.io.FileReader; import java.util.ArrayList; import java.util.List; public class TwitterTokenizer implements ITokenizer { private List<String> sentences = new ArrayList<String>(); public List<String> tokenize(String text) { // clear previous sentences sentences = new ArrayList<String>(); // preprocess text = preprocess(text); int startIndex = 0; int endIndex = 0; for (int i = 0; i < text.length(); i++) { char ch = text.charAt(i); if (ch == '?' || ch == '!') { endIndex = i; addSentence(text, startIndex, endIndex); startIndex = endIndex; } else if (ch == '.' || ch == '-' || ch == ':' || ch == ',') { if (isDelimiter(text, i)) { endIndex = i; addSentence(text, startIndex, endIndex+1); startIndex = endIndex; } } } // if(text.charAt(text.length()-1) == '?' || // text.charAt(text.length()-1) == '!' || text.charAt(text.length()-1) // == '.') { // addSentence(text, startIndex, text.length()); // } if (endIndex + 1 < text.length() - 1) { addSentence(text, startIndex, text.length()); } sentences = postprocess(sentences); return sentences; } @SuppressWarnings("deprecation") private List<String> postprocess(List<String> sentences) { List<String> result = new ArrayList<String>(); for(String s : sentences) { StringBuffer temp = new StringBuffer(); for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); if(!Character.isLetter(ch) && !Character.isDigit(ch) && !Character.isSpace(ch)) { if(i == 0 || (s.charAt(i-1) == ' ')) { temp.append(ch); temp.append(' '); } else if(i == s.length()-1 || (s.charAt(i+1) == ' ')) { temp.append(' '); temp.append(ch); } else { temp.append(ch); } } else { temp.append(ch); } } result.add(temp.toString()); } return result; } /* * Tweet preprocessing here */ private String preprocess(String text) { text = text.toLowerCase().replaceFirst("^rt [a-z0-9@: ]+:", ""); text = text.toLowerCase().replaceAll("http://[a-z.0-9+/%&#~\\-_]+", ""); text = text.toLowerCase().replaceAll("#", ""); return text; } private void addSentence(String text, int startIndex, int endIndex) { try { if (startIndex != endIndex - 1) { if (startIndex == 0) { sentences.add(text.substring(0, endIndex)); } else if ((endIndex - 1) != (startIndex + 2) && text.charAt(startIndex + 1) == ' ') { sentences.add(text.substring(startIndex + 2, endIndex)); } else { sentences.add(text.substring(startIndex + 1, endIndex)); } } } catch (IndexOutOfBoundsException e) { System.out.println("Exception : " + e.getMessage()); } } private boolean isDelimiter(String text, int index) { try { // Backward int startIndex = index; int endIndex = index; int prevWordStartIndex = -1; int nextWordEndIndex = -1; char ch; // Backward do { ch = text.charAt(startIndex); startIndex--; } while (ch != ' ' && startIndex != -1); // Backward for previous word prevWordStartIndex = startIndex; if (prevWordStartIndex >= 0) { do { ch = text.charAt(prevWordStartIndex); prevWordStartIndex--; } while (ch != ' ' && prevWordStartIndex != -1); } // Forward do { ch = text.charAt(endIndex); endIndex++; } while (ch != ' ' && endIndex < text.length()); // Forward for next word if (endIndex < text.length()) { nextWordEndIndex = endIndex + 1; do { ch = text.charAt(nextWordEndIndex); nextWordEndIndex++; } while (ch != ' ' && nextWordEndIndex < text.length()); } String textChunk = text.substring(startIndex + 2, endIndex); String prevWord = null; if (prevWordStartIndex != startIndex) { prevWord = text.substring(prevWordStartIndex + 2, startIndex + 1); } String nextWord = null; if (nextWordEndIndex != -1) { nextWord = text.substring(endIndex, nextWordEndIndex - 1); } // Rule 0: If previous word has a period or is the start of // sentence, this is not a delimiter if (prevWord == null || prevWord.charAt(prevWord.length() - 1) == '.') { return false; } if (text.charAt(index) == '.') { // Rule 1 : Numbers if (textChunk.matches("[ ]*[0-9]+\\.[0-9]+[ ,]*")) { // System.out.println("False : " + textChunk); return false; } // Rule 2 : abbreviations if (textChunk.matches("[ ]*[A-Za-z]\\.([A-Za-z0-9]\\.)*[ ,]*") || textChunk.matches("[ ]*[A-Z][^aeiou]+\\.+[ ,]*")) { // System.out.println("False : " + textChunk); return false; } // Rule 3 : if there is a space after delimiter then return true if ((index != text.length() - 1) && text.charAt(index + 1) == ' ') { // System.out.println("true : " + textChunk); return true; } // Rule 4: urls and emails if (textChunk.matches("^[a-z]+://.+") || textChunk.matches("^[a-z]+://.+") || textChunk.matches("^[a-zA-Z0-9_.-]+@.+")) { return false; } // Rule 5: consecutive string with periods and no spaces (e.g. // www.yahoo.com) if (textChunk .matches(".*[a-zA-Z0-9-\\.]+\\.[a-zA-Z0-9-\\.]+.*")) { return false; } return true; } else { if(text.charAt(index-1) == ' ' || text.charAt(index+1) == ' ') { return true; } } } catch (Exception e) { } return false; } public static void main(String args[]) { TwitterTokenizer st = new TwitterTokenizer(); try{ BufferedReader fr = new BufferedReader(new FileReader(args[0])); String line; while((line = fr.readLine()) != null) { List<String> sentences = st .tokenize(line); for (String s : sentences) { System.out.println(s); } } } catch(Exception e) { e.printStackTrace(); } } }