RobustTokenizer.java example

Explorer
CoreNLP-master
/*
 * RobustTokenizer.java
 * Performs tokenization of natural language English text, following ACE data
 * Use the method tokenize() for smart tokenization
 * @author Mihai
 */

package edu.stanford.nlp.ie.machinereading.domains.ace.reader; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.FileReader;
import java.io.BufferedReader;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.util.Generics;

public class RobustTokenizer<T extends Word> extends AbstractTokenizer<Word>  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(RobustTokenizer.class);
  
  /** Buffer to tokenize */
  String buffer;

  /** The set of known abbreviations */
  private AbbreviationMap mAbbreviations;
  
  public final static int MAX_MULTI_WORD_SIZE = 20;

  // basic tokens
  public final static String DOT         = block("\\.");
  public final static String DOTDOT      = block("\\:");
  public final static String APOSTROPHE  = block("\\'");
  public final static String SLASH       = block("\\/");
  public final static String UNDERSCORE  = block("\\_");
  public final static String MINUS       = block("\\-");
  public final static String PLUS        = block("\\+");
  public final static String COMMA       = block("\\,");
  public final static String DOTCOMMA    = block("\\;");
  public final static String QUOTES      = block(or("\\\"", "\\'\\'", "\\'", "\\`\\`", "\\`"));
  public final static String DOUBLE_QUOTES = block(or("\\\"" , "\\'\\'")); 
  public final static String LRB         = block("\\(");
  public final static String RRB         = block("\\)");
  public final static String LCB         = block("\\{");
  public final static String RCB         = block("\\}");
  public final static String GREATER     = block("\\>");
  public final static String LOWER       = block("\\<");
  public final static String AMPERSAND   = block("\\&");
  public final static String AT          = block("\\@");
  public final static String HTTP        = block("[hH][tT][tT][pP]\\:\\/\\/");

  // basic sequences
  public final static String WHITE_SPACE = block("\\s");
  public final static String DIGIT       = block("\\d");
  public final static String LETTER      = block("[a-zA-Z]");
  public final static String UPPER       = block("[A-Z]");
  public final static String SIGN        = or(MINUS,PLUS);

  // numbers
  public final static String FULLNUM = 
    block(
        zeroOrOne(SIGN) +
        oneOrMore(DIGIT) +
        zeroOrMore(
            zeroOrOne(or(DOT, COMMA, SLASH)) +
            oneOrMore(DIGIT)));
  public final static String DECNUM       = block(DOT + oneOrMore(DIGIT));
  public final static String NUM          = or(FULLNUM, DECNUM);    

  // date and time
  public final static String DATE = 
    block(oneOrMore(DIGIT) + SLASH +
        oneOrMore(DIGIT) + SLASH +
        oneOrMore(DIGIT));
  public final static String TIME = 
    block(oneOrMore(DIGIT) + 
        oneOrMore(block(
            DOTDOT + 
            oneOrMore(DIGIT))));

  // punctuation marks
  public final static String PUNC = 
    or(QUOTES, 
        block(MINUS + oneOrMore(MINUS)),
        block(DOT + oneOrMore(DOT)));

  // words
  public final static String LETTERS     = oneOrMore(LETTER);
  public final static String BLOCK       = or(NUM, LETTERS); 
  public final static String WORD        = 
    block(zeroOrOne(APOSTROPHE) + 
        BLOCK +
        zeroOrMore(block(
            zeroOrOne(or(UNDERSCORE, 
                MINUS, 
                APOSTROPHE,
                SLASH,
                AMPERSAND)) +
                BLOCK)));

  // acronyms
  public final static String ACRONYM = block(oneOrMore(LETTER + DOT));// + zeroOrOne(LETTER));

  // this matches acronyms AFTER abbreviation merging
  public final static String LOOSE_ACRONYM = 
    block(oneOrMore((oneOrMore(LETTER) + DOT)) + zeroOrMore(LETTER));

  // other possible constructs
  public final static String PAREN      = or(LRB, RRB, LCB, RCB);
  public final static String SGML       = "<[^<>]+>";
  public final static String HTMLCODE   = block(AMPERSAND + UPPER + DOTCOMMA);

  public final static String ANY        = block("\\S");

  // email addresses must start with a letter, contain @, and end with a letter
  public final static String EMAIL      = block(LETTER +
      zeroOrMore(or(LETTER,
          DIGIT,
          DOT,
          MINUS,
          UNDERSCORE)) + 
          AT + 
          zeroOrMore(or(LETTER,
              DIGIT,
              DOT,
              MINUS,
              UNDERSCORE)) +
              LETTER);
  
  // email addresses must start with a letter, contain @, and end with . com 
  public final static String DOMAIN_EMAIL      = block(LETTER +
      zeroOrMore(or(LETTER,
          DIGIT,
          DOT,
          MINUS,
          UNDERSCORE)) + 
          AT + 
          oneOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) +
          zeroOrMore(WHITE_SPACE)+ DOT + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us"));

  // URLs must start with http:// or ftp://, followed by at least a letter
  public final static String URL = 
    block(HTTP +
        oneOrMore(or(LETTER,
            DIGIT,
            DOT,
            UNDERSCORE,
            SLASH,
            AMPERSAND,
            MINUS,
            PLUS)));
  
  //URLs without http, but ending in org, com, net
  public final static String SMALL_URL = 
    block(oneOrMore(oneOrMore(LETTER) + DOT) + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us"));

  // keep sequence of underscores as a single token
  public final static String UNDERSCORESEQ = oneOrMore("_");

  // list bullet, e.g., "(a)"
  public final static String LIST_BULLET = block(LRB + LETTER + zeroOrOne(LETTER) + RRB); 
  // part of a phone number, e.g., "(214)"
  public final static String PHONE_PART = block(LRB + oneOrMore(DIGIT) + RRB);

  // sequence of digits
  public final static String DIGITSEQ = oneOrMore(DIGIT);

  // the complete pattern
  public final static String RECOGNISED_PATTERN 
  = block(block(TIME) + "|" +
      block(DOMAIN_EMAIL) + "|" +
      block(EMAIL) + "|" +
      block(URL) + "|" +
      // block(SMALL_URL) + "|" +
      block(ACRONYM) + "|" + 
      block(DATE) + "|" + 
      block(PHONE_PART) + "|" + // must be before WORD, otherwise it's broken into multiple tokens
      block(WORD) + "|" +
      block(PUNC) + "|" + 
      block(LIST_BULLET) + "|" + 
      block(PAREN) + "|" + 
      block(SGML) + "|" + 
      block(HTMLCODE) + "|" + 
      block(UNDERSCORESEQ) + "|" + 
      block(ANY));

  /** The overall token pattern */
  private final static Pattern wordPattern;

  /** Pattern to recognize SGML tags */
  private final static Pattern sgmlPattern;

  /** Pattern to recognize slash-separated dates */
  private final static Pattern slashDatePattern;

  /** Pattern to recognize acronyms */
  private final static Pattern acronymPattern;

  /** Pattern to recognize URLs */
  private final static Pattern urlPattern;

  /** Pattern to recognize emails */
  private final static Pattern emailPattern;

  /** Recognized sequences of digits */
  private final static Pattern digitSeqPattern;

  static{
    wordPattern          = Pattern.compile(RECOGNISED_PATTERN); 
    sgmlPattern          = Pattern.compile(SGML);
    slashDatePattern     = Pattern.compile(DATE);
    acronymPattern       = Pattern.compile(LOOSE_ACRONYM);
    urlPattern           = Pattern.compile(URL);
    emailPattern         = Pattern.compile(EMAIL);
    digitSeqPattern      = Pattern.compile(DIGITSEQ);
  }
  
  public RobustTokenizer(String buffer) {
    mAbbreviations = new AbbreviationMap(true);
    this.buffer = buffer;
    this.cachedTokens = null;
  }


  public RobustTokenizer(boolean caseInsensitive, String buffer) {
    mAbbreviations = new AbbreviationMap(caseInsensitive);
    this.buffer = buffer;
    this.cachedTokens = null;
  }

  /** any in the set */ 
  public static String range(String s){
    return block("[" + s + "]");
  }


  /** zero or one */
  public static String zeroOrOne(String s){
    return block(block(s) + "?");
  }

  /** zero or more */
  public static String zeroOrMore(String s){
    return block(block(s) + "*");
  }

  /** one or more */
  public static String oneOrMore(String s){
    return block(block(s) + "+");
  }

  /** parens */
  public static String block(String s){
    return "(" + s + ")";
  }

  /** any of the two */
  public static String or(String s1, String s2){
    return block(block(s1) + "|" + block(s2));
  }

  /** any of the three */
  public static String or(String s1, String s2, String s3){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3));
  }

  /** any of the four */
  public static String or(String s1, String s2, String s3, String s4){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4));
  }

  /** any of the five */
  public static String or(String s1, String s2, String s3, String s4, String s5){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5));
  }

  /** any of the six */
  public static String or(String s1, String s2, String s3, 
      String s4, String s5, String s6){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5) + "|" + block(s6));
  }

  /** any of the seven */
  public static String or(String s1, String s2, String s3, 
      String s4, String s5, String s6, String s7){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7));
  }

  /** any of the eight */
  public static String or(String s1, String s2, String s3, String s4, 
      String s5, String s6, String s7, String s8){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + 
        block(s7) + "|" + block(s8));
  }

  /** any of the nine */
  public static String or(String s1, String s2, String s3, String s4, 
      String s5, String s6, String s7, String s8, String s9){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + 
        block(s7) + "|" + block(s8) + "|" + block(s9));
  }

  public static String or(String s1, String s2, String s3, String s4, 
      String s5, String s6, String s7, String s8, 
      String s9, String s10){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + 
        block(s7) + "|" + block(s8) + "|" + block(s9) + "|" +
        block(s10));
  }

  public static String or(String s1, String s2, String s3, String s4, 
      String s5, String s6, String s7, String s8, 
      String s9, String s10, String s11){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + 
        block(s7) + "|" + block(s8) + "|" + block(s9) + "|" +
        block(s10) + "|" + block(s11));
  }

  public static String or(String s1, String s2, String s3, String s4, 
      String s5, String s6, String s7, String s8, 
      String s9, String s10, String s11, String s12){
    return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + 
        block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + 
        block(s7) + "|" + block(s8) + "|" + block(s9) + "|" +
        block(s10) + "|" + block(s11) + "|" + block(s12));
  }

  /** not */
  public static String rangeNot(String s){
    return range(block("^" + s));
  }

  private static int hasApostropheBlock(String s) {
    for(int i = s.length() - 1; i > 0; i --){
      if(s.charAt(i) == '\'' && i < s.length() - 1){
        return i;
      }

      if(! Character.isLetter(s.charAt(i))){
        return -1;
      }
    }

    return -1;
  }

  private static <T extends WordToken> String concatenate(List<T> tokens,
      int start,
      int end) {
    StringBuffer buffer = new StringBuffer();

    for(; start < end; start ++){
      buffer.append(((WordToken) tokens.get(start)).getWord());
    }
    return buffer.toString();
  }

  private static <T extends WordToken> int countNewLines(List<T> tokens,
      int start,
      int end) {
    int count = 0;
    for(int i = start + 1; i < end; i ++){
      count += tokens.get(i).getNewLineCount();
    }
    return count;
  }

  public static boolean isUrl(String s) {
    Matcher match = urlPattern.matcher(s);
    return match.find(0);
  }

  public static boolean isEmail(String s) {
    Matcher match = emailPattern.matcher(s);
    return match.find(0);
  }

  public static boolean isSgml(String s) {
    Matcher match = sgmlPattern.matcher(s);
    return match.find(0);
  }

  public static boolean isSlashDate(String s) {
    Matcher match = slashDatePattern.matcher(s);
    return match.find(0);
  }

  public static boolean isAcronym(String s) {
    Matcher match = acronymPattern.matcher(s);
    return match.find(0);
  }

  public static boolean isDigitSeq(String s) {
    Matcher match = digitSeqPattern.matcher(s);
    return match.find(0);
  }

  public int countNewLines(String s, int start, int end) {
    int count = 0;
    for(int i = start; i < end; i ++) {
      if(s.charAt(i) == '\n') count ++;
    }
    return count;
  }
  
  /**
   * Smart tokenization storing the output in an array of CoreLabel
   * Sets the following fields:
   * - TextAnnotation - the text of the token
   * - TokenBeginAnnotation - the byte offset of the token (start)
   * - TokenEndAnnotation - the byte offset of the token (end)
   */
  public Word [] tokenizeToWords() {
    List<WordToken> toks = tokenizeToWordTokens();
    Word [] labels = new Word[toks.size()];
    for(int i = 0; i < toks.size(); i ++){
      WordToken tok = toks.get(i);
      Word l = new Word(tok.getWord(), tok.getStart(), tok.getEnd());
      labels[i] = l;
    }
    return labels;
  }

  /**
   * Tokenizes a natural language string
   * @return List of WordTokens
   */
  public List<WordToken> tokenizeToWordTokens() {
    List<WordToken> result = new ArrayList<>();

    //
    // replace illegal characters with SPACE
    //
    /*
    StringBuffer buffer = new StringBuffer();
    for(int i = 0; i < originalString.length(); i ++){
    	int c = (int) originalString.charAt(i);
    	//
    	// regular character
    	//
    	if(c > 31 && c < 127) buffer.append((char) c);

    	else{
    		log.info("Control character at position " + i + ": " + c);

    		//
    		// DOS new line counts as two characters
    		// 
    		if(c == 10) buffer.append(" ");

    		//
    		// other control character
    		//
    		else buffer.append(' ');
    	}
    }
     */

    Matcher match = wordPattern.matcher(buffer);
    int previousEndMatch = 0;

    //
    // Straight tokenization, ignoring known abbreviations
    //
    while(match.find()){
      String crtMatch = match.group();
      int endMatch = match.end();
      int startMatch = endMatch - crtMatch.length();
      int i;

      // found word ending in "n't"
      if (crtMatch.endsWith("n't")){
        if (crtMatch.length() > 3){
          WordToken token1 = 
            new WordToken(
                crtMatch.substring(0, crtMatch.length() - 3), 
                startMatch, endMatch - 3,
                countNewLines(buffer, previousEndMatch, startMatch));
          result.add(token1);
        }
        WordToken token2 = 
          new WordToken(crtMatch.substring(crtMatch.length() - 3, 
              crtMatch.length()),
              endMatch - 3, endMatch, 0);
        result.add(token2);
      } 

      // found word containing an appostrophe
      // XXX: is this too relaxed? e.g. "O'Hare"
      else if ((i = hasApostropheBlock(crtMatch)) != -1){
        WordToken token1 = new WordToken(crtMatch.substring(0, i), 
            startMatch, startMatch + i, countNewLines(buffer, previousEndMatch, startMatch));
        WordToken token2 = 
          new WordToken(crtMatch.substring(i, crtMatch.length()),
              startMatch + i, endMatch, 0);
        result.add(token1);
        result.add(token2);
      }

      // just a regular word
      else{
        WordToken token = new WordToken(crtMatch, startMatch, endMatch, 
            countNewLines(buffer, previousEndMatch, startMatch));
        result.add(token);
      }

      previousEndMatch = endMatch;
    }   

    //
    // Merge known abreviations
    //
    List<WordToken> resultWithAbs = new ArrayList<>();
    for(int i = 0; i < result.size(); i ++){
      // where the mw ends
      int end = result.size();
      if(end > i + MAX_MULTI_WORD_SIZE) end = i + MAX_MULTI_WORD_SIZE; 

      boolean found = false;

      // must have at least two tokens per multiword
      for(; end > i + 1; end --){
        WordToken startToken = result.get(i);
        WordToken endToken = result.get(end - 1);
        if(countNewLines(result, i, end) == 0){ // abbreviation tokens cannot appear on different lines
          String conc = concatenate(result, i, end);
          found = false;

          // found a multiword
          if((mAbbreviations.contains(conc) == true)){
            found = true;
            WordToken token = new WordToken(conc, 
                startToken.getStart(), 
                endToken.getEnd(),
                startToken.getNewLineCount());
            resultWithAbs.add(token);
            i = end - 1;
            break;
          }
        }
      }

      // no multiword starting at this position found
      if(! found){
        resultWithAbs.add(result.get(i));
      }
    }
    
    resultWithAbs = postprocess(resultWithAbs);

    return resultWithAbs;
  }
  
  /**
   * Redefine this method to implement additional domain-specific tokenization rules
   * @param tokens
   */
  protected List<WordToken> postprocess(List<WordToken> tokens) { return tokens; };

  /** 
   * Tokenizes and adds blank spaces were needed between each token 
   */
  public String tokenizeText() throws java.io.IOException{
    List<WordToken> tokenList = tokenizeToWordTokens();
    StringBuffer strBuffer = new StringBuffer();
    Iterator<WordToken> iter = tokenList.iterator();
    if (iter.hasNext()){
      strBuffer.append(iter.next());
    }
    while(iter.hasNext()){
      strBuffer.append(" ");
      strBuffer.append(iter.next());
    }
    return strBuffer.toString().replaceAll("\\s\\s+", " ");                
  }

  public static class AbbreviationMap {

    private Set<String> mAbbrevSet;

    private static List<String> normalizeCase(boolean caseInsensitive, List<String> words) {
      if(! caseInsensitive) return words;
      List<String> normWords = new ArrayList<>();
      for(String word: words) normWords.add(word.toLowerCase());
      return normWords;
    }

    /** Creates a new instance of AbreviationMap with some know abbreviations */
    public AbbreviationMap(boolean caseInsensitive)  {
      mAbbrevSet = Generics.newHashSet(normalizeCase(caseInsensitive, Arrays.asList(new String[]{
          "1.",
          "10.",
          "11.",
          "12.",
          "13.",
          "14.",
          "15.",
          "16.",
          "17.",
          "18.",
          "19.",
          "2.",
          "20.",
          "21.",
          "22.",
          "23.",
          "24.",
          "25.",
          "26.",
          "27.",
          "28.",
          "29.",
          "3.",
          "30.",
          "31.",
          "32.",
          "33.",
          "34.",
          "35.",
          "36.",
          "37.",
          "38.",
          "39.",
          "4.",
          "40.",
          "41.",
          "42.",
          "43.",
          "44.",
          "45.",
          "46.",
          "47.",
          "48.",
          "49.",
          "5.",
          "50.",
          "6.",
          "7.",
          "8.",
          "9.",
          "A.",
          "A.C.",
          "A.D.",
          "A.D.L.",
          "A.F.",
          "A.G.",
          "A.H.",
          "A.J.C.",
          "A.L.",
          "A.M",
          "A.M.",
          "A.P.",
          "A.T.B.",
          "AUG.",
          "Act.",
          "Adm.",
          "Ala.",
          "Ariz.",
          "Ark.",
          "Assn.",
          "Ass'n.",
          "Ass'n",
          "Aug.",
          "B.",
          "B.A.T",
          "B.B.",
          "B.F.",
          "B.J.",
          "B.V.",
          "Bancorp.",
          "Bhd.",
          "Blvd.",
          "Br.",
          "Brig.",
          "Bros.",
          "C.",
          "C.B.",
          "C.D.s",
          "C.J.",
          "C.O.",
          "C.R.",
          "C.W.",
          "CEO.",
          "CO.",
          "CORP.",
          "COS.",
          "Cal.",
          "Calif.",
          "Capt.",
          "Cie.",
          "Cir.",
          "Cmdr.",
          "Co.",
          "Col.",
          "Colo.",
          "Comdr.",
          "Conn.",
          "Corp.",
          "Cos.",
          "D.",
          "D.B.",
          "D.C",
          "D.C.",
          "D.H.",
          "D.M.",
          "D.N.",
          "D.S.",
          "D.T",
          "D.T.",
          "D.s",
          "Dec.",
          "Del.",
          "Dept.",
          "Dev.",
          "Dr.",
          "Ds.",
          "E.",
          "E.E.",
          "E.F.",
          "E.I.",
          "E.M.",
          "E.R.",
          "E.W.",
          "Etc.",
          "F.",
          "F.A.",
          "F.A.O.",
          "F.C",
          "F.E.",
          "F.J.",
          "F.S.B.",
          "F.W.",
          "FEB.",
          "FL.",
          "Feb.",
          "Fed.",
          "Fla.",
          "Fran.",
          "French.",
          "Freon.",
          "Ft.",
          "G.",
          "G.D.",
          "G.L.",
          "G.O.",
          "G.S.",
          "G.m.b",
          "G.m.b.H.",
          "GP.",
          "GPO.",
          "Ga.",
          "Gen.",
          "Gov.",
          "H.",
          "H.F.",
          "H.G.",
          "H.H.",
          "H.J.",
          "H.L.",
          "H.R.",
          "Hon.",
          "I.",
          "I.B.M.",
          "I.C.H.",
          "I.E.P.",
          "I.M.",
          "I.V.",
          "I.W.",
          "II.",
          "III.",
          "INC.",
          "Intl.",
          "Int'l",
          "IV.",
          "IX.",
          "Ill.",
          "Inc.",
          "Ind.",
          "J.",
          "J.C.",
          "J.D.",
          "J.E.",
          "J.F.",
          "J.F.K.",
          "J.H.",
          "J.L.",
          "J.M.",
          "JohnQ.Public",
          "J.P.",
          "J.R.",
          "J.V",
          "J.V.",
          "J.X.",
          "Jan.",
          "Jansz.",
          "Je.",
          "Jos.",
          "Jr.",
          "K.",
          "K.C.",
          "Kan.",
          "Ky.",
          "L.",
          "L.A.",
          "L.H.",
          "L.J.",
          "L.L.",
          "L.M.",
          "L.P",
          "L.P.",
          "La.",
          "Lt.",
          "Ltd.",
          "M.",
          "M.A.",
          "M.B.A.",
          "M.D",
          "M.D.",
          "M.D.C.",
          "M.E.",
          "M.J.",
          "M.R.",
          "M.S.",
          "M.W.",
          "M8.7sp",
          "Maj.",
          "Mar.",
          "Mass.",
          "Md.",
          "Med.",
          "Messrs.",
          "Mfg.",
          "Mich.",
          "Minn.",
          "Mir.",
          "Miss.",
          "Mo.",
          "Mr.",
          "Mrs.",
          "Ms.",
          "Mt.",
          "N.",
          "N.A.",
          "N.C",
          "N.C.",
          "N.D",
          "N.D.",
          "N.H",
          "N.H.",
          "N.J",
          "N.J.",
          "N.M",
          "N.M.",
          "N.V",
          "N.V.",
          "N.Y",
          "N.Y.",
          "NOV.",
          "Neb.",
          "Nev.",
          "No.",
          "no.", 
          "Nos.",
          "Nov.",
          "O.",
          "O.P.",
          "OK.",
          "Oct.",
          "Okla.",
          "Ore.",
          "P.",
          "P.J.",
          "P.M",
          "P.M.",
          "P.R.",
          "Pa.",
          "Penn.",
          "Pfc.",
          "Ph.",
          "Ph.D.",
          "pro-U.N.",
          "Prof.",
          "Prop.",
          "Pty.",
          "Q.",
          "R.",
          "R.D.",
          "Ret.",
          "R.H.",
          "R.I",
          "R.I.",
          "R.L.",
          "R.P.",
          "R.R.",
          "R.W.",
          "RLV.",
          "Rd.",
          "Rep.",
          "Reps.",
          "Rev.",
          "S.",
          "S.A",
          "S.A.",
          "S.C",
          "S.C.",
          "S.D.",
          "S.G.",
          "S.I.",
          "S.P.",
          "S.S.",
          "S.p",
          "S.p.A",
          "S.p.A.",
          "SKr1.5",
          "Sen.",
          "Sens.",
          "Sept.",
          "Sgt.",
          "Snr.",
          "Spc.",
          "Sr.",
          "St.",
          "Sys.",
          "T.",
          "T.D.",
          "T.F.",
          "T.T.",
          "T.V.",
          "TEL.",
          "Tech.",
          "Tenn.",
          "Tex.",
          "Tx.",
          "U.",
          "U.Cal-Davis",
          "U.K",
          "U.K.",
          "U.N.",
          "U.S.",
          "U.S.A",
          "U.S.A.",
          "U.S.C.",
          "U.S.C..",
          "U.S.S.R",
          "U.S.S.R.",
          "UK.",
          "US116.7",
          "V.",
          "V.H.",
          "VI.",
          "VII.",
          "VIII.",
          "VS.",
          "Va.",
          "Vs.",
          "Vt.",
          "W.",
          "W.A.",
          "W.G.",
          "W.I.",
          "W.J.",
          "W.R.",
          "W.T.",
          "W.Va",
          "W.Va.",
          "Wash.",
          "Wis.",
          "Wyo.",
          "X.",
          "Y.",
          "Y.J.",
          "Z.",
          "a.",
          "a.d.",
          "a.k.a",
          "a.m",
          "a.m.",
          "al.",
          "b.",
          "c.",
          "c.i.f",
          "cf.",
          "cnsl.",
          "cnsls.",
          "cont'd.",
          "d.",
          "deft.",
          "defts.",
          "e.",
          "et.",
          "etc.",
          "etseq.",
          "f.",
          "f.o.b",
          "ft.",
          "g.",
          "h.",
          "i.",
          "i.e.",
          "j.",
          "k.",
          "l.",
          "m.",
          "mots.",
          "n.",
          "o.",
          "p.",
          "p.m",
          "p.m.",
          "pltf.",
          "pltfs.",
          "prelim.",
          "r.",
          "s.",
          "seq.",
          "supp.",
          "sq.",
          "t.",
          "u.",
          "v.",
          "vs.",
          "x.",
          "y.",
          "z.",
      })));

    }

    public boolean contains(String s){
      return mAbbrevSet.contains(s.toLowerCase());
    }
  }
  
  public static class WordToken {

    /** Start position */
    protected int mStart;

    /** End position */
    protected int mEnd;
    
    /** Counts how many new lines appear between this token and the previous one in the stream */
    protected int mNewLineCount;

    /** The lexem */
    protected String mWord;

    public WordToken(String w, 
        int s,
        int e) {
      mWord = w;
      mStart = s;
      mEnd = e;
      mNewLineCount = 0;
    }
    public WordToken(String w, int s, int e, int nl) {
      mWord = w;
      mStart = s;
      mEnd = e;
      mNewLineCount = nl;
    }

    public String toString() {
      StringBuffer buffer = new StringBuffer();
      buffer.append("[");
      buffer.append(mWord);
      buffer.append(", ");
      buffer.append(mStart);
      buffer.append(", ");
      buffer.append(mEnd);
      buffer.append("]");
      return buffer.toString();
    }

    public int getStart() { return mStart; }
    public void setStart(int i) { mStart = i; }

    public int getEnd() { return mEnd; }
    public void setEnd(int i) { mEnd = i; }
    
    public int getNewLineCount() { return mNewLineCount; }
    public void setNewLineCount(int i) { mNewLineCount = i; } 

    public String getWord() { return mWord; }
    public void setWord(String w) { mWord = w; }

  }
  
  /** Cached tokens for this buffer. Used by getNext */
  Word [] cachedTokens;
  /** Current position in the cachedTokens list. Used by getNext */
  int cachedPosition;
  
  @Override
  protected Word getNext() {
    if(cachedTokens == null){
      cachedTokens = tokenizeToWords();
      cachedPosition = 0;
    }
    
    if(cachedPosition >= cachedTokens.length){
      return null;
    }
    
    Word token = cachedTokens[cachedPosition];
    cachedPosition ++;
    
    return token;
  }

  public static void main(String argv[]) throws Exception {
    if(argv.length != 1){
      log.info("Usage: java edu.stanford.nlp.ie.machinereading.common.RobustTokenizer <file to tokenize>");
      System.exit(1);
    }

    // tokenize this file
    BufferedReader is = 
      new BufferedReader(new FileReader(argv[0])); 

    // read the whole file in a buffer
    // XXX: for sure there are more efficient ways of reading a file...
    int ch;
    StringBuffer buffer = new StringBuffer();
    while((ch = is.read()) != -1) buffer.append((char) ch);
    
    // create the tokenizer object
    RobustTokenizer<Word> t = new RobustTokenizer<>(buffer.toString());

    List<Word> tokens = t.tokenize();
    for (Word token : tokens) {
      System.out.println(token);
    }
  }
}