/* * RobustTokenizer.java * Performs tokenization of natural language English text, following ACE data * Use the method tokenize() for smart tokenization * @author Mihai */ package edu.stanford.nlp.ie.machinereading.domains.ace.reader; import edu.stanford.nlp.util.logging.Redwood; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.io.FileReader; import java.io.BufferedReader; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.process.AbstractTokenizer; import edu.stanford.nlp.util.Generics; public class RobustTokenizer<T extends Word> extends AbstractTokenizer<Word> { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(RobustTokenizer.class); /** Buffer to tokenize */ String buffer; /** The set of known abbreviations */ private AbbreviationMap mAbbreviations; public final static int MAX_MULTI_WORD_SIZE = 20; // basic tokens public final static String DOT = block("\\."); public final static String DOTDOT = block("\\:"); public final static String APOSTROPHE = block("\\'"); public final static String SLASH = block("\\/"); public final static String UNDERSCORE = block("\\_"); public final static String MINUS = block("\\-"); public final static String PLUS = block("\\+"); public final static String COMMA = block("\\,"); public final static String DOTCOMMA = block("\\;"); public final static String QUOTES = block(or("\\\"", "\\'\\'", "\\'", "\\`\\`", "\\`")); public final static String DOUBLE_QUOTES = block(or("\\\"" , "\\'\\'")); public final static String LRB = block("\\("); public final static String RRB = block("\\)"); public final static String LCB = block("\\{"); public final static String RCB = block("\\}"); public final static String GREATER = block("\\>"); public final static String LOWER = block("\\<"); public final static String AMPERSAND = block("\\&"); public final static String AT = block("\\@"); public final static String HTTP = block("[hH][tT][tT][pP]\\:\\/\\/"); // basic sequences public final static String WHITE_SPACE = block("\\s"); public final static String DIGIT = block("\\d"); public final static String LETTER = block("[a-zA-Z]"); public final static String UPPER = block("[A-Z]"); public final static String SIGN = or(MINUS,PLUS); // numbers public final static String FULLNUM = block( zeroOrOne(SIGN) + oneOrMore(DIGIT) + zeroOrMore( zeroOrOne(or(DOT, COMMA, SLASH)) + oneOrMore(DIGIT))); public final static String DECNUM = block(DOT + oneOrMore(DIGIT)); public final static String NUM = or(FULLNUM, DECNUM); // date and time public final static String DATE = block(oneOrMore(DIGIT) + SLASH + oneOrMore(DIGIT) + SLASH + oneOrMore(DIGIT)); public final static String TIME = block(oneOrMore(DIGIT) + oneOrMore(block( DOTDOT + oneOrMore(DIGIT)))); // punctuation marks public final static String PUNC = or(QUOTES, block(MINUS + oneOrMore(MINUS)), block(DOT + oneOrMore(DOT))); // words public final static String LETTERS = oneOrMore(LETTER); public final static String BLOCK = or(NUM, LETTERS); public final static String WORD = block(zeroOrOne(APOSTROPHE) + BLOCK + zeroOrMore(block( zeroOrOne(or(UNDERSCORE, MINUS, APOSTROPHE, SLASH, AMPERSAND)) + BLOCK))); // acronyms public final static String ACRONYM = block(oneOrMore(LETTER + DOT));// + zeroOrOne(LETTER)); // this matches acronyms AFTER abbreviation merging public final static String LOOSE_ACRONYM = block(oneOrMore((oneOrMore(LETTER) + DOT)) + zeroOrMore(LETTER)); // other possible constructs public final static String PAREN = or(LRB, RRB, LCB, RCB); public final static String SGML = "<[^<>]+>"; public final static String HTMLCODE = block(AMPERSAND + UPPER + DOTCOMMA); public final static String ANY = block("\\S"); // email addresses must start with a letter, contain @, and end with a letter public final static String EMAIL = block(LETTER + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + AT + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + LETTER); // email addresses must start with a letter, contain @, and end with . com public final static String DOMAIN_EMAIL = block(LETTER + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + AT + oneOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + zeroOrMore(WHITE_SPACE)+ DOT + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us")); // URLs must start with http:// or ftp://, followed by at least a letter public final static String URL = block(HTTP + oneOrMore(or(LETTER, DIGIT, DOT, UNDERSCORE, SLASH, AMPERSAND, MINUS, PLUS))); //URLs without http, but ending in org, com, net public final static String SMALL_URL = block(oneOrMore(oneOrMore(LETTER) + DOT) + zeroOrMore(WHITE_SPACE) + or("org", "ORG", "com", "COM", "net", "NET", "ru", "us")); // keep sequence of underscores as a single token public final static String UNDERSCORESEQ = oneOrMore("_"); // list bullet, e.g., "(a)" public final static String LIST_BULLET = block(LRB + LETTER + zeroOrOne(LETTER) + RRB); // part of a phone number, e.g., "(214)" public final static String PHONE_PART = block(LRB + oneOrMore(DIGIT) + RRB); // sequence of digits public final static String DIGITSEQ = oneOrMore(DIGIT); // the complete pattern public final static String RECOGNISED_PATTERN = block(block(TIME) + "|" + block(DOMAIN_EMAIL) + "|" + block(EMAIL) + "|" + block(URL) + "|" + // block(SMALL_URL) + "|" + block(ACRONYM) + "|" + block(DATE) + "|" + block(PHONE_PART) + "|" + // must be before WORD, otherwise it's broken into multiple tokens block(WORD) + "|" + block(PUNC) + "|" + block(LIST_BULLET) + "|" + block(PAREN) + "|" + block(SGML) + "|" + block(HTMLCODE) + "|" + block(UNDERSCORESEQ) + "|" + block(ANY)); /** The overall token pattern */ private final static Pattern wordPattern; /** Pattern to recognize SGML tags */ private final static Pattern sgmlPattern; /** Pattern to recognize slash-separated dates */ private final static Pattern slashDatePattern; /** Pattern to recognize acronyms */ private final static Pattern acronymPattern; /** Pattern to recognize URLs */ private final static Pattern urlPattern; /** Pattern to recognize emails */ private final static Pattern emailPattern; /** Recognized sequences of digits */ private final static Pattern digitSeqPattern; static{ wordPattern = Pattern.compile(RECOGNISED_PATTERN); sgmlPattern = Pattern.compile(SGML); slashDatePattern = Pattern.compile(DATE); acronymPattern = Pattern.compile(LOOSE_ACRONYM); urlPattern = Pattern.compile(URL); emailPattern = Pattern.compile(EMAIL); digitSeqPattern = Pattern.compile(DIGITSEQ); } public RobustTokenizer(String buffer) { mAbbreviations = new AbbreviationMap(true); this.buffer = buffer; this.cachedTokens = null; } public RobustTokenizer(boolean caseInsensitive, String buffer) { mAbbreviations = new AbbreviationMap(caseInsensitive); this.buffer = buffer; this.cachedTokens = null; } /** any in the set */ public static String range(String s){ return block("[" + s + "]"); } /** zero or one */ public static String zeroOrOne(String s){ return block(block(s) + "?"); } /** zero or more */ public static String zeroOrMore(String s){ return block(block(s) + "*"); } /** one or more */ public static String oneOrMore(String s){ return block(block(s) + "+"); } /** parens */ public static String block(String s){ return "(" + s + ")"; } /** any of the two */ public static String or(String s1, String s2){ return block(block(s1) + "|" + block(s2)); } /** any of the three */ public static String or(String s1, String s2, String s3){ return block(block(s1) + "|" + block(s2) + "|" + block(s3)); } /** any of the four */ public static String or(String s1, String s2, String s3, String s4){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4)); } /** any of the five */ public static String or(String s1, String s2, String s3, String s4, String s5){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5)); } /** any of the six */ public static String or(String s1, String s2, String s3, String s4, String s5, String s6){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5) + "|" + block(s6)); } /** any of the seven */ public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7)); } /** any of the eight */ public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7) + "|" + block(s8)); } /** any of the nine */ public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7) + "|" + block(s8) + "|" + block(s9)); } public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9, String s10){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7) + "|" + block(s8) + "|" + block(s9) + "|" + block(s10)); } public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9, String s10, String s11){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7) + "|" + block(s8) + "|" + block(s9) + "|" + block(s10) + "|" + block(s11)); } public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9, String s10, String s11, String s12){ return block(block(s1) + "|" + block(s2) + "|" + block(s3) + "|" + block(s4) + "|" + block(s5) + "|" + block(s6) + "|" + block(s7) + "|" + block(s8) + "|" + block(s9) + "|" + block(s10) + "|" + block(s11) + "|" + block(s12)); } /** not */ public static String rangeNot(String s){ return range(block("^" + s)); } private static int hasApostropheBlock(String s) { for(int i = s.length() - 1; i > 0; i --){ if(s.charAt(i) == '\'' && i < s.length() - 1){ return i; } if(! Character.isLetter(s.charAt(i))){ return -1; } } return -1; } private static <T extends WordToken> String concatenate(List<T> tokens, int start, int end) { StringBuffer buffer = new StringBuffer(); for(; start < end; start ++){ buffer.append(((WordToken) tokens.get(start)).getWord()); } return buffer.toString(); } private static <T extends WordToken> int countNewLines(List<T> tokens, int start, int end) { int count = 0; for(int i = start + 1; i < end; i ++){ count += tokens.get(i).getNewLineCount(); } return count; } public static boolean isUrl(String s) { Matcher match = urlPattern.matcher(s); return match.find(0); } public static boolean isEmail(String s) { Matcher match = emailPattern.matcher(s); return match.find(0); } public static boolean isSgml(String s) { Matcher match = sgmlPattern.matcher(s); return match.find(0); } public static boolean isSlashDate(String s) { Matcher match = slashDatePattern.matcher(s); return match.find(0); } public static boolean isAcronym(String s) { Matcher match = acronymPattern.matcher(s); return match.find(0); } public static boolean isDigitSeq(String s) { Matcher match = digitSeqPattern.matcher(s); return match.find(0); } public int countNewLines(String s, int start, int end) { int count = 0; for(int i = start; i < end; i ++) { if(s.charAt(i) == '\n') count ++; } return count; } /** * Smart tokenization storing the output in an array of CoreLabel * Sets the following fields: * - TextAnnotation - the text of the token * - TokenBeginAnnotation - the byte offset of the token (start) * - TokenEndAnnotation - the byte offset of the token (end) */ public Word [] tokenizeToWords() { List<WordToken> toks = tokenizeToWordTokens(); Word [] labels = new Word[toks.size()]; for(int i = 0; i < toks.size(); i ++){ WordToken tok = toks.get(i); Word l = new Word(tok.getWord(), tok.getStart(), tok.getEnd()); labels[i] = l; } return labels; } /** * Tokenizes a natural language string * @return List of WordTokens */ public List<WordToken> tokenizeToWordTokens() { List<WordToken> result = new ArrayList<>(); // // replace illegal characters with SPACE // /* StringBuffer buffer = new StringBuffer(); for(int i = 0; i < originalString.length(); i ++){ int c = (int) originalString.charAt(i); // // regular character // if(c > 31 && c < 127) buffer.append((char) c); else{ log.info("Control character at position " + i + ": " + c); // // DOS new line counts as two characters // if(c == 10) buffer.append(" "); // // other control character // else buffer.append(' '); } } */ Matcher match = wordPattern.matcher(buffer); int previousEndMatch = 0; // // Straight tokenization, ignoring known abbreviations // while(match.find()){ String crtMatch = match.group(); int endMatch = match.end(); int startMatch = endMatch - crtMatch.length(); int i; // found word ending in "n't" if (crtMatch.endsWith("n't")){ if (crtMatch.length() > 3){ WordToken token1 = new WordToken( crtMatch.substring(0, crtMatch.length() - 3), startMatch, endMatch - 3, countNewLines(buffer, previousEndMatch, startMatch)); result.add(token1); } WordToken token2 = new WordToken(crtMatch.substring(crtMatch.length() - 3, crtMatch.length()), endMatch - 3, endMatch, 0); result.add(token2); } // found word containing an appostrophe // XXX: is this too relaxed? e.g. "O'Hare" else if ((i = hasApostropheBlock(crtMatch)) != -1){ WordToken token1 = new WordToken(crtMatch.substring(0, i), startMatch, startMatch + i, countNewLines(buffer, previousEndMatch, startMatch)); WordToken token2 = new WordToken(crtMatch.substring(i, crtMatch.length()), startMatch + i, endMatch, 0); result.add(token1); result.add(token2); } // just a regular word else{ WordToken token = new WordToken(crtMatch, startMatch, endMatch, countNewLines(buffer, previousEndMatch, startMatch)); result.add(token); } previousEndMatch = endMatch; } // // Merge known abreviations // List<WordToken> resultWithAbs = new ArrayList<>(); for(int i = 0; i < result.size(); i ++){ // where the mw ends int end = result.size(); if(end > i + MAX_MULTI_WORD_SIZE) end = i + MAX_MULTI_WORD_SIZE; boolean found = false; // must have at least two tokens per multiword for(; end > i + 1; end --){ WordToken startToken = result.get(i); WordToken endToken = result.get(end - 1); if(countNewLines(result, i, end) == 0){ // abbreviation tokens cannot appear on different lines String conc = concatenate(result, i, end); found = false; // found a multiword if((mAbbreviations.contains(conc) == true)){ found = true; WordToken token = new WordToken(conc, startToken.getStart(), endToken.getEnd(), startToken.getNewLineCount()); resultWithAbs.add(token); i = end - 1; break; } } } // no multiword starting at this position found if(! found){ resultWithAbs.add(result.get(i)); } } resultWithAbs = postprocess(resultWithAbs); return resultWithAbs; } /** * Redefine this method to implement additional domain-specific tokenization rules * @param tokens */ protected List<WordToken> postprocess(List<WordToken> tokens) { return tokens; }; /** * Tokenizes and adds blank spaces were needed between each token */ public String tokenizeText() throws java.io.IOException{ List<WordToken> tokenList = tokenizeToWordTokens(); StringBuffer strBuffer = new StringBuffer(); Iterator<WordToken> iter = tokenList.iterator(); if (iter.hasNext()){ strBuffer.append(iter.next()); } while(iter.hasNext()){ strBuffer.append(" "); strBuffer.append(iter.next()); } return strBuffer.toString().replaceAll("\\s\\s+", " "); } public static class AbbreviationMap { private Set<String> mAbbrevSet; private static List<String> normalizeCase(boolean caseInsensitive, List<String> words) { if(! caseInsensitive) return words; List<String> normWords = new ArrayList<>(); for(String word: words) normWords.add(word.toLowerCase()); return normWords; } /** Creates a new instance of AbreviationMap with some know abbreviations */ public AbbreviationMap(boolean caseInsensitive) { mAbbrevSet = Generics.newHashSet(normalizeCase(caseInsensitive, Arrays.asList(new String[]{ "1.", "10.", "11.", "12.", "13.", "14.", "15.", "16.", "17.", "18.", "19.", "2.", "20.", "21.", "22.", "23.", "24.", "25.", "26.", "27.", "28.", "29.", "3.", "30.", "31.", "32.", "33.", "34.", "35.", "36.", "37.", "38.", "39.", "4.", "40.", "41.", "42.", "43.", "44.", "45.", "46.", "47.", "48.", "49.", "5.", "50.", "6.", "7.", "8.", "9.", "A.", "A.C.", "A.D.", "A.D.L.", "A.F.", "A.G.", "A.H.", "A.J.C.", "A.L.", "A.M", "A.M.", "A.P.", "A.T.B.", "AUG.", "Act.", "Adm.", "Ala.", "Ariz.", "Ark.", "Assn.", "Ass'n.", "Ass'n", "Aug.", "B.", "B.A.T", "B.B.", "B.F.", "B.J.", "B.V.", "Bancorp.", "Bhd.", "Blvd.", "Br.", "Brig.", "Bros.", "C.", "C.B.", "C.D.s", "C.J.", "C.O.", "C.R.", "C.W.", "CEO.", "CO.", "CORP.", "COS.", "Cal.", "Calif.", "Capt.", "Cie.", "Cir.", "Cmdr.", "Co.", "Col.", "Colo.", "Comdr.", "Conn.", "Corp.", "Cos.", "D.", "D.B.", "D.C", "D.C.", "D.H.", "D.M.", "D.N.", "D.S.", "D.T", "D.T.", "D.s", "Dec.", "Del.", "Dept.", "Dev.", "Dr.", "Ds.", "E.", "E.E.", "E.F.", "E.I.", "E.M.", "E.R.", "E.W.", "Etc.", "F.", "F.A.", "F.A.O.", "F.C", "F.E.", "F.J.", "F.S.B.", "F.W.", "FEB.", "FL.", "Feb.", "Fed.", "Fla.", "Fran.", "French.", "Freon.", "Ft.", "G.", "G.D.", "G.L.", "G.O.", "G.S.", "G.m.b", "G.m.b.H.", "GP.", "GPO.", "Ga.", "Gen.", "Gov.", "H.", "H.F.", "H.G.", "H.H.", "H.J.", "H.L.", "H.R.", "Hon.", "I.", "I.B.M.", "I.C.H.", "I.E.P.", "I.M.", "I.V.", "I.W.", "II.", "III.", "INC.", "Intl.", "Int'l", "IV.", "IX.", "Ill.", "Inc.", "Ind.", "J.", "J.C.", "J.D.", "J.E.", "J.F.", "J.F.K.", "J.H.", "J.L.", "J.M.", "JohnQ.Public", "J.P.", "J.R.", "J.V", "J.V.", "J.X.", "Jan.", "Jansz.", "Je.", "Jos.", "Jr.", "K.", "K.C.", "Kan.", "Ky.", "L.", "L.A.", "L.H.", "L.J.", "L.L.", "L.M.", "L.P", "L.P.", "La.", "Lt.", "Ltd.", "M.", "M.A.", "M.B.A.", "M.D", "M.D.", "M.D.C.", "M.E.", "M.J.", "M.R.", "M.S.", "M.W.", "M8.7sp", "Maj.", "Mar.", "Mass.", "Md.", "Med.", "Messrs.", "Mfg.", "Mich.", "Minn.", "Mir.", "Miss.", "Mo.", "Mr.", "Mrs.", "Ms.", "Mt.", "N.", "N.A.", "N.C", "N.C.", "N.D", "N.D.", "N.H", "N.H.", "N.J", "N.J.", "N.M", "N.M.", "N.V", "N.V.", "N.Y", "N.Y.", "NOV.", "Neb.", "Nev.", "No.", "no.", "Nos.", "Nov.", "O.", "O.P.", "OK.", "Oct.", "Okla.", "Ore.", "P.", "P.J.", "P.M", "P.M.", "P.R.", "Pa.", "Penn.", "Pfc.", "Ph.", "Ph.D.", "pro-U.N.", "Prof.", "Prop.", "Pty.", "Q.", "R.", "R.D.", "Ret.", "R.H.", "R.I", "R.I.", "R.L.", "R.P.", "R.R.", "R.W.", "RLV.", "Rd.", "Rep.", "Reps.", "Rev.", "S.", "S.A", "S.A.", "S.C", "S.C.", "S.D.", "S.G.", "S.I.", "S.P.", "S.S.", "S.p", "S.p.A", "S.p.A.", "SKr1.5", "Sen.", "Sens.", "Sept.", "Sgt.", "Snr.", "Spc.", "Sr.", "St.", "Sys.", "T.", "T.D.", "T.F.", "T.T.", "T.V.", "TEL.", "Tech.", "Tenn.", "Tex.", "Tx.", "U.", "U.Cal-Davis", "U.K", "U.K.", "U.N.", "U.S.", "U.S.A", "U.S.A.", "U.S.C.", "U.S.C..", "U.S.S.R", "U.S.S.R.", "UK.", "US116.7", "V.", "V.H.", "VI.", "VII.", "VIII.", "VS.", "Va.", "Vs.", "Vt.", "W.", "W.A.", "W.G.", "W.I.", "W.J.", "W.R.", "W.T.", "W.Va", "W.Va.", "Wash.", "Wis.", "Wyo.", "X.", "Y.", "Y.J.", "Z.", "a.", "a.d.", "a.k.a", "a.m", "a.m.", "al.", "b.", "c.", "c.i.f", "cf.", "cnsl.", "cnsls.", "cont'd.", "d.", "deft.", "defts.", "e.", "et.", "etc.", "etseq.", "f.", "f.o.b", "ft.", "g.", "h.", "i.", "i.e.", "j.", "k.", "l.", "m.", "mots.", "n.", "o.", "p.", "p.m", "p.m.", "pltf.", "pltfs.", "prelim.", "r.", "s.", "seq.", "supp.", "sq.", "t.", "u.", "v.", "vs.", "x.", "y.", "z.", }))); } public boolean contains(String s){ return mAbbrevSet.contains(s.toLowerCase()); } } public static class WordToken { /** Start position */ protected int mStart; /** End position */ protected int mEnd; /** Counts how many new lines appear between this token and the previous one in the stream */ protected int mNewLineCount; /** The lexem */ protected String mWord; public WordToken(String w, int s, int e) { mWord = w; mStart = s; mEnd = e; mNewLineCount = 0; } public WordToken(String w, int s, int e, int nl) { mWord = w; mStart = s; mEnd = e; mNewLineCount = nl; } public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("["); buffer.append(mWord); buffer.append(", "); buffer.append(mStart); buffer.append(", "); buffer.append(mEnd); buffer.append("]"); return buffer.toString(); } public int getStart() { return mStart; } public void setStart(int i) { mStart = i; } public int getEnd() { return mEnd; } public void setEnd(int i) { mEnd = i; } public int getNewLineCount() { return mNewLineCount; } public void setNewLineCount(int i) { mNewLineCount = i; } public String getWord() { return mWord; } public void setWord(String w) { mWord = w; } } /** Cached tokens for this buffer. Used by getNext */ Word [] cachedTokens; /** Current position in the cachedTokens list. Used by getNext */ int cachedPosition; @Override protected Word getNext() { if(cachedTokens == null){ cachedTokens = tokenizeToWords(); cachedPosition = 0; } if(cachedPosition >= cachedTokens.length){ return null; } Word token = cachedTokens[cachedPosition]; cachedPosition ++; return token; } public static void main(String argv[]) throws Exception { if(argv.length != 1){ log.info("Usage: java edu.stanford.nlp.ie.machinereading.common.RobustTokenizer <file to tokenize>"); System.exit(1); } // tokenize this file BufferedReader is = new BufferedReader(new FileReader(argv[0])); // read the whole file in a buffer // XXX: for sure there are more efficient ways of reading a file... int ch; StringBuffer buffer = new StringBuffer(); while((ch = is.read()) != -1) buffer.append((char) ch); // create the tokenizer object RobustTokenizer<Word> t = new RobustTokenizer<>(buffer.toString()); List<Word> tokens = t.tokenize(); for (Word token : tokens) { System.out.println(token); } } }