package edu.stanford.nlp.process;

import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.io.IOUtils;

/**
 * Tokenizer implementation that conforms to the Penn Treebank tokenization
 * conventions.
 * This tokenizer is a Java implementation of Professor Chris Manning's Flex
 * tokenizer, pgtt-treebank.l. It reads raw text and outputs
 * tokens as edu.stanford.nlp.ling.Word objects in the Penn Treebank format.
 * It can optionally return carriage returns as tokens.
 *
 * @author Tim Grow
 * @author Teg Grenager (grenager@stanford.edu)
 * @author Christopher Manning
 * @author Jenny Finkel (integrating in invertible PTB tokenizer)
 */
public class PTBTokenizer<T extends HasWord> extends AbstractTokenizer<T> {

  // todo: clean up treatment of invertible. Make it less of a type-changing parameter (from Word to CoreLabel)
  // todo: let Americanization be able to be turned off separately from other PTB escaping
  // todo: have the various options available to clients

  // whether carriage returns should be returned as tokens
  private boolean tokenizeCRs;
  private boolean invertible;
  private boolean suppressEscaping; // = false;
  // the underlying lexer
  private PTBLexer lexer;
  private LexedTokenFactory<T> tokenFactory;
  // private int position;

  /**
   * Constructs a new PTBTokenizer that treats carriage returns as normal
   * whitespace.
   *
   * @param r The Reader whose contents will be tokenized
   * @return A PTBTokenizer that tokenizes a stream to objects of type
   *     {@link Word}
   */
  public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
    return newPTBTokenizer(r, false);
  }

  /**
   * Constructs a new PTBTokenizer that optionally returns carriage returns
   * as their own token. CRs come back as Words whose text is
   * the value of <code>PTBLexer.cr</code>.
   *
   * @param r The Reader to read tokens from
   * @param tokenizeCRs Whether to return newlines as separate tokens
   *     (otherwise they normally disappear as whitespace)
   * @return A PTBTokenizer which returns Word tokens
   */
  public static PTBTokenizer<Word> newPTBTokenizer(Reader r, boolean tokenizeCRs) {
    return new PTBTokenizer<Word>(r, tokenizeCRs, new WordTokenFactory());
  }

  /**
   * Constructs a new PTBTokenizer that optionally returns carriage returns
   * as their own token. CRs come back as Words whose text is
   * the value of <code>PTBLexer.cr</code>.
   *
   * @param r The Reader to read tokens from
   * @param tokenizeCRs Whether to return newlines as separate tokens
   *     (otherwise they normally disappear as whitespace)
   * @param invertible If set to true, then will produce CoreLabels which
   *     will have fields for the string before and after, and the
   *     character offsets
   * @return A PTBTokenizer which returns CoreLabel objects
   */
  public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeCRs, boolean invertible) {
    return new PTBTokenizer<CoreLabel>(r, tokenizeCRs, invertible, new CoreLabelTokenFactory());
  }
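
  /**
   * A minimal usage sketch (illustrative only, not part of the original API
   * surface): shows how a caller might read PTB tokens from a Reader via the
   * static factory method above. The method name, sample sentence, and
   * variable names are assumptions made for the example.
   */
  private static void demoBasicTokenization() {
    Reader r = new StringReader("\"We beat some pretty good teams to get here,\" Slocum said.");
    PTBTokenizer<Word> tokenizer = newPTBTokenizer(r);
    // Prints one PTB token per line; quotes come back PTB-escaped (`` and '').
    while (tokenizer.hasNext()) {
      System.out.println(tokenizer.next().word());
    }
  }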

  /**
   * Constructs a new PTBTokenizer that optionally returns carriage returns
   * as their own token, and has a custom LexedTokenFactory.
   * CRs come back as Words whose text is
   * the value of <code>PTBLexer.cr</code>.
   *
   * @param r The Reader to read tokens from
   * @param tokenizeCRs Whether to return newlines as separate tokens
   *     (otherwise they normally disappear as whitespace)
   * @param tokenFactory The LexedTokenFactory to use to create
   *     tokens from the text
   */
  public PTBTokenizer(Reader r, boolean tokenizeCRs, LexedTokenFactory<T> tokenFactory) {
    this(r, tokenizeCRs, false, tokenFactory);
  }

  private PTBTokenizer(Reader r, boolean tokenizeCRs, boolean invertible, LexedTokenFactory<T> tokenFactory) {
    this(r, tokenizeCRs, invertible, false, tokenFactory);
  }

  private PTBTokenizer(Reader r, boolean tokenizeCRs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> tokenFactory) {
    this.tokenizeCRs = tokenizeCRs;
    this.tokenFactory = tokenFactory;
    this.invertible = invertible;
    this.suppressEscaping = suppressEscaping;
    setSource(r);
  }

  /**
   * Internally fetches the next token.
   *
   * @return The next token in the token stream, or null if none exists
   */
  @Override
  @SuppressWarnings("unchecked")
  protected T getNext() {
    // if (lexer == null) {
    //   return null;
    // }
    T token = null;
    try {
      token = (T) lexer.next();
      // cdm 2007: this shouldn't be necessary: PTBLexer decides for itself whether to return CRs based on the same flag!
      // get rid of CRs if necessary
      // while (!tokenizeCRs && PTBLexer.cr.equals(((HasWord) token).word())) {
      //   token = (T) lexer.next();
      // }
    } catch (Exception e) {
      nextToken = null; // do nothing, return null
    }
    return token;
  }

  /**
   * Sets the source of this Tokenizer to be the Reader r.
   *
   * @param r The Reader to tokenize from
   */
  public final void setSource(Reader r) {
    if (invertible) {
      lexer = new PTBLexer(r, invertible, tokenizeCRs);
    } else {
      lexer = new PTBLexer(r, tokenFactory, tokenizeCRs, suppressEscaping);
    }
    // position = 0;
  }

  /**
   * Returns a presentable version of the given PTB-tokenized text.
   * PTB tokenization splits up punctuation and does various other things
   * that make simply joining the tokens with spaces look bad. So join
   * the tokens with a space and run the result through this method to
   * produce nice-looking text. It's not perfect, but it works pretty well.
   */
  public static String ptb2Text(String ptbText) {
    StringBuilder sb = new StringBuilder(ptbText.length()); // probably an overestimate
    PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
    try {
      for (String token; (token = lexer.next()) != null; ) {
        sb.append(token);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return sb.toString();
  }
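
  /**
   * A minimal untokenization sketch (illustrative only, not part of the
   * original API surface): runs an already-PTB-tokenized string through
   * {@link #ptb2Text(String)}. The method name and sample tokens are
   * assumptions made for the example.
   */
  private static void demoUntokenize() {
    String tokenized = "`` I ca n't do it , '' she said .";
    // Expected to print something close to: "I can't do it," she said.
    System.out.println(ptb2Text(tokenized));
  }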

  /**
   * Writes a presentable version of the given PTB-tokenized text to the
   * given Writer. PTB tokenization splits up punctuation and does various
   * other things that make simply joining the tokens with spaces look bad,
   * so run the tokenized text through this method to produce nice-looking
   * text. It's not perfect, but it works pretty well.
   *
   * @return The number of tokens processed
   */
  public static int ptb2Text(Reader ptbText, Writer w) throws IOException {
    int numTokens = 0;
    PTB2TextLexer lexer = new PTB2TextLexer(ptbText);
    for (String token; (token = lexer.next()) != null; ) {
      numTokens++;
      w.write(token);
    }
    return numTokens;
  }

  private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
    Timing t = new Timing();
    int numTokens = 0;
    int sz = inputFileList.size();
    if (sz == 0) {
      Reader r = new InputStreamReader(System.in, charset);
      PrintWriter out = new PrintWriter(System.out, true);
      numTokens = ptb2Text(r, out);
    } else {
      for (int j = 0; j < sz; j++) {
        Reader r = IOUtils.readReaderFromString(inputFileList.get(j), charset);
        PrintWriter out;
        if (outputFileList == null) {
          out = new PrintWriter(System.out, true);
        } else {
          out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
        }
        numTokens += ptb2Text(r, out);
        out.close();
      }
    }
    long millis = t.stop();
    double wordspersec = numTokens / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    System.err.println("PTBTokenizer untokenized " + numTokens + " tokens at " + nf.format(wordspersec) + " tokens per second.");
  }

  /**
   * Returns a presentable version of the given PTB-tokenized words.
   * Pass in a List of Strings and this method will
   * join the words with spaces and call {@link #ptb2Text(String)} on the
   * result.
   */
  public static String ptb2Text(List<String> ptbWords) {
    return ptb2Text(StringUtils.join(ptbWords));
  }

  /**
   * Returns a presentable version of the given PTB-tokenized words.
   * Pass in a List of Words or a Document and this method will
   * join the words with spaces and call {@link #ptb2Text(String)} on the
   * result. This method takes the word() values to prevent additional
   * text from creeping in (e.g., POS tags).
   */
  public static String labelList2Text(List<? extends HasWord> ptbWords) {
    List<String> words = new ArrayList<String>();
    for (HasWord hw : ptbWords) {
      words.add(hw.word());
    }
    return ptb2Text(words);
  }

  private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd, boolean tokenizeNL, boolean preserveLines, boolean dump) throws IOException {
    Timing t = new Timing();
    int numTokens = 0;
    int sz = inputFileList.size();
    if (sz == 0) {
      Reader r = new InputStreamReader(System.in, charset);
      PrintWriter out = new PrintWriter(System.out, true);
      numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, tokenizeNL, preserveLines, dump);
    } else {
      for (int j = 0; j < sz; j++) {
        Reader r = IOUtils.readReaderFromString(inputFileList.get(j), charset);
        PrintWriter out;
        if (outputFileList == null) {
          out = new PrintWriter(System.out, true);
        } else {
          out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
        }
        numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, tokenizeNL, preserveLines, dump);
        r.close();
        if (outputFileList != null) out.close();
      } // end for j going through inputFileList
    }
    long millis = t.stop();
    double wordspersec = numTokens / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
System.err.println("PTBTokenizer tokenized " + numTokens + " tokens at " + nf.format(wordspersec) + " tokens per second."); } private static int tokReader(Reader r, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, boolean tokenizeNL, boolean preserveLines, boolean dump) { int numTokens = 0; PTBTokenizer<CoreLabel> tokenizer = PTBTokenizer.newPTBTokenizer(r, tokenizeNL, true); boolean printing = parseInsideBegin == null; // start off printing, unless you're looking for a start entity boolean beginLine = true; while (tokenizer.hasNext()) { CoreLabel obj = tokenizer.next(); String str = obj.word(); if (parseInsideBegin != null && parseInsideBegin.matcher(str).matches()) { printing = true; } else if (parseInsideEnd != null && parseInsideEnd.matcher(str).matches()) { printing = false; } else if (printing) { if (dump) { // after having checked for tags, change str to be exhaustive str = obj.toString(); } if (preserveLines) { if ("*CR*".equals(str)) { beginLine = true; out.println(); } else { if ( ! beginLine) { out.print(" "); } else { beginLine = false; } out.print(str); } } else { out.println(str); } } numTokens++; } return numTokens; } public static TokenizerFactory<Word> factory() { return PTBTokenizerFactory.newPTBTokenizerFactory(); } public static TokenizerFactory<Word> factory(boolean tokenizeCRs) { return PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeCRs); } public static <T extends HasWord> TokenizerFactory<T> factory(boolean tokenizeCRs, LexedTokenFactory<T> factory) { return new PTBTokenizerFactory<T>(tokenizeCRs, factory); } public static TokenizerFactory<CoreLabel> factory(boolean tokenizeCRs, boolean invertible) { return PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeCRs, invertible); } public static TokenizerFactory<Word> factory(boolean tokenizeCRs, boolean invertible, boolean suppressEscaping) { return PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeCRs, invertible, suppressEscaping); } public static class PTBTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> { protected boolean tokenizeCRs; protected boolean invertible; protected boolean suppressEscaping; // = false; protected LexedTokenFactory<T> factory; /** * Constructs a new PTBTokenizerFactory that treats carriage returns as * normal whitespace and returns Word objects. * * @return A TokenizerFactory that returns Word objects */ public static PTBTokenizerFactory<Word> newPTBTokenizerFactory() { return newPTBTokenizerFactory(false); } /** * Constructs a new PTBTokenizer that optionally returns carriage returns * as their own token. * * @param tokenizeCRs If true, CRs come back as Words whose text is * the value of <code>PTBLexer.cr</code>. * @return A TokenizerFactory that returns Word objects */ public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean tokenizeCRs) { return new PTBTokenizerFactory<Word>(tokenizeCRs, new WordTokenFactory()); } public PTBTokenizerFactory(boolean tokenizeCRs, LexedTokenFactory<T> factory) { this(tokenizeCRs, false, false, factory); } public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeCRs, boolean invertible) { return new PTBTokenizerFactory<CoreLabel>(tokenizeCRs, invertible, new CoreLabelTokenFactory()); } // I'm not sure what will happen // if you set both invertible and suppressEscaping to true. 
    // -pichuan (Wed Jan 31 23:12:04 2007)
    public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean tokenizeCRs, boolean invertible, boolean suppressEscaping) {
      return new PTBTokenizerFactory<Word>(tokenizeCRs, invertible, suppressEscaping, new WordTokenFactory());
    }

    private PTBTokenizerFactory(boolean tokenizeCRs, boolean invertible, LexedTokenFactory<T> factory) {
      this(tokenizeCRs, invertible, false, factory);
    }

    private PTBTokenizerFactory(boolean tokenizeCRs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
      this.tokenizeCRs = tokenizeCRs;
      this.invertible = invertible;
      this.suppressEscaping = suppressEscaping;
      this.factory = factory;
    }

    public Iterator<T> getIterator(Reader r) {
      return getTokenizer(r);
    }

    public Tokenizer<T> getTokenizer(Reader r) {
      return new PTBTokenizer<T>(r, tokenizeCRs, invertible, suppressEscaping, factory);
    }

  } // end static class PTBTokenizerFactory
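
  /**
   * A minimal factory usage sketch (illustrative only, not part of the
   * original API surface): a factory is convenient when the same tokenizer
   * configuration has to be applied to many Readers. The method name, sample
   * text, and variable names are assumptions made for the example.
   */
  private static void demoFactoryUsage() {
    PTBTokenizerFactory<Word> tf = PTBTokenizerFactory.newPTBTokenizerFactory(false);
    Tokenizer<Word> tokenizer = tf.getTokenizer(new StringReader("It runs on a factory-made tokenizer."));
    while (tokenizer.hasNext()) {
      System.out.println(tokenizer.next().word());
    }
  }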

  /**
   * Reads files named as arguments and prints their tokens, by default as
   * one per line. This is useful either for testing or to run
   * standalone to turn a corpus into a one-token-per-line file of tokens.
   * This main method assumes that the input files are in utf-8 encoding,
   * unless another encoding is specified with the -charset option.
   * <p/>
   * Usage: <code>
   * java edu.stanford.nlp.process.PTBTokenizer [options] filename+
   * </code>
   * <p/>
   * Options:
   * <ul>
   * <li> -nl Tokenize newlines as tokens
   * <li> -preserveLines Produce space-separated tokens, except
   *     when the original had a line break, not one-token-per-line
   * <li> -charset charset Specifies a character encoding
   * <li> -parseInside regex Names an XML-style tag or a regular expression
   *     over such elements. The tokenizer will only tokenize inside elements
   *     that match this name. (This is done by regex matching, not an XML
   *     parser, but works well for simple XML documents, or other SGML-style
   *     documents, such as Linguistic Data Consortium releases.)
   * <li> -ioFileList file* The remaining command-line arguments are treated as
   *     filenames that themselves contain lists of pairs of input-output
   *     filenames (2 columns, whitespace separated)
   * <li> -dump Print the whole of each CoreLabel, not just the value (word)
   * <li> -untok Heuristically untokenize tokenized text
   * <li> -h Print usage info
   * </ul>
   *
   * @param args Command line arguments
   * @throws IOException If any file I/O problem occurs
   */
  public static void main(String[] args) throws IOException {
    int i = 0;
    String charset = "utf-8";
    Pattern parseInsideBegin = null;
    Pattern parseInsideEnd = null;
    boolean tokenizeNL = false;
    boolean preserveLines = false;
    boolean inputOutputFileList = false;
    boolean dump = false;
    boolean untok = false;
    while (i < args.length && args[i].charAt(0) == '-') {
      if ("-nl".equals(args[i])) {
        tokenizeNL = true;
      } else if ("-preserveLines".equals(args[i])) {
        preserveLines = true;
        tokenizeNL = true;
      } else if ("-dump".equals(args[i])) {
        dump = true;
      } else if ("-ioFileList".equals(args[i])) {
        inputOutputFileList = true;
      } else if ("-charset".equals(args[i]) && i < args.length - 1) {
        i++;
        charset = args[i];
      } else if ("-parseInside".equals(args[i]) && i < args.length - 1) {
        i++;
        try {
          parseInsideBegin = Pattern.compile("<(?:" + args[i] + ")[^>]*?>");
          parseInsideEnd = Pattern.compile("</(?:" + args[i] + ")[^>]*?>");
        } catch (Exception e) {
          parseInsideBegin = null;
          parseInsideEnd = null;
        }
      } else if ("-untok".equals(args[i])) {
        untok = true;
      } else if ("-h".equals(args[i]) || "-help".equals(args[i]) || "--help".equals(args[i])) {
        System.err.println("usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
        System.err.println(" options: -nl|-preserveLines|-dump|-ioFileList|-charset|-parseInside|-h");
        return; // exit if they asked for help in options
      } else {
        System.err.println("Unknown option: " + args[i]);
      }
      i++;
    }
    ArrayList<String> inputFileList = new ArrayList<String>();
    ArrayList<String> outputFileList = null;
    if (inputOutputFileList) {
      outputFileList = new ArrayList<String>();
      for (int j = i; j < args.length; j++) {
        BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[j]), charset));
        for (String inLine; (inLine = r.readLine()) != null; ) {
          String[] fields = inLine.split("\\s+");
          inputFileList.add(fields[0]);
          if (fields.length > 1) {
            outputFileList.add(fields[1]);
          } else {
            outputFileList.add(fields[0] + ".tok");
          }
        }
        r.close();
      }
    } else {
      inputFileList.addAll(Arrays.asList(args).subList(i, args.length));
    }
    if (untok) {
      untok(inputFileList, outputFileList, charset);
    } else {
      tok(inputFileList, outputFileList, charset, parseInsideBegin, parseInsideEnd, tokenizeNL, preserveLines, dump);
    }
  } // end main

} // end PTBTokenizer