package edu.stanford.nlp.process;
// Stanford English Tokenizer -- a deterministic, fast high-quality tokenizer
// Copyright (c) 2002-2009 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
// java-nlp-support@lists.stanford.edu
// http://nlp.stanford.edu/software/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
/**
* Fast, rule-based tokenizer implementation, initially written to conform to the Penn Treebank tokenization conventions, but now providing
* a range of tokenization options over a broader space of Unicode text. It reads raw text and outputs tokens of classes that implement
* edu.stanford.nlp.trees.HasWord (typically a Word or a CoreLabel). It can optionally return carriage returns as tokens.
* <p>
* New code is encouraged to use the {@link #HunTokenizer(Reader,LexedTokenFactory,String)} constructor. The other constructors are
* historical. You specify the type of result tokens with a LexedTokenFactory, and can specify the treatment of tokens by boolean options
* given in a comma separated String options (e.g., "invertible,normalizeParentheses=true"). If the String is <code>null</code> or empty,
* you get the traditional Hun3 normalization behaviour (i.e., you get Hun3Escaping=false). If you want no normalization, then you should
* pass in the String "Hun3Escaping=false". The known option names are:
* <ol>
* <li>invertible: Store enough information about the original form of the token and the whitespace around it that a list of tokens can be
* faithfully converted back to the original String. Valid only if the LexedTokenFactory is an instance of CoreLabelTokenFactory. The keys
* used in it are TextAnnotation for the tokenized form, CurrentAnnotation for the original string, BeforeAnnotation and AfterAnnotation for
* the whitespace before and after a token, and perhaps BeginPositionAnnotation and EndPositionAnnotation to record token begin/after end
* offsets, if they were specified to be recorded in TokenFactory construction. (Like the String class, begin and end are done so end -
* begin gives the token length.)
* <li>tokenizeNLs: Whether end-of-lines should become tokens (or just be treated as part of whitespace)
* <li>Hun3Escaping: Enable all traditional Hun3 token transforms (like -LRB-, -RRB-). This is a macro flag that sets or clears all the
* options below.
* <li>americanize: Whether to rewrite common British English spellings as American English spellings
* <li>normalizeSpace: Whether any spaces in tokens (phone numbers, fractions) get turned into U+00A0 (non-breaking space). It's dangerous to
* turn this off for most of our Stanford NLP software, which assumes no spaces in tokens.
* <li>normalizeAmpersandEntity: Whether to map the XML & to an ampersand
* <li>normalizeCurrency: Whether to do some awful lossy currency mappings to turn common currency characters into $, #, or "cents",
* reflecting the fact that nothing else appears in the old Hun3 WSJ. (No Euro!)
* <li>normalizeFractions: Whether to map certain common composed fraction characters to spelled out letter forms like "1/2"
* <li>normalizeParentheses: Whether to map round parentheses to -LRB-, -RRB-, as in the Penn Treebank
* <li>normalizeOtherBrackets: Whether to map other common bracket characters to -LCB-, -LRB-, -RCB-, -RRB-, roughly as in the Penn Treebank
* <li>asciiQuotes Whether to map quote characters to the traditional ' and "
* <li>latexQuotes: Whether to map to ``, `, ', '' for quotes, as in Latex and the Hun3 WSJ (though this is now heavily frowned on in
* Unicode). If true, this takes precedence over the setting of unicodeQuotes; if both are false, no mapping is done.
* <li>unicodeQuotes: Whether to map quotes to the range U+2018 to U+201D, the preferred unicode encoding of single and double quotes.
* <li>Hun3Ellipsis: Whether to map ellipses to ..., the old Hun3 WSJ coding of an ellipsis. If true, this takes precedence over the setting
* of unicodeEllipsis; if both are false, no mapping is done.
* <li>unicodeEllipsis: Whether to map dot and optional space sequences to U+2026, the Unicode ellipsis character
* <li>Hun3Dashes: Whether to turn various dash characters into "--", the dominant encoding of dashes in the Hun3 WSJ
* <li>escapeForwardSlashAsterisk: Whether to put a backslash escape in front of / and * as the old Hun3 WSJ does for some reason (something
* to do with Lisp readers??).
* <li>untokenizable: What to do with untokenizable characters (ones not known to the tokenizer). Six options combining whether to log a
* warning for none, the first, or all, and whether to delete them or to include them as single character tokens in the output: noneDelete,
* firstDelete, allDelete, noneKeep, firstKeep, allKeep. The default is "firstDelete".
* </ol>
*
* @author Tim Grow (his tokenizer is a Java implementation of Professor Chris Manning's Flex tokenizer, pgtt-treebank.l)
* @author Teg Grenager (grenager@stanford.edu)
* @author Jenny Finkel (integrating in invertible Hun tokenizer)
* @author Christopher Manning (redid API, added many options, maintenance)
*/
public class HunTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
// The underlying JFlex-generated lexer that does the actual tokenization work.
// Set once in every constructor and read by getNext().
private HunPTBLexer lexer;
/**
* Constructs a new HunTokenizer that returns Word tokens and which treats carriage returns as normal whitespace.
*
* @param r
* The Reader whose contents will be tokenized
* @return A HunTokenizer that tokenizes a stream to objects of type {@link Word}
*/
public static HunTokenizer<Word> newHunTokenizer(Reader r) {
  // Delegate to the two-argument factory with newline tokenization disabled.
  final boolean tokenizeNLs = false;
  return newHunTokenizer(r, tokenizeNLs);
}
/**
* Constructs a new HunTokenizer that optionally returns newlines as their own token. NLs come back as Words whose text is the value of
* <code>HunLexer.NEWLINE_TOKEN</code>.
*
* @param r
* The Reader to read tokens from
* @param tokenizeNLs
* Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace)
* @return A HunTokenizer which returns Word tokens
*/
public static HunTokenizer<Word> newHunTokenizer(Reader r, boolean tokenizeNLs) {
  // Word tokens, not invertible, with the traditional Hun3 normalizations enabled.
  final WordTokenFactory wordFactory = new WordTokenFactory();
  return new HunTokenizer<Word>(r, tokenizeNLs, false, false, wordFactory);
}
/**
* Constructs a new HunTokenizer that makes CoreLabel tokens. It optionally returns carriage returns as their own token. CRs come back as
* Words whose text is the value of <code>HunLexer.NEWLINE_TOKEN</code>.
*
* @param r
* The Reader to read tokens from
* @param tokenizeNLs
* Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace)
* @param invertible
* if set to true, then will produce CoreLabels which will have fields for the string before and after, and the character offsets
* @return A HunTokenizer which returns CoreLabel objects
*/
public static HunTokenizer<CoreLabel> newHunTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
  // CoreLabel tokens with the traditional Hun3 normalizations enabled.
  final CoreLabelTokenFactory coreLabelFactory = new CoreLabelTokenFactory();
  return new HunTokenizer<CoreLabel>(r, tokenizeNLs, invertible, false, coreLabelFactory);
}
/**
* Constructs a new HunTokenizer that optionally returns carriage returns as their own token, and has a custom LexedTokenFactory. If asked
* for, CRs come back as Words whose text is the value of <code>HunLexer.cr</code>. This constructor translates between the traditional
* boolean options of HunTokenizer and the new options String.
*
* @param r
* The Reader to read tokens from
* @param tokenizeNLs
* Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace)
* @param invertible
* if set to true, then will produce CoreLabels which will have fields for the string before and after, and the character offsets
* @param suppressEscaping
* If true, all the traditional Penn Treebank normalizations are turned off. Otherwise, they all happen.
* @param tokenFactory
* The LexedTokenFactory to use to create tokens from the text.
*/
private HunTokenizer(final Reader r, final boolean tokenizeNLs, final boolean invertible, final boolean suppressEscaping,
final LexedTokenFactory<T> tokenFactory) {
StringBuilder options = new StringBuilder();
if (suppressEscaping) {
options.append("Hun3Escaping=false");
} else {
options.append("Hun3Escaping=true"); // i.e., turn on all the historical Hun normalizations
}
if (tokenizeNLs) {
options.append(",tokenizeNLs");
}
if (invertible) {
options.append(",invertible");
}
lexer = new HunPTBLexer(r, tokenFactory, options.toString());
}
/**
* Constructs a new HunTokenizer with a custom LexedTokenFactory. Many options for tokenization and what is returned can be set via the
* options String. See the class documentation for details on the options String. This is the new recommended constructor!
*
* @param r
* The Reader to read tokens from.
* @param tokenFactory
* The LexedTokenFactory to use to create tokens from the text.
* @param options
* Options to the lexer. See the extensive documentation in the class javadoc. The String may be null or empty, which means that
* all traditional Hun normalizations are done. You can pass in "Hun3Escaping=false" and have no normalizations done (that is,
* the behavior of the old suppressEscaping=true option).
*/
public HunTokenizer(final Reader r, final LexedTokenFactory<T> tokenFactory, final String options) {
// A null or empty options String means the traditional Hun3 normalizations are applied
// (see the class javadoc for the full list of option names).
lexer = new HunPTBLexer(r, tokenFactory, options);
}
/**
* Internally fetches the next token.
*
* @return the next token in the token stream, or null if none exists.
*/
@Override
@SuppressWarnings("unchecked")
protected T getNext() {
  // Fetch the next token from the lexer. Any failure while lexing (typically an
  // IOException from the underlying Reader) is deliberately treated as
  // end-of-stream rather than propagated: we clear the AbstractTokenizer
  // lookahead (nextToken) so no stale token is handed out, and return null.
  // (Removed long-dead commented-out CR-filtering code: the lexer itself decides
  // whether to return newline tokens based on the tokenizeNLs option.)
  try {
    return (T) lexer.next();
  } catch (Exception e) {
    nextToken = null;
    return null;
  }
}
/**
* Returns a presentable version of the given Hun-tokenized text. Hun tokenization splits up punctuation and does various other things
* that makes simply joining the tokens with spaces look bad. So join the tokens with space and run it through this method to produce nice
* looking text. It's not perfect, but it works pretty well.
*
* @param HunText
* A String in Hun3-escaped form
* @return An approximation to the original String
*/
public static String Hun2Text(String HunText) {
  // The input length is probably an overestimate of the output size, but it is a cheap bound.
  StringBuilder out = new StringBuilder(HunText.length());
  PTB2TextLexer detokenizer = new PTB2TextLexer(new StringReader(HunText));
  try {
    String piece = detokenizer.next();
    while (piece != null) {
      out.append(piece);
      piece = detokenizer.next();
    }
  } catch (IOException e) {
    // Reading from a StringReader should not fail in practice; report and return what we have.
    e.printStackTrace();
  }
  return out.toString();
}
/**
* Returns a presentable version of a given Hun token. For instance, it transforms -LRB- into (.
*/
public static String HunToken2Text(String HunText) {
  // Surround the token with spaces so the detokenizer sees it as a complete,
  // standalone token, then strip the padding from the result.
  String padded = ' ' + HunText + ' ';
  return Hun2Text(padded).trim();
}
/**
* Writes a presentable version of the given Hun-tokenized text. Hun tokenization splits up punctuation and does various other things that
* makes simply joining the tokens with spaces look bad. So join the tokens with space and run it through this method to produce nice
* looking text. It's not perfect, but it works pretty well.
*/
public static int Hun2Text(Reader HunText, Writer w) throws IOException {
  PTB2TextLexer detokenizer = new PTB2TextLexer(HunText);
  int count = 0;
  // Stream pieces straight through to the Writer, counting as we go.
  String piece = detokenizer.next();
  while (piece != null) {
    count++;
    w.write(piece);
    piece = detokenizer.next();
  }
  return count;
}
/**
 * Untokenizes each input file (or stdin if the list is empty) and reports throughput.
 * Output goes to the parallel outputFileList entry, or to stdout if outputFileList is null.
 */
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
  Timing t = new Timing();
  int numTokens = 0;
  int sz = inputFileList.size();
  if (sz == 0) {
    // No files given: untokenize stdin to stdout.
    Reader r = new InputStreamReader(System.in, charset);
    PrintWriter out = new PrintWriter(System.out, true);
    numTokens = Hun2Text(r, out);
    out.close();
  } else {
    for (int j = 0; j < sz; j++) {
      Reader r = IOUtils.readerFromString(inputFileList.get(j), charset);
      PrintWriter out;
      if (outputFileList == null) {
        out = new PrintWriter(System.out, true);
      } else {
        out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
      }
      numTokens += Hun2Text(r, out);
      // Bug fixes (mirroring tok()): the input Reader was previously leaked, and the
      // writer was closed unconditionally — which closed the System.out wrapper after
      // the first file and silenced the output of every subsequent file.
      r.close();
      if (outputFileList != null) {
        out.close();
      }
    }
  }
  long millis = t.stop();
  double wordspersec = numTokens / (((double) millis) / 1000);
  NumberFormat nf = new DecimalFormat("0.00");
  System.err.println("HunTokenizer untokenized " + numTokens + " tokens at " + nf.format(wordspersec) + " tokens per second.");
}
/**
* Returns a presentable version of the given Hun-tokenized words. Pass in a List of Strings and this method will join the words with
* spaces and call {@link #Hun2Text(String)} on the output.
*
* @param HunWords
* A list of String
* @return A presentable version of the given Hun-tokenized words
*/
public static String Hun2Text(List<String> HunWords) {
// Join the words with spaces (per the contract documented above) and detokenize the result.
return Hun2Text(StringUtils.join(HunWords));
}
/**
* Returns a presentable version of the given Hun-tokenized words. Pass in a List of Words or a Document and this method will join the
* words with spaces and call {@link #Hun2Text(String)} on the output. This method will take the word() values to prevent additional text
* from creeping in (e.g., POS tags).
*
* @param HunWords
* A list of HasWord objects
* @return A presentable version of the given Hun-tokenized words
*/
public static String labelList2Text(List<? extends HasWord> HunWords) {
  // Extract only word() from each label so extra annotations (e.g., POS tags)
  // cannot leak into the detokenized text.
  List<String> plainWords = new ArrayList<String>(HunWords.size());
  for (HasWord label : HunWords) {
    plainWords.add(label.word());
  }
  return Hun2Text(plainWords);
}
/**
 * Tokenizes each input file (or stdin if the list is empty) and reports throughput.
 * Output goes to the parallel outputFileList entry, or to stdout if outputFileList is null.
 */
private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd,
    String options, boolean preserveLines, boolean dump) throws IOException {
  Timing timer = new Timing();
  int numTokens = 0;
  int numFiles = inputFileList.size();
  if (numFiles == 0) {
    // No files given: tokenize stdin to stdout.
    Reader stdin = new InputStreamReader(System.in, charset);
    PrintWriter stdout = new PrintWriter(System.out, true);
    numTokens += tokReader(stdin, stdout, parseInsideBegin, parseInsideEnd, options, preserveLines, dump);
  } else {
    for (int j = 0; j < numFiles; j++) {
      Reader in = IOUtils.readerFromString(inputFileList.get(j), charset);
      PrintWriter out = (outputFileList == null)
          ? new PrintWriter(System.out, true)
          : new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
      numTokens += tokReader(in, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump);
      in.close();
      // Only close per-file writers; never close the System.out wrapper mid-run.
      if (outputFileList != null) {
        out.close();
      }
    }
  }
  long millis = timer.stop();
  double tokensPerSecond = numTokens / (((double) millis) / 1000);
  NumberFormat nf = new DecimalFormat("0.00");
  System.err.println("HunTokenizer tokenized " + numTokens + " tokens at " + nf.format(tokensPerSecond) + " tokens per second.");
}
/**
 * Tokenizes one Reader to the given PrintWriter, returning the number of tokens seen
 * (tags matched by parseInsideBegin/End and suppressed tokens are still counted).
 */
private static int tokReader(Reader r, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump) {
  int numTokens = 0;
  HunTokenizer<CoreLabel> tokenizer = new HunTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options);
  // With no begin pattern we print everything; otherwise stay silent until a begin tag appears.
  boolean printing = (parseInsideBegin == null);
  boolean beginLine = true;
  while (tokenizer.hasNext()) {
    CoreLabel label = tokenizer.next();
    numTokens++;
    String text = label.word();
    if (parseInsideBegin != null && parseInsideBegin.matcher(text).matches()) {
      printing = true;
      continue;
    }
    if (parseInsideEnd != null && parseInsideEnd.matcher(text).matches()) {
      printing = false;
      continue;
    }
    if (!printing) {
      continue;
    }
    if (dump) {
      // After the tag checks, switch to the exhaustive CoreLabel representation.
      text = label.toString();
    }
    if (!preserveLines) {
      // Default mode: one token per line.
      out.println(text);
    } else if (HunPTBLexer.NEWLINE_TOKEN.equals(text)) {
      // A newline token ends the current output line.
      beginLine = true;
      out.println();
    } else {
      // Space-separate tokens within a line.
      if (beginLine) {
        beginLine = false;
      } else {
        out.print(" ");
      }
      out.print(text);
    }
  }
  return numTokens;
}
/** Returns a TokenizerFactory that vends tokenizers producing {@link Word} tokens with default options. */
public static TokenizerFactory<Word> factory() {
return HunTokenizerFactory.newTokenizerFactory();
}
/**
* Returns a TokenizerFactory producing tokens built by the given LexedTokenFactory.
*
* @param tokenizeNLs Whether newlines are returned as their own tokens
* @param factory The LexedTokenFactory that builds each token
* @return A TokenizerFactory producing tokens of type T
*/
public static <T extends HasWord> TokenizerFactory<T> factory(boolean tokenizeNLs, LexedTokenFactory<T> factory) {
return new HunTokenizerFactory<T>(tokenizeNLs, false, false, factory);
}
/**
* Returns a TokenizerFactory producing CoreLabel tokens.
*
* @param tokenizeNLs Whether newlines are returned as their own tokens
* @param invertible Whether tokens record original text and surrounding whitespace
* @return A TokenizerFactory producing CoreLabel tokens
*/
public static TokenizerFactory<CoreLabel> factory(boolean tokenizeNLs, boolean invertible) {
return HunTokenizerFactory.newHunTokenizerFactory(tokenizeNLs, invertible);
}
/**
* Get a TokenizerFactory that does Penn Treebank tokenization. This is now the recommended factory method to use.
*
* @param factory
* A TokenFactory that determines what form of token is returned by the Tokenizer
* @param options
* A String specifying options (see the class javadoc for details)
* @param <T>
* The type of the tokens built by the LexedTokenFactory
* @return A TokenizerFactory that does Penn Treebank tokenization
*/
public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) {
// The options String is interpreted by the lexer; see the class javadoc for the known names.
return new HunTokenizerFactory<T>(factory, options);
}
/**
* This class provides a factory which will vend instances of HunTokenizer which wrap a provided Reader. See the documentation for
* {@link HunTokenizer} for details of the parameters and options.
*
* @see HunTokenizer
* @param <T>
* The class of the returned tokens
*/
public static class HunTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {
// Builds each token object (e.g., Word or CoreLabel) from the lexer's output.
protected LexedTokenFactory<T> factory;
// Comma-separated lexer options; may later be replaced via setOptions(String).
protected String options;
/**
* Constructs a new TokenizerFactory that returns Word objects and treats carriage returns as normal whitespace. THIS METHOD IS INVOKED
* BY REFLECTION BY SOME OF THE JAVANLP CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A TokenizerFactory.
*
* @return A TokenizerFactory that returns Word objects
*/
public static TokenizerFactory<Word> newTokenizerFactory() {
// An empty options String means the traditional Hun3 normalizations are applied.
return newHunTokenizerFactory(new WordTokenFactory(), "");
}
/**
* Constructs a new HunTokenizer that optionally returns carriage returns as their own token.
*
* @param tokenizeNLs
* If true, newlines come back as Words whose text is the value of <code>HunLexer.NEWLINE_TOKEN</code> .
* @return A TokenizerFactory that returns Word objects
*/
public static HunTokenizerFactory<Word> newHunTokenizerFactory(boolean tokenizeNLs) {
// Word tokens; not invertible; traditional Hun3 escaping enabled.
return new HunTokenizerFactory<Word>(tokenizeNLs, false, false, new WordTokenFactory());
}
/**
* Constructs a new HunTokenizer that returns Word objects and uses the options passed in.
*
* @param options
* A String of options
* @return A TokenizerFactory that returns Word objects
*/
public static HunTokenizerFactory<Word> newWordTokenizerFactory(String options) {
// Word tokens with the caller-supplied options String passed straight to the lexer.
return new HunTokenizerFactory<Word>(new WordTokenFactory(), options);
}
/**
* Constructs a new HunTokenizer that returns CoreLabel objects and uses the options passed in.
*
* @param options
* A String of options
* @return A TokenizerFactory that returns CoreLabel objects o
*/
public static HunTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
// CoreLabel tokens with the caller-supplied options String passed straight to the lexer.
return new HunTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), options);
}
/**
* Constructs a new HunTokenizer that uses the LexedTokenFactory and options passed in.
*
* @param tokenFactory
* The LexedTokenFactory
* @param options
* A String of options
* @return A TokenizerFactory that returns objects of the type of the LexedTokenFactory
*/
public static <T extends HasWord> HunTokenizerFactory<T> newHunTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
// Preferred general-purpose factory method: any token type, explicit options String.
return new HunTokenizerFactory<T>(tokenFactory, options);
}
/**
* Constructs a factory producing CoreLabel tokens, translating the historical boolean flags.
*
* @param tokenizeNLs Whether newlines are returned as their own tokens
* @param invertible Whether tokens record original text and surrounding whitespace
* @return A HunTokenizerFactory producing CoreLabel tokens
*/
public static HunTokenizerFactory<CoreLabel> newHunTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
return new HunTokenizerFactory<CoreLabel>(tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
}
// Constructors
/** Translates the historical boolean flags into the lexer's options String. */
private HunTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
  this.factory = factory;
  final StringBuilder optionBuilder = new StringBuilder();
  // Hun3Escaping is a macro flag: it turns all traditional Hun normalizations on or off at once.
  optionBuilder.append(suppressEscaping ? "Hun3Escaping=false" : "Hun3Escaping=true");
  if (tokenizeNLs) {
    optionBuilder.append(",tokenizeNLs");
  }
  if (invertible) {
    optionBuilder.append(",invertible");
  }
  this.options = optionBuilder.toString();
}
/** Stores the token factory and the raw options String; no translation is performed. */
private HunTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
this.factory = tokenFactory;
this.options = options;
}
/** Returns a tokenizer wrapping the given Reader. */
public Iterator<T> getIterator(Reader r) {
// A Tokenizer is itself an Iterator, so simply delegate.
return getTokenizer(r);
}
/** Returns a tokenizer wrapping the given Reader, configured with this factory's options. */
public Tokenizer<T> getTokenizer(Reader r) {
return new HunTokenizer<T>(r, factory, options);
}
@Override
public void setOptions(String options) {
// NOTE(review): this replaces (rather than appends to) any options given at
// construction time — confirm callers expect full replacement.
this.options = options;
}
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  // Bug fix: this previously instantiated PTBTokenizer, a class name left over
  // from a rename that does not exist in this source; it must build HunTokenizer.
  if (options == null || options.equals("")) {
    // No base options: the extra options stand alone.
    return new HunTokenizer<T>(r, factory, extraOptions);
  } else {
    // Append the extra options after the factory's base options.
    return new HunTokenizer<T>(r, factory, options + "," + extraOptions);
  }
}
} // end static class HunTokenizerFactory
/**
* Reads files named as arguments and print their tokens, by default as one per line. This is useful either for testing or to run
* standalone to turn a corpus into a one-token-per-line file of tokens. This main method assumes that the input file is in utf-8
* encoding, unless it is specified.
* <p/>
* Usage: <code>
* java edu.stanford.nlp.process.HunTokenizer [options] filename+
* </code>
* <p/>
* Options:
* <ul>
* <li>-options options Set various tokenization options (see the documentation in the class javadoc)
* <li>-preserveLines Produce space-separated tokens, except when the original had a line break, not one-token-per-line
* <li>-charset charset Specifies a character encoding
* <li>-parseInside regex Names an XML-style tag or a regular expression over such elements. The tokenizer will only tokenize inside
* element that match this name. (This is done by regex matching, not an XML parser, but works well for simple XML documents, or other
* SGML-style documents, such as Linguistic Data Consortium releases, which adopt the convention that a line of a file is either XML
* markup or character data but never both.)
* <li>-ioFileList file* The remaining command-line arguments are treated as filenames that themselves contain lists of pairs of
* input-output filenames (2 column, whitespace separated).
* <li>-dump Print the whole of each CoreLabel, not just the value (word)
* <li>-untok Heuristically untokenize tokenized text
* <li>-h Print usage info
* </ul>
*
* @param args
* Command line arguments
* @throws IOException
* If any file I/O problem
*/
public static void main(String[] args) throws IOException {
  int i = 0;
  String charset = "utf-8";
  Pattern parseInsideBegin = null;
  Pattern parseInsideEnd = null;
  StringBuilder optionsSB = new StringBuilder();
  boolean preserveLines = false;
  boolean inputOutputFileList = false;
  boolean dump = false;
  boolean untok = false;
  // Process leading flag arguments (everything starting with '-').
  while (i < args.length && args[i].charAt(0) == '-') {
    if ("-options".equals(args[i]) && i < args.length - 1) {
      // Bounds check added: previously "-options" as the final argument threw an
      // ArrayIndexOutOfBoundsException (cf. the guards on -charset and -parseInside).
      i++;
      optionsSB.append(',');
      optionsSB.append(args[i]);
    } else if ("-preserveLines".equals(args[i])) {
      // preserveLines needs the lexer to emit newline tokens.
      optionsSB.append(",tokenizeNLs");
      preserveLines = true;
    } else if ("-dump".equals(args[i])) {
      dump = true;
    } else if ("-ioFileList".equals(args[i])) {
      inputOutputFileList = true;
    } else if ("-charset".equals(args[i]) && i < args.length - 1) {
      i++;
      charset = args[i];
    } else if ("-parseInside".equals(args[i]) && i < args.length - 1) {
      i++;
      try {
        // Build begin/end tag patterns from the element-name regex.
        parseInsideBegin = Pattern.compile("<(?:" + args[i] + ")[^>]*?>");
        parseInsideEnd = Pattern.compile("</(?:" + args[i] + ")[^>]*?>");
      } catch (Exception e) {
        // Bad element regex: fall back to tokenizing the whole input.
        parseInsideBegin = null;
        parseInsideEnd = null;
      }
    } else if ("-untok".equals(args[i])) {
      untok = true;
    } else if ("-h".equals(args[i]) || "-help".equals(args[i]) || "--help".equals(args[i])) {
      System.err.println("usage: java edu.stanford.nlp.process.HunTokenizer [options]* filename*");
      System.err.println(" options: -preserveLines|-dump|-ioFileList|-charset|-parseInside elementRegex|-options options|-h");
      return; // exit if they asked for help in options
    } else {
      System.err.println("Unknown option: " + args[i]);
    }
    i++;
  }
  // Remaining arguments are input files, or (with -ioFileList) files whose lines
  // name whitespace-separated input/output pairs.
  ArrayList<String> inputFileList = new ArrayList<String>();
  ArrayList<String> outputFileList = null;
  if (inputOutputFileList) {
    outputFileList = new ArrayList<String>();
    for (int j = i; j < args.length; j++) {
      BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[j]), charset));
      for (String inLine; (inLine = r.readLine()) != null;) {
        String[] fields = inLine.split("\\s+");
        inputFileList.add(fields[0]);
        if (fields.length > 1) {
          outputFileList.add(fields[1]);
        } else {
          // No explicit output name: derive one from the input filename.
          outputFileList.add(fields[0] + ".tok");
        }
      }
      r.close();
    }
  } else {
    inputFileList.addAll(Arrays.asList(args).subList(i, args.length));
  }
  if (untok) {
    untok(inputFileList, outputFileList, charset);
  } else {
    tok(inputFileList, outputFileList, charset, parseInsideBegin, parseInsideEnd, optionsSB.toString(), preserveLines, dump);
  }
} // end main
} // end HunTokenizer