package edu.stanford.nlp.process;
// Stanford English Tokenizer -- a deterministic, fast high-quality tokenizer
// Copyright (c) 2002-2009 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
// java-nlp-support@lists.stanford.edu
// http://nlp.stanford.edu/software/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
/**
* Fast, rule-based tokenizer implementation, initially written to conform to the Penn Treebank tokenization conventions, but now providing
* a range of tokenization options over a broader space of Unicode text. It reads raw text and outputs tokens of classes that implement
* edu.stanford.nlp.trees.HasWord (typically a Word or a CoreLabel). It can optionally return carriage returns as tokens.
* <p>
* New code is encouraged to use the {@link #HunTokenizer(Reader,LexedTokenFactory,String)} constructor. The other constructors are
* historical. You specify the type of result tokens with a LexedTokenFactory, and can specify the treatment of tokens by boolean options
* given in a comma separated String options (e.g., "invertible,normalizeParentheses=true"). If the String is <code>null</code> or empty,
* you get the traditional Hun3 normalization behaviour (i.e., you get Hun3Escaping=false). If you want no normalization, then you should
* pass in the String "Hun3Escaping=false". The known option names are:
* <ol>
* <li>invertible: Store enough information about the original form of the token and the whitespace around it that a list of tokens can be
* faithfully converted back to the original String. Valid only if the LexedTokenFactory is an instance of CoreLabelTokenFactory. The keys
* used in it are TextAnnotation for the tokenized form, CurrentAnnotation for the original string, BeforeAnnotation and AfterAnnotation for
* the whitespace before and after a token, and perhaps BeginPositionAnnotation and EndPositionAnnotation to record token begin/after end
* offsets, if they were specified to be recorded in TokenFactory construction. (Like the String class, begin and end are done so end -
* begin gives the token length.)
* <li>tokenizeNLs: Whether end-of-lines should become tokens (or just be treated as part of whitespace)
* <li>Hun3Escaping: Enable all traditional Hun3 token transforms (like -LRB-, -RRB-). This is a macro flag that sets or clears all the
* options below.
* <li>americanize: Whether to rewrite common British English spellings as American English spellings
* <li>normalizeSpace: Whether any spaces in tokens (phone numbers, fractions) get turned into U+00A0 (non-breaking space). It's dangerous to
* turn this off for most of our Stanford NLP software, which assumes no spaces in tokens.
* <li>normalizeAmpersandEntity: Whether to map the XML & to an ampersand
* <li>normalizeCurrency: Whether to do some awful lossy currency mappings to turn common currency characters into $, #, or "cents",
* reflecting the fact that nothing else appears in the old Hun3 WSJ. (No Euro!)
* <li>normalizeFractions: Whether to map certain common composed fraction characters to spelled out letter forms like "1/2"
* <li>normalizeParentheses: Whether to map round parentheses to -LRB-, -RRB-, as in the Penn Treebank
* <li>normalizeOtherBrackets: Whether to map other common bracket characters to -LCB-, -LRB-, -RCB-, -RRB-, roughly as in the Penn Treebank
* <li>asciiQuotes Whether to map quote characters to the traditional ' and "
* <li>latexQuotes: Whether to map to ``, `, ', '' for quotes, as in Latex and the Hun3 WSJ (though this is now heavily frowned on in
* Unicode). If true, this takes precedence over the setting of unicodeQuotes; if both are false, no mapping is done.
* <li>unicodeQuotes: Whether to map quotes to the range U+2018 to U+201D, the preferred unicode encoding of single and double quotes.
* <li>Hun3Ellipsis: Whether to map ellipses to ..., the old Hun3 WSJ coding of an ellipsis. If true, this takes precedence over the setting
* of unicodeEllipsis; if both are false, no mapping is done.
* <li>unicodeEllipsis: Whether to map dot and optional space sequences to U+2026, the Unicode ellipsis character
* <li>Hun3Dashes: Whether to turn various dash characters into "--", the dominant encoding of dashes in the Hun3 WSJ
* <li>escapeForwardSlashAsterisk: Whether to put a backslash escape in front of / and * as the old Hun3 WSJ does for some reason (something
* to do with Lisp readers??).
* <li>untokenizable: What to do with untokenizable characters (ones not known to the tokenizer). Six options combining whether to log a
* warning for none, the first, or all, and whether to delete them or to include them as single character tokens in the output: noneDelete,
* firstDelete, allDelete, noneKeep, firstKeep, allKeep. The default is "firstDelete".
* </ol>
*
* @author Tim Grow (his tokenizer is a Java implementation of Professor Chris Manning's Flex tokenizer, pgtt-treebank.l)
* @author Teg Grenager (grenager@stanford.edu)
* @author Jenny Finkel (integrating in invertible Hun tokenizer)
* @author Christopher Manning (redid API, added many options, maintenance)
*/
public class HunTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
// The underlying JFlex-generated lexer that does the actual tokenization work.
// Set once in every constructor and read by getNext().
private HunPTBLexer lexer;
/**
* Constructs a new HunTokenizer that returns Word tokens and which treats carriage returns as normal whitespace.
*
* @param r
* The Reader whose contents will be tokenized
* @return A HunTokenizer that tokenizes a stream to objects of type {@link Word}
*/
public static HunTokenizer<Word> newHunTokenizer(Reader r) {
  // Delegate to the two-argument factory with newline tokenization disabled.
  final boolean tokenizeNLs = false;
  return newHunTokenizer(r, tokenizeNLs);
}
/**
* Constructs a new HunTokenizer that optionally returns newlines as their own token. NLs come back as Words whose text is the value of
* <code>HunLexer.NEWLINE_TOKEN</code>.
*
* @param r
* The Reader to read tokens from
* @param tokenizeNLs
* Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace)
* @return A HunTokenizer which returns Word tokens
*/
public static HunTokenizer<Word> newHunTokenizer(Reader r, boolean tokenizeNLs) {
  // Word tokens, not invertible, with the traditional Hun3 normalizations enabled.
  final WordTokenFactory wordFactory = new WordTokenFactory();
  return new HunTokenizer<Word>(r, tokenizeNLs, false, false, wordFactory);
}
/**
* Constructs a new HunTokenizer that makes CoreLabel tokens. It optionally returns carriage returns as their own token. CRs come back as
* Words whose text is the value of <code>HunLexer.NEWLINE_TOKEN</code>.
*
* @param r
* The Reader to read tokens from
* @param tokenizeNLs
* Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace)
* @param invertible
* if set to true, then will produce CoreLabels which will have fields for the string before and after, and the character offsets
* @return A HunTokenizer which returns CoreLabel objects
*/
public static HunTokenizer<CoreLabel> newHunTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
  // CoreLabel tokens with the traditional Hun3 normalizations enabled.
  final CoreLabelTokenFactory coreLabelFactory = new CoreLabelTokenFactory();
  return new HunTokenizer<CoreLabel>(r, tokenizeNLs, invertible, false, coreLabelFactory);
}
/**
* Constructs a new HunTokenizer that optionally returns carriage returns as their own token, and has a custom LexedTokenFactory. If asked
* for, CRs come back as Words whose text is the value of <code>HunLexer.cr</code>. This constructor translates between the traditional
* boolean options of HunTokenizer and the new options String.
*
* @param r
* The Reader to read tokens from
* @param tokenizeNLs
* Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace)
* @param invertible
* if set to true, then will produce CoreLabels which will have fields for the string before and after, and the character offsets
* @param suppressEscaping
* If true, all the traditional Penn Treebank normalizations are turned off. Otherwise, they all happen.
* @param tokenFactory
* The LexedTokenFactory to use to create tokens from the text.
*/
private HunTokenizer(final Reader r, final boolean tokenizeNLs, final boolean invertible, final boolean suppressEscaping,
final LexedTokenFactory<T> tokenFactory) {
StringBuilder options = new StringBuilder();
if (suppressEscaping) {
options.append("Hun3Escaping=false");
} else {
options.append("Hun3Escaping=true"); // i.e., turn on all the historical Hun normalizations
}
if (tokenizeNLs) {
options.append(",tokenizeNLs");
}
if (invertible) {
options.append(",invertible");
}
lexer = new HunPTBLexer(r, tokenFactory, options.toString());
}
/**
* Constructs a new HunTokenizer with a custom LexedTokenFactory. Many options for tokenization and what is returned can be set via the
* options String. See the class documentation for details on the options String. This is the new recommended constructor!
*
* @param r
* The Reader to read tokens from.
* @param tokenFactory
* The LexedTokenFactory to use to create tokens from the text.
* @param options
* Options to the lexer. See the extensive documentation in the class javadoc. The String may be null or empty, which means that
* all traditional Hun normalizations are done. You can pass in "Hun3Escaping=false" and have no normalizations done (that is,
* the behavior of the old suppressEscaping=true option).
*/
public HunTokenizer(final Reader r, final LexedTokenFactory<T> tokenFactory, final String options) {
// A null or empty options String means the traditional Hun3 normalizations are applied
// (see the class javadoc for the full list of option names).
lexer = new HunPTBLexer(r, tokenFactory, options);
}
/**
* Internally fetches the next token.
*
* @return the next token in the token stream, or null if none exists.
*/
@Override
@SuppressWarnings("unchecked")
protected T getNext() {
  // Fetch the next token from the lexer. Any failure while lexing (typically an
  // IOException from the underlying Reader) is deliberately treated as
  // end-of-stream rather than propagated: we clear the AbstractTokenizer
  // lookahead (nextToken) so no stale token is handed out, and return null.
  // (Removed long-dead commented-out CR-filtering code: the lexer itself decides
  // whether to return newline tokens based on the tokenizeNLs option.)
  try {
    return (T) lexer.next();
  } catch (Exception e) {
    nextToken = null;
    return null;
  }
}
/**
* Returns a presentable version of the given Hun-tokenized text. Hun tokenization splits up punctuation and does various other things
* that makes simply joining the tokens with spaces look bad. So join the tokens with space and run it through this method to produce nice
* looking text. It's not perfect, but it works pretty well.
*
* @param HunText
* A String in Hun3-escaped form
* @return An approximation to the original String
*/
public static String Hun2Text(String HunText) {
  // The input length is probably an overestimate of the output size, but it is a cheap bound.
  StringBuilder out = new StringBuilder(HunText.length());
  PTB2TextLexer detokenizer = new PTB2TextLexer(new StringReader(HunText));
  try {
    String piece = detokenizer.next();
    while (piece != null) {
      out.append(piece);
      piece = detokenizer.next();
    }
  } catch (IOException e) {
    // Reading from a StringReader should not fail in practice; report and return what we have.
    e.printStackTrace();
  }
  return out.toString();
}
/**
* Returns a presentable version of a given Hun token. For instance, it transforms -LRB- into (.
*/
public static String HunToken2Text(String HunText) {
  // Surround the token with spaces so the detokenizer sees it as a complete,
  // standalone token, then strip the padding from the result.
  String padded = ' ' + HunText + ' ';
  return Hun2Text(padded).trim();
}
/**
* Writes a presentable version of the given Hun-tokenized text. Hun tokenization splits up punctuation and does various other things that
* makes simply joining the tokens with spaces look bad. So join the tokens with space and run it through this method to produce nice
* looking text. It's not perfect, but it works pretty well.
*/
public static int Hun2Text(Reader HunText, Writer w) throws IOException {
  PTB2TextLexer detokenizer = new PTB2TextLexer(HunText);
  int count = 0;
  // Stream pieces straight through to the Writer, counting as we go.
  String piece = detokenizer.next();
  while (piece != null) {
    count++;
    w.write(piece);
    piece = detokenizer.next();
  }
  return count;
}
/**
 * Untokenizes each input file (or stdin if the list is empty) and reports throughput.
 * Output goes to the parallel outputFileList entry, or to stdout if outputFileList is null.
 */
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
  Timing t = new Timing();
  int numTokens = 0;
  int sz = inputFileList.size();
  if (sz == 0) {
    // No files given: untokenize stdin to stdout.
    Reader r = new InputStreamReader(System.in, charset);
    PrintWriter out = new PrintWriter(System.out, true);
    numTokens = Hun2Text(r, out);
    out.close();
  } else {
    for (int j = 0; j < sz; j++) {
      Reader r = IOUtils.readerFromString(inputFileList.get(j), charset);
      PrintWriter out;
      if (outputFileList == null) {
        out = new PrintWriter(System.out, true);
      } else {
        out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
      }
      numTokens += Hun2Text(r, out);
      // Bug fixes (mirroring tok()): the input Reader was previously leaked, and the
      // writer was closed unconditionally — which closed the System.out wrapper after
      // the first file and silenced the output of every subsequent file.
      r.close();
      if (outputFileList != null) {
        out.close();
      }
    }
  }
  long millis = t.stop();
  double wordspersec = numTokens / (((double) millis) / 1000);
  NumberFormat nf = new DecimalFormat("0.00");
  System.err.println("HunTokenizer untokenized " + numTokens + " tokens at " + nf.format(wordspersec) + " tokens per second.");
}
/**
* Returns a presentable version of the given Hun-tokenized words. Pass in a List of Strings and this method will join the words with
* spaces and call {@link #Hun2Text(String)} on the output.
*
* @param HunWords
* A list of String
* @return A presentable version of the given Hun-tokenized words
*/
public static String Hun2Text(List<String> HunWords) {
// Join the words with spaces (per the contract documented above) and detokenize the result.
return Hun2Text(StringUtils.join(HunWords));
}
/**
* Returns a presentable version of the given Hun-tokenized words. Pass in a List of Words or a Document and this method will join the
* words with spaces and call {@link #Hun2Text(String)} on the output. This method will take the word() values to prevent additional text
* from creeping in (e.g., POS tags).
*
* @param HunWords
* A list of HasWord objects
* @return A presentable version of the given Hun-tokenized words
*/
public static String labelList2Text(List<? extends HasWord> HunWords) {
  // Extract only word() from each label so extra annotations (e.g., POS tags)
  // cannot leak into the detokenized text.
  List<String> plainWords = new ArrayList<String>(HunWords.size());
  for (HasWord label : HunWords) {
    plainWords.add(label.word());
  }
  return Hun2Text(plainWords);
}
/**
 * Tokenizes each input file (or stdin if the list is empty) and reports throughput.
 * Output goes to the parallel outputFileList entry, or to stdout if outputFileList is null.
 */
private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd,
    String options, boolean preserveLines, boolean dump) throws IOException {
  Timing timer = new Timing();
  int numTokens = 0;
  int numFiles = inputFileList.size();
  if (numFiles == 0) {
    // No files given: tokenize stdin to stdout.
    Reader stdin = new InputStreamReader(System.in, charset);
    PrintWriter stdout = new PrintWriter(System.out, true);
    numTokens += tokReader(stdin, stdout, parseInsideBegin, parseInsideEnd, options, preserveLines, dump);
  } else {
    for (int j = 0; j < numFiles; j++) {
      Reader in = IOUtils.readerFromString(inputFileList.get(j), charset);
      PrintWriter out = (outputFileList == null)
          ? new PrintWriter(System.out, true)
          : new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
      numTokens += tokReader(in, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump);
      in.close();
      // Only close per-file writers; never close the System.out wrapper mid-run.
      if (outputFileList != null) {
        out.close();
      }
    }
  }
  long millis = timer.stop();
  double tokensPerSecond = numTokens / (((double) millis) / 1000);
  NumberFormat nf = new DecimalFormat("0.00");
  System.err.println("HunTokenizer tokenized " + numTokens + " tokens at " + nf.format(tokensPerSecond) + " tokens per second.");
}
/**
 * Tokenizes one Reader to the given PrintWriter, returning the number of tokens seen
 * (tags matched by parseInsideBegin/End and suppressed tokens are still counted).
 */
private static int tokReader(Reader r, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump) {
  int numTokens = 0;
  HunTokenizer<CoreLabel> tokenizer = new HunTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options);
  // With no begin pattern we print everything; otherwise stay silent until a begin tag appears.
  boolean printing = (parseInsideBegin == null);
  boolean beginLine = true;
  while (tokenizer.hasNext()) {
    CoreLabel label = tokenizer.next();
    numTokens++;
    String text = label.word();
    if (parseInsideBegin != null && parseInsideBegin.matcher(text).matches()) {
      printing = true;
      continue;
    }
    if (parseInsideEnd != null && parseInsideEnd.matcher(text).matches()) {
      printing = false;
      continue;
    }
    if (!printing) {
      continue;
    }
    if (dump) {
      // After the tag checks, switch to the exhaustive CoreLabel representation.
      text = label.toString();
    }
    if (!preserveLines) {
      // Default mode: one token per line.
      out.println(text);
    } else if (HunPTBLexer.NEWLINE_TOKEN.equals(text)) {
      // A newline token ends the current output line.
      beginLine = true;
      out.println();
    } else {
      // Space-separate tokens within a line.
      if (beginLine) {
        beginLine = false;
      } else {
        out.print(" ");
      }
      out.print(text);
    }
  }
  return numTokens;
}
/** Returns a TokenizerFactory that vends tokenizers producing {@link Word} tokens with default options. */
public static TokenizerFactory<Word> factory() {
return HunTokenizerFactory.newTokenizerFactory();
}
/**
* Returns a TokenizerFactory producing tokens built by the given LexedTokenFactory.
*
* @param tokenizeNLs Whether newlines are returned as their own tokens
* @param factory The LexedTokenFactory that builds each token
* @return A TokenizerFactory producing tokens of type T
*/
public static <T extends HasWord> TokenizerFactory<T> factory(boolean tokenizeNLs, LexedTokenFactory<T> factory) {
return new HunTokenizerFactory<T>(tokenizeNLs, false, false, factory);
}
/**
* Returns a TokenizerFactory producing CoreLabel tokens.
*
* @param tokenizeNLs Whether newlines are returned as their own tokens
* @param invertible Whether tokens record original text and surrounding whitespace
* @return A TokenizerFactory producing CoreLabel tokens
*/
public static TokenizerFactory<CoreLabel> factory(boolean tokenizeNLs, boolean invertible) {
return HunTokenizerFactory.newHunTokenizerFactory(tokenizeNLs, invertible);
}
/**
* Get a TokenizerFactory that does Penn Treebank tokenization. This is now the recommended factory method to use.
*
* @param factory
* A TokenFactory that determines what form of token is returned by the Tokenizer
* @param options
* A String specifying options (see the class javadoc for details)
* @param <T>
* The type of the tokens built by the LexedTokenFactory
* @return A TokenizerFactory that does Penn Treebank tokenization
*/
public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) {
// The options String is interpreted by the lexer; see the class javadoc for the known names.
return new HunTokenizerFactory<T>(factory, options);
}
/**
* This class provides a factory which will vend instances of HunTokenizer which wrap a provided Reader. See the documentation for
* {@link HunTokenizer} for details of the parameters and options.
*
* @see HunTokenizer
* @param <T>
* The class of the returned tokens
*/
public static class HunTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {
// Builds each token object (e.g., Word or CoreLabel) from the lexer's output.
protected LexedTokenFactory<T> factory;
// Comma-separated lexer options; may later be replaced via setOptions(String).
protected String options;
/**
* Constructs a new TokenizerFactory that returns Word objects and treats carriage returns as normal whitespace. THIS METHOD IS INVOKED
* BY REFLECTION BY SOME OF THE JAVANLP CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A TokenizerFactory.
*
* @return A TokenizerFactory that returns Word objects
*/
public static TokenizerFactory<Word> newTokenizerFactory() {
// An empty options String means the traditional Hun3 normalizations are applied.
return newHunTokenizerFactory(new WordTokenFactory(), "");
}
/**
* Constructs a new HunTokenizer that optionally returns carriage returns as their own token.
*
* @param tokenizeNLs
* If true, newlines come back as Words whose text is the value of <code>HunLexer.NEWLINE_TOKEN</code> .
* @return A TokenizerFactory that returns Word objects
*/
public static HunTokenizerFactory<Word> newHunTokenizerFactory(boolean tokenizeNLs) {
// Word tokens; not invertible; traditional Hun3 escaping enabled.
return new HunTokenizerFactory<Word>(tokenizeNLs, false, false, new WordTokenFactory());
}
/**
* Constructs a new HunTokenizer that returns Word objects and uses the options passed in.
*
* @param options
* A String of options
* @return A TokenizerFactory that returns Word objects
*/
public static HunTokenizerFactory<Word> newWordTokenizerFactory(String options) {
// Word tokens with the caller-supplied options String passed straight to the lexer.
return new HunTokenizerFactory<Word>(new WordTokenFactory(), options);
}
/**
* Constructs a new HunTokenizer that returns CoreLabel objects and uses the options passed in.
*
* @param options
* A String of options
* @return A TokenizerFactory that returns CoreLabel objects o
*/
public static HunTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
// CoreLabel tokens with the caller-supplied options String passed straight to the lexer.
return new HunTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), options);
}
/**
* Constructs a new HunTokenizer that uses the LexedTokenFactory and options passed in.
*
* @param tokenFactory
* The LexedTokenFactory
* @param options
* A String of options
* @return A TokenizerFactory that returns objects of the type of the LexedTokenFactory
*/
public static <T extends HasWord> HunTokenizerFactory<T> newHunTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
// Preferred general-purpose factory method: any token type, explicit options String.
return new HunTokenizerFactory<T>(tokenFactory, options);
}
/**
* Constructs a factory producing CoreLabel tokens, translating the historical boolean flags.
*
* @param tokenizeNLs Whether newlines are returned as their own tokens
* @param invertible Whether tokens record original text and surrounding whitespace
* @return A HunTokenizerFactory producing CoreLabel tokens
*/
public static HunTokenizerFactory<CoreLabel> newHunTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
return new HunTokenizerFactory<CoreLabel>(tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
}
// Constructors
/** Translates the historical boolean flags into the lexer's options String. */
private HunTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
  this.factory = factory;
  final StringBuilder optionBuilder = new StringBuilder();
  // Hun3Escaping is a macro flag: it turns all traditional Hun normalizations on or off at once.
  optionBuilder.append(suppressEscaping ? "Hun3Escaping=false" : "Hun3Escaping=true");
  if (tokenizeNLs) {
    optionBuilder.append(",tokenizeNLs");
  }
  if (invertible) {
    optionBuilder.append(",invertible");
  }
  this.options = optionBuilder.toString();
}
/** Stores the token factory and the raw options String; no translation is performed. */
private HunTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
this.factory = tokenFactory;
this.options = options;
}
/** Returns a tokenizer wrapping the given Reader. */
public Iterator<T> getIterator(Reader r) {
// A Tokenizer is itself an Iterator, so simply delegate.
return getTokenizer(r);
}
/** Returns a tokenizer wrapping the given Reader, configured with this factory's options. */
public Tokenizer<T> getTokenizer(Reader r) {
return new HunTokenizer<T>(r, factory, options);
}
@Override
public void setOptions(String options) {
// NOTE(review): this replaces (rather than appends to) any options given at
// construction time — confirm callers expect full replacement.
this.options = options;
}
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
  // Bug fix: this previously instantiated PTBTokenizer, a class name left over
  // from a rename that does not exist in this source; it must build HunTokenizer.
  if (options == null || options.equals("")) {
    // No base options: the extra options stand alone.
    return new HunTokenizer<T>(r, factory, extraOptions);
  } else {
    // Append the extra options after the factory's base options.
    return new HunTokenizer<T>(r, factory, options + "," + extraOptions);
  }
}
} // end static class HunTokenizerFactory
/**
* Reads files named as arguments and print their tokens, by default as one per line. This is useful either for testing or to run
* standalone to turn a corpus into a one-token-per-line file of tokens. This main method assumes that the input file is in utf-8
* encoding, unless it is specified.
* <p/>
* Usage: <code>
* java edu.stanford.nlp.process.HunTokenizer [options] filename+
* </code>
* <p/>
* Options:
* <ul>
* <li>-options options Set various tokenization options (see the documentation in the class javadoc)
* <li>-preserveLines Produce space-separated tokens, except when the original had a line break, not one-token-per-line
* <li>-charset charset Specifies a character encoding
* <li>-parseInside regex Names an XML-style tag or a regular expression over such elements. The tokenizer will only tokenize inside
* element that match this name. (This is done by regex matching, not an XML parser, but works well for simple XML documents, or other
* SGML-style documents, such as Linguistic Data Consortium releases, which adopt the convention that a line of a file is either XML
* markup or character data but never both.)
* <li>-ioFileList file* The remaining command-line arguments are treated as filenames that themselves contain lists of pairs of
* input-output filenames (2 column, whitespace separated).
* <li>-dump Print the whole of each CoreLabel, not just the value (word)
* <li>-untok Heuristically untokenize tokenized text
* <li>-h Print usage info
* </ul>
*
* @param args
* Command line arguments
* @throws IOException
* If any file I/O problem
*/
public static void main(String[] args) throws IOException {
  int i = 0;
  String charset = "utf-8";
  Pattern parseInsideBegin = null;
  Pattern parseInsideEnd = null;
  StringBuilder optionsSB = new StringBuilder();
  boolean preserveLines = false;
  boolean inputOutputFileList = false;
  boolean dump = false;
  boolean untok = false;
  // Process leading flag arguments (everything starting with '-').
  while (i < args.length && args[i].charAt(0) == '-') {
    if ("-options".equals(args[i]) && i < args.length - 1) {
      // Bounds check added: previously "-options" as the final argument threw an
      // ArrayIndexOutOfBoundsException (cf. the guards on -charset and -parseInside).
      i++;
      optionsSB.append(',');
      optionsSB.append(args[i]);
    } else if ("-preserveLines".equals(args[i])) {
      // preserveLines needs the lexer to emit newline tokens.
      optionsSB.append(",tokenizeNLs");
      preserveLines = true;
    } else if ("-dump".equals(args[i])) {
      dump = true;
    } else if ("-ioFileList".equals(args[i])) {
      inputOutputFileList = true;
    } else if ("-charset".equals(args[i]) && i < args.length - 1) {
      i++;
      charset = args[i];
    } else if ("-parseInside".equals(args[i]) && i < args.length - 1) {
      i++;
      try {
        // Build begin/end tag patterns from the element-name regex.
        parseInsideBegin = Pattern.compile("<(?:" + args[i] + ")[^>]*?>");
        parseInsideEnd = Pattern.compile("</(?:" + args[i] + ")[^>]*?>");
      } catch (Exception e) {
        // Bad element regex: fall back to tokenizing the whole input.
        parseInsideBegin = null;
        parseInsideEnd = null;
      }
    } else if ("-untok".equals(args[i])) {
      untok = true;
    } else if ("-h".equals(args[i]) || "-help".equals(args[i]) || "--help".equals(args[i])) {
      System.err.println("usage: java edu.stanford.nlp.process.HunTokenizer [options]* filename*");
      System.err.println(" options: -preserveLines|-dump|-ioFileList|-charset|-parseInside elementRegex|-options options|-h");
      return; // exit if they asked for help in options
    } else {
      System.err.println("Unknown option: " + args[i]);
    }
    i++;
  }
  // Remaining arguments are input files, or (with -ioFileList) files whose lines
  // name whitespace-separated input/output pairs.
  ArrayList<String> inputFileList = new ArrayList<String>();
  ArrayList<String> outputFileList = null;
  if (inputOutputFileList) {
    outputFileList = new ArrayList<String>();
    for (int j = i; j < args.length; j++) {
      BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[j]), charset));
      for (String inLine; (inLine = r.readLine()) != null;) {
        String[] fields = inLine.split("\\s+");
        inputFileList.add(fields[0]);
        if (fields.length > 1) {
          outputFileList.add(fields[1]);
        } else {
          // No explicit output name: derive one from the input filename.
          outputFileList.add(fields[0] + ".tok");
        }
      }
      r.close();
    }
  } else {
    inputFileList.addAll(Arrays.asList(args).subList(i, args.length));
  }
  if (untok) {
    untok(inputFileList, outputFileList, charset);
  } else {
    tok(inputFileList, outputFileList, charset, parseInsideBegin, parseInsideEnd, optionsSB.toString(), preserveLines, dump);
  }
} // end main
} // end HunTokenizer