package edu.stanford.nlp.process; // Stanford English Tokenizer -- a deterministic, fast high-quality tokenizer // Copyright (c) 2002-2009 The Board of Trustees of // The Leland Stanford Junior University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // For more information, bug reports, fixes, contact: // Christopher Manning // Dept of Computer Science, Gates 1A // Stanford CA 94305-9010 // USA // java-nlp-support@lists.stanford.edu // http://nlp.stanford.edu/software/ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import java.io.Writer; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.Timing; /** * Fast, rule-based tokenizer implementation, initially written to 
 * conform to the Penn Treebank tokenization conventions, but now providing
 * a range of tokenization options over a broader space of Unicode text. It reads raw text and outputs tokens of classes that implement
 * edu.stanford.nlp.trees.HasWord (typically a Word or a CoreLabel). It can optionally return carriage returns as tokens.
 * <p>
 * New code is encouraged to use the {@link #HunTokenizer(Reader,LexedTokenFactory,String)} constructor. The other constructors are
 * historical. You specify the type of result tokens with a LexedTokenFactory, and can specify the treatment of tokens by boolean options
 * given in a comma separated String options (e.g., "invertible,normalizeParentheses=true"). If the String is <code>null</code> or empty,
 * you get the traditional Hun3 normalization behaviour (i.e., you get Hun3Escaping=true). If you want no normalization, then you should
 * pass in the String "Hun3Escaping=false". The known option names are:
 * <ol>
 * <li>invertible: Store enough information about the original form of the token and the whitespace around it that a list of tokens can be
 * faithfully converted back to the original String. Valid only if the LexedTokenFactory is an instance of CoreLabelTokenFactory. The keys
 * used in it are TextAnnotation for the tokenized form, CurrentAnnotation for the original string, BeforeAnnotation and AfterAnnotation for
 * the whitespace before and after a token, and perhaps BeginPositionAnnotation and EndPositionAnnotation to record token begin/after end
 * offsets, if they were specified to be recorded in TokenFactory construction. (Like the String class, begin and end are done so end -
 * begin gives the token length.)
 * <li>tokenizeNLs: Whether end-of-lines should become tokens (or just be treated as part of whitespace)
 * <li>Hun3Escaping: Enable all traditional Hun3 token transforms (like -LRB-, -RRB-). This is a macro flag that sets or clears all the
 * options below.
 * <li>americanize: Whether to rewrite common British English spellings as American English spellings
 * <li>normalizeSpace: Whether any spaces in tokens (phone numbers, fractions) get turned into U+00A0 (non-breaking space). It's dangerous
 * to turn this off for most of our Stanford NLP software, which assumes no spaces in tokens.
 * <li>normalizeAmpersandEntity: Whether to map the XML &amp; to an ampersand
 * <li>normalizeCurrency: Whether to do some awful lossy currency mappings to turn common currency characters into $, #, or "cents",
 * reflecting the fact that nothing else appears in the old Hun3 WSJ. (No Euro!)
 * <li>normalizeFractions: Whether to map certain common composed fraction characters to spelled out letter forms like "1/2"
 * <li>normalizeParentheses: Whether to map round parentheses to -LRB-, -RRB-, as in the Penn Treebank
 * <li>normalizeOtherBrackets: Whether to map other common bracket characters to -LCB-, -LRB-, -RCB-, -RRB-, roughly as in the Penn Treebank
 * <li>asciiQuotes: Whether to map quote characters to the traditional ' and "
 * <li>latexQuotes: Whether to map to ``, `, ', '' for quotes, as in Latex and the Hun3 WSJ (though this is now heavily frowned on in
 * Unicode). If true, this takes precedence over the setting of unicodeQuotes; if both are false, no mapping is done.
 * <li>unicodeQuotes: Whether to map quotes to the range U+2018 to U+201D, the preferred unicode encoding of single and double quotes.
 * <li>Hun3Ellipsis: Whether to map ellipses to ..., the old Hun3 WSJ coding of an ellipsis. If true, this takes precedence over the setting
 * of unicodeEllipsis; if both are false, no mapping is done.
* <li>unicodeEllipsis: Whether to map dot and optional space sequences to U+2026, the Unicode ellipsis character * <li>Hun3Dashes: Whether to turn various dash characters into "--", the dominant encoding of dashes in the Hun3 WSJ * <li>escapeForwardSlashAsterisk: Whether to put a backslash escape in front of / and * as the old Hun3 WSJ does for some reason (something * to do with Lisp readers??). * <li>untokenizable: What to do with untokenizable characters (ones not known to the tokenizers. Six options combining whether to log a * warning for none, the first, or all, and whether to delete them or to include them as single character tokens in the output: noneDelete, * firstDelete, allDelete, noneKeep, firstKeep, allKeep. The default is "firstDelete". * </ol> * * @author Tim Grow (his tokenizer is a Java implementation of Professor Chris Manning's Flex tokenizer, pgtt-treebank.l) * @author Teg Grenager (grenager@stanford.edu) * @author Jenny Finkel (integrating in invertible Hun tokenizer) * @author Christopher Manning (redid API, added many options, maintenance) */ public class HunTokenizer<T extends HasWord> extends AbstractTokenizer<T> { // the underlying lexer private HunPTBLexer lexer; /** * Constructs a new HunTokenizer that returns Word tokens and which treats carriage returns as normal whitespace. * * @param r * The Reader whose contents will be tokenized * @return A HunTokenizer that tokenizes a stream to objects of type {@link Word} */ public static HunTokenizer<Word> newHunTokenizer(Reader r) { return newHunTokenizer(r, false); } /** * Constructs a new HunTokenizer that optionally returns newlines as their own token. NLs come back as Words whose text is the value of * <code>HunLexer.NEWLINE_TOKEN</code>. 
* * @param r * The Reader to read tokens from * @param tokenizeNLs * Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace) * @return A HunTokenizer which returns Word tokens */ public static HunTokenizer<Word> newHunTokenizer(Reader r, boolean tokenizeNLs) { return new HunTokenizer<Word>(r, tokenizeNLs, false, false, new WordTokenFactory()); } /** * Constructs a new HunTokenizer that makes CoreLabel tokens. It optionally returns carriage returns as their own token. CRs come back as * Words whose text is the value of <code>HunLexer.NEWLINE_TOKEN</code>. * * @param r * The Reader to read tokens from * @param tokenizeNLs * Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace) * @param invertible * if set to true, then will produce CoreLabels which will have fields for the string before and after, and the character offsets * @return A HunTokenizer which returns CoreLabel objects */ public static HunTokenizer<CoreLabel> newHunTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) { return new HunTokenizer<CoreLabel>(r, tokenizeNLs, invertible, false, new CoreLabelTokenFactory()); } /** * Constructs a new HunTokenizer that optionally returns carriage returns as their own token, and has a custom LexedTokenFactory. If asked * for, CRs come back as Words whose text is the value of <code>HunLexer.cr</code>. This constructor translates between the traditional * boolean options of HunTokenizer and the new options String. * * @param r * The Reader to read tokens from * @param tokenizeNLs * Whether to return newlines as separate tokens (otherwise they normally disappear as whitespace) * @param invertible * if set to true, then will produce CoreLabels which will have fields for the string before and after, and the character offsets * @param suppressEscaping * If true, all the traditional Penn Treebank normalizations are turned off. Otherwise, they all happen. 
* @param tokenFactory * The LexedTokenFactory to use to create tokens from the text. */ private HunTokenizer(final Reader r, final boolean tokenizeNLs, final boolean invertible, final boolean suppressEscaping, final LexedTokenFactory<T> tokenFactory) { StringBuilder options = new StringBuilder(); if (suppressEscaping) { options.append("Hun3Escaping=false"); } else { options.append("Hun3Escaping=true"); // i.e., turn on all the historical Hun normalizations } if (tokenizeNLs) { options.append(",tokenizeNLs"); } if (invertible) { options.append(",invertible"); } lexer = new HunPTBLexer(r, tokenFactory, options.toString()); } /** * Constructs a new HunTokenizer with a custom LexedTokenFactory. Many options for tokenization and what is returned can be set via the * options String. See the class documentation for details on the options String. This is the new recommended constructor! * * @param r * The Reader to read tokens from. * @param tokenFactory * The LexedTokenFactory to use to create tokens from the text. * @param options * Options to the lexer. See the extensive documentation in the class javadoc. The String may be null or empty, which means that * all traditional Hun normalizations are done. You can pass in "Hun3Escaping=false" and have no normalizations done (that is, * the behavior of the old suppressEscaping=true option). */ public HunTokenizer(final Reader r, final LexedTokenFactory<T> tokenFactory, final String options) { lexer = new HunPTBLexer(r, tokenFactory, options); } /** * Internally fetches the next token. * * @return the next token in the token stream, or null if none exists. */ @Override @SuppressWarnings("unchecked") protected T getNext() { // if (lexer == null) { // return null; // } T token = null; try { token = (T) lexer.next(); // cdm 2007: this shouldn't be necessary: HunLexer decides for itself whether to return CRs based on the same // flag! 
// get rid of CRs if necessary // while (!tokenizeNLs && HunLexer.cr.equals(((HasWord) token).word())) { // token = (T)lexer.next(); // } } catch (Exception e) { nextToken = null; // do nothing, return null } return token; } /** * Returns a presentable version of the given Hun-tokenized text. Hun tokenization splits up punctuation and does various other things * that makes simply joining the tokens with spaces look bad. So join the tokens with space and run it through this method to produce nice * looking text. It's not perfect, but it works pretty well. * * @param HunText * A String in Hun3-escaped form * @return An approximation to the original String */ public static String Hun2Text(String HunText) { StringBuilder sb = new StringBuilder(HunText.length()); // probably an overestimate PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(HunText)); try { for (String token; (token = lexer.next()) != null;) { sb.append(token); } } catch (IOException e) { e.printStackTrace(); } return sb.toString(); } /** * Returns a presentable version of a given Hun token. For instance, it transforms -LRB- into (. */ public static String HunToken2Text(String HunText) { return Hun2Text(' ' + HunText + ' ').trim(); } /** * Writes a presentable version of the given Hun-tokenized text. Hun tokenization splits up punctuation and does various other things that * makes simply joining the tokens with spaces look bad. So join the tokens with space and run it through this method to produce nice * looking text. It's not perfect, but it works pretty well. 
*/ public static int Hun2Text(Reader HunText, Writer w) throws IOException { int numTokens = 0; PTB2TextLexer lexer = new PTB2TextLexer(HunText); for (String token; (token = lexer.next()) != null;) { numTokens++; w.write(token); } return numTokens; } private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException { Timing t = new Timing(); int numTokens = 0; int sz = inputFileList.size(); if (sz == 0) { Reader r = new InputStreamReader(System.in, charset); PrintWriter out = new PrintWriter(System.out, true); numTokens = Hun2Text(r, out); out.close(); } else { for (int j = 0; j < sz; j++) { Reader r = IOUtils.readerFromString(inputFileList.get(j), charset); PrintWriter out; if (outputFileList == null) { out = new PrintWriter(System.out, true); } else { out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true); } numTokens += Hun2Text(r, out); out.close(); } } long millis = t.stop(); double wordspersec = numTokens / (((double) millis) / 1000); NumberFormat nf = new DecimalFormat("0.00"); // easier way! System.err.println("HunTokenizer untokenized " + numTokens + " tokens at " + nf.format(wordspersec) + " tokens per second."); } /** * Returns a presentable version of the given Hun-tokenized words. Pass in a List of Strings and this method will join the words with * spaces and call {@link #Hun2Text(String)} on the output. * * @param HunWords * A list of String * @return A presentable version of the given Hun-tokenized words */ public static String Hun2Text(List<String> HunWords) { return Hun2Text(StringUtils.join(HunWords)); } /** * Returns a presentable version of the given Hun-tokenized words. Pass in a List of Words or a Document and this method will join the * words with spaces and call {@link #Hun2Text(String)} on the output. This method will take the word() values to prevent additional text * from creeping in (e.g., POS tags). 
* * @param HunWords * A list of HasWord objects * @return A presentable version of the given Hun-tokenized words */ public static String labelList2Text(List<? extends HasWord> HunWords) { List<String> words = new ArrayList<String>(); for (HasWord hw : HunWords) { words.add(hw.word()); } return Hun2Text(words); } private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump) throws IOException { Timing t = new Timing(); int numTokens = 0; int sz = inputFileList.size(); if (sz == 0) { Reader r = new InputStreamReader(System.in, charset); PrintWriter out = new PrintWriter(System.out, true); numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump); } else { for (int j = 0; j < sz; j++) { Reader r = IOUtils.readerFromString(inputFileList.get(j), charset); PrintWriter out; if (outputFileList == null) { out = new PrintWriter(System.out, true); } else { out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true); } numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump); r.close(); if (outputFileList != null) out.close(); } // end for j going through inputFileList } long millis = t.stop(); double wordspersec = numTokens / (((double) millis) / 1000); NumberFormat nf = new DecimalFormat("0.00"); // easier way! 
System.err.println("HunTokenizer tokenized " + numTokens + " tokens at " + nf.format(wordspersec) + " tokens per second."); } private static int tokReader(Reader r, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump) { int numTokens = 0; HunTokenizer<CoreLabel> tokenizer = new HunTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options); boolean printing = parseInsideBegin == null; // start off printing, unless you're looking for a start entity boolean beginLine = true; while (tokenizer.hasNext()) { CoreLabel obj = tokenizer.next(); String str = obj.word(); if (parseInsideBegin != null && parseInsideBegin.matcher(str).matches()) { printing = true; } else if (parseInsideEnd != null && parseInsideEnd.matcher(str).matches()) { printing = false; } else if (printing) { if (dump) { // after having checked for tags, change str to be exhaustive str = obj.toString(); } if (preserveLines) { if (HunPTBLexer.NEWLINE_TOKEN.equals(str)) { beginLine = true; out.println(); } else { if (!beginLine) { out.print(" "); } else { beginLine = false; } out.print(str); } } else { out.println(str); } } numTokens++; } return numTokens; } public static TokenizerFactory<Word> factory() { return HunTokenizerFactory.newTokenizerFactory(); } public static <T extends HasWord> TokenizerFactory<T> factory(boolean tokenizeNLs, LexedTokenFactory<T> factory) { return new HunTokenizerFactory<T>(tokenizeNLs, false, false, factory); } public static TokenizerFactory<CoreLabel> factory(boolean tokenizeNLs, boolean invertible) { return HunTokenizerFactory.newHunTokenizerFactory(tokenizeNLs, invertible); } /** * Get a TokenizerFactory that does Penn Treebank tokenization. This is now the recommended factory method to use. 
* * @param factory * A TokenFactory that determines what form of token is returned by the Tokenizer * @param options * A String specifying options (see the class javadoc for details) * @param <T> * The type of the tokens built by the LexedTokenFactory * @return A TokenizerFactory that does Penn Treebank tokenization */ public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) { return new HunTokenizerFactory<T>(factory, options); } /** * This class provides a factory which will vend instances of HunTokenizer which wrap a provided Reader. See the documentation for * {@link HunTokenizer} for details of the parameters and options. * * @see HunTokenizer * @param <T> * The class of the returned tokens */ public static class HunTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> { protected LexedTokenFactory<T> factory; protected String options; /** * Constructs a new TokenizerFactory that returns Word objects and treats carriage returns as normal whitespace. THIS METHOD IS INVOKED * BY REFLECTION BY SOME OF THE JAVANLP CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A TokenizerFactory. * * @return A TokenizerFactory that returns Word objects */ public static TokenizerFactory<Word> newTokenizerFactory() { return newHunTokenizerFactory(new WordTokenFactory(), ""); } /** * Constructs a new HunTokenizer that optionally returns carriage returns as their own token. * * @param tokenizeNLs * If true, newlines come back as Words whose text is the value of <code>HunLexer.NEWLINE_TOKEN</code> . * @return A TokenizerFactory that returns Word objects */ public static HunTokenizerFactory<Word> newHunTokenizerFactory(boolean tokenizeNLs) { return new HunTokenizerFactory<Word>(tokenizeNLs, false, false, new WordTokenFactory()); } /** * Constructs a new HunTokenizer that returns Word objects and uses the options passed in. 
* * @param options * A String of options * @return A TokenizerFactory that returns Word objects */ public static HunTokenizerFactory<Word> newWordTokenizerFactory(String options) { return new HunTokenizerFactory<Word>(new WordTokenFactory(), options); } /** * Constructs a new HunTokenizer that returns CoreLabel objects and uses the options passed in. * * @param options * A String of options * @return A TokenizerFactory that returns CoreLabel objects o */ public static HunTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) { return new HunTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), options); } /** * Constructs a new HunTokenizer that uses the LexedTokenFactory and options passed in. * * @param tokenFactory * The LexedTokenFactory * @param options * A String of options * @return A TokenizerFactory that returns objects of the type of the LexedTokenFactory */ public static <T extends HasWord> HunTokenizerFactory<T> newHunTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) { return new HunTokenizerFactory<T>(tokenFactory, options); } public static HunTokenizerFactory<CoreLabel> newHunTokenizerFactory(boolean tokenizeNLs, boolean invertible) { return new HunTokenizerFactory<CoreLabel>(tokenizeNLs, invertible, false, new CoreLabelTokenFactory()); } // Constructors private HunTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) { this.factory = factory; StringBuilder optionsSB = new StringBuilder(); if (suppressEscaping) { optionsSB.append("Hun3Escaping=false"); } else { optionsSB.append("Hun3Escaping=true"); // i.e., turn on all the historical Hun normalizations } if (tokenizeNLs) { optionsSB.append(",tokenizeNLs"); } if (invertible) { optionsSB.append(",invertible"); } this.options = optionsSB.toString(); } private HunTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) { this.factory = tokenFactory; this.options = options; } /** Returns a 
tokenizer wrapping the given Reader. */ public Iterator<T> getIterator(Reader r) { return getTokenizer(r); } /** Returns a tokenizer wrapping the given Reader. */ public Tokenizer<T> getTokenizer(Reader r) { return new HunTokenizer<T>(r, factory, options); } @Override public void setOptions(String options) { this.options = options; } @Override public Tokenizer<T> getTokenizer(Reader r, String extraOptions) { if (options == null || options.equals("")) { return new PTBTokenizer<T>(r, factory, extraOptions); } else { return new PTBTokenizer<T>(r, factory, options + "," + extraOptions); } } } // end static class HunTokenizerFactory /** * Reads files named as arguments and print their tokens, by default as one per line. This is useful either for testing or to run * standalone to turn a corpus into a one-token-per-line file of tokens. This main method assumes that the input file is in utf-8 * encoding, unless it is specified. * <p/> * Usage: <code> * java edu.stanford.nlp.process.HunTokenizer [options] filename+ * </code> * <p/> * Options: * <ul> * <li>-options options Set various tokenization options (see the documentation in the class javadoc) * <li>-preserveLines Produce space-separated tokens, except when the original had a line break, not one-token-per-line * <li>-charset charset Specifies a character encoding * <li>-parseInside regex Names an XML-style tag or a regular expression over such elements. The tokenizer will only tokenize inside * element that match this name. (This is done by regex matching, not an XML parser, but works well for simple XML documents, or other * SGML-style documents, such as Linguistic Data Consortium releases, which adopt the convention that a line of a file is either XML * markup or character data but never both.) * <li>-ioFileList file* The remaining command-line arguments are treated as filenames that themselves contain lists of pairs of * input-output filenames (2 column, whitespace separated). 
* <li>-dump Print the whole of each CoreLabel, not just the value (word) * <li>-untok Heuristically untokenize tokenized text * <li>-h Print usage info * </ul> * * @param args * Command line arguments * @throws IOException * If any file I/O problem */ public static void main(String[] args) throws IOException { int i = 0; String charset = "utf-8"; Pattern parseInsideBegin = null; Pattern parseInsideEnd = null; StringBuilder optionsSB = new StringBuilder(); boolean preserveLines = false; boolean inputOutputFileList = false; boolean dump = false; boolean untok = false; while (i < args.length && args[i].charAt(0) == '-') { if ("-options".equals(args[i])) { i++; optionsSB.append(','); optionsSB.append(args[i]); } else if ("-preserveLines".equals(args[i])) { optionsSB.append(",tokenizeNLs"); preserveLines = true; } else if ("-dump".equals(args[i])) { dump = true; } else if ("-ioFileList".equals(args[i])) { inputOutputFileList = true; } else if ("-charset".equals(args[i]) && i < args.length - 1) { i++; charset = args[i]; } else if ("-parseInside".equals(args[i]) && i < args.length - 1) { i++; try { parseInsideBegin = Pattern.compile("<(?:" + args[i] + ")[^>]*?>"); parseInsideEnd = Pattern.compile("</(?:" + args[i] + ")[^>]*?>"); } catch (Exception e) { parseInsideBegin = null; parseInsideEnd = null; } } else if ("-untok".equals(args[i])) { untok = true; } else if ("-h".equals(args[i]) || "-help".equals(args[i]) || "--help".equals(args[i])) { System.err.println("usage: java edu.stanford.nlp.process.HunTokenizer [options]* filename*"); System.err.println(" options: -preserveLines|-dump|-ioFileList|-charset|-parseInside elementRegex|-options options|-h"); return; // exit if they asked for help in options } else { System.err.println("Unknown option: " + args[i]); } i++; } ArrayList<String> inputFileList = new ArrayList<String>(); ArrayList<String> outputFileList = null; if (inputOutputFileList) { outputFileList = new ArrayList<String>(); for (int j = i; j < args.length; j++) 
{ BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[j]), charset)); for (String inLine; (inLine = r.readLine()) != null;) { String[] fields = inLine.split("\\s+"); inputFileList.add(fields[0]); if (fields.length > 1) { outputFileList.add(fields[1]); } else { outputFileList.add(fields[0] + ".tok"); } } r.close(); } } else { inputFileList.addAll(Arrays.asList(args).subList(i, args.length)); } if (untok) { untok(inputFileList, outputFileList, charset); } else { tok(inputFileList, outputFileList, charset, parseInsideBegin, parseInsideEnd, optionsSB.toString(), preserveLines, dump); } } // end main } // end HunTokenizer