package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.TokenizerFactory;
import java.util.function.Function;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.pennchinese.ChineseEscaper;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Timing;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPOutputStream;
import java.util.zip.GZIPInputStream;
import java.net.URL;
import java.net.URLConnection;

/**
 * This class lets you train a lexicon and segmenter at the same time.
 *
 * <p>It is a thin composite: every {@link Lexicon} operation delegates to the
 * wrapped {@link ChineseLexicon}, and every segmentation operation delegates to
 * the wrapped {@link WordSegmenter}. Training methods feed both components.
 * The {@code main} method additionally provides a standalone command-line tool
 * for training, serializing, and loading segmenter data.
 *
 * @author Galen Andrew
 * @author Pi-Chuan Chang
 */
public class ChineseLexiconAndWordSegmenter implements Lexicon, WordSegmenter {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseLexiconAndWordSegmenter.class);

  // The two wrapped components. Lexicon queries go to chineseLexicon;
  // segmentation goes to wordSegmenter; training updates both.
  private final ChineseLexicon chineseLexicon;
  private final WordSegmenter wordSegmenter;

  /** Wraps an already-constructed lexicon and segmenter as one combined object. */
  public ChineseLexiconAndWordSegmenter(ChineseLexicon lex, WordSegmenter seg) {
    chineseLexicon = lex;
    wordSegmenter = seg;
  }

  /** Segments the given text; delegates entirely to the wrapped segmenter. */
  @Override
  public List<HasWord> segment(String s) {
    return wordSegmenter.segment(s);
  }

  @Override
  public boolean isKnown(int word) {
    return chineseLexicon.isKnown(word);
  }

  @Override
  public boolean isKnown(String word) {
    return chineseLexicon.isKnown(word);
  }

  /** {@inheritDoc} */
  @Override
  public Set<String> tagSet(Function<String,String> basicCategoryFunction) {
    return chineseLexicon.tagSet(basicCategoryFunction);
  }

  // NOTE(review): both ruleIteratorByWord overloads discard the caller's
  // featureSpec and pass null to the underlying lexicon — presumably because
  // ChineseLexicon does not use feature specs; confirm before relying on it.
  @Override
  public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {
    return chineseLexicon.ruleIteratorByWord(word, loc, null);
  }

  @Override
  public Iterator<IntTaggedWord> ruleIteratorByWord(String word, int loc, String featureSpec) {
    return chineseLexicon.ruleIteratorByWord(word, loc, null);
  }

  /** Returns the number of rules (tag rewrites as word) in the Lexicon.
   *  This method assumes that the lexicon has been initialized.
   */
  @Override
  public int numRules() {
    return chineseLexicon.numRules();
  }

  /** Prepares both the lexicon and the segmenter for training on numTrees trees. */
  @Override
  public void initializeTraining(double numTrees) {
    chineseLexicon.initializeTraining(numTrees);
    wordSegmenter.initializeTraining(numTrees);
  }

  @Override
  public void train(Collection<Tree> trees) {
    train(trees, 1.0);
  }

  @Override
  public void train(Collection<Tree> trees, double weight) {
    for (Tree tree : trees) {
      train(tree, weight);
    }
  }

  @Override
  public void train(Tree tree) {
    train(tree, 1.0);
  }

  // Trees are reduced to their tagged yield before training.
  @Override
  public void train(Tree tree, double weight) {
    train(tree.taggedYield(), weight);
  }

  @Override
  public void train(List<TaggedWord> sentence) {
    train(sentence, 1.0);
  }

  // Note: the weight is only passed to the lexicon; the segmenter's train()
  // takes no weight, so weighted training affects the lexicon alone.
  @Override
  public void train(List<TaggedWord> sentence, double weight) {
    chineseLexicon.train(sentence, weight);
    wordSegmenter.train(sentence);
  }

  @Override
  public void trainUnannotated(List<TaggedWord> sentence, double weight) {
    // TODO: for now we just punt on these
    throw new UnsupportedOperationException("This version of the parser does not support non-tree training data");
  }

  @Override
  public void incrementTreesRead(double weight) {
    throw new UnsupportedOperationException();
  }

  @Override
  public void train(TaggedWord tw, int loc, double weight) {
    throw new UnsupportedOperationException();
  }

  /** Finalizes training state in both wrapped components. */
  @Override
  public void finishTraining() {
    chineseLexicon.finishTraining();
    wordSegmenter.finishTraining();
  }

  // As with ruleIteratorByWord, the featureSpec argument is dropped (null is
  // passed through to the wrapped lexicon).
  @Override
  public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) {
    return chineseLexicon.score(iTW, loc, word, null);
  } // end score()

  @Override
  public void loadSegmenter(String filename) {
    throw new UnsupportedOperationException();
  }

  // NOTE(review): readData/writeData only touch the lexicon component — the
  // segmenter's state is not included in this text format. Text-format
  // round-trips therefore do not restore the segmenter; confirm this is
  // intentional (serialized format via ObjectOutputStream saves the whole
  // object instead).
  @Override
  public void readData(BufferedReader in) throws IOException {
    chineseLexicon.readData(in);
  }

  @Override
  public void writeData(Writer w) throws IOException {
    chineseLexicon.writeData(w);
  }

  // the data & functions below are for standalone segmenter. -pichuan

  private Options op;

  // helper function
  /** Counts the arguments following args[index] that do not start with '-',
   *  i.e. the number of values attached to the option at args[index]. */
  private static int numSubArgs(String[] args, int index) {
    int i = index;
    while (i + 1 < args.length && args[i + 1].charAt(0) != '-') {
      i++;
    }
    return i - index;
  }

  /** Trains a new combined lexicon/segmenter from a treebank and adopts its
   *  two components. (Delegates the actual work to getSegmenterDataFromTreebank.) */
  private ChineseLexiconAndWordSegmenter(Treebank trainTreebank, Options op, Index<String> wordIndex, Index<String> tagIndex) {
    ChineseLexiconAndWordSegmenter cs = getSegmenterDataFromTreebank(trainTreebank, op, wordIndex, tagIndex);
    chineseLexicon = cs.chineseLexicon;
    wordSegmenter = cs.wordSegmenter;
  }

  /** Annotates and binarizes the training treebank (including optional
   *  selective-split and Collins-punctuation transforms), then trains a fresh
   *  combined lexicon/segmenter on the binarized trees and returns it.
   *  Progress is reported on stdout and via the logger. */
  private static ChineseLexiconAndWordSegmenter getSegmenterDataFromTreebank(Treebank trainTreebank, Options op, Index<String> wordIndex, Index<String> tagIndex) {
    System.out.println("Currently " + new Date());
    // printOptions(true, op);
    Timing.startTime();
    // setup tree transforms
    TreebankLangParserParams tlpParams = op.tlpParams;
    if (op.testOptions.verbose) {
      System.out.print("Training ");
      System.out.println(trainTreebank.textualSummary());
    }
    System.out.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer; // initialized below
    if (!op.trainOptions.leftToRight) {
      binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    } else {
      binarizer = new TreeAnnotatorAndBinarizer(tlpParams.headFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.treebankLanguagePack());
    }
    List<Tree> binaryTrainTrees = new ArrayList<>();
    // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.treebankLanguagePack());
      if (op.testOptions.verbose) {
        log.info("Parent split categories: " + op.trainOptions.splitters);
      }
    }
    if (op.trainOptions.selectivePostSplit) {
      // Post-split categories are computed on an annotated copy of the treebank.
      TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.treebankLanguagePack());
      if (op.testOptions.verbose) {
        log.info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
      }
    }
    if (op.trainOptions.hSelSplit) {
      // First pass over the treebank with selective splitting off, so the
      // binarizer can gather statistics before the real (second) pass below.
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (op.trainOptions.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        tree = binarizer.transformTree(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    Timing.tick("done.");
    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }
    System.out.print("Extracting Lexicon...");
    // op.tlpParams.lex(...) is expected to produce a ChineseLexiconAndWordSegmenter
    // here (it is configured with ChineseTreebankParserParams in main()).
    ChineseLexiconAndWordSegmenter clex = (ChineseLexiconAndWordSegmenter) op.tlpParams.lex(op, wordIndex, tagIndex);
    clex.initializeTraining(binaryTrainTrees.size());
    clex.train(binaryTrainTrees);
    clex.finishTraining();
    Timing.tick("done.");
    return clex;
  }

  /** Echoes the command-line arguments to the given stream (for logging/repro). */
  private static void printArgs(String[] args, PrintStream ps) {
    ps.print("ChineseLexiconAndWordSegmenter invoked with arguments:");
    for (String arg : args) {
      ps.print(" " + arg);
    }
    ps.println();
  }

  /** Serializes the whole combined object to filename (Java serialization).
   *  IOExceptions are reported via printStackTrace rather than rethrown. */
  static void saveSegmenterDataToSerialized(ChineseLexiconAndWordSegmenter cs, String filename) {
    try {
      log.info("Writing segmenter in serialized format to file " + filename + " ");
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(cs);
      out.close();
      log.info("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  }

  /** Writes the lexicon component in text grammar format (gzipped if filename
   *  ends in ".gz"). Note: via writeData this saves only the lexicon, not the
   *  segmenter state. */
  static void saveSegmenterDataToText(ChineseLexiconAndWordSegmenter cs, String filename) {
    try {
      log.info("Writing parser in text grammar format to file " + filename);
      OutputStream os;
      if (filename.endsWith(".gz")) {
        // it's faster to do the buffering _outside_ the gzipping as here
        os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename)));
      } else {
        os = new BufferedOutputStream(new FileOutputStream(filename));
      }
      PrintWriter out = new PrintWriter(os);
      String prefix = "BEGIN ";
      // out.println(prefix + "OPTIONS");
      // if (pd.pt != null) {
      //   pd.pt.writeData(out);
      // }
      // out.println();
      // log.info(".");
      out.println(prefix + "LEXICON");
      if (cs != null) {
        cs.writeData(out);
      }
      out.println();
      log.info(".");
      out.flush();
      out.close();
      log.info("done.");
    } catch (IOException e) {
      log.info("Trouble saving segmenter data to ASCII format.");
      e.printStackTrace();
    }
  }

  /** Loads a training treebank from treebankPath, restricted by filt if non-null. */
  private static Treebank makeTreebank(String treebankPath, Options op, FileFilter filt) {
    log.info("Training a segmenter from treebank dir: " + treebankPath);
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    log.info("Reading trees...");
    if (filt == null) {
      trainTreebank.loadPath(treebankPath);
    } else {
      trainTreebank.loadPath(treebankPath, filt);
    }
    Timing.tick("done [read " + trainTreebank.size() + " trees].");
    return trainTreebank;
  }

  /**
   * Construct a new ChineseLexiconAndWordSegmenter.  This loads a segmenter file that
   * was previously assembled and stored.
   *
   * @throws IllegalArgumentException If segmenter data cannot be loaded
   */
  public ChineseLexiconAndWordSegmenter(String segmenterFileOrUrl, Options op) {
    ChineseLexiconAndWordSegmenter cs = getSegmenterDataFromFile(segmenterFileOrUrl, op);
    this.op = cs.op; // in case a serialized options was read in
    chineseLexicon = cs.chineseLexicon;
    wordSegmenter = cs.wordSegmenter;
  }

  /** Loads segmenter data from a file or URL. Currently only the serialized
   *  format is supported; the text-format fallback is disabled, so this may
   *  return null if serialized loading fails softly. */
  public static ChineseLexiconAndWordSegmenter getSegmenterDataFromFile(String parserFileOrUrl, Options op) {
    ChineseLexiconAndWordSegmenter cs = getSegmenterDataFromSerializedFile(parserFileOrUrl);
    if (cs == null) {
      // pd = getSegmenterDataFromTextFile(parserFileOrUrl, op);
    }
    return cs;
  }

  /** Deserializes a ChineseLexiconAndWordSegmenter from a local file or an
   *  http:// URL, transparently un-gzipping names ending in ".gz".
   *  Throws RuntimeException on InvalidClassException / FileNotFoundException
   *  (no point retrying as text); returns null on StreamCorruptedException
   *  (caller may try the text format) and on other failures. */
  protected static ChineseLexiconAndWordSegmenter getSegmenterDataFromSerializedFile(String serializedFileOrUrl) {
    ChineseLexiconAndWordSegmenter cs = null;
    try {
      log.info("Loading segmenter from serialized file " + serializedFileOrUrl + " ...");
      ObjectInputStream in;
      InputStream is;
      if (serializedFileOrUrl.startsWith("http://")) {
        URL u = new URL(serializedFileOrUrl);
        URLConnection uc = u.openConnection();
        is = uc.getInputStream();
      } else {
        is = new FileInputStream(serializedFileOrUrl);
      }
      if (serializedFileOrUrl.endsWith(".gz")) {
        // it's faster to do the buffering _outside_ the gzipping as here
        in = new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(is)));
      } else {
        in = new ObjectInputStream(new BufferedInputStream(is));
      }
      cs = (ChineseLexiconAndWordSegmenter) in.readObject();
      in.close();
      log.info(" done.");
      return cs;
    } catch (InvalidClassException ice) {
      // For this, it's not a good idea to continue and try it as a text file!
      log.info(); // as in middle of line from above message
      throw new RuntimeException(ice);
    } catch (FileNotFoundException fnfe) {
      // For this, it's not a good idea to continue and try it as a text file!
      log.info(); // as in middle of line from above message
      throw new RuntimeException(fnfe);
    } catch (StreamCorruptedException sce) {
      // suppress error message, on the assumption that we've really got
      // a text grammar, and that'll be tried next
    } catch (Exception e) {
      log.info(); // as in middle of line from above message
      e.printStackTrace();
    }
    return null;
  }

  /** This method lets you train and test a segmenter relative to a
   *  Treebank.
   *  <p>
   *  <i>Implementation note:</i> This method is largely cloned from
   *  LexicalizedParser's main method.  Should we try to have it be able
   *  to train segmenters to stop things going out of sync?
   *  <p>
   *  Option arguments (each option consumes the values that follow it, as
   *  counted by numSubArgs): -train, -encoding, -loadFromSerializedFile,
   *  -saveToSerializedFile, -saveToTextFile, -treebank; unknown flags are
   *  offered to tlpParams.setOptionFlag. Remaining positional arguments are
   *  consumed in order (treebank path / serialized file / output path).
   */
  public static void main(String[] args) {
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank testTreebank = null;
    // Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    FileFilter trainFilter = null;
    String encoding = null;
    // variables needed to process the files to be parsed
    // (several of these are only used by the commented-out file-processing
    // code at the bottom of this method, and are otherwise unused)
    TokenizerFactory<Word> tokenizerFactory = null;
    // DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = new ChineseEscaper();
    // int tagDelimiter = -1;
    // String sentenceDelimiter = "\n";
    // boolean fromXML = false;
    int argIndex = 0;
    if (args.length < 1) {
      log.info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
      return;
    }

    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();

    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
      if (args[argIndex].equalsIgnoreCase("-train")) {
        train = true;
        saveToSerializedFile = true;
        int numSubArgs = numSubArgs(args, argIndex);
        argIndex++;
        // NOTE(review): this requires at least TWO sub-args before it accepts
        // a treebankPath ("> 1"); "-train path" alone throws. Confirm whether
        // that is the intended contract or should be ">= 1".
        if (numSubArgs > 1) {
          treebankPath = args[argIndex];
          argIndex++;
        } else {
          throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
        }
        if (numSubArgs == 2) {
          trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs >= 3) {
          try {
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
        // sets encoding for TreebankLangParserParams
        encoding = args[argIndex + 1];
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
        // load the parser from a binary serialized file
        // the next argument must be the path to the parser file
        serializedInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
        // doesn't make sense to load from TextFile -pichuan
        // } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        //   // load the parser from declarative text file
        //   // the next argument must be the path to the parser file
        //   textInputFileOrUrl = args[argIndex + 1];
        //   argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
        saveToSerializedFile = true;
        serializedOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
        // save the parser to declarative text file
        saveToTextFile = true;
        textOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-treebank")) {
        // the next argument is the treebank path and range for testing
        int numSubArgs = numSubArgs(args, argIndex);
        argIndex++;
        if (numSubArgs == 1) {
          testFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs > 1) {
          testPath = args[argIndex++];
          if (numSubArgs == 2) {
            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
          } else if (numSubArgs >= 3) {
            try {
              int low = Integer.parseInt(args[argIndex]);
              int high = Integer.parseInt(args[argIndex + 1]);
              testFilter = new NumberRangeFileFilter(low, high, true);
              argIndex += 2;
            } catch (NumberFormatException e) {
              // maybe it's a ranges expression?
              testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
          }
        }
      } else {
        // unrecognized flag: offer it to the language-pack option parser;
        // setOptionFlag returns the index after any args it consumed
        int j = op.tlpParams.setOptionFlag(args, argIndex);
        if (j == argIndex) {
          log.info("Unknown option ignored: " + args[argIndex]);
          j++;
        }
        argIndex = j;
      }
    } // end while loop through arguments

    TreebankLangParserParams tlpParams = op.tlpParams;

    // all other arguments are order dependent and
    // are processed in order below
    ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose) {
      System.out.println("Currently " + new Date());
      printArgs(args, System.out);
    }
    if (train) {
      printArgs(args, System.out);
      // so we train a parser using the treebank
      if (treebankPath == null) {
        // the next arg must be the treebank path, since it wasn't give earlier
        treebankPath = args[argIndex];
        argIndex++;
        if (args.length > argIndex + 1) {
          try {
            // the next two args might be the range
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      }
      Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
      Index<String> wordIndex = new HashIndex<>();
      Index<String> tagIndex = new HashIndex<>();
      cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    } else if (textInputFileOrUrl != null) {
      // so we load the segmenter from a text grammar file
      // XXXXX fix later -pichuan
      //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
    } else {
      // so we load a serialized segmenter
      if (serializedInputFileOrUrl == null) {
        // the next argument must be the path to the serialized parser
        serializedInputFileOrUrl = args[argIndex];
        argIndex++;
      }
      try {
        cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
      } catch (IllegalArgumentException e) {
        log.info("Error loading segmenter, exiting...");
        System.exit(0);
      }
    }

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.treePrint(tlpParams);

    if (testFilter != null) {
      if (testPath == null) {
        if (treebankPath == null) {
          throw new RuntimeException("No test treebank path specified...");
        } else {
          log.info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
          testPath = treebankPath;
        }
      }
      testTreebank = tlpParams.testMemoryTreebank();
      testTreebank.loadPath(testPath, testFilter);
    }

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));

    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again.  We also set the tlpParams of the
    // LexicalizedParser instance to be the same object.  This is
    // redundancy that we probably should take out eventually.
    //
    // -- Roger
    if (op.testOptions.verbose) {
      log.info("Lexicon is " + cs.getClass().getName());
    }

    PrintWriter pwOut = tlpParams.pw();
    PrintWriter pwErr = tlpParams.pw(System.err);

    // Now what do we do with the parser we've made
    if (saveToTextFile) {
      // save the parser to textGrammar format
      if (textOutputFileOrUrl != null) {
        saveSegmenterDataToText(cs, textOutputFileOrUrl);
      } else {
        log.info("Usage: must specify a text segmenter data output path");
      }
    }
    if (saveToSerializedFile) {
      if (serializedOutputFileOrUrl == null && argIndex < args.length) {
        // the next argument must be the path to serialize to
        serializedOutputFileOrUrl = args[argIndex];
        argIndex++;
      }
      if (serializedOutputFileOrUrl != null) {
        saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
      } else if (textOutputFileOrUrl == null && testTreebank == null) {
        // no saving/parsing request has been specified
        log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
      }
    }

    /* --------------------- Testing part!!!! ----------------------- */
    if (op.testOptions.verbose) {
      // printOptions(false, op);
    }
    if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
      // test parser on treebank
      if (testTreebank == null) {
        // the next argument is the treebank path and range for testing
        testTreebank = tlpParams.testMemoryTreebank();
        if (args.length < argIndex + 4) {
          testTreebank.loadPath(args[argIndex + 1]);
        } else {
          int testlow = Integer.parseInt(args[argIndex + 2]);
          int testhigh = Integer.parseInt(args[argIndex + 3]);
          testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
        }
      }
      /* TODO - test segmenting on treebank. -pichuan */
      // lp.testOnTreebank(testTreebank);
      // } else if (argIndex >= args.length) {
      //   // no more arguments, so we just parse our own test sentence
      //   if (lp.parse(op.tlpParams.defaultTestSentence())) {
      //     treePrint.printTree(lp.getBestParse(), pwOut);
      //   } else {
      //     pwErr.println("Error. Can't parse test sentence: " +
      //                   lp.parse(op.tlpParams.defaultTestSentence()));
      //   }
    }

    //wsg2010: This code block doesn't actually do anything. It appears to read and tokenize a file, and then just print it.
    //         There are easier ways to do that. This code was copied from an old version of LexicalizedParser.
    //
    // else {
    //   // We parse filenames given by the remaining arguments
    //   int numWords = 0;
    //   Timing timer = new Timing();
    //   // set the tokenizer
    //   if (tokenized) {
    //     tokenizerFactory = WhitespaceTokenizer.factory();
    //   }
    //   TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
    //   if (tokenizerFactory == null) {
    //     tokenizerFactory = (TokenizerFactory<Word>) tlp.getTokenizerFactory();
    //   }
    //   documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    //   documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    //   if (encoding != null) {
    //     documentPreprocessor.setEncoding(encoding);
    //   }
    //   timer.start();
    //   for (int i = argIndex; i < args.length; i++) {
    //     String filename = args[i];
    //     try {
    //       List document = null;
    //       if (fromXML) {
    //         document = documentPreprocessor.getSentencesFromXML(filename, sentenceDelimiter, tokenized);
    //       } else {
    //         document = documentPreprocessor.getSentencesFromText(filename, escaper, sentenceDelimiter, tagDelimiter);
    //       }
    //       log.info("Segmenting file: " + filename + " with " + document.size() + " sentences.");
    //       PrintWriter pwo = pwOut;
    //       if (op.testOptions.writeOutputFiles) {
    //         try {
    //           pwo = tlpParams.pw(new FileOutputStream(filename + ".stp"));
    //         } catch (IOException ioe) {
    //           ioe.printStackTrace();
    //         }
    //       }
    //       int num = 0;
    //       treePrint.printHeader(pwo, tlp.getEncoding());
    //       for (Iterator it = document.iterator(); it.hasNext();) {
    //         num++;
    //         List sentence = (List) it.next();
    //         int len = sentence.size();
    //         numWords += len;
    //         // pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + sentence);
    //         pwo.println(Sentence.listToString(sentence));
    //       }
    //       treePrint.printFooter(pwo);
    //       if (op.testOptions.writeOutputFiles) {
    //         pwo.close();
    //       }
    //     } catch (IOException e) {
    //       pwErr.println("Couldn't find file: " + filename);
    //     }
    //
    //   } // end for each file
    //   long millis = timer.stop();
    //   double wordspersec = numWords / (((double) millis) / 1000);
    //   NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    //   pwErr.println("Segmented " + numWords + " words at " + nf.format(wordspersec) + " words per second.");
    // }
  }

  // serialVersionUID for the serialized segmenter format written/read above.
  // NOTE(review): serializability presumably comes from the implemented
  // interfaces (Lexicon/WordSegmenter) — confirm.
  private static final long serialVersionUID = -6554995189795187918L;

  @Override
  public UnknownWordModel getUnknownWordModel() {
    return chineseLexicon.getUnknownWordModel();
  }

  @Override
  public void setUnknownWordModel(UnknownWordModel uwm) {
    chineseLexicon.setUnknownWordModel(uwm);
  }

  // rawTrees are ignored; only the (annotated) trees are used for training.
  @Override
  public void train(Collection<Tree> trees, Collection<Tree> rawTrees) {
    train(trees);
  }

}