package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.TokenizerFactory;
import java.util.function.Function;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.pennchinese.ChineseEscaper;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Timing;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPOutputStream;
import java.util.zip.GZIPInputStream;
import java.net.URL;
import java.net.URLConnection;

/**
 * This class lets you train a lexicon and segmenter at the same time.
 *
 * <p>It is a thin composite: every {@link Lexicon} operation delegates to the
 * wrapped {@link ChineseLexicon}, and every segmentation operation delegates to
 * the wrapped {@link WordSegmenter}. Training methods feed both components.
 * The {@code main} method additionally provides a standalone command-line tool
 * for training, serializing, and loading segmenter data.
 *
 * @author Galen Andrew
 * @author Pi-Chuan Chang
 */
public class ChineseLexiconAndWordSegmenter implements Lexicon, WordSegmenter {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseLexiconAndWordSegmenter.class);

  // The two wrapped components. Lexicon queries go to chineseLexicon;
  // segmentation goes to wordSegmenter; training updates both.
  private final ChineseLexicon chineseLexicon;
  private final WordSegmenter wordSegmenter;

  /** Wraps an already-constructed lexicon and segmenter as one combined object. */
  public ChineseLexiconAndWordSegmenter(ChineseLexicon lex, WordSegmenter seg) {
    chineseLexicon = lex;
    wordSegmenter = seg;
  }

  /** Segments the given text; delegates entirely to the wrapped segmenter. */
  @Override
  public List<HasWord> segment(String s) {
    return wordSegmenter.segment(s);
  }

  @Override
  public boolean isKnown(int word) {
    return chineseLexicon.isKnown(word);
  }

  @Override
  public boolean isKnown(String word) {
    return chineseLexicon.isKnown(word);
  }

  /** {@inheritDoc} */
  @Override
  public Set<String> tagSet(Function<String,String> basicCategoryFunction) {
    return chineseLexicon.tagSet(basicCategoryFunction);
  }

  // NOTE(review): both ruleIteratorByWord overloads discard the caller's
  // featureSpec and pass null to the underlying lexicon — presumably because
  // ChineseLexicon does not use feature specs; confirm before relying on it.
  @Override
  public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {
    return chineseLexicon.ruleIteratorByWord(word, loc, null);
  }

  @Override
  public Iterator<IntTaggedWord> ruleIteratorByWord(String word, int loc, String featureSpec) {
    return chineseLexicon.ruleIteratorByWord(word, loc, null);
  }

  /** Returns the number of rules (tag rewrites as word) in the Lexicon.
   *  This method assumes that the lexicon has been initialized.
   */
  @Override
  public int numRules() {
    return chineseLexicon.numRules();
  }

  /** Prepares both the lexicon and the segmenter for training on numTrees trees. */
  @Override
  public void initializeTraining(double numTrees) {
    chineseLexicon.initializeTraining(numTrees);
    wordSegmenter.initializeTraining(numTrees);
  }

  @Override
  public void train(Collection<Tree> trees) {
    train(trees, 1.0);
  }

  @Override
  public void train(Collection<Tree> trees, double weight) {
    for (Tree tree : trees) {
      train(tree, weight);
    }
  }

  @Override
  public void train(Tree tree) {
    train(tree, 1.0);
  }

  // Trees are reduced to their tagged yield before training.
  @Override
  public void train(Tree tree, double weight) {
    train(tree.taggedYield(), weight);
  }

  @Override
  public void train(List<TaggedWord> sentence) {
    train(sentence, 1.0);
  }

  // Note: the weight is only passed to the lexicon; the segmenter's train()
  // takes no weight, so weighted training affects the lexicon alone.
  @Override
  public void train(List<TaggedWord> sentence, double weight) {
    chineseLexicon.train(sentence, weight);
    wordSegmenter.train(sentence);
  }

  @Override
  public void trainUnannotated(List<TaggedWord> sentence, double weight) {
    // TODO: for now we just punt on these
    throw new UnsupportedOperationException("This version of the parser does not support non-tree training data");
  }

  @Override
  public void incrementTreesRead(double weight) {
    throw new UnsupportedOperationException();
  }

  @Override
  public void train(TaggedWord tw, int loc, double weight) {
    throw new UnsupportedOperationException();
  }

  /** Finalizes training state in both wrapped components. */
  @Override
  public void finishTraining() {
    chineseLexicon.finishTraining();
    wordSegmenter.finishTraining();
  }

  // As with ruleIteratorByWord, the featureSpec argument is dropped (null is
  // passed through to the wrapped lexicon).
  @Override
  public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) {
    return chineseLexicon.score(iTW, loc, word, null);
  } // end score()

  @Override
  public void loadSegmenter(String filename) {
    throw new UnsupportedOperationException();
  }

  // NOTE(review): readData/writeData only touch the lexicon component — the
  // segmenter's state is not included in this text format. Text-format
  // round-trips therefore do not restore the segmenter; confirm this is
  // intentional (serialized format via ObjectOutputStream saves the whole
  // object instead).
  @Override
  public void readData(BufferedReader in) throws IOException {
    chineseLexicon.readData(in);
  }

  @Override
  public void writeData(Writer w) throws IOException {
    chineseLexicon.writeData(w);
  }

  // the data & functions below are for standalone segmenter. -pichuan

  private Options op;

  // helper function
  /** Counts the arguments following args[index] that do not start with '-',
   *  i.e. the number of values attached to the option at args[index]. */
  private static int numSubArgs(String[] args, int index) {
    int i = index;
    while (i + 1 < args.length && args[i + 1].charAt(0) != '-') {
      i++;
    }
    return i - index;
  }

  /** Trains a new combined lexicon/segmenter from a treebank and adopts its
   *  two components. (Delegates the actual work to getSegmenterDataFromTreebank.) */
  private ChineseLexiconAndWordSegmenter(Treebank trainTreebank, Options op, Index<String> wordIndex, Index<String> tagIndex) {
    ChineseLexiconAndWordSegmenter cs = getSegmenterDataFromTreebank(trainTreebank, op, wordIndex, tagIndex);
    chineseLexicon = cs.chineseLexicon;
    wordSegmenter = cs.wordSegmenter;
  }

  /** Annotates and binarizes the training treebank (including optional
   *  selective-split and Collins-punctuation transforms), then trains a fresh
   *  combined lexicon/segmenter on the binarized trees and returns it.
   *  Progress is reported on stdout and via the logger. */
  private static ChineseLexiconAndWordSegmenter getSegmenterDataFromTreebank(Treebank trainTreebank, Options op, Index<String> wordIndex, Index<String> tagIndex) {
    System.out.println("Currently " + new Date());
    // printOptions(true, op);
    Timing.startTime();
    // setup tree transforms
    TreebankLangParserParams tlpParams = op.tlpParams;
    if (op.testOptions.verbose) {
      System.out.print("Training ");
      System.out.println(trainTreebank.textualSummary());
    }
    System.out.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer; // initialized below
    if (!op.trainOptions.leftToRight) {
      binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    } else {
      binarizer = new TreeAnnotatorAndBinarizer(tlpParams.headFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.treebankLanguagePack());
    }
    List<Tree> binaryTrainTrees = new ArrayList<>();
    // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.treebankLanguagePack());
      if (op.testOptions.verbose) {
        log.info("Parent split categories: " + op.trainOptions.splitters);
      }
    }
    if (op.trainOptions.selectivePostSplit) {
      // Post-split categories are computed on an annotated copy of the treebank.
      TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.treebankLanguagePack());
      if (op.testOptions.verbose) {
        log.info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
      }
    }
    if (op.trainOptions.hSelSplit) {
      // First pass over the treebank with selective splitting off, so the
      // binarizer can gather statistics before the real (second) pass below.
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (op.trainOptions.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        tree = binarizer.transformTree(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    Timing.tick("done.");
    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }
    System.out.print("Extracting Lexicon...");
    // op.tlpParams.lex(...) is expected to produce a ChineseLexiconAndWordSegmenter
    // here (it is configured with ChineseTreebankParserParams in main()).
    ChineseLexiconAndWordSegmenter clex = (ChineseLexiconAndWordSegmenter) op.tlpParams.lex(op, wordIndex, tagIndex);
    clex.initializeTraining(binaryTrainTrees.size());
    clex.train(binaryTrainTrees);
    clex.finishTraining();
    Timing.tick("done.");
    return clex;
  }

  /** Echoes the command-line arguments to the given stream (for logging/repro). */
  private static void printArgs(String[] args, PrintStream ps) {
    ps.print("ChineseLexiconAndWordSegmenter invoked with arguments:");
    for (String arg : args) {
      ps.print(" " + arg);
    }
    ps.println();
  }

  /** Serializes the whole combined object to filename (Java serialization).
   *  IOExceptions are reported via printStackTrace rather than rethrown. */
  static void saveSegmenterDataToSerialized(ChineseLexiconAndWordSegmenter cs, String filename) {
    try {
      log.info("Writing segmenter in serialized format to file " + filename + " ");
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(cs);
      out.close();
      log.info("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  }

  /** Writes the lexicon component in text grammar format (gzipped if filename
   *  ends in ".gz"). Note: via writeData this saves only the lexicon, not the
   *  segmenter state. */
  static void saveSegmenterDataToText(ChineseLexiconAndWordSegmenter cs, String filename) {
    try {
      log.info("Writing parser in text grammar format to file " + filename);
      OutputStream os;
      if (filename.endsWith(".gz")) {
        // it's faster to do the buffering _outside_ the gzipping as here
        os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename)));
      } else {
        os = new BufferedOutputStream(new FileOutputStream(filename));
      }
      PrintWriter out = new PrintWriter(os);
      String prefix = "BEGIN ";
      // out.println(prefix + "OPTIONS");
      // if (pd.pt != null) {
      //   pd.pt.writeData(out);
      // }
      // out.println();
      // log.info(".");
      out.println(prefix + "LEXICON");
      if (cs != null) {
        cs.writeData(out);
      }
      out.println();
      log.info(".");
      out.flush();
      out.close();
      log.info("done.");
    } catch (IOException e) {
      log.info("Trouble saving segmenter data to ASCII format.");
      e.printStackTrace();
    }
  }

  /** Loads a training treebank from treebankPath, restricted by filt if non-null. */
  private static Treebank makeTreebank(String treebankPath, Options op, FileFilter filt) {
    log.info("Training a segmenter from treebank dir: " + treebankPath);
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    log.info("Reading trees...");
    if (filt == null) {
      trainTreebank.loadPath(treebankPath);
    } else {
      trainTreebank.loadPath(treebankPath, filt);
    }
    Timing.tick("done [read " + trainTreebank.size() + " trees].");
    return trainTreebank;
  }

  /**
   * Construct a new ChineseLexiconAndWordSegmenter.  This loads a segmenter file that
   * was previously assembled and stored.
   *
   * @throws IllegalArgumentException If segmenter data cannot be loaded
   */
  public ChineseLexiconAndWordSegmenter(String segmenterFileOrUrl, Options op) {
    ChineseLexiconAndWordSegmenter cs = getSegmenterDataFromFile(segmenterFileOrUrl, op);
    this.op = cs.op; // in case a serialized options was read in
    chineseLexicon = cs.chineseLexicon;
    wordSegmenter = cs.wordSegmenter;
  }

  /** Loads segmenter data from a file or URL. Currently only the serialized
   *  format is supported; the text-format fallback is disabled, so this may
   *  return null if serialized loading fails softly. */
  public static ChineseLexiconAndWordSegmenter getSegmenterDataFromFile(String parserFileOrUrl, Options op) {
    ChineseLexiconAndWordSegmenter cs = getSegmenterDataFromSerializedFile(parserFileOrUrl);
    if (cs == null) {
      // pd = getSegmenterDataFromTextFile(parserFileOrUrl, op);
    }
    return cs;
  }

  /** Deserializes a ChineseLexiconAndWordSegmenter from a local file or an
   *  http:// URL, transparently un-gzipping names ending in ".gz".
   *  Throws RuntimeException on InvalidClassException / FileNotFoundException
   *  (no point retrying as text); returns null on StreamCorruptedException
   *  (caller may try the text format) and on other failures. */
  protected static ChineseLexiconAndWordSegmenter getSegmenterDataFromSerializedFile(String serializedFileOrUrl) {
    ChineseLexiconAndWordSegmenter cs = null;
    try {
      log.info("Loading segmenter from serialized file " + serializedFileOrUrl + " ...");
      ObjectInputStream in;
      InputStream is;
      if (serializedFileOrUrl.startsWith("http://")) {
        URL u = new URL(serializedFileOrUrl);
        URLConnection uc = u.openConnection();
        is = uc.getInputStream();
      } else {
        is = new FileInputStream(serializedFileOrUrl);
      }
      if (serializedFileOrUrl.endsWith(".gz")) {
        // it's faster to do the buffering _outside_ the gzipping as here
        in = new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(is)));
      } else {
        in = new ObjectInputStream(new BufferedInputStream(is));
      }
      cs = (ChineseLexiconAndWordSegmenter) in.readObject();
      in.close();
      log.info(" done.");
      return cs;
    } catch (InvalidClassException ice) {
      // For this, it's not a good idea to continue and try it as a text file!
      log.info(); // as in middle of line from above message
      throw new RuntimeException(ice);
    } catch (FileNotFoundException fnfe) {
      // For this, it's not a good idea to continue and try it as a text file!
      log.info(); // as in middle of line from above message
      throw new RuntimeException(fnfe);
    } catch (StreamCorruptedException sce) {
      // suppress error message, on the assumption that we've really got
      // a text grammar, and that'll be tried next
    } catch (Exception e) {
      log.info(); // as in middle of line from above message
      e.printStackTrace();
    }
    return null;
  }

  /** This method lets you train and test a segmenter relative to a
   *  Treebank.
   *  <p>
   *  <i>Implementation note:</i> This method is largely cloned from
   *  LexicalizedParser's main method.  Should we try to have it be able
   *  to train segmenters to stop things going out of sync?
   *  <p>
   *  Option arguments (each option consumes the values that follow it, as
   *  counted by numSubArgs): -train, -encoding, -loadFromSerializedFile,
   *  -saveToSerializedFile, -saveToTextFile, -treebank; unknown flags are
   *  offered to tlpParams.setOptionFlag. Remaining positional arguments are
   *  consumed in order (treebank path / serialized file / output path).
   */
  public static void main(String[] args) {
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank testTreebank = null;
    // Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    FileFilter trainFilter = null;
    String encoding = null;
    // variables needed to process the files to be parsed
    // (several of these are only used by the commented-out file-processing
    // code at the bottom of this method, and are otherwise unused)
    TokenizerFactory<Word> tokenizerFactory = null;
    // DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = new ChineseEscaper();
    // int tagDelimiter = -1;
    // String sentenceDelimiter = "\n";
    // boolean fromXML = false;
    int argIndex = 0;
    if (args.length < 1) {
      log.info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
      return;
    }

    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();

    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
      if (args[argIndex].equalsIgnoreCase("-train")) {
        train = true;
        saveToSerializedFile = true;
        int numSubArgs = numSubArgs(args, argIndex);
        argIndex++;
        // NOTE(review): this requires at least TWO sub-args before it accepts
        // a treebankPath ("> 1"); "-train path" alone throws. Confirm whether
        // that is the intended contract or should be ">= 1".
        if (numSubArgs > 1) {
          treebankPath = args[argIndex];
          argIndex++;
        } else {
          throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
        }
        if (numSubArgs == 2) {
          trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs >= 3) {
          try {
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
        // sets encoding for TreebankLangParserParams
        encoding = args[argIndex + 1];
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
        // load the parser from a binary serialized file
        // the next argument must be the path to the parser file
        serializedInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
        // doesn't make sense to load from TextFile -pichuan
        // } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        //   // load the parser from declarative text file
        //   // the next argument must be the path to the parser file
        //   textInputFileOrUrl = args[argIndex + 1];
        //   argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
        saveToSerializedFile = true;
        serializedOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
        // save the parser to declarative text file
        saveToTextFile = true;
        textOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-treebank")) {
        // the next argument is the treebank path and range for testing
        int numSubArgs = numSubArgs(args, argIndex);
        argIndex++;
        if (numSubArgs == 1) {
          testFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs > 1) {
          testPath = args[argIndex++];
          if (numSubArgs == 2) {
            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
          } else if (numSubArgs >= 3) {
            try {
              int low = Integer.parseInt(args[argIndex]);
              int high = Integer.parseInt(args[argIndex + 1]);
              testFilter = new NumberRangeFileFilter(low, high, true);
              argIndex += 2;
            } catch (NumberFormatException e) {
              // maybe it's a ranges expression?
              testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
          }
        }
      } else {
        // unrecognized flag: offer it to the language-pack option parser;
        // setOptionFlag returns the index after any args it consumed
        int j = op.tlpParams.setOptionFlag(args, argIndex);
        if (j == argIndex) {
          log.info("Unknown option ignored: " + args[argIndex]);
          j++;
        }
        argIndex = j;
      }
    } // end while loop through arguments

    TreebankLangParserParams tlpParams = op.tlpParams;

    // all other arguments are order dependent and
    // are processed in order below
    ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose) {
      System.out.println("Currently " + new Date());
      printArgs(args, System.out);
    }
    if (train) {
      printArgs(args, System.out);
      // so we train a parser using the treebank
      if (treebankPath == null) {
        // the next arg must be the treebank path, since it wasn't give earlier
        treebankPath = args[argIndex];
        argIndex++;
        if (args.length > argIndex + 1) {
          try {
            // the next two args might be the range
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      }
      Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
      Index<String> wordIndex = new HashIndex<>();
      Index<String> tagIndex = new HashIndex<>();
      cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    } else if (textInputFileOrUrl != null) {
      // so we load the segmenter from a text grammar file
      // XXXXX fix later -pichuan
      //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
    } else {
      // so we load a serialized segmenter
      if (serializedInputFileOrUrl == null) {
        // the next argument must be the path to the serialized parser
        serializedInputFileOrUrl = args[argIndex];
        argIndex++;
      }
      try {
        cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
      } catch (IllegalArgumentException e) {
        log.info("Error loading segmenter, exiting...");
        System.exit(0);
      }
    }

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.treePrint(tlpParams);

    if (testFilter != null) {
      if (testPath == null) {
        if (treebankPath == null) {
          throw new RuntimeException("No test treebank path specified...");
        } else {
          log.info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
          testPath = treebankPath;
        }
      }
      testTreebank = tlpParams.testMemoryTreebank();
      testTreebank.loadPath(testPath, testFilter);
    }

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));

    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again.  We also set the tlpParams of the
    // LexicalizedParser instance to be the same object.  This is
    // redundancy that we probably should take out eventually.
    //
    // -- Roger
    if (op.testOptions.verbose) {
      log.info("Lexicon is " + cs.getClass().getName());
    }

    PrintWriter pwOut = tlpParams.pw();
    PrintWriter pwErr = tlpParams.pw(System.err);

    // Now what do we do with the parser we've made
    if (saveToTextFile) {
      // save the parser to textGrammar format
      if (textOutputFileOrUrl != null) {
        saveSegmenterDataToText(cs, textOutputFileOrUrl);
      } else {
        log.info("Usage: must specify a text segmenter data output path");
      }
    }
    if (saveToSerializedFile) {
      if (serializedOutputFileOrUrl == null && argIndex < args.length) {
        // the next argument must be the path to serialize to
        serializedOutputFileOrUrl = args[argIndex];
        argIndex++;
      }
      if (serializedOutputFileOrUrl != null) {
        saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
      } else if (textOutputFileOrUrl == null && testTreebank == null) {
        // no saving/parsing request has been specified
        log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
      }
    }

    /* --------------------- Testing part!!!! ----------------------- */
    if (op.testOptions.verbose) {
      // printOptions(false, op);
    }
    if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
      // test parser on treebank
      if (testTreebank == null) {
        // the next argument is the treebank path and range for testing
        testTreebank = tlpParams.testMemoryTreebank();
        if (args.length < argIndex + 4) {
          testTreebank.loadPath(args[argIndex + 1]);
        } else {
          int testlow = Integer.parseInt(args[argIndex + 2]);
          int testhigh = Integer.parseInt(args[argIndex + 3]);
          testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
        }
      }
      /* TODO - test segmenting on treebank. -pichuan */
      // lp.testOnTreebank(testTreebank);
      // } else if (argIndex >= args.length) {
      //   // no more arguments, so we just parse our own test sentence
      //   if (lp.parse(op.tlpParams.defaultTestSentence())) {
      //     treePrint.printTree(lp.getBestParse(), pwOut);
      //   } else {
      //     pwErr.println("Error. Can't parse test sentence: " +
      //                   lp.parse(op.tlpParams.defaultTestSentence()));
      //   }
    }

    //wsg2010: This code block doesn't actually do anything. It appears to read and tokenize a file, and then just print it.
    //         There are easier ways to do that. This code was copied from an old version of LexicalizedParser.
    //
    // else {
    //   // We parse filenames given by the remaining arguments
    //   int numWords = 0;
    //   Timing timer = new Timing();
    //   // set the tokenizer
    //   if (tokenized) {
    //     tokenizerFactory = WhitespaceTokenizer.factory();
    //   }
    //   TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
    //   if (tokenizerFactory == null) {
    //     tokenizerFactory = (TokenizerFactory<Word>) tlp.getTokenizerFactory();
    //   }
    //   documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    //   documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    //   if (encoding != null) {
    //     documentPreprocessor.setEncoding(encoding);
    //   }
    //   timer.start();
    //   for (int i = argIndex; i < args.length; i++) {
    //     String filename = args[i];
    //     try {
    //       List document = null;
    //       if (fromXML) {
    //         document = documentPreprocessor.getSentencesFromXML(filename, sentenceDelimiter, tokenized);
    //       } else {
    //         document = documentPreprocessor.getSentencesFromText(filename, escaper, sentenceDelimiter, tagDelimiter);
    //       }
    //       log.info("Segmenting file: " + filename + " with " + document.size() + " sentences.");
    //       PrintWriter pwo = pwOut;
    //       if (op.testOptions.writeOutputFiles) {
    //         try {
    //           pwo = tlpParams.pw(new FileOutputStream(filename + ".stp"));
    //         } catch (IOException ioe) {
    //           ioe.printStackTrace();
    //         }
    //       }
    //       int num = 0;
    //       treePrint.printHeader(pwo, tlp.getEncoding());
    //       for (Iterator it = document.iterator(); it.hasNext();) {
    //         num++;
    //         List sentence = (List) it.next();
    //         int len = sentence.size();
    //         numWords += len;
    //         // pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + sentence);
    //         pwo.println(Sentence.listToString(sentence));
    //       }
    //       treePrint.printFooter(pwo);
    //       if (op.testOptions.writeOutputFiles) {
    //         pwo.close();
    //       }
    //     } catch (IOException e) {
    //       pwErr.println("Couldn't find file: " + filename);
    //     }
    //
    //   } // end for each file
    //   long millis = timer.stop();
    //   double wordspersec = numWords / (((double) millis) / 1000);
    //   NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    //   pwErr.println("Segmented " + numWords + " words at " + nf.format(wordspersec) + " words per second.");
    // }
  }

  // serialVersionUID for the serialized segmenter format written/read above.
  // NOTE(review): serializability presumably comes from the implemented
  // interfaces (Lexicon/WordSegmenter) — confirm.
  private static final long serialVersionUID = -6554995189795187918L;

  @Override
  public UnknownWordModel getUnknownWordModel() {
    return chineseLexicon.getUnknownWordModel();
  }

  @Override
  public void setUnknownWordModel(UnknownWordModel uwm) {
    chineseLexicon.setUnknownWordModel(uwm);
  }

  // rawTrees are ignored; only the (annotated) trees are used for training.
  @Override
  public void train(Collection<Tree> trees, Collection<Tree> rawTrees) {
    train(trees);
  }

}