// Stanford Parser -- a probabilistic lexicalized NL CFG parser // Copyright (c) 2002 - 2014 The Board of Trustees of // The Leland Stanford Junior University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software Foundation, // Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. // // For more information, bug reports, fixes, contact: // Christopher Manning // Dept of Computer Science, Gates 1A // Stanford CA 94305-9010 // USA // parser-support@lists.stanford.edu // http://nlp.stanford.edu/software/lex-parser.shtml package edu.stanford.nlp.parser.lexparser; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.ling.TaggedWord; import edu.stanford.nlp.parser.common.ArgUtils; import edu.stanford.nlp.parser.common.ParserGrammar; import edu.stanford.nlp.parser.common.ParserQuery; import edu.stanford.nlp.parser.common.ParserUtils; import edu.stanford.nlp.parser.metrics.Eval; import edu.stanford.nlp.parser.metrics.ParserQueryEval; import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.util.ErasureUtils; import java.util.function.Function; import edu.stanford.nlp.util.HashIndex; import edu.stanford.nlp.util.Index; import edu.stanford.nlp.tagger.io.TaggedFileRecord; import edu.stanford.nlp.trees.*; import 
edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.ReflectionLoading; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.Timing; import edu.stanford.nlp.util.Triple; import edu.stanford.nlp.util.concurrent.MulticoreWrapper; import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor; import edu.stanford.nlp.util.logging.Redwood; import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; /** * This class provides the top-level API and command-line interface to a set * of reasonably good treebank-trained parsers. The name reflects the main * factored parsing model, which provides a lexicalized PCFG parser * implemented as a product * model of a plain PCFG parser and a lexicalized dependency parser. * But you can also run either component parser alone. In particular, it * is often useful to do unlexicalized PCFG parsing by using just that * component parser. * <p> * See the package documentation for more details and examples of use. * <p> * For information on invoking the parser from the command-line, and for * a more detailed list of options, see the {@link #main} method. * <p> * Note that training on a 1 million word treebank requires a fair amount of * memory to run. Try -mx1500m to increase the memory allocated by the JVM. * * @author Dan Klein (original version) * @author Christopher Manning (better features, ParserParams, serialization) * @author Roger Levy (internationalization) * @author Teg Grenager (grammar compaction, tokenization, etc.) 
* @author Galen Andrew (considerable refactoring)
 * @author John Bauer (made threadsafe)
 */
public class LexicalizedParser extends ParserGrammar implements Serializable {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(LexicalizedParser.class);

  // The trained model components: lexicon, binary and unary phrasal rules,
  // and the dependency grammar used by the factored product model (may be null
  // for a PCFG-only parser -- see the null check when writing it out below).
  public Lexicon lex;
  public BinaryGrammar bg;
  public UnaryGrammar ug;
  public DependencyGrammar dg;
  // Shared symbol tables mapping grammar states, words, and tags to integer ids.
  public Index<String> stateIndex, wordIndex, tagIndex;

  private Options op;

  @Override
  public Options getOp() { return op; }

  // Optional reranker; when non-null, parserQuery() wraps queries in a RerankingParserQuery.
  public Reranker reranker; // = null;

  @Override
  public TreebankLangParserParams getTLPParams() { return op.tlpParams; }

  @Override
  public TreebankLanguagePack treebankLanguagePack() { return getTLPParams().treebankLanguagePack(); }

  @Override
  public String[] defaultCoreNLPFlags() {
    return getTLPParams().defaultCoreNLPFlags();
  }

  @Override
  public boolean requiresTags() {
    return false;
  }

  // System property that, when set, names the serialized grammar to load by default.
  private static final String SERIALIZED_PARSER_PROPERTY = "edu.stanford.nlp.SerializedLexicalizedParser";
  // Default grammar location; overridable via the NLP_PARSER environment variable.
  public static final String DEFAULT_PARSER_LOC = ((System.getenv("NLP_PARSER") != null) ? System.getenv("NLP_PARSER") : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

  /**
   * Construct a new LexicalizedParser object from a previously
   * serialized grammar read from a System property
   * {@code edu.stanford.nlp.SerializedLexicalizedParser}, or a
   * default classpath location
   * ({@code edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz}).
   */
  public static LexicalizedParser loadModel() {
    return loadModel(new Options());
  }

  /**
   * Construct a new LexicalizedParser object from a previously
   * serialized grammar read from a System property
   * {@code edu.stanford.nlp.SerializedLexicalizedParser}, or a
   * default classpath location
   * ({@code edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz}).
   *
   * @param op Options to the parser. These get overwritten by the
   *     Options read from the serialized parser; I think the only
   *     thing determined by them is the encoding of the grammar
   *     iff it is a text grammar
   */
  public static LexicalizedParser loadModel(Options op, String ... extraFlags) {
    String source = System.getProperty(SERIALIZED_PARSER_PROPERTY);
    if (source == null) {
      source = DEFAULT_PARSER_LOC;
    }
    return loadModel(source, op, extraFlags);
  }

  public static LexicalizedParser loadModel(String parserFileOrUrl, String ... extraFlags) {
    return loadModel(parserFileOrUrl, new Options(), extraFlags);
  }

  public static LexicalizedParser loadModel(String parserFileOrUrl, List<String> extraFlags) {
    String[] flags = new String[extraFlags.size()];
    extraFlags.toArray(flags);
    return loadModel(parserFileOrUrl, flags);
  }

  /**
   * Construct a new LexicalizedParser.  This loads a grammar
   * that was previously assembled and stored as a serialized file.
   *
   * @param parserFileOrUrl Filename/URL to load parser from
   * @param op Options for this parser. These will normally be overwritten
   *     by options stored in the file
   * @throws IllegalArgumentException If parser data cannot be loaded
   */
  public static LexicalizedParser loadModel(String parserFileOrUrl, Options op, String ... extraFlags) {
    // log.info("Loading parser from file " + parserFileOrUrl);
    // NOTE(review): getParserFromFile can return null (text-file fallback swallows
    // IOException below), in which case setOptionFlags would NPE -- verify intent.
    LexicalizedParser parser = getParserFromFile(parserFileOrUrl, op);
    if (extraFlags.length > 0) {
      parser.setOptionFlags(extraFlags);
    }
    return parser;
  }

  /**
   * Reads one object from the given ObjectInputStream, which is
   * assumed to be a LexicalizedParser.  Throws a ClassCastException
   * if this is not true.  The stream is not closed.
*/ public static LexicalizedParser loadModel(ObjectInputStream ois) { try { Object o = ois.readObject(); if (o instanceof LexicalizedParser) { return (LexicalizedParser) o; } throw new ClassCastException("Wanted LexicalizedParser, got " + o.getClass()); } catch (IOException e) { throw new RuntimeIOException(e); } catch (ClassNotFoundException e) { throw new RuntimeException(e); } } public static LexicalizedParser loadModelFromZip(String zipFilename, String modelName) { LexicalizedParser parser = null; try { File file = new File(zipFilename); if (file.exists()) { ZipFile zin = new ZipFile(file); ZipEntry zentry = zin.getEntry(modelName); if (zentry != null) { InputStream in = zin.getInputStream(zentry); // gunzip it if necessary if (modelName.endsWith(".gz")) { in = new GZIPInputStream(in); } ObjectInputStream ois = new ObjectInputStream(in); parser = loadModel(ois); ois.close(); in.close(); } zin.close(); } else { throw new FileNotFoundException("Could not find " + modelName + " inside " + zipFilename); } } catch (IOException e) { throw new RuntimeIOException(e); } return parser; } public static LexicalizedParser copyLexicalizedParser(LexicalizedParser parser) { return new LexicalizedParser(parser.lex, parser.bg, parser.ug, parser.dg, parser.stateIndex, parser.wordIndex, parser.tagIndex, parser.op); } public LexicalizedParser(Lexicon lex, BinaryGrammar bg, UnaryGrammar ug, DependencyGrammar dg, Index<String> stateIndex, Index<String> wordIndex, Index<String> tagIndex, Options op) { this.lex = lex; this.bg = bg; this.ug = ug; this.dg = dg; this.stateIndex = stateIndex; this.wordIndex = wordIndex; this.tagIndex = tagIndex; this.op = op; } /** * Construct a new LexicalizedParser. 
*
   * @param trainTreebank a treebank to train from
   */
  public static LexicalizedParser trainFromTreebank(Treebank trainTreebank,
                                                    GrammarCompactor compactor,
                                                    Options op) {
    return getParserFromTreebank(trainTreebank, null, 1.0, compactor, op, null, null);
  }

  // Convenience overload: load the treebank from disk, then train.
  public static LexicalizedParser trainFromTreebank(String treebankPath,
                                                    FileFilter filt,
                                                    Options op) {
    return trainFromTreebank(makeTreebank(treebankPath, op, filt), op);
  }

  // Convenience overload: train without grammar compaction.
  public static LexicalizedParser trainFromTreebank(Treebank trainTreebank,
                                                    Options op) {
    return trainFromTreebank(trainTreebank, null, op);
  }

  /**
   * Will process a list of strings into a list of HasWord and return
   * the parse tree associated with that list.
   */
  public Tree parseStrings(List<String> lst) {
    List<Word> words = new ArrayList<>();
    for (String word : lst) {
      words.add(new Word(word));
    }
    return parse(words);
  }

  /**
   * Parses the list of HasWord.  If the parse fails for some reason,
   * an X tree is returned instead of barfing.
   */
  public Tree parse(List<? extends HasWord> lst) {
    try {
      ParserQuery pq = parserQuery();
      if (pq.parse(lst)) {
        Tree bestparse = pq.getBestParse();
        // -10000 denotes unknown words
        bestparse.setScore(pq.getPCFGScore() % -10000.0);
        return bestparse;
      }
    } catch (Exception e) {
      log.info("Following exception caught during parsing:");
      e.printStackTrace();
      log.info("Recovering using fall through strategy: will construct an (X ...) tree.");
    }
    // if can't parse or exception, fall through
    return ParserUtils.xTree(lst);
  }

  /** Parses each sentence sequentially on the calling thread. */
  public List<Tree> parseMultiple(final List<? extends List<? extends HasWord>> sentences) {
    List<Tree> trees = new ArrayList<>();
    for (List<? extends HasWord> sentence : sentences) {
      trees.add(parse(sentence));
    }
    return trees;
  }

  /**
   * Will launch multiple threads which calls {@code parse} on
   * each of the {@code sentences} in order, returning the
   * resulting parse trees in the same order.
   */
  public List<Tree> parseMultiple(final List<? extends List<? extends HasWord>> sentences, final int nthreads) {
    MulticoreWrapper<List<? extends HasWord>, Tree> wrapper =
        new MulticoreWrapper<>(nthreads, new ThreadsafeProcessor<List<? extends HasWord>, Tree>() {
          @Override
          public Tree process(List<? extends HasWord> sentence) {
            return parse(sentence);
          }
          @Override
          public ThreadsafeProcessor<List<? extends HasWord>, Tree> newInstance() {
            // parse() is threadsafe, so the same processor can be shared
            return this;
          }
        });
    List<Tree> trees = new ArrayList<>();
    for (List<? extends HasWord> sentence : sentences) {
      wrapper.put(sentence);
      // drain any results that have already completed, preserving input order
      while (wrapper.peek()) {
        trees.add(wrapper.poll());
      }
    }
    wrapper.join();
    while (wrapper.peek()) {
      trees.add(wrapper.poll());
    }
    return trees;
  }

  /** Return a TreePrint for formatting parsed output trees.
   *  @return A TreePrint for formatting parsed output trees.
   */
  public TreePrint getTreePrint() {
    return op.testOptions.treePrint(op.tlpParams);
  }

  /**
   * Similar to parse(), but instead of returning an X tree on failure, returns null.
   */
  public Tree parseTree(List<? extends HasWord> sentence) {
    ParserQuery pq = parserQuery();
    if (pq.parse(sentence)) {
      return pq.getBestParse();
    } else {
      return null;
    }
  }

  @Override
  public List<Eval> getExtraEvals() {
    if (reranker != null) {
      return reranker.getEvals();
    } else {
      return Collections.emptyList();
    }
  }

  @Override
  public List<ParserQueryEval> getParserQueryEvals() {
    return Collections.emptyList();
  }

  @Override
  public ParserQuery parserQuery() {
    if (reranker == null) {
      return new LexicalizedParserQuery(this);
    } else {
      // wrap the base query so n-best candidates get rescored by the reranker
      return new RerankingParserQuery(op, new LexicalizedParserQuery(this), reranker);
    }
  }

  public LexicalizedParserQuery lexicalizedParserQuery() {
    return new LexicalizedParserQuery(this);
  }

  /** Tries to load as a serialized grammar first, falling back to text format. */
  public static LexicalizedParser getParserFromFile(String parserFileOrUrl, Options op) {
    LexicalizedParser pd = getParserFromSerializedFile(parserFileOrUrl);
    if (pd == null) {
      pd = getParserFromTextFile(parserFileOrUrl, op);
    }
    return pd;
  }

  /** Loads a disk treebank from treebankPath, optionally filtered by filt. */
  private static Treebank makeTreebank(String treebankPath, Options op, FileFilter filt) {
log.info("Training a parser from treebank dir: " + treebankPath);
    Treebank trainTreebank = op.tlpParams.diskTreebank();
    log.info("Reading trees...");
    if (filt == null) {
      trainTreebank.loadPath(treebankPath);
    } else {
      trainTreebank.loadPath(treebankPath, filt);
    }
    Timing.tick("done [read " + trainTreebank.size() + " trees].");
    return trainTreebank;
  }

  /** Loads an additional disk treebank used as secondary training data. */
  private static DiskTreebank makeSecondaryTreebank(String treebankPath, Options op, FileFilter filt) {
    log.info("Additionally training using secondary disk treebank: " + treebankPath + ' ' + filt);
    DiskTreebank trainTreebank = op.tlpParams.diskTreebank();
    log.info("Reading trees...");
    if (filt == null) {
      trainTreebank.loadPath(treebankPath);
    } else {
      trainTreebank.loadPath(treebankPath, filt);
    }
    Timing.tick("done [read " + trainTreebank.size() + " trees].");
    return trainTreebank;
  }

  public Lexicon getLexicon() {
    return lex;
  }

  /**
   * Saves the parser defined by pd to the given filename.
   * If there is an error, a RuntimeIOException is thrown.
   */
  public void saveParserToSerialized(String filename) {
    try {
      log.info("Writing parser in serialized format to file " + filename + ' ');
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(this);
      out.close();
      log.info("done.");
    } catch (IOException ioe) {
      throw new RuntimeIOException(ioe);
    }
  }

  /**
   * Saves the parser defined by pd to the given filename.
   * If there is an error, a RuntimeIOException is thrown.
   */
  // todo: [cdm 2015] This doesn't use character encoding and it should!
public void saveParserToTextFile(String filename) {
    if (reranker != null) {
      throw new UnsupportedOperationException("Sorry, but parsers with rerankers cannot be saved to text file");
    }
    try {
      log.info("Writing parser in text grammar format to file " + filename);
      OutputStream os;
      if (filename.endsWith(".gz")) {
        // it's faster to do the buffering _outside_ the gzipping as here
        os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename)));
      } else {
        os = new BufferedOutputStream(new FileOutputStream(filename));
      }
      PrintWriter out = new PrintWriter(os);
      String prefix = "BEGIN ";
      // Each model component is written as a "BEGIN <NAME>" block; the reader
      // in getParserFromTextFile consumes them in exactly this order.
      out.println(prefix + "OPTIONS");
      op.writeData(out);
      out.println();
      log.info(".");
      out.println(prefix + "STATE_INDEX");
      stateIndex.saveToWriter(out);
      out.println();
      log.info(".");
      out.println(prefix + "WORD_INDEX");
      wordIndex.saveToWriter(out);
      out.println();
      log.info(".");
      out.println(prefix + "TAG_INDEX");
      tagIndex.saveToWriter(out);
      out.println();
      log.info(".");
      String uwmClazz = ((lex.getUnknownWordModel() == null) ? "null" :
                         lex.getUnknownWordModel().getClass().getCanonicalName());
      out.println(prefix + "LEXICON " + uwmClazz);
      lex.writeData(out);
      out.println();
      log.info(".");
      out.println(prefix + "UNARY_GRAMMAR");
      ug.writeData(out);
      out.println();
      log.info(".");
      out.println(prefix + "BINARY_GRAMMAR");
      bg.writeData(out);
      out.println();
      log.info(".");
      out.println(prefix + "DEPENDENCY_GRAMMAR");
      if (dg != null) {
        dg.writeData(out);
      }
      out.println();
      log.info(".");
      out.flush();
      out.close();
      log.info("done.");
    } catch (IOException e) {
      log.info("Trouble saving parser data to ASCII format.");
      throw new RuntimeIOException(e);
    }
  }

  /** Sanity check that the next section header of a text grammar was found. */
  private static void confirmBeginBlock(String file, String line) {
    if (line == null) {
      throw new RuntimeException(file + ": expecting BEGIN block; got end of file.");
    } else if (! line.startsWith("BEGIN")) {
      throw new RuntimeException(file + ": expecting BEGIN block; got " + line);
    }
  }

  /**
   * Reads a parser written by saveParserToTextFile.  The BEGIN blocks must be
   * read back in the same order they were written.  Returns null on IOException
   * so that getParserFromFile can report failure.
   */
  protected static LexicalizedParser getParserFromTextFile(String textFileOrUrl, Options op) {
    try {
      Timing tim = new Timing();
      BufferedReader in = IOUtils.readerFromString(textFileOrUrl);
      Timing.startTime();

      String line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      op.readData(in);

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> stateIndex = HashIndex.loadFromReader(in);

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> wordIndex = HashIndex.loadFromReader(in);

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> tagIndex = HashIndex.loadFromReader(in);

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Lexicon lex = op.tlpParams.lex(op, wordIndex, tagIndex);
      // the LEXICON header carries the unknown-word-model class name as its third token
      String uwmClazz = line.split(" +")[2];
      if (!uwmClazz.equals("null")) {
        UnknownWordModel model = ReflectionLoading.loadByReflection(uwmClazz, op, lex, wordIndex, tagIndex);
        lex.setUnknownWordModel(model);
      }
      lex.readData(in);

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      UnaryGrammar ug = new UnaryGrammar(stateIndex);
      ug.readData(in);

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      BinaryGrammar bg = new BinaryGrammar(stateIndex);
      bg.readData(in);

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      DependencyGrammar dg = new MLEDependencyGrammar(op.tlpParams, op.directional, op.distance, op.coarseDistance, op.trainOptions.basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
      dg.readData(in);

      in.close();
      log.info("Loading parser from text file " + textFileOrUrl + " ... done [" + tim.toSecondsString() + " sec].");
      return new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
    } catch (IOException e) {
      e.printStackTrace();
    }
    return null;
  }

  /**
   * Loads a serialized parser; returns null when the data looks like a text
   * grammar (StreamCorruptedException) or on other recoverable failures, so
   * getParserFromFile can fall back to the text reader.
   */
  public static LexicalizedParser getParserFromSerializedFile(String serializedFileOrUrl) {
    try {
      Timing tim = new Timing();
      ObjectInputStream in = IOUtils.readStreamFromString(serializedFileOrUrl);
      LexicalizedParser pd = loadModel(in);

      in.close();
      log.info("Loading parser from serialized file " + serializedFileOrUrl + " ... done [" + tim.toSecondsString() + " sec].");
      return pd;
    } catch (InvalidClassException ice) {
      // For this, it's not a good idea to continue and try it as a text file!
      throw new RuntimeException("Invalid class in file: " + serializedFileOrUrl, ice);
    } catch (FileNotFoundException fnfe) {
      // For this, it's not a good idea to continue and try it as a text file!
      throw new RuntimeException("File not found: " + serializedFileOrUrl, fnfe);
    } catch (StreamCorruptedException sce) {
      // suppress error message, on the assumption that we've really got
      // a text grammar, and that'll be tried next
    } catch (Exception e) {
      e.printStackTrace();
    }
    return null;
  }

  /** Prints the active option sets; train selects trainOptions vs testOptions. */
  private static void printOptions(boolean train, Options op) {
    op.display();
    if (train) {
      op.trainOptions.display();
    } else {
      op.testOptions.display();
    }
    op.tlpParams.display();
  }

  /** Builds the tree annotator/binarizer used for training, honoring leftToRight. */
  public static TreeAnnotatorAndBinarizer buildTrainBinarizer(Options op) {
    TreebankLangParserParams tlpParams = op.tlpParams;
    if (!op.trainOptions.leftToRight) {
      return new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), !op.trainOptions.predictSplits, op);
    } else {
      return new TreeAnnotatorAndBinarizer(tlpParams.headFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), !op.trainOptions.predictSplits, op);
    }
  }

  public static CompositeTreeTransformer buildTrainTransformer(Options op) {
    TreeAnnotatorAndBinarizer binarizer = buildTrainBinarizer(op);
    return
buildTrainTransformer(op, binarizer);
  }

  // todo [cdm2015]: This method should be used in TreeAnnotatorAndBinarizer#getAnnotatedBinaryTreebankFromTreebank and moved to that class
  /** Chains the configured pre-, punctuation-, binarization, and word-function transforms. */
  public static CompositeTreeTransformer buildTrainTransformer(Options op, TreeAnnotatorAndBinarizer binarizer) {
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    if (op.trainOptions.preTransformer != null) {
      trainTransformer.addTransformer(op.trainOptions.preTransformer);
    }
    if (op.trainOptions.collinsPunc) {
      CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
      trainTransformer.addTransformer(collinsPuncTransformer);
    }
    trainTransformer.addTransformer(binarizer);
    if (op.wordFunction != null) {
      TreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
      trainTransformer.addTransformer(wordFunctionTransformer);
    }
    return trainTransformer;
  }

  /** @return A triple of binaryTrainTreebank, binarySecondaryTrainTreebank, binaryTuneTreebank.
   */
  @SuppressWarnings("UnusedDeclaration")
  // todo [cdm2015]: This method should be difference-resolved with TreeAnnotatorAndBinarizer#getAnnotatedBinaryTreebankFromTreebank and then deleted
  public static Triple<Treebank, Treebank, Treebank> getAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank,
                                                                                            Treebank secondaryTreebank,
                                                                                            Treebank tuneTreebank,
                                                                                            Options op) {
    // setup tree transforms
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();

    if (op.testOptions.verbose) {
      PrintWriter pwErr = tlpParams.pw(System.err);
      pwErr.print("Training ");
      pwErr.println(trainTreebank.textualSummary(tlp));
      if (secondaryTreebank != null) {
        pwErr.print("Secondary training ");
        pwErr.println(secondaryTreebank.textualSummary(tlp));
      }
    }

    log.info("Binarizing trees...");

    TreeAnnotatorAndBinarizer binarizer = buildTrainBinarizer(op);
    CompositeTreeTransformer trainTransformer = buildTrainTransformer(op, binarizer);

    // statistics for split selection are gathered over primary + secondary data
    Treebank wholeTreebank;
    if (secondaryTreebank == null) {
      wholeTreebank = trainTreebank;
    } else {
      wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    }

    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
      removeDeleteSplittersFromSplitters(tlp, op);
      if (op.testOptions.verbose) {
        List<String> list = new ArrayList<>(op.trainOptions.splitters);
        Collections.sort(list);
        log.info("Parent split categories: " + list);
      }
    }

    if (op.trainOptions.selectivePostSplit) {
      // Do all the transformations once just to learn selective splits on annotated categories
      TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
      wholeTreebank = wholeTreebank.transform(myTransformer);
      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
      if (op.testOptions.verbose) {
        log.info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
      }
    }

    if (op.trainOptions.hSelSplit) {
      // We run through all the trees once just to gather counts for hSelSplit!
      int ptt = op.trainOptions.printTreeTransformations;
      op.trainOptions.printTreeTransformations = 0;
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : wholeTreebank) {
        trainTransformer.transformTree(tree);
      }
      binarizer.setDoSelectiveSplit(true);
      op.trainOptions.printTreeTransformations = ptt;
    }

    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.transform(trainTransformer);
    if (secondaryTreebank != null) {
      secondaryTreebank = secondaryTreebank.transform(trainTransformer);
    }
    if (op.trainOptions.printAnnotatedStateCounts) {
      binarizer.printStateCounts();
    }
    if (op.trainOptions.printAnnotatedRuleCounts) {
      binarizer.printRuleCounts();
    }

    if (tuneTreebank != null) {
      tuneTreebank = tuneTreebank.transform(trainTransformer);
    }

    Timing.tick("done.");
    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }

    return new Triple<>(trainTreebank, secondaryTreebank, tuneTreebank);
  }

  /** Drops any trainOptions.deleteSplitters (by basic category if applicable) from splitters. */
  private static void removeDeleteSplittersFromSplitters(TreebankLanguagePack tlp, Options op) {
    if (op.trainOptions.deleteSplitters != null) {
      List<String> deleted = new ArrayList<>();
      for (String del : op.trainOptions.deleteSplitters) {
        String baseDel = tlp.basicCategory(del);
        // if del has no annotation, match any splitter with the same basic category
        boolean checkBasic = del.equals(baseDel);
        for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
          String elem = it.next();
          String baseElem = tlp.basicCategory(elem);
          boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
          if (delStr) {
            it.remove();
            deleted.add(elem);
          }
        }
      }
      if (op.testOptions.verbose) {
        log.info("Removed from vertical splitters: " + deleted);
      }
    }
  }

  // TODO: Make below method work with arbitrarily
large secondary treebank via iteration
  // TODO: Have weight implemented for training lexicon

  /**
   * A method for training from two different treebanks, the second of which is presumed
   * to be orders of magnitude larger.
   * <p>
   * Trees are not read into memory but processed as they are read from disk.
   * <p>
   * A weight (typically &lt;= 1) can be put on the second treebank.
   *
   * @param trainTreebank A treebank to train from
   * @param secondaryTrainTreebank Another treebank to train from
   * @param weight A weight factor to give the secondary treebank. If the weight
   *     is 0.25, each example in the secondaryTrainTreebank will be treated as
   *     1/4 of an example sentence.
   * @param compactor A class for compacting grammars. May be null.
   * @param op Options for how the grammar is built from the treebank
   * @param tuneTreebank A treebank to tune free params on (may be null)
   * @param extraTaggedWords A list of words to add to the Lexicon
   * @return The trained LexicalizedParser
   */
  public static LexicalizedParser getParserFromTreebank(Treebank trainTreebank,
                                                        Treebank secondaryTrainTreebank,
                                                        double weight,
                                                        GrammarCompactor compactor,
                                                        Options op,
                                                        Treebank tuneTreebank,
                                                        List<List<TaggedWord>> extraTaggedWords) {
    // log.info("Currently " + new Date()); // now printed when command-line args are printed
    printOptions(true, op);
    Timing.startTime();

    // annotate and binarize all three treebanks with the training transforms
    Triple<Treebank, Treebank, Treebank> treebanks = TreeAnnotatorAndBinarizer.getAnnotatedBinaryTreebankFromTreebank(trainTreebank, secondaryTrainTreebank, tuneTreebank, op);
    Timing.tick("done.");

    // keep the untransformed trees: the lexicon needs their CoreLabels (see below)
    Treebank trainTreebankRaw = trainTreebank;
    trainTreebank = treebanks.first();
    secondaryTrainTreebank = treebanks.second();
    tuneTreebank = treebanks.third();

    // +1 to account for the boundary symbol
    trainTreebank = new FilteringTreebank(trainTreebank, new LengthTreeFilter(op.trainOptions.trainLengthLimit + 1));
    if (secondaryTrainTreebank != null) {
      secondaryTrainTreebank = new FilteringTreebank(secondaryTrainTreebank, new LengthTreeFilter(op.trainOptions.trainLengthLimit + 1));
    }
    if (tuneTreebank != null) {
      tuneTreebank = new FilteringTreebank(tuneTreebank, new LengthTreeFilter(op.trainOptions.trainLengthLimit + 1));
    }

    Index<String> stateIndex;
    Index<String> wordIndex;
    Index<String> tagIndex;

    Pair<UnaryGrammar, BinaryGrammar> bgug;
    Lexicon lex;

    if (op.trainOptions.predictSplits) {
      // state-splitting training: the extractor produces grammar, lexicon, and indices together
      SplittingGrammarExtractor extractor = new SplittingGrammarExtractor(op);
      log.info("Extracting PCFG...");
      // TODO: make use of the tagged text
      if (secondaryTrainTreebank == null) {
        extractor.extract(trainTreebank);
      } else {
        extractor.extract(trainTreebank, 1.0, secondaryTrainTreebank, weight);
      }
      bgug = extractor.bgug;
      lex = extractor.lex;
      stateIndex = extractor.stateIndex;
      wordIndex = extractor.wordIndex;
      tagIndex = extractor.tagIndex;
      Timing.tick("done.");
    } else {
      stateIndex = new HashIndex<>();
      wordIndex = new HashIndex<>();
      tagIndex = new HashIndex<>();

      // extract grammars
      BinaryGrammarExtractor bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
      // Extractor lexExtractor = new LexiconExtractor();
      //TreeExtractor uwmExtractor = new UnknownWordModelExtractor(trainTreebank.size());
      log.info("Extracting PCFG...");
      if (secondaryTrainTreebank == null) {
        bgug = bgExtractor.extract(trainTreebank);
      } else {
        bgug = bgExtractor.extract(trainTreebank, 1.0, secondaryTrainTreebank, weight);
      }
      Timing.tick("done.");

      log.info("Extracting Lexicon...");
      lex = op.tlpParams.lex(op, wordIndex, tagIndex);

      // effective training size counts secondary trees at their weight
      double trainSize = trainTreebank.size();
      if (secondaryTrainTreebank != null) {
        trainSize += (secondaryTrainTreebank.size() * weight);
      }
      if (extraTaggedWords != null) {
        trainSize += extraTaggedWords.size();
      }

      lex.initializeTraining(trainSize);
      // wsg2012: The raw treebank has CoreLabels, which we need for FactoredLexicon
      // training. If TreeAnnotator is updated so that it produces CoreLabels, then we can
      // remove the trainTreebankRaw.
      lex.train(trainTreebank, trainTreebankRaw);
      if (secondaryTrainTreebank != null) {
        lex.train(secondaryTrainTreebank, weight);
      }
      if (extraTaggedWords != null) {
        for (List<TaggedWord> sentence : extraTaggedWords) {
          // TODO: specify a weight?
          lex.trainUnannotated(sentence, 1.0);
        }
      }
      lex.finishTraining();
      Timing.tick("done.");
    }

    //TODO: wsg2011 Not sure if this should come before or after
    //grammar compaction
    if (op.trainOptions.ruleSmoothing) {
      log.info("Smoothing PCFG...");
      Function<Pair<UnaryGrammar,BinaryGrammar>,Pair<UnaryGrammar,BinaryGrammar>> smoother = new LinearGrammarSmoother(op.trainOptions, stateIndex, tagIndex);
      bgug = smoother.apply(bgug);
      Timing.tick("done.");
    }

    if (compactor != null) {
      log.info("Compacting grammar...");
      Triple<Index<String>, UnaryGrammar, BinaryGrammar> compacted = compactor.compactGrammar(bgug, stateIndex);
      stateIndex = compacted.first();
      bgug.setFirst(compacted.second());
      bgug.setSecond(compacted.third());
      Timing.tick("done.");
    }

    log.info("Compiling grammar...");
    BinaryGrammar bg = bgug.second;
    bg.splitRules();
    UnaryGrammar ug = bgug.first;
    ug.purgeRules();
    Timing.tick("done");

    DependencyGrammar dg = null;
    if (op.doDep) {
      log.info("Extracting Dependencies...");
      AbstractTreeExtractor<DependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
      if (secondaryTrainTreebank == null) {
        dg = dgExtractor.extract(trainTreebank);
      } else {
        dg = dgExtractor.extract(trainTreebank, 1.0, secondaryTrainTreebank, weight);
      }
      //log.info("Extracting Unknown Word Model...");
      //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(trainTreebank);
      //Timing.tick("done.");
      Timing.tick("done.");
      if (tuneTreebank != null) {
        log.info("Tuning Dependency Model...");
        dg.setLexicon(lex); // MG2008: needed if using PwGt model
        dg.tune(tuneTreebank);
        Timing.tick("done.");
      }
    }

    log.info("Done training parser.");
    if (op.trainOptions.trainTreeFile!=null) {
      try {
        log.info("Writing out binary trees to "+ op.trainOptions.trainTreeFile+"...");
        // NOTE(review): both treebanks are written to the same trainTreeFile; if
        // writeObjectToFile truncates rather than appends, the second write
        // clobbers the first -- verify IOUtils semantics.
        IOUtils.writeObjectToFile(trainTreebank, op.trainOptions.trainTreeFile);
        IOUtils.writeObjectToFile(secondaryTrainTreebank, op.trainOptions.trainTreeFile);
        Timing.tick("done.");
      } catch (Exception e) {
        log.info("Problem writing out binary trees.");
      }
    }
    return new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
  }

  /**
   * This will set options to the parser, in a way exactly equivalent to
   * passing in the same sequence of command-line arguments.  This is a useful
   * convenience method when building a parser programmatically. The options
   * passed in should
   * be specified like command-line arguments, including with an initial
   * minus sign.
   * <p>
   * <i>Notes:</i> This can be used to set parsing-time flags for a
   * serialized parser.  You can also still change things serialized
   * in Options, but this will probably degrade parsing performance.
   * The vast majority of command line flags can be passed to this
   * method, but you cannot pass in options that specify the treebank
   * or grammar to be loaded, the grammar to be written, trees or
   * files to be parsed or details of their encoding, nor the
   * TreebankLangParserParams ({@code -tLPP}) to use. The
   * TreebankLangParserParams should be set up on construction of a
   * LexicalizedParser, by constructing an Options that uses
   * the required TreebankLangParserParams, and passing that to a
   * LexicalizedParser constructor.  Note that despite this
   * method being an instance method, many flags are actually set as
   * static class variables.
   *
   * @param flags Arguments to the parser, for example,
   *     {"-outputFormat", "typedDependencies", "-maxLength", "70"}
   * @throws IllegalArgumentException If an unknown flag is passed in
   */
  @Override
  public void setOptionFlags(String... flags) {
    op.setOptions(flags);
  }

  /**
   * A main program for using the parser with various options.
* This program can be used for building and serializing * a parser from treebank data, for parsing sentences from a file * or URL using a serialized or text grammar parser, * and (mainly for parser quality testing) * for training and testing a parser on a treebank all in one go. * * <p> * Sample Usages: * <ul> * <li> <b>Train a parser (saved to <i>serializedGrammarFilename</i>) * from a directory of trees (<i>trainFilesPath</i>, with an optional <i>fileRange</i>, e.g., 0-1000):</b> * {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -saveToSerializedFile serializedGrammarFilename} * </li> * * <li> <b>Train a parser (not saved) from a directory of trees, and test it (reporting scores) on a directory of trees</b> * {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -testTreebank testFilePath [fileRange] } * </li> * * <li> <b>Parse one or more files, given a serialized grammar and a list of files</b> * {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] serializedGrammarPath filename [filename]*} * </li> * * <li> <b>Test and report scores for a serialized grammar on trees in an output directory</b> * {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -loadFromSerializedFile serializedGrammarPath -testTreebank testFilePath [fileRange]} * </li> * </ul> * *<p> * If the {@code serializedGrammarPath} ends in {@code .gz}, * then the grammar is written and read as a compressed file (GZip). * If the {@code serializedGrammarPath} is a URL, starting with * {@code http://}, then the parser is read from the URL. * A fileRange specifies a numeric value that must be included within a * filename for it to be used in training or testing (this works well with * most current treebanks). 
It can be specified like a range of pages to be * printed, for instance as {@code 200-2199} or * {@code 1-300,500-725,9000} or just as {@code 1} (if all your * trees are in a single file, either omit this parameter or just give a dummy * argument such as {@code 0}). * If the filename to parse is "-" then the parser parses from stdin. * If no files are supplied to parse, then a hardwired sentence * is parsed. * * <p> * The parser can write a grammar as either a serialized Java object file * or in a text format (or as both), specified with the following options: * <blockquote>{@code * java edu.stanford.nlp.parser.lexparser.LexicalizedParser * [-v] -train * trainFilesPath [fileRange] [-saveToSerializedFile grammarPath] * [-saveToTextFile grammarPath] * }</blockquote> * * <p> * In the same position as the verbose flag ({@code -v}), many other * options can be specified. The most useful to an end user are: * <ul> * <LI>{@code -tLPP class} Specify a different * TreebankLangParserParams, for when using a different language or * treebank (the default is English Penn Treebank). <i>This option MUST occur * before any other language-specific options that are used (or else they * are ignored!).</i> * (It's usually a good idea to specify this option even when loading a * serialized grammar; it is necessary if the language pack specifies a * needed character encoding or you wish to specify language-specific * options on the command line.)</LI> * <LI>{@code -encoding charset} Specify the character encoding of the * input and output files. This will override the value in the * {@code TreebankLangParserParams}, provided this option appears * <i>after</i> any {@code -tLPP} option.</LI> * <LI>{@code -tokenized} Says that the input is already separated * into whitespace-delimited tokens. If this option is specified, any * tokenizer specified for the language is ignored, and a universal (Unicode) * tokenizer, which divides only on whitespace, is used. 
* Unless you also specify * {@code -escaper}, the tokens <i>must</i> all be correctly * tokenized tokens of the appropriate treebank for the parser to work * well (for instance, if using the Penn English Treebank, you must have * coded "(" as "-LRB-", "3/4" as "3\/4", etc.)</LI> * <li>{@code -escaper class} Specify a class of type * {@link Function}<List<HasWord>,List<HasWord>> to do * customized escaping of tokenized text. This class will be run over the * tokenized text and can fix the representation of tokens. For instance, * it could change "(" to "-LRB-" for the Penn English Treebank. A * provided escaper that does such things for the Penn English Treebank is * {@code edu.stanford.nlp.process.PTBEscapingProcessor} * <li>{@code -tokenizerFactory class} Specifies a * TokenizerFactory class to be used for tokenization</li> * <li>{@code -tokenizerOptions options} Specifies options to a * TokenizerFactory class to be used for tokenization. A comma-separated * list. For PTBTokenizer, options of interest include * {@code americanize=false} and {@code asciiQuotes} (for German). * Note that any choice of tokenizer options that conflicts with the * tokenization used in the parser training data will likely degrade parser * performance. </li> * <li>{@code -sentences token } Specifies a token that marks sentence * boundaries. A value of {@code newline} causes sentence breaking on * newlines. A value of {@code onePerElement} causes each element * (using the XML {@code -parseInside} option) to be treated as a * sentence. All other tokens will be interpreted literally, and must be * exactly the same as tokens returned by the tokenizer. For example, * you might specify "|||" and put that symbol sequence as a token between * sentences. * If no explicit sentence breaking option is chosen, sentence breaking * is done based on a set of language-particular sentence-ending patterns. 
* </li> * <LI>{@code -parseInside element} Specifies that parsing should only * be done for tokens inside the indicated XML-style * elements (done as simple pattern matching, rather than XML parsing). * For example, if this is specified as {@code sentence}, then * the text inside the {@code sentence} element * would be parsed. * Using "-parseInside s" gives you support for the input format of * Charniak's parser. Sentences cannot span elements. Whether the * contents of the element are treated as one sentence or potentially * multiple sentences is controlled by the {@code -sentences} flag. * The default is potentially multiple sentences. * This option gives support for extracting and parsing * text from very simple SGML and XML documents, and is provided as a * user convenience for that purpose. If you want to really parse XML * documents before NLP parsing them, you should use an XML parser, and then * call to a LexicalizedParser on appropriate CDATA. * <LI>{@code -tagSeparator char} Specifies to look for tags on words * following the word and separated from it by a special character * {@code char}. For instance, many tagged corpora have the * representation "house/NN" and you would use {@code -tagSeparator /}. * Notes: This option requires that the input be pretokenized. * The separator has to be only a single character, and there is no * escaping mechanism. However, splitting is done on the <i>last</i> * instance of the character in the token, so that cases like * "3\/4/CD" are handled correctly. The parser will in all normal * circumstances use the tag you provide, but will override it in the * case of very common words in cases where the tag that you provide * is not one that it regards as a possible tagging for the word. * The parser supports a format where only some of the words in a sentence * have a tag (if you are calling the parser programmatically, you indicate * them by having them implement the {@code HasTag} interface). 
* You can do this at the command-line by only having tags after some words,
 * but you are limited by the fact that there is no way to escape the
 * tagSeparator character.</LI>
 * <LI>{@code -maxLength leng} Specify the longest sentence that
 * will be parsed (and hence indirectly the amount of memory
 * needed for the parser). If this is not specified, the parser will
 * try to dynamically grow its parse chart when long sentences are
 * encountered, but may run out of memory trying to do so.</LI>
 * <LI>{@code -outputFormat styles} Choose the style(s) of output
 * sentences: {@code penn} for prettyprinting as in the Penn
 * treebank files, or {@code oneline} for printing sentences one
 * per line, {@code words}, {@code wordsAndTags},
 * {@code dependencies}, {@code typedDependencies},
 * or {@code typedDependenciesCollapsed}.
 * Multiple options may be specified as a comma-separated
 * list. See TreePrint class for further documentation.</LI>
 * <LI>{@code -outputFormatOptions} Provide options that control the
 * behavior of various {@code -outputFormat} choices, such as
 * {@code lexicalize}, {@code stem}, {@code markHeadNodes},
 * or {@code xml}. {@link edu.stanford.nlp.trees.TreePrint}
 * Options are specified as a comma-separated list.</LI>
 * <LI>{@code -writeOutputFiles} Write output files corresponding
 * to the input files, with the same name but a {@code ".stp"}
 * file extension. The format of these files depends on the
 * {@code outputFormat} option. (If not specified, output is sent
 * to stdout.)</LI>
 * <LI>{@code -outputFilesExtension} The extension that is appended to
 * the filename that is being parsed to produce an output file name (with the
 * -writeOutputFiles option). The default is {@code stp}. Don't
 * include the period.
 * <LI>{@code -outputFilesDirectory} The directory in which output
 * files are written (when the -writeOutputFiles option is specified).
 * If not specified, output files are written in the same directory as the
 * input files.
* <LI>{@code -nthreads} Parsing files and testing on treebanks
 * can use multiple threads.  This option tells the parser how many
 * threads to use.  A negative number indicates to use as many
 * threads as the machine has cores.
 * </ul>
 * See also the package documentation for more details and examples of use.
 *
 * @param args Command line arguments, as above
 */
  public static void main(String[] args) {
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank testTreebank = null;
    Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilter = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;

    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
      log.info("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
      return;
    }

    Options op = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encoding = null;
    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
      if (args[argIndex].equalsIgnoreCase("-train") ||
          args[argIndex].equalsIgnoreCase("-trainTreebank")) {
        train = true;
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        treebankPath = treebankDescription.first();
        trainFilter = treebankDescription.second();
      } else if (args[argIndex].equalsIgnoreCase("-train2")) {
        // train = true;     // cdm july 2005: should require -train for this
        Triple<String, FileFilter, Double> treebankDescription = ArgUtils.getWeightedTreebankDescription(args, argIndex, "-train2");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        secondaryTreebankPath = treebankDescription.first();
        secondaryTrainFilter = treebankDescription.second();
        secondaryTreebankWeight = treebankDescription.third();
      } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
        try {
          op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
        } catch (ClassNotFoundException e) {
          log.info("Class not found: " + args[argIndex + 1]);
          throw new RuntimeException(e);
        } catch (InstantiationException e) {
          log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
          throw new RuntimeException(e);
        } catch (IllegalAccessException e) {
          log.info("Illegal access" + e);
          throw new RuntimeException(e);
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
        // sets encoding for TreebankLangParserParams
        // redone later to override any serialized parser one read in
        encoding = args[argIndex + 1];
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
        tokenized = true;
        argIndex += 1;
      } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
        try {
          escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
        } catch (Exception e) {
          log.info("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
        tokenizerOptions = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
        tokenizerFactoryClass = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
        tokenizerMethod = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
        sentenceDelimiter = args[argIndex + 1];
        if (sentenceDelimiter.equalsIgnoreCase("newline")) {
          sentenceDelimiter = "\n";
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
        elementDelimiter = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
        tagDelimiter = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") ||
                 args[argIndex].equalsIgnoreCase("-model")) {
        // load the parser from a binary serialized file
        // the next argument must be the path to the parser file
        serializedInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        // load the parser from declarative text file
        // the next argument must be the path to the parser file
        textInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
        saveToSerializedFile = true;
        if (ArgUtils.numSubArgs(args, argIndex) < 1) {
          // Fixed message: the actual flag is -saveToSerializedFile, not -saveToSerialized.
          log.info("Missing path: -saveToSerializedFile filename");
        } else {
          serializedOutputFileOrUrl = args[argIndex + 1];
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
        // save the parser to declarative text file
        saveToTextFile = true;
        textOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
        // save the training trees to a binary file
        op.trainOptions.trainTreeFile = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-treebank") ||
                 args[argIndex].equalsIgnoreCase("-testTreebank") ||
                 args[argIndex].equalsIgnoreCase("-test")) {
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        testPath = treebankDescription.first();
        testFilter = treebankDescription.second();
      } else if (args[argIndex].equalsIgnoreCase("-tune")) {
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        tunePath = treebankDescription.first();
        tuneFilter = treebankDescription.second();
      } else {
        // Not a flag we recognize here: let Options consume it (or warn).
        int oldIndex = argIndex;
        argIndex = op.setOptionOrWarn(args, argIndex);
        optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
      }
    } // end while loop through arguments

    // all other arguments are order dependent and
    // are processed in order below

    if (tuneFilter != null || tunePath != null) {
      if (tunePath == null) {
        if (treebankPath == null) {
          throw new RuntimeException("No tune treebank path specified...");
        } else {
          log.info("No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');
          tunePath = treebankPath;
        }
      }
      tuneTreebank = op.tlpParams.testMemoryTreebank();
      tuneTreebank.loadPath(tunePath, tuneFilter);
    }

    if (!train && op.testOptions.verbose) {
      StringUtils.logInvocationString(log, args);
    }

    LexicalizedParser lp; // always initialized in next if-then-else block
    if (train) {
      StringUtils.logInvocationString(log, args);

      // so we train a parser using the treebank
      GrammarCompactor compactor = null;
      if (op.trainOptions.compactGrammar() == 3) {
        compactor = new ExactGrammarCompactor(op, false, false);
      }

      Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);

      Treebank secondaryTrainTreebank = null;
      if (secondaryTreebankPath != null) {
        secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
      }

      List<List<TaggedWord>> extraTaggedWords = null;
      if (op.trainOptions.taggedFiles != null) {
        extraTaggedWords = new ArrayList<>();
        List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
        for (TaggedFileRecord record : fileRecords) {
          for (List<TaggedWord> sentence : record.reader()) {
            extraTaggedWords.add(sentence);
          }
        }
      }

      lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords);
    } else if (textInputFileOrUrl != null) {
      // so we load the parser from a text grammar file
      lp = getParserFromTextFile(textInputFileOrUrl, op);
    } else {
      // so we load a serialized parser
      if (serializedInputFileOrUrl == null && argIndex < args.length) {
        // the next argument must be the path to the serialized parser
        serializedInputFileOrUrl = args[argIndex];
        argIndex++;
      }
      if (serializedInputFileOrUrl == null) {
        log.info("No grammar specified, exiting...");
        return;
      }
      String[] extraArgs = new String[optionArgs.size()];
      extraArgs = optionArgs.toArray(extraArgs);
      try {
        lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
        op = lp.op;
      } catch (IllegalArgumentException e) {
        log.info("Error loading parser, exiting...");
        throw e;
      }
    }

    // set up tokenizerFactory with options if provided
    if (tokenizerFactoryClass != null || tokenizerOptions != null) {
      try {
        if (tokenizerFactoryClass != null) {
          Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
          Method factoryMethod;
          if (tokenizerOptions != null) {
            factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
            tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
          } else {
            factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
            tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
          }
        } else {
          // have options but no tokenizer factory.  use the parser
          // langpack's factory and set its options
          tokenizerFactory = lp.op.langpack().getTokenizerFactory();
          tokenizerFactory.setOptions(tokenizerOptions);
        }
      } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
        log.info("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
        throw new RuntimeException(e);
      }
    }

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
    // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
    if (encoding != null) {
      op.tlpParams.setInputEncoding(encoding);
      op.tlpParams.setOutputEncoding(encoding);
    }

    if (testFilter != null || testPath != null) {
      if (testPath == null) {
        if (treebankPath == null) {
          throw new RuntimeException("No test treebank path specified...");
        } else {
          log.info("No test treebank path specified.  Using train path: \"" + treebankPath + '\"');
          testPath = treebankPath;
        }
      }
      testTreebank = op.tlpParams.testMemoryTreebank();
      testTreebank.loadPath(testPath, testFilter);
    }

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again.  -- Roger

    // Now what do we do with the parser we've made
    if (saveToTextFile) {
      // save the parser to textGrammar format
      if (textOutputFileOrUrl != null) {
        lp.saveParserToTextFile(textOutputFileOrUrl);
      } else {
        log.info("Usage: must specify a text grammar output path");
      }
    }

    if (saveToSerializedFile) {
      if (serializedOutputFileOrUrl != null) {
        lp.saveParserToSerialized(serializedOutputFileOrUrl);
      } else if (textOutputFileOrUrl == null && testTreebank == null) {
        // no saving/parsing request has been specified
        log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
      }
    }

    if (op.testOptions.verbose || train) {
      // Tell the user a little or a lot about what we have made
      // get lexicon size separately as it may have its own prints in it....
      String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()): "";
      log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
      log.info("Grammar\t" +
          lp.stateIndex.size() + '\t' +
          lp.tagIndex.size() + '\t' +
          lp.wordIndex.size() + '\t' +
          (lp.ug != null ? lp.ug.numRules(): "") + '\t' +
          (lp.bg != null ? lp.bg.numRules(): "") + '\t' +
          lexNumRules);
      log.info("ParserPack is " + op.tlpParams.getClass().getName());
      // Bug fix: lexNumRules above guards against a null lexicon, but this
      // line previously dereferenced lp.lex unconditionally and would NPE
      // for a parser with no lexicon. Guard it the same way.
      if (lp.lex != null) {
        log.info("Lexicon is " + lp.lex.getClass().getName());
      }
      if (op.testOptions.verbose) {
        log.info("Tags are: " + lp.tagIndex);
        // log.info("States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out by the below printOptions command if the flag -printStates is given (at training time)!
      }
      printOptions(false, op);
    }

    if (testTreebank != null) {
      // test parser on treebank
      EvaluateTreebank evaluator = new EvaluateTreebank(lp);
      evaluator.testOnTreebank(testTreebank);
    } else if (argIndex >= args.length) {
      // no more arguments, so we just parse our own test sentence
      PrintWriter pwOut = op.tlpParams.pw();
      PrintWriter pwErr = op.tlpParams.pw(System.err);
      ParserQuery pq = lp.parserQuery();
      if (pq.parse(op.tlpParams.defaultTestSentence())) {
        lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
      } else {
        pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
      }
    } else {
      // We parse filenames given by the remaining arguments
      ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp);
    }

  } // end main

  private static final long serialVersionUID = 2;

} // end class LexicalizedParser