package edu.stanford.nlp.parser.common;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Objects;
import java.util.function.Function;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.metrics.Eval;
import edu.stanford.nlp.parser.metrics.ParserQueryEval;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.logging.Redwood;

// TODO: it would be nice to move these to common, but that would
// wreck all existing models
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;

/**
 * An interface for the classes which store the data for a parser.
 * Objects which inherit this interface have a way to produce
 * ParserQuery objects, have a general Options object, and return a
 * list of Evals to perform on a parser.  This helps classes such as
 * {@link edu.stanford.nlp.parser.lexparser.EvaluateTreebank}
 * analyze the performance of a parser.
 *
 * TODO: it would be nice to actually make this an interface again.
 * Perhaps Java 8 will allow that
 *
 * @author John Bauer
 */
public abstract class ParserGrammar implements Function<List<? extends HasWord>, Tree> {

  // Redwood logging channel for this class; final since it is a constant.
  private static final Redwood.RedwoodChannels logger = Redwood.channels(ParserGrammar.class);

  public abstract ParserQuery parserQuery();

  /**
   * Parses the list of HasWord.  If the parse fails for some reason,
   * an X tree is returned instead of barfing.
   *
   * @param words The input sentence (a List of words)
   * @return A Tree that is the parse tree for the sentence.  If the parser
   *         fails, a new Tree is synthesized which attaches all words to the
   *         root.
   */
  @Override
  public Tree apply(List<? extends HasWord> words) {
    return parse(words);
  }

  /**
   * Tokenize the text using the parser's tokenizer.
   *
   * @param sentence The raw text of a single sentence
   * @return The tokens produced by this parser's language-specific
   *         tokenizer factory
   */
  public List<? extends HasWord> tokenize(String sentence) {
    TokenizerFactory<? extends HasWord> tf = treebankLanguagePack().getTokenizerFactory();
    Tokenizer<? extends HasWord> tokenizer = tf.getTokenizer(new StringReader(sentence));
    List<? extends HasWord> tokens = tokenizer.tokenize();
    return tokens;
  }

  /**
   * Will parse the text in <code>sentence</code> as if it represented
   * a single sentence by first processing it with a tokenizer.
   * If the options request pretagging, the tokens are tagged before parsing.
   *
   * @param sentence The raw text of a single sentence
   * @return The parse tree for the tokenized sentence
   */
  public Tree parse(String sentence) {
    List<? extends HasWord> tokens = tokenize(sentence);
    if (getOp().testOptions.preTag) {
      Function<List<? extends HasWord>, List<TaggedWord>> tagger = loadTagger();
      tokens = tagger.apply(tokens);
    }
    return parse(tokens);
  }

  // Lazily-loaded tagger and the serialized file it was loaded from.
  // transient so serialized parser models do not drag the tagger along;
  // both fields are only accessed under the lock in loadTagger().
  private transient Function<List<? extends HasWord>, List<TaggedWord>> tagger;
  private transient String taggerPath;

  /**
   * Lazily loads (and caches) the POS tagger named by
   * <code>testOptions.taggerSerializedFile</code>.  The tagger is reloaded
   * if the configured path has changed since the last call.
   *
   * @return The tagger function, or null if pretagging is not requested
   */
  public Function<List<? extends HasWord>, List<TaggedWord>> loadTagger() {
    Options op = getOp();
    if (op.testOptions.preTag) {
      synchronized(this) { // TODO: rather coarse synchronization
        // Objects.equals: null-safe, so a null taggerSerializedFile cannot NPE here
        if (!Objects.equals(op.testOptions.taggerSerializedFile, taggerPath)) {
          taggerPath = op.testOptions.taggerSerializedFile;
          tagger = ReflectionLoading.loadByReflection("edu.stanford.nlp.tagger.maxent.MaxentTagger", taggerPath);
        }
        return tagger;
      }
    } else {
      return null;
    }
  }

  /**
   * Tokenizes the sentence and then lemmatizes the tokens.
   *
   * @param sentence The raw text of a single sentence
   * @return One CoreLabel per token, with word, tag, and lemma set
   */
  public List<CoreLabel> lemmatize(String sentence) {
    List<? extends HasWord> tokens = tokenize(sentence);
    return lemmatize(tokens);
  }

  /**
   * Only works on English, as it is hard coded for using the
   * Morphology class, which is English-only.
   * Tags are obtained either from the configured tagger (if preTag is set)
   * or by parsing the sentence and reading tags off the tree.
   *
   * @param tokens An already-tokenized sentence
   * @return One CoreLabel per token, with word, tag, and lemma set
   */
  public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
    List<TaggedWord> tagged;
    if (getOp().testOptions.preTag) {
      Function<List<? extends HasWord>, List<TaggedWord>> tagger = loadTagger();
      tagged = tagger.apply(tokens);
    } else {
      Tree tree = parse(tokens);
      tagged = tree.taggedYield();
    }
    Morphology morpha = new Morphology();
    List<CoreLabel> lemmas = Generics.newArrayList();
    for (TaggedWord token : tagged) {
      CoreLabel label = new CoreLabel();
      label.setWord(token.word());
      label.setTag(token.tag());
      morpha.stem(label);
      lemmas.add(label);
    }
    return lemmas;
  }

  /**
   * Parses the list of HasWord.  If the parse fails for some reason,
   * an X tree is returned instead of barfing.
   *
   * @param words The input sentence (a List of words)
   * @return A Tree that is the parse tree for the sentence.  If the parser
   *         fails, a new Tree is synthesized which attaches all words to the
   *         root.
   */
  public abstract Tree parse(List<? extends HasWord> words);

  /**
   * Returns a list of extra Eval objects to use when scoring the parser.
   */
  public abstract List<Eval> getExtraEvals();

  /**
   * Return a list of Eval-style objects which care about the whole
   * ParserQuery, not just the finished tree
   */
  public abstract List<ParserQueryEval> getParserQueryEvals();

  public abstract Options getOp();

  public abstract TreebankLangParserParams getTLPParams();

  public abstract TreebankLanguagePack treebankLanguagePack();

  /**
   * Returns a set of options which should be set by default when used
   * in corenlp.  For example, the English PCFG/RNN models want
   * -retainTmpSubcategories, and the ShiftReduceParser models may
   * want -beamSize 4 depending on how they were trained.
   * <br>
   * TODO: right now completely hardcoded, should be settable as a training time option
   */
  public abstract String[] defaultCoreNLPFlags();

  public abstract void setOptionFlags(String ... flags);

  /**
   * The model requires text to be pretagged
   */
  public abstract boolean requiresTags();

  /**
   * Reads a serialized parser model from a URL, classpath entry, or
   * file, then applies any extra option flags to it.
   *
   * @param path Where to load the model from
   * @param extraFlags Option flags applied to the model after loading
   * @return The deserialized, configured parser
   * @throws RuntimeIOException if the model cannot be read or its class
   *         is not available
   */
  public static ParserGrammar loadModel(String path, String ... extraFlags) {
    ParserGrammar parser;
    try {
      Timing timing = new Timing();
      parser = IOUtils.readObjectFromURLOrClasspathOrFileSystem(path);
      timing.done(logger, "Loading parser from serialized file " + path);
    } catch (IOException | ClassNotFoundException e) {
      // wrap with cause preserved; callers treat model-load failure as fatal
      throw new RuntimeIOException(e);
    }
    if (extraFlags.length > 0) {
      parser.setOptionFlags(extraFlags);
    }
    return parser;
  }

}