package edu.stanford.nlp.pipeline;

import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.parser.common.NoSuchParseException;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.parser.common.ParserGrammar;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.common.ParserUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class will add parse information to an Annotation.
 * It assumes that the Annotation already contains the tokenized words
 * as a {@code List<CoreLabel>} in the TokensAnnotation under each
 * particular CoreMap in the SentencesAnnotation.
 * If the words have POS tags, they will be used.
 * <br>
 * Parse trees are added to each sentence's CoreMap (get with
 * {@code CoreAnnotations.SentencesAnnotation}) under
 * {@code CoreAnnotations.TreeAnnotation}).
 *
 * @author Jenny Finkel
 */
public class ParserAnnotator extends SentenceAnnotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ParserAnnotator.class);

  private final boolean VERBOSE;
  private final boolean BUILD_GRAPHS;
  private final ParserGrammar parser;

  /** Optional post-processing transform applied to every parse tree before it is stored. */
  private final Function<Tree, Tree> treeMap;

  /** Do not parse sentences larger than this sentence length */
  private final int maxSentenceLength;

  /**
   * Stop parsing if we exceed this time limit, in milliseconds.
   * Use 0 for no limit.
   */
  private final long maxParseTime;

  /** Number of parses to produce per sentence; 1 means use {@code getBestParse}. */
  private final int kBest;

  /** Factory used to derive dependency graphs from trees; null when graphs are not built. */
  private final GrammaticalStructureFactory gsf;

  private final int nThreads;

  /** Whether to also store a binarized form of the best tree on each sentence. */
  private final boolean saveBinaryTrees;

  /** Whether to include punctuation dependencies in the output. Starting in 2015, the default is true. */
  private final boolean keepPunct;

  /** If true, don't re-annotate sentences that already have a tree annotation */
  private final boolean noSquash;

  private final GrammaticalStructure.Extras extraDependencies;

  public ParserAnnotator(boolean verbose, int maxSent) {
    this(System.getProperty("parse.model", LexicalizedParser.DEFAULT_PARSER_LOC),
         verbose, maxSent, StringUtils.EMPTY_STRING_ARRAY);
  }

  public ParserAnnotator(String parserLoc, boolean verbose, int maxSent, String[] flags) {
    this(loadModel(parserLoc, verbose, flags), verbose, maxSent);
  }

  public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent) {
    this(parser, verbose, maxSent, null);
  }

  /**
   * Programmatic constructor with fixed defaults: single thread, no time limit,
   * 1-best parsing, punctuation kept, no binarized trees.
   *
   * @param parser  the already-loaded parser model
   * @param verbose whether to log sentences as they are parsed
   * @param maxSent skip sentences longer than this (non-positive means no limit)
   * @param treeMap optional transform applied to each tree, or null
   */
  public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent,
                         Function<Tree, Tree> treeMap) {
    this.VERBOSE = verbose;
    this.BUILD_GRAPHS = parser.getTLPParams().supportsBasicDependencies();
    this.parser = parser;
    this.maxSentenceLength = maxSent;
    this.treeMap = treeMap;
    this.maxParseTime = 0;
    this.kBest = 1;
    this.keepPunct = true;
    if (this.BUILD_GRAPHS) {
      // NOTE: unlike the Properties constructor, this path always rejects
      // punctuation words when building the GrammaticalStructureFactory.
      TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
      this.gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(),
                                                 parser.getTLPParams().typedDependencyHeadFinder());
    } else {
      this.gsf = null;
    }
    this.nThreads = 1;
    this.saveBinaryTrees = false;
    this.noSquash = false;
    this.extraDependencies = GrammaticalStructure.Extras.NONE;
  }

  /**
   * Properties-driven constructor used by the StanfordCoreNLP pipeline.
   * All options are read from {@code annotatorName + ".<option>"} keys.
   *
   * @param annotatorName prefix for the property keys (typically "parse")
   * @param props pipeline properties
   */
  public ParserAnnotator(String annotatorName, Properties props) {
    String model = props.getProperty(annotatorName + ".model", LexicalizedParser.DEFAULT_PARSER_LOC);
    // NOTE(review): with a non-null default above, model can only be null if
    // DEFAULT_PARSER_LOC itself is null; check kept for safety.
    if (model == null) {
      throw new IllegalArgumentException("No model specified for Parser annotator " + annotatorName);
    }
    this.VERBOSE = PropertiesUtils.getBool(props, annotatorName + ".debug", false);

    String[] flags = convertFlagsToArray(props.getProperty(annotatorName + ".flags"));
    this.parser = loadModel(model, VERBOSE, flags);
    this.maxSentenceLength = PropertiesUtils.getInt(props, annotatorName + ".maxlen", -1);

    String treeMapClass = props.getProperty(annotatorName + ".treemap");
    if (treeMapClass == null) {
      this.treeMap = null;
    } else {
      this.treeMap = ReflectionLoading.loadByReflection(treeMapClass, props);
    }

    this.maxParseTime = PropertiesUtils.getLong(props, annotatorName + ".maxtime", -1);
    this.kBest = PropertiesUtils.getInt(props, annotatorName + ".kbest", 1);
    this.keepPunct = PropertiesUtils.getBool(props, annotatorName + ".keepPunct", true);

    String buildGraphsProperty = annotatorName + ".buildgraphs";
    if (!this.parser.getTLPParams().supportsBasicDependencies()) {
      // Warn if the user explicitly asked for graphs a parser can't produce.
      if (props.getProperty(buildGraphsProperty) != null && PropertiesUtils.getBool(props, buildGraphsProperty)) {
        log.info("WARNING: " + buildGraphsProperty + " set to true, but " +
                 this.parser.getTLPParams().getClass() + " does not support dependencies");
      }
      this.BUILD_GRAPHS = false;
    } else {
      this.BUILD_GRAPHS = PropertiesUtils.getBool(props, buildGraphsProperty, true);
    }

    if (this.BUILD_GRAPHS) {
      boolean generateOriginalDependencies =
          PropertiesUtils.getBool(props, annotatorName + ".originalDependencies", false);
      parser.getTLPParams().setGenerateOriginalDependencies(generateOriginalDependencies);
      TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
      Predicate<String> punctFilter = this.keepPunct ?
          Filters.acceptFilter() : tlp.punctuationWordRejectFilter();
      this.gsf = tlp.grammaticalStructureFactory(punctFilter,
                                                 parser.getTLPParams().typedDependencyHeadFinder());
    } else {
      this.gsf = null;
    }

    this.nThreads = PropertiesUtils.getInt(props, annotatorName + ".nthreads",
                                           PropertiesUtils.getInt(props, "nthreads", 1));
    boolean usesBinary = StanfordCoreNLP.usesBinaryTrees(props);
    this.saveBinaryTrees = PropertiesUtils.getBool(props, annotatorName + ".binaryTrees", usesBinary);
    this.noSquash = PropertiesUtils.getBool(props, annotatorName + ".nosquash", false);
    this.extraDependencies = MetaClass.cast(props.getProperty(annotatorName + ".extradependencies", "NONE"),
                                            GrammaticalStructure.Extras.class);
  }

  /**
   * Builds a string summarizing every property this annotator reads, used by the
   * pipeline to decide whether a cached annotator can be reused.
   */
  @SuppressWarnings("StringConcatenationInsideStringBufferAppend")
  public static String signature(String annotatorName, Properties props) {
    StringBuilder os = new StringBuilder();
    os.append(annotatorName + ".model:" +
              props.getProperty(annotatorName + ".model", LexicalizedParser.DEFAULT_PARSER_LOC));
    os.append(annotatorName + ".debug:" +
              props.getProperty(annotatorName + ".debug", "false"));
    os.append(annotatorName + ".flags:" +
              props.getProperty(annotatorName + ".flags", ""));
    os.append(annotatorName + ".maxlen:" +
              props.getProperty(annotatorName + ".maxlen", "-1"));
    os.append(annotatorName + ".treemap:" +
              props.getProperty(annotatorName + ".treemap", ""));
    os.append(annotatorName + ".maxtime:" +
              props.getProperty(annotatorName + ".maxtime", "-1"));
    os.append(annotatorName + ".originalDependencies:" +
              props.getProperty(annotatorName + ".originalDependencies", "false"));
    os.append(annotatorName + ".buildgraphs:" +
              props.getProperty(annotatorName + ".buildgraphs", "true"));
    os.append(annotatorName + ".nthreads:" +
              props.getProperty(annotatorName + ".nthreads", props.getProperty("nthreads", "")));
    os.append(annotatorName + ".nosquash:" +
              props.getProperty(annotatorName + ".nosquash", "false"));
    os.append(annotatorName + ".keepPunct:" +
              props.getProperty(annotatorName + ".keepPunct", "true"));
    // BUGFIX: key previously misspelled ".extradependences", so a user-set
    // ".extradependencies" value was never reflected in the signature.
    os.append(annotatorName + ".extradependencies:" +
              props.getProperty(annotatorName + ".extradependencies", "NONE").toLowerCase());
    boolean usesBinary = StanfordCoreNLP.usesBinaryTrees(props);
    boolean saveBinaryTrees = PropertiesUtils.getBool(props, annotatorName + ".binaryTrees", usesBinary);
    os.append(annotatorName + ".binaryTrees:" + saveBinaryTrees);
    return os.toString();
  }

  /** Splits a whitespace-separated flag string into an array; empty array for null/blank input. */
  private static String[] convertFlagsToArray(String parserFlags) {
    if (parserFlags == null || parserFlags.trim().isEmpty()) {
      return StringUtils.EMPTY_STRING_ARRAY;
    } else {
      return parserFlags.trim().split("\\s+");
    }
  }

  /** Loads a parser model from {@code parserLoc} and applies default plus user flags. */
  private static ParserGrammar loadModel(String parserLoc, boolean verbose, String[] flags) {
    if (verbose) {
      log.info("Loading Parser Model [" + parserLoc + "] ...");
      log.info("  Flags:");
      for (String flag : flags) {
        log.info("  " + flag);
      }
      log.info();
    }
    ParserGrammar result = ParserGrammar.loadModel(parserLoc);
    result.setOptionFlags(result.defaultCoreNLPFlags());
    result.setOptionFlags(flags);
    return result;
  }

  @Override
  protected int nThreads() {
    return nThreads;
  }

  @Override
  protected long maxTime() {
    return maxParseTime;
  }

  /**
   * Parses one sentence, honoring the noSquash flag, the length limit, and the
   * time limit; on any failure falls back to {@link #doOneFailedSentence}.
   */
  @Override
  protected void doOneSentence(Annotation annotation, CoreMap sentence) {
    // If "noSquash" is set, don't re-annotate sentences which already have a
    // (non-fallback, i.e. non-"X"-rooted) tree annotation
    if (noSquash
        && sentence.get(TreeCoreAnnotations.TreeAnnotation.class) != null
        && !"X".equalsIgnoreCase(sentence.get(TreeCoreAnnotations.TreeAnnotation.class).label().value())) {
      return;
    }

    final List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (VERBOSE) {
      log.info("Parsing: " + words);
    }
    List<Tree> trees = null;
    // generate the constituent tree
    if (maxSentenceLength <= 0 || words.size() <= maxSentenceLength) {
      try {
        final List<ParserConstraint> constraints = sentence.get(ParserAnnotations.ConstraintAnnotation.class);
        trees = doOneSentence(constraints, words);
      } catch (RuntimeInterruptedException e) {
        // thrown when the maxParseTime budget is exceeded
        if (VERBOSE) {
          log.info("Took too long parsing: " + words);
        }
        trees = null;
      }
    }
    // tree == null may happen if the parser takes too long or if
    // the sentence is longer than the max length
    if (trees == null || trees.size() < 1) {
      doOneFailedSentence(annotation, sentence);
    } else {
      finishSentence(sentence, trees);
    }
  }

  /**
   * Fallback for unparseable sentences: store a flat "X" tree and give any
   * untagged word the placeholder tag "XX" so downstream annotators don't break.
   */
  @Override
  public void doOneFailedSentence(Annotation annotation, CoreMap sentence) {
    final List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = ParserUtils.xTree(words);
    for (CoreLabel word : words) {
      if (word.tag() == null) {
        word.setTag("XX");
      }
    }

    List<Tree> trees = Generics.newArrayList(1);
    trees.add(tree);
    finishSentence(sentence, trees);
  }

  /**
   * Applies the optional treeMap, stores parse/dependency annotations, optionally
   * stores a binarized tree, and patches missing sentence indices in the graph.
   */
  private void finishSentence(CoreMap sentence, List<Tree> trees) {
    if (treeMap != null) {
      List<Tree> mappedTrees = Generics.newLinkedList();
      for (Tree tree : trees) {
        Tree mappedTree = treeMap.apply(tree);
        mappedTrees.add(mappedTree);
      }
      trees = mappedTrees;
    }

    ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, trees, extraDependencies);

    if (saveBinaryTrees) {
      TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
                                                                  parser.treebankLanguagePack());
      Tree binarized = binarizer.transformTree(trees.get(0));
      Trees.convertToCoreLabels(binarized);
      sentence.set(TreeCoreAnnotations.BinarizedTreeAnnotation.class, binarized);
    }

    // for some reason in some corner cases nodes aren't having sentenceIndex set
    // do a pass and make sure all nodes have sentenceIndex set
    SemanticGraph sg = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
    if (sg != null) {
      for (IndexedWord iw : sg.vertexSet()) {
        if (iw.get(CoreAnnotations.SentenceIndexAnnotation.class) == null
            && sentence.get(CoreAnnotations.SentenceIndexAnnotation.class) != null) {
          iw.setSentIndex(sentence.get(CoreAnnotations.SentenceIndexAnnotation.class));
        }
      }
    }
  }

  /**
   * Runs the parser on one tokenized sentence, returning 1 or kBest scored trees,
   * or an empty list when parsing fails.
   */
  private List<Tree> doOneSentence(List<ParserConstraint> constraints, List<CoreLabel> words) {
    ParserQuery pq = parser.parserQuery();
    pq.setConstraints(constraints);
    pq.parse(words);
    List<Tree> trees = Generics.newLinkedList();
    try {
      // Use bestParse if kBest is set to 1.
      if (this.kBest == 1) {
        Tree t = pq.getBestParse();
        if (t == null) {
          log.warn("Parsing of sentence failed.  " +
                   "Will ignore and continue: " + SentenceUtils.listToString(words));
        } else {
          double score = pq.getBestScore();
          t.setScore(score % -10000.0);
          trees.add(t);
        }
      } else {
        List<ScoredObject<Tree>> scoredObjects = pq.getKBestParses(this.kBest);
        if (scoredObjects == null || scoredObjects.size() < 1) {
          log.warn("Parsing of sentence failed.  " +
                   "Will ignore and continue: " + SentenceUtils.listToString(words));
        } else {
          for (ScoredObject<Tree> so : scoredObjects) {
            // -10000 denotes unknown words
            Tree tree = so.object();
            tree.setScore(so.score() % -10000.0);
            trees.add(tree);
          }
        }
      }
    } catch (OutOfMemoryError e) {
      log.error(e); // Beware that we can now get an OOM in logging, too.
      log.warn("Parsing of sentence ran out of memory (length=" + words.size() + ").  " +
               "Will ignore and try to continue.");
    } catch (NoSuchParseException e) {
      log.warn("Parsing of sentence failed, possibly because of out of memory.  " +
               "Will ignore and continue: " + SentenceUtils.listToString(words));
    }
    return trees;
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    // Common prerequisites; POS tags are additionally required only when the
    // underlying parser model cannot tag on its own.
    List<Class<? extends CoreAnnotation>> required = new ArrayList<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.ValueAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.SentenceIndexAnnotation.class
    ));
    if (parser.requiresTags()) {
      required.add(CoreAnnotations.PartOfSpeechAnnotation.class);
    }
    return Collections.unmodifiableSet(new ArraySet<>(required));
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    // Always produced: POS tags, the parse tree, and category labels.
    List<Class<? extends CoreAnnotation>> satisfied = new ArrayList<>(Arrays.asList(
        CoreAnnotations.PartOfSpeechAnnotation.class,
        TreeCoreAnnotations.TreeAnnotation.class
    ));
    if (this.saveBinaryTrees) {
      satisfied.add(TreeCoreAnnotations.BinarizedTreeAnnotation.class);
    }
    if (this.BUILD_GRAPHS) {
      // Dependency graphs (and token span indices) only exist when graphs are built.
      satisfied.add(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
      satisfied.add(CoreAnnotations.BeginIndexAnnotation.class);
      satisfied.add(CoreAnnotations.EndIndexAnnotation.class);
    }
    satisfied.add(CoreAnnotations.CategoryAnnotation.class);
    return Collections.unmodifiableSet(new ArraySet<>(satisfied));
  }

}