package edu.stanford.nlp.naturalli;

import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.*;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;

/**
 * <p>
 * An OpenIE system based on valid Natural Logic deletions of a sentence.
 * The system is described in:
 * </p>
 *
 * <pre>
 *   "Leveraging Linguistic Structure For Open Domain Information Extraction." Gabor Angeli, Melvin Johnson Premkumar, Christopher Manning. ACL 2015.
 * </pre>
 *
 * <p>
 * The paper can be found at <a href="http://nlp.stanford.edu/pubs/2015angeli-openie.pdf">http://nlp.stanford.edu/pubs/2015angeli-openie.pdf</a>.
 * </p>
 *
 * <p>
 * Documentation on the system can be found on
 * <a href="http://nlp.stanford.edu/software/openie.shtml">the project homepage</a>,
 * or the <a href="http://stanfordnlp.github.io/CoreNLP/openie.html">CoreNLP annotator documentation page</a>.
 * The simplest invocation of the system would be something like:
 * </p>
 *
 * <pre>
 *   java -mx1g -cp stanford-openie.jar:stanford-openie-models.jar edu.stanford.nlp.naturalli.OpenIE
 * </pre>
 *
 * <p>
 * Note that this class serves both as an entry point for the OpenIE system and as a CoreNLP annotator
 * which can be plugged into the CoreNLP pipeline (or any other annotation pipeline).
 * </p>
 *
 * @see OpenIE#annotate(Annotation)
 * @see OpenIE#main(String[])
 *
 * @author Gabor Angeli
 */
//
// TODO(gabor): handle things like "One example of chemical energy is that found in the food that we eat ."
//
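//
// A minimal sketch of programmatic usage through the CoreNLP pipeline (the
// input sentence is illustrative; the annotator chain matches the default
// configured in main() below):
//
//   Properties props = new Properties();
//   props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
//   StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
//   Annotation doc = new Annotation("Obama was born in Hawaii.");
//   pipeline.annotate(doc);
//   for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
//     for (RelationTriple triple : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
//       System.out.println(triple);
//     }
//   }
//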
@SuppressWarnings({"FieldCanBeLocal", "UnusedDeclaration"})
public class OpenIE implements Annotator {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(OpenIE.class);

  private enum OutputFormat { REVERB, OLLIE, DEFAULT, QA_SRL }

  /**
   * A pattern for rewriting "NN_1 is a JJ NN_2" --> "NN_1 is JJ"
   */
  private static SemgrexPattern adjectivePattern = SemgrexPattern.compile("{}=obj >nsubj {}=subj >cop {}=be >det {word:/an?/} >amod {}=adj ?>/prep_.*/=prep {}=pobj");

  //
  // Static Options (for running standalone)
  //

  @ArgumentParser.Option(name="format", gloss="The format to output the triples in.")
  private static OutputFormat FORMAT = OutputFormat.DEFAULT;

  @ArgumentParser.Option(name="filelist", gloss="The files to annotate, as a list of files one per line.")
  private static File FILELIST = null;

  @ArgumentParser.Option(name="output", gloss="The file to write the extractions to.")
  private static PrintStream OUTPUT = System.out;

  //
  // Annotator Options (for running in the pipeline)
  //

  @ArgumentParser.Option(name="splitter.model", gloss="The location of the clause splitting model.")
  private String splitterModel = DefaultPaths.DEFAULT_OPENIE_CLAUSE_SEARCHER;

  @ArgumentParser.Option(name="splitter.nomodel", gloss="If true, don't load a clause splitter model. This is primarily useful for training.")
  private boolean noModel = false;

  @ArgumentParser.Option(name="splitter.threshold", gloss="The minimum threshold for accepting a clause.")
  private double splitterThreshold = 0.1;

  @ArgumentParser.Option(name="splitter.disable", gloss="If true, don't run the clause splitter.")
  private boolean splitterDisable = false;

  @ArgumentParser.Option(name="max_entailments_per_clause", gloss="The maximum number of entailments allowed per sentence of input.")
  private int entailmentsPerSentence = 1000;

  @ArgumentParser.Option(name="ignore_affinity", gloss="If true, don't use the affinity models for dobj and pp attachment.")
  private boolean ignoreAffinity = false;

  @ArgumentParser.Option(name="affinity_models", gloss="The directory (or classpath directory) containing the affinity models for pp/obj attachments.")
  private String affinityModels = DefaultPaths.DEFAULT_NATURALLI_AFFINITIES;

  @ArgumentParser.Option(name="affinity_probability_cap", gloss="The affinity to consider 1.0")
  private double affinityProbabilityCap = 1.0 / 3.0;

  @ArgumentParser.Option(name="triple.strict", gloss="If true, only generate triples if the entire fragment has been consumed.")
  private boolean consumeAll = true;

  @ArgumentParser.Option(name="triple.all_nominals", gloss="If true, generate nominal relations for all nominals, not only for named entities.")
  private boolean allNominals = false;

  @ArgumentParser.Option(name="resolve_coref", gloss="If true, resolve pronouns to their canonical mention.")
  private boolean resolveCoref = false;

  @ArgumentParser.Option(name="strip_entailments", gloss="If true, don't keep the entailed sentences annotations around.")
  private boolean stripEntailments = false;
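
  //
  // Note: when running inside a pipeline, these options are read from the
  // Properties passed to the constructor, optionally carrying an "openie."
  // prefix which the constructor strips (see below). For example (values are
  // illustrative):
  //
  //   props.setProperty("openie.triple.strict", "true");
  //   props.setProperty("openie.max_entailments_per_clause", "100");
  //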

  /**
   * The natural logic weights loaded from the models file.
   * This is primarily the prepositional attachment statistics.
   */
  private final NaturalLogicWeights weights;

  /**
   * The clause splitter model, if one is to be used.
   * This component splits a sentence into a set of entailed clauses, but does not yet
   * maximally shorten them.
   * This is the implementation of stage 1 of the OpenIE pipeline.
   */
  public final Optional<ClauseSplitter> clauseSplitter;

  /**
   * The forward entailer model, running a search from clauses to maximally shortened clauses.
   * This is the implementation of stage 2 of the OpenIE pipeline.
   */
  public final ForwardEntailer forwardEntailer;

  /**
   * The relation triple segmenter, which converts a maximally shortened clause into an OpenIE
   * extraction triple.
   * This is the implementation of stage 3 of the OpenIE pipeline.
   */
  public RelationTripleSegmenter segmenter;


  /** Create a new OpenIE system, with default properties */
  @SuppressWarnings("UnusedDeclaration")
  public OpenIE() {
    this(new Properties());
  }

  /**
   * Create a new OpenIE system, based on the given properties.
   *
   * @param props The properties to parametrize the system with.
   */
  public OpenIE(Properties props) {
    // Fill the properties
    ArgumentParser.fillOptions(this, props);
    Properties withoutOpenIEPrefix = new Properties();
    Enumeration<Object> keys = props.keys();
    while (keys.hasMoreElements()) {
      String key = keys.nextElement().toString();
      withoutOpenIEPrefix.setProperty(key.replace("openie.", ""), props.getProperty(key));
    }
    ArgumentParser.fillOptions(this, withoutOpenIEPrefix);

    // Create the clause splitter
    try {
      if (splitterDisable) {
        clauseSplitter = Optional.empty();
      } else {
        if (noModel) {
          log.info("Not loading a splitter model");
          clauseSplitter = Optional.of(ClauseSplitterSearchProblem::new);
        } else {
          clauseSplitter = Optional.of(ClauseSplitter.load(splitterModel));
        }
      }
    } catch (IOException e) {
      throw new RuntimeIOException("Could not load clause splitter model at " + splitterModel, e);
    }

    // Create the forward entailer
    try {
      this.weights = ignoreAffinity
          ? new NaturalLogicWeights(affinityProbabilityCap)
          : new NaturalLogicWeights(affinityModels, affinityProbabilityCap);
    } catch (IOException e) {
      throw new RuntimeIOException("Could not load affinity model at " + affinityModels + ": " + e.getMessage());
    }
    forwardEntailer = new ForwardEntailer(entailmentsPerSentence, weights);

    // Create the relation segmenter
    segmenter = new RelationTripleSegmenter(allNominals);
  }


  /**
   * Find the clauses in a sentence, where the sentence is expressed as a dependency tree.
   *
   * @param tree The dependency tree representation of the sentence.
   * @param assumedTruth The assumed truth of the sentence. This is almost always true, unless you are
   *                     doing some more nuanced reasoning.
   *
   * @return A set of clauses extracted from the sentence. This includes the original sentence.
   */
  @SuppressWarnings("unchecked")
  public List<SentenceFragment> clausesInSentence(SemanticGraph tree, boolean assumedTruth) {
    if (clauseSplitter.isPresent()) {
      return clauseSplitter.get().apply(tree, assumedTruth).topClauses(splitterThreshold, 32);
    } else {
      return Collections.emptyList();
    }
  }

  /**
   * Find the clauses in a sentence.
   * This runs the clause splitting component of the OpenIE system only.
   *
   * @see OpenIE#clausesInSentence(SemanticGraph, boolean)
   *
   * @param sentence The raw sentence to extract clauses from.
   *
   * @return A set of clauses extracted from the sentence. This includes the original sentence.
   */
  public List<SentenceFragment> clausesInSentence(CoreMap sentence) {
    return clausesInSentence(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class), true);
  }
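
  // An illustrative sketch of running stage 1 on its own, assuming `sentence`
  // is a CoreMap taken from an already-annotated document:
  //
  //   OpenIE openie = new OpenIE(props);
  //   for (SentenceFragment clause : openie.clausesInSentence(sentence)) {
  //     System.out.println(clause);  // one entailed clause per fragment
  //   }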

  /**
   * Returns all of the entailed shortened clauses (as per natural logic) from the given clause.
   * This runs the forward entailment component of the OpenIE system only.
   * It is usually chained together with the clause splitting component: {@link OpenIE#clausesInSentence(CoreMap)}.
   *
   * @param clause The premise clause, as a sentence fragment in itself.
   *
   * @return A list of entailed clauses.
   */
  @SuppressWarnings("unchecked")
  public List<SentenceFragment> entailmentsFromClause(SentenceFragment clause) {
    if (clause.parseTree.isEmpty()) {
      return Collections.emptyList();
    } else {
      // Get the forward entailments
      List<SentenceFragment> list = new ArrayList<>();
      if (entailmentsPerSentence > 0) {
        list.addAll(forwardEntailer.apply(clause.parseTree, true).search()
            .stream().map(x -> x.changeScore(x.score * clause.score)).collect(Collectors.toList()));
      }
      list.add(clause);

      // A special case for adjective entailments
      List<SentenceFragment> adjFragments = new ArrayList<>();
      SemgrexMatcher matcher = adjectivePattern.matcher(clause.parseTree);
      OUTER: while (matcher.find()) {
        // (get nodes)
        IndexedWord subj = matcher.getNode("subj");
        IndexedWord be = matcher.getNode("be");
        IndexedWord adj = matcher.getNode("adj");
        IndexedWord obj = matcher.getNode("obj");
        IndexedWord pobj = matcher.getNode("pobj");
        String prep = matcher.getRelnString("prep");
        // (if the adjective, or any earlier adjective, is privative, then all bets are off)
        for (SemanticGraphEdge edge : clause.parseTree.outgoingEdgeIterable(obj)) {
          if ("amod".equals(edge.getRelation().toString()) && edge.getDependent().index() <= adj.index() &&
              Util.PRIVATIVE_ADJECTIVES.contains(edge.getDependent().word().toLowerCase())) {
            continue OUTER;
          }
        }
        // (create the core tree)
        SemanticGraph tree = new SemanticGraph();
        tree.addRoot(adj);
        tree.addVertex(subj);
        tree.addVertex(be);
        tree.addEdge(adj, be, GrammaticalRelation.valueOf(Language.English, "cop"), Double.NEGATIVE_INFINITY, false);
        tree.addEdge(adj, subj, GrammaticalRelation.valueOf(Language.English, "nsubj"), Double.NEGATIVE_INFINITY, false);
        // (add pp attachment, if it existed)
        if (pobj != null) {
          assert prep != null;
          tree.addEdge(adj, pobj, GrammaticalRelation.valueOf(Language.English, prep), Double.NEGATIVE_INFINITY, false);
        }
        // (check for monotonicity)
        if (adj.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards() &&
            be.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards()) {
          // (add tree)
          adjFragments.add(new SentenceFragment(tree, clause.assumedTruth, false));
        }
      }
      list.addAll(adjFragments);
      return list;
    }
  }
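
  // To illustrate the adjective special case above (the examples are
  // illustrative): for "Fido is a cute dog", the rewrite produces the extra
  // fragment "Fido [is] cute", provided the copula and adjective sit in an
  // upward-monotone context. A privative adjective blocks the rewrite:
  // assuming "fake" is in Util.PRIVATIVE_ADJECTIVES, "this is a fake gun"
  // yields no such fragment.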

  /**
   * Returns all the maximally shortened entailed fragments (as per natural logic)
   * from the given collection of clauses.
   *
   * @param clauses The clauses to shorten further.
   *
   * @return A set of sentence fragments corresponding to the maximally shortened entailed clauses.
   */
  public Set<SentenceFragment> entailmentsFromClauses(Collection<SentenceFragment> clauses) {
    Set<SentenceFragment> entailments = new HashSet<>();
    for (SentenceFragment clause : clauses) {
      entailments.addAll(entailmentsFromClause(clause));
    }
    return entailments;
  }

  /**
   * Returns the possible relation triple in this sentence fragment.
   *
   * @see OpenIE#relationInFragment(SentenceFragment, CoreMap)
   */
  public Optional<RelationTriple> relationInFragment(SentenceFragment fragment) {
    return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
  }

  /**
   * Returns the possible relation triples in this set of sentence fragments.
   *
   * @see OpenIE#relationsInFragments(Collection, CoreMap)
   */
  public List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments) {
    return fragments.stream().map(this::relationInFragment).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
  }

  /**
   * Returns the possible relation triple in this sentence fragment.
   *
   * @param fragment The sentence fragment to try to extract relations from.
   * @param sentence The containing sentence for the fragment.
   *
   * @return A relation triple if we could find one; otherwise, {@link Optional#empty()}.
   */
  private Optional<RelationTriple> relationInFragment(SentenceFragment fragment, CoreMap sentence) {
    return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
  }

  /**
   * Returns a list of OpenIE relations from the given set of sentence fragments.
   *
   * @param fragments The sentence fragments to extract relations from.
   * @param sentence The containing sentence that these fragments were extracted from.
   *
   * @return A list of OpenIE triples, corresponding to all the triples that could be extracted from the given fragments.
   */
  private List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments, CoreMap sentence) {
    return fragments.stream().map(x -> relationInFragment(x, sentence)).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
  }

  /**
   * Extract the relations in this clause.
   *
   * @see OpenIE#entailmentsFromClause(SentenceFragment)
   * @see OpenIE#relationsInFragments(Collection)
   */
  public List<RelationTriple> relationsInClause(SentenceFragment clause) {
    return relationsInFragments(entailmentsFromClause(clause));
  }

  /**
   * Extract the relations in this sentence.
   *
   * @see OpenIE#clausesInSentence(CoreMap)
   * @see OpenIE#entailmentsFromClause(SentenceFragment)
   * @see OpenIE#relationsInFragments(Collection)
   */
  public List<RelationTriple> relationsInSentence(CoreMap sentence) {
    return relationsInFragments(entailmentsFromClauses(clausesInSentence(sentence)));
  }
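
  // The three stages compose as follows (a rough sketch; `sentence` is an
  // annotated CoreMap):
  //
  //   List<SentenceFragment> clauses = openie.clausesInSentence(sentence);      // stage 1: clause splitting
  //   Set<SentenceFragment> fragments = openie.entailmentsFromClauses(clauses); // stage 2: forward entailment
  //   List<RelationTriple> triples = openie.relationsInFragments(fragments);    // stage 3: triple segmentation
  //
  // which is what relationsInSentence(sentence) does in a single call.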

  /**
   * Create a copy of the passed parse tree, canonicalizing pronominal nodes with their canonical mention.
   * Canonical mentions are tied together with the <i>compound</i> dependency arc; otherwise, the structure of
   * the tree remains unchanged.
   *
   * @param parse The original dependency parse of the sentence.
   * @param canonicalMentionMap The map from tokens to their canonical mentions.
   *
   * @return A <b>copy</b> of the passed parse tree, with pronouns replaced by their canonical mentions.
   */
  private static SemanticGraph canonicalizeCoref(SemanticGraph parse, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
    parse = new SemanticGraph(parse);
    for (IndexedWord node : new HashSet<>(parse.vertexSet())) {  // copy the vertex set to prevent ConcurrentModificationExceptions
      if (node.tag() != null && node.tag().startsWith("PRP")) {
        List<CoreLabel> canonicalMention = canonicalMentionMap.get(node.backingLabel());
        if (canonicalMention != null) {
          // Case: this node is a pronoun with a valid antecedent.
          // 1. Save the attaching edges
          List<SemanticGraphEdge> incomingEdges = parse.incomingEdgeList(node);
          List<SemanticGraphEdge> outgoingEdges = parse.outgoingEdgeList(node);

          // 2. Remove the node
          parse.removeVertex(node);

          // 3. Add the new head word
          IndexedWord headWord = new IndexedWord(canonicalMention.get(canonicalMention.size() - 1));
          headWord.setPseudoPosition(node.pseudoPosition());
          parse.addVertex(headWord);
          for (SemanticGraphEdge edge : incomingEdges) {
            parse.addEdge(edge.getGovernor(), headWord, edge.getRelation(), edge.getWeight(), edge.isExtra());
          }
          for (SemanticGraphEdge edge : outgoingEdges) {
            parse.addEdge(headWord, edge.getDependent(), edge.getRelation(), edge.getWeight(), edge.isExtra());
          }

          // 4. Add other words
          double pseudoPosition = headWord.pseudoPosition() - 1e-3;
          for (int i = canonicalMention.size() - 2; i >= 0; --i) {
            // Create the node
            IndexedWord dependent = new IndexedWord(canonicalMention.get(i));
            // Set its pseudo position appropriately
            dependent.setPseudoPosition(pseudoPosition);
            pseudoPosition -= 1e-3;
            // Add the node to the graph
            parse.addVertex(dependent);
            parse.addEdge(headWord, dependent, UniversalEnglishGrammaticalRelations.COMPOUND_MODIFIER, 1.0, false);
          }
        }
      }
    }
    return parse;
  }

  /**
   * <p>
   * Annotate a single sentence.
   * </p>
   * <p>
   * This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
   * and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
   * </p>
   */
  @SuppressWarnings("unchecked")
  public void annotateSentence(CoreMap sentence, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens.size() < 2) {
      // Short sentence. Skip annotating it.
      sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, Collections.emptyList());
      if (!stripEntailments) {
        sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, Collections.emptySet());
      }
    } else {
      // Get the dependency tree
      SemanticGraph parse = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
      if (parse == null) {
        parse = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      }
      if (parse == null) {
        throw new IllegalStateException("Cannot run OpenIE without a parse tree!");
      }
      // Clean the tree
      parse = new SemanticGraph(parse);
      Util.cleanTree(parse);

      // Resolve Coreference
      SemanticGraph canonicalizedParse = parse;
      if (resolveCoref && !canonicalMentionMap.isEmpty()) {
        canonicalizedParse = canonicalizeCoref(parse, canonicalMentionMap);
      }

      // Run OpenIE
      // (clauses)
      List<SentenceFragment> clauses = clausesInSentence(canonicalizedParse, true);  // note: uses coref-canonicalized parse
      // (entailment)
      Set<SentenceFragment> fragments = entailmentsFromClauses(clauses);
      // (segment)
      List<RelationTriple> extractions = segmenter.extract(parse, tokens);  // note: uses non-coref-canonicalized parse!
      extractions.addAll(relationsInFragments(fragments, sentence));

      // Set the annotations
      sentence.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, new HashSet<>(clauses));
      sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, fragments);
      sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class,
          new ArrayList<>(new HashSet<>(extractions)));  // uniq the extractions
      if (stripEntailments) {
        sentence.remove(NaturalLogicAnnotations.EntailedSentencesAnnotation.class);
      }
    }
  }
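
  // Illustrative effect of coreference canonicalization (hypothetical input):
  // with resolve_coref enabled, "Obama was born in Hawaii. He attended school."
  // yields ("Obama"; "attended"; "school") rather than ("He"; "attended"; "school"),
  // since the pronoun node is replaced by its canonical mention before extraction.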

  /**
   * {@inheritDoc}
   *
   * <p>
   * This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
   * and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
   * </p>
   */
  @Override
  public void annotate(Annotation annotation) {
    // Accumulate Coref data
    Map<Integer, CorefChain> corefChains;
    Map<CoreLabel, List<CoreLabel>> canonicalMentionMap = new IdentityHashMap<>();
    if (resolveCoref && (corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class)) != null) {
      for (CorefChain chain : corefChains.values()) {
        // Make sure it's a real chain and not a singleton
        if (chain.getMentionsInTextualOrder().size() < 2) {
          continue;
        }
        // Metadata
        List<CoreLabel> canonicalMention = null;
        double canonicalMentionScore = Double.NEGATIVE_INFINITY;
        Set<CoreLabel> tokensToMark = new HashSet<>();
        List<CorefChain.CorefMention> mentions = chain.getMentionsInTextualOrder();

        // Iterate over mentions
        for (int i = 0; i < mentions.size(); ++i) {
          // Get some data on this mention
          Pair<List<CoreLabel>, Double> info = grokCorefMention(annotation, mentions.get(i));
          // Figure out if it should be the canonical mention
          double score = info.second + ((double) i) / ((double) mentions.size()) + (mentions.get(i) == chain.getRepresentativeMention() ? 1.0 : 0.0);
          if (canonicalMention == null || score > canonicalMentionScore) {
            canonicalMention = info.first;
            canonicalMentionScore = score;
          }
          // Register the participating tokens
          if (info.first.size() == 1) {  // Only mark single-node tokens!
            tokensToMark.addAll(info.first);
          }
        }

        // Mark the tokens as coreferent
        assert canonicalMention != null;
        for (CoreLabel token : tokensToMark) {
          List<CoreLabel> existingMention = canonicalMentionMap.get(token);
          if (existingMention == null || existingMention.isEmpty() ||
              "O".equals(existingMention.get(0).ner())) {  // Don't clobber existing good mentions
            canonicalMentionMap.put(token, canonicalMention);
          }
        }
      }
    }

    // Annotate each sentence
    annotation.get(CoreAnnotations.SentencesAnnotation.class).forEach(x -> this.annotateSentence(x, canonicalMentionMap));
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        NaturalLogicAnnotations.RelationTriplesAnnotation.class,
        NaturalLogicAnnotations.EntailedSentencesAnnotation.class
    )));
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.SentenceIndexAnnotation.class,
        CoreAnnotations.PartOfSpeechAnnotation.class,
        CoreAnnotations.LemmaAnnotation.class,
        NaturalLogicAnnotations.PolarityAnnotation.class,
        SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class
        //CoreAnnotations.OriginalTextAnnotation.class
    ));
    if (resolveCoref) {
      requirements.add(edu.stanford.nlp.coref.CorefCoreAnnotations.CorefChainAnnotation.class);
    }
    return Collections.unmodifiableSet(requirements);
  }
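
  // For reference, a minimal annotator chain satisfying requires() is the one
  // configured by main() below:
  //
  //   tokenize,ssplit,pos,lemma,depparse,natlog,openie
  //
  // (plus ner,mention,coref when resolve_coref is enabled).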

  /**
   * A utility to get useful information out of a CorefMention. In particular, it returns the CoreLabels which are
   * associated with this mention, and it returns a score for how much we think this mention should be the canonical
   * mention.
   *
   * @param doc The document this mention is referenced into.
   * @param mention The mention itself.
   *
   * @return A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.
   */
  private static Pair<List<CoreLabel>, Double> grokCorefMention(Annotation doc, CorefChain.CorefMention mention) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.SentencesAnnotation.class).get(mention.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> mentionAsTokens = tokens.subList(mention.startIndex - 1, mention.endIndex - 1);
    // Try to assess this mention's NER type
    Counter<String> nerVotes = new ClassicCounter<>();
    mentionAsTokens.stream().filter(token -> token.ner() != null && !"O".equals(token.ner()))
        .forEach(token -> nerVotes.incrementCount(token.ner()));
    String ner = Counters.argmax(nerVotes, (o1, o2) -> o1 == null ? 0 : o1.compareTo(o2));
    double nerCount = nerVotes.getCount(ner);
    double nerScore = nerCount * nerCount / ((double) mentionAsTokens.size());
    // Return
    return Pair.makePair(mentionAsTokens, nerScore);
  }

  /**
   * Prints an OpenIE triple to a String, according to the output format requested in
   * the annotator.
   *
   * @param extraction The triple to write.
   * @param docid The document ID (for the ReVerb format).
   * @param sentence The sentence the triple was extracted from (for the ReVerb format).
   *
   * @return A String representation of the triple.
   */
  public static String tripleToString(RelationTriple extraction, String docid, CoreMap sentence) {
    switch (FORMAT) {
      case REVERB:
        return extraction.toReverbString(docid, sentence);
      case OLLIE:
        return extraction.confidenceGloss() + ": (" + extraction.subjectGloss() + "; " + extraction.relationGloss() + "; " + extraction.objectGloss() + ")";
      case DEFAULT:
        return extraction.toString();
      case QA_SRL:
        return extraction.toQaSrlString(sentence);
      default:
        throw new IllegalStateException("Format is not implemented: " + FORMAT);
    }
  }

  /**
   * Process a single file or line of standard in.
   *
   * @param pipeline The annotation pipeline to run the lines of the input through.
   * @param docid The docid of the document we are extracting.
   * @param document The document to annotate.
   */
  @SuppressWarnings("SynchronizeOnNonFinalField")
  private static void processDocument(AnnotationPipeline pipeline, String docid, String document) {
    // Error checks
    if (document.trim().equals("")) {
      return;
    }

    // Annotate the document
    Annotation ann = new Annotation(document);
    pipeline.annotate(ann);

    // Get the extractions
    boolean empty = true;
    synchronized (OUTPUT) {
      for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (RelationTriple extraction : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
          // Print the extractions
          OUTPUT.println(tripleToString(extraction, docid, sentence));
          empty = false;
        }
      }
    }
    if (empty) {
      log.info("No extractions in: " + ("stdin".equals(docid) ? document : docid));
    }
  }
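
  // For a hypothetical extraction (subject "Obama", relation "was born in",
  // object "Hawaii"), the OLLIE branch above renders roughly as:
  //
  //   1.000: (Obama; was born in; Hawaii)
  //
  // (the exact confidence formatting is defined by RelationTriple.confidenceGloss()).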

  /**
   * An entry method for annotating standard in with OpenIE extractions.
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    // Parse the arguments
    Properties props = StringUtils.argsToProperties(args, new HashMap<String, Integer>(){{
      put("openie.resolve_coref", 0);
      put("resolve_coref", 0);
      put("openie.splitter.nomodel", 0);
      put("splitter.nomodel", 0);
      put("openie.splitter.disable", 0);
      put("splitter.disable", 0);
      put("openie.ignore_affinity", 0);
      put("ignore_affinity", 0);
      put("openie.triple.strict", 0);
      put("triple.strict", 0);
      put("openie.triple.all_nominals", 0);
      put("triple.all_nominals", 0);
    }});
    ArgumentParser.fillOptions(new Class[]{OpenIE.class, ArgumentParser.class}, props);
    AtomicInteger exceptionCount = new AtomicInteger(0);
    ExecutorService exec = Executors.newFixedThreadPool(ArgumentParser.threads);

    // Parse the files to process
    String[] filesToProcess;
    if (FILELIST != null) {
      filesToProcess = IOUtils.linesFromFile(FILELIST.getPath()).stream()
          .map(String::trim)
          .map(path -> path.replaceAll("^~", "$HOME"))
          .map(path -> new File(path).exists() ? path : StringUtils.expandEnvironmentVariables(path))
          .toArray(String[]::new);
    } else if (!"".equals(props.getProperty("", ""))) {
      filesToProcess = props.getProperty("", "").split("\\s+");
    } else {
      filesToProcess = new String[0];
    }

    // Tweak the arguments
    if ("".equals(props.getProperty("annotators", ""))) {
      if (!"false".equalsIgnoreCase(props.getProperty("resolve_coref", props.getProperty("openie.resolve_coref", "false")))) {
        props.setProperty("coref.md.type", "dep");  // so we don't need the `parse` annotator
        props.setProperty("coref.mode", "statistical");  // explicitly ask for statistical coref
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,ner,mention,coref,natlog,openie");
      } else {
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
      }
    }
    if ("".equals(props.getProperty("depparse.extradependencies", ""))) {
      props.setProperty("depparse.extradependencies", "ref_only_uncollapsed");
    }
    if ("".equals(props.getProperty("parse.extradependencies", ""))) {
      props.setProperty("parse.extradependencies", "ref_only_uncollapsed");
    }
    if ("".equals(props.getProperty("tokenize.class", ""))) {
      props.setProperty("tokenize.class", "PTBTokenizer");
    }
    if ("".equals(props.getProperty("tokenize.language", ""))) {
      props.setProperty("tokenize.language", "en");
    }
    // Tweak properties for console mode.
    // In particular, in this mode we can assume every line of standard in is a new sentence.
    if (filesToProcess.length == 0 && "".equals(props.getProperty("ssplit.isOneSentence", ""))) {
      props.setProperty("ssplit.isOneSentence", "true");
    }
    // Some error checks on the arguments
    if (!props.getProperty("annotators").toLowerCase().contains("openie")) {
      log.error("If you specify custom annotators, you must at least include 'openie'");
      System.exit(1);
    }
    // Copy properties that are missing the 'openie' prefix
    new HashSet<>(props.keySet()).stream()
        .filter(key -> !key.toString().startsWith("openie."))
        .forEach(key -> props.setProperty("openie." + key.toString(), props.getProperty(key.toString())));
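
    // Example invocations of this entry point (memory, classpath, and paths
    // are illustrative; the flags are the @Option names declared above):
    //
    //   java -mx4g -cp "*" edu.stanford.nlp.naturalli.OpenIE                       # read sentences from stdin
    //   java -mx4g -cp "*" edu.stanford.nlp.naturalli.OpenIE file1.txt file2.txt   # annotate files
    //   java -mx4g -cp "*" edu.stanford.nlp.naturalli.OpenIE -filelist files.txt -format OLLIE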

    // Create the pipeline
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // Run OpenIE
    if (filesToProcess.length == 0) {
      // Running from stdin; one document per line.
      log.info("Processing from stdin. Enter one sentence per line.");
      Scanner scanner = new Scanner(System.in);
      String line;
      try {
        line = scanner.nextLine();
      } catch (NoSuchElementException e) {
        log.info("No lines found on standard in");
        return;
      }
      while (line != null) {
        processDocument(pipeline, "stdin", line);
        try {
          line = scanner.nextLine();
        } catch (NoSuchElementException e) {
          return;
        }
      }
    } else {
      // Running from file parameters.
      // Make sure we can read all the files in the queue.
      // This will prevent a nasty surprise 10 hours into a running job...
      for (String file : filesToProcess) {
        if (!new File(file).exists() || !new File(file).canRead()) {
          log.error("Cannot read file (or file does not exist): '" + file + "'");
        }
      }
      // Actually process the files.
      for (String file : filesToProcess) {
        log.info("Processing file: " + file);
        if (ArgumentParser.threads > 1) {
          // Multi-threaded: submit a job to run
          final String fileToSubmit = file;
          exec.submit(() -> {
            try {
              processDocument(pipeline, file, IOUtils.slurpFile(new File(fileToSubmit)));
            } catch (Throwable t) {
              t.printStackTrace();
              exceptionCount.incrementAndGet();
            }
          });
        } else {
          // Single-threaded: just run the job
          processDocument(pipeline, file, IOUtils.slurpFile(new File(file)));
        }
      }
    }

    // Exit
    exec.shutdown();
    log.info("All files have been queued; awaiting termination...");
    exec.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
    log.info("DONE processing files. " + exceptionCount.get() + " exceptions encountered.");
    System.exit(exceptionCount.get());
  }

}