package edu.stanford.nlp.naturalli;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.*;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
/**
* <p>
* An OpenIE system based on valid Natural Logic deletions of a sentence.
* The system is described in:
* </p>
*
* <pre>
* "Leveraging Linguistic Structure For Open Domain Information Extraction." Gabor Angeli, Melvin Johnson Premkumar, Christopher Manning. ACL 2015.
* </pre>
*
* <p>
* The paper can be found at <a href="http://nlp.stanford.edu/pubs/2015angeli-openie.pdf">http://nlp.stanford.edu/pubs/2015angeli-openie.pdf</a>.
* </p>
* <p>
* Documentation on the system can be found on
* <a href="http://nlp.stanford.edu/software/openie.shtml">the project homepage</a>,
* or the <a href="http://stanfordnlp.github.io/CoreNLP/openie.html">CoreNLP annotator documentation page</a>.
* The simplest invocation of the system would be something like:
* </p>
*
* <pre>
* java -mx1g -cp stanford-openie.jar:stanford-openie-models.jar edu.stanford.nlp.naturalli.OpenIE
* </pre>
*
* <p>
 * Note that this class serves both as an entry point for the OpenIE system and as a CoreNLP annotator
 * which can be plugged into the CoreNLP pipeline (or any other annotation pipeline).
* </p>
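 * <p>
 * A minimal programmatic usage sketch (the input sentence and property values here are
 * illustrative, not the only valid configuration):
 * </p>
 *
 * <pre>{@code
 * Properties props = new Properties();
 * props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
 * StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
 * Annotation doc = new Annotation("Obama was born in Hawaii.");
 * pipeline.annotate(doc);
 * for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
 *   for (RelationTriple triple : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
 *     System.out.println(triple.confidenceGloss() + "\t" + triple.subjectGloss()
 *         + "\t" + triple.relationGloss() + "\t" + triple.objectGloss());
 *   }
 * }
 * }</pre>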
*
* @see OpenIE#annotate(Annotation)
* @see OpenIE#main(String[])
*
* @author Gabor Angeli
*/
//
// TODO(gabor): handle things like "One example of chemical energy is that found in the food that we eat ."
//
@SuppressWarnings({"FieldCanBeLocal", "UnusedDeclaration"})
public class OpenIE implements Annotator {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(OpenIE.class);
private enum OutputFormat { REVERB, OLLIE, DEFAULT, QA_SRL }
/**
 * A pattern for rewriting "NN_1 is a JJ NN_2" --> "NN_1 is JJ"
 * (e.g., "a koala is a cute animal" --> "a koala is cute").
*/
private static SemgrexPattern adjectivePattern = SemgrexPattern.compile("{}=obj >nsubj {}=subj >cop {}=be >det {word:/an?/} >amod {}=adj ?>/prep_.*/=prep {}=pobj");
//
// Static Options (for running standalone)
//
@ArgumentParser.Option(name="format", gloss="The format to output the triples in.")
private static OutputFormat FORMAT = OutputFormat.DEFAULT;
@ArgumentParser.Option(name="filelist", gloss="The files to annotate, as a list of files one per line.")
private static File FILELIST = null;
@ArgumentParser.Option(name="output", gloss="The files to annotate, as a list of files one per line.")
private static PrintStream OUTPUT = System.out;
//
// Annotator Options (for running in the pipeline)
//
@ArgumentParser.Option(name="splitter.model", gloss="The location of the clause splitting model.")
private String splitterModel = DefaultPaths.DEFAULT_OPENIE_CLAUSE_SEARCHER;
@ArgumentParser.Option(name="splitter.nomodel", gloss="If true, don't load a clause splitter model. This is primarily useful for training.")
private boolean noModel = false;
@ArgumentParser.Option(name="splitter.threshold", gloss="The minimum threshold for accepting a clause.")
private double splitterThreshold = 0.1;
@ArgumentParser.Option(name="splitter.disable", gloss="If true, don't run the sentence splitter")
private boolean splitterDisable = false;
@ArgumentParser.Option(name="max_entailments_per_clause", gloss="The maximum number of entailments allowed per sentence of input.")
private int entailmentsPerSentence = 1000;
@ArgumentParser.Option(name="ignore_affinity", gloss="If true, don't use the affinity models for dobj and pp attachment.")
private boolean ignoreAffinity = false;
@ArgumentParser.Option(name="affinity_models", gloss="The directory (or classpath directory) containing the affinity models for pp/obj attachments.")
private String affinityModels = DefaultPaths.DEFAULT_NATURALLI_AFFINITIES;
@ArgumentParser.Option(name="affinity_probability_cap", gloss="The affinity to consider 1.0")
private double affinityProbabilityCap = 1.0 / 3.0;
@ArgumentParser.Option(name="triple.strict", gloss="If true, only generate triples if the entire fragment has been consumed.")
private boolean consumeAll = true;
@ArgumentParser.Option(name="triple.all_nominals", gloss="If true, generate not only named entity nominal relations.")
private boolean allNominals = false;
@ArgumentParser.Option(name="resolve_coref", gloss="If true, resolve pronouns to their canonical mention")
private boolean resolveCoref = false;
@ArgumentParser.Option(name="strip_entailments", gloss="If true, don't keep the entailed sentences annotations around.")
private boolean stripEntailments = false;
/**
* The natural logic weights loaded from the models file.
* This is primarily the prepositional attachment statistics.
*/
private final NaturalLogicWeights weights;
/**
* The clause splitter model, if one is to be used.
* This component splits a sentence into a set of entailed clauses, but does not yet
* maximally shorten them.
* This is the implementation of stage 1 of the OpenIE pipeline.
*/
public final Optional<ClauseSplitter> clauseSplitter;
/**
* The forward entailer model, running a search from clauses to maximally shortened clauses.
* This is the implementation of stage 2 of the OpenIE pipeline.
*/
public final ForwardEntailer forwardEntailer;
/**
* The relation triple segmenter, which converts a maximally shortened clause into an OpenIE
* extraction triple.
* This is the implementation of stage 3 of the OpenIE pipeline.
*/
public RelationTripleSegmenter segmenter;
/** Create a new OpenIE system, with default properties */
@SuppressWarnings("UnusedDeclaration")
public OpenIE() {
this(new Properties());
}
/**
 * Create a new OpenIE system based on the given properties.
 * @param props The properties to parameterize the system with.
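 *
 * <p>For example (a minimal sketch; the constructor accepts both {@code openie.}-prefixed
 * and unprefixed keys, stripping the prefix internally):</p>
 *
 * <pre>{@code
 * Properties props = new Properties();
 * props.setProperty("openie.triple.strict", "false");
 * OpenIE openie = new OpenIE(props);
 * }</pre>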
*/
public OpenIE(Properties props) {
// Fill the properties
ArgumentParser.fillOptions(this, props);
Properties withoutOpenIEPrefix = new Properties();
Enumeration<Object> keys = props.keys();
while (keys.hasMoreElements()) {
String key = keys.nextElement().toString();
withoutOpenIEPrefix.setProperty(key.replace("openie.", ""), props.getProperty(key));
}
ArgumentParser.fillOptions(this, withoutOpenIEPrefix);
// Create the clause splitter
try {
if (splitterDisable) {
clauseSplitter = Optional.empty();
} else {
if (noModel) {
log.info("Not loading a splitter model");
clauseSplitter = Optional.of(ClauseSplitterSearchProblem::new);
} else {
clauseSplitter = Optional.of(ClauseSplitter.load(splitterModel));
}
}
} catch (IOException e) {
throw new RuntimeIOException("Could not load clause splitter model at " + splitterModel, e);
}
// Create the forward entailer
try {
this.weights = ignoreAffinity ? new NaturalLogicWeights(affinityProbabilityCap) : new NaturalLogicWeights(affinityModels, affinityProbabilityCap);
} catch (IOException e) {
throw new RuntimeIOException("Could not load affinity model at " + affinityModels + ": " + e.getMessage());
}
forwardEntailer = new ForwardEntailer(entailmentsPerSentence, weights);
// Create the relation segmenter
segmenter = new RelationTripleSegmenter(allNominals);
}
/**
* Find the clauses in a sentence, where the sentence is expressed as a dependency tree.
*
* @param tree The dependency tree representation of the sentence.
* @param assumedTruth The assumed truth of the sentence. This is almost always true, unless you are
* doing some more nuanced reasoning.
*
 * @return A list of clauses extracted from the sentence, including the original sentence.
*/
@SuppressWarnings("unchecked")
public List<SentenceFragment> clausesInSentence(SemanticGraph tree, boolean assumedTruth) {
if (clauseSplitter.isPresent()) {
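      // (the second argument limits how many clauses are returned; 32 is a hard cap)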
return clauseSplitter.get().apply(tree, assumedTruth).topClauses(splitterThreshold, 32);
} else {
return Collections.emptyList();
}
}
/**
* Find the clauses in a sentence.
* This runs the clause splitting component of the OpenIE system only.
*
* @see OpenIE#clausesInSentence(SemanticGraph, boolean)
*
* @param sentence The raw sentence to extract clauses from.
*
 * @return A list of clauses extracted from the sentence, including the original sentence.
*/
public List<SentenceFragment> clausesInSentence(CoreMap sentence) {
return clausesInSentence(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class), true);
}
/**
* Returns all of the entailed shortened clauses (as per natural logic) from the given clause.
* This runs the forward entailment component of the OpenIE system only.
* It is usually chained together with the clause splitting component: {@link OpenIE#clausesInSentence(CoreMap)}.
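 *
 * <pre>{@code
 * // a typical chaining of stage 1 (clause splitting) into stage 2 (forward entailment); a sketch
 * for (SentenceFragment clause : openie.clausesInSentence(sentence)) {
 *   List<SentenceFragment> shortened = openie.entailmentsFromClause(clause);
 * }
 * }</pre>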
*
* @param clause The premise clause, as a sentence fragment in itself.
*
* @return A list of entailed clauses.
*/
@SuppressWarnings("unchecked")
public List<SentenceFragment> entailmentsFromClause(SentenceFragment clause) {
if (clause.parseTree.isEmpty()) {
return Collections.emptyList();
} else {
// Get the forward entailments
List<SentenceFragment> list = new ArrayList<>();
if (entailmentsPerSentence > 0) {
list.addAll(forwardEntailer.apply(clause.parseTree, true).search()
.stream().map(x -> x.changeScore(x.score * clause.score)).collect(Collectors.toList()));
}
list.add(clause);
// A special case for adjective entailments
List<SentenceFragment> adjFragments = new ArrayList<>();
SemgrexMatcher matcher = adjectivePattern.matcher(clause.parseTree);
OUTER: while (matcher.find()) {
// (get nodes)
IndexedWord subj = matcher.getNode("subj");
IndexedWord be = matcher.getNode("be");
IndexedWord adj = matcher.getNode("adj");
IndexedWord obj = matcher.getNode("obj");
IndexedWord pobj = matcher.getNode("pobj");
String prep = matcher.getRelnString("prep");
// (if the adjective, or any earlier adjective, is privative, then all bets are off)
for (SemanticGraphEdge edge : clause.parseTree.outgoingEdgeIterable(obj)) {
if ("amod".equals(edge.getRelation().toString()) && edge.getDependent().index() <= adj.index() &&
Util.PRIVATIVE_ADJECTIVES.contains(edge.getDependent().word().toLowerCase())) {
continue OUTER;
}
}
// (create the core tree)
SemanticGraph tree = new SemanticGraph();
tree.addRoot(adj);
tree.addVertex(subj);
tree.addVertex(be);
tree.addEdge(adj, be, GrammaticalRelation.valueOf(Language.English, "cop"), Double.NEGATIVE_INFINITY, false);
tree.addEdge(adj, subj, GrammaticalRelation.valueOf(Language.English, "nsubj"), Double.NEGATIVE_INFINITY, false);
// (add pp attachment, if it existed)
if (pobj != null) {
assert prep != null;
tree.addEdge(adj, pobj, GrammaticalRelation.valueOf(Language.English, prep), Double.NEGATIVE_INFINITY, false);
}
// (check for monotonicity)
if (adj.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards() &&
be.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards()) {
// (add tree)
adjFragments.add(new SentenceFragment(tree, clause.assumedTruth, false));
}
}
list.addAll(adjFragments);
return list;
}
}
/**
* Returns all the maximally shortened entailed fragments (as per natural logic)
* from the given collection of clauses.
*
* @param clauses The clauses to shorten further.
*
* @return A set of sentence fragments corresponding to the maximally shortened entailed clauses.
*/
public Set<SentenceFragment> entailmentsFromClauses(Collection<SentenceFragment> clauses) {
Set<SentenceFragment> entailments = new HashSet<>();
for (SentenceFragment clause : clauses) {
entailments.addAll(entailmentsFromClause(clause));
}
return entailments;
}
/**
* Returns the possible relation triple in this sentence fragment.
*
* @see OpenIE#relationInFragment(SentenceFragment, CoreMap)
*/
public Optional<RelationTriple> relationInFragment(SentenceFragment fragment) {
return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
}
/**
 * Returns the possible relation triples in this set of sentence fragments.
*
* @see OpenIE#relationsInFragments(Collection, CoreMap)
*/
public List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments) {
return fragments.stream().map(this::relationInFragment).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
}
/**
* Returns the possible relation triple in this sentence fragment.
*
* @param fragment The sentence fragment to try to extract relations from.
* @param sentence The containing sentence for the fragment.
*
* @return A relation triple if we could find one; otherwise, {@link Optional#empty()}.
*/
private Optional<RelationTriple> relationInFragment(SentenceFragment fragment, CoreMap sentence) {
return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
}
/**
* Returns a list of OpenIE relations from the given set of sentence fragments.
*
* @param fragments The sentence fragments to extract relations from.
* @param sentence The containing sentence that these fragments were extracted from.
*
* @return A list of OpenIE triples, corresponding to all the triples that could be extracted from the given fragments.
*/
private List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments, CoreMap sentence) {
return fragments.stream().map(x -> relationInFragment(x, sentence)).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
}
/**
* Extract the relations in this clause.
*
* @see OpenIE#entailmentsFromClause(SentenceFragment)
* @see OpenIE#relationsInFragments(Collection)
*/
public List<RelationTriple> relationsInClause(SentenceFragment clause) {
return relationsInFragments(entailmentsFromClause(clause));
}
/**
* Extract the relations in this sentence.
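 *
 * <pre>{@code
 * // runs all three stages (clauses, entailments, segmentation) in one call; a sketch
 * List<RelationTriple> triples = openie.relationsInSentence(sentence);
 * }</pre>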
*
* @see OpenIE#clausesInSentence(CoreMap)
* @see OpenIE#entailmentsFromClause(SentenceFragment)
* @see OpenIE#relationsInFragments(Collection)
*/
public List<RelationTriple> relationsInSentence(CoreMap sentence) {
return relationsInFragments(entailmentsFromClauses(clausesInSentence(sentence)));
}
/**
* Create a copy of the passed parse tree, canonicalizing pronominal nodes with their canonical mention.
* Canonical mentions are tied together with the <i>compound</i> dependency arc; otherwise, the structure of
* the tree remains unchanged.
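 * For example, given "Obama was born in Hawaii. He attended Harvard.", the pronominal node
 * for "He" would be replaced by a node for "Obama" (an illustrative example, assuming coref
 * linked the two mentions).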
*
* @param parse The original dependency parse of the sentence.
* @param canonicalMentionMap The map from tokens to their canonical mentions.
*
 * @return A <b>copy</b> of the passed parse tree, with pronouns replaced by their canonical mention.
*/
private static SemanticGraph canonicalizeCoref(SemanticGraph parse, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
parse = new SemanticGraph(parse);
for (IndexedWord node : new HashSet<>(parse.vertexSet())) { // copy the vertex set to prevent ConcurrentModificationExceptions
if (node.tag() != null && node.tag().startsWith("PRP")) {
List<CoreLabel> canonicalMention = canonicalMentionMap.get(node.backingLabel());
if (canonicalMention != null) {
          // Case: this node is a pronoun with a valid antecedent.
// 1. Save the attaching edges
List<SemanticGraphEdge> incomingEdges = parse.incomingEdgeList(node);
List<SemanticGraphEdge> outgoingEdges = parse.outgoingEdgeList(node);
// 2. Remove the node
parse.removeVertex(node);
// 3. Add the new head word
IndexedWord headWord = new IndexedWord(canonicalMention.get(canonicalMention.size() - 1));
headWord.setPseudoPosition(node.pseudoPosition());
parse.addVertex(headWord);
for (SemanticGraphEdge edge : incomingEdges) {
parse.addEdge(edge.getGovernor(), headWord, edge.getRelation(), edge.getWeight(), edge.isExtra());
}
for (SemanticGraphEdge edge : outgoingEdges) {
parse.addEdge(headWord, edge.getDependent(), edge.getRelation(), edge.getWeight(), edge.isExtra());
}
// 4. Add other words
double pseudoPosition = headWord.pseudoPosition() - 1e-3;
for (int i = canonicalMention.size() - 2; i >= 0; --i) {
// Create the node
IndexedWord dependent = new IndexedWord(canonicalMention.get(i));
// Set its pseudo position appropriately
dependent.setPseudoPosition(pseudoPosition);
pseudoPosition -= 1e-3;
// Add the node to the graph
parse.addVertex(dependent);
parse.addEdge(headWord, dependent, UniversalEnglishGrammaticalRelations.COMPOUND_MODIFIER, 1.0, false);
}
}
}
}
return parse;
}
/**
* <p>
* Annotate a single sentence.
* </p>
* <p>
* This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
* and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
* </p>
*/
@SuppressWarnings("unchecked")
public void annotateSentence(CoreMap sentence, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
if (tokens.size() < 2) {
// Short sentence. Skip annotating it.
sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, Collections.emptyList());
if (!stripEntailments) {
sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, Collections.emptySet());
}
} else {
// Get the dependency tree
SemanticGraph parse = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
if (parse == null) {
parse = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
}
if (parse == null) {
throw new IllegalStateException("Cannot run OpenIE without a parse tree!");
}
// Clean the tree
parse = new SemanticGraph(parse);
Util.cleanTree(parse);
// Resolve Coreference
SemanticGraph canonicalizedParse = parse;
if (resolveCoref && !canonicalMentionMap.isEmpty()) {
canonicalizedParse = canonicalizeCoref(parse, canonicalMentionMap);
}
// Run OpenIE
// (clauses)
List<SentenceFragment> clauses = clausesInSentence(canonicalizedParse, true); // note: uses coref-canonicalized parse
// (entailment)
Set<SentenceFragment> fragments = entailmentsFromClauses(clauses);
// (segment)
List<RelationTriple> extractions = segmenter.extract(parse, tokens); // note: uses non-coref-canonicalized parse!
extractions.addAll(relationsInFragments(fragments, sentence));
// Set the annotations
sentence.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, new HashSet<>(clauses));
sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, fragments);
sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class,
new ArrayList<>(new HashSet<>(extractions))); // uniq the extractions
if (stripEntailments) {
sentence.remove(NaturalLogicAnnotations.EntailedSentencesAnnotation.class);
}
}
}
/**
* {@inheritDoc}
*
* <p>
* This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
* and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
* </p>
*/
@Override
public void annotate(Annotation annotation) {
// Accumulate Coref data
Map<Integer, CorefChain> corefChains;
Map<CoreLabel, List<CoreLabel>> canonicalMentionMap = new IdentityHashMap<>();
if (resolveCoref && (corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class)) != null) {
for (CorefChain chain : corefChains.values()) {
// Make sure it's a real chain and not a singleton
if (chain.getMentionsInTextualOrder().size() < 2) {
continue;
}
// Metadata
List<CoreLabel> canonicalMention = null;
double canonicalMentionScore = Double.NEGATIVE_INFINITY;
Set<CoreLabel> tokensToMark = new HashSet<>();
List<CorefChain.CorefMention> mentions = chain.getMentionsInTextualOrder();
// Iterate over mentions
for (int i = 0; i < mentions.size(); ++i) {
// Get some data on this mention
Pair<List<CoreLabel>, Double> info = grokCorefMention(annotation, mentions.get(i));
// Figure out if it should be the canonical mention
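        // (the score prefers NER-rich mentions, mentions later in the document, and the representative mention)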
double score = info.second + ((double) i) / ((double) mentions.size()) + (mentions.get(i) == chain.getRepresentativeMention() ? 1.0 : 0.0);
if (canonicalMention == null || score > canonicalMentionScore) {
canonicalMention = info.first;
canonicalMentionScore = score;
}
// Register the participating tokens
if (info.first.size() == 1) { // Only mark single-node tokens!
tokensToMark.addAll(info.first);
}
}
// Mark the tokens as coreferent
assert canonicalMention != null;
for (CoreLabel token : tokensToMark) {
List<CoreLabel> existingMention = canonicalMentionMap.get(token);
if (existingMention == null || existingMention.isEmpty() ||
"O".equals(existingMention.get(0).ner())) { // Don't clobber existing good mentions
canonicalMentionMap.put(token, canonicalMention);
}
}
}
}
// Annotate each sentence
annotation.get(CoreAnnotations.SentencesAnnotation.class).forEach(x -> this.annotateSentence(x, canonicalMentionMap));
}
/** {@inheritDoc} */
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
NaturalLogicAnnotations.RelationTriplesAnnotation.class,
NaturalLogicAnnotations.EntailedSentencesAnnotation.class
)));
}
/** {@inheritDoc} */
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.SentenceIndexAnnotation.class,
CoreAnnotations.PartOfSpeechAnnotation.class,
CoreAnnotations.LemmaAnnotation.class,
NaturalLogicAnnotations.PolarityAnnotation.class,
SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class
//CoreAnnotations.OriginalTextAnnotation.class
));
if (resolveCoref) {
requirements.add(edu.stanford.nlp.coref.CorefCoreAnnotations.CorefChainAnnotation.class);
}
return Collections.unmodifiableSet(requirements);
}
/**
* A utility to get useful information out of a CorefMention. In particular, it returns the CoreLabels which are
* associated with this mention, and it returns a score for how much we think this mention should be the canonical
* mention.
*
* @param doc The document this mention is referenced into.
* @param mention The mention itself.
* @return A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.
*/
private static Pair<List<CoreLabel>, Double> grokCorefMention(Annotation doc, CorefChain.CorefMention mention) {
List<CoreLabel> tokens = doc.get(CoreAnnotations.SentencesAnnotation.class).get(mention.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class);
List<CoreLabel> mentionAsTokens = tokens.subList(mention.startIndex - 1, mention.endIndex - 1);
// Try to assess this mention's NER type
Counter<String> nerVotes = new ClassicCounter<>();
mentionAsTokens.stream().filter(token -> token.ner() != null && !"O".equals(token.ner())).forEach(token -> nerVotes.incrementCount(token.ner()));
String ner = Counters.argmax(nerVotes, (o1, o2) -> o1 == null ? 0 : o1.compareTo(o2));
double nerCount = nerVotes.getCount(ner);
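    // Score grows with the squared count of majority-NER tokens, normalized by mention length;
    // this favors mentions that are short and consistently tagged with a real NER class.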
double nerScore = nerCount * nerCount / ((double) mentionAsTokens.size());
// Return
return Pair.makePair(mentionAsTokens, nerScore);
}
/**
* Prints an OpenIE triple to a String, according to the output format requested in
* the annotator.
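 * For example, the OLLIE format renders a triple as
 * {@code confidence: (subject; relation; object)}.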
*
* @param extraction The triple to write.
* @param docid The document ID (for the ReVerb format)
* @param sentence The sentence the triple was extracted from (for the ReVerb format)
*
* @return A String representation of the triple.
*/
public static String tripleToString(RelationTriple extraction, String docid, CoreMap sentence) {
switch (FORMAT) {
case REVERB:
return extraction.toReverbString(docid, sentence);
case OLLIE:
return extraction.confidenceGloss() + ": (" + extraction.subjectGloss() + "; " + extraction.relationGloss() + "; " + extraction.objectGloss() + ")";
case DEFAULT:
return extraction.toString();
case QA_SRL:
return extraction.toQaSrlString(sentence);
default:
throw new IllegalStateException("Format is not implemented: " + FORMAT);
}
}
/**
* Process a single file or line of standard in.
* @param pipeline The annotation pipeline to run the lines of the input through.
* @param docid The docid of the document we are extracting.
* @param document the document to annotate.
*/
@SuppressWarnings("SynchronizeOnNonFinalField")
private static void processDocument(AnnotationPipeline pipeline, String docid, String document) {
// Error checks
if (document.trim().equals("")) {
return;
}
// Annotate the document
Annotation ann = new Annotation(document);
pipeline.annotate(ann);
// Get the extractions
boolean empty = true;
synchronized (OUTPUT) {
for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
for (RelationTriple extraction : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
// Print the extractions
OUTPUT.println(tripleToString(extraction, docid, sentence));
empty = false;
}
}
}
if (empty) {
log.info("No extractions in: " + ("stdin".equals(docid) ? document : docid));
}
}
/**
* An entry method for annotating standard in with OpenIE extractions.
*/
public static void main(String[] args) throws IOException, InterruptedException {
// Parse the arguments
Properties props = StringUtils.argsToProperties(args, new HashMap<String, Integer>(){{
put("openie.resolve_coref", 0);
put("resolve_coref", 0);
put("openie.splitter.nomodel", 0);
put("splitter.nomodel", 0);
put("openie.splitter.disable", 0);
put("splitter.disable", 0);
put("openie.ignore_affinity", 0);
put("splitter.ignore_affinity", 0);
put("openie.triple.strict", 0);
put("splitter.triple.strict", 0);
put("openie.triple.all_nominals", 0);
put("splitter.triple.all_nominals", 0);
}});
ArgumentParser.fillOptions(new Class[]{OpenIE.class, ArgumentParser.class}, props);
AtomicInteger exceptionCount = new AtomicInteger(0);
ExecutorService exec = Executors.newFixedThreadPool(ArgumentParser.threads);
// Parse the files to process
String[] filesToProcess;
if (FILELIST != null) {
filesToProcess = IOUtils.linesFromFile(FILELIST.getPath()).stream()
.map(String::trim)
.map(path -> path.replaceAll("^~", "$HOME"))
.map(path -> new File(path).exists() ? path : StringUtils.expandEnvironmentVariables(path))
.toArray(String[]::new);
} else if (!"".equals(props.getProperty("", ""))) {
filesToProcess = props.getProperty("", "").split("\\s+");
} else {
filesToProcess = new String[0];
}
// Tweak the arguments
if ("".equals(props.getProperty("annotators", ""))) {
if (!"false".equalsIgnoreCase(props.getProperty("resolve_coref", props.getProperty("openie.resolve_coref", "false")))) {
props.setProperty("coref.md.type", "dep"); // so we don't need the `parse` annotator
props.setProperty("coref.mode", "statistical"); // explicitly ask for scoref
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,ner,mention,coref,natlog,openie");
} else {
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
}
}
if ("".equals(props.getProperty("depparse.extradependencies", ""))) {
props.setProperty("depparse.extradependencies", "ref_only_uncollapsed");
}
if ("".equals(props.getProperty("parse.extradependencies", ""))) {
props.setProperty("parse.extradependencies", "ref_only_uncollapsed");
}
if ("".equals(props.getProperty("tokenize.class", ""))) {
props.setProperty("tokenize.class", "PTBTokenizer");
}
if ("".equals(props.getProperty("tokenize.language", ""))) {
props.setProperty("tokenize.language", "en");
}
// Tweak properties for console mode.
// In particular, in this mode we can assume every line of standard in is a new sentence.
if (filesToProcess.length == 0 && "".equals(props.getProperty("ssplit.isOneSentence", ""))) {
props.setProperty("ssplit.isOneSentence", "true");
}
// Some error checks on the arguments
if (!props.getProperty("annotators").toLowerCase().contains("openie")) {
log.error("If you specify custom annotators, you must at least include 'openie'");
System.exit(1);
}
// Copy properties that are missing the 'openie' prefix
new HashSet<>(props.keySet()).stream().filter(key -> !key.toString().startsWith("openie.")).forEach(key -> props.setProperty("openie." + key.toString(), props.getProperty(key.toString())));
// Create the pipeline
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// Run OpenIE
if (filesToProcess.length == 0) {
// Running from stdin; one document per line.
log.info("Processing from stdin. Enter one sentence per line.");
Scanner scanner = new Scanner(System.in);
String line;
try {
line = scanner.nextLine();
} catch (NoSuchElementException e) {
log.info("No lines found on standard in");
return;
}
while (line != null) {
processDocument(pipeline, "stdin", line);
try {
line = scanner.nextLine();
} catch (NoSuchElementException e) {
return;
}
}
} else {
// Running from file parameters.
// Make sure we can read all the files in the queue.
// This will prevent a nasty surprise 10 hours into a running job...
for (String file : filesToProcess) {
if (!new File(file).exists() || !new File(file).canRead()) {
log.error("Cannot read file (or file does not exist: '" + file + "'");
}
}
// Actually process the files.
for (String file : filesToProcess) {
log.info("Processing file: " + file);
if (ArgumentParser.threads > 1) {
// Multi-threaded: submit a job to run
final String fileToSubmit = file;
exec.submit(() -> {
try {
processDocument(pipeline, file, IOUtils.slurpFile(new File(fileToSubmit)));
} catch (Throwable t) {
t.printStackTrace();
exceptionCount.incrementAndGet();
}
});
} else {
// Single-threaded: just run the job
processDocument(pipeline, file, IOUtils.slurpFile(new File(file)));
}
}
}
// Exit
exec.shutdown();
log.info("All files have been queued; awaiting termination...");
exec.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
log.info("DONE processing files. " + exceptionCount.get() + " exceptions encountered.");
System.exit(exceptionCount.get());
}
}