package edu.stanford.nlp.trees;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.*;
import java.util.concurrent.locks.Lock;
import java.util.function.Predicate;
import edu.stanford.nlp.graph.DirectedMultiGraph;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.AbstractCoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.ud.EnhancementOptions;
import edu.stanford.nlp.util.Filters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import static edu.stanford.nlp.trees.GrammaticalRelation.DEPENDENT;
import static edu.stanford.nlp.trees.GrammaticalRelation.ROOT;
/**
* A {@code GrammaticalStructure} stores dependency relations between
* nodes in a tree. A new {@code GrammaticalStructure} is constructed
* from an existing parse tree with the help of {@link
* GrammaticalRelation {@code GrammaticalRelation}}, which
* defines a hierarchy of grammatical relations, along with
* patterns for identifying them in parse trees. The constructor for
* {@code GrammaticalStructure} uses these definitions to
* populate the new {@code GrammaticalStructure} with as many
* labeled grammatical relations as it can. Once constructed, the new
* {@code GrammaticalStructure} can be printed in various
* formats, or interrogated using the interface methods in this
* class. Internally, this uses a representation via a {@code TreeGraphNode},
* that is, a tree with additional labeled
* arcs between nodes, for representing the grammatical relations in a
* parse tree.
*
* @author Bill MacCartney
* @author Galen Andrew (refactoring English-specific stuff)
* @author Ilya Sherman (dependencies)
* @author Daniel Cer
* @see EnglishGrammaticalRelations
* @see GrammaticalRelation
* @see EnglishGrammaticalStructure
*/
public abstract class GrammaticalStructure implements Serializable {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(GrammaticalStructure.class);
/**
 * Debugging switch: {@code true} when the JVM system property
 * {@code GrammaticalStructure} is set (to any value). When enabled,
 * extra diagnostics about relation selection are written to {@code log}.
 */
private static final boolean PRINT_DEBUGGING = System.getProperty("GrammaticalStructure", null) != null;
/**
 * Selects which kinds of extra edges are added on top of the dependency tree.
 * When unsure, pick {@link edu.stanford.nlp.trees.GrammaticalStructure.Extras#NONE}.
 */
public enum Extras {

  /**
   * <p> No additional edges at all. </p>
   * <p>
   * Note: code from 2014 and earlier used a boolean flag for extras; this value
   * plays the role of the old {@code false} flag.
   * </p>
   */
  NONE(false, false, false),

  /**
   * Only the extra reference edges, kept as reference edges (no collapsing).
   */
  REF_ONLY_UNCOLLAPSED(true, false, false),

  /**
   * Only the extra reference edges, collapsed so that they copy the edge type of the referent.
   * For instance, in <i>My dog who eats sausage</i> a "ref" edge from <i>who</i> to <i>dog</i>
   * would be removed and an "nsubj" edge from <i>eats</i> to <i>dog</i> put in its place.
   */
  REF_ONLY_COLLAPSED(true, false, true),

  /**
   * Only extra subject edges; none of the other extra edge types.
   */
  SUBJ_ONLY(false, true, false),

  /**
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#SUBJ_ONLY
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#REF_ONLY_UNCOLLAPSED
   */
  REF_UNCOLLAPSED_AND_SUBJ(true, true, false),

  /**
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#SUBJ_ONLY
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#REF_ONLY_COLLAPSED
   */
  REF_COLLAPSED_AND_SUBJ(true, true, true),

  /**
   * <p>
   * As much extra processing as is available.
   * At present this has the same flags as
   * {@link edu.stanford.nlp.trees.GrammaticalStructure.Extras#REF_COLLAPSED_AND_SUBJ}.
   * </p>
   * <p>
   * Note: code from 2014 and earlier used a boolean flag for extras; this value
   * plays the role of the old {@code true} flag.
   * </p>
   */
  MAXIMAL(true, true, true);

  /** Whether "ref" edges are added. */
  public final boolean doRef;

  /** Whether extra subject edges are added. */
  public final boolean doSubj;

  /** Whether the "ref" edges are collapsed. */
  public final boolean collapseRef;

  /** Stores the three flags of a constant. */
  Extras(boolean withRef, boolean withSubj, boolean withCollapsedRef) {
    doRef = withRef;
    doSubj = withSubj;
    collapseRef = withCollapsedRef;
  }

} // end enum Extras
/**
 * The basic typed dependencies. Assigned once by a constructor;
 * {@code typedDependencies(Extras)} hands out copies of its elements.
 */
protected final List<TypedDependency> typedDependencies;
/**
 * The dependencies returned when extras are requested. In the tree-based
 * constructor this is the basic list plus the extra dependencies; in the
 * list-based constructor it is the very same list object as
 * {@code typedDependencies}.
 */
protected final List<TypedDependency> allTypedDependencies;
/**
 * Filter applied to the word string of a dependent; dependents whose word
 * is rejected are left out of the dependencies.
 */
protected final Predicate<String> puncFilter;
/**
 * Filter applied to the POS tag of a dependent; used together with
 * {@code puncFilter}.
 */
protected final Predicate<String> tagFilter;
/**
 * The root Tree node for this GrammaticalStructure.
 */
protected final TreeGraphNode root;
/**
 * A map from arbitrary integer indices to nodes.
 */
private final Map<Integer, TreeGraphNode> indexMap = Generics.newHashMap();
/**
 * Create a new GrammaticalStructure by analyzing the parse tree and
 * populating the GrammaticalStructure with as many labeled
 * grammatical relation arcs as possible.
 * <p>
 * The tree is copied into a {@code TreeGraphNode}, optionally transformed,
 * indexed and head-percolated. The relations are then matched against it
 * (while holding {@code relationsLock}, if one is given) to build the basic
 * dependencies and, afterwards, the extra dependencies.
 *
 * @param t A Tree to analyze
 * @param relations A set of GrammaticalRelations to consider
 * @param relationsLock A lock held while the relations are being matched against
 *                  the tree; may be null, in which case no locking is done
 * @param transformer A tree transformer to apply to the tree before converting (this argument
 *                  may be null if no transformer is required)
 * @param hf A HeadFinder for analysis; must not be null
 * @param puncFilter A Filter to reject punctuation. To delete punctuation
 *                  dependencies, this filter should return false on
 *                  punctuation word strings, and true otherwise.
 *                  If punctuation dependencies should be kept, you
 *                  should pass in a {@code Filters.<String>acceptFilter()}.
 * @param tagFilter A Filter on the POS tag of a dependent's head word. It is
 *                  applied together with {@code puncFilter}: a dependent is
 *                  kept only if both filters accept it.
 * @throws RuntimeException if the transformer returns something other than a {@code TreeGraphNode}
 * @throws AssertionError if {@code hf} is null
 */
public GrammaticalStructure(Tree t, Collection<GrammaticalRelation> relations,
Lock relationsLock, TreeTransformer transformer,
HeadFinder hf, Predicate<String> puncFilter,
Predicate<String> tagFilter) {
TreeGraphNode treeGraph = new TreeGraphNode(t, (TreeGraphNode) null);
// TODO: create the tree and reuse the leaf labels in one pass,
// avoiding a wasteful copy of the labels.
Trees.setLeafLabels(treeGraph, t.yield());
Trees.setLeafTagsIfUnset(treeGraph);
// the transformer, if any, must hand back a TreeGraphNode, which becomes the root
if (transformer != null) {
Tree transformed = transformer.transformTree(treeGraph);
if (!(transformed instanceof TreeGraphNode)) {
throw new RuntimeException("Transformer did not change TreeGraphNode into another TreeGraphNode: " + transformer);
}
this.root = (TreeGraphNode) transformed;
} else {
this.root = treeGraph;
}
// give every node an index and register it in indexMap
indexNodes(this.root);
// add head word and tag to phrase nodes
if (hf == null) {
throw new AssertionError("Cannot use null HeadFinder");
}
root.percolateHeads(hf);
if (root.value() == null) {
root.setValue("ROOT"); // todo: cdm: it doesn't seem like this line should be here
}
// add dependencies, using heads
this.puncFilter = puncFilter;
this.tagFilter = tagFilter;
// NoPunctFilter puncDepFilter = new NoPunctFilter(puncFilter);
NoPunctTypedDependencyFilter puncTypedDepFilter = new NoPunctTypedDependencyFilter(puncFilter, tagFilter);
// basicGraph receives the edges of the basic dependencies;
// completeGraph receives every matched edge and later feeds the extra dependencies
DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> basicGraph = new DirectedMultiGraph<>();
DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> completeGraph = new DirectedMultiGraph<>();
// analyze the root (and its descendants, recursively)
// the lock is taken before the try so that the finally block only unlocks a lock that was acquired
if (relationsLock != null) {
relationsLock.lock();
}
try {
analyzeNode(root, root, relations, hf, puncFilter, tagFilter, basicGraph, completeGraph);
}
finally {
if (relationsLock != null) {
relationsLock.unlock();
}
}
// nodes that ended up unconnected get a generic edge from their parent
attachStrandedNodes(root, root, false, puncFilter, tagFilter, basicGraph);
// add typed dependencies
typedDependencies = getDeps(puncTypedDepFilter, basicGraph);
// start from a copy of the basic dependencies, then add the extras to the copy
allTypedDependencies = Generics.newArrayList(typedDependencies);
getExtraDeps(allTypedDependencies, puncTypedDepFilter, completeGraph);
}
/**
 * Gives an integer index to every node of the subtree rooted at
 * {@code tree}. Leaves are handled first, left to right, with
 * numbering starting at 1; the remaining nodes are then numbered
 * in pre-order, continuing from the value returned by the leaf pass.
 */
private void indexNodes(TreeGraphNode tree) {
  int firstFreeIndex = indexLeaves(tree, 1);
  indexNodes(tree, firstFreeIndex);
}
/**
 * Indexes the leaves below (or at) {@code tree} from left to right and
 * registers each of them in the index map. A leaf whose index is already
 * non-negative keeps it, and counting continues from that index; any other
 * leaf receives the running index.
 *
 * @param tree the subtree whose leaves are indexed
 * @param startIndex the index to give to the first unindexed leaf
 * @return the next index still unassigned
 */
private int indexLeaves(TreeGraphNode tree, int startIndex) {
  if (!tree.isLeaf()) {
    int next = startIndex;
    for (TreeGraphNode child : tree.children) {
      next = indexLeaves(child, next);
    }
    return next;
  }

  int leafIndex = tree.index();
  if (leafIndex < 0) {
    // not indexed yet: take the running index
    leafIndex = startIndex;
    tree.setIndex(leafIndex);
  }
  addNodeToIndexMap(leafIndex, tree);
  return leafIndex + 1;
}
/**
 * Walks the subtree rooted at {@code tree} in pre-order and gives the
 * running index to every node whose index is still negative, registering
 * it in the index map. Nodes that already carry an index are skipped,
 * which is what allows the leaves to be indexed in an earlier pass.
 *
 * @param tree the subtree to index
 * @param startIndex the first index to hand out
 * @return the next index still unassigned
 */
private int indexNodes(TreeGraphNode tree, int startIndex) {
  int next = startIndex;
  if (tree.index() < 0) { // this node has no index yet
    addNodeToIndexMap(next, tree);
    tree.setIndex(next);
    next++;
  }
  if (tree.isLeaf()) {
    return next;
  }
  for (TreeGraphNode child : tree.children) {
    next = indexNodes(child, next);
  }
  return next;
}
/**
 * Records {@code node} under {@code index} in the index map, replacing
 * any node previously stored under the same index.
 *
 * @param index the integer index to register the node under
 * @param node the {@code TreeGraphNode} to be indexed
 */
private void addNodeToIndexMap(int index, TreeGraphNode node) {
  indexMap.put(index, node);
}
/**
 * Looks up the node registered under the given integer index.
 *
 * @param index the integer index of the wanted node
 * @return the {@code TreeGraphNode} stored under that index,
 *         or {@code null} if there is none
 */
private TreeGraphNode getNodeByIndex(int index) {
  TreeGraphNode node = indexMap.get(index);
  return node;
}
/**
 * Gives access to the root node of the tree underlying this GrammaticalStructure.
 *
 * @return the root {@code TreeGraphNode}
 */
public TreeGraphNode root() {
  return this.root;
}
/**
 * Always throws a {@code RuntimeException} reporting that {@code dep}
 * is not in the expected {@code type(arg-idx, arg-idx)} format.
 *
 * @param dep the dependency string that could not be parsed
 */
private static void throwDepFormatException(String dep) {
  String message = String.format("Dependencies should be for the format 'type(arg-idx, arg-idx)'. Could not parse '%s'", dep);
  throw new RuntimeException(message);
}
/**
 * Create a grammatical structure from its string representation.
 *
 * Like buildCoNLLXGrammaticalStructure,
 * this method fakes up the parts of the tree structure that are not
 * used by the grammatical relation transformation operations: every
 * token becomes a word leaf under a POS-tag node, and all POS-tag nodes
 * are attached directly to an artificial ROOT node with index 0.
 *
 * <i>Note:</i> Added by daniel cer
 *
 * @param tokens the words of the sentence, in order
 * @param posTags the POS tags of the words; must have the same size as {@code tokens}
 * @param deps the dependencies, each written as {@code type(word-idx, word-idx)},
 *             where the governor comes first, index 0 denotes ROOT and
 *             index {@code i} denotes the {@code i}-th token (1-based).
 *             Apostrophes in the index part are ignored.
 * @return a GrammaticalStructure holding the given dependencies
 * @throws RuntimeException if the sizes of {@code tokens} and {@code posTags} differ,
 *             or if a dependency string is not in the expected format
 * @throws IndexOutOfBoundsException if a dependency refers to a token index that does not exist
 */
public static GrammaticalStructure fromStringReps(List<String> tokens, List<String> posTags, List<String> deps) {
  if (tokens.size() != posTags.size()) {
    throw new RuntimeException(String.format(
        "tokens.size(): %d != pos.size(): %d%n", tokens.size(), posTags.size()));
  }

  List<TreeGraphNode> tgPOSNodes = new ArrayList<>(tokens.size());

  CoreLabel rootLabel = new CoreLabel();
  rootLabel.setValue("ROOT");
  // one slot per token plus one for the artificial ROOT at position 0
  List<IndexedWord> nodeWords = new ArrayList<>(tokens.size() + 1);
  nodeWords.add(new IndexedWord(rootLabel));

  UniversalSemanticHeadFinder headFinder = new UniversalSemanticHeadFinder();

  Iterator<String> posIter = posTags.iterator();
  for (String wordString : tokens) {
    String posString = posIter.next();
    CoreLabel wordLabel = new CoreLabel();
    wordLabel.setWord(wordString);
    wordLabel.setValue(wordString);
    wordLabel.setTag(posString);
    TreeGraphNode word = new TreeGraphNode(wordLabel);
    CoreLabel tagLabel = new CoreLabel();
    tagLabel.setValue(posString);
    tagLabel.setWord(posString);
    TreeGraphNode pos = new TreeGraphNode(tagLabel);
    tgPOSNodes.add(pos);
    TreeGraphNode[] childArr = {word};
    pos.setChildren(childArr);
    word.setParent(pos);
    pos.percolateHeads(headFinder);
    nodeWords.add(new IndexedWord(wordLabel));
  }

  TreeGraphNode root = new TreeGraphNode(rootLabel);
  root.setChildren(tgPOSNodes.toArray(new TreeGraphNode[tgPOSNodes.size()]));
  root.setIndex(0);

  // Build list of TypedDependencies
  List<TypedDependency> tdeps = new ArrayList<>(deps.size());
  for (String depString : deps) {
    int firstBracket = depString.indexOf('(');
    if (firstBracket == -1) throwDepFormatException(depString);
    String type = depString.substring(0, firstBracket);

    if (depString.charAt(depString.length() - 1) != ')') throwDepFormatException(depString);
    String args = depString.substring(firstBracket + 1, depString.length() - 1);

    int argSep = args.indexOf(", ");
    if (argSep == -1) throwDepFormatException(depString);
    String parentArg = args.substring(0, argSep);
    String childArg = args.substring(argSep + 2);

    // the index is whatever follows the last dash, so words containing dashes still work
    int parentDash = parentArg.lastIndexOf('-');
    if (parentDash == -1) throwDepFormatException(depString);
    int childDash = childArg.lastIndexOf('-');
    if (childDash == -1) throwDepFormatException(depString);

    int parentIdx = Integer.parseInt(parentArg.substring(parentDash + 1).replace("'", ""));
    int childIdx = Integer.parseInt(childArg.substring(childDash + 1).replace("'", ""));

    // fail with a message that names the offending dependency instead of a bare list index
    if (parentIdx < 0 || parentIdx >= nodeWords.size() || childIdx < 0 || childIdx >= nodeWords.size()) {
      throw new IndexOutOfBoundsException(String.format(
          "Dependency '%s' refers to a token index outside the range 0..%d", depString, tokens.size()));
    }

    GrammaticalRelation grel = new GrammaticalRelation(Language.Any, type, null, DEPENDENT);
    TypedDependency tdep = new TypedDependency(grel, nodeWords.get(parentIdx), nodeWords.get(childIdx));
    tdeps.add(tdep);
  }

  // TODO add some elegant way to construct language
  // appropriate GrammaticalStructures (e.g., English, Chinese, etc.)
  return new GrammaticalStructure(tdeps, root) {
    private static final long serialVersionUID = 1L;
  };
}
public GrammaticalStructure(List<TypedDependency> projectiveDependencies, TreeGraphNode root) {
this.root = root;
indexNodes(this.root);
this.puncFilter = Filters.acceptFilter();
this.tagFilter = Filters.acceptFilter();
allTypedDependencies = typedDependencies = new ArrayList<>(projectiveDependencies);
}
/**
 * Renders the pretty-printed tree (minus its first character) followed by
 * a "Typed Dependencies:" line and the list of basic typed dependencies.
 */
@Override
public String toString() {
  String prettyTree = root.toPrettyString(0).substring(1);
  return prettyTree + "Typed Dependencies:\n" + typedDependencies;
}
/**
 * Recursively gives a generic {@code DEPENDENT} edge to nodes that the
 * relation patterns left unconnected in {@code basicGraph}.
 * A non-leaf node is considered when {@code attach} is set and its head
 * word passes both filters; it is then linked from its parent's highest
 * node with the same head, unless that edge already exists or some path
 * between {@code root} and the node is found.
 *
 * @param t the node currently being visited
 * @param root the root of the whole tree
 * @param attach whether {@code t} is a candidate for attachment; for children
 *               this is true when the child's head word node differs from its parent's
 * @param puncFilter filter on the head word's value
 * @param tagFilter filter on the head word's tag
 * @param basicGraph the graph of basic dependencies, which may gain edges
 */
private static void attachStrandedNodes(TreeGraphNode t, TreeGraphNode root, boolean attach, Predicate<String> puncFilter, Predicate<String> tagFilter, DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> basicGraph) {
  if (t.isLeaf()) {
    return;
  }

  if (attach) {
    TreeGraphNode head = t.headWordNode();
    boolean keptByFilters = puncFilter.test(head.label().value()) && tagFilter.test(head.label().tag());
    if (keptByFilters) {
      // The cheap check for a link from the parent comes first.
      // The path search has to consider all directions, because
      // sometimes edges are created from lower nodes to nodes higher up.
      TreeGraphNode parent = t.parent().highestNodeWithSameHead();
      boolean linkedFromParent = basicGraph.isEdge(parent, t);
      if (!linkedFromParent && basicGraph.getShortestPath(root, t, false) == null) {
        basicGraph.add(parent, t, GrammaticalRelation.DEPENDENT);
      }
    }
  }

  for (TreeGraphNode child : t.children()) {
    boolean childHasOwnHead = child.headWordNode() != t.headWordNode();
    attachStrandedNodes(child, root, childHasOwnHead, puncFilter, tagFilter, basicGraph);
  }
}
/**
 * Matches every applicable relation against the phrasal node {@code t},
 * records the resulting edges, and then does the same for all descendants.
 * Each edge runs between the highest nodes sharing the head of the governor
 * and of the related node. Every edge whose target passes both filters goes
 * into {@code completeGraph}; it also goes into {@code basicGraph} if the
 * target has no parent there other than this governor and no path leads
 * from the target back to the governor.
 * <p>
 * cdm dec 2009: leaves and preterminals are skipped right away, since they
 * shouldn't match for GR parent patterns. Should speed it up.
 */
private static void analyzeNode(TreeGraphNode t, TreeGraphNode root, Collection<GrammaticalRelation> relations, HeadFinder hf, Predicate<String> puncFilter, Predicate<String> tagFilter, DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> basicGraph, DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> completeGraph) {
  if (!t.isPhrasal()) { // don't do leaves or preterminals!
    return;
  }

  TreeGraphNode tHigh = t.highestNodeWithSameHead();
  for (GrammaticalRelation egr : relations) {
    if (!egr.isApplicable(t)) {
      continue;
    }
    for (TreeGraphNode u : egr.getRelatedNodes(t, root, hf)) {
      TreeGraphNode uHigh = u.highestNodeWithSameHead();
      if (uHigh == tHigh) {
        continue;
      }
      boolean keptByFilters = puncFilter.test(uHigh.headWordNode().label().value())
          && tagFilter.test(uHigh.headWordNode().label().tag());
      if (!keptByFilters) {
        continue;
      }
      completeGraph.add(tHigh, uHigh, egr);

      // If there are two patterns that add dependencies, X --> Z and Y --> Z, and X dominates Y,
      // then the dependency Y --> Z is not added to the basic graph to prevent unwanted duplication.
      // Similarly, if there is already a path from X --> Y, and an expression would trigger
      // Y --> X somehow, we ignore that.
      Set<TreeGraphNode> parents = basicGraph.getParents(uHigh);
      boolean noOtherGovernor = parents == null || parents.isEmpty() || parents.contains(tHigh);
      if (noOtherGovernor && basicGraph.getShortestPath(uHigh, tHigh, true) == null) {
        basicGraph.add(tHigh, uHigh, egr);
      }
    }
  }

  // now recurse into children
  for (TreeGraphNode child : t.children()) {
    analyzeNode(child, root, relations, hf, puncFilter, tagFilter, basicGraph, completeGraph);
  }
}
/**
 * Extends {@code deps} in place with the extra dependencies and sorts it.
 * First come the extras derived from the existing dependencies
 * ({@code getExtras}), then those read off {@code completeGraph}
 * ({@code getTreeDeps}, restricted by {@code extraTreeDepFilter()}).
 */
private void getExtraDeps(List<TypedDependency> deps, Predicate<TypedDependency> puncTypedDepFilter, DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> completeGraph) {
  getExtras(deps);

  // adds stuff to deps based on the tregex patterns over the tree
  Predicate<TypedDependency> treeDepFilter = extraTreeDepFilter();
  this.getTreeDeps(deps, completeGraph, puncTypedDepFilter, treeDepFilter);

  Collections.sort(deps);
}
/**
 * Helps the constructor build the sorted list of basic typed dependencies
 * from the edges collected in {@code basicGraph}.
 * <p>
 * Every (governor, dependent) pair of the graph becomes one
 * {@code TypedDependency} between the two head words, labeled with the
 * relation chosen by {@code getGrammaticalRelationCommonAncestor} from the
 * edges between them. A ROOT dependency is then added from an artificial
 * ROOT word (index 0) to the head word of the tree or, if the tree has no
 * head word, to its first leaf.
 * <p>
 * If that ROOT dependency is rejected by {@code puncTypedDepFilter} (the
 * root is a punctuation character), the first dependent of the rejected
 * word, in sort order, is promoted: it becomes the dependent of the
 * artificial ROOT word with relation ROOT, and the other dependents of the
 * rejected word are re-attached to it.
 *
 * @param puncTypedDepFilter filter deciding whether the ROOT dependency may be kept
 * @param basicGraph the graph of basic dependency edges
 * @return the post-processed, sorted list of basic dependencies
 */
private List<TypedDependency> getDeps(Predicate<TypedDependency> puncTypedDepFilter, DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> basicGraph) {
  List<TypedDependency> basicDep = Generics.newArrayList();

  for (TreeGraphNode gov : basicGraph.getAllVertices()) {
    for (TreeGraphNode dep : basicGraph.getChildren(gov)) {
      GrammaticalRelation reln = getGrammaticalRelationCommonAncestor(gov.headWordNode().label(), gov.label(), dep.headWordNode().label(), dep.label(), basicGraph.getEdges(gov, dep));
      basicDep.add(new TypedDependency(reln, new IndexedWord(gov.headWordNode().label()), new IndexedWord(dep.headWordNode().label())));
    }
  }

  // add the root
  TreeGraphNode dependencyRoot = new TreeGraphNode(new Word("ROOT"));
  dependencyRoot.setIndex(0);
  TreeGraphNode rootDep = root().headWordNode();
  if (rootDep == null) {
    // no head word on the tree root: fall back to the first leaf
    List<Tree> leaves = Trees.leaves(root());
    if (leaves.size() > 0) {
      Tree leaf = leaves.get(0);
      if (!(leaf instanceof TreeGraphNode)) {
        throw new AssertionError("Leaves should be TreeGraphNodes");
      }
      rootDep = (TreeGraphNode) leaf;
      if (rootDep.headWordNode() != null) {
        rootDep = rootDep.headWordNode();
      }
    }
  }

  if (rootDep != null) {
    TypedDependency rootTypedDep = new TypedDependency(ROOT, new IndexedWord(dependencyRoot.label()), new IndexedWord(rootDep.label()));
    if (puncTypedDepFilter.test(rootTypedDep)) {
      basicDep.add(rootTypedDep);
    } else { // Root is a punctuation character
      /* Heuristic to find a root for the graph.
       * Make the first child of the current root the
       * new root and attach all other children to
       * the new root.
       */
      IndexedWord rejectedRoot = rootTypedDep.dep();
      IndexedWord newRoot = null;
      Collections.sort(basicDep);
      for (TypedDependency td : basicDep) {
        if (td.gov().equals(rejectedRoot)) {
          if (newRoot != null) {
            td.setGov(newRoot);
          } else {
            // The promoted word hangs off the artificial ROOT word, exactly
            // like a regular root dependency; previously the governor was
            // re-set to itself, leaving the rejected word in place.
            td.setGov(rootTypedDep.gov());
            td.setReln(ROOT);
            newRoot = td.dep();
          }
        }
      }
    }
  }

  postProcessDependencies(basicDep);

  Collections.sort(basicDep);

  return basicDep;
}
/**
 * Supplies the filter that decides which tree-based dependencies are
 * useful as extras. The default accepts everything; subclasses may
 * override it. English, for example, uses the REL dependency only as
 * an intermediate and does not want it added during the second pass
 * over the trees for missing dependencies.
 *
 * @return the filter applied to candidate extra dependencies
 */
protected Predicate<TypedDependency> extraTreeDepFilter() {
  Predicate<TypedDependency> acceptEverything = Filters.acceptFilter();
  return acceptEverything;
}
/**
 * Hook for language-specific post-processing of the basic dependencies.
 * English, for example, might replace "rel" dependencies with either
 * dobj or pobj depending on the surrounding dependencies.
 * The default implementation leaves the list untouched.
 *
 * @param basicDep the basic dependencies, which an override may modify in place
 */
protected void postProcessDependencies(List<TypedDependency> basicDep) {
  // intentionally empty: no post processing by default
}
/**
 * Hook for adding extra dependencies that can be derived from the
 * existing dependency structure alone, without looking at the tree.
 * The English xsubj dependency, for example, can be extracted that way.
 * The default implementation adds nothing.
 *
 * @param basicDep the dependencies found so far, to which an override may add
 */
protected void getExtras(List<TypedDependency> basicDep) {
  // intentionally empty: no extra dependencies by default
}
/**
 * Goes through the edges of {@code completeGraph} and appends to
 * {@code deps} every resulting dependency that is not in the list yet
 * and passes both filters. Dependencies added here are marked as extra.
 * For each governor/dependent pair only the relations returned by
 * {@code removeGrammaticalRelationAncestors} are considered.
 *
 * @param deps The list of dependencies which may be augmented
 * @param completeGraph a graph of all the tree dependencies found earlier
 * @param puncTypedDepFilter The filter that may skip punctuation dependencies
 * @param extraTreeDepFilter Additional dependencies are added only if they pass this filter
 */
protected void getTreeDeps(List<TypedDependency> deps,
    DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> completeGraph,
    Predicate<TypedDependency> puncTypedDepFilter,
    Predicate<TypedDependency> extraTreeDepFilter) {
  for (TreeGraphNode gov : completeGraph.getAllVertices()) {
    for (TreeGraphNode dep : completeGraph.getChildren(gov)) {
      List<GrammaticalRelation> relations = removeGrammaticalRelationAncestors(completeGraph.getEdges(gov, dep));
      for (GrammaticalRelation rel : relations) {
        TypedDependency candidate = new TypedDependency(rel, new IndexedWord(gov.headWordNode().label()), new IndexedWord(dep.headWordNode().label()));
        if (deps.contains(candidate)) {
          continue;
        }
        if (!puncTypedDepFilter.test(candidate)) {
          continue;
        }
        if (!extraTreeDepFilter.test(candidate)) {
          continue;
        }
        candidate.setExtra();
        deps.add(candidate);
      }
    }
  }
}
/**
 * Accepts a dependency when it and its dependent are non-null and the
 * wrapped string filter accepts the dependent's value.
 */
private static class NoPunctFilter implements Predicate<Dependency<Label, Label, Object>>, Serializable {

  // Automatically generated by Eclipse
  private static final long serialVersionUID = -2319891944796663180L;

  private Predicate<String> npf;

  NoPunctFilter(Predicate<String> f) {
    this.npf = f;
  }

  @Override
  public boolean test(Dependency<Label, Label, Object> d) {
    Label dependent = (d == null) ? null : d.dependent();
    return dependent != null && npf.test(dependent.value());
  }

} // end static class NoPunctFilter
/**
 * Accepts a typed dependency when it and its dependent are non-null, the
 * word filter accepts the dependent's value and the tag filter accepts
 * the dependent's tag.
 */
private static class NoPunctTypedDependencyFilter implements Predicate<TypedDependency>, Serializable {

  private static final long serialVersionUID = -2872766864289207468L;

  private Predicate<String> npf;
  private Predicate<String> tf;

  NoPunctTypedDependencyFilter(Predicate<String> f, Predicate<String> tf) {
    this.npf = f;
    this.tf = tf;
  }

  @Override
  public boolean test(TypedDependency d) {
    IndexedWord dependent = (d == null) ? null : d.dep();
    if (dependent == null) {
      return false;
    }
    return npf.test(dependent.value()) && tf.test(dependent.tag());
  }

} // end static class NoPunctTypedDependencyFilter
/**
 * Get the GrammaticalRelation between the nodes registered under the
 * two given indices.
 *
 * @param govIndex index of the governor node
 * @param depIndex index of the dependent node
 * @return {@code null} if either index does not correspond to a node of
 *         this structure (previously this case ended in a
 *         NullPointerException); otherwise the result of
 *         {@link #getGrammaticalRelation(IndexedWord, IndexedWord)} for the two nodes
 */
public GrammaticalRelation getGrammaticalRelation(int govIndex, int depIndex) {
  TreeGraphNode gov = getNodeByIndex(govIndex);
  TreeGraphNode dep = getNodeByIndex(depIndex);
  if (gov == null || dep == null) {
    // an unknown index cannot take part in any dependency
    return null;
  }
  // TODO: this is pretty ugly
  return getGrammaticalRelation(new IndexedWord(gov.label()), new IndexedWord(dep.label()));
}
/**
 * Get the GrammaticalRelation between gov and dep. All dependencies
 * (extras included) whose governor equals {@code gov} and whose dependent
 * equals {@code dep} are collected, and their relations are reduced to a
 * single one by {@code getGrammaticalRelationCommonAncestor}. If no such
 * dependency exists, that reduction starts and ends at the generic
 * {@code DEPENDENT} relation.
 *
 * @param gov the governor word
 * @param dep the dependent word
 * @return the relation selected for the pair
 */
public GrammaticalRelation getGrammaticalRelation(IndexedWord gov, IndexedWord dep) {
  List<GrammaticalRelation> matchingRelations = Generics.newArrayList();
  for (TypedDependency candidate : typedDependencies(Extras.MAXIMAL)) {
    if (!candidate.gov().equals(gov)) {
      continue;
    }
    if (candidate.dep().equals(dep)) {
      matchingRelations.add(candidate.reln());
    }
  }
  return getGrammaticalRelationCommonAncestor(gov, gov, dep, dep, matchingRelations);
}
/**
 * Reduces a list of relations to a single one. Starting from the generic
 * {@code DEPENDENT} relation, the relations are visited in the order given
 * by {@code NameComparator} (when there is more than one), and the current
 * choice is replaced whenever it is an ancestor of the visited relation.
 * The Labels are passed in only for debugging output: gov &amp; dep are the
 * labels with the text, govH and depH can be higher labels in the tree
 * which represent the category.
 *
 * @return the relation selected from {@code labels}, or {@code DEPENDENT}
 *         if none of them replaced it
 */
private static GrammaticalRelation getGrammaticalRelationCommonAncestor(AbstractCoreLabel gov, AbstractCoreLabel govH, AbstractCoreLabel dep, AbstractCoreLabel depH, List<GrammaticalRelation> labels) {
  // zero or one label needs no sorting; otherwise sort a copy, leaving the caller's list alone
  List<GrammaticalRelation> sortedLabels = labels;
  if (labels.size() > 1) {
    sortedLabels = new ArrayList<>(labels);
    Collections.sort(sortedLabels, new NameComparator<>());
  }

  GrammaticalRelation reln = GrammaticalRelation.DEPENDENT;
  for (GrammaticalRelation candidate : sortedLabels) {
    if (reln.isAncestor(candidate)) {
      reln = candidate;
    } else if (PRINT_DEBUGGING && ! candidate.isAncestor(reln)) {
      // the two relations are unrelated in the hierarchy: report the clash
      log.info("@@@\t" + reln + "\t" + candidate + "\t" +
          govH.get(CoreAnnotations.ValueAnnotation.class) + "\t" + depH.get(CoreAnnotations.ValueAnnotation.class));
    }
  }

  if (PRINT_DEBUGGING && reln.equals(GrammaticalRelation.DEPENDENT)) {
    // nothing more specific than DEPENDENT was found: report the pair
    log.info("### dep\t" + govH.get(CoreAnnotations.ValueAnnotation.class) + "\t" + gov.tag() + "\t" + gov.value() +
        "\t" + depH.get(CoreAnnotations.ValueAnnotation.class) + "\t" + dep.tag() + "\t" + dep.value() + "\t");
  }
  return reln;
}
/**
 * Builds a new list from {@code original} in which a relation is dropped
 * whenever {@code isAncestor} reports it as an ancestor of another
 * relation in the result. The input list is not modified.
 *
 * @param original the relations to filter
 * @return the filtered relations, in a new list
 */
private static List<GrammaticalRelation> removeGrammaticalRelationAncestors(List<GrammaticalRelation> original) {
  List<GrammaticalRelation> filtered = Generics.newArrayList();
  for (GrammaticalRelation reln : original) {
    boolean descendantFound = false;
    Iterator<GrammaticalRelation> kept = filtered.iterator();
    while (kept.hasNext()) {
      GrammaticalRelation gr = kept.next();
      if (gr.isAncestor(reln)) {
        // an already kept relation is an ancestor of the current one:
        // drop it, the current relation takes its place below
        kept.remove();
      } else if (reln.isAncestor(gr)) {
        // the current relation is an ancestor of a kept one, so it must not be added
        descendantFound = true;
      }
    }
    if (!descendantFound) {
      filtered.add(reln);
    }
  }
  return filtered;
}
/**
 * Returns the basic typed dependencies of this grammatical structure,
 * i.e. the word-level dependencies without any extras. Each word depends
 * on one other thing, either a word or the starting ROOT, and the
 * dependencies have a tree structure. This corresponds to the
 * command-line option "basicDependencies".
 *
 * @return The typed dependencies of this grammatical structure
 */
public Collection<TypedDependency> typedDependencies() {
  Collection<TypedDependency> basic = typedDependencies(Extras.NONE);
  return basic;
}
/**
 * Returns all the typed dependencies of this grammatical structure:
 * the basic (uncollapsed) dependencies, possibly with extra arcs
 * for control relationships, etc. This corresponds to the
 * "nonCollapsed" option.
 *
 * @return the typed dependencies, requested with {@code Extras.MAXIMAL}
 */
public Collection<TypedDependency> allTypedDependencies() {
  Collection<TypedDependency> withExtras = typedDependencies(Extras.MAXIMAL);
  return withExtras;
}
/**
 * Returns the typed dependencies of this grammatical structure. These
 * are non-collapsed dependencies (basic or nonCollapsed). The result is
 * a fresh list of fresh {@code TypedDependency} copies, passed through
 * {@code correctDependencies} before being returned.
 *
 * @param includeExtras {@code Extras.NONE} selects the basic dependencies;
 *     any other value selects the list that may include "extras" and
 *     does not follow a tree structure.
 * @return The typed dependencies of this grammatical structure
 */
public List<TypedDependency> typedDependencies(Extras includeExtras) {
  List<TypedDependency> source = (includeExtras == Extras.NONE) ? typedDependencies : allTypedDependencies;

  // The element-wise copy is required because of the broken way
  // TypedDependency objects can be mutated by downstream methods
  // such as collapseDependencies. Without it, two consecutive calls
  // to typedDependenciesCollapsed could give different results. For
  // example, the English dependencies rename existing objects KILL
  // to note that they should be removed.
  List<TypedDependency> deps = new ArrayList<>(source.size());
  for (TypedDependency stored : source) {
    deps.add(new TypedDependency(stored));
  }

  //TODO (sebschu): prevent correctDependencies from getting called multiple times
  correctDependencies(deps);
  return deps;
}
/**
 * Boolean-flag variant: {@code true} maps to {@code Extras.MAXIMAL},
 * {@code false} to {@code Extras.NONE}.
 *
 * @deprecated use {@link #typedDependencies(Extras)} instead
 * @see edu.stanford.nlp.trees.GrammaticalStructure#typedDependencies(edu.stanford.nlp.trees.GrammaticalStructure.Extras)
 */
@Deprecated
public List<TypedDependency> typedDependencies(boolean includeExtras) {
  if (includeExtras) {
    return typedDependencies(Extras.MAXIMAL);
  }
  return typedDependencies(Extras.NONE);
}
/**
 * Get the typed dependencies after collapsing them, without extras.
 * Collapsing dependencies refers to turning certain function words
 * such as prepositions and conjunctions into arcs, so they disappear from
 * the set of nodes.
 * There is no guarantee that the dependencies are a tree. While the
 * dependencies are normally tree-like, the collapsing may introduce
 * not only re-entrancies but even small cycles.
 *
 * @return A set of collapsed dependencies
 */
public Collection<TypedDependency> typedDependenciesCollapsed() {
  Collection<TypedDependency> collapsed = typedDependenciesCollapsed(Extras.NONE);
  return collapsed;
}
// todo [cdm 2012]: The semantics of this method is the opposite of the others.
// The other no argument methods correspond to includeExtras being
// true, but for this one it is false. This should probably be made uniform.
/**
 * Get the typed dependencies after mostly collapsing them, while keeping
 * a tree structure. To achieve that, the code does:
 * <ol>
 * <li> no relative clause processing
 * <li> no xsubj relations
 * <li> no propagation of conjuncts
 * </ol>
 * This corresponds to the "tree" option. The basic dependencies (no
 * extras) are the starting point and are handed to
 * {@code collapseDependenciesTree}.
 *
 * @return collapsed dependencies keeping a tree structure
 */
public Collection<TypedDependency> typedDependenciesCollapsedTree() {
  List<TypedDependency> basicCopy = typedDependencies(Extras.NONE);
  collapseDependenciesTree(basicCopy);
  return basicCopy;
}
/**
 * Get the typed dependencies after collapsing them (without CC processing).
 * The "collapsed" option corresponds to calling this method with
 * {@code Extras.MAXIMAL}.
 *
 * @param includeExtras which extras, like controlling subjects, the
 *     list of typed dependencies may include
 * @return collapsed dependencies
 */
public List<TypedDependency> typedDependenciesCollapsed(Extras includeExtras) {
  List<TypedDependency> collapsed = typedDependencies(includeExtras);
  collapseDependencies(collapsed, false, includeExtras);
  return collapsed;
}
/**
 * Boolean-flag variant: {@code true} maps to {@code Extras.MAXIMAL},
 * {@code false} to {@code Extras.NONE}.
 *
 * @deprecated use {@link #typedDependenciesCollapsed(Extras)} instead
 * @see edu.stanford.nlp.trees.GrammaticalStructure#typedDependenciesCollapsed(edu.stanford.nlp.trees.GrammaticalStructure.Extras)
 */
@Deprecated
public List<TypedDependency> typedDependenciesCollapsed(boolean includeExtras) {
  if (includeExtras) {
    return typedDependenciesCollapsed(Extras.MAXIMAL);
  }
  return typedDependenciesCollapsed(Extras.NONE);
}
/**
 * Get the typed dependencies after collapsing them and processing eventual
 * CC complements. The effect of the CC processing is to distribute conjoined
 * arguments across relations or conjoined predicates across their arguments.
 * This is generally useful, and we generally recommend using the output of
 * this method with extras included.
 * The "CCPropagated" option corresponds to calling this method with
 * {@code Extras.MAXIMAL}.
 *
 * @param includeExtras which extras, such as controlled subject links,
 *     the list of typed dependencies may include
 * @return collapsed dependencies with CC processed
 */
public List<TypedDependency> typedDependenciesCCprocessed(Extras includeExtras) {
  List<TypedDependency> processed = typedDependencies(includeExtras);
  collapseDependencies(processed, true, includeExtras);
  return processed;
}
/**
 * Boolean-flag variant: {@code true} maps to {@code Extras.MAXIMAL},
 * {@code false} to {@code Extras.NONE}.
 *
 * @deprecated use {@link #typedDependenciesCCprocessed(Extras)} instead
 * @see edu.stanford.nlp.trees.GrammaticalStructure#typedDependenciesCCprocessed(edu.stanford.nlp.trees.GrammaticalStructure.Extras)
 */
@Deprecated
public List<TypedDependency> typedDependenciesCCprocessed(boolean includeExtras) {
  if (includeExtras) {
    return typedDependenciesCCprocessed(Extras.MAXIMAL);
  }
  return typedDependenciesCCprocessed(Extras.NONE);
}
/**
 * Returns the typed dependencies, requested with {@code Extras.MAXIMAL},
 * after passing them through {@code addEnhancements} with
 * {@code UniversalEnglishGrammaticalStructure.ENHANCED_OPTIONS}.
 * <p>
 * The {@code addEnhancements} implementation in this class does nothing,
 * so the result only differs from the plain dependencies when a subclass
 * overrides that hook.
 *
 * @return The (possibly enhanced) list of typed dependencies
 */
public List<TypedDependency> typedDependenciesEnhanced() {
  final List<TypedDependency> dependencies = typedDependencies(Extras.MAXIMAL);
  // The hook mutates the list in place.
  addEnhancements(dependencies, UniversalEnglishGrammaticalStructure.ENHANCED_OPTIONS);
  return dependencies;
}
/**
 * Returns the typed dependencies, requested with {@code Extras.MAXIMAL},
 * after passing them through {@code addEnhancements} with
 * {@code UniversalEnglishGrammaticalStructure.ENHANCED_PLUS_PLUS_OPTIONS}.
 * <p>
 * The {@code addEnhancements} implementation in this class does nothing,
 * so the result only differs from the plain dependencies when a subclass
 * overrides that hook.
 *
 * @return The (possibly enhanced) list of typed dependencies
 */
public List<TypedDependency> typedDependenciesEnhancedPlusPlus() {
  final List<TypedDependency> dependencies = typedDependencies(Extras.MAXIMAL);
  // The hook mutates the list in place.
  addEnhancements(dependencies, UniversalEnglishGrammaticalStructure.ENHANCED_PLUS_PLUS_OPTIONS);
  return dependencies;
}
/**
 * Returns a list of the typed dependencies, including extras like control
 * dependencies, collapsing them and distributing relations across
 * coordination. This method is generally recommended for best
 * representing the semantic and syntactic relations of a sentence. In
 * general the result is a directed graph (i.e., the output may not be a
 * tree and it may contain (small) cycles).
 * The "CCPropagated" option corresponds to calling this method.
 * <p>
 * Equivalent to {@code typedDependenciesCCprocessed(Extras.MAXIMAL)}.
 *
 * @return The collapsed dependencies with CC processed
 */
public List<TypedDependency> typedDependenciesCCprocessed() {
  final List<TypedDependency> ccProcessed = typedDependenciesCCprocessed(Extras.MAXIMAL);
  return ccProcessed;
}
/**
 * Destructively modify the given list of typed dependencies to collapse
 * language-dependent transitive dependencies.
 * <p>
 * Default is no-op; to be over-ridden in subclasses.
 *
 * @param list A list of dependencies to process for possible collapsing
 * @param CCprocess apply CC process?
 * @param includeExtras The extras setting; callers in this class pass the
 *     same value they used when requesting {@code list}
 */
protected void collapseDependencies(List<TypedDependency> list, boolean CCprocess, Extras includeExtras) {
// do nothing as default operation
}
/**
 * Destructively applies different enhancements to the dependency graph.
 * <p>
 * Default is no-op; to be over-ridden in subclasses.
 *
 * @param list A list of dependencies
 * @param options Options that determine which enhancements are applied to the dependency graph.
 */
protected void addEnhancements(List<TypedDependency> list, EnhancementOptions options) {
// do nothing as default operation
}
/**
 * Destructively modify the given list of typed dependencies to collapse
 * language-dependent transitive dependencies but keeping a tree structure.
 * <p>
 * Default is no-op; to be over-ridden in subclasses.
 *
 * @param list A list of dependencies to process for possible collapsing
 */
protected void collapseDependenciesTree(List<TypedDependency> list) {
// do nothing as default operation
}
/**
 * Destructively modify the given list of typed dependencies to correct
 * language-dependent dependencies. (e.g., nsubjpass in a relative clause)
 * <p>
 * Default is no-op; to be over-ridden in subclasses.
 *
 * @param list A list of dependencies to correct in place
 */
protected void correctDependencies(List<TypedDependency> list) {
// do nothing as default operation
}
/**
 * Checks whether all the typed dependencies are connected, judged by the
 * number of roots that {@link #getRoots(Collection)} finds among them.
 *
 * @param list A collection of typed dependencies
 * @return {@code true} if the collection represents a connected graph
 *     (at most one root is found), {@code false} otherwise
 */
public static boolean isConnected(Collection<TypedDependency> list) {
  // There should be no more than one root to have a connected graph.
  // There might be no root at all in the way we look for them when you
  // have a relative clause,
  // ex.: Apple is a society that sells computers
  // (the root "society" will also be the nsubj of "sells")
  final int rootCount = getRoots(list).size();
  return rootCount <= 1;
}
/**
 * Returns the typed dependencies whose governor does not occur as a
 * dependent anywhere in the given collection. For each such governor only
 * the first dependency met in iteration order is returned.
 *
 * @param list The TypedDependencies to check
 * @return The TypedDependencies whose governor is not dependent on any
 *     node from the collection (at most one per governor)
 */
public static Collection<TypedDependency> getRoots(Collection<TypedDependency> list) {
  // Pass 1: collect every word that is the dependent of some relation.
  Collection<IndexedWord> dependents = Generics.newHashSet();
  for (TypedDependency td : list) {
    dependents.add(td.dep());
  }

  // Pass 2: keep the first dependency of each governor that is never a dependent.
  Collection<TypedDependency> rootDependencies = new ArrayList<>();
  Collection<IndexedWord> seenGovernors = Generics.newHashSet();
  for (TypedDependency td : list) {
    IndexedWord governor = td.gov();
    // add() reports whether the governor was new to the set
    boolean firstOccurrence = seenGovernors.add(governor);
    if (firstOccurrence && !dependents.contains(governor)) {
      rootDependencies.add(td);
    }
  }
  return rootDependencies;
}
// Fixed version id for Java serialization (the class declaration is not shown
// here; presumably it implements Serializable). Keep the value stable.
private static final long serialVersionUID = 2286294455343892678L;
/**
 * Orders arbitrary objects by the natural (String) ordering of their
 * {@code toString()} values.
 *
 * @param <X> The type of the objects being compared
 */
private static class NameComparator<X> implements Comparator<X> {

  @Override
  public int compare(X o1, X o2) {
    // toString() is evaluated on o1 first, then on o2
    return o1.toString().compareTo(o2.toString());
  }

}
// Column positions within one tab-separated CoNLL-X token line.
// Note that these field constants are 0-based whereas much documentation is 1-based
/** Column read as the word form of a token. */
public static final int CoNLLX_WordField = 1;
/** Column read as the part-of-speech tag of a token. */
public static final int CoNLLX_POSField = 4;
/** Column read as the index of the token's governor (parsed as an integer). */
public static final int CoNLLX_GovField = 6;
/** Column read as the name of the grammatical relation to the governor. */
public static final int CoNLLX_RelnField = 7;
/** Number of columns every non-empty line must have. */
public static final int CoNLLX_FieldCount = 10;
/**
 * Read in a file containing a CoNLL-X dependency treebank and return a
 * corresponding list of GrammaticalStructures.
 * <p>
 * Sentences are separated by empty lines; surplus empty lines are skipped.
 * A final sentence that is not followed by an empty line is still returned.
 * The underlying reader is always closed, also when an error occurs.
 *
 * @param fileName Where to read from; handed to {@code IOUtils.readerFromString}
 * @param shortNameToGRel Map from relation short names to grammatical relations,
 *     passed through to {@link #buildCoNLLXGrammaticalStructure}
 * @param factory Factory that builds each GrammaticalStructure from its dependencies
 * @return One GrammaticalStructure per sentence, in file order
 * @throws IOException If the input cannot be opened or read
 * @throws RuntimeException If a non-empty line does not have exactly
 *     {@code CoNLLX_FieldCount} tab-separated fields
 */
public static List<GrammaticalStructure> readCoNLLXGrammaticalStructureCollection(String fileName, Map<String, GrammaticalRelation> shortNameToGRel, GrammaticalStructureFromDependenciesFactory factory) throws IOException {
  List<GrammaticalStructure> gsList = new LinkedList<>();
  // try-with-resources: closing the LineNumberReader also closes the wrapped reader
  try (LineNumberReader reader = new LineNumberReader(IOUtils.readerFromString(fileName))) {
    List<List<String>> tokenFields = new ArrayList<>();

    for (String inline = reader.readLine(); inline != null; inline = reader.readLine()) {
      if (inline.isEmpty()) {
        // An empty line ends the current sentence; excess empty lines are skipped.
        if ( ! tokenFields.isEmpty()) {
          gsList.add(buildCoNLLXGrammaticalStructure(tokenFields, shortNameToGRel, factory));
          tokenFields = new ArrayList<>();
        }
        continue;
      }

      // read in a single sentence token by token
      List<String> fields = Arrays.asList(inline.split("\t"));
      if (fields.size() != CoNLLX_FieldCount) {
        throw new RuntimeException(String.format("Error (line %d): %d fields expected but %d are present", reader.getLineNumber(), CoNLLX_FieldCount, fields.size()));
      }
      tokenFields.add(fields);
    }

    // The file may end without a trailing empty line; do not lose the last sentence.
    if ( ! tokenFields.isEmpty()) {
      gsList.add(buildCoNLLXGrammaticalStructure(tokenFields, shortNameToGRel, factory));
    }
  }
  return gsList;
}
/**
 * Builds a GrammaticalStructure from the token lines of one CoNLL-X sentence.
 * <p>
 * For every token the word form ({@code CoNLLX_WordField}), the POS tag
 * ({@code CoNLLX_POSField}), the governor index ({@code CoNLLX_GovField};
 * 1-based, with 0 denoting the artificial root) and the relation name
 * ({@code CoNLLX_RelnField}) are read. Tokens with a null or empty governor
 * field, or with the relation name "null" or "erased", yield no dependency.
 * Relation names are looked up lower-cased (locale-independently) in
 * {@code shortNameToGRel}; an unknown name is only accepted if it is "root".
 * A governor index outside the sentence is reported on stderr and the
 * token is attached to the artificial root instead.
 *
 * @param tokenFields One list of column values per token, in sentence order
 * @param shortNameToGRel Map from lower-case relation short names to relations
 * @param factory Factory that receives the dependencies and the flat tree
 * @return The structure produced by {@code factory.build}
 * @throws NumberFormatException If a non-empty governor field is not an integer
 * @throws RuntimeException If a relation name is neither known nor "root"
 */
public static GrammaticalStructure buildCoNLLXGrammaticalStructure(List<List<String>> tokenFields,
                                                                   Map<String, GrammaticalRelation> shortNameToGRel,
                                                                   GrammaticalStructureFromDependenciesFactory factory) {
  List<IndexedWord> tgWords = new ArrayList<>(tokenFields.size());
  List<TreeGraphNode> tgPOSNodes = new ArrayList<>(tokenFields.size());

  SemanticHeadFinder headFinder = new SemanticHeadFinder();

  // Construct TreeGraphNodes for words and POS tags
  for (List<String> fields : tokenFields) {
    CoreLabel word = new CoreLabel();
    word.setValue(fields.get(CoNLLX_WordField));
    word.setWord(fields.get(CoNLLX_WordField));
    word.setTag(fields.get(CoNLLX_POSField));
    word.setIndex(tgWords.size() + 1); // token indices are 1-based
    CoreLabel pos = new CoreLabel();
    pos.setTag(fields.get(CoNLLX_POSField));
    pos.setValue(fields.get(CoNLLX_POSField));
    TreeGraphNode wordNode = new TreeGraphNode(word);
    TreeGraphNode posNode = new TreeGraphNode(pos);
    tgWords.add(new IndexedWord(word));
    tgPOSNodes.add(posNode);
    TreeGraphNode[] childArr = { wordNode };
    posNode.setChildren(childArr);
    wordNode.setParent(posNode);
    posNode.percolateHeads(headFinder);
  }

  // We fake up the parts of the tree structure that are not
  // actually used by the grammatical relation transformation
  // operations.
  //
  // That is, the constructed TreeGraphs consist of a flat tree,
  // without any phrase bracketing, but that does preserve the
  // parent child relationship between words and their POS tags.
  //
  // e.g. (ROOT (PRP I) (VBD hit) (DT the) (NN ball) (. .))
  TreeGraphNode root =
      new TreeGraphNode(new Word("ROOT-" + (tgPOSNodes.size() + 1)));
  root.setChildren(tgPOSNodes.toArray(new TreeGraphNode[tgPOSNodes.size()]));

  // Build list of TypedDependencies
  List<TypedDependency> tdeps = new ArrayList<>(tgWords.size());

  // Create a node outside the tree useful for root dependencies;
  // we want to keep those if they were stored in the conll file
  CoreLabel rootLabel = new CoreLabel();
  rootLabel.setValue("ROOT");
  rootLabel.setWord("ROOT");
  rootLabel.setIndex(0);
  IndexedWord dependencyRoot = new IndexedWord(rootLabel);

  for (int i = 0; i < tgWords.size(); i++) {
    List<String> fields = tokenFields.get(i);
    String parentIdStr = fields.get(CoNLLX_GovField);
    if (StringUtils.isNullOrEmpty(parentIdStr)) {
      continue;
    }
    // Convert the 1-based governor index to a 0-based list index; -1 means root.
    int parentId = Integer.parseInt(parentIdStr) - 1;
    String grelString = fields.get(CoNLLX_RelnField);
    if (grelString.equals("null") || grelString.equals("erased")) {
      continue;
    }
    // Locale.ROOT keeps the lookup key stable under any default locale
    // (e.g. the Turkish dotless i would otherwise break names containing 'I').
    String grelKey = grelString.toLowerCase(Locale.ROOT);
    GrammaticalRelation grel = shortNameToGRel.get(grelKey);
    TypedDependency tdep;
    if (grel == null) {
      if (grelKey.equals("root")) {
        tdep = new TypedDependency(ROOT, dependencyRoot, tgWords.get(i));
      } else {
        throw new RuntimeException("Unknown grammatical relation '" +
            grelString + "' fields: " +
            tokenFields.get(i) + "\nNode: " +
            tgWords.get(i) + "\n" +
            "Known Grammatical relations: ["+shortNameToGRel.keySet()+"]" );
      }
    } else {
      // Any index outside [-1, size) cannot name a token of this sentence.
      if (parentId < -1 || parentId >= tgWords.size()) {
        System.err.printf("Warning: Invalid Parent Id %d Sentence Length: %d%n", parentId+1, tgWords.size());
        System.err.printf("         Assigning to root (0)%n");
        parentId = -1;
      }
      IndexedWord governor = (parentId == -1) ? dependencyRoot : tgWords.get(parentId);
      tdep = new TypedDependency(grel, governor, tgWords.get(i));
    }
    tdeps.add(tdep);
  }
  return factory.build(tdeps, root);
}
/**
 * Command-line entry point; forwards the arguments to
 * {@code GrammaticalStructureConversionUtils.convertTrees}.
 *
 * @param args The command-line arguments, passed on unchanged
 */
public static void main(String[] args) {
  // Language-specific default properties. The default options produce
  // English Universal dependencies. Every subclass should provide its own
  // main with the appropriate language.
  final String language = "en";
  GrammaticalStructureConversionUtils.convertTrees(args, language);
}
}