package edu.stanford.nlp.trees;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.concurrent.locks.Lock;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.StringLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.trees.GrammaticalRelation.GrammaticalRelationAnnotation;
import edu.stanford.nlp.util.ErasureUtils;
import edu.stanford.nlp.util.Filter;
import edu.stanford.nlp.util.Filters;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.StringUtils;
import static edu.stanford.nlp.trees.GrammaticalRelation.DEPENDENT;
import static edu.stanford.nlp.trees.GrammaticalRelation.GOVERNOR;
import static edu.stanford.nlp.trees.GrammaticalRelation.ROOT;
/**
* A {@code GrammaticalStructure} is a {@link TreeGraph
* <code>TreeGraph</code>} (that is, a tree with additional labeled
* arcs between nodes) for representing the grammatical relations in a
* parse tree. A new <code>GrammaticalStructure</code> is constructed
* from an existing parse tree with the help of {@link
* GrammaticalRelation <code>GrammaticalRelation</code>}, which
* defines a hierarchy of grammatical relations, along with
* patterns for identifying them in parse trees. The constructor for
* <code>GrammaticalStructure</code> uses these definitions to
* populate the new <code>GrammaticalStructure</code> with as many
* labeled grammatical relations as it can. Once constructed, the new
* <code>GrammaticalStructure</code> can be printed in various
* formats, or interrogated using the interface methods in this
* class.
* <p/>
* <b>Caveat emptor!</b> This is a work in progress.
* Nothing in here should be relied upon to function perfectly.
* Feedback welcome.
*
* @author Bill MacCartney
* @author Galen Andrew (refactoring English-specific stuff)
* @author Ilya Sherman (dependencies)
* @author Daniel Cer
* @see EnglishGrammaticalRelations
* @see GrammaticalRelation
* @see EnglishGrammaticalStructure
*/
public abstract class GrammaticalStructure extends TreeGraph {
/** Debug switch: true when the JVM system property "GrammaticalStructure" is set (to any value). */
private static final boolean PRINT_DEBUGGING = System.getProperty("GrammaticalStructure", null) != null;
/** The untyped (governor, dependent) dependencies; assigned exactly once by each constructor. */
protected final Set<Dependency<Label, Label, Object>> dependencies;
/** Typed dependencies without extras; the tree-based constructor fills this via getDeps(false, ...). */
protected final List<TypedDependency> typedDependencies;
/**
* Typed dependencies including extras; the tree-based constructor fills this via getDeps(true, ...).
* The list-based constructor makes this the very same list object as typedDependencies.
*/
protected final List<TypedDependency> allTypedDependencies;
/**
* Filter over word strings used to reject punctuation. The tree-based constructor stores its
* puncFilter argument here; the list-based constructor stores an accept-everything filter.
*/
protected final Filter<String> puncFilter;
/**
 * Builds a new GrammaticalStructure from a parse tree and labels it with
 * as many grammatical relation arcs as the supplied relations can match.
 *
 * @param t             The parse tree to analyze
 * @param relations     The GrammaticalRelations to try on the phrasal nodes
 * @param relationsLock If non-null, this lock is held while the relations
 *                      are applied to the tree; pass null for no locking
 * @param hf            The HeadFinder used to percolate heads; must not be
 *                      null (an AssertionError is thrown otherwise)
 * @param puncFilter    A Filter over word strings used to reject
 *                      punctuation. To delete punctuation dependencies it
 *                      should return false on punctuation word strings and
 *                      true otherwise. To keep punctuation dependencies,
 *                      pass {@code Filters.<String>acceptFilter()}.
 */
public GrammaticalStructure(Tree t, Collection<GrammaticalRelation> relations,
                            Lock relationsLock, HeadFinder hf, Filter<String> puncFilter) {
  super(t); // makes a Tree with TreeGraphNode nodes

  if (hf == null) {
    throw new AssertionError("Cannot use null HeadFinder");
  }

  // percolate head word and tag up to the phrase nodes
  root.percolateHeads(hf);
  if (root.value() == null) {
    root.setValue("ROOT");  // todo: cdm: it doesn't seem like this line should be here
  }

  this.puncFilter = puncFilter;
  NoPunctFilter untypedPunctFilter = new NoPunctFilter(puncFilter);
  NoPunctTypedDependencyFilter typedPunctFilter = new NoPunctTypedDependencyFilter(puncFilter);

  // untyped head-based dependencies; each dependent gets a GOVERNOR arc to its governor
  dependencies = root.dependencies(untypedPunctFilter, null);
  for (Dependency<Label, Label, Object> headDep : dependencies) {
    TreeGraphNode governor = (TreeGraphNode) headDep.governor();
    TreeGraphNode dependent = (TreeGraphNode) headDep.dependent();
    dependent.addArc(GrammaticalRelation.getAnnotationClass(GOVERNOR), governor);
  }

  // analyze the root (and its descendants, recursively), under the lock if one was given
  final boolean useLock = (relationsLock != null);
  if (useLock) {
    relationsLock.lock();
  }
  try {
    analyzeNode(root, root, relations, hf);
  } finally {
    if (useLock) {
      relationsLock.unlock();
    }
  }

  // typed dependencies: first the basic ones, then the ones including extras
  typedDependencies = getDeps(false, typedPunctFilter);
  allTypedDependencies = getDeps(true, typedPunctFilter);
}
/**
 * Reports a dependency string that does not follow the expected
 * {@code type(arg-idx, arg-idx)} layout. This method never returns normally.
 *
 * @param dep The dependency string that could not be parsed
 * @throws IllegalArgumentException always; the message quotes {@code dep}.
 *         (IllegalArgumentException is a RuntimeException, so callers that
 *         catch RuntimeException are unaffected.)
 */
private static void throwDepFormatException(String dep) {
  throw new IllegalArgumentException(String.format("Dependencies should be for the format 'type(arg-idx, arg-idx)'. Could not parse '%s'", dep));
}
/**
 * Create a grammatical structure from its string representation.
 *
 * Like buildCoNLLXGrammaticalStructure,
 * this method fakes up the parts of the tree structure that are not
 * used by the grammatical relation transformation operations: every token
 * becomes a word leaf under its own POS node, and all POS nodes hang
 * directly off a single ROOT node with index 0.
 *
 * <i>Note:</i> Added by daniel cer
 *
 * @param tokens  The word strings of the sentence, in order
 * @param posTags One POS tag per token; must have the same size as
 *                {@code tokens}
 * @param deps    Dependencies written as {@code type(gov-idx, dep-idx)}.
 *                The index is the text after the last '-' of each argument
 *                (apostrophes are stripped before parsing); it is 1-based
 *                into {@code tokens}, and a governor index of 0 denotes
 *                the ROOT node
 * @return An anonymous GrammaticalStructure holding the parsed dependencies
 * @throws RuntimeException if the token and tag lists differ in size, or
 *         (via throwDepFormatException) if a dependency string is malformed
 */
public static GrammaticalStructure fromStringReps(List<String> tokens, List<String> posTags, List<String> deps) {
  if (tokens.size() != posTags.size()) {
    throw new RuntimeException(String.format(
        "tokens.size(): %d != pos.size(): %d\n", tokens.size(), posTags
            .size()));
  }

  // Build one POS node per token, each dominating a single word leaf.
  List<TreeGraphNode> tgWordNodes = new ArrayList<TreeGraphNode>(tokens.size());
  List<TreeGraphNode> tgPOSNodes = new ArrayList<TreeGraphNode>(tokens.size());
  SemanticHeadFinder headFinder = new SemanticHeadFinder();

  Iterator<String> wordIter = tokens.iterator();
  Iterator<String> posIter = posTags.iterator();
  while (wordIter.hasNext()) {
    TreeGraphNode word = new TreeGraphNode(new Word(wordIter.next()));
    TreeGraphNode pos = new TreeGraphNode(new Word(posIter.next()));
    pos.setChildren(new TreeGraphNode[] {word});
    word.setParent(pos);
    pos.percolateHeads(headFinder);
    tgWordNodes.add(word);
    tgPOSNodes.add(pos);
  }

  TreeGraphNode root = new TreeGraphNode(new StringLabel("ROOT"));
  root.setChildren(tgPOSNodes.toArray(new TreeGraphNode[tgPOSNodes.size()]));
  root.setIndex(0);

  // Build list of TypedDependencies
  List<TypedDependency> tdeps = new ArrayList<TypedDependency>(deps.size());
  for (String depString : deps) {
    // relation name: everything before the first '('
    int openParen = depString.indexOf('(');
    if (openParen == -1) {
      throwDepFormatException(depString);
    }
    String type = depString.substring(0, openParen);

    // argument list: everything between the first '(' and the final ')'
    int lastPos = depString.length() - 1;
    if (depString.charAt(lastPos) != ')') {
      throwDepFormatException(depString);
    }
    String args = depString.substring(openParen + 1, lastPos);

    int argSep = args.indexOf(", ");
    if (argSep == -1) {
      throwDepFormatException(depString);
    }
    String govArg = args.substring(0, argSep);
    String depArg = args.substring(argSep + 2);

    // the index follows the last '-' of each argument
    int govDash = govArg.lastIndexOf('-');
    if (govDash == -1) {
      throwDepFormatException(depString);
    }
    int depDash = depArg.lastIndexOf('-');
    if (depDash == -1) {
      throwDepFormatException(depString);
    }
    int govIdx = Integer.parseInt(govArg.substring(govDash + 1).replace("'", ""));
    int depIdx = Integer.parseInt(depArg.substring(depDash + 1).replace("'", ""));

    GrammaticalRelation grel = new GrammaticalRelation(GrammaticalRelation.Language.Any, type, null, null, DEPENDENT);
    // index 0 is the artificial ROOT; word indices are 1-based
    TreeGraphNode govNode = (govIdx == 0) ? root : tgWordNodes.get(govIdx - 1);
    TreeGraphNode depNode = tgWordNodes.get(depIdx - 1);
    tdeps.add(new TypedDependency(grel, govNode, depNode));
  }

  // TODO add some elegant way to construct language
  // appropriate GrammaticalStructures (e.g., English, Chinese, etc.)
  return new GrammaticalStructure(tdeps, root) {
    private static final long serialVersionUID = 1L;
  };
}
public GrammaticalStructure(List<TypedDependency> projectiveDependencies, TreeGraphNode root) {
super(root);
this.puncFilter = Filters.acceptFilter();
allTypedDependencies = typedDependencies = new ArrayList<TypedDependency>(projectiveDependencies);
dependencies = Generics.newHashSet();
for (TypedDependency tdep : projectiveDependencies) {
dependencies.add(new NamedDependency(tdep.gov().toString(), tdep.dep().toString(), tdep.reln()));
}
}
/**
* Convenience constructor that analyzes the tree without locking: it
* delegates to the five-argument constructor, passing null as the
* relationsLock.
*
* @param t A Tree to analyze
* @param relations A set of GrammaticalRelations to consider
* @param hf A HeadFinder for analysis
* @param puncFilter A Filter over word strings used to reject punctuation
*/
public GrammaticalStructure(Tree t, Collection<GrammaticalRelation> relations,
HeadFinder hf, Filter<String> puncFilter) {
// null lock: the relations are applied without any synchronization
this(t, relations, null, hf, puncFilter);
}
// @Override
// public String toString() {
// StringBuilder sb = new StringBuilder(super.toString());
// sb.append("Dependencies:");
// sb.append("\n" + dependencies);
// sb.append("Typed Dependencies:");
// sb.append("\n" + typedDependencies);
// sb.append("More Typed Dependencies:");
// sb.append("\n" + moreTypedDependencies());
// return sb.toString();
// }
/**
 * Adds grammatical relation arcs for the subtree rooted at t: for every
 * applicable relation, each related node found becomes the target of an
 * arc from the highest node that shares t's head. Recurses into children.
 *
 * cdm dec 2009: I changed this to automatically fail on preterminal nodes,
 * since they shouldn't match for GR parent patterns. Should speed it up.
 *
 * @param t The node currently being analyzed
 * @param root The root of the whole tree
 * @param relations The relations to try on t
 * @param hf The HeadFinder passed on to the relation matcher
 */
private static void analyzeNode(TreeGraphNode t, TreeGraphNode root, Collection<GrammaticalRelation> relations, HeadFinder hf) {
  if (!t.isPhrasal()) {
    return; // don't do leaves or preterminals!
  }

  TreeGraphNode tHigh = t.highestNodeWithSameHead();
  for (GrammaticalRelation egr : relations) {
    if (!egr.isApplicable(t)) {
      continue;
    }
    for (Tree related : egr.getRelatedNodes(t, root, hf)) {
      tHigh.addArc(GrammaticalRelation.getAnnotationClass(egr), (TreeGraphNode) related);
    }
  }

  // now recurse into children
  for (TreeGraphNode kid : t.children()) {
    analyzeNode(kid, root, relations, hf);
  }
}
/**
 * Builds a sorted list of typed dependencies from the untyped
 * dependencies of this <code>GrammaticalStructure</code>, adds a ROOT
 * dependency, and lets the language-specific hooks process the result.
 *
 * @param getExtra If true, the list of typed dependencies will contain extra ones.
 *              If false, the list of typed dependencies will respect the tree structure.
 * @param puncTypedDepFilter Filter that the ROOT dependency and any extra
 *              tree dependencies must pass in order to be added
 * @return The typed dependencies, sorted with {@code Collections.sort}
 */
private List<TypedDependency> getDeps(boolean getExtra, Filter<TypedDependency> puncTypedDepFilter) {
  // one typed dependency per untyped (governor, dependent) pair
  List<TypedDependency> basicDep = Generics.newArrayList();
  for (Dependency<Label, Label, Object> d : dependencies()) {
    TreeGraphNode gov = (TreeGraphNode) d.governor();
    TreeGraphNode dep = (TreeGraphNode) d.dependent();
    basicDep.add(new TypedDependency(getGrammaticalRelation(gov, dep), gov, dep));
  }

  // the artificial ROOT node that will govern the root word
  TreeGraphNode dependencyRoot = new TreeGraphNode(new Word("ROOT"));
  dependencyRoot.setIndex(0);

  // work out which word hangs off ROOT
  TreeGraphNode rootDep;
  Collection<TypedDependency> roots = getRoots(basicDep);
  if (!roots.isEmpty()) {
    // at least one root was found: use the governor of the first one
    rootDep = roots.iterator().next().gov();
  } else {
    // This can happen if the sentence has only one non-punctuation
    // word.  In that case, we still want to add the root->word
    // dependency, but we won't find any roots using the getRoots()
    // method.  Instead we use the HeadFinder and the tree.
    rootDep = root().headWordNode();
    if (rootDep == null) {
      List<Tree> leaves = Trees.leaves(root());
      if (!leaves.isEmpty()) {
        Tree firstLeaf = leaves.get(0);
        if (!(firstLeaf instanceof TreeGraphNode)) {
          throw new AssertionError("Leaves should be TreeGraphNodes");
        }
        rootDep = (TreeGraphNode) firstLeaf;
        if (rootDep.headWordNode() != null) {
          rootDep = rootDep.headWordNode();
        }
      }
    }
  }

  if (rootDep != null) {
    TypedDependency rootTypedDep = new TypedDependency(ROOT, dependencyRoot, rootDep);
    if (puncTypedDepFilter.accept(rootTypedDep)) {
      basicDep.add(rootTypedDep);
    }
  }

  postProcessDependencies(basicDep);

  if (getExtra) {
    getExtras(basicDep);
    // adds stuff to basicDep based on the tregex patterns over the tree
    getTreeDeps(root(), basicDep, puncTypedDepFilter, extraTreeDepFilter());
  }

  Collections.sort(basicDep);
  return basicDep;
}
/**
* Returns a Filter which checks dependencies for usefulness as
* extra tree-based dependencies. By default, everything is
* accepted. One example of how this can be useful is in the
* English dependencies, where the REL dependency is used as an
* intermediate and we do not want this to be added when we make a
* second pass over the trees for missing dependencies.
*
* @return The filter a candidate extra tree dependency must pass; this
* default implementation returns an accept-everything filter
*/
protected Filter<TypedDependency> extraTreeDepFilter() {
return Filters.acceptFilter();
}
/**
* Post process the dependencies in whatever way this language
* requires. For example, English might replace "rel" dependencies
* with either dobj or pobj depending on the surrounding
* dependencies.
* <p/>
* getDeps calls this hook after the ROOT dependency has been added and
* before any extra dependencies are computed. Default is no-op; to be
* over-ridden in subclasses.
*
* @param basicDep The list of dependencies to post process in place
*/
protected void postProcessDependencies(List<TypedDependency> basicDep) {
// no post processing by default
}
/**
* Get extra dependencies that do not depend on the tree structure,
* but rather only depend on the existing dependency structure.
* For example, the English xsubj dependency can be extracted that way.
* <p/>
* Default is no-op; to be over-ridden in subclasses.
*
* @param basicDep The existing dependencies, to which extras may be added
*/
protected void getExtras(List<TypedDependency> basicDep) {
// no extra dependencies by default
}
/**
 * Walks the tree t and appends to basicDep every additional dependency
 * that is not in the list yet and that passes both filters. Each
 * dependency added here is marked as extra.
 *
 * @param t The tree to examine (not changed)
 * @param basicDep The list of dependencies which may be augmented
 * @param puncTypedDepFilter The filter that may skip punctuation dependencies
 * @param extraTreeDepFilter Additional dependencies are added only if they pass this filter
 */
private static void getTreeDeps(TreeGraphNode t, List<TypedDependency> basicDep,
                                Filter<TypedDependency> puncTypedDepFilter,
                                Filter<TypedDependency> extraTreeDepFilter) {
  if (!t.isPhrasal()) {
    return; // don't do leaves or POS tags (chris changed this from numChildren > 0 in 2010)
  }

  Map<Class<? extends GrammaticalRelationAnnotation>, Set<TreeGraphNode>> depMap = getAllDependents(t);
  for (Set<TreeGraphNode> dependents : depMap.values()) {
    for (TreeGraphNode depNode : dependents) {
      TreeGraphNode gov = t.headWordNode();
      TreeGraphNode dep = depNode.headWordNode();
      if (gov == dep) {
        continue; // a word is never its own dependent
      }
      for (GrammaticalRelation rel : getListGrammaticalRelation(t, depNode)) {
        TypedDependency newDep = new TypedDependency(rel, gov, dep);
        if (basicDep.contains(newDep)) {
          continue; // already known
        }
        if (!puncTypedDepFilter.accept(newDep)) {
          continue;
        }
        if (!extraTreeDepFilter.accept(newDep)) {
          continue;
        }
        newDep.setExtra();
        basicDep.add(newDep);
      }
    }
  }

  // now recurse into children
  for (Tree kid : t.children()) {
    getTreeDeps((TreeGraphNode) kid, basicDep, puncTypedDepFilter, extraTreeDepFilter);
  }
}
/**
 * Filter over untyped dependencies that delegates to a word-string
 * filter: a dependency is accepted when the wrapped filter accepts the
 * value of its dependent label. Null dependencies and dependencies with a
 * null dependent are rejected.
 */
private static class NoPunctFilter implements Filter<Dependency<Label, Label, Object>> {

  /** The wrapped word-string filter; set once in the constructor. */
  private final Filter<String> npf;

  /**
   * @param f The word-string filter to delegate to
   */
  NoPunctFilter(Filter<String> f) {
    this.npf = f;
  }

  /**
   * @param d The dependency to test
   * @return false if d or its dependent is null; otherwise whatever the
   *         wrapped filter returns for the dependent's value
   */
  @Override
  public boolean accept(Dependency<Label, Label, Object> d) {
    if (d == null) {
      return false;
    }
    Label lab = d.dependent();
    if (lab == null) {
      return false;
    }
    return npf.accept(lab.value());
  }

  // Automatically generated by Eclipse
  private static final long serialVersionUID = -2319891944796663180L;

} // end static class NoPunctFilter
/**
 * Filter over typed dependencies that delegates to a word-string filter:
 * a typed dependency is accepted when the wrapped filter accepts the
 * value of the label of its dependent node. A null dependency, dependent
 * node or label leads to rejection.
 */
private static class NoPunctTypedDependencyFilter implements Filter<TypedDependency> {

  /** The wrapped word-string filter; set once in the constructor. */
  private final Filter<String> npf;

  /**
   * @param f The word-string filter to delegate to
   */
  NoPunctTypedDependencyFilter(Filter<String> f) {
    this.npf = f;
  }

  /**
   * @param d The typed dependency to test
   * @return false if d, its dependent node or that node's label is null;
   *         otherwise whatever the wrapped filter returns for the label value
   */
  @Override
  public boolean accept(TypedDependency d) {
    if (d == null) {
      return false;
    }
    TreeGraphNode s = d.dep();
    if (s == null) {
      return false;
    }
    Label l = s.label();
    if (l == null) {
      return false;
    }
    return npf.accept(l.value());
  }

  // Automatically generated by Eclipse
  private static final long serialVersionUID = -2872766864289207468L;

} // end static class NoPunctTypedDependencyFilter
/**
* Returns the set of (governor, dependent) dependencies in this
* <code>GrammaticalStructure</code>. The internal set itself is returned,
* not a copy.
*
* @return The set of (governor, dependent) dependencies in this
* <code>GrammaticalStructure</code>.
*/
public Set<Dependency<Label, Label, Object>> dependencies() {
return dependencies;
}
/**
 * Looks up the nodes with the given indices and returns the grammatical
 * relation between them, exactly as computed by
 * {@link #getGrammaticalRelation(TreeGraphNode, TreeGraphNode)}.
 *
 * @param govIndex Index of the candidate governor node
 * @param depIndex Index of the candidate dependent node
 * @return The relation determined by the node-based overload
 */
public GrammaticalRelation getGrammaticalRelation(int govIndex, int depIndex) {
  // governor is looked up first, then the dependent
  return getGrammaticalRelation(getNodeByIndex(govIndex), getNodeByIndex(depIndex));
}
/**
 * Get the GrammaticalRelation between gov and dep. The search starts from
 * the generic DEPENDENT relation and narrows it using the arc labels that
 * lead from the highest node sharing gov's head to the highest node
 * sharing dep's head. If no arc label yields a more specific relation,
 * DEPENDENT itself is returned.
 *
 * @param gov The candidate governor node
 * @param dep The candidate dependent node
 * @return The most specific relation found, or DEPENDENT
 */
public static GrammaticalRelation getGrammaticalRelation(TreeGraphNode gov, TreeGraphNode dep) {
  TreeGraphNode govH = gov.highestNodeWithSameHead();
  TreeGraphNode depH = dep.highestNodeWithSameHead();

  // visit the arc labels in the order imposed by NameComparator (their toString() values)
  Set<Class<? extends GrammaticalRelationAnnotation>> arcLabels = new TreeSet<Class<? extends GrammaticalRelationAnnotation>>(new NameComparator<Class<? extends GrammaticalRelationAnnotation>>());
  arcLabels.addAll(govH.arcLabelsToNode(depH));

  GrammaticalRelation best = GrammaticalRelation.DEPENDENT;
  for (Class<? extends GrammaticalRelationAnnotation> arcLabel : arcLabels) {
    if (arcLabel == null) {
      continue;
    }

    GrammaticalRelation candidate;
    try {
      candidate = GrammaticalRelation.getRelation(arcLabel);
    } catch (Exception e) {
      // labels that cannot be mapped back to a relation are skipped
      continue;
    }

    if (best.isAncestor(candidate)) {
      // the candidate refines what we have so far
      best = candidate;
    } else if (PRINT_DEBUGGING && ! candidate.isAncestor(best)) {
      // neither relation is an ancestor of the other: report the clash
      System.err.println("@@@\t" + best + "\t" + candidate + "\t" +
          govH.label().get(CoreAnnotations.ValueAnnotation.class) + "\t" + depH.label().get(CoreAnnotations.ValueAnnotation.class));
    }
  }

  if (PRINT_DEBUGGING && best.equals(GrammaticalRelation.DEPENDENT)) {
    // nothing more specific than "dep" was found: dump both ends for inspection
    String topCat = govH.label().get(CoreAnnotations.ValueAnnotation.class);
    String topTag = govH.label().get(TreeCoreAnnotations.HeadTagAnnotation.class).value();
    String topWord = govH.label().get(TreeCoreAnnotations.HeadWordAnnotation.class).value();
    String botCat = depH.label().get(CoreAnnotations.ValueAnnotation.class);
    String botTag = depH.label().get(TreeCoreAnnotations.HeadTagAnnotation.class).value();
    String botWord = depH.label().get(TreeCoreAnnotations.HeadWordAnnotation.class).value();
    System.err.println("### dep\t" + topCat + "\t" + topTag + "\t" + topWord +
        "\t" + botCat + "\t" + botTag + "\t" + botWord + "\t");
  }
  return best;
}
/**
 * Get a list of GrammaticalRelation between gov and dep. Useful for getting extra dependencies, in which
 * two nodes can be linked by multiple arcs.
 * A relation is dropped from the result as soon as a relation it is an
 * ancestor of turns up, and a relation is not added when it is itself an
 * ancestor of one already in the result.
 *
 * @param gov The governor node
 * @param dep The dependent node
 * @return The relations linking gov and dep; possibly empty
 */
public static List<GrammaticalRelation> getListGrammaticalRelation(TreeGraphNode gov, TreeGraphNode dep) {
  TreeGraphNode govH = gov.highestNodeWithSameHead();
  TreeGraphNode depH = dep.highestNodeWithSameHead();

  // arcs to the highest node with dep's head, plus arcs to dep itself when that is a different node
  Set<Class<? extends GrammaticalRelationAnnotation>> arcLabels = govH.arcLabelsToNode(depH);
  if (dep != depH) {
    arcLabels.addAll(govH.arcLabelsToNode(dep));
  }

  List<GrammaticalRelation> list = new ArrayList<GrammaticalRelation>();
  for (Class<? extends GrammaticalRelationAnnotation> arcLabel : arcLabels) {
    if (arcLabel == null) {
      continue;
    }
    GrammaticalRelation reln = GrammaticalRelation.getRelation(arcLabel);

    boolean descendantFound = false;
    for (Iterator<GrammaticalRelation> it = list.iterator(); it.hasNext(); ) {
      GrammaticalRelation gr = it.next();
      if (gr.isAncestor(reln)) {
        // the element in the list is an ancestor of the current
        // relation: remove it (the current relation replaces it below)
        it.remove();
      } else if (reln.isAncestor(gr)) {
        // the current relation is an ancestor of an element already in
        // the list, so it must not be added
        descendantFound = true;
      }
    }
    if (!descendantFound) {
      list.add(reln);
    }
  }
  return list;
}
/**
* Returns the typed dependencies of this grammatical structure. These
* are the basic word-level typed dependencies, where each word is dependent
* on one other thing, either a word or the starting ROOT, and the
* dependencies have a tree structure. This corresponds to the
* command-line option "basicDependencies".
* <p/>
* Equivalent to calling {@code typedDependencies(false)}.
*
* @return The typed dependencies of this grammatical structure
*/
public Collection<TypedDependency> typedDependencies() {
return typedDependencies(false);
}
/**
* Returns all the typed dependencies of this grammatical structure.
* These are like the basic (uncollapsed) dependencies, but may include
* extra arcs for control relationships, etc. This corresponds to the
* "nonCollapsed" option.
* <p/>
* Equivalent to calling {@code typedDependencies(true)}.
*
* @return The typed dependencies of this grammatical structure, extras included
*/
public Collection<TypedDependency> allTypedDependencies() {
return typedDependencies(true);
}
/**
 * Returns the typed dependencies of this grammatical structure.  These
 * are non-collapsed dependencies (basic or nonCollapsed). The result is a
 * fresh copy of the stored list, passed through correctDependencies.
 *
 * @param includeExtras If true, the list of typed dependencies
 * returned may include "extras", and does not follow a tree structure.
 * @return The typed dependencies of this grammatical structure
 */
public List<TypedDependency> typedDependencies(boolean includeExtras) {
  List<TypedDependency> stored = includeExtras ? allTypedDependencies : typedDependencies;
  // copy first, so that the language-specific correction never touches the stored list
  List<TypedDependency> copy = new ArrayList<TypedDependency>(stored);
  correctDependencies(copy);
  return copy;
}
/**
* Get the typed dependencies after collapsing them.
* Collapsing dependencies refers to turning certain function words
* such as prepositions and conjunctions into arcs, so they disappear from
* the set of nodes.
* There is no guarantee that the dependencies are a tree. While the
* dependencies are normally tree-like, the collapsing may introduce
* not only re-entrancies but even small cycles.
* <p/>
* Equivalent to calling {@code typedDependenciesCollapsed(false)}, i.e.
* extras are not included.
*
* @return A set of collapsed dependencies
*/
public Collection<TypedDependency> typedDependenciesCollapsed() {
return typedDependenciesCollapsed(false);
}
// todo [cdm 2012]: The semantics of this method is the opposite of the others.
// The other no argument methods correspond to includeExtras being
// true, but for this one it is false. This should probably be made uniform.
/**
 * Get the typed dependencies after mostly collapsing them, but keep a tree
 * structure.  In order to do this, the code does:
 * <ol>
 * <li> no relative clause processing
 * <li> no xsubj relations
 * <li> no propagation of conjuncts
 * </ol>
 * This corresponds to the "tree" option.
 *
 * @return collapsed dependencies keeping a tree structure
 */
public Collection<TypedDependency> typedDependenciesCollapsedTree() {
  // start from the basic (no extras) dependencies and collapse them in place
  List<TypedDependency> treeDeps = typedDependencies(false);
  collapseDependenciesTree(treeDeps);
  return treeDeps;
}
/**
 * Get the typed dependencies after collapsing them.
 * The "collapsed" option corresponds to calling this method with argument
 * {@code true}.
 *
 * @param includeExtras If true, the list of typed dependencies
 * returned may include "extras", like controlling subjects
 * @return collapsed dependencies
 */
public List<TypedDependency> typedDependenciesCollapsed(boolean includeExtras) {
  List<TypedDependency> collapsed = typedDependencies(false);

  // The extras are added to the basic dependencies here rather than by
  // calling typedDependencies with "includeExtras", because the
  // collapseDependencies method may add the extras in a way that makes
  // more logical sense.  For example, the English dependencies, when CC
  // processed, have more nsubjs than they originally do.  If we wait
  // until that occurs to add xsubj for xcomp dependencies, we get better
  // coverage.
  // TODO: this might not be necessary any more
  if (includeExtras) {
    getExtras(collapsed);
    NoPunctTypedDependencyFilter typedPunctFilter = new NoPunctTypedDependencyFilter(puncFilter);
    getTreeDeps(root(), collapsed, typedPunctFilter, extraTreeDepFilter());
  }

  collapseDependencies(collapsed, false, includeExtras);
  return collapsed;
}
/**
 * Get the typed dependencies after collapsing them and processing eventual
 * CC complements.  The effect of this part is to distribute conjoined
 * arguments across relations or conjoined predicates across their arguments.
 * This is generally useful, and we generally recommend using the output of
 * this method with the argument being {@code true}.
 * The "CCPropagated" option corresponds to calling this method with an
 * argument of {@code true}.
 *
 * @param includeExtras If true, the list of typed dependencies
 * returned may include "extras", such as controlled subject links.
 * @return collapsed dependencies with CC processed
 */
public List<TypedDependency> typedDependenciesCCprocessed(boolean includeExtras) {
  List<TypedDependency> ccProcessed = typedDependencies(false);

  // The extras are added to the basic dependencies here rather than by
  // calling typedDependencies with "includeExtras", because the
  // collapseDependencies method may add the extras in a way that makes
  // more logical sense.  For example, the English dependencies, when CC
  // processed, have more nsubjs than they originally do.  If we wait
  // until that occurs to add xsubj for xcomp dependencies, we get better
  // coverage.
  // TODO: this might not be necessary any more
  if (includeExtras) {
    getExtras(ccProcessed);
    NoPunctTypedDependencyFilter typedPunctFilter = new NoPunctTypedDependencyFilter(puncFilter);
    getTreeDeps(root(), ccProcessed, typedPunctFilter, extraTreeDepFilter());
  }

  collapseDependencies(ccProcessed, true, includeExtras);
  return ccProcessed;
}
/**
* Get a list of the typed dependencies, including extras like control
* dependencies, collapsing them and distributing relations across
* coordination. This method is generally recommended for best
* representing the semantic and syntactic relations of a sentence. In
* general it returns a directed graph (i.e., the output may not be a tree
* and it may contain (small) cycles).
* The "CCPropagated" option corresponds to calling this method.
* <p/>
* Equivalent to calling {@code typedDependenciesCCprocessed(true)}.
*
* @return collapsed dependencies with CC processed
*/
public List<TypedDependency> typedDependenciesCCprocessed() {
return typedDependenciesCCprocessed(true);
}
/**
* Destructively modify the {@code Collection<TypedDependency>} to collapse
* language-dependent transitive dependencies.
* <p/>
* Default is no-op; to be over-ridden in subclasses.
*
* @param list A list of dependencies to process for possible collapsing
* @param CCprocess apply CC process?
* @param includeExtras The includeExtras flag of the calling
* typedDependenciesCollapsed / typedDependenciesCCprocessed method,
* passed through unchanged
*/
protected void collapseDependencies(List<TypedDependency> list, boolean CCprocess, boolean includeExtras) {
// do nothing as default operation
}
/**
* Destructively modify the {@code Collection<TypedDependency>} to collapse
* language-dependent transitive dependencies but keeping a tree structure.
* <p/>
* Default is no-op; to be over-ridden in subclasses.
*
* @param list A list of dependencies to process for possible collapsing
*/
protected void collapseDependenciesTree(List<TypedDependency> list) {
// do nothing as default operation
}
/**
* Destructively modify the given typed dependencies to correct
* language-dependent dependencies. (e.g., nsubjpass in a relative clause)
* <p/>
* Default is no-op; to be over-ridden in subclasses.
*
* @param list The typed dependencies to correct in place;
* typedDependencies(boolean) passes in a fresh copy of the stored list
*/
protected void correctDependencies(Collection<TypedDependency> list) {
// do nothing as default operation
}
/**
 * Returns the dependency path as a list of String, from node to root. It
 * is assumed that root is an ancestor of node.
 *
 * @param nodeIndex Index of the node the path starts from
 * @param rootIndex Index of the node the path leads to
 * @return A list of dependency labels
 */
public List<String> getDependencyPath(int nodeIndex, int rootIndex) {
  // resolve both indices (node first, then root) and delegate to the node-based version
  return getDependencyPath(getNodeByIndex(nodeIndex), getNodeByIndex(rootIndex));
}
/**
 * Returns the dependency path as a list of String, from node to root. It
 * is assumed that root is an ancestor of node. Each path element joins
 * the relations between a node and its governor with '+'.
 *
 * @param node Node to return path from
 * @param root The root of the tree, an ancestor of node
 * @return A list of dependency labels
 */
// used only by unused method above.
private static List<String> getDependencyPath(TreeGraphNode node, TreeGraphNode root) {
  List<String> path = new ArrayList<String>();
  TreeGraphNode current = node;
  while (!current.equals(root)) {
    TreeGraphNode gov = current.getGovernor();
    List<GrammaticalRelation> relations = getListGrammaticalRelation(gov, current);

    // all relations on this step, separated by '+'
    StringBuilder step = new StringBuilder();
    for (GrammaticalRelation relation : relations) {
      if (step.length() != 0) {
        step.append("+");
      }
      step.append(relation.toString());
    }
    path.add(step.toString());

    // climb one level
    current = gov;
  }
  return path;
}
/**
 * Returns all the dependencies of a certain node: every key of the
 * node's label that is a GrammaticalRelationAnnotation class, mapped to
 * the value stored under that key.
 *
 * @param node The node to return dependents for
 * @return map of dependencies
 */
private static <GR extends GrammaticalRelationAnnotation> // separating this out helps some compilers
Map<Class<? extends GrammaticalRelationAnnotation>, Set<TreeGraphNode>> getAllDependents(TreeGraphNode node) {
  Map<Class<? extends GrammaticalRelationAnnotation>, Set<TreeGraphNode>> newMap = Generics.newHashMap();

  for (Class<?> key : node.label.keySet()) {
    if (!GrammaticalRelationAnnotation.class.isAssignableFrom(key)) {
      continue; // ignore any non-GrammaticalRelationAnnotation element
    }
    Class<GR> typedKey = ErasureUtils.uncheckedCast(key);
    Set<TreeGraphNode> dependents = node.label.get(typedKey);
    newMap.put(typedKey, dependents);
  }
  return newMap;
}
/**
 * Checks if all the typedDependencies are connected, which is taken to
 * mean that getRoots finds at most one root.
 *
 * @param list a list of typedDependencies
 * @return true if the list represents a connected graph, false otherwise
 */
public static boolean isConnected(Collection<TypedDependency> list) {
  // there should be no more than one root to have a connected graph
  // there might be no root in the way we look when you have a relative clause
  // ex.: Apple is a society that sells computers
  // (the root "society" will also be the nsubj of "sells")
  Collection<TypedDependency> roots = getRoots(list);
  return roots.size() <= 1;
}
/**
 * Return the TypedDependencies whose governor is not the dependent of any
 * dependency in the list. For each such governor only the first
 * dependency it governs (in iteration order) is returned.
 *
 * @param list The list of TypedDependencies to check
 * @return A list of TypedDependencies which are not dependent on any node from the list
 */
public static Collection<TypedDependency> getRoots(Collection<TypedDependency> list) {
  // first pass: collect every node that occurs as a dependent
  Collection<TreeGraphNode> deps = Generics.newHashSet();
  for (TypedDependency typedDep : list) {
    deps.add(typedDep.dep());
  }

  // second pass: keep the first dependency of each governor that is nobody's dependent
  Collection<TypedDependency> roots = new ArrayList<TypedDependency>();
  Collection<TreeGraphNode> govs = Generics.newHashSet();
  for (TypedDependency typedDep : list) {
    TreeGraphNode gov = typedDep.gov();
    boolean isDependent = deps.contains(gov);
    boolean firstSighting = govs.add(gov); // true only if gov had not been seen as a governor yet
    if (!isDependent && firstSighting) {
      roots.add(typedDep);
    }
  }
  return roots;
}
/** Explicit serialization version identifier for this class. */
private static final long serialVersionUID = 2286294455343892678L;
/**
 * Comparator that orders arbitrary objects by the natural String order of
 * their {@code toString()} values.
 *
 * @param <X> The type of the objects being compared
 */
private static class NameComparator<X> implements Comparator<X> {

  /**
   * @return The result of comparing {@code o1.toString()} with
   *         {@code o2.toString()}
   */
  @Override
  public int compare(X o1, X o2) {
    return o1.toString().compareTo(o2.toString());
  }

}
/**
* Hard-coded absolute path of the default parser file; judging by the file
* name this is a serialized English PCFG model.
* NOTE(review): not referenced in the visible part of this file - confirm where it is used.
*/
public static final String DEFAULT_PARSER_FILE = "/u/nlp/data/lexparser/englishPCFG.ser.gz";
/**
* Print typed dependencies in either the Stanford dependency representation
* or in the conllx format. The text is produced by dependenciesToString
* and written to standard output.
*
* @param gs
* Grammatical structure the dependencies belong to; passed on
* unchanged to dependenciesToString
* @param deps
* Typed dependencies to print
* @param tree
* Tree corresponding to typed dependencies (only necessary if conllx
* == true)
* @param conllx
* If true use conllx format, otherwise use Stanford representation
* @param extraSep
* If true, in the Stanford representation, the extra dependencies
* (which do not preserve the tree structure) are printed after the
* basic dependencies
*/
public static void printDependencies(GrammaticalStructure gs, Collection<TypedDependency> deps, Tree tree, boolean conllx, boolean extraSep) {
System.out.println(dependenciesToString(gs, deps, tree, conllx, extraSep));
}
/**
 * Renders typed dependencies as text, either one conllx row per token or
 * one Stanford-style {@code reln(gov-i, dep-j)} line per dependency.
 * <p>
 * Token numbers in the output are the 1-based positions of the leaves of
 * {@code gs}; position 0 is reserved for the special "ROOT" node.
 *
 * @param gs Grammatical structure whose leaves define the token positions
 * @param deps Typed dependencies to render
 * @param tree Tree supplying words and POS tags (only read if conllx is true)
 * @param conllx If true use conllx format, otherwise use Stanford representation
 * @param extraSep If true, in the Stanford representation, extra dependencies
 *          are emitted after the others, behind a "======" separator line
 * @return The rendered dependencies, each line terminated by a newline
 */
public static String dependenciesToString(GrammaticalStructure gs, Collection<TypedDependency> deps, Tree tree, boolean conllx, boolean extraSep) {
  // label index of each leaf of gs -> 1-based position among those leaves
  Map<Integer, Integer> indexToPos = Generics.newHashMap();
  indexToPos.put(0, 0); // to deal with the special node "ROOT"
  int position = 0;
  for (Tree gsLeaf : gs.root.getLeaves()) {
    position++;
    indexToPos.put(((TreeGraphNode) gsLeaf).label.index(), position);
  }

  StringBuilder out = new StringBuilder();
  if (conllx) {
    List<Tree> leaves = tree.getLeaves();
    String[] words = new String[leaves.size()];
    String[] tags = new String[leaves.size()];
    String[] relns = new String[leaves.size()];
    int[] govs = new int[leaves.size()];

    // Fill in word and tag for every tree leaf that also has a position in gs.
    int leafIndex = 0;
    for (Tree leaf : leaves) {
      leafIndex++;
      Integer mapped = indexToPos.get(leafIndex);
      if (mapped == null) {
        continue;
      }
      int slot = mapped - 1;
      words[slot] = leaf.value();
      tags[slot] = leaf.parent(tree).value(); // use slow, but safe, parent look up
    }

    // Record governor position and relation name for each dependent.
    for (TypedDependency dep : deps) {
      int slot = indexToPos.get(dep.dep().index()) - 1;
      govs[slot] = indexToPos.get(dep.gov().index());
      relns[slot] = dep.reln().toString();
    }

    for (int i = 0; i < relns.length; i++) {
      if (words[i] != null) {
        // tokens without a recorded relation are reported as "erased"
        String reln = (relns[i] == null) ? "erased" : relns[i];
        out.append(String.format("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_\n", i + 1, words[i], tags[i], tags[i], govs[i], reln));
      }
    }
  } else {
    // With extraSep, extra dependencies are held back and printed after a
    // separator; otherwise everything is printed in iteration order.
    List<TypedDependency> deferred = new ArrayList<TypedDependency>();
    for (TypedDependency dep : deps) {
      if (extraSep && dep.extra()) {
        deferred.add(dep);
      } else {
        out.append(toStringIndex(dep, indexToPos));
        out.append("\n");
      }
    }
    if (!deferred.isEmpty()) {
      out.append("======\n");
      for (TypedDependency dep : deferred) {
        out.append(toStringIndex(dep, indexToPos));
        out.append("\n");
      }
    }
  }
  return out.toString();
}
/**
 * Formats one dependency as {@code reln(govValue-govPos[primes], depValue-depPos[primes])},
 * where the positions are looked up in {@code indexToPos} by node index.
 */
private static String toStringIndex(TypedDependency td, Map<Integer, Integer> indexToPos) {
  TreeGraphNode governor = td.gov();
  TreeGraphNode dependent = td.dep();
  StringBuilder sb = new StringBuilder();
  sb.append(td.reln()).append("(");
  sb.append(governor.value()).append("-").append(indexToPos.get(governor.index())).append(governor.toPrimes());
  sb.append(", ");
  sb.append(dependent.value()).append("-").append(indexToPos.get(dependent.index())).append(dependent.toPrimes());
  sb.append(")");
  return sb.toString();
}
// Positions of the fields read from a tab-separated CoNLL-X token line.
// Note that these field constants are 0-based whereas much documentation is 1-based
/** Field holding the word of the token. */
public static final int CoNLLX_WordField = 1;
/** Field holding the POS tag of the token. */
public static final int CoNLLX_POSField = 3;
/** Field holding the id of the token's governor. */
public static final int CoNLLX_GovField = 6;
/** Field holding the name of the grammatical relation to the governor. */
public static final int CoNLLX_RelnField = 7;
/** Number of fields every token line is required to have. */
public static final int CoNLLX_FieldCount = 10;
/**
 * Read in a file containing a CoNLL-X dependency treebank and return a
 * corresponding list of GrammaticalStructures.
 * <p>
 * Every non-empty line must consist of exactly {@link #CoNLLX_FieldCount}
 * tab-separated fields and describes one token; sentences are separated by
 * empty lines. A final sentence that is not followed by an empty line is
 * returned as well. The file is opened with a {@link FileReader} and is
 * always closed before this method returns.
 *
 * @param fileName Path of the CoNLL-X file to read
 * @param shortNameToGRel Relation lookup passed on to
 *          {@link #buildCoNLLXGrammaticalStructure}
 * @param factory Factory passed on to {@link #buildCoNLLXGrammaticalStructure}
 * @return One GrammaticalStructure per sentence, in file order
 * @throws IOException If the file cannot be opened or read
 * @throws RuntimeException If a non-empty line does not have the expected
 *           number of fields
 */
public static List<GrammaticalStructure> readCoNLLXGrammaticalStructureCollection(String fileName, Map<String, GrammaticalRelation> shortNameToGRel, GrammaticalStructureFromDependenciesFactory factory) throws IOException {
  LineNumberReader reader = new LineNumberReader(new FileReader(fileName));
  try {
    List<GrammaticalStructure> gsList = new LinkedList<GrammaticalStructure>();
    List<List<String>> tokenFields = new ArrayList<List<String>>();

    for (String inline = reader.readLine(); inline != null;
         inline = reader.readLine()) {
      if (!"".equals(inline)) {
        // read in a single sentence token by token
        List<String> fields = Arrays.asList(inline.split("\t"));
        if (fields.size() != CoNLLX_FieldCount) {
          throw new RuntimeException(String.format("Error (line %d): 10 fields expected but %d are present", reader.getLineNumber(), fields.size()));
        }
        tokenFields.add(fields);
      } else {
        if (tokenFields.isEmpty()) {
          continue; // skip excess empty lines
        }
        gsList.add(buildCoNLLXGrammaticalStructure(tokenFields, shortNameToGRel, factory));
        tokenFields = new ArrayList<List<String>>();
      }
    }

    // The file may end without a trailing empty line; do not lose the last sentence.
    if (!tokenFields.isEmpty()) {
      gsList.add(buildCoNLLXGrammaticalStructure(tokenFields, shortNameToGRel, factory));
    }
    return gsList;
  } finally {
    // Release the file handle on both the normal and the exceptional path.
    reader.close();
  }
}
/**
 * Builds a GrammaticalStructure for one sentence given as CoNLL-X token rows.
 * <p>
 * A flat tree is faked up: one POS node per token with the word as its only
 * child, all POS nodes hanging off a single root node, e.g.
 * {@code (ROOT (PRP I) (VBD hit) (DT the) (NN ball) (. .))}. The dependencies
 * are then read from the governor and relation fields of each row. Rows with
 * an empty governor field, or with relation "null" or "erased", contribute no
 * dependency. A governor id outside the sentence is reported on
 * {@code System.err} and replaced by the root node.
 *
 * @param tokenFields The fields of each token row of the sentence, in order
 * @param shortNameToGRel Maps lowercased relation names to grammatical relations
 * @param factory Builds the resulting structure from the dependencies and the root
 * @return The structure returned by {@code factory.build}
 * @throws NumberFormatException If a non-empty governor field is not an integer
 * @throws RuntimeException If a relation name other than "root" is not found
 *           in {@code shortNameToGRel}
 */
public static GrammaticalStructure
buildCoNLLXGrammaticalStructure(List<List<String>> tokenFields,
                                Map<String, GrammaticalRelation> shortNameToGRel,
                                GrammaticalStructureFromDependenciesFactory factory) {
  List<TreeGraphNode> tgWordNodes =
    new ArrayList<TreeGraphNode>(tokenFields.size());
  List<TreeGraphNode> tgPOSNodes =
    new ArrayList<TreeGraphNode>(tokenFields.size());

  SemanticHeadFinder headFinder = new SemanticHeadFinder();

  // Construct TreeGraphNodes for words and POS tags
  for (List<String> fields : tokenFields) {
    TreeGraphNode word =
      new TreeGraphNode(new Word(fields.get(CoNLLX_WordField)));
    TreeGraphNode pos =
      new TreeGraphNode(new Word(fields.get(CoNLLX_POSField)));
    tgWordNodes.add(word);
    tgPOSNodes.add(pos);
    TreeGraphNode[] childArr = { word };
    pos.setChildren(childArr);
    word.setParent(pos);
    pos.percolateHeads(headFinder);
  }

  // We fake up the parts of the tree structure that are not
  // actually used by the grammatical relation transformation
  // operations.
  //
  // That is, the constructed TreeGraphs consist of a flat tree,
  // without any phrase bracketing, but that does preserve the
  // parent child relationship between words and their POS tags.
  //
  // e.g. (ROOT (PRP I) (VBD hit) (DT the) (NN ball) (. .))
  TreeGraphNode root =
    new TreeGraphNode(new Word("ROOT-" + (tgWordNodes.size() + 1)));
  root.setChildren(tgPOSNodes.toArray(new TreeGraphNode[tgPOSNodes.size()]));

  // Build list of TypedDependencies
  List<TypedDependency> tdeps =
    new ArrayList<TypedDependency>(tgWordNodes.size());

  // Create a node outside the tree useful for root dependencies;
  // we want to keep those if they were stored in the conll file
  TreeGraphNode dependencyRoot = new TreeGraphNode(new Word("ROOT"));
  dependencyRoot.setIndex(0);

  for (int i = 0; i < tgWordNodes.size(); i++) {
    String parentIdStr = tokenFields.get(i).get(CoNLLX_GovField);
    if (parentIdStr == null || parentIdStr.equals("")) {
      continue;
    }
    // governor ids in the file are 1-based, with 0 denoting the root
    int parentId = Integer.parseInt(parentIdStr) - 1;
    String grelString = tokenFields.get(i).get(CoNLLX_RelnField);
    if (grelString.equals("null") || grelString.equals("erased")) {
      continue;
    }
    // Locale-independent lowercasing, so that the lookup does not depend on
    // the default locale (e.g. the Turkish dotless i).
    String grelKey = grelString.toLowerCase(Locale.ROOT);
    GrammaticalRelation grel = shortNameToGRel.get(grelKey);
    TypedDependency tdep;
    if (grel == null) {
      if (grelKey.equals("root")) {
        tdep = new TypedDependency(ROOT, dependencyRoot, tgWordNodes.get(i));
      } else {
        throw new RuntimeException("Unknown grammatical relation '" +
                                   grelString + "' fields: " +
                                   tokenFields.get(i) + "\nNode: " +
                                   tgWordNodes.get(i) + "\n" +
                                   "Known Grammatical relations: ["+shortNameToGRel.keySet()+"]" );
      }
    } else {
      // Ids below 0 are as invalid as ids beyond the sentence length.
      if (parentId < -1 || parentId >= tgWordNodes.size()) {
        System.err.printf("Warning: Invalid Parent Id %d Sentence Length: %d%n", parentId+1, tgWordNodes.size());
        System.err.printf(" Assigning to root (0)%n");
        parentId = -1;
      }
      tdep = new TypedDependency(grel, (parentId == -1 ? root : tgWordNodes.get(parentId)),
                                 tgWordNodes.get(i));
    }
    tdeps.add(tdep);
  }
  return factory.build(tdeps, root);
}
/**
 * Splits a specification of the form {@code Name(arg1,arg2,...)} into the
 * name and its comma-separated arguments.
 * <p>
 * A specification without a trailing parenthesized argument list is returned
 * unchanged as the only element. An empty argument list, as in
 * {@code Name()}, yields no arguments. Arguments are not trimmed.
 *
 * @param namePlusArgs The name, optionally followed by {@code (args)}
 * @return An array whose first element is the name and whose remaining
 *         elements are the arguments
 */
private static String[] parseClassConstructArgs(String namePlusArgs) {
  String[] args = {};
  String name = namePlusArgs;
  if (namePlusArgs.matches(".*\\([^)]*\\)$")) {
    String argStr = namePlusArgs.replaceFirst("^.*\\(([^)]*)\\)$", "$1");
    // "".split(",") would give one empty-string argument; "Name()" means no arguments.
    if (!argStr.isEmpty()) {
      args = argStr.split(",");
    }
    name = namePlusArgs.replaceFirst("\\([^)]*\\)$", "");
  }
  String[] tokens = new String[1 + args.length];
  tokens[0] = name;
  System.arraycopy(args, 0, tokens, 1, args.length);
  return tokens;
}
/**
 * Instantiates a DependencyReader by reflection from a specification of the
 * form {@code ClassName} or {@code ClassName(arg1,arg2,...)}.
 * <p>
 * The class is looked up first under the given name and then inside the
 * {@code edu.stanford.nlp.trees} package. Without arguments the no-argument
 * constructor is used, otherwise the constructor taking a {@code String[]}.
 *
 * @param altDepReaderName Class name, optionally followed by constructor arguments
 * @return The reader, or null (after a message on {@code System.err}) if the
 *         class or a usable constructor could not be found
 * @throws RuntimeException Wrapping instantiation failures that are not
 *           reported by returning null
 */
private static DependencyReader loadAlternateDependencyReader(String altDepReaderName) {
  String[] parsed = parseClassConstructArgs(altDepReaderName);
  String className = parsed[0];
  String[] ctorArgs = Arrays.copyOfRange(parsed, 1, parsed.length);

  // Try the name as given, then have a second go inside this package.
  Class<? extends DependencyReader> readerClass = null;
  String[] candidates = { className, "edu.stanford.nlp.trees." + className };
  for (String candidate : candidates) {
    try {
      readerClass = Class.forName(candidate).asSubclass(DependencyReader.class);
      break;
    } catch (ClassNotFoundException e) {
      // fall through to the next candidate
    }
  }
  if (readerClass == null) {
    System.err.println("Can't load dependency reader " + className + " or edu.stanford.nlp.trees." + className);
    return null;
  }

  if (ctorArgs.length == 0) {
    try {
      return readerClass.newInstance();
    } catch (InstantiationException e) {
      throw new RuntimeException(e);
    } catch (IllegalAccessException e) {
      System.err.println("No argument constructor to " + className + " is not public");
      return null;
    }
  }

  try {
    // the whole array is passed as the single String[] constructor argument
    return readerClass.getConstructor(String[].class).newInstance((Object) ctorArgs);
  } catch (IllegalArgumentException e) {
    throw new RuntimeException(e);
  } catch (SecurityException e) {
    throw new RuntimeException(e);
  } catch (InstantiationException e) {
    e.printStackTrace();
    return null;
  } catch (IllegalAccessException e) {
    System.err.println(ctorArgs.length + " argument constructor to " + className + " is not public.");
    return null;
  } catch (InvocationTargetException e) {
    throw new RuntimeException(e);
  } catch (NoSuchMethodException e) {
    System.err.println("String arguments constructor to " + className + " does not exist.");
    return null;
  }
}
/**
 * Instantiates a DependencyPrinter by reflection from a specification of the
 * form {@code ClassName} or {@code ClassName(arg1,arg2,...)}.
 * <p>
 * The class is looked up first under the given name and then inside the
 * {@code edu.stanford.nlp.trees} package. Without arguments the no-argument
 * constructor is used, otherwise the constructor taking a {@code String[]}.
 *
 * @param altDepPrinterName Class name, optionally followed by constructor arguments
 * @return The printer, or null if the class could not be loaded or
 *         instantiated; the cause is reported on {@code System.err}
 */
private static DependencyPrinter loadAlternateDependencyPrinter(String altDepPrinterName) {
  Class<? extends DependencyPrinter> altDepPrinterClass = null;
  String[] toks = parseClassConstructArgs(altDepPrinterName);
  altDepPrinterName = toks[0];
  String[] depPrintArgs = new String[toks.length - 1];
  System.arraycopy(toks, 1, depPrintArgs, 0, toks.length - 1);

  try {
    Class<?> cl = Class.forName(altDepPrinterName);
    altDepPrinterClass = cl.asSubclass(DependencyPrinter.class);
  } catch (ClassNotFoundException e) {
    // have a second go below, inside the edu.stanford.nlp.trees package
  }
  if (altDepPrinterClass == null) {
    try {
      Class<?> cl = Class.forName("edu.stanford.nlp.trees." + altDepPrinterName);
      altDepPrinterClass = cl.asSubclass(DependencyPrinter.class);
    } catch (ClassNotFoundException e) {
      // reported just below, once both lookups have failed
    }
  }
  if (altDepPrinterClass == null) {
    System.err.printf("Unable to load alternative printer %s or %s. Is your classpath set correctly?\n", altDepPrinterName, "edu.stanford.nlp.trees." + altDepPrinterName);
    return null;
  }

  try {
    if (depPrintArgs.length == 0) {
      return altDepPrinterClass.newInstance();
    }
    // the whole array is passed as the single String[] constructor argument
    return altDepPrinterClass.getConstructor(String[].class).newInstance((Object) depPrintArgs);
  } catch (IllegalArgumentException e) {
    e.printStackTrace();
    return null;
  } catch (SecurityException e) {
    e.printStackTrace();
    return null;
  } catch (InstantiationException e) {
    e.printStackTrace();
    return null;
  } catch (IllegalAccessException e) {
    e.printStackTrace();
    return null;
  } catch (InvocationTargetException e) {
    e.printStackTrace();
    return null;
  } catch (NoSuchMethodException e) {
    // Only getConstructor(String[].class) declares this exception, so it can
    // arise solely on the path where constructor arguments were given.
    System.err.printf("Can't find constructor %s(%s).\n", altDepPrinterName, Arrays.toString(depPrintArgs));
    return null;
  }
}
/**
 * Loads a parser through reflection, so that this class does not require
 * the parser for runtime use. (For example, the tregex package uses
 * TreePrint, which uses GrammaticalStructure, which would then import the
 * LexicalizedParser. The tagger can read trees, which means it would depend
 * on tregex and therefore depend on the parser.)
 *
 * @param parserFile Model to load; if null or empty, {@link #DEFAULT_PARSER_FILE}
 *          is used
 * @param parserOptions Space-separated parser options; if null, the default
 *          model gets "-retainTmpSubcategories" and any other model gets none
 * @param makeCopulaHead If true, "-makeCopulaHead" is put in front of the options
 * @return The object returned by the static
 *         {@code LexicalizedParser.loadModel(String, String[])} method
 * @throws RuntimeException Wrapping any failure to find or invoke that method
 */
private static Function<List<? extends HasWord>, Tree> loadParser(String parserFile, String parserOptions, boolean makeCopulaHead) {
  boolean useDefaultModel = (parserFile == null || "".equals(parserFile));
  String modelPath = useDefaultModel ? DEFAULT_PARSER_FILE : parserFile;

  String options = parserOptions;
  if (options == null) {
    options = useDefaultModel ? "-retainTmpSubcategories" : "";
  }
  if (makeCopulaHead) {
    options = "-makeCopulaHead " + options;
  }
  options = options.trim();

  try {
    Class<?> parserClass = Class.forName("edu.stanford.nlp.parser.lexparser.LexicalizedParser");
    Method loader = parserClass.getMethod("loadModel", String.class, String[].class);
    String[] optionArray = (options.length() > 0) ? options.split(" +") : new String[0];
    return (Function<List<? extends HasWord>, Tree>) loader.invoke(null, modelPath, optionArray);
  } catch (Exception cnfe) {
    throw new RuntimeException(cnfe);
  }
}
/**
 * Lets a collection of trees, that is a Treebank, be iterated over as if it
 * were a collection of GrammaticalStructures. Each structure is built on
 * demand, and the tree it was built from can be looked up afterwards with
 * {@link #getOriginalTree}.
 *
 * @author danielcer
 *
 */
private static class TreeBankGrammaticalStructureWrapper implements Iterable<GrammaticalStructure> {

  private final Iterable<Tree> trees;
  private final boolean keepPunct;
  private final TreebankLangParserParams params;

  // structure -> tree it was built from; a WeakHashMap, so keys are weakly held
  private final Map<GrammaticalStructure, Tree> origTrees = new WeakHashMap<GrammaticalStructure, Tree>();

  public TreeBankGrammaticalStructureWrapper(Iterable<Tree> wrappedTrees, boolean keepPunct, TreebankLangParserParams params) {
    this.trees = wrappedTrees;
    this.keepPunct = keepPunct;
    this.params = params;
  }

  @Override
  public Iterator<GrammaticalStructure> iterator() {
    return new GsIterator();
  }

  /** Returns the tree recorded for {@code gs} by an iterator, or null if there is none. */
  public Tree getOriginalTree(GrammaticalStructure gs) {
    return origTrees.get(gs);
  }

  /** Iterator that always holds the next structure ready, one step ahead of the caller. */
  private class GsIterator implements Iterator<GrammaticalStructure> {

    private final Iterator<Tree> tbIterator = trees.iterator();
    private final Filter<String> puncFilter;
    private final HeadFinder hf;
    private GrammaticalStructure next;

    public GsIterator() {
      // TODO: this is very english specific
      if (!keepPunct) {
        puncFilter = new PennTreebankLanguagePack().punctuationWordRejectFilter();
      } else {
        puncFilter = Filters.acceptFilter();
      }
      hf = params.typedDependencyHeadFinder();
      primeGs();
    }

    /**
     * Advances to the next tree for which a structure can be built and
     * stores that structure in {@code next}; stores null when the trees
     * are used up. Null trees are skipped, and so are trees whose
     * conversion throws a NullPointerException (after a dump on stderr).
     */
    private void primeGs() {
      GrammaticalStructure built = null;
      while (built == null && tbIterator.hasNext()) {
        Tree candidate = tbIterator.next();
        if (candidate != null) {
          try {
            built = params.getGrammaticalStructure(candidate, puncFilter, hf);
            origTrees.put(built, candidate);
            next = built;
            return;
          } catch (NullPointerException npe) {
            System.err.println("Bung (empty?) tree caused below dump. Continuing....");
            npe.printStackTrace();
          }
        }
      }
      next = null;
    }

    @Override
    public boolean hasNext() {
      return next != null;
    }

    @Override
    public GrammaticalStructure next() {
      if (next == null) {
        throw new NoSuchElementException();
      }
      GrammaticalStructure current = next;
      primeGs();
      return current;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

  }
} // end static class TreebankGrammaticalStructureWrapper
/**
 * Given sentences or trees, output the typed dependencies.
 * <p>
 * By default, the method outputs the collapsed typed dependencies with
 * processing of conjuncts. The input can be given as plain text (one sentence
 * per line) using the option -sentFile, or as trees using the option
 * -treeFile. For -sentFile, the input has to be strictly one sentence per
 * line. You can specify where to find a parser with -parserFile
 * serializedParserPath. See LexicalizedParser for more flexible processing of
 * text files (including with Stanford Dependencies output). The above options
 * assume a file as input. You can also feed trees (only) via stdin by using
 * the option -filter. If one does not specify a -parserFile, one
 * can specify which language pack to use with -tLPP. This option
 * specifies a class which determines which GrammaticalStructure to
 * use, which HeadFinder to use, etc. It will default to
 * edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams,
 * but any TreebankLangParserParams can be specified.
 * <p>
 * If no method of producing trees is given other than to use the
 * LexicalizedParser, but no parser is specified, a default parser
 * is used, the English parser. You can specify options to load
 * with the parser using the -parserOpts flag. If the default
 * parser is used, and no options are provided, the option
 * -retainTmpSubcategories is used.
 * <p>
 * The following options can be used to specify the types of dependencies
 * wanted: </p>
 * <ul>
 * <li> -collapsed collapsed dependencies
 * <li> -basic non-collapsed dependencies that preserve a tree structure
 * <li> -nonCollapsed non-collapsed dependencies that do not preserve a tree
 * structure (the basic dependencies plus the extra ones)
 * <li> -CCprocessed
 * collapsed dependencies and conjunctions processed (dependencies are added
 * for each conjunct) -- this is the default if no options are passed
 * <li> -collapsedTree collapsed dependencies retaining a tree structure
 * <li> -makeCopulaHead Contrary to the approach argued for in the SD papers,
 * nevertheless make the verb 'to be' the head, not the predicate noun, adjective,
 * etc. (However, when the verb 'to be' is used as an auxiliary verb, the main
 * verb is still treated as the head.)
 * </ul>
 * <p>
 * The {@code -conllx} option will output the dependencies in the CoNLL format,
 * instead of in the standard Stanford format (relation(governor,dependent))
 * and will retain punctuation by default.
 * When used in the "collapsed" format, words such as prepositions, conjunctions
 * which get collapsed into the grammatical relations and are not part of the
 * sentence per se anymore will be annotated with "erased" as grammatical relation
 * and attached to the fake "ROOT" node with index 0.
 * </p><p>
 * There is also an option to retain dependencies involving punctuation:
 * {@code -keepPunct}
 * </p><p>
 * The {@code -extraSep} option used with -nonCollapsed will print the basic
 * dependencies first, then a separator ======, and then the extra
 * dependencies that do not preserve the tree structure. The -test option is
 * used for debugging: it prints the grammatical structure, as well as the
 * basic, collapsed and CCprocessed dependencies. It also checks the
 * connectivity of the collapsed dependencies. If the collapsed dependencies
 * list doesn't constitute a connected graph, it prints the possible offending
 * nodes (one of them is the real root of the graph).
 * </p><p>
 * Using the -conllxFile, you can pass a file containing Stanford dependencies
 * in the CoNLL format (e.g., the basic dependencies), and obtain another
 * representation using one of the representation options.
 * </p><p>
 * With -altreader and -altreaderfile, the dependencies are read from the
 * given file by an alternative DependencyReader class; if that reader cannot
 * be loaded, the program stops after the error has been reported.
 * </p><p>
 * Usage: <br>
 * <code>java edu.stanford.nlp.trees.GrammaticalStructure [-treeFile FILE | -sentFile FILE | -conllxFile FILE | -filter] <br>
 * [-collapsed -basic -CCprocessed -test]</code>
 *
 * @param args Command-line arguments, as above
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) {
  MemoryTreebank tb = new MemoryTreebank(new TreeNormalizer());
  Iterable<Tree> trees = tb;

  Iterable<GrammaticalStructure> gsBank = null;
  Properties props = StringUtils.argsToProperties(args);

  String encoding = props.getProperty("encoding", "utf-8");
  try {
    System.setOut(new PrintStream(System.out, true, encoding));
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  String treeFileName = props.getProperty("treeFile");
  String sentFileName = props.getProperty("sentFile");
  String conllXFileName = props.getProperty("conllxFile");
  String altDepPrinterName = props.getProperty("altprinter");
  String altDepReaderName = props.getProperty("altreader");
  String altDepReaderFilename = props.getProperty("altreaderfile");

  String filter = props.getProperty("filter");

  boolean makeCopulaHead = props.getProperty("makeCopulaHead") != null;

  // TODO: if a parser is specified, load this from the parser
  // instead of ever loading it from this way
  String tLPP = props.getProperty("tLPP", "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams");
  TreebankLangParserParams params = ReflectionLoading.loadByReflection(tLPP);
  if (makeCopulaHead) {
    // TODO: generalize and allow for more options
    String[] options = { "-makeCopulaHead" };
    params.setOptionFlag(options, 0);
  }

  if (sentFileName == null && (altDepReaderName == null || altDepReaderFilename == null) && treeFileName == null && conllXFileName == null && filter == null) {
    // No input given: print the usage and carry on with a small sample tree.
    try {
      System.err.println("Usage: java GrammaticalStructure [options]* [-sentFile|-treeFile|-conllxFile file] [-testGraph]");
      System.err.println(" options: -basic, -collapsed, -CCprocessed [the default], -collapsedTree, -parseTree, -test, -parserFile file, -conllx, -keepPunct, -altprinter -altreader -altreaderfile");
      TreeReader tr = new PennTreeReader(new StringReader("((S (NP (NNP Sam)) (VP (VBD died) (NP-TMP (NN today)))))"));
      tb.add(tr.readTree());
    } catch (Exception e) {
      System.err.println("Horrible error: " + e);
      e.printStackTrace();
    }
  } else if (altDepReaderName != null && altDepReaderFilename != null) {
    DependencyReader altDepReader = loadAlternateDependencyReader(altDepReaderName);
    if (altDepReader == null) {
      // loadAlternateDependencyReader has already reported the reason on stderr
      return;
    }
    try {
      gsBank = altDepReader.readDependencies(altDepReaderFilename);
    } catch (IOException e) {
      System.err.println("Error reading " + altDepReaderFilename);
      return;
    }
  } else if (treeFileName != null) {
    tb.loadPath(treeFileName);
  } else if (filter != null) {
    tb.load(new BufferedReader(new InputStreamReader(System.in)));
  } else if (conllXFileName != null) {
    try {
      gsBank = params.readGrammaticalStructureFromFile(conllXFileName);
    } catch (RuntimeIOException e) {
      System.err.println("Error reading " + conllXFileName);
      return;
    }
  } else {
    // Only -sentFile is left: parse the sentences lazily.
    String parserFile = props.getProperty("parserFile");
    String parserOpts = props.getProperty("parserOpts");
    boolean tokenized = props.getProperty("tokenized") != null;
    Function<List<? extends HasWord>, Tree> lp = loadParser(parserFile, parserOpts, makeCopulaHead);
    trees = new LazyLoadTreesByParsing(sentFileName, encoding, tokenized, lp);

    // Instead of getting this directly from the LP, use reflection
    // so that a package which uses GrammaticalStructure doesn't
    // necessarily have to use LexicalizedParser
    try {
      Method method = lp.getClass().getMethod("getTLPParams");
      params = (TreebankLangParserParams) method.invoke(lp);
    } catch (Exception cnfe) {
      throw new RuntimeException(cnfe);
    }
  }

  // treats the output according to the options passed
  boolean basic = props.getProperty("basic") != null;
  boolean collapsed = props.getProperty("collapsed") != null;
  boolean CCprocessed = props.getProperty("CCprocessed") != null;
  boolean collapsedTree = props.getProperty("collapsedTree") != null;
  boolean nonCollapsed = props.getProperty("nonCollapsed") != null;
  boolean extraSep = props.getProperty("extraSep") != null;
  boolean parseTree = props.getProperty("parseTree") != null;
  boolean test = props.getProperty("test") != null;
  boolean keepPunct = props.getProperty("keepPunct") != null;
  boolean conllx = props.getProperty("conllx") != null;
  // todo: Support checkConnected on more options (including basic)
  boolean checkConnected = props.getProperty("checkConnected") != null;
  boolean portray = props.getProperty("portray") != null;

  // enforce keepPunct if conllx is turned on
  if (conllx) {
    keepPunct = true;
  }

  // If requested load alternative printer
  DependencyPrinter altDepPrinter = null;
  if (altDepPrinterName != null) {
    altDepPrinter = loadAlternateDependencyPrinter(altDepPrinterName);
  }

  Method m = null;
  if (test) {
    // see if we can use SemanticGraph(Factory) to check for being a DAG
    // Do this by reflection to avoid this becoming a dependency when we distribute the parser
    try {
      Class<?> sgf = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphFactory");
      m = sgf.getDeclaredMethod("makeFromTree", GrammaticalStructure.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, Filter.class, String.class, int.class);
    } catch (Exception e) {
      System.err.println("Test cannot check for cycles in tree format (classes not available)");
    }
  }

  if (gsBank == null) {
    gsBank = new TreeBankGrammaticalStructureWrapper(trees, keepPunct, params);
  }

  for (GrammaticalStructure gs : gsBank) {

    Tree tree;
    if (gsBank instanceof TreeBankGrammaticalStructureWrapper) {
      tree = ((TreeBankGrammaticalStructureWrapper) gsBank).getOriginalTree(gs);
    } else {
      tree = gs.root(); // recover tree
    }

    if (test) {// print the grammatical structure, the basic, collapsed and
      // CCprocessed

      System.out.println("============= parse tree =======================");
      tree.pennPrint();
      System.out.println();

      System.out.println("------------- GrammaticalStructure -------------");
      System.out.println(gs);

      System.out.println("------------- basic dependencies ---------------");
      System.out.println(StringUtils.join(gs.typedDependencies(false), "\n"));

      System.out.println("------------- non-collapsed dependencies (basic + extra) ---------------");
      System.out.println(StringUtils.join(gs.typedDependencies(true), "\n"));

      System.out.println("------------- collapsed dependencies -----------");
      System.out.println(StringUtils.join(gs.typedDependenciesCollapsed(true), "\n"));

      System.out.println("------------- collapsed dependencies tree -----------");
      System.out.println(StringUtils.join(gs.typedDependenciesCollapsedTree(), "\n"));

      System.out.println("------------- CCprocessed dependencies --------");
      System.out.println(StringUtils.join(gs.typedDependenciesCCprocessed(true), "\n"));

      System.out.println("-----------------------------------------------");
      // connectivity test
      boolean connected = GrammaticalStructure.isConnected(gs.typedDependenciesCollapsed(true));
      System.out.println("collapsed dependencies form a connected graph: " + connected);
      if (!connected) {
        System.out.println("possible offending nodes: " + GrammaticalStructure.getRoots(gs.typedDependenciesCollapsed(true)));
      }

      // test for collapsed dependencies being a tree:
      // make sure at least it doesn't contain cycles (i.e., is a DAG)
      // Do this by reflection so parser doesn't need SemanticGraph and its
      // libraries
      if (m != null) {
        try {
          // the first arg is null because it's a static method....
          Object semGraph = m.invoke(null, gs, false, true, false, false, false, false, null, null, 0);
          Class<?> sg = Class.forName("edu.stanford.nlp.semgraph.SemanticGraph");
          Method mDag = sg.getDeclaredMethod("isDag");
          boolean isDag = (Boolean) mDag.invoke(semGraph);

          System.out.println("tree dependencies form a DAG: " + isDag);
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }// end of "test" output

    else {
      if (parseTree) {
        System.out.println("============= parse tree =======================");
        tree.pennPrint();
        System.out.println();
      }

      // A header line is printed for a representation only when at least
      // one other representation was requested as well.
      if (basic) {
        if (collapsed || CCprocessed || collapsedTree || nonCollapsed) {
          System.out.println("------------- basic dependencies ---------------");
        }
        if (altDepPrinter == null) {
          printDependencies(gs, gs.typedDependencies(false), tree, conllx, false);
        } else {
          System.out.println(altDepPrinter.dependenciesToString(gs, gs.typedDependencies(false), tree));
        }
      }

      if (nonCollapsed) {
        if (basic || CCprocessed || collapsed || collapsedTree) {
          System.out.println("----------- non-collapsed dependencies (basic + extra) -----------");
        }
        printDependencies(gs, gs.allTypedDependencies(), tree, conllx, extraSep);
      }

      if (collapsed) {
        if (basic || CCprocessed || collapsedTree || nonCollapsed) {
          System.out.println("----------- collapsed dependencies -----------");
        }
        printDependencies(gs, gs.typedDependenciesCollapsed(true), tree, conllx, false);
      }

      if (CCprocessed) {
        if (basic || collapsed || collapsedTree || nonCollapsed) {
          System.out.println("---------- CCprocessed dependencies ----------");
        }
        List<TypedDependency> deps = gs.typedDependenciesCCprocessed(true);
        if (checkConnected) {
          if (!GrammaticalStructure.isConnected(deps)) {
            System.err.println("Graph is not connected for:");
            System.err.println(tree);
            System.err.println("possible offending nodes: " + GrammaticalStructure.getRoots(deps));
          }
        }
        printDependencies(gs, deps, tree, conllx, false);
      }

      if (collapsedTree) {
        if (basic || CCprocessed || collapsed || nonCollapsed) {
          System.out.println("----------- collapsed dependencies tree -----------");
        }
        printDependencies(gs, gs.typedDependenciesCollapsedTree(), tree, conllx, false);
      }

      // default use: CCprocessed (to parallel what happens within the parser)
      if (!basic && !collapsed && !CCprocessed && !collapsedTree && !nonCollapsed) {
        printDependencies(gs, gs.typedDependenciesCCprocessed(true), tree, conllx, false);
      }
    }

    if (portray) {
      try {
        // put up a window showing it
        Class<?> sgu = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphUtils");
        Method mRender = sgu.getDeclaredMethod("render", GrammaticalStructure.class, String.class);
        // the first arg is null because it's a static method....
        mRender.invoke(null, gs, "Collapsed, CC processed deps");
      } catch (Exception e) {
        throw new RuntimeException("Couldn't use swing to portray semantic graph", e);
      }
    }

  } // end for
} // end main
// todo [cdm 2013]: Take this out and make it a trees class: TreeIterableByParsing
/**
 * An Iterable of trees obtained by parsing text one line at a time, each
 * line being treated as one sentence. Nothing is read or parsed until the
 * iterator is advanced.
 * <p>
 * The text comes either from a file, which is opened anew for each call of
 * {@link #iterator()} and closed once its end is reached, or from a Reader
 * supplied by the caller, which is never closed here. In the latter case
 * every iterator reads from that same Reader.
 */
static class LazyLoadTreesByParsing implements Iterable<Tree> {

  final Reader reader;
  final String filename;
  final boolean tokenized;
  final String encoding;
  final Function<List<? extends HasWord>, Tree> lp;

  /**
   * @param filename File to read sentences from, one per line
   * @param encoding Character encoding of the file
   * @param tokenized If true, lines are split on whitespace, otherwise
   *          they are tokenized with the PTBTokenizer
   * @param lp Parser applied to the words of each non-empty line
   */
  public LazyLoadTreesByParsing(String filename, String encoding, boolean tokenized, Function<List<? extends HasWord>, Tree> lp) {
    this.filename = filename;
    this.encoding = encoding;
    this.reader = null;
    this.tokenized = tokenized;
    this.lp = lp;
  }

  /**
   * @param reader Source of the sentences, one per line; not closed by this class
   * @param tokenized If true, lines are split on whitespace, otherwise
   *          they are tokenized with the PTBTokenizer
   * @param lp Parser applied to the words of each non-empty line
   */
  public LazyLoadTreesByParsing(Reader reader, boolean tokenized, Function<List<? extends HasWord>, Tree> lp) {
    this.filename = null;
    this.encoding = null;
    this.reader = reader;
    this.tokenized = tokenized;
    this.lp = lp;
  }

  /**
   * Returns an iterator producing one tree per input line. A line without
   * any words yields an empty {@code SimpleTree} rather than a parse.
   *
   * @throws RuntimeException Wrapping the IOException if the file cannot be
   *           opened (and, from the iterator's methods, if it cannot be read)
   */
  @Override
  public Iterator<Tree> iterator() {
    final BufferedReader iReader;
    if (reader != null) {
      iReader = new BufferedReader(reader);
    } else {
      try {
        iReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding));
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    return new Iterator<Tree>() {

      // the line read ahead by hasNext() and not yet consumed by next()
      String line = null;
      // set once the end of input was seen, so that the (possibly closed)
      // reader is never touched again
      boolean exhausted = false;

      @Override
      public boolean hasNext() {
        if (line != null) {
          return true;
        }
        if (exhausted) {
          return false;
        }
        try {
          line = iReader.readLine();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
        if (line == null) {
          exhausted = true;
          try {
            // only close what was opened here, not a caller-supplied reader
            if (reader == null) iReader.close();
          } catch (Exception e) {
            throw new RuntimeException(e);
          }
          return false;
        }
        return true;
      }

      @Override
      public Tree next() {
        // Fetch a line if hasNext() was not called beforehand.
        if (!hasNext()) {
          throw new NoSuchElementException();
        }
        Reader lineReader = new StringReader(line);
        line = null;
        List<Word> words;
        if (tokenized) {
          words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize();
        } else {
          words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize();
        }
        if (!words.isEmpty()) {
          // the parser throws an exception if told to parse an empty sentence.
          return lp.apply(words);
        } else {
          return new SimpleTree();
        }
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }

} // end static class LazyLoadTreesByParsing
}