package edu.stanford.nlp.trees;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.trees.international.pennchinese.ChineseEnglishWordMap;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.XMLUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
/**
* A class for customizing the print method(s) for a
* {@code edu.stanford.nlp.trees.Tree} as the output of the
* parser. This class supports printing in multiple ways and altering
* behavior via properties specified at construction.
*
* @author Roger Levy
* @author Christopher Manning
* @author Galen Andrew
*/
public class TreePrint {
/** A logger for this class. */
private static Redwood.RedwoodChannels log = Redwood.channels(TreePrint.class);
// TODO: Add support for makeCopulaHead as an outputFormatOption here.
/** Name of the output format that prints only the value of the tree's root label. */
public static final String rootLabelOnlyFormat = "rootSymbolOnly";
/** Suffix appended to a node's label value when that node is marked as the head of its parent. */
public static final String headMark = "=H";
/** The legal output tree formats. The constructor rejects any format name that is not listed here. */
public static final String[] outputTreeFormats = {
"penn",
"oneline",
rootLabelOnlyFormat,
"words",
"wordsAndTags",
"dependencies",
"typedDependencies",
"typedDependenciesCollapsed",
"latexTree",
"xmlTree",
"collocations",
"semanticGraph",
"conllStyleDependencies",
"conll2007"
};
// The requested output formats, parsed from the format string; this class only tests for key presence.
private final Properties formats;
// The requested options, parsed from the options string; read through propertyToBoolean().
private final Properties options;
// Option "markHeadNodes": mark head nodes in the tree used for the penn and oneline formats.
private final boolean markHeadNodes;
// Option "lexicalize": lexicalize the tree with the head finder before printing.
private final boolean lexicalize;
// Option "noempty" (also implied by "ptb2text"): prune empty nodes before printing.
private final boolean removeEmpty;
// Option "ptb2text": print the non-XML "words" format through PTBTokenizer.ptb2Text().
private final boolean ptb2text;
// Option "transChinese": append ':' plus a translation (or "[UNK]") to each leaf of the printed tree.
private final boolean transChinese;
// The following flags select which typed dependency variants the "typedDependencies" format prints.
private final boolean basicDependencies;
private final boolean collapsedDependencies;
private final boolean nonCollapsedDependencies;
private final boolean nonCollapsedDependenciesSeparated;
// Forced to true by the constructor when none of the other dependency-variant options is set.
private final boolean CCPropagatedDependencies;
private final boolean treeDependencies;
// Option "includeTags": passed through to the typed dependency printing methods.
private final boolean includeTags;
// Head finder used for lexicalization, collocations, head marking and untyped dependencies.
private final HeadFinder hf;
// Language pack; used here to recognize the start symbol and to build filters and the gsf.
private final TreebankLanguagePack tlp;
// Non-null only when the "stem" option is set.
private final WordStemmer stemmer;
// Dependency filter used by the conll2007 and conllStyleDependencies formats.
private final Predicate<Dependency<Label, Label, Object>> dependencyFilter;
// Dependency filter used by the "dependencies" format.
private final Predicate<Dependency<Label, Label, Object>> dependencyWordFilter;
// Non-null only when typedDependencies or typedDependenciesCollapsed is requested, or
// conll2007 is requested and the language pack supports grammatical structures.
private final GrammaticalStructureFactory gsf;
/** Pool use of one WordNetConnection. I don't really know if
* Dan Bikel's WordNet code is thread safe, but it definitely doesn't
* close its files, and too much of our code makes TreePrint objects and
* then drops them on the floor, and so we run out of file handles.
* That is, if this variable isn't static, code crashes.
* Maybe we should change this code to use jwnl(x)?
* CDM July 2006.
*/
private static WordNetConnection wnc;
/** This PrintWriter is used iff the user doesn't pass one in to a
* call to printTree(). It prints to System.out and is created with
* automatic flushing enabled.
*/
private final PrintWriter pw = new PrintWriter(System.out, true);
/** Construct a new TreePrint that will print the given formats.
* Warning! This is the anglocentric constructor.
* It will work correctly only for English: it always uses a new
* PennTreebankLanguagePack and an empty options string.
*
* @param formats The formats to print the tree in.
*/
public TreePrint(String formats) {
this(formats, "", new PennTreebankLanguagePack());
}
/** Make a TreePrint instance with no options specified.
*
* @param formats The formats to print the tree in.
* @param tlp The TreebankLanguagePack to use.
*/
public TreePrint(String formats, TreebankLanguagePack tlp) {
this(formats, "", tlp);
}
/** Make a TreePrint instance. This one uses the head finder and the
* typed dependency head finder supplied by the given tlp.
*
* @param formats The formats to print the tree in.
* @param options The options that further specify how trees are printed.
* @param tlp The TreebankLanguagePack to use; also the source of both head finders.
*/
public TreePrint(String formats, String options, TreebankLanguagePack tlp) {
this(formats, options, tlp, tlp.headFinder(), tlp.typedDependencyHeadFinder());
}
/**
 * Make a TreePrint instance.
 *
 * @param formatString A comma separated list of ways to print each Tree,
 *                for instance "penn" or "words,typedDependencies".
 *                Every name must be one of {@link #outputTreeFormats}:
 *                oneline, penn, latexTree, xmlTree, words,
 *                wordsAndTags, rootSymbolOnly, dependencies,
 *                typedDependencies, typedDependenciesCollapsed,
 *                collocations, semanticGraph, conllStyleDependencies,
 *                conll2007. The last two are both tab-separated values
 *                formats; the latter has a lot more columns filled with
 *                underscores. All of them print a blank line after
 *                the output except for oneline. oneline is also not
 *                meaningful in XML output (it is ignored: use penn instead).
 *                (Use of typedDependenciesCollapsed is deprecated. It
 *                works but we recommend instead selecting a type of
 *                dependencies using the optionsString argument.)
 * @param optionsString Options that additionally specify how trees are to
 *                be printed (for instance, whether stemming should be done).
 *                Options read by this class are: {@code stem, lexicalize,
 *                markHeadNodes, xml, removeTopBracket, transChinese, ptb2text,
 *                noempty, includePunctuationDependencies, basicDependencies,
 *                treeDependencies, CCPropagatedDependencies,
 *                collapsedDependencies, nonCollapsedDependencies,
 *                nonCollapsedDependenciesSeparated, includeTags}.
 *                If none of the dependency-variant options other than
 *                CCPropagatedDependencies is set, CCPropagatedDependencies
 *                is turned on by default.
 * @param tlp     The TreebankLanguagePack used to do things like delete
 *                or ignore punctuation in output
 * @param hf      The HeadFinder used in printing output
 * @param typedDependencyHF The HeadFinder handed to the language pack's
 *                grammatical structure factory (only used when a format
 *                that needs typed dependencies is requested)
 * @throws RuntimeException if formatString names an unsupported format
 */
public TreePrint(String formatString, String optionsString, TreebankLanguagePack tlp, HeadFinder hf, HeadFinder typedDependencyHF) {
  formats = StringUtils.stringToProperties(formatString);
  options = StringUtils.stringToProperties(optionsString);

  // Refuse to continue as soon as we meet a format we do not know about.
  final List<String> knownFormats = Arrays.asList(outputTreeFormats);
  for (Object key : formats.keySet()) {
    final String requested = (String) key;
    if (knownFormats.contains(requested)) {
      continue;
    }
    throw new RuntimeException("Error: output tree format " + requested + " not supported. Known formats are: " + knownFormats);
  }

  this.hf = hf;
  this.tlp = tlp;

  final boolean keepPunctuation = propertyToBoolean(this.options, "includePunctuationDependencies");
  final boolean originalDependencies = tlp.generateOriginalDependencies();

  final Predicate<String> puncFilter;
  if ( ! keepPunctuation) {
    dependencyFilter = new Dependencies.DependentPuncTagRejectFilter<>(tlp.punctuationTagRejectFilter());
    dependencyWordFilter = new Dependencies.DependentPuncWordRejectFilter<>(tlp.punctuationWordRejectFilter());
    // Universal dependencies filter punctuation by tags, the original ones by words
    if (originalDependencies) {
      puncFilter = tlp.punctuationWordRejectFilter();
    } else {
      puncFilter = tlp.punctuationTagRejectFilter();
    }
  } else {
    dependencyFilter = Filters.acceptFilter();
    dependencyWordFilter = Filters.acceptFilter();
    puncFilter = Filters.acceptFilter();
  }

  stemmer = propertyToBoolean(this.options, "stem") ? new WordStemmer() : null;

  // A grammatical structure factory is only built for the formats that can use one.
  final boolean wantsGrammaticalStructures =
      formats.containsKey("typedDependenciesCollapsed")
      || formats.containsKey("typedDependencies")
      || (formats.containsKey("conll2007") && tlp.supportsGrammaticalStructures());
  gsf = wantsGrammaticalStructures ? tlp.grammaticalStructureFactory(puncFilter, typedDependencyHF) : null;

  lexicalize = propertyToBoolean(this.options, "lexicalize");
  markHeadNodes = propertyToBoolean(this.options, "markHeadNodes");
  transChinese = propertyToBoolean(this.options, "transChinese");
  ptb2text = propertyToBoolean(this.options, "ptb2text");
  removeEmpty = propertyToBoolean(this.options, "noempty") || ptb2text;

  basicDependencies = propertyToBoolean(this.options, "basicDependencies");
  collapsedDependencies = propertyToBoolean(this.options, "collapsedDependencies");
  nonCollapsedDependencies = propertyToBoolean(this.options, "nonCollapsedDependencies");
  nonCollapsedDependenciesSeparated = propertyToBoolean(this.options, "nonCollapsedDependenciesSeparated");
  treeDependencies = propertyToBoolean(this.options, "treeDependencies");
  includeTags = propertyToBoolean(this.options, "includeTags");

  // if no option format for the dependencies is specified, CCPropagated is the default
  final boolean otherVariantChosen = basicDependencies || collapsedDependencies
      || nonCollapsedDependencies || nonCollapsedDependenciesSeparated || treeDependencies;
  CCPropagatedDependencies = ! otherVariantChosen || propertyToBoolean(this.options, "CCPropagatedDependencies");
}
/**
 * Reads a flag from a Properties object.
 *
 * @param prop The properties to look in
 * @param key The name of the flag
 * @return Whatever {@code Boolean.parseBoolean} makes of the property's
 *         value, so {@code false} when the key is absent
 */
private static boolean propertyToBoolean(Properties prop, String key) {
  final String rawValue = prop.getProperty(key);
  return Boolean.parseBoolean(rawValue);
}
/**
* Prints the tree to the default PrintWriter (the one this instance
* created over System.out), with an empty ID.
*
* @param t The tree to display
*/
public void printTree(Tree t) {
printTree(t, pw);
}
/**
* Prints the tree, with an empty ID.
*
* @param t The tree to display
* @param pw The PrintWriter to print it to
*/
public void printTree(final Tree t, PrintWriter pw) {
printTree(t, "", pw);
}
/**
 * Prints the tree according to the options specified for this instance.
 * A {@code null} tree is reported as skipped: under the XML option as an
 * {@code s} element whose {@code skipped} attribute is {@code true}, and
 * otherwise as the token {@code SENTENCE_SKIPPED_OR_UNPARSABLE}.
 * Under the XML option a non-null tree is wrapped in an {@code s} element,
 * and in both XML cases the element is followed by a blank line. The
 * {@code id} attribute is only written when the id is neither null nor empty.
 *
 * @param t The tree to display
 * @param id A name for this sentence
 * @param pw Where to display the tree
 */
public void printTree(final Tree t, final String id, final PrintWriter pw) {
  final boolean inXml = propertyToBoolean(options, "xml");

  if ( ! inXml) {
    if (t == null) {
      // Parsing didn't succeed.
      pw.println("SENTENCE_SKIPPED_OR_UNPARSABLE");
    } else {
      printTreeInternal(t, pw, false);
    }
    return;
  }

  final String idAttribute = StringUtils.isNullOrEmpty(id) ? "" : " id=\"" + XMLUtils.escapeXML(id) + '\"';
  if (t == null) {
    // Parsing didn't succeed.
    pw.println("<s" + idAttribute + " skipped=\"true\"/>");
  } else {
    pw.println("<s" + idAttribute + ">");
    printTreeInternal(t, pw, true);
    pw.println("</s>");
  }
  pw.println();
}
/**
 * Prints a list of scored trees according to the options specified for
 * this instance. Trees are numbered from 1 in list order. Under the XML
 * option each tree becomes an {@code s} element carrying the number
 * ({@code n}) and {@code score} attributes, plus {@code id} when the id is
 * neither null nor empty; otherwise each tree is preceded by a
 * {@code # Parse N with score S} line.
 * A {@code null} tree is reported as skipped: in XML through an empty
 * {@code s} element with {@code skipped="true"}, otherwise through a
 * {@code SENTENCE_SKIPPED_OR_UNPARSABLE} line that also gives the number
 * and score.
 *
 * @param trees The list of trees to display
 * @param id A name for this sentence
 * @param pw Where to display the trees
 */
public void printTrees(final List<ScoredObject<Tree>> trees, final String id, final PrintWriter pw) {
  final boolean inXml = propertyToBoolean(options, "xml");
  int rank = 0;  // pre-incremented, so the first tree is numbered 1
  for (ScoredObject<Tree> scored : trees) {
    rank++;
    final Tree tree = scored.object();
    final double score = scored.score();

    if ( ! inXml) {
      if (tree == null) {
        // Parsing didn't succeed.
        pw.println("SENTENCE_SKIPPED_OR_UNPARSABLE Parse #" + rank + " with score " + score);
      } else {
        pw.println("# Parse " + rank + " with score " + score);
        printTreeInternal(tree, pw, false);
      }
      continue;
    }

    // XML: both the skipped and the regular element start with the same attributes.
    final StringBuilder openTag = new StringBuilder("<s");
    if ( ! StringUtils.isNullOrEmpty(id)) {
      openTag.append(" id=\"").append(XMLUtils.escapeXML(id)).append('\"');
    }
    openTag.append(" n=\"").append(rank).append('\"');
    openTag.append(" score=\"").append(score).append('\"');

    if (tree == null) {
      // Parsing didn't succeed.
      openTag.append(" skipped=\"true\"/>");
      pw.println(openTag.toString());
    } else {
      openTag.append(">");
      pw.println(openTag.toString());
      printTreeInternal(tree, pw, true);
      pw.println("</s>");
    }
    pw.println();
  }
}
/** Print the internal part of a tree having already identified it.
 *  The ID and outer XML element is printed wrapping this method, but none
 *  of the internal content.
 *  <p>
 *  The tree is first transformed, in this order: empty nodes are pruned
 *  (for conll2007 or the noempty/ptb2text options), the "words" format is
 *  printed, the top bracket is removed (removeTopBracket), words are
 *  stemmed (stem), the tree is lexicalized (lexicalize), collocations are
 *  collapsed (collocations format), and, unless lexicalizing, labels are
 *  turned into string labels. Head marking and Chinese translation are
 *  applied only to the copy used for the penn and oneline formats.
 *  After that every remaining requested format is printed, either as XML
 *  or as plain text, and the writer is flushed.
 *
 *  @param t The tree to print. Now known to be non-null
 *  @param pw Where to print it to
 *  @param inXml Whether to use XML style printing
 */
private void printTreeInternal(final Tree t, final PrintWriter pw, final boolean inXml) {
  Tree outputTree = t;

  if (formats.containsKey("conll2007") || removeEmpty) {
    outputTree = outputTree.prune(new BobChrisTreeNormalizer.EmptyFilter());
  }

  // The words are printed before any stemming or lexicalization is applied.
  if (formats.containsKey("words")) {
    if (inXml) {
      ArrayList<Label> sentUnstemmed = outputTree.yield();
      pw.println(" <words>");
      int i = 1;
      for (Label w : sentUnstemmed) {
        pw.println(" <word ind=\"" + i + "\">" + XMLUtils.escapeXML(w.value()) + "</word>");
        i++;
      }
      pw.println(" </words>");
    } else {
      String sent = SentenceUtils.listToString(outputTree.yield(), false);
      if (ptb2text) {
        pw.println(PTBTokenizer.ptb2Text(sent));
      } else {
        pw.println(sent);
        pw.println();
      }
    }
  }

  if (propertyToBoolean(options, "removeTopBracket")) {
    String s = outputTree.label().value();
    if (tlp.isStartSymbol(s)) {
      if (outputTree.isUnaryRewrite()) {
        outputTree = outputTree.firstChild();
      } else {
        // It's not quite clear what to do if the tree isn't unary at the top
        // but we then don't strip the ROOT symbol, since that seems closer
        // than losing part of the tree altogether....
        log.info("TreePrint: can't remove top bracket: not unary");
      }
    }
    // Note that TreePrint is also called on dependency trees that have
    // a word as the root node, and so we don't error if there isn't
    // the root symbol at the top; rather we silently assume that this
    // is a dependency tree!!
  }

  if (stemmer != null) {
    stemmer.visitTree(outputTree);
  }

  if (lexicalize) {
    outputTree = Trees.lexicalize(outputTree, hf);
    Function<Tree, Tree> a =
        TreeFunctions.getLabeledToDescriptiveCoreLabelTreeFunction();
    outputTree = a.apply(outputTree);
  }

  if (formats.containsKey("collocations")) {
    outputTree = getCollocationProcessedTree(outputTree, hf);
  }

  if (!lexicalize) { // delexicalize the output tree
    Function<Tree, Tree> a =
        TreeFunctions.getLabeledTreeToStringLabeledTreeFunction();
    outputTree = a.apply(outputTree);
  }

  Tree outputPSTree = outputTree;  // variant with head-marking, translations

  if (markHeadNodes) {
    outputPSTree = markHeadNodes(outputPSTree);
  }

  if (transChinese) {
    TreeTransformer tt = t1 -> {
      t1 = t1.treeSkeletonCopy();
      for (Tree subtree : t1) {
        if (subtree.isLeaf()) {
          Label oldLabel = subtree.label();
          String translation = ChineseEnglishWordMap.getInstance().getFirstTranslation(oldLabel.value());
          if (translation == null) translation = "[UNK]";
          Label newLabel = new StringLabel(oldLabel.value() + ':' + translation);
          subtree.setLabel(newLabel);
        }
      }
      return t1;
    };
    outputPSTree = tt.transformTree(outputPSTree);
  }

  if (inXml) {
    if (formats.containsKey("wordsAndTags")) {
      ArrayList<TaggedWord> sent = outputTree.taggedYield();
      pw.println(" <words pos=\"true\">");
      int i = 1;
      for (TaggedWord tw : sent) {
        pw.println(" <word ind=\"" + i + "\" pos=\"" + XMLUtils.escapeXML(tw.tag()) + "\">" + XMLUtils.escapeXML(tw.word()) + "</word>");
        i++;
      }
      pw.println(" </words>");
    }
    if (formats.containsKey("penn")) {
      pw.println(" <tree style=\"penn\">");
      // Print into a buffer first so that the tree text can be XML-escaped.
      StringWriter sw = new StringWriter();
      PrintWriter psw = new PrintWriter(sw);
      outputPSTree.pennPrint(psw);
      pw.print(XMLUtils.escapeXML(sw.toString()));
      pw.println(" </tree>");
    }
    if (formats.containsKey("latexTree")) {
      pw.println(" <tree style=\"latexTrees\">");
      pw.println(".[");
      StringWriter sw = new StringWriter();
      PrintWriter psw = new PrintWriter(sw);
      outputTree.indentedListPrint(psw,false);
      pw.print(XMLUtils.escapeXML(sw.toString()));
      pw.println(".]");
      pw.println(" </tree>");
    }
    if (formats.containsKey("xmlTree")) {
      pw.println("<tree style=\"xml\">");
      outputTree.indentedXMLPrint(pw,false);
      pw.println("</tree>");
    }
    if (formats.containsKey("dependencies")) {
      Tree indexedTree = outputTree.deepCopy(outputTree.treeFactory(),
          CoreLabel.factory());
      indexedTree.indexLeaves();
      Set<Dependency<Label, Label, Object>> depsSet = indexedTree.mapDependencies(dependencyWordFilter, hf);
      List<Dependency<Label, Label, Object>> sortedDeps = new ArrayList<>(depsSet);
      Collections.sort(sortedDeps, Dependencies.dependencyIndexComparator());
      pw.println("<dependencies style=\"untyped\">");
      for (Dependency<Label, Label, Object> d : sortedDeps) {
        pw.println(d.toString("xml"));
      }
      pw.println("</dependencies>");
    }
    if (formats.containsKey("conll2007") || formats.containsKey("conllStyleDependencies")) {
      log.info("The \"conll2007\" and \"conllStyleDependencies\" formats are ignored in xml.");
    }
    if (formats.containsKey("typedDependencies")) {
      GrammaticalStructure gs = gsf.newGrammaticalStructure(outputTree);
      if (basicDependencies) {
        print(gs.typedDependencies(), "xml", includeTags, pw);
      }
      if (nonCollapsedDependencies || nonCollapsedDependenciesSeparated) {
        print(gs.allTypedDependencies(), "xml", includeTags, pw);
      }
      if (collapsedDependencies) {
        print(gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), "xml", includeTags, pw);
      }
      if (CCPropagatedDependencies) {
        print(gs.typedDependenciesCCprocessed(), "xml", includeTags, pw);
      }
      if (treeDependencies) {
        print(gs.typedDependenciesCollapsedTree(), "xml", includeTags, pw);
      }
    }
    if (formats.containsKey("typedDependenciesCollapsed")) {
      GrammaticalStructure gs = gsf.newGrammaticalStructure(outputTree);
      print(gs.typedDependenciesCCprocessed(), "xml", includeTags, pw);
    }

    // This makes parser require jgrapht.  Bad.
    // if (formats.containsKey("semanticGraph")) {
    //  SemanticGraph sg = SemanticGraph.makeFromTree(outputTree, true, false, false, null);
    //  pw.println(sg.toFormattedString());
    // }
  } else {
    // non-XML printing
    if (formats.containsKey("wordsAndTags")) {
      pw.println(SentenceUtils.listToString(outputTree.taggedYield(), false));
      pw.println();
    }
    if (formats.containsKey("oneline")) {
      pw.println(outputPSTree);
    }
    if (formats.containsKey("penn")) {
      outputPSTree.pennPrint(pw);
      pw.println();
    }
    if (formats.containsKey(rootLabelOnlyFormat)) {
      pw.println(outputTree.label().value());
    }
    if (formats.containsKey("latexTree")) {
      pw.println(".[");
      outputTree.indentedListPrint(pw,false);
      pw.println(".]");
    }
    if (formats.containsKey("xmlTree")) {
      outputTree.indentedXMLPrint(pw,false);
    }
    if (formats.containsKey("dependencies")) {
      Tree indexedTree = outputTree.deepCopy(outputTree.treeFactory());
      indexedTree.indexLeaves();
      List<Dependency<Label, Label, Object>> sortedDeps = getSortedDeps(indexedTree, dependencyWordFilter);
      for (Dependency<Label, Label, Object> d : sortedDeps) {
        pw.println(d.toString("predicate"));
      }
      pw.println();
    }
    if (formats.containsKey("conll2007")) {
      // CoNLL-X 2007 format: http://ilk.uvt.nl/conll/#dataformat
      // wsg: This code should be retained (and not subsumed into EnglishGrammaticalStructure) so
      //      that dependencies for other languages can be printed.
      // wsg2011: This code currently ignores the dependency label since the present implementation
      //          of mapDependencies() returns UnnamedDependency objects.
      // TODO: if there is a GrammaticalStructureFactory available, use that instead of mapDependencies
      Tree it = outputTree.deepCopy(outputTree.treeFactory(), CoreLabel.factory());
      it.indexLeaves();

      List<CoreLabel> tagged = it.taggedLabeledYield();
      List<Dependency<Label, Label, Object>> sortedDeps = getSortedDeps(it, Filters.acceptFilter());

      for (Dependency<Label, Label, Object> d : sortedDeps) {
        if (!dependencyFilter.test(d)) {
          continue;
        }
        if (!(d.dependent() instanceof HasIndex) || !(d.governor() instanceof HasIndex)) {
          throw new IllegalArgumentException("Expected labels to have indices");
        }
        HasIndex dep = (HasIndex) d.dependent();
        HasIndex gov = (HasIndex) d.governor();

        int depi = dep.index();
        int govi = gov.index();

        // Leaf indices are 1-based, the tagged yield is a 0-based list.
        CoreLabel w = tagged.get(depi - 1);

        // Used for both coarse and fine POS tag fields
        String tag = PTBTokenizer.ptbToken2Text(w.tag());

        String word = PTBTokenizer.ptbToken2Text(w.word());
        String lemma = "_";
        String feats = "_";
        String pHead = "_";
        String pDepRel = "_";
        String depRel;
        if (d.name() != null) {
          depRel = d.name().toString();
        } else {
          depRel = (govi == 0) ? "ROOT" : "NULL";
        }

        // The 2007 format has 10 fields
        pw.printf("%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s%n", depi, word, lemma, tag, tag, feats, govi, depRel, pHead, pDepRel);
      }
      pw.println();
    }
    if (formats.containsKey("conllStyleDependencies")) {
      // TODO: Rewrite this to output StanfordDependencies using EnglishGrammaticalStructure code
      BobChrisTreeNormalizer tn = new BobChrisTreeNormalizer();
      Tree indexedTree = outputTree.deepCopy(outputTree.treeFactory(),
          CoreLabel.factory());
      // TODO: Can the below for-loop be deleted now?  (Now that the HeadFinder knows about NML.)
      for (Tree node : indexedTree) {
        if (node.label().value().startsWith("NML")) {
          node.label().setValue("NP");
        }
      }

      indexedTree = tn.normalizeWholeTree(indexedTree, outputTree.treeFactory());
      indexedTree.indexLeaves();
      Set<Dependency<Label, Label, Object>> depsSet = null;
      boolean failed = false;
      try {
        depsSet = indexedTree.mapDependencies(dependencyFilter, hf);
      } catch (Exception e) {
        // Best effort: report the tree that could not be converted, and the
        // reason, then carry on with the remaining formats.
        failed = true;
        log.info("failed: ");
        log.info(t);
        log.info(e);
        log.info();
      }
      if ( ! failed) {
        // Maps the index of each dependent leaf to the index of its governor.
        Map<Integer,Integer> deps = Generics.newHashMap();
        for (Dependency<Label, Label, Object> dep : depsSet) {
          CoreLabel child = (CoreLabel)dep.dependent();
          CoreLabel parent = (CoreLabel)dep.governor();
          Integer childIndex =
              child.get(CoreAnnotations.IndexAnnotation.class);
          Integer parentIndex =
              parent.get(CoreAnnotations.IndexAnnotation.class);
          // log.info(childIndex+"\t"+parentIndex);
          deps.put(childIndex, parentIndex);
        }
        boolean foundRoot = false;
        int index = 1;
        for (Tree node : indexedTree.getLeaves()) {
          String word = node.label().value();
          String tag = node.parent(indexedTree).label().value();
          int parent = 0;
          if (deps.containsKey(index)) {
            parent = deps.get(index);
          } else {
            // A leaf without a governor is the root; only one is allowed.
            if (foundRoot) {
              throw new RuntimeException("TreePrint: more than one word without a governor in conllStyleDependencies output (word index " + index + ")");
            }
            foundRoot = true;
          }
          // The separators must be Strings: with char literals the first '+'
          // would add the tab's character code to the index numerically.
          pw.println(index + "\t" + word + "\t" + tag + "\t" + parent);
          index++;
        }
        pw.println();
      }
    }
    if (formats.containsKey("typedDependencies")) {
      GrammaticalStructure gs = gsf.newGrammaticalStructure(outputTree);
      if (basicDependencies) {
        print(gs.typedDependencies(), includeTags, pw);
      }
      if (nonCollapsedDependencies) {
        print(gs.allTypedDependencies(), includeTags, pw);
      }
      if (nonCollapsedDependenciesSeparated) {
        print(gs.allTypedDependencies(), "separator", includeTags, pw);
      }
      if (collapsedDependencies) {
        print(gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), includeTags, pw);
      }
      if (CCPropagatedDependencies) {
        print(gs.typedDependenciesCCprocessed(), includeTags, pw);
      }
      if (treeDependencies) {
        print(gs.typedDependenciesCollapsedTree(), includeTags, pw);
      }
    }
    if (formats.containsKey("typedDependenciesCollapsed")) {
      GrammaticalStructure gs = gsf.newGrammaticalStructure(outputTree);
      print(gs.typedDependenciesCCprocessed(), includeTags, pw);
    }
    // This makes parser require jgrapht.  Bad
    // if (formats.containsKey("semanticGraph")) {
    //  SemanticGraph sg = SemanticGraph.makeFromTree(outputTree, true, false, false, null);
    //  pw.println(sg.toFormattedString());
    // }
  }

  // flush to make sure we see all output
  pw.flush();
}
/**
 * Collects the dependencies of a tree, sorted with
 * {@code Dependencies.dependencyIndexComparator()}.
 * When this instance has a grammatical structure factory, the typed
 * dependencies (without extras) are converted to named dependencies and
 * the {@code filter} argument is not consulted. Otherwise the tree's own
 * dependencies are mapped using the filter, the head finder and the root
 * name "root".
 *
 * @param tree The tree to extract dependencies from
 * @param filter Dependency filter, used only when there is no grammatical
 *               structure factory
 * @return A new sorted list of dependencies
 */
private List<Dependency<Label, Label, Object>> getSortedDeps(Tree tree, Predicate<Dependency<Label, Label, Object>> filter) {
  final List<Dependency<Label, Label, Object>> result;
  if (gsf == null) {
    Set<Dependency<Label, Label, Object>> mapped = tree.mapDependencies(filter, hf, "root");
    result = new ArrayList<>(mapped);
  } else {
    GrammaticalStructure structure = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> typedDeps = structure.typedDependencies(GrammaticalStructure.Extras.NONE);
    result = new ArrayList<>();
    for (TypedDependency typed : typedDeps) {
      result.add(new NamedDependency(typed.gov(), typed.dep(), typed.reln().toString()));
    }
  }
  Collections.sort(result, Dependencies.dependencyIndexComparator());
  return result;
}
/** For the input tree, collapse any collocations in it that exist in
 *  WordNet and are contiguous in the tree into a single node.
 *  A single static WordNet connection is used by all instances of this
 *  class; it is created reflectively on first use so that this class has no
 *  compile-time dependency on the WordNet implementation. If the connection
 *  cannot be created, the failure is logged and the tree is returned as is
 *  (and creation is attempted again on the next call).
 *
 *  @param tree The input tree.  NOTE: This tree is mangled by this method
 *  @param hf The head finder to use
 *  @return The collocation collapsed tree, or the input tree if no WordNet
 *          connection is available
 */
private static synchronized Tree getCollocationProcessedTree(Tree tree,
                                                HeadFinder hf) {
  if (wnc == null) {
    try {
      Class<?> cl = Class.forName("edu.stanford.nlp.trees.WordNetInstance");
      // Class.newInstance() is deprecated; go through the no-arg constructor instead.
      wnc = (WordNetConnection) cl.getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      log.info("Couldn't open WordNet Connection.  Aborting collocation detection.");
      log.info(e);
      wnc = null;
    }
  }
  if (wnc != null) {
    CollocationFinder cf = new CollocationFinder(tree, wnc, hf);
    tree = cf.getMangledTree();
  } else {
    log.error("WordNetConnection unavailable for collocations.");
  }
  return tree;
}
/**
 * Prints the document header. Under the XML option this is the XML
 * declaration naming the given charset followed by the opening
 * {@code corpus} tag; otherwise nothing is printed.
 *
 * @param pw Where to print the header
 * @param charset The encoding name to put in the XML declaration
 */
public void printHeader(PrintWriter pw, String charset) {
  if ( ! propertyToBoolean(options, "xml")) {
    return;
  }
  pw.println("<?xml version=\"1.0\" encoding=\"" + charset + "\"?>");
  pw.println("<corpus>");
}
/**
 * Prints the document footer. Under the XML option this is the closing
 * {@code corpus} tag; otherwise nothing is printed.
 *
 * @param pw Where to print the footer
 */
public void printFooter(PrintWriter pw) {
  if ( ! propertyToBoolean(options, "xml")) {
    return;
  }
  pw.println("</corpus>");
}
/**
 * Builds a copy of the tree's internal nodes in which every node that its
 * parent's head finder picks as head carries the {@link #headMark} suffix.
 * The root itself is never marked, and leaves are reused unchanged.
 *
 * @param t The tree to mark
 * @return The head-marked tree
 */
public Tree markHeadNodes(Tree t) {
  return markHeadNodes(t, null);
}

/**
 * Recursive worker for head marking.
 *
 * @param t The subtree to rebuild
 * @param head The node that the parent of {@code t} chose as its head
 *             (compared by identity), or null at the root
 * @return {@code t} itself for a leaf, otherwise a new node whose label is
 *         marked iff {@code t} is {@code head}
 */
private Tree markHeadNodes(Tree t, Tree head) {
  if (t.isLeaf()) {
    return t;  // don't worry about head-marking leaves
  }
  final Label relabeled = (t == head) ? headMark(t.label()) : t.label();
  final Tree headChild = hf.determineHead(t);
  return t.treeFactory().newTreeNode(relabeled, Arrays.asList(headMarkChildren(t, headChild)));
}
/**
 * Makes a head-marked copy of a label.
 *
 * @param l The label to copy
 * @return A new label, made by the label's own factory, whose value is the
 *         original value followed by the {@code headMark} suffix
 */
private static Label headMark(Label l) {
  final Label marked = l.labelFactory().newLabel(l);
  final String plainValue = marked.value();
  marked.setValue(plainValue + headMark);
  return marked;
}
/**
 * Head-marks every child of a node, in order.
 *
 * @param t The node whose children are processed
 * @param head The child that was chosen as the head of {@code t}
 * @return A new array holding the processed children
 */
private Tree[] headMarkChildren(Tree t, Tree head) {
  return Arrays.stream(t.children())
      .map(kid -> markHeadNodes(kid, head))
      .toArray(Tree[]::new);
}
/** This provides a simple main method for calling TreePrint.
 *  Flags supported are:
 *  <ol>
 *  <li> -format format (like -outputFormat of parser, default "penn")
 *  <li> -options options (like -outputFormatOptions of parser, default "")
 *  <li> -tLP class (the TreebankLanguagePack, default "edu.stanford.nlp.trees.PennTreebankLanguagePack")
 *  <li> -hf class (the HeadFinder, default, the one in the class specified by -tLP)
 *  <li> -useTLPTreeReader (use the treeReaderFactory() inside
 *       the -tLP class; otherwise a PennTreeReader with no normalization is used)
 *  </ol>
 *  The single argument should be a file containing Trees in the format that is either
 *  Penn Treebank s-expressions or as specified by -useTLPTreeReader and the -tLP class,
 *  or if there is no such argument, trees are read from stdin and the program runs as a
 *  filter.
 *  If the -tLP or -hf class cannot be instantiated, the problem is logged
 *  and the program returns without printing anything.
 *
 *  @param args Command line arguments, as above.
 */
public static void main(String[] args) {
  String format = "penn";
  String options = "";
  String tlpName = "edu.stanford.nlp.trees.PennTreebankLanguagePack";
  String hfName = null;

  // Each of these flags takes exactly one argument.
  Map<String,Integer> flagMap = Generics.newHashMap();
  flagMap.put("-format", 1);
  flagMap.put("-options", 1);
  flagMap.put("-tLP", 1);
  flagMap.put("-hf", 1);
  Map<String,String[]> argsMap = StringUtils.argsToMap(args, flagMap);
  // The non-flag arguments are stored under the null key.
  args = argsMap.get(null);
  if (argsMap.containsKey("-format")) {
    format = argsMap.get("-format")[0];
  }
  if (argsMap.containsKey("-options")) {
    options = argsMap.get("-options")[0];
  }
  if (argsMap.containsKey("-tLP")) {
    tlpName = argsMap.get("-tLP")[0];
  }
  if (argsMap.containsKey("-hf")) {
    hfName = argsMap.get("-hf")[0];
  }

  TreebankLanguagePack tlp;
  try {
    // Class.newInstance() is deprecated; go through the no-arg constructor instead.
    tlp = (TreebankLanguagePack) Class.forName(tlpName).getDeclaredConstructor().newInstance();
  } catch (Exception e) {
    log.warning(e);
    return;
  }

  HeadFinder hf;
  if (hfName != null) {
    try {
      hf = (HeadFinder) Class.forName(hfName).getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      log.warning(e);
      return;
    }
  } else {
    hf = tlp.headFinder();
  }

  TreePrint print = new TreePrint(format, options, tlp, (hf == null) ? tlp.headFinder(): hf, tlp.typedDependencyHeadFinder());

  Iterator<Tree> i; // initialized below
  if (args.length > 0) {
    Treebank trees; // initialized below
    TreeReaderFactory trf;
    if (argsMap.containsKey("-useTLPTreeReader")) {
      trf = tlp.treeReaderFactory();
    } else {
      trf = in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new StringLabelFactory()), new TreeNormalizer());
    }
    trees = new DiskTreebank(trf);
    trees.loadPath(args[0]);
    i = trees.iterator();
  } else {
    // No file given: act as a filter over stdin.
    i = tlp.treeTokenizerFactory().getTokenizer(new BufferedReader(new InputStreamReader(System.in)));
  }

  while (i.hasNext()) {
    print.printTree(i.next());
  }
}
/**
 * NO OUTSIDE USE
 * Returns a String representation of a set of typed dependencies in a
 * user-specified format. The recognized format names are:
 * <dl>
 * <dt>"xml"</dt>
 * <dd>Formats the dependencies as XML, as exemplified by the following:
 * <pre>
 *  &lt;dependencies&gt;
 *    &lt;dep type="nsubj"&gt;
 *      &lt;governor idx="1"&gt;died&lt;/governor&gt;
 *      &lt;dependent idx="0"&gt;Sam&lt;/dependent&gt;
 *    &lt;/dep&gt;
 *    &lt;dep type="tmod"&gt;
 *      &lt;governor idx="1"&gt;died&lt;/governor&gt;
 *      &lt;dependent idx="2"&gt;today&lt;/dependent&gt;
 *    &lt;/dep&gt;
 *  &lt;/dependencies&gt;
 * </pre>
 * </dd>
 * <dt>"readable"</dt>
 * <dd>Formats the dependencies as a table with columns
 * {@code dependent}, {@code relation}, and
 * {@code governor}, as exemplified by the following:
 * <pre>
 *  Sam-0               nsubj               died-1
 *  today-2             tmod                died-1
 * </pre>
 * </dd>
 * <dt>"separator"</dt>
 * <dd>Like the plain format, but the extra dependencies are printed
 * separately, after the other ones.</dd>
 * <dt>anything else, including {@code null} ("plain")</dt>
 * <dd>(Default.)  Formats the dependencies as logical relations,
 * as exemplified by the following:
 * <pre>
 *  nsubj(died-1, Sam-0)
 *  tmod(died-1, today-2)
 * </pre>
 * </dd>
 * </dl>
 *
 * @param dependencies The TypedDependencies to print
 * @param format a {@code String} specifying the desired format
 * @param includeTags Whether to include tags; not used by the "readable" format
 * @return a {@code String} representation of the typed dependencies
 */
private static String toString(Collection<TypedDependency> dependencies, String format, boolean includeTags) {
  if ("xml".equals(format)) {
    return toXMLString(dependencies, includeTags);
  }
  if ("readable".equals(format)) {
    return toReadableString(dependencies);
  }
  // Both remaining cases are the plain format; "separator" only moves the extras to the end.
  final boolean separateExtras = "separator".equals(format);
  return toString(dependencies, separateExtras, includeTags);
}
/**
 * NO OUTSIDE USE
 * Returns a String representation of this set of typed dependencies,
 * one per line, as exemplified by the following:
 * <pre>
 *  tmod(died-6, today-9)
 *  nsubj(died-6, Sam-3)
 * </pre>
 *
 * @param dependencies The TypedDependencies to print
 * @param extraSep boolean indicating whether the extra dependencies have to
 *        be printed separately, after the basic ones; if so, and there is at
 *        least one extra dependency, a {@code ======} line precedes them
 * @param includeTags Whether words are printed with their tag as well as
 *        their index
 * @return a {@code String} representation of this set of
 *         typed dependencies
 */
private static String toString(Collection<TypedDependency> dependencies, boolean extraSep, boolean includeTags) {
  final CoreLabel.OutputFormat labelFormat;
  if (includeTags) {
    labelFormat = CoreLabel.OutputFormat.VALUE_TAG_INDEX;
  } else {
    labelFormat = CoreLabel.OutputFormat.VALUE_INDEX;
  }

  final StringBuilder out = new StringBuilder();
  // Stays empty unless extraSep is set, so the separator section below is then skipped.
  final List<TypedDependency> deferredExtras = new ArrayList<>();
  for (TypedDependency dependency : dependencies) {
    if (extraSep && dependency.extra()) {
      deferredExtras.add(dependency);
    } else {
      out.append(dependency.toString(labelFormat)).append('\n');
    }
  }

  // now we print the separator for extra dependencies, and print these if there are some
  if ( ! deferredExtras.isEmpty()) {
    out.append("======\n");
    for (TypedDependency extra : deferredExtras) {
      out.append(extra.toString(labelFormat)).append('\n');
    }
  }
  return out.toString();
}
/**
 * NO OUTSIDE USE
 * Formats the dependencies as a table with a two-line header and one row
 * per dependency; the three columns (dependent, relation, governor) are
 * left-aligned and padded to 20 characters.
 *
 * @param dependencies The TypedDependencies to print
 * @return The table, every line terminated by the platform line separator
 */
private static String toReadableString(Collection<TypedDependency> dependencies) {
  final String rowFormat = "%-20s%-20s%-20s%n";
  final StringBuilder table = new StringBuilder();
  table.append(String.format(rowFormat, "dep", "reln", "gov"));
  table.append(String.format(rowFormat, "---", "----", "---"));
  for (TypedDependency row : dependencies) {
    table.append(String.format(rowFormat, row.dep(), row.reln(), row.gov()));
  }
  return table.toString();
}
/**
 * NO OUTSIDE USE
 * Formats the dependencies as a {@code dependencies} XML element of style
 * "typed" with one {@code dep} child per dependency. Each {@code dep} has
 * the relation name as its {@code type} (plus {@code extra="yes"} for an
 * extra dependency) and contains a {@code governor} and a
 * {@code dependent} element with the word as text, its index as
 * {@code idx}, a {@code copy} attribute when the node is a copy, and,
 * if requested and available, its tag as {@code tag}.
 * All words, relation names and tags are XML-escaped.
 *
 * @param dependencies The TypedDependencies to print
 * @param includeTags Whether to add a {@code tag} attribute for words that have a tag
 * @return The XML text, without a trailing newline
 */
private static String toXMLString(Collection<TypedDependency> dependencies, boolean includeTags) {
  StringBuilder buf = new StringBuilder("<dependencies style=\"typed\">\n");
  for (TypedDependency td : dependencies) {
    String reln = td.reln().toString();
    String gov = td.gov().value();
    String govTag = td.gov().tag();
    int govIdx = td.gov().index();
    String dep = td.dep().value();
    String depTag = td.dep().tag();
    int depIdx = td.dep().index();
    boolean extra = td.extra();
    // add an attribute if the node is a copy
    // (this happens in collapsing when different prepositions are conjuncts)
    String govCopy = "";
    int copyGov = td.gov().copyCount();
    if (copyGov > 0) {
      govCopy = " copy=\"" + copyGov + '\"';
    }
    String depCopy = "";
    int copyDep = td.dep().copyCount();
    if (copyDep > 0) {
      depCopy = " copy=\"" + copyDep + '\"';
    }
    // Tags are escaped like every other value, so that a tag containing an
    // XML special character cannot produce a malformed attribute.
    String govTagAttribute = (includeTags && govTag != null) ? " tag=\"" + XMLUtils.escapeXML(govTag) + "\"" : "";
    String depTagAttribute = (includeTags && depTag != null) ? " tag=\"" + XMLUtils.escapeXML(depTag) + "\"" : "";
    // add an attribute if the typed dependency is an extra relation (do not preserve the tree structure)
    String extraAttr = "";
    if (extra) {
      extraAttr = " extra=\"yes\"";
    }
    buf.append(" <dep type=\"").append(XMLUtils.escapeXML(reln)).append('\"').append(extraAttr).append(">\n");
    buf.append(" <governor idx=\"").append(govIdx).append('\"').append(govCopy).append(govTagAttribute).append('>').append(XMLUtils.escapeXML(gov)).append("</governor>\n");
    buf.append(" <dependent idx=\"").append(depIdx).append('\"').append(depCopy).append(depTagAttribute).append('>').append(XMLUtils.escapeXML(dep)).append("</dependent>\n");
    buf.append(" </dep>\n");
  }
  buf.append("</dependencies>");
  return buf.toString();
}
/**
 * USED BY TREEPRINT AND WSD.SUPWSD.PREPROCESS
 * Prints this set of typed dependencies to the specified
 * {@code PrintWriter} in the plain format, one dependency per line,
 * followed by an empty line.
 *
 * @param dependencies The collection of TypedDependency to print
 * @param includeTags Whether words are printed with their tag
 * @param pw Where to print them
 */
public static void print(Collection<TypedDependency> dependencies, boolean includeTags, PrintWriter pw) {
  final String rendered = toString(dependencies, false, includeTags);
  pw.println(rendered);
}
/**
 * USED BY TREEPRINT
 * Prints this set of typed dependencies to the specified
 * {@code PrintWriter} in the specified format, followed by a line end.
 *
 * @param dependencies The collection of TypedDependency to print
 * @param format "xml" or "readable" or "separator"; anything else gives
 *               the plain format
 * @param includeTags Whether words are printed with their tag
 * @param pw Where to print them
 */
public static void print(Collection<TypedDependency> dependencies, String format, boolean includeTags, PrintWriter pw) {
  final String rendered = toString(dependencies, format, includeTags);
  pw.println(rendered);
}
}