package edu.stanford.nlp.parser.lexparser; import java.util.*; import edu.stanford.nlp.trees.TreeTransformer; import edu.stanford.nlp.trees.TreebankLanguagePack; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeFactory; /** This class manipulates punctuation in trees (used with training trees) * in the same manner that Collins manipulated punctuation in trees when * building his parsing model. This is the same punctuation that is * the punctuation ignored in the standard EvalB evaluation is promoted * as high in the tree as possible. * * @author Dan Klein * @author Christopher Manning */ public class CollinsPuncTransformer implements TreeTransformer { private TreebankLanguagePack tlp; boolean isPunc(Tree t) { if (t.isPreTerminal()) { String s = t.label().value(); if (tlp.isEvalBIgnoredPunctuationTag(s)) { return true; } } return false; } static LinkedList<Tree> preTerms(Tree t) { LinkedList<Tree> l = new LinkedList<>(); preTermHelper(t, l); return l; } static void preTermHelper(Tree t, List<Tree> l) { if (t.isLeaf()) { return; } if (t.isPreTerminal()) { l.add(t); return; } Tree[] children = t.children(); for (Tree child : children) { preTermHelper(child, l); } } Tree transformRoot(Tree tree, TreeFactory tf) { // XXXX TODO: use tlp and don't assume 1 daughter of ROOT! // leave the root intact // if (tlp.isStartSymbol(tlp.basicCategory(tree.label().value()))) if (tree.label().toString().startsWith("ROOT")) { return tf.newTreeNode(tree.label(), Collections.singletonList(transformNode(tree.children()[0], tf))); } return transformNode(tree, tf); } Tree transformNode(Tree tree, TreeFactory tf) { if (tree.isLeaf()) { return tf.newLeaf(tree.label()); } if (tree.isPreTerminal()) { return tf.newTreeNode(tree.label(), Collections.singletonList(tf.newLeaf(tree.children()[0].label()))); } List<Tree> children = tree.getChildrenAsList(); LinkedList<Tree> newChildren = new LinkedList<>(); // promote lower punctuation for (Tree child : children) { LinkedList<Tree> preTerms = preTerms(child); while (!preTerms.isEmpty() && isPunc(preTerms.getFirst())) { newChildren.add(preTerms.getFirst()); preTerms.removeFirst(); } Tree newChild = transformNode(child, tf); LinkedList<Tree> temp = new LinkedList<>(); if (newChild.children().length > 0) { newChildren.add(newChild); } while (!preTerms.isEmpty() && isPunc(preTerms.getLast())) { temp.addFirst(preTerms.getLast()); preTerms.removeLast(); } newChildren.addAll(temp); } // remove local punctuation while (!newChildren.isEmpty() && isPunc(newChildren.getFirst())) { newChildren.removeFirst(); } while (!newChildren.isEmpty() && isPunc(newChildren.getLast())) { newChildren.removeLast(); } return tf.newTreeNode(tree.label(), newChildren); } // public Tree transformTree(Tree tree) { // //System.out.println("PUNCTUATION TRANSFORM:"); // //tree.pennPrint(); // //System.out.println("BECOMES:"); // //transformRoot(tree, tf).pennPrint(); // return transformRoot(tree, tf); // } public Tree transformTree(Tree tree) { return transformRoot(tree, tree.treeFactory()); } public CollinsPuncTransformer(TreebankLanguagePack tlp) { this.tlp = tlp; } }