package edu.stanford.nlp.trees.international.pennchinese; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.ling.StringLabel; import edu.stanford.nlp.trees.LabeledScoredTreeFactory; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeFactory; import edu.stanford.nlp.trees.TreeTransformer; import java.util.ArrayList; import java.util.List; /** * Performs collinization operations on Chinese trees similar to * those for English Namely: <ul> * <li> strips all functional & automatically-added tags * <li> strips all punctuation * <li> merges PRN and ADVP * <li> eliminates ROOT (note that there are a few non-unary ROOT nodes; * these are not eliminated) * </ul> * * @author Roger Levy * @author Christopher Manning */ public class ChineseCollinizer implements TreeTransformer { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ChineseCollinizer.class); private final static boolean VERBOSE = false; private final boolean deletePunct; ChineseTreebankLanguagePack ctlp; protected TreeFactory tf = new LabeledScoredTreeFactory(); public ChineseCollinizer(ChineseTreebankLanguagePack ctlp) { this(ctlp, true); } public ChineseCollinizer(ChineseTreebankLanguagePack ctlp, boolean deletePunct) { this.deletePunct = deletePunct; this.ctlp = ctlp; } public Tree transformTree(Tree tree) { return transformTree(tree, true); } private Tree transformTree(Tree tree, boolean isRoot) { String label = tree.label().value(); // log.info("ChineseCollinizer: Node label is " + label); if (tree.isLeaf()) { if (deletePunct && ctlp.isPunctuationWord(label)) { return null; } else { return tf.newLeaf(new StringLabel(label)); } } if (tree.isPreTerminal() && deletePunct && ctlp.isPunctuationTag(label)) { // System.out.println("Deleting punctuation"); return null; } List<Tree> children = new ArrayList<>(); if (label.matches("ROOT.*") && tree.numChildren() == 1) { // keep non-unary roots for now return transformTree(tree.children()[0], true); } //System.out.println("Enhanced label is " + label); // remove all functional and machine-generated annotations label = label.replaceFirst("[^A-Z].*$", ""); // merge parentheticals with adverb phrases label = label.replaceFirst("PRN", "ADVP"); //System.out.println("New label is " + label); for (int cNum = 0; cNum < tree.children().length; cNum++) { Tree child = tree.children()[cNum]; Tree newChild = transformTree(child, false); if (newChild != null) { children.add(newChild); } } // We don't delete the root because there are trees in the // Chinese treebank that only have punctuation in them!!! if (children.isEmpty() && ! isRoot) { if (VERBOSE) { log.info("ChineseCollinizer: all children of " + label + " deleted; returning null"); } return null; } return tf.newTreeNode(new StringLabel(label), children); } }