package edu.stanford.nlp.trees.international.pennchinese;

import java.io.Serializable;
import java.util.regex.Pattern;
import java.util.*;
import java.util.function.Predicate;

import edu.stanford.nlp.trees.BobChrisTreeNormalizer;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.io.EncodingPrintWriter;

/**
 * This was originally written to correct a few errors Galen found in CTB3.
 * The thinking was that perhaps when we got CTB4 they would be gone and we
 * could revert to BobChris.  Alas, CTB4 contained only more errors....
 * It has since been extended to allow some functional tags from CTB to be
 * maintained.  This is so far much easier than in NPTmpRetainingTN, since
 * we don't do any tag percolation (helped by CTB marking temporal nouns).
 * <p>
 * <i>Implementation note:</i> This now loads CharacterLevelTagExtender by
 * reflection if that option is invoked.
 *
 * @author Galen Andrew
 * @author Christopher Manning
 */
public class CTBErrorCorrectingTreeNormalizer extends BobChrisTreeNormalizer {

  private static final long serialVersionUID = -8203853817025401845L;

  private static final Pattern NPTmpPattern = Pattern.compile("NP.*-TMP.*");
  private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*");
  private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*");

  private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer") != null;

  @SuppressWarnings({"NonSerializableFieldInSerializableClass"})
  private final TreeTransformer tagExtender;
  private final boolean splitNPTMP;
  private final boolean splitPPTMP;
  private final boolean splitXPTMP;

  /** Constructor that sets all of the options of the other constructor to false. */
  public CTBErrorCorrectingTreeNormalizer() {
    this(false, false, false, false);
  }

  /**
   * Build a CTBErrorCorrectingTreeNormalizer.
   *
   * @param splitNPTMP Temporal annotation on NPs
   * @param splitPPTMP Temporal annotation on PPs
   * @param splitXPTMP Temporal annotation on any phrase marked in CTB
   * @param charTags Whether you wish to push POS tags down onto the
   *     characters of a word (for unsegmented text)
   */
  public CTBErrorCorrectingTreeNormalizer(boolean splitNPTMP, boolean splitPPTMP, boolean splitXPTMP, boolean charTags) {
    this.splitNPTMP = splitNPTMP;
    this.splitPPTMP = splitPPTMP;
    this.splitXPTMP = splitXPTMP;

    if (charTags) {
      try {
        tagExtender = (TreeTransformer) Class.forName("edu.stanford.nlp.trees.international.pennchinese.CharacterLevelTagExtender").newInstance();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    } else {
      tagExtender = null;
    }
  }

  /**
   * Remove things like hyphenated functional tags and equals signs from the
   * end of a node label.
   * But keep occasional functional tags as determined by class parameters,
   * particularly TMP.
   *
   * @param label The label to be cleaned up
   */
  @Override
  protected String cleanUpLabel(String label) {
    if (label == null) {
      return "ROOT";
    } else {
      boolean nptemp = NPTmpPattern.matcher(label).matches();
      boolean pptemp = PPTmpPattern.matcher(label).matches();
      boolean anytemp = TmpPattern.matcher(label).matches();
      label = tlp.basicCategory(label);
      if (anytemp && splitXPTMP) {
        label += "-TMP";
      } else if (pptemp && splitPPTMP) {
        label = label + "-TMP";
      } else if (nptemp && splitNPTMP) {
        label = label + "-TMP";
      }
      return label;
    }
  }

  private static class ChineseEmptyFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 8914098359495987617L;

    /** Doesn't accept nodes that only cover an empty. */
    @Override
    public boolean test(Tree t) {
      Tree[] kids = t.children();
      Label l = t.label();
      if ((l != null) && l.value() != null &&
          // there appears to be a mistake in CTB3 where the label "-NONE-1" is used once
          // presumably it should be "-NONE-" and be spliced out here.
          (l.value().matches("-NONE-.*")) &&
          !t.isLeaf() && kids.length == 1 && kids[0].isLeaf()) {
        // Delete empty/trace nodes (ones marked '-NONE-')
        if ( ! l.value().equals("-NONE-")) {
          EncodingPrintWriter.err.println("Deleting errant node " + l.value() + " as if -NONE-: " + t, ChineseTreebankLanguagePack.ENCODING);
        }
        return false;
      }
      return true;
    }
  }

  @SuppressWarnings({"NonSerializableFieldInSerializableClass"})
  private final Predicate<Tree> chineseEmptyFilter = new ChineseEmptyFilter();

  private static final TregexPattern[] fixupTregex = {
    TregexPattern.compile("PU=punc < 她{"),
    TregexPattern.compile("@NP <1 (@NP <1 NR <2 (PU=bad < /^<$/)) <2 (FLR=dest <2 (NT < /English/))"),
    TregexPattern.compile("@IP < (FLR=dest <: (PU < /^〈$/) $. (__=bad1 $. (PU=bad2 < /^〉$/)))"),
    TregexPattern.compile("@DFL|FLR|IMG|SKIP=junk <<, (PU < /^[〈{{<\\[[]$/) <<- (PU < /^[〉}}>\\]]]$/) <3 __"),
    TregexPattern.compile("WHPP=bad"),
  };

  private static final TsurgeonPattern[] fixupTsurgeon = {
    Tsurgeon.parseOperation("replace punc (PN 她) (PU {)"),
    Tsurgeon.parseOperation("move bad >1 dest"),
    Tsurgeon.parseOperation("[move bad1 >-1 dest] [move bad2 >-1 dest]"),
    Tsurgeon.parseOperation("delete junk"),
    Tsurgeon.parseOperation("relabel bad PP"),
  };

  static {
    if (fixupTregex.length != fixupTsurgeon.length) {
      throw new AssertionError("fixupTregex and fixupTsurgeon have different lengths in CTBErrorCorrectingTreeNormalizer.");
    }
  }
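
  // Each fixupTregex[i] is applied together with the corresponding fixupTsurgeon[i] in
  // normalizeWholeTree(): (0) splits the mis-tokenized PU "她{" into (PN 她) (PU {);
  // (1) and (2) move stray bracket punctuation (and, in (2), the intervening material)
  // back inside the neighboring FLR; (3) deletes bracket-delimited non-speech
  // DFL/FLR/IMG/SKIP constituents; and (4) relabels the lone WHPP as PP.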

  // We delete the most egregious non-speech DFL, FLR, IMG, and SKIP constituents, according to the Tregex
  // expression above.  Maybe more should be deleted really.  I don't understand this very well, and there is no documentation.
  //
  // New phrasal categories in CTB 7 and later:
  // DFL  = Disfluency.  Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)).
  // EMO  = Emoticon.  For emoticons.  Fine to keep.
  // FLR  = Filler.  Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)).
  // IMG  = ?Image?.  Appear to all be of form (IMG (PU [) (NN 图片) (PU ])).  Delete all those.
  // INC  = Incomplete (more incomplete than a FRAG which is only syntactically incomplete).  Just keep.
  // INTJ = Interjection.  Fine to keep.
  // META = Just one of these in chtb_5200.df.  Delete whole tree.  Should have been turned into XML metadata.
  // OTH  = ??.  Weird but just leave.
  // SKIP = ??.  Always has NOI under it.  Omit or keep?
  // TYPO = Seems like it should mainly go, but sometimes a branching node??
  // WHPP = ??.  Just one of these.  Over a -NONE- so will go if empties are deleted.  But should just be PP.
  //
  // There is a tree in chtb_2856.bn which has IP -> ... PU (FLR (PU <)) (VV turn) (PU >)
  // which just seems an error - should all be under FLR.
  //
  // POS tags are now 38.  Original 33 plus these:
  // EM  = Emoticon.  Often but not always under EMO.
  // IC  = Incomplete word rendered in pinyin, usually under DFL.
  // NOI =
  // URL = URL.
  // X   = In practice currently used only for "x" in constructions like "30 x 25 cm".  Shouldn't exist!

  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    Tree newTree = tree.prune(chineseEmptyFilter, tf).spliceOut(aOverAFilter);

    // Report non-unary initial rewrites & fix 'obvious ones'
    Tree[] kids = newTree.children();
    if (kids.length > 1) {
      /* -------------- don't do this as probably shouldn't for test set (and doesn't help anyway)
      if (kids.length == 2 &&
          "PU".equals(kids[kids.length - 1].value()) &&
          kids[0].isPhrasal()) {
        printlnErr("Correcting error: non-unary initial rewrite fixed by tucking punctuation inside constituent: " + newTree.localTree());
        List kidkids = kids[0].getChildrenAsList();
        kidkids.add(kids[1]);
        Tree bigger = tf.newTreeNode(kids[0].label(), kidkids);
        newTree = tf.newTreeNode(newTree.label(), Collections.singletonList(bigger));
      } else {
      -------------------- */
      EncodingPrintWriter.err.println("Possible error: non-unary initial rewrite: " + newTree.localTree(), ChineseTreebankLanguagePack.ENCODING);
      // }
    } else if (kids.length > 0) {
      // ROOT has 1 child - the normal case
      Tree child = kids[0];
      if ( ! child.isPhrasal()) {
        if (DEBUG) {
          EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING);
        }
        Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids));
        newTree.setChild(0, added);
      } else if (child.label().value().equals("META")) {
        // Delete the one bogus META tree in CTB 9
        EncodingPrintWriter.err.println("Deleting META tree that should be XML metadata in chtb_5200.df: " + child, ChineseTreebankLanguagePack.ENCODING);
        return null;
      }
    } else {
      EncodingPrintWriter.err.println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.ENCODING);
    }

    // Note that there's also at least 1 tree that is an IP with no surrounding ROOT node.
    // There are also several places where "NP" is used as a preterminal tag
    // and presumably should be "NN".
    // A couple of other random errors are corrected here.
    for (Tree subtree : newTree) {
      if (subtree.value().equals("CP") && subtree.numChildren() == 1) {
        Tree subsubtree = subtree.firstChild();
        if (subsubtree.value().equals("ROOT")) {
          if (subsubtree.firstChild().isLeaf() && "CP".equals(subsubtree.firstChild().value())) {
            EncodingPrintWriter.err.println("Correcting error: seriously messed up tree in CTB6 (chtb_3095.bn): " + newTree, ChineseTreebankLanguagePack.ENCODING);
            List<Tree> children = subsubtree.getChildrenAsList();
            children = children.subList(1, children.size());
            subtree.setChildren(children);
            EncodingPrintWriter.err.println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.ENCODING); // spaced to align with above
          }
        }
      }

      // All the stuff below here seems to have been fixed in CTB 9.  Maybe reporting errors sometimes does help.
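      // The checks below retag individual mistagged tokens: "NP" used as a preterminal
      // becomes PU (over an enumeration comma) or NN, words such as 他, 里, 是 wrongly
      // tagged PU get their proper tags (PN, LC, VC, ...), and a few phrasal nodes
      // mislabeled with the POS tags NN or MSP are relabeled NP and VP.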
      if (subtree.isPreTerminal()) {
        if (subtree.value().matches("NP")) {
          if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(subtree.firstChild().value())) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: NP preterminal over douhao; preterminal changed to PU: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("PU");
          } else if (subtree.parent(newTree).value().matches("NP")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: NP preterminal w/ NP parent; preterminal changed to NN: " + subtree.parent(newTree), ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("NN");
          } else {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: NP preterminal w/o NP parent, changing preterminal to NN: " + subtree.parent(newTree), ChineseTreebankLanguagePack.ENCODING);
            }
            // Tree newChild = tf.newTreeNode("NN", Collections.singletonList(subtree.firstChild()));
            // subtree.setChildren(Collections.singletonList(newChild));
            subtree.setValue("NN");
          }
        } else if (subtree.value().matches("PU")) {
          if (subtree.firstChild().value().matches("他")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"他\" under PU tag; tag changed to PN: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("PN");
          } else if (subtree.firstChild().value().equals("里")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to LC: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("LC");
          } else if (subtree.firstChild().value().equals("是")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to VC: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("VC");
          } else if (subtree.firstChild().value().matches("tw|半穴式")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to NN: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("NN");
          } else if (subtree.firstChild().value().matches("33")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"33\" under PU tag; tag changed to CD: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("CD");
          }
        }
      } else if (subtree.value().matches("NN")) {
        if (DEBUG) {
          EncodingPrintWriter.err.println("Correcting error: NN phrasal tag changed to NP: " + subtree, ChineseTreebankLanguagePack.ENCODING);
        }
        subtree.setValue("NP");
      } else if (subtree.value().matches("MSP")) {
        if (DEBUG) {
          EncodingPrintWriter.err.println("Correcting error: MSP phrasal tag changed to VP: " + subtree, ChineseTreebankLanguagePack.ENCODING);
        }
        subtree.setValue("VP");
      }
    }

    for (int i = 0; i < fixupTregex.length; ++i) {
      if (DEBUG) {
        Tree preProcessed = newTree.deepCopy();
        newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree);
        if (!preProcessed.equals(newTree)) {
          EncodingPrintWriter.err.println("Correcting error: Updated tree using tregex " + fixupTregex[i] + " and tsurgeon " + fixupTsurgeon[i], ChineseTreebankLanguagePack.ENCODING);
          EncodingPrintWriter.err.println("  from: " + preProcessed, ChineseTreebankLanguagePack.ENCODING);
          EncodingPrintWriter.err.println("    to: " + newTree, ChineseTreebankLanguagePack.ENCODING);
        }
      } else {
        newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree);
      }
    }

    // At least once we just end up deleting everything under ROOT.
    // In which case, we should just get rid of the tree.
    if (newTree.numChildren() == 0) {
      if (DEBUG) {
        EncodingPrintWriter.err.println("Deleting tree that now has no contents: " + newTree, ChineseTreebankLanguagePack.ENCODING);
      }
      return null;
    }

    if (tagExtender != null) {
      newTree = tagExtender.transformTree(newTree);
    }

    return newTree;
  }

  /** So you can create a TreeReaderFactory using this TreeNormalizer easily by reflection. */
  public static class CTBErrorCorrectingTreeReaderFactory extends CTBTreeReaderFactory {

    public CTBErrorCorrectingTreeReaderFactory() {
      super(new CTBErrorCorrectingTreeNormalizer(false, false, false, false));
    }

  } // end class CTBErrorCorrectingTreeReaderFactory

} // end class CTBErrorCorrectingTreeNormalizer
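
// A minimal usage sketch, assuming a CTB bracketed-parse file is passed as args[0];
// the class name and the choice of plain pennPrint output are illustrative, not part
// of the library API.  It shows one way to plug the normalizer in: obtain a
// TreeReaderFactory via CTBErrorCorrectingTreeReaderFactory and read normalized trees
// from a reader opened with the treebank's encoding.
class CTBErrorCorrectingTreeNormalizerUsageSketch {

  public static void main(String[] args) throws java.io.IOException {
    edu.stanford.nlp.trees.TreeReaderFactory trf =
        new CTBErrorCorrectingTreeNormalizer.CTBErrorCorrectingTreeReaderFactory();
    java.io.Reader in = new java.io.InputStreamReader(
        new java.io.FileInputStream(args[0]), ChineseTreebankLanguagePack.ENCODING);
    edu.stanford.nlp.trees.TreeReader tr = trf.newTreeReader(in);
    try {
      for (Tree t; (t = tr.readTree()) != null; ) {
        t.pennPrint();  // trees arrive already error-corrected and cleaned of empties
      }
    } finally {
      tr.close();
    }
  }

}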