package edu.stanford.nlp.trees; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.ling.Label; import edu.stanford.nlp.ling.LabelFactory; import java.util.function.Predicate; import java.io.Reader; import java.util.regex.Pattern; import java.util.*; /** * Same TreeNormalizer as BobChrisTreeNormalizer, but optionally provides * four extras. I.e., the class name is now a misnomer.<br> * 1) retains -TMP labels on NP with the new identification NP-TMP, * and provides various options to percolate that option downwards * to the head noun, and perhaps also to inherit this from a PP-TMP.<br> * 2) Annotates S nodes which contain a gapped subject: i.e., * <code>S < (/^NP-SBJ/ < -NONE-) --> S-G</code> <br> * 3) Leave all functional tags on nodes. <br> * 4) Keeps -ADV labels on NP and marks head tag with &`^ADV * <p/> * <i>Performance note:</i> At one point in time, PCFG labeled F1 results * for the various TEMPORAL options in lexparser were: * 0=86.7, 1=87.49, 2=86.87, 3=87.49, 4=87.48, 5=87.5, 6=87.07. * So, mainly avoid values of 0, 2, and 6. * <p/> * At another point they were: * 0=86.53, 1=87.1, 2=87.14, 3=87.22, 4=87.1, 5=87.13, 6=86.95, 7=87.16 * * @author Christopher Manning * @author Dan Klein */ public class NPTmpRetainingTreeNormalizer extends BobChrisTreeNormalizer { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(NPTmpRetainingTreeNormalizer.class); private static final long serialVersionUID = 7548777133196579107L; public static final int TEMPORAL_NONE = 0; public static final int TEMPORAL_ACL03PCFG = 1; public static final int TEMPORAL_ANY_TMP_PERCOLATED = 2; public static final int TEMPORAL_ALL_TERMINALS = 3; public static final int TEMPORAL_ALL_NP = 4; public static final int TEMPORAL_ALL_NP_AND_PP = 5; public static final int TEMPORAL_NP_AND_PP_WITH_NP_HEAD = 6; public static final int TEMPORAL_ALL_NP_EVEN_UNDER_PP = 7; public static final int TEMPORAL_ALL_NP_PP_ADVP = 8; public static final int TEMPORAL_9 = 9; private static final boolean onlyTagAnnotateNstar = true; private static final Pattern NPTmpPattern = Pattern.compile("NP.*-TMP.*"); private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*"); private static final Pattern ADVPTmpPattern = Pattern.compile("ADVP.*-TMP.*"); private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*"); private static final Pattern NPSbjPattern = Pattern.compile("NP.*-SBJ.*"); private static final Pattern NPAdvPattern = Pattern.compile("NP.*-ADV.*"); private final int temporalAnnotation; private final boolean doSGappedStuff; private final int leaveItAll; private final boolean doAdverbialNP; private final HeadFinder headFinder; public NPTmpRetainingTreeNormalizer() { this(TEMPORAL_ACL03PCFG, false); } public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff) { this(temporalAnnotation, doSGappedStuff, 0, false); } public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff, int leaveItAll, boolean doAdverbialNP) { this(temporalAnnotation, doSGappedStuff, leaveItAll, doAdverbialNP, new ModCollinsHeadFinder()); } /** * Create a TreeNormalizer that maintains some functional annotations, * particularly those involving temporal annotation. * * @param temporalAnnotation One of the constants: * TEMPORAL_NONE (no temporal annotation kept on trees), * TEMPORAL_ACL03PCFG (temporal annotation on NPs, and percolated down * to head of constituent until and including POS tag), * TEMPORAL_ANY_TMP_PERCOLATED (temporal annotation on any phrase is * kept and percolated via head chain to and including POS tag), * TEMPORAL_ALL_TERMINALS (temporal annotation is kept on NPs, and * is placed on all POS tag daughters of that NP (but is not * percolated down a head chain through phrasal categories), * TEMPORAL_ALL_NP (temporal annotation on NPs, and it is percolated * down via the head chain, but only through NPs: annotation stops * at either a POS tag (which is annotated) or a non-NP head * (which isn't annotated)), * TEMPORAL_ALL_NP_AND_PP (keeps temporal annotation on NPs and PPs, * and it is percolated down via the head chain, but only through * NPs: annotation stops at either a POS tag (which is annotated) * or a non-NP head (which isn't annotated)). * TEMPORAL_NP_AND_PP_WITH_NP_HEAD (like TEMPORAL_ALL_NP_AND_PP * except an NP is regarded as the head of a PP) * TEMPORAL_ALL_NP_EVEN_UNDER_PP (like TEMPORAL_ALL_NP, but a PP-TMP * annotation above an NP is 'passed down' to annotate that NP * as temporal (but the PP itself isn't marked)) * TEMPORAL_ALL_NP_PP_ADVP (keeps temporal annotation on NPs, PPs, and * ADVPs * and it is percolated down via the head chain, but only through * those categories: annotation stops at either a POS tag * (which is annotated) * or a non-NP/PP/ADVP head (which isn't annotated)), * TEMPORAL_9 (annotates like the previous one but * does all NP inside node, and their children if * pre-pre-terminal rather than only if head). * @param doSGappedStuff Leave -SBJ marking on subject NP and then mark * S-G sentences with a gapped subject. * @param leaveItAll 0 means the usual stripping of functional tags and indices; * 1 leaves all functional tags but still strips indices; * 2 leaves everything * @param doAdverbialNP Leave -ADV functional tag on adverbial NPs and * maybe add it to their head * @param headFinder A head finder that is used with some of the * options for temporalAnnotation */ public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff, int leaveItAll, boolean doAdverbialNP, HeadFinder headFinder) { this.temporalAnnotation = temporalAnnotation; this.doSGappedStuff = doSGappedStuff; this.leaveItAll = leaveItAll; this.doAdverbialNP = doAdverbialNP; this.headFinder = headFinder; } /** * Remove things like hyphened functional tags and equals from the * end of a node label. */ @Override protected String cleanUpLabel(String label) { if (label == null) { return "ROOT"; // String constants are always interned } else if (leaveItAll == 1) { return tlp.categoryAndFunction(label); } else if (leaveItAll == 2) { return label; } else { boolean nptemp = NPTmpPattern.matcher(label).matches(); boolean pptemp = PPTmpPattern.matcher(label).matches(); boolean advptemp = ADVPTmpPattern.matcher(label).matches(); boolean anytemp = TmpPattern.matcher(label).matches(); boolean subj = NPSbjPattern.matcher(label).matches(); boolean npadv = NPAdvPattern.matcher(label).matches(); label = tlp.basicCategory(label); if (anytemp && temporalAnnotation == TEMPORAL_ANY_TMP_PERCOLATED) { label += "-TMP"; } else if (pptemp && (temporalAnnotation == TEMPORAL_ALL_NP_AND_PP || temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP || temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP || temporalAnnotation == TEMPORAL_9)) { label = label + "-TMP"; } else if (advptemp && (temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP || temporalAnnotation == TEMPORAL_9)) { label = label + "-TMP"; } else if (temporalAnnotation > 0 && nptemp) { label = label + "-TMP"; } if (doAdverbialNP && npadv) { label = label + "-ADV"; } if (doSGappedStuff && subj) { label = label + "-SBJ"; } return label; } } private static boolean includesEmptyNPSubj(Tree t) { if (t == null) { return false; } Tree[] kids = t.children(); if (kids == null) { return false; } boolean foundNullSubj = false; for (Tree kid : kids) { Tree[] kidkids = kid.children(); if (NPSbjPattern.matcher(kid.value()).matches()) { kid.setValue("NP"); if (kidkids != null && kidkids.length == 1 && kidkids[0].value().equals("-NONE-")) { // only set flag, since there are 2 a couple of times (errors) foundNullSubj = true; } } } return foundNullSubj; } /** * Normalize a whole tree -- one can assume that this is the root. * This implementation deletes empty elements (ones with nonterminal * tag label '-NONE-') from the tree. */ @Override public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { TreeTransformer transformer1 = t -> { if (doSGappedStuff) { String lab = t.label().value(); if (lab.equals("S") && includesEmptyNPSubj(t)) { LabelFactory lf = t.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! t.setLabel(lf.newLabel(t.label().value() + "-G")); } } return t; }; Predicate<Tree> subtreeFilter = new Predicate<Tree>() { private static final long serialVersionUID = -7250433816896327901L; @Override public boolean test(Tree t) { Tree[] kids = t.children(); Label l = t.label(); // The special Switchboard non-terminals clause. // Note that it deletes IP which other Treebanks might use! if ("RS".equals(t.label().value()) || "RM".equals(t.label().value()) || "IP".equals(t.label().value()) || "CODE".equals(t.label().value())) { return t.isLeaf(); //Prevents deletion of the word "IP" } if ((l != null) && l.value() != null && (l.value().equals("-NONE-")) && !t.isLeaf() && kids.length == 1 && kids[0].isLeaf()) { // Delete empty/trace nodes (ones marked '-NONE-') return false; } return true; } }; Predicate<Tree> nodeFilter = new Predicate<Tree>() { private static final long serialVersionUID = 9000955019205336311L; @Override public boolean test(Tree t) { if (t.isLeaf() || t.isPreTerminal()) { return true; } // The special switchboard non-terminals clause. Try keeping EDITED for now.... // if ("EDITED".equals(t.label().value())) { // return false; // } if (t.numChildren() != 1) { return true; } if (t.label() != null && t.label().value() != null && t.label().value().equals(t.children()[0].label().value())) { return false; } return true; } }; TreeTransformer transformer2 = t -> { if (temporalAnnotation == TEMPORAL_ANY_TMP_PERCOLATED) { String lab = t.label().value(); if (TmpPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); oldT = ht; } while (!ht.isPreTerminal()); if (lab.startsWith("PP")) { ht = headFinder.determineHead(t); // look to right int j = t.objectIndexOf(ht); int sz = t.children().length; if (j + 1 < sz) { ht = t.getChild(j + 1); } if (ht.label().value().startsWith("NP")) { while (!ht.isLeaf()) { LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); ht = headFinder.determineHead(ht); } } } } } else if (temporalAnnotation == TEMPORAL_ALL_TERMINALS) { String lab = t.label().value(); if (NPTmpPattern.matcher(lab).matches()) { Tree ht; ht = headFinder.determineHead(t); if (ht.isPreTerminal()) { // change all tags to -TMP LabelFactory lf = ht.label().labelFactory(); Tree[] kids = t.children(); for (Tree kid : kids) { if (kid.isPreTerminal()) { // Note: this changes the tree label, rather // than creating a new tree node. Beware! kid.setLabel(lf.newLabel(kid.value() + "-TMP")); } } } else { Tree oldT = t; do { ht = headFinder.determineHead(oldT); oldT = ht; } while (!ht.isPreTerminal()); LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); } } } else if (temporalAnnotation == TEMPORAL_ALL_NP) { String lab = t.label().value(); if (NPTmpPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } if (ht.isPreTerminal() || ht.value().startsWith("NP")) { LabelFactory lf = ht.labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); oldT = ht; } } while (ht.value().startsWith("NP")); } } else if (temporalAnnotation == TEMPORAL_ALL_NP_AND_PP || temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP) { // also allow chain to start with PP String lab = t.value(); if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches()) { Tree oldT = t; do { Tree ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } else if ((temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP) && (ht.value().equals("IN") || ht.value().equals("TO"))) { // change the head to be NP if possible Tree[] kidlets = oldT.children(); for (int k = kidlets.length - 1; k > 0; k--) { if (kidlets[k].value().startsWith("NP")) { ht = kidlets[k]; } } } LabelFactory lf = ht.labelFactory(); // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! if (ht.isPreTerminal() || ht.value().startsWith("NP")) { ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } if (temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP && oldT.value().startsWith("PP")) { oldT.setLabel(lf.newLabel(tlp.basicCategory(oldT.value()))); } oldT = ht; } while (oldT.value().startsWith("NP") || oldT.value().startsWith("PP")); } } else if (temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP) { // also allow chain to start with PP or ADVP String lab = t.value(); if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches() || ADVPTmpPattern.matcher(lab).matches()) { Tree oldT = t; do { Tree ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! if (ht.isPreTerminal() || ht.value().startsWith("NP")) { LabelFactory lf = ht.labelFactory(); ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } oldT = ht; } while (oldT.value().startsWith("NP")); } } else if (temporalAnnotation == TEMPORAL_9) { // also allow chain to start with PP or ADVP String lab = t.value(); if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches() || ADVPTmpPattern.matcher(lab).matches()) { // log.info("TMP: Annotating " + t); addTMP9(t); } } else if (temporalAnnotation == TEMPORAL_ACL03PCFG) { String lab = t.label().value(); if (lab != null && NPTmpPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } oldT = ht; } while (!ht.isPreTerminal()); if ( ! onlyTagAnnotateNstar || ht.label().value().startsWith("N")) { LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); } } } if (doAdverbialNP) { String lab = t.value(); if (NPAdvPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } if (ht.isPreTerminal() || ht.value().startsWith("NP")) { LabelFactory lf = ht.labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-ADV")); oldT = ht; } } while (ht.value().startsWith("NP")); } } return t; }; // if there wasn't an empty nonterminal at the top, but an S, wrap it. if (tree.label().value().equals("S")) { tree = tf.newTreeNode("ROOT", Collections.singletonList(tree)); } // repair for the phrasal VB in Switchboard (PTB version 3) that should be a VP for (Tree subtree : tree) { if (subtree.isPhrasal() && "VB".equals(subtree.label().value())) { subtree.setValue("VP"); } } tree = tree.transform(transformer1); if (tree == null) { return null; } tree = tree.prune(subtreeFilter, tf); if (tree == null) { return null; } tree = tree.spliceOut(nodeFilter, tf); if (tree == null) { return null; } return tree.transform(transformer2, tf); } /** * Add -TMP when not present within an NP * @param tree The tree to add temporal info to. */ private void addTMP9(final Tree tree) { // do the head chain under it Tree ht = headFinder.determineHead(tree); // special fix for possessives! -- make noun before head if (ht.value().equals("POS")) { int j = tree.objectIndexOf(ht); if (j > 0) { ht = tree.getChild(j - 1); } } // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! if (ht.isPreTerminal() || ht.value().startsWith("NP") || ht.value().startsWith("PP") || ht.value().startsWith("ADVP")) { if (!TmpPattern.matcher(ht.value()).matches()) { LabelFactory lf = ht.labelFactory(); // log.info("TMP: Changing " + ht.value() + " to " + // ht.value() + "-TMP"); ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } if (ht.value().startsWith("NP") || ht.value().startsWith("PP") || ht.value().startsWith("ADVP")) { addTMP9(ht); } } // do the NPs under it (which may or may not be the head chain Tree[] kidlets = tree.children(); for (Tree kidlet : kidlets) { ht = kidlet; LabelFactory lf; if (tree.isPrePreTerminal() && !TmpPattern.matcher(ht.value()).matches()) { // log.info("TMP: Changing " + ht.value() + " to " + // ht.value() + "-TMP"); lf = ht.labelFactory(); // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } else if (ht.value().startsWith("NP")) { // don't add -TMP twice! if (!TmpPattern.matcher(ht.value()).matches()) { lf = ht.labelFactory(); // log.info("TMP: Changing " + ht.value() + " to " + // ht.value() + "-TMP"); // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } addTMP9(ht); } } } /** Implementation of TreeReaderFactory, mainly for convenience of * constructing by reflection. */ public static class NPTmpRetainingTreeReaderFactory implements TreeReaderFactory { @Override public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in, new LabeledScoredTreeFactory(), new NPTmpRetainingTreeNormalizer()); } } /** Implementation of TreeReaderFactory, mainly for convenience of * constructing by reflection. This one corresponds to what's currently * used in englishPCFG accurate unlexicalized parser. */ public static class NPTmpAdvRetainingTreeReaderFactory implements TreeReaderFactory { @Override public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in, new LabeledScoredTreeFactory(), new NPTmpRetainingTreeNormalizer(NPTmpRetainingTreeNormalizer.TEMPORAL_ACL03PCFG, false, 0, true)); } } }