package edu.stanford.nlp.trees.international.pennchinese;

import java.io.Serializable;
import java.util.regex.Pattern;
import java.util.*;
import java.util.function.Predicate;

import edu.stanford.nlp.trees.BobChrisTreeNormalizer;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.io.EncodingPrintWriter;

/**
 * This was originally written to correct a few errors Galen found in CTB3.
 * The thinking was that perhaps when we got CTB4 they would be gone and we
 * could revert to BobChris.  Alas, CTB4 contained only more errors....
 * It has since been extended to allow some functional tags from CTB to be
 * maintained.  This is so far much easier than in NPTmpRetainingTN, since
 * we don't do any tag percolation (helped by CTB marking temporal nouns).
 * <p>
 * <i>Implementation note:</i> This now loads CharacterLevelTagExtender by
 * reflection if that option is invoked.
 *
 * @author Galen Andrew
 * @author Christopher Manning
 */
public class CTBErrorCorrectingTreeNormalizer extends BobChrisTreeNormalizer {

  private static final long serialVersionUID = -8203853817025401845L;

  private static final Pattern NPTmpPattern = Pattern.compile("NP.*-TMP.*");
  private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*");
  private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*");

  private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer") != null;

  @SuppressWarnings({"NonSerializableFieldInSerializableClass"})
  private final TreeTransformer tagExtender;
  private final boolean splitNPTMP;
  private final boolean splitPPTMP;
  private final boolean splitXPTMP;

  /** Constructor that sets all of the options of the other constructor to false. */
  public CTBErrorCorrectingTreeNormalizer() {
    this(false, false, false, false);
  }

  /**
   * Build a CTBErrorCorrectingTreeNormalizer.
   *
   * @param splitNPTMP Temporal annotation on NPs
   * @param splitPPTMP Temporal annotation on PPs
   * @param splitXPTMP Temporal annotation on any phrase marked in CTB
   * @param charTags Whether you wish to push POS tags down onto the
   *     characters of a word (for unsegmented text)
   */
  public CTBErrorCorrectingTreeNormalizer(boolean splitNPTMP, boolean splitPPTMP, boolean splitXPTMP, boolean charTags) {
    this.splitNPTMP = splitNPTMP;
    this.splitPPTMP = splitPPTMP;
    this.splitXPTMP = splitXPTMP;

    if (charTags) {
      try {
        tagExtender = (TreeTransformer) Class.forName("edu.stanford.nlp.trees.international.pennchinese.CharacterLevelTagExtender").newInstance();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    } else {
      tagExtender = null;
    }
  }

  /**
   * Remove things like hyphenated functional tags and equals signs from the
   * end of a node label.
   * But keep occasional functional tags as determined by class parameters,
   * particularly TMP.
   *
   * @param label The label to be cleaned up
   */
  @Override
  protected String cleanUpLabel(String label) {
    if (label == null) {
      return "ROOT";
    } else {
      boolean nptemp = NPTmpPattern.matcher(label).matches();
      boolean pptemp = PPTmpPattern.matcher(label).matches();
      boolean anytemp = TmpPattern.matcher(label).matches();
      label = tlp.basicCategory(label);
      if (anytemp && splitXPTMP) {
        label += "-TMP";
      } else if (pptemp && splitPPTMP) {
        label = label + "-TMP";
      } else if (nptemp && splitNPTMP) {
        label = label + "-TMP";
      }
      return label;
    }
  }

  private static class ChineseEmptyFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 8914098359495987617L;

    /** Doesn't accept nodes that only cover an empty. */
    @Override
    public boolean test(Tree t) {
      Tree[] kids = t.children();
      Label l = t.label();
      if ((l != null) && l.value() != null &&
          // there appears to be a mistake in CTB3 where the label "-NONE-1" is used once
          // presumably it should be "-NONE-" and be spliced out here.
          (l.value().matches("-NONE-.*")) &&
          !t.isLeaf() && kids.length == 1 && kids[0].isLeaf()) {
        // Delete empty/trace nodes (ones marked '-NONE-')
        if ( ! l.value().equals("-NONE-")) {
          EncodingPrintWriter.err.println("Deleting errant node " + l.value() + " as if -NONE-: " + t, ChineseTreebankLanguagePack.ENCODING);
        }
        return false;
      }
      return true;
    }
  }

  @SuppressWarnings({"NonSerializableFieldInSerializableClass"})
  private final Predicate<Tree> chineseEmptyFilter = new ChineseEmptyFilter();

  private static final TregexPattern[] fixupTregex = {
    TregexPattern.compile("PU=punc < 她{"),
    TregexPattern.compile("@NP <1 (@NP <1 NR <2 (PU=bad < /^<$/)) <2 (FLR=dest <2 (NT < /English/))"),
    TregexPattern.compile("@IP < (FLR=dest <: (PU < /^〈$/) $. (__=bad1 $. (PU=bad2 < /^〉$/)))"),
    TregexPattern.compile("@DFL|FLR|IMG|SKIP=junk <<, (PU < /^[〈{{<\\[[]$/) <<- (PU < /^[〉}}>\\]]]$/) <3 __"),
    TregexPattern.compile("WHPP=bad"),
  };

  private static final TsurgeonPattern[] fixupTsurgeon = {
    Tsurgeon.parseOperation("replace punc (PN 她) (PU {)"),
    Tsurgeon.parseOperation("move bad >1 dest"),
    Tsurgeon.parseOperation("[move bad1 >-1 dest] [move bad2 >-1 dest]"),
    Tsurgeon.parseOperation("delete junk"),
    Tsurgeon.parseOperation("relabel bad PP"),
  };

  static {
    if (fixupTregex.length != fixupTsurgeon.length) {
      throw new AssertionError("fixupTregex and fixupTsurgeon have different lengths in CTBErrorCorrectingTreeNormalizer.");
    }
  }
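
  // Each fixupTregex[i] is applied together with the corresponding fixupTsurgeon[i] in
  // normalizeWholeTree(): (0) splits the mis-tokenized PU "她{" into (PN 她) (PU {);
  // (1) and (2) move stray bracket punctuation (and, in (2), the intervening material)
  // back inside the neighboring FLR; (3) deletes bracket-delimited non-speech
  // DFL/FLR/IMG/SKIP constituents; and (4) relabels the lone WHPP as PP.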

  // We delete the most egregious non-speech DFL, FLR, IMG, and SKIP constituents, according to the Tregex
  // expression above.  Maybe more should be deleted really.  I don't understand this very well, and there is no documentation.
  //
  // New phrasal categories in CTB 7 and later:
  // DFL  = Disfluency.  Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)).
  // EMO  = Emoticon.  For emoticons.  Fine to keep.
  // FLR  = Filler.  Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)).
  // IMG  = ?Image?.  Appear to all be of form (IMG (PU [) (NN 图片) (PU ])).  Delete all those.
  // INC  = Incomplete (more incomplete than a FRAG which is only syntactically incomplete).  Just keep.
  // INTJ = Interjection.  Fine to keep.
  // META = Just one of these in chtb_5200.df.  Delete whole tree.  Should have been turned into XML metadata.
  // OTH  = ??.  Weird but just leave.
  // SKIP = ??.  Always has NOI under it.  Omit or keep?
  // TYPO = Seems like it should mainly go, but sometimes a branching node??
  // WHPP = ??.  Just one of these.  Over a -NONE- so will go if empties are deleted.  But should just be PP.
  //
  // There is a tree in chtb_2856.bn which has IP -> ... PU (FLR (PU <)) (VV turn) (PU >)
  // which just seems an error - should all be under FLR.
  //
  // POS tags are now 38.  Original 33 plus these:
  // EM  = Emoticon.  Often but not always under EMO.
  // IC  = Incomplete word rendered in pinyin, usually under DFL.
  // NOI =
  // URL = URL.
  // X   = In practice currently used only for "x" in constructions like "30 x 25 cm".  Shouldn't exist!

  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    Tree newTree = tree.prune(chineseEmptyFilter, tf).spliceOut(aOverAFilter);

    // Report non-unary initial rewrites & fix 'obvious ones'
    Tree[] kids = newTree.children();
    if (kids.length > 1) {
      /* -------------- don't do this as probably shouldn't for test set (and doesn't help anyway)
      if (kids.length == 2 &&
          "PU".equals(kids[kids.length - 1].value()) &&
          kids[0].isPhrasal()) {
        printlnErr("Correcting error: non-unary initial rewrite fixed by tucking punctuation inside constituent: " + newTree.localTree());
        List kidkids = kids[0].getChildrenAsList();
        kidkids.add(kids[1]);
        Tree bigger = tf.newTreeNode(kids[0].label(), kidkids);
        newTree = tf.newTreeNode(newTree.label(), Collections.singletonList(bigger));
      } else {
      -------------------- */
      EncodingPrintWriter.err.println("Possible error: non-unary initial rewrite: " + newTree.localTree(), ChineseTreebankLanguagePack.ENCODING);
      // }
    } else if (kids.length > 0) {
      // ROOT has 1 child - the normal case
      Tree child = kids[0];
      if ( ! child.isPhrasal()) {
        if (DEBUG) {
          EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING);
        }
        Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids));
        newTree.setChild(0, added);
      } else if (child.label().value().equals("META")) {
        // Delete the one bogus META tree in CTB 9
        EncodingPrintWriter.err.println("Deleting META tree that should be XML metadata in chtb_5200.df: " + child, ChineseTreebankLanguagePack.ENCODING);
        return null;
      }
    } else {
      EncodingPrintWriter.err.println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.ENCODING);
    }

    // Note that there's also at least 1 tree that is an IP with no surrounding ROOT node.
    // There are also several places where "NP" is used as a preterminal tag
    // and presumably should be "NN".
    // A couple of other random errors are corrected here.
    for (Tree subtree : newTree) {
      if (subtree.value().equals("CP") && subtree.numChildren() == 1) {
        Tree subsubtree = subtree.firstChild();
        if (subsubtree.value().equals("ROOT")) {
          if (subsubtree.firstChild().isLeaf() && "CP".equals(subsubtree.firstChild().value())) {
            EncodingPrintWriter.err.println("Correcting error: seriously messed up tree in CTB6 (chtb_3095.bn): " + newTree, ChineseTreebankLanguagePack.ENCODING);
            List<Tree> children = subsubtree.getChildrenAsList();
            children = children.subList(1, children.size());
            subtree.setChildren(children);
            EncodingPrintWriter.err.println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.ENCODING); // spaced to align with above
          }
        }
      }

      // All the stuff below here seems to have been fixed in CTB 9.  Maybe reporting errors sometimes does help.
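      // The checks below retag individual mistagged tokens: "NP" used as a preterminal
      // becomes PU (over an enumeration comma) or NN, words such as 他, 里, 是 wrongly
      // tagged PU get their proper tags (PN, LC, VC, ...), and a few phrasal nodes
      // mislabeled with the POS tags NN or MSP are relabeled NP and VP.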
      if (subtree.isPreTerminal()) {
        if (subtree.value().matches("NP")) {
          if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(subtree.firstChild().value())) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: NP preterminal over douhao; preterminal changed to PU: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("PU");
          } else if (subtree.parent(newTree).value().matches("NP")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: NP preterminal w/ NP parent; preterminal changed to NN: " + subtree.parent(newTree), ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("NN");
          } else {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: NP preterminal w/o NP parent, changing preterminal to NN: " + subtree.parent(newTree), ChineseTreebankLanguagePack.ENCODING);
            }
            // Tree newChild = tf.newTreeNode("NN", Collections.singletonList(subtree.firstChild()));
            // subtree.setChildren(Collections.singletonList(newChild));
            subtree.setValue("NN");
          }
        } else if (subtree.value().matches("PU")) {
          if (subtree.firstChild().value().matches("他")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"他\" under PU tag; tag changed to PN: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("PN");
          } else if (subtree.firstChild().value().equals("里")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to LC: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("LC");
          } else if (subtree.firstChild().value().equals("是")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to VC: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("VC");
          } else if (subtree.firstChild().value().matches("tw|半穴式")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to NN: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("NN");
          } else if (subtree.firstChild().value().matches("33")) {
            if (DEBUG) {
              EncodingPrintWriter.err.println("Correcting error: \"33\" under PU tag; tag changed to CD: " + subtree, ChineseTreebankLanguagePack.ENCODING);
            }
            subtree.setValue("CD");
          }
        }
      } else if (subtree.value().matches("NN")) {
        if (DEBUG) {
          EncodingPrintWriter.err.println("Correcting error: NN phrasal tag changed to NP: " + subtree, ChineseTreebankLanguagePack.ENCODING);
        }
        subtree.setValue("NP");
      } else if (subtree.value().matches("MSP")) {
        if (DEBUG) {
          EncodingPrintWriter.err.println("Correcting error: MSP phrasal tag changed to VP: " + subtree, ChineseTreebankLanguagePack.ENCODING);
        }
        subtree.setValue("VP");
      }
    }

    for (int i = 0; i < fixupTregex.length; ++i) {
      if (DEBUG) {
        Tree preProcessed = newTree.deepCopy();
        newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree);
        if (!preProcessed.equals(newTree)) {
          EncodingPrintWriter.err.println("Correcting error: Updated tree using tregex " + fixupTregex[i] + " and tsurgeon " + fixupTsurgeon[i], ChineseTreebankLanguagePack.ENCODING);
          EncodingPrintWriter.err.println("  from: " + preProcessed, ChineseTreebankLanguagePack.ENCODING);
          EncodingPrintWriter.err.println("    to: " + newTree, ChineseTreebankLanguagePack.ENCODING);
        }
      } else {
        newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree);
      }
    }

    // At least once we just end up deleting everything under ROOT.
    // In which case, we should just get rid of the tree.
    if (newTree.numChildren() == 0) {
      if (DEBUG) {
        EncodingPrintWriter.err.println("Deleting tree that now has no contents: " + newTree, ChineseTreebankLanguagePack.ENCODING);
      }
      return null;
    }

    if (tagExtender != null) {
      newTree = tagExtender.transformTree(newTree);
    }

    return newTree;
  }

  /** So you can create a TreeReaderFactory using this TreeNormalizer easily by reflection. */
  public static class CTBErrorCorrectingTreeReaderFactory extends CTBTreeReaderFactory {

    public CTBErrorCorrectingTreeReaderFactory() {
      super(new CTBErrorCorrectingTreeNormalizer(false, false, false, false));
    }

  } // end class CTBErrorCorrectingTreeReaderFactory

} // end class CTBErrorCorrectingTreeNormalizer
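
// A minimal usage sketch, assuming a CTB bracketed-parse file is passed as args[0];
// the class name and the choice of plain pennPrint output are illustrative, not part
// of the library API.  It shows one way to plug the normalizer in: obtain a
// TreeReaderFactory via CTBErrorCorrectingTreeReaderFactory and read normalized trees
// from a reader opened with the treebank's encoding.
class CTBErrorCorrectingTreeNormalizerUsageSketch {

  public static void main(String[] args) throws java.io.IOException {
    edu.stanford.nlp.trees.TreeReaderFactory trf =
        new CTBErrorCorrectingTreeNormalizer.CTBErrorCorrectingTreeReaderFactory();
    java.io.Reader in = new java.io.InputStreamReader(
        new java.io.FileInputStream(args[0]), ChineseTreebankLanguagePack.ENCODING);
    edu.stanford.nlp.trees.TreeReader tr = trf.newTreeReader(in);
    try {
      for (Tree t; (t = tr.readTree()) != null; ) {
        t.pennPrint();  // trees arrive already error-corrected and cleaned of empties
      }
    } finally {
      tr.close();
    }
  }

}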