TreeAnnotator.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.parser.lexparser;

import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;

import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.ling.HasCategory;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;

/**
 * Performs non-language specific annotation of Trees.
 * <p>
 * Which annotations are applied is controlled by the flags of the
 * {@code TrainOptions} taken from the {@code Options} passed to the
 * constructor.  After the generic annotation of each non-leaf node, the node
 * is handed to {@code TreebankLangParserParams.transformTree} for
 * language-specific transforms.
 *
 * @author Dan Klein
 * @author Christopher Manning
 */
public class TreeAnnotator implements TreeTransformer {

  private final TreeFactory tf;
  private final TreebankLangParserParams tlpParams;
  private final HeadFinder hf;
  private final TrainOptions trainOptions;

  /**
   * Creates an annotator.
   *
   * @param hf   Used to pick the head child of every phrasal node.
   * @param tlpp Supplies the treebank language pack (for
   *     {@code basicCategory}) and the language-specific tree transform.
   * @param op   Only its {@code trainOptions} field is read and kept.
   */
  public TreeAnnotator(HeadFinder hf, TreebankLangParserParams tlpp,
                       Options op) {
    this.tlpParams = tlpp;
    this.hf = hf;
    this.tf = new LabeledScoredTreeFactory();
    this.trainOptions = op.trainOptions;
  }

  /** Do the category splitting of the tree passed in.
   *  This method defensively copies its argument, which is not changed.
   *
   *  @param t The tree to be annotated.  This can be any tree with a
   *     {@code value()} stored in Labels.  The tree is assumed to have
   *     preterminals that are parts of speech.
   *  @return The annotated version of the Tree (which is a completely
   *     separate Tree with new tree structure and new labels).  The
   *     non-leaf nodes of the tree will be CategoryWordTag objects.
   */
  @Override
  public Tree transformTree(Tree t) {
    // make a defensive copy which the helper method can then mangle
    Tree copy = t.deepCopy(tf);
    if (trainOptions.markStrahler) {
      // Strahler marks are written into the labels before any other annotation
      markStrahler(copy);
    }
    return transformTreeHelper(copy, copy);
  }

  /**
   * Do the category splitting of the tree passed in.
   * This is initially called on the root node of a tree, and it recursively
   * calls itself on children.  A depth first left-to-right traversal is
   * done whereby a tree node's children are first transformed and then
   * the parent is transformed.  At the time of calling, the original root
   * always sits above the current node.  This routine can be assumed to,
   * and does, change the tree passed in: it destructively modifies tree nodes,
   * and makes new tree structure when it needs to.
   *
   * @param t The tree node to subcategorize.
   * @param root The root of the tree.  It must contain {@code t} or
   *     this code will throw a NullPointerException.
   * @return The annotated tree; {@code null} if {@code t} is {@code null},
   *     and {@code t} itself (unchanged) if it is a leaf.
   * @throws RuntimeException if the head finder returns no head (or a head
   *     without a label) for a phrasal node, or if the head's label does not
   *     implement both {@code HasWord} and {@code HasTag}.
   */
  private Tree transformTreeHelper(Tree t, Tree root) {
    if (t == null) {
      return null;
    }
    if (t.isLeaf()) {
      // words at the leaves keep their labels
      return t;
    }

    String cat = t.label().value();
    Tree parent;
    String parentStr;
    String grandParentStr;
    if (root == null || t.equals(root)) {
      parent = null;
      parentStr = "";
    } else {
      parent = t.parent(root);
      parentStr = parent.label().value();
    }
    if (parent == null || parent.equals(root)) {
      grandParentStr = "";
    } else {
      grandParentStr = parent.parent(root).label().value();
    }
    String baseParentStr = tlpParams.treebankLanguagePack().basicCategory(parentStr);
    String baseGrandParentStr = tlpParams.treebankLanguagePack().basicCategory(grandParentStr);

    if (t.isPreTerminal()) {
      return annotatePreTerminal(t, root, parent, cat, baseParentStr);
    }

    // handle phrasal categories: children first, then this node
    Tree[] kids = t.children();
    for (int childNum = 0; childNum < kids.length; childNum++) {
      Tree childResult = transformTreeHelper(kids[childNum], root); // recursive call
      t.setChild(childNum, childResult);
    }

    Tree headChild = hf.determineHead(t);
    if (headChild == null || headChild.label() == null) {
      throw new RuntimeException("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
    }

    Label headLabel = headChild.label();
    if ( ! (headLabel instanceof HasWord)) {
      throw new RuntimeException("TreeAnnotator: Head label lacks a Word annotation!");
    }
    if ( ! (headLabel instanceof HasTag)) {
      throw new RuntimeException("TreeAnnotator: Head label lacks a Tag annotation!");
    }

    // the phrasal node inherits word and tag from its (already annotated) head child
    String word = ((HasWord) headLabel).word();
    String tag = ((HasTag) headLabel).tag();

    String baseCat = tlpParams.treebankLanguagePack().basicCategory(cat);

    if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.length() > 0) {
      cat = appendSisterAnnotation(t, parent, cat, baseCat);
    }

    // parent annotation: CAT^PARENT
    if (trainOptions.PA && !trainOptions.smoothing && baseParentStr.length() > 0) {
      String cat2 = baseCat + "^" + baseParentStr;
      if (!trainOptions.selectiveSplit || trainOptions.splitters.contains(cat2)) {
        cat = cat + "^" + baseParentStr;
      }
    }
    // grandparent annotation: ...~GRANDPARENT
    if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.length() > 0) {
      if (trainOptions.selectiveSplit) {
        String cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
        // under selective splitting, only add the grandparent when a "^" mark is already present
        if (cat.contains("^") && trainOptions.splitters.contains(cat2)) {
          cat = cat + "~" + baseGrandParentStr;
        }
      } else {
        cat = cat + "~" + baseGrandParentStr;
      }
    }
    if (trainOptions.markUnary > 0) {
      if (trainOptions.markUnary == 1 && kids.length == 1 && kids[0].depth() >= 2) {
        // mode 1 marks the node that has a single, sufficiently deep child
        cat = cat + "-U";
      } else if (trainOptions.markUnary == 2 && parent != null && parent.numChildren() == 1 && t.depth() >= 2) {
        // mode 2 marks the only child of its parent
        cat = cat + "-u";
      }
    }
    if (trainOptions.rightRec && rightRec(t, baseCat)) {
      cat = cat + "-R";
    }
    if (trainOptions.leftRec && leftRec(t, baseCat)) {
      cat = cat + "-L";
    }
    if (trainOptions.splitPrePreT && t.isPrePreTerminal()) {
      cat = cat + "-PPT";
    }

    relabel(t, cat, word, tag);

    return tlpParams.transformTree(t, root);
  }

  /**
   * Annotates a preterminal (part-of-speech) node.
   * Unless {@code noTagSplit} is set, the tag may receive a parent mark
   * ({@code TAG^PARENT}) and a unary mark ({@code ^U}), and the node is then
   * passed to the language-specific transform.
   *
   * @param t             The preterminal node; it is modified in place.
   * @param root          The root of the tree, forwarded to the
   *     language-specific transform.
   * @param parent        The parent of {@code t}, or {@code null} when
   *     {@code t} is the root.
   * @param cat           The current label value of {@code t}.
   * @param baseParentStr The basic category of the parent ("" if none).
   * @return The annotated preterminal.
   */
  private Tree annotatePreTerminal(Tree t, Tree root, Tree parent, String cat, String baseParentStr) {
    Tree childResult = transformTreeHelper(t.children()[0], null); // recurse
    String word = childResult.value();

    if ( ! trainOptions.noTagSplit) {
      if (trainOptions.tagPA) {
        String test = cat + "^" + baseParentStr;
        if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.contains(test)) {
          cat = test;
        }
      }
      // parent is null when the preterminal is itself the root; there is
      // then no parent whose child count could be inspected
      if (trainOptions.markUnaryTags && parent != null && parent.numChildren() == 1) {
        cat = cat + "^U";
      }
    } // otherwise, leave the tags alone!

    // for a preterminal the (possibly split) category doubles as the tag
    relabel(t, cat, word, cat);
    t.setChild(0, childResult);  // just in case word is changed
    if (trainOptions.noTagSplit) {
      return t;
    }
    // language-specific transforms
    return tlpParams.transformTree(t, root);
  }

  /**
   * Sister annotation. Potential problem: if multiple sisters are
   * strong indicators for a single category's expansions.  This
   * happens concretely in the Chinese Treebank when NP (object)
   * has left sisters VV and AS.  Could lead to too much
   * sparseness.  The ideal solution would be to give the
   * splitting list an ordering, and take only the highest (~most
   * informative/reliable) sister annotation.
   * <p>
   * Candidate annotations have the form {@code baseCat=l=SISTER} and
   * {@code baseCat=r=SISTER}.  The first entry of
   * {@code trainOptions.sisterSplitters} that equals one of the candidates
   * is appended to {@code cat}, minus its leading {@code baseCat}.
   *
   * @param t       The node being annotated.
   * @param parent  The parent of {@code t}.
   * @param cat     The category string built so far.
   * @param baseCat The basic category of {@code t}.
   * @return {@code cat} with the sister mark appended, or {@code cat}
   *     unchanged when no splitter matches.
   */
  private String appendSisterAnnotation(Tree t, Tree parent, String cat, String baseCat) {
    List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
    List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));

    List<String> leftAnn = new ArrayList<>();
    List<String> rightAnn = new ArrayList<>();

    // the sisters were already reduced by listBasicCategories; the second
    // basicCategory call is retained from the original code
    for (String s : leftSis) {
      leftAnn.add(baseCat + "=l=" + tlpParams.treebankLanguagePack().basicCategory(s));
    }
    for (String s : rightSis) {
      rightAnn.add(baseCat + "=r=" + tlpParams.treebankLanguagePack().basicCategory(s));
    }

    for (String annCat : trainOptions.sisterSplitters) {
      if (leftAnn.contains(annCat) || rightAnn.contains(annCat)) {
        // Every candidate starts with baseCat, so a match does too.  Strip
        // the prefix literally rather than via a regex built from baseCat,
        // which would misbehave on categories containing regex metacharacters.
        return cat + annCat.substring(baseCat.length());
      }
    }
    return cat;
  }

  /**
   * Replaces the label of {@code t} with a fresh copy (made by the label's
   * own factory) whose value is {@code cat}.  Category, word and tag are
   * also set when the new label implements the corresponding interface.
   *
   * @param t    The node to relabel; modified in place.
   * @param cat  The new value (and category) of the label.
   * @param word The word to store on the label.
   * @param tag  The tag to store on the label.
   */
  private static void relabel(Tree t, String cat, String word, String tag) {
    Label label = t.label().labelFactory().newLabel(t.label());
    label.setValue(cat);
    if (label instanceof HasCategory) {
      ((HasCategory) label).setCategory(cat);
    }
    if (label instanceof HasWord) {
      ((HasWord) label).setWord(word);
    }
    if (label instanceof HasTag) {
      ((HasTag) label).setTag(tag);
    }
    t.setLabel(label);
  }


  /**
   * Maps every string of {@code l} through the language pack's
   * {@code basicCategory}.
   *
   * @param l The category strings to reduce.
   * @return A new list of the same length holding the basic categories.
   */
  private List<String> listBasicCategories(List<String> l) {
    List<String> l1 = new ArrayList<>();
    for (String str : l) {
      l1.add(tlpParams.treebankLanguagePack().basicCategory(str));
    }
    return l1;
  }


  /**
   * Tests for right recursion.  Only the category "NP" is considered; for
   * any other {@code baseCat} the result is {@code false}.
   *
   * @param t       The node whose chain of last children is followed.
   * @param baseCat The basic category of {@code t}.
   * @return {@code true} if some node on the last-child chain below
   *     {@code t} (down to and including the leaf) has a label value
   *     starting with {@code baseCat}.
   */
  private static boolean rightRec(Tree t, String baseCat) {
    if (//! baseCat.equals("S") &&
            !baseCat.equals("NP")) {
      return false;
    }
    while (!t.isLeaf()) {
      t = t.lastChild();
      String str = t.label().value();
      if (str.startsWith(baseCat)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tests for left recursion.  Unlike {@link #rightRec}, this is not
   * restricted to a particular category.
   *
   * @param t       The node whose chain of first children is followed.
   * @param baseCat The basic category of {@code t}.
   * @return {@code true} if some node on the first-child chain below
   *     {@code t} (down to and including the leaf) has a label value
   *     starting with {@code baseCat}.
   */
  private static boolean leftRec(Tree t, String baseCat) {
    while (!t.isLeaf()) {
      t = t.firstChild();
      String str = t.label().value();
      if (str.startsWith(baseCat)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Appends {@code "~" + n} to the label value of every non-leaf node of
   * {@code t}, where {@code n} is the number computed for that node: the
   * maximum of its children's numbers, plus one when at least two children
   * share that maximum.  Leaves count as 1 and are not relabelled.
   *
   * @param t The tree to mark; its non-leaf labels are replaced in place.
   * @return The number computed for {@code t}.
   */
  private static int markStrahler(Tree t) {
    if (t.isLeaf()) {
      // don't annotate the words at leaves!
      return 1;
    } else {
      String cat = t.label().value();
      int maxStrahler = -1;
      int maxMultiplicity = 0;
      for (int i = 0; i < t.numChildren(); i++) {
        int strahler = markStrahler(t.getChild(i));
        if (strahler > maxStrahler) {
          maxStrahler = strahler;
          maxMultiplicity = 1;
        } else if (strahler == maxStrahler) {
          maxMultiplicity++;
        }
      }
      if (maxMultiplicity > 1) {
        maxStrahler++;  // this is the one case where it grows
      }
      cat = cat + '~' + maxStrahler;
      Label label = t.label().labelFactory().newLabel(t.label());
      label.setValue(cat);
      t.setLabel(label);
      return maxStrahler;
    }
  }

}