GrammarCoverageChecker.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Timing;

import java.util.*;

/**
 * Checks the coverage of rules in a grammar on a test treebank.
 *
 * @author Teg Grenager
 */
public class GrammarCoverageChecker {
  private Options op;

  private void testOnTreebank(LexicalizedParser pd, TreebankLangParserParams tlpParams, Treebank testTreebank, String treebankRoot, Index<String> stateIndex) {
    Timing.startTime();
    TreeTransformer annotator = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
    // CDM: Aug 2004: With new implementation of treebank split categories,
    // I've hardwired this to load English ones.  Otherwise need training data.
    // op.trainOptions.splitters = new HashSet(Arrays.asList(op.tlpParams.splitters()));
    op.trainOptions.splitters = ParentAnnotationStats.getEnglishSplitCategories(treebankRoot);
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    for (Tree goldTree : testTreebank) {
      goldTree = annotator.transformTree(goldTree);
      //      System.out.println();
      //      System.out.println("Checking tree: " + goldTree);
      for (Tree localTree : goldTree) {
        // now try to use the grammar to score this local tree
        if (localTree.isLeaf() || localTree.isPreTerminal() || localTree.children().length < 2) {
          continue;
        }
        System.out.println(localTreeToRule(localTree));
        double score = computeLocalTreeScore(localTree, stateIndex, pd);
        if (score == Double.NEGATIVE_INFINITY) {
          //          System.out.println(localTreeToRule(localTree));
        }
        System.out.println("score: " + score);
      }
    }
  }

  private static String localTreeToRule(Tree localTree) {
    StringBuilder sb = new StringBuilder();
    sb.append(localTree.value()).append(" -> ");
    for (int i = 0; i < localTree.children().length - 1; i++) {
      sb.append(localTree.children()[i].value()).append(" ");
    }
    sb.append(localTree.children()[localTree.children().length - 1].value());
    return sb.toString();
  }

  private static double computeLocalTreeScore(Tree localTree, Index<String> stateIndex, LexicalizedParser pd) {
    try {
      String parent = localTree.value();
      int parentState = stateIndex.indexOf(parent);
      //      System.out.println("parentState: " + parentState);
      Tree[] children = localTree.children();
      // let's find the unary to kick things off with the left child (since we assume a left to right grammar
      // first we create the synthetic parent of the leftmost child
      String nextChild = children[0].value();
      // childState = stateIndex.indexOf(nextChild);
      String current = "@" + parent + "| [ [" + nextChild + "] ";
      int currentState = stateIndex.indexOf(current);
      List<UnaryRule> rules = pd.ug.rulesByParent(currentState);
      UnaryRule ur = rules.get(0);
      //      System.out.println("rule: " + ur);
      double localTreeScore = ur.score();
      // go through rest of rules
      for (int i = 1; i < children.length; i++) {
        // find rules in BinaryGrammar that can extend this state
        //        System.out.println("currentState: " + currentState);
        nextChild = children[i].value();
        int childState = stateIndex.indexOf(nextChild);
        //        System.out.println("childState: " + childState);
        List<BinaryRule> l = pd.bg.ruleListByLeftChild(currentState);
        BinaryRule foundBR = null;
        if (i < children.length - 1) {
          // need to the rewrite that doesn't rewrite to the parent
          for (BinaryRule br : l) {
            //            System.out.println("\t\trule: " + br + " parent: " + br.parent + " right: " + br.rightChild);
            if (br.rightChild == childState && br.parent != parentState) {
              foundBR = br;
              break;
            }
          }
        } else {
          // this is the last rule, need to find the rewrite to the parent of the whole local tree
          for (BinaryRule br : l) {
            //            System.out.println("\t\trule: " + br + " parent: " + br.parent + " right: " + br.rightChild);
            if (br.rightChild == childState && br.parent == parentState) {
              foundBR = br;
              break;
            }
          }
        }
        if (foundBR == null) {
          // we never found a matching rule!
          //          System.out.println("broke on " + nextChild);
          return Double.NEGATIVE_INFINITY;
        }
        //        System.out.println("rule: " + foundBR);
        currentState = foundBR.parent;
        localTreeScore += foundBR.score;
      } // end loop through children
      return localTreeScore;
    } catch (NoSuchElementException e) {
      // we couldn't find a state for one of the needed categories
      //      System.out.println("no state found: " + e.toString());
      //      List tempRules = pd.ug.rulesByChild(childState);
      //      for (Iterator iter = tempRules.iterator(); iter.hasNext();) {
      //        UnaryRule ur = (UnaryRule) iter.next();
      //        System.out.println("\t\t\trule with child: " + ur);
      //      }
      return Double.NEGATIVE_INFINITY;
    }
  }


  /**
   * Usage: java edu.stanford.nlp.parser.lexparser.GrammarCoverageChecker parserFile treebankPath low high [optionFlags*]
   */
  public static void main(String[] args) {
    new GrammarCoverageChecker().runTest(args);
  }

  public void runTest(String[] args) {
    // get a parser from file
    LexicalizedParser pd = LexicalizedParser.loadModel(args[0]);
    op = pd.getOp(); // in case a serialized options was read in
    Treebank testTreebank = op.tlpParams.memoryTreebank();
    int testlow = Integer.parseInt(args[2]);
    int testhigh = Integer.parseInt(args[3]);
    testTreebank.loadPath(args[1], new NumberRangeFileFilter(testlow, testhigh, true));
    op.setOptionsOrWarn(args, 4, args.length);
    testOnTreebank(pd, new EnglishTreebankParserParams(), testTreebank, args[1], pd.stateIndex);
  }
}