package edu.stanford.nlp.parser.lexparser; import edu.stanford.nlp.io.NumberRangeFileFilter; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeTransformer; import edu.stanford.nlp.trees.Treebank; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.Index; import edu.stanford.nlp.util.Timing; import java.util.*; /** * Checks the coverage of rules in a grammar on a test treebank. * * @author Teg Grenager */ public class GrammarCoverageChecker { private Options op; private void testOnTreebank(LexicalizedParser pd, TreebankLangParserParams tlpParams, Treebank testTreebank, String treebankRoot, Index<String> stateIndex) { Timing.startTime(); TreeTransformer annotator = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op); // CDM: Aug 2004: With new implementation of treebank split categories, // I've hardwired this to load English ones. Otherwise need training data. // op.trainOptions.splitters = new HashSet(Arrays.asList(op.tlpParams.splitters())); op.trainOptions.splitters = ParentAnnotationStats.getEnglishSplitCategories(treebankRoot); op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters())); for (Tree goldTree : testTreebank) { goldTree = annotator.transformTree(goldTree); // System.out.println(); // System.out.println("Checking tree: " + goldTree); for (Tree localTree : goldTree) { // now try to use the grammar to score this local tree if (localTree.isLeaf() || localTree.isPreTerminal() || localTree.children().length < 2) { continue; } System.out.println(localTreeToRule(localTree)); double score = computeLocalTreeScore(localTree, stateIndex, pd); if (score == Double.NEGATIVE_INFINITY) { // System.out.println(localTreeToRule(localTree)); } System.out.println("score: " + score); } } } private static String localTreeToRule(Tree localTree) { StringBuilder sb = new StringBuilder(); sb.append(localTree.value()).append(" -> "); for (int i = 0; i < localTree.children().length - 1; i++) { sb.append(localTree.children()[i].value()).append(" "); } sb.append(localTree.children()[localTree.children().length - 1].value()); return sb.toString(); } private static double computeLocalTreeScore(Tree localTree, Index<String> stateIndex, LexicalizedParser pd) { try { String parent = localTree.value(); int parentState = stateIndex.indexOf(parent); // System.out.println("parentState: " + parentState); Tree[] children = localTree.children(); // let's find the unary to kick things off with the left child (since we assume a left to right grammar // first we create the synthetic parent of the leftmost child String nextChild = children[0].value(); // childState = stateIndex.indexOf(nextChild); String current = "@" + parent + "| [ [" + nextChild + "] "; int currentState = stateIndex.indexOf(current); List<UnaryRule> rules = pd.ug.rulesByParent(currentState); UnaryRule ur = rules.get(0); // System.out.println("rule: " + ur); double localTreeScore = ur.score(); // go through rest of rules for (int i = 1; i < children.length; i++) { // find rules in BinaryGrammar that can extend this state // System.out.println("currentState: " + currentState); nextChild = children[i].value(); int childState = stateIndex.indexOf(nextChild); // System.out.println("childState: " + childState); List<BinaryRule> l = pd.bg.ruleListByLeftChild(currentState); BinaryRule foundBR = null; if (i < children.length - 1) { // need to the rewrite that doesn't rewrite to the parent for (BinaryRule br : l) { // System.out.println("\t\trule: " + br + " parent: " + br.parent + " right: " + br.rightChild); if (br.rightChild == childState && br.parent != parentState) { foundBR = br; break; } } } else { // this is the last rule, need to find the rewrite to the parent of the whole local tree for (BinaryRule br : l) { // System.out.println("\t\trule: " + br + " parent: " + br.parent + " right: " + br.rightChild); if (br.rightChild == childState && br.parent == parentState) { foundBR = br; break; } } } if (foundBR == null) { // we never found a matching rule! // System.out.println("broke on " + nextChild); return Double.NEGATIVE_INFINITY; } // System.out.println("rule: " + foundBR); currentState = foundBR.parent; localTreeScore += foundBR.score; } // end loop through children return localTreeScore; } catch (NoSuchElementException e) { // we couldn't find a state for one of the needed categories // System.out.println("no state found: " + e.toString()); // List tempRules = pd.ug.rulesByChild(childState); // for (Iterator iter = tempRules.iterator(); iter.hasNext();) { // UnaryRule ur = (UnaryRule) iter.next(); // System.out.println("\t\t\trule with child: " + ur); // } return Double.NEGATIVE_INFINITY; } } /** * Usage: java edu.stanford.nlp.parser.lexparser.GrammarCoverageChecker parserFile treebankPath low high [optionFlags*] */ public static void main(String[] args) { new GrammarCoverageChecker().runTest(args); } public void runTest(String[] args) { // get a parser from file LexicalizedParser pd = LexicalizedParser.loadModel(args[0]); op = pd.getOp(); // in case a serialized options was read in Treebank testTreebank = op.tlpParams.memoryTreebank(); int testlow = Integer.parseInt(args[2]); int testhigh = Integer.parseInt(args[3]); testTreebank.loadPath(args[1], new NumberRangeFileFilter(testlow, testhigh, true)); op.setOptionsOrWarn(args, 4, args.length); testOnTreebank(pd, new EnglishTreebankParserParams(), testTreebank, args[1], pd.stateIndex); } }