// FilterConfusingRules.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.parser.dvparser; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;

import edu.stanford.nlp.parser.lexparser.BinaryGrammar;
import edu.stanford.nlp.parser.lexparser.BinaryRule;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.parser.lexparser.UnaryGrammar;
import edu.stanford.nlp.parser.lexparser.UnaryRule;
import edu.stanford.nlp.trees.Tree;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.TwoDimensionalSet;

/**
 * This filter rejects Trees which have unary or binary productions
 * which the given parser does not contain.  
 * <br>
 * One situation where this happens often is when grammar compaction
 * is turned on; this can often result in a Tree where there is no
 * BinaryRule which could explicitly create a particular node, but
 * the Tree is still valid.  However, for various applications of the
 * DVParser, this kind of Tree is useless.  A good way to eliminate
 * most of this kind of tree is to make sure the parser is trained
 * with <code>-compactGrammar 0</code>.
 */
public class FilterConfusingRules implements Predicate<Tree>, Serializable  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(FilterConfusingRules.class);

  /**
   * Basic categories that occur as the child of at least one unary
   * rule in the parser's grammar.
   */
  final Set<String> unaryRules = new HashSet<>();

  /**
   * (left, right) pairs of basic categories that occur together as
   * the children of at least one binary rule in the parser's grammar.
   */
  final TwoDimensionalSet<String, String> binaryRules = new TwoDimensionalSet<>();

  /**
   * When true, the collected rules are logged at construction time
   * and the reason for each rejection is logged in {@link #test}.
   */
  static final boolean DEBUG = false;

  /**
   * Collects the child categories of every unary and binary rule in
   * the given parser's grammars.  States are reduced to their basic
   * category (via the parser's language pack) before being stored.
   * Note that the parent state of a rule is not recorded.
   *
   * @param parser the parser whose grammars define the acceptable productions
   */
  public FilterConfusingRules(LexicalizedParser parser) {
    BinaryGrammar binaryGrammar = parser.bg;
    UnaryGrammar unaryGrammar = parser.ug;
    Options op = parser.getOp();
    Index<String> stateIndex = parser.stateIndex;

    for (UnaryRule unaryRule : unaryGrammar) {
      // a unary production is identified solely by the basic category of its child
      unaryRules.add(basicCategory(op, stateIndex, unaryRule.child));
    }

    for (BinaryRule binaryRule : binaryGrammar) {
      // a binary production is identified by the basic categories of its two children
      String leftBasic = basicCategory(op, stateIndex, binaryRule.leftChild);
      String rightBasic = basicCategory(op, stateIndex, binaryRule.rightChild);
      binaryRules.add(leftBasic, rightBasic);
    }

    if (DEBUG) {
      log.info("UNARY RULES");
      for (String rule : unaryRules) {
        log.info("  " + rule);
      }
      log.info();
      log.info("BINARY RULES");
      for (Pair<String, String> rule : binaryRules) {
        log.info("  " + rule);
      }
      log.info();
      log.info();
    }
  }

  /** Looks up the state with the given index and reduces it to its basic category. */
  private static String basicCategory(Options op, Index<String> stateIndex, int state) {
    return op.langpack().basicCategory(stateIndex.get(state));
  }

  /**
   * Recursively checks that every internal production of the tree
   * uses children whose labels were seen as the children of some rule
   * in the parser's grammar.  Only the child labels are compared; the
   * label of the parent node is not consulted.
   *
   * @param tree a binarized tree (every internal node has one or two children)
   * @return true if the tree is a leaf or preterminal, or if this node
   *         and all of its descendants use known productions; false otherwise
   * @throws AssertionError if a node other than a leaf or preterminal
   *         has zero children or more than two children
   */
  @Override
  public boolean test(Tree tree) {
    if (tree.isLeaf() || tree.isPreTerminal()) {
      return true;
    }

    // fetch the children once rather than on every access
    final Tree[] children = tree.children();
    if (children.length == 0 || children.length > 2) {
      throw new AssertionError("Tree not binarized");
    }

    if (children.length == 1) {
      String childLabel = children[0].label().value();
      if (!unaryRules.contains(childLabel)) {
        if (DEBUG) {
          log.info("Filtered tree because of unary rule: " + childLabel);
        }
        return false;
      }
    } else {
      String leftLabel = children[0].label().value();
      String rightLabel = children[1].label().value();
      if (!binaryRules.contains(leftLabel, rightLabel)) {
        if (DEBUG) {
          log.info("Filtered tree because of binary rule: " + leftLabel + "," + rightLabel);
        }
        return false;
      }
    }

    // this node is fine; the tree passes only if every subtree passes as well
    for (Tree child : children) {
      if (!test(child)) {
        return false;
      }
    }
    return true;
  }

}