package edu.stanford.nlp.parser.dvparser;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import edu.stanford.nlp.parser.lexparser.BinaryGrammar;
import edu.stanford.nlp.parser.lexparser.BinaryRule;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.parser.lexparser.UnaryGrammar;
import edu.stanford.nlp.parser.lexparser.UnaryRule;
import edu.stanford.nlp.trees.Tree;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.TwoDimensionalSet;
/**
* This filter rejects Trees which have unary or binary productions
* which the given parser does not contain.
* <br>
* One situation where this happens often is when grammar compaction
* is turned on; this can often result in a Tree where there is no
* BinaryRule which could explicitly create a particular node, but
* the Tree is still valid. However, for various applications of the
* DVParser, this kind of Tree is useless. A good way to eliminate
* most of this kind of tree is to make sure the parser is trained
* with <code>-compactGrammar 0</code>.
*/
public class FilterConfusingRules implements Predicate<Tree>, Serializable {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(FilterConfusingRules.class);

  /** Basic categories of every state that appears as the child of a unary rule in the parser's grammar. */
  final Set<String> unaryRules = new HashSet<>();

  /** (left, right) basic-category pairs of every binary rule's children in the parser's grammar. */
  final TwoDimensionalSet<String, String> binaryRules = new TwoDimensionalSet<>();

  /** When true, the collected rules and the reason for each rejected tree are logged. */
  static final boolean DEBUG = false;

  /**
   * Collects the unary and binary productions known to the given parser.
   * <br>
   * Each rule is reduced to the basic categories of its children (as
   * computed by the parser's language pack); the parent state of the
   * rule is not recorded.
   *
   * @param parser the parser whose unary grammar, binary grammar, state
   *               index and options are read
   */
  public FilterConfusingRules(LexicalizedParser parser) {
    BinaryGrammar binaryGrammar = parser.bg;
    UnaryGrammar unaryGrammar = parser.ug;
    Options op = parser.getOp();
    Index<String> stateIndex = parser.stateIndex;

    // A unary rule is identified solely by the basic category of its child.
    for (UnaryRule unaryRule : unaryGrammar) {
      unaryRules.add(basicCategory(op, stateIndex, unaryRule.child));
    }

    // A binary rule is identified by the basic categories of its
    // (left child, right child) pair.
    for (BinaryRule binaryRule : binaryGrammar) {
      String leftBasic = basicCategory(op, stateIndex, binaryRule.leftChild);
      String rightBasic = basicCategory(op, stateIndex, binaryRule.rightChild);
      binaryRules.add(leftBasic, rightBasic);
    }

    if (DEBUG) {
      log.info("UNARY RULES");
      for (String rule : unaryRules) {
        log.info("  " + rule);
      }
      log.info();
      log.info("BINARY RULES");
      for (Pair<String, String> rule : binaryRules) {
        log.info("  " + rule);
      }
      log.info();
      log.info();
    }
  }

  /** Looks up the state name for {@code state} and strips it down to its basic category. */
  private static String basicCategory(Options op, Index<String> stateIndex, int state) {
    return op.langpack().basicCategory(stateIndex.get(state));
  }

  /**
   * Decides whether every production in the tree was seen in the
   * parser's grammar.
   * <br>
   * Leaves and preterminals are always accepted.  For any other node,
   * the label values of its children are looked up directly in the
   * collected rule sets, and then each child subtree is checked
   * recursively.
   * NOTE(review): labels are compared as-is against basic categories,
   * so the tree presumably must already be labeled with basic
   * categories - confirm against callers.
   *
   * @param tree the (binarized) tree to check
   * @return true if no node uses an unknown unary or binary production,
   *         false as soon as one such node is found
   * @throws AssertionError if a non-leaf, non-preterminal node has zero
   *         or more than two children
   */
  @Override
  public boolean test(Tree tree) {
    if (tree.isLeaf() || tree.isPreTerminal()) {
      return true;
    }

    // Fetch the child array once instead of on every use.
    Tree[] children = tree.children();
    if (children.length == 0 || children.length > 2) {
      throw new AssertionError("Tree not binarized");
    }

    if (children.length == 1) {
      String childLabel = children[0].label().value();
      if (!unaryRules.contains(childLabel)) {
        if (DEBUG) {
          log.info("Filtered tree because of unary rule: " + childLabel);
        }
        return false;
      }
    } else {
      String leftLabel = children[0].label().value();
      String rightLabel = children[1].label().value();
      if (!binaryRules.contains(leftLabel, rightLabel)) {
        if (DEBUG) {
          log.info("Filtered tree because of binary rule: " + leftLabel + "," + rightLabel);
        }
        return false;
      }
    }

    // This node is fine; the tree is accepted only if every subtree is too.
    for (Tree child : children) {
      if (!test(child)) {
        return false;
      }
    }
    return true;
  }
}