package edu.stanford.nlp.ie.machinereading.common; import edu.stanford.nlp.ling.CategoryWordTag; import edu.stanford.nlp.trees.DiskTreebank; import edu.stanford.nlp.trees.HeadFinder; import edu.stanford.nlp.trees.ModCollinsHeadFinder; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeVisitor; import edu.stanford.nlp.trees.Treebank; /** * Simple variant of the ModCollinsHeadFinder avoids supplying punctuation tags * as heads whenever possible. * * @author David McClosky (mcclosky@stanford.edu) * */ public class NoPunctuationHeadFinder extends ModCollinsHeadFinder { private static final long serialVersionUID = 1201891305937180385L; /** * Returns whether a part of speech tag is the tag for a punctuation mark (by * checking whether the first character is a letter. * * @param label * part of speech tag * @return whether the tag is (typically) assigned to punctuation */ private boolean isPunctuationLabel(String label) { return !Character.isLetter(label.charAt(0)) && !(label.equals("$") || label.equals("%")); } protected int postOperationFix(int headIdx, Tree[] daughterTrees) { int index = super.postOperationFix(headIdx, daughterTrees); // if the current index is a punctuation mark, we search left until we // find a non-punctuation mark tag or hit the left end of the sentence while (index > 0) { String label = daughterTrees[index].label().value(); if (isPunctuationLabel(label)) { index--; } else { break; } } return index; } public static void main(String[] args) { // simple testing code Treebank treebank = new DiskTreebank(); CategoryWordTag.suppressTerminalDetails = true; treebank.loadPath(args[0]); final HeadFinder chf = new NoPunctuationHeadFinder(); treebank.apply(pt -> { pt.percolateHeads(chf); pt.pennPrint(); System.out.println(); }); } }