package edu.stanford.nlp.tagger.maxent; import edu.stanford.nlp.util.logging.Redwood; import java.util.regex.Pattern; /** * Look for verbs selecting a VBN verb. * This is now a zeroeth order observed data only feature. * But reminiscent of what was done in Toutanova and Manning 2000. * It doesn't seem to help tagging performance any more. * * @author Christopher Manning */ public class ExtractorVerbalVBNZero extends DictionaryExtractor { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ExtractorVerbalVBNZero.class); private static final String vbnTag = "VBN"; private static final String vbdTag = "VBD"; private static final String jjTag = "JJ"; private static final String edSuff = "ed"; private static final String enSuff = "en"; private static final String oneSt = "1"; private static final String naWord = "NA"; private final int bound; private static final Pattern stopper = Pattern.compile("(?i:and|or|but|,|;|-|--)"); private static final Pattern vbnWord = Pattern.compile("(?i:have|has|having|had|is|am|are|was|were|be|being|been|'ve|'s|s|'d|'re|'m|gotten|got|gets|get|getting)"); // cf. list in EnglishPTBTreebankCorrector public ExtractorVerbalVBNZero(int bound) { this.bound = bound; } @Override public boolean precondition(String tag) { log.info("VBN: Testing precondition on " + tag + ": " + (tag.equals(vbnTag) || tag.equals(vbdTag) || tag.equals(jjTag))); return tag.equals(vbnTag) || tag.equals(vbdTag) || tag.equals(jjTag); } @Override String extract(History h, PairsHolder pH) { String cword = pH.getWord(h, 0); int allCount = dict.sum(cword); int vBNCount = dict.getCount(cword, vbnTag); int vBDCount = dict.getCount(cword, vbdTag); // Conditions for deciding inapplicable if ((allCount == 0) && (!(cword.endsWith(edSuff) || cword.endsWith(enSuff)))) { return zeroSt; } if ((allCount > 0) && (vBNCount + vBDCount <= allCount / 100)) { return zeroSt; } String lastverb = naWord; //String lastvtag = zeroSt; // mg: written but never read for (int index = -1; index >= -bound; index--) { String word2 = pH.getWord(h, index); if ("NA".equals(word2)) { break; } if (stopper.matcher(word2).matches()) { break; } if (vbnWord.matcher(word2).matches()) { lastverb = word2; break; } index--; } if ( ! lastverb.equals(naWord)) { log.info("VBN: For " + cword + ", found preceding VBN cue " + lastverb); return oneSt; } return zeroSt; } @Override public String toString() { return "ExtractorVerbalVBNZero(bound=" + bound + ')'; } private static final long serialVersionUID = -5881204185400060636L; }