package edu.stanford.nlp.trees.international.negra; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.process.PTBTokenizer; import edu.stanford.nlp.trees.AbstractTreebankLanguagePack; import edu.stanford.nlp.trees.TreeReaderFactory; import edu.stanford.nlp.trees.HeadFinder; /** * Language pack for Negra and Tiger treebanks <em>after</em> conversion to * PTB format. * * @author Roger Levy * @author Spence Green */ public class NegraPennLanguagePack extends AbstractTreebankLanguagePack { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(NegraPennLanguagePack.class); private static final long serialVersionUID = 9081305982861675328L; /** Grammatical function parameters. If this is true, keep subj, obj, iobj functional tags, only. */ private boolean leaveGF = false; private static String[] gfToKeepArray = {"SB", "OA", "DA"}; /** * Gives a handle to the TreebankLanguagePack */ public NegraPennLanguagePack() { this(false, AbstractTreebankLanguagePack.DEFAULT_GF_CHAR); } /** * Gives a handle to the TreebankLanguagePack */ public NegraPennLanguagePack(boolean leaveGF) { this(leaveGF, AbstractTreebankLanguagePack.DEFAULT_GF_CHAR); } /** * Make a new language pack with grammatical functions used based on the value of leaveGF * and marked with the character gfChar. gfChar should *not* be an annotation introducing character. */ public NegraPennLanguagePack(boolean leaveGF, char gfChar) { super(gfChar); this.leaveGF = leaveGF; } private static final String NEGRA_ENCODING = "ISO-8859-1"; private static final String[] evalBignoredTags = {"$.", "$,"}; private static final String[] negraSFPunctTags = {"$."}; private static final String[] negraSFPunctWords = {".", "!", "?"}; private static final String[] negraPunctTags = {"$.", "$,", "$*LRB*"}; /** * The unicode escape is for a middle dot character */ private static final String[] negraPunctWords = {"-", ",", ";", ":", "!", "?", "/", ".", "...", "\u00b7", "'", "\"", "(", ")", "*LRB*", "*RRB*"}; /** * The first 3 are used by the Penn Treebank; # is used by the * BLLIP corpus, and ^ and ~ are used by Klein's lexparser. */ private static char[] annotationIntroducingChars = {'-', '%', '=', '|', '#', '^', '~'}; /** * This is valid for "BobChrisTreeNormalizer" conventions only. */ private static String[] pennStartSymbols = {"ROOT"}; /** * Returns a String array of punctuation tags for this treebank/language. * * @return The punctuation tags */ @Override public String[] punctuationTags() { return negraPunctTags; } /** * Returns a String array of punctuation words for this treebank/language. * * @return The punctuation words */ @Override public String[] punctuationWords() { return negraPunctWords; } /** * Returns a String array of sentence final punctuation tags for this * treebank/language. * * @return The sentence final punctuation tags */ @Override public String[] sentenceFinalPunctuationTags() { return negraSFPunctTags; } /** * Returns a String array of sentence final punctuation words for this * treebank/language. * * @return The sentence final punctuation tags */ public String[] sentenceFinalPunctuationWords() { return negraSFPunctWords; } //wsg2010: Disabled limited grammatical functions for now, which decrease F1 by ~10.0. @Override public String basicCategory(String category) { String basicCat; if (leaveGF) { basicCat = stripGF(category); } else { basicCat = super.basicCategory(category); } // log.info("NPLP stripping " + category + " with leaveGF = " + leaveGF + " gives " + basicCat); return basicCat; } @Override public String stripGF(String category) { if(category == null) { return null; } int index = category.lastIndexOf(gfCharacter); if(index > 0) { if(!containsKeptGF(category, index)) category = category.substring(0, index); } return category; } /** * Helper method for determining if the gf in category * is one of those in the array gfToKeepArray. Index is the * index where the gfCharacter appears. */ private static boolean containsKeptGF(String category, int index) { for(String gf : gfToKeepArray) { int gfLength = gf.length(); if(gfLength < (category.length() - index)) { if(category.substring(index+1, index+1+gfLength).equals(gf)) return true; } } return false; } /** * Returns a String array of punctuation tags that EVALB-style evaluation * should ignore for this treebank/language. * Traditionally, EVALB has ignored a subset of the total set of * punctuation tags in the English Penn Treebank (quotes and * period, comma, colon, etc., but not brackets) * * @return Whether this is a EVALB-ignored punctuation tag */ @Override public String[] evalBIgnoredPunctuationTags() { return evalBignoredTags; } /** * Return an array of characters at which a String should be * truncated to give the basic syntactic category of a label. * The idea here is that Penn treebank style labels follow a syntactic * category with various functional and crossreferencing information * introduced by special characters (such as "NP-SBJ=1"). This would * be truncated to "NP" by the array containing '-' and "=". * * @return An array of characters that set off label name suffixes */ @Override public char[] labelAnnotationIntroducingCharacters() { return annotationIntroducingChars; } /** * Returns a String array of treebank start symbols. * * @return The start symbols */ @Override public String[] startSymbols() { return pennStartSymbols; } /** * Return the input Charset encoding for the Treebank. * See documentation for the <code>Charset</code> class. * * @return Name of Charset */ @Override public String getEncoding() { return NEGRA_ENCODING; } /** * Returns the extension of treebank files for this treebank. * This is "mrg". */ public String treebankFileExtension() { return "mrg"; } public boolean isLeaveGF() { return leaveGF; } public void setLeaveGF(boolean leaveGF) { this.leaveGF = leaveGF; } @Override public TreeReaderFactory treeReaderFactory() { return new NegraPennTreeReaderFactory(this); } /** {@inheritDoc} */ public HeadFinder headFinder() { return new NegraHeadFinder(this); } /** {@inheritDoc} */ public HeadFinder typedDependencyHeadFinder() { return new NegraHeadFinder(this); } /** * Return a tokenizer which might be suitable for tokenizing text that * will be used with this Treebank/Language pair, without tokenizing carriage * returns (i.e., treating them as white space). For German (Negra) we used * to only provide a {@link edu.stanford.nlp.process.WhitespaceTokenizer}, * but people didn't much like that. * So now we provide {@link PTBTokenizer}. It's not customized to German, but * will nevertheless do better than WhitespaceTokenizer at tokenizing German! * * @return A tokenizer */ @Override public TokenizerFactory<Word> getTokenizerFactory() { return PTBTokenizer.factory(); } }