package edu.berkeley.nlp.treebank;

import java.io.Serializable;

/**
 * Specifies the treebank/language specific components needed for
 * parsing the English Penn Treebank.
 *
 * <p>All lookup tables are held in class-level constant arrays. The accessor
 * methods hand back those shared arrays directly (no defensive copy), so
 * callers must treat the returned arrays as read-only.
 *
 * @author Christopher Manning
 * @version 1.1
 */
public class PennTreebankLanguagePack extends AbstractTreebankLanguagePack implements Serializable {

  private static final long serialVersionUID = 9081305982861675328L;

  /** Part-of-speech tags that mark punctuation. */
  private static final String[] PENN_PUNCT_TAGS = {"''", "``", "-LRB-", "-RRB-", ".", ":", ","};

  /** Part-of-speech tags that mark sentence-final punctuation. */
  private static final String[] PENN_SF_PUNCT_TAGS = {"."};

  /** Punctuation tags ignored by EVALB-style evaluation (note: no bracket tags). */
  private static final String[] COLLINS_PUNCT_TAGS = {"''", "``", ".", ":", ","};

  /** Word forms that are punctuation. */
  private static final String[] PENN_PUNCT_WORDS = {"''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?", "!", ",", ":", "-", "--", "...", ";"};

  /** Word forms that are sentence-final punctuation. */
  private static final String[] PENN_SF_PUNCT_WORDS = {".", "!", "?"};

  /**
   * The first 3 are used by the Penn Treebank; # is used by the
   * BLLIP corpus, and ^ and ~ are used by Klein's lexparser.
   * Teg added the last one _ (let me know if it hurts).
   */
  private static final char[] ANNOTATION_INTRODUCING_CHARS = {'-', '=', '|', '#', '^', '~', '_'};

  /**
   * This is valid for "BobChrisTreeNormalizer" conventions only.
   */
  private static final String[] PENN_START_SYMBOLS = {"ROOT", "TOP"};

  /**
   * Gives a handle to the TreebankLanguagePack.
   */
  public PennTreebankLanguagePack() {
  }

  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags (shared array; do not modify)
   */
  public String[] punctuationTags() {
    return PENN_PUNCT_TAGS;
  }

  /**
   * Returns a String array of punctuation words for this treebank/language.
   *
   * @return The punctuation words (shared array; do not modify)
   */
  public String[] punctuationWords() {
    return PENN_PUNCT_WORDS;
  }

  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags (shared array; do not modify)
   */
  public String[] sentenceFinalPunctuationTags() {
    return PENN_SF_PUNCT_TAGS;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation words (shared array; do not modify)
   */
  public String[] sentenceFinalPunctuationWords() {
    return PENN_SF_PUNCT_WORDS;
  }

  /**
   * Returns a String array of punctuation tags that EVALB-style evaluation
   * should ignore for this treebank/language.
   * Traditionally, EVALB has ignored a subset of the total set of
   * punctuation tags in the English Penn Treebank (quotes and
   * period, comma, colon, etc., but not brackets).
   *
   * @return The EVALB-ignored punctuation tags (shared array; do not modify)
   */
  public String[] evalBIgnoredPunctuationTags() {
    return COLLINS_PUNCT_TAGS;
  }

  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SBJ=1"). This would
   * be truncated to "NP" by the array containing '-' and '='.
   *
   * @return An array of characters that set off label name suffixes
   *         (shared array; do not modify)
   */
  public char[] labelAnnotationIntroducingCharacters() {
    return ANNOTATION_INTRODUCING_CHARS;
  }

  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols (shared array; do not modify)
   */
  public String[] startSymbols() {
    return PENN_START_SYMBOLS;
  }

  /**
   * Returns the extension of treebank files for this treebank.
   * This is "mrg".
   *
   * @return The file extension, without a leading dot
   */
  public String treebankFileExtension() {
    return "mrg";
  }

  /**
   * Prints a few aspects of the TreebankLanguagePack, just for debugging.
   *
   * @param args Command-line arguments (unused)
   */
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    System.out.println("Start symbol: " + tlp.startSymbol());
    String start = tlp.startSymbol();
    System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
    // Sample labels exercising the annotation-introducing characters.
    String[] sampleLabels = {"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3"};
    for (String label : sampleLabels) {
      System.out.println("String: " + label + " basic: " + tlp.basicCategory(label) + " basicAndFunc: " + tlp.categoryAndFunction(label));
    }
  }
}