package edu.stanford.nlp.trees.international.tuebadz; import edu.stanford.nlp.trees.AbstractTreebankLanguagePack; import edu.stanford.nlp.trees.TreebankLanguagePack; import edu.stanford.nlp.trees.TreeReaderFactory; import edu.stanford.nlp.trees.HeadFinder; /** Language pack for the Tuebingen Treebank of Written German (TueBa-D/Z). * http://www.sfs.nphil.uni-tuebingen.de/en_tuebadz.shtml * This treebank is in utf-8. * * @author Roger Levy (rog@stanford.edu) */ public class TueBaDZLanguagePack extends AbstractTreebankLanguagePack { private boolean limitedGF = false; private static String[] gfToKeepArray = {"ON", "OA", "OD"}; private static String[] tuebadzPunctTags = {"$.","$,","$-LRB"}; private static String[] tuebadzSFPunctTags = {"$."}; private static String[] tuebadzPunctWords = { "`", "-", ",", ";", ":", "!", "?", "/", ".", "...","'", "\"", "[", "]", "*"}; private static String[] tuebadzSFPunctWords = {".", "!", "?"}; /** * The first one is used by the TueBaDZ Treebank, and the rest are used by Klein's lexparser. */ private static char[] annotationIntroducingChars = {':', '^', '~', '%', '#', '='}; /** * Gives a handle to the TreebankLanguagePack */ public TueBaDZLanguagePack() { this(false); } /** * Make a new language pack with grammatical functions used based on the value of leaveGF */ public TueBaDZLanguagePack(boolean leaveGF) { this(leaveGF, AbstractTreebankLanguagePack.DEFAULT_GF_CHAR); } /** * Make a new language pack with grammatical functions used based on the value of leaveGF * and marked with the character gfChar. gfChar should *not* be an annotation introducing character. */ public TueBaDZLanguagePack(boolean leaveGF, char gfChar) { this(false, leaveGF, gfChar); } /** * Make a new language pack with grammatical functions used based on the value of leaveGF * and marked with the character gfChar. gfChar should *not* be an annotation introducing character. */ public TueBaDZLanguagePack(boolean useLimitedGF, boolean leaveGF, char gfChar) { super(gfChar); this.leaveGF = leaveGF; this.limitedGF = useLimitedGF; } /** * Return an array of characters at which a String should be * truncated to give the basic syntactic category of a label. * The idea here is that Penn treebank style labels follow a syntactic * category with various functional and crossreferencing information * introduced by special characters (such as "NP-SBJ=1"). This would * be truncated to "NP" by the array containing '-' and "=". * * @return An array of characters that set off label name suffixes */ @Override public char[] labelAnnotationIntroducingCharacters() { return annotationIntroducingChars; } @Override public String[] punctuationTags() { return tuebadzPunctTags; } @Override public String[] punctuationWords() { return tuebadzPunctWords; } @Override public String[] sentenceFinalPunctuationTags() { return tuebadzSFPunctTags; } @Override public String[] startSymbols() { return new String[] {"TOP"}; } public String[] sentenceFinalPunctuationWords() { return tuebadzSFPunctWords; } public String treebankFileExtension() { return ".penn"; } private boolean leaveGF = false; @Override public String basicCategory(String category) { String basicCat = super.basicCategory(category); if(!leaveGF) { basicCat = stripGF(basicCat); } return basicCat; } @Override public String stripGF(String category) { if(category == null) { return null; } int index = category.lastIndexOf(gfCharacter); if(index > 0) { if(!limitedGF || !containsKeptGF(category, index)) category = category.substring(0, index); } return category; } /** * Helper method for determining if the gf in category * is one of those in the array gfToKeepArray. Index is the * index where the gfCharacter appears. */ private static boolean containsKeptGF(String category, int index) { for(String gf : gfToKeepArray) { int gfLength = gf.length(); if(gfLength < (category.length() - index)) { if(category.substring(index+1).equals(gf))//category.substring(index+1, index+1+gfLength).equals(gf)) return true; } } return false; } public boolean isLeaveGF() { return leaveGF; } public void setLeaveGF(boolean leaveGF) { this.leaveGF = leaveGF; } /** * Return the input Charset encoding for the Treebank. * See documentation for the <code>Charset</code> class. * * @return Name of Charset */ @Override public String getEncoding() { return "iso-8859-15"; } /** Prints a few aspects of the TreebankLanguagePack, just for debugging. */ public static void main(String[] args) { TreebankLanguagePack tlp = new TueBaDZLanguagePack(); System.out.println("Start symbol: " + tlp.startSymbol()); String start = tlp.startSymbol(); System.out.println("Should be true: " + (tlp.isStartSymbol(start))); String[] strs = new String[]{"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3", "CARD-HD"}; for (String str : strs) { System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str)); } } private static final long serialVersionUID = 2697418320262700673L; public boolean isLimitedGF() { return limitedGF; } public void setLimitedGF(boolean limitedGF) { this.limitedGF = limitedGF; } @Override public TreeReaderFactory treeReaderFactory() { return new TueBaDZTreeReaderFactory(this); } /** {@inheritDoc} */ public HeadFinder headFinder() { return new TueBaDZHeadFinder(); } /** {@inheritDoc} */ public HeadFinder typedDependencyHeadFinder() { return new TueBaDZHeadFinder(); } }