package edu.stanford.nlp.trees.international.arabic; import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.trees.*; import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification; import edu.stanford.nlp.international.arabic.process.ArabicTokenizer; import edu.stanford.nlp.international.morph.MorphoFeatureSpecification; import edu.stanford.nlp.ling.HasWord; /** * Specifies the treebank/language specific components needed for * parsing the Penn Arabic Treebank (ATB). This language pack has been updated for * ATB1v4, ATB2v3, and ATB3v3.2 * <p> * The encoding for the ATB is the default UTF-8 specified in AbstractTreebankLanguagePack. * * @author Christopher Manning * @author Mona Diab * @author Roger Levy * @author Spence Green * */ public class ArabicTreebankLanguagePack extends AbstractTreebankLanguagePack { private static final long serialVersionUID = 9081305982861675328L; private static final String[] collinsPunctTags = {"PUNC"}; private static final String[] pennPunctTags = {"PUNC"}; private static final String[] pennPunctWords = {".","\"",",","-LRB-","-RRB-","-",":","/","?","_","*","%","!",">","-PLUS-","...",";","..","&","=","ر","'","\\","`","......"}; private static final String[] pennSFPunctTags = {"PUNC"}; private static final String[] pennSFPunctWords = {".", "!", "?"}; /** * The first 3 are used by the Penn Treebank; # is used by the * BLLIP corpus, and ^ and ~ are used by Klein's lexparser. * Chris deleted '_' for Arabic as it appears in tags (NO_FUNC). * June 2006: CDM tested _ again with true (new) Treebank tags to see if it * was useful for densening up the tag space, but the results were negative. * Roger added + for Arabic but Chris deleted it again, since unless you've * recoded determiners, it screws up DET+NOUN, etc. (That is, it would only be useful if * you always wanted to cut at the first '+', but in practice that is not viable, certainly * not with the IBM ATB processing either.) */ private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'}; /** * This is valid for "BobChrisTreeNormalizer" conventions only. * wsg: "ROOT" should always be the first value. See {@link #startSymbol} in * the parent class. */ private static final String[] pennStartSymbols = {"ROOT"}; /** * Returns a String array of punctuation tags for this treebank/language. * * @return The punctuation tags */ @Override public String[] punctuationTags() { return pennPunctTags; } /** * Returns a String array of punctuation words for this treebank/language. * * @return The punctuation words */ @Override public String[] punctuationWords() { return pennPunctWords; } /** * Returns a String array of sentence final punctuation tags for this * treebank/language. * * @return The sentence final punctuation tags */ @Override public String[] sentenceFinalPunctuationTags() { return pennSFPunctTags; } /** * Returns a String array of sentence final punctuation words for this * treebank/language. * * @return The sentence final punctuation tags */ public String[] sentenceFinalPunctuationWords() { return pennSFPunctWords; } /** * Returns a String array of punctuation tags that EVALB-style evaluation * should ignore for this treebank/language. * Traditionally, EVALB has ignored a subset of the total set of * punctuation tags in the English Penn Treebank (quotes and * period, comma, colon, etc., but not brackets) * * @return Whether this is a EVALB-ignored punctuation tag */ @Override public String[] evalBIgnoredPunctuationTags() { return collinsPunctTags; } /** * Return an array of characters at which a String should be * truncated to give the basic syntactic category of a label. * The idea here is that Penn treebank style labels follow a syntactic * category with various functional and crossreferencing information * introduced by special characters (such as "NP-SBJ=1"). This would * be truncated to "NP" by the array containing '-' and "=". * * @return An array of characters that set off label name suffixes */ @Override public char[] labelAnnotationIntroducingCharacters() { return annotationIntroducingChars; } /** * Returns a String array of treebank start symbols. * * @return The start symbols */ @Override public String[] startSymbols() { return pennStartSymbols; } /** * TODO: there is no way to change this using options. */ private TokenizerFactory<? extends HasWord> tf = ArabicTokenizer.atbFactory(); /** * Return a tokenizer which might be suitable for tokenizing text * that will be used with this Treebank/Language pair. We tokenize * the Arabic using the ArabicTokenizer class. * * @return A tokenizer */ @Override public TokenizerFactory<? extends HasWord> getTokenizerFactory() { return tf; } /** * Returns the extension of treebank files for this treebank. * This is "tree". */ public String treebankFileExtension() { return "tree"; } @Override public TreeReaderFactory treeReaderFactory() { return new ArabicTreeReaderFactory(); } @Override public String toString() { return "ArabicTreebankLanguagePack"; } /** {@inheritDoc} */ public HeadFinder headFinder() { return new ArabicHeadFinder(this); } /** {@inheritDoc} */ public HeadFinder typedDependencyHeadFinder() { return new ArabicHeadFinder(this); } @Override public MorphoFeatureSpecification morphFeatureSpec() { return new ArabicMorphoFeatureSpecification(); } /** * * @param args */ public static void main(String[] args) { TreebankLanguagePack tlp = new PennTreebankLanguagePack(); System.out.println("Start symbol: " + tlp.startSymbol()); String start = tlp.startSymbol(); System.out.println("Should be true: " + (tlp.isStartSymbol(start))); String[] strs = new String[]{"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3"}; for (String str : strs) { System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str)); } } }