package edu.stanford.nlp.trees.international.french; import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification; import edu.stanford.nlp.international.french.process.FrenchTokenizer; import edu.stanford.nlp.international.morph.MorphoFeatureSpecification; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.trees.AbstractTreebankLanguagePack; import edu.stanford.nlp.trees.HeadFinder; /** * Language pack for the French treebank. * * @author mcdm */ public class FrenchTreebankLanguagePack extends AbstractTreebankLanguagePack { private static final long serialVersionUID = -7338244949063822519L; //wsg2011: The distributed treebank is encoding in ISO8859_1, but //the current FrenchTreebankParserParams is currently configured to //read UTF-8, PTB style trees that have been extracted from the XML //files. public static final String FTB_ENCODING = "ISO8859_1"; //The raw treebank uses "PONCT". Change to LDC convention. private static final String[] frenchPunctTags = {"PUNC"}; private static final String[] frenchSFPunctTags = {"PUNC"}; private static final String[] frenchPunctWords = {"=","*","/","\\","]","[","\"","''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?", "!", ",", ":", "-", "--", "...", ";", """}; private static final String[] frenchSFPunctWords = {".", "!", "?"}; private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'}; private static final String[] frenchStartSymbols = {"ROOT"}; @Override public TokenizerFactory<? extends HasWord> getTokenizerFactory() { return FrenchTokenizer.ftbFactory(); } @Override public String getEncoding() { return FTB_ENCODING; } /** * Returns a String array of punctuation tags for this treebank/language. * * @return The punctuation tags */ @Override public String[] punctuationTags() { return frenchPunctTags; } /** * Returns a String array of punctuation words for this treebank/language. * * @return The punctuation words */ @Override public String[] punctuationWords() { return frenchPunctWords; } /** * Returns a String array of sentence final punctuation tags for this * treebank/language. * * @return The sentence final punctuation tags */ @Override public String[] sentenceFinalPunctuationTags() { return frenchSFPunctTags; } /** * Returns a String array of sentence final punctuation words for this * treebank/language. * * @return The sentence final punctuation tags */ public String[] sentenceFinalPunctuationWords() { return frenchSFPunctWords; } /** * Return an array of characters at which a String should be * truncated to give the basic syntactic category of a label. * The idea here is that French treebank style labels follow a syntactic * category with various functional and crossreferencing information * introduced by special characters (such as "NP-SUBJ"). This would * be truncated to "NP" by the array containing '-'. * * @return An array of characters that set off label name suffixes */ @Override public char[] labelAnnotationIntroducingCharacters() { return annotationIntroducingChars; } /** * Returns a String array of treebank start symbols. * * @return The start symbols */ @Override public String[] startSymbols() { return frenchStartSymbols; } /** * Returns the extension of treebank files for this treebank. */ public String treebankFileExtension() { return "xml"; } /** {@inheritDoc} */ public HeadFinder headFinder() { return new FrenchHeadFinder(this); } /** {@inheritDoc} */ public HeadFinder typedDependencyHeadFinder() { return new FrenchHeadFinder(this); } @Override public MorphoFeatureSpecification morphFeatureSpec() { return new FrenchMorphoFeatureSpecification(); } }