FrenchTreebankLanguagePack.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.trees.international.french;

import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.HeadFinder;


/**
 * Language pack for the French treebank.
 *
 * @author mcdm
 */
public class FrenchTreebankLanguagePack extends AbstractTreebankLanguagePack {

  private static final long serialVersionUID = -7338244949063822519L;

  //wsg2011: The distributed treebank is encoding in ISO8859_1, but
  //the current FrenchTreebankParserParams is currently configured to
  //read UTF-8, PTB style trees that have been extracted from the XML
  //files.
  public static final String FTB_ENCODING = "ISO8859_1";

  //The raw treebank uses "PONCT". Change to LDC convention.
  private static final String[] frenchPunctTags = {"PUNC"};

  private static final String[] frenchSFPunctTags = {"PUNC"};

  private static final String[] frenchPunctWords = {"=","*","/","\\","]","[","\"","''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?", "!", ",", ":", "-", "--", "...", ";", """};

  private static final String[] frenchSFPunctWords = {".", "!", "?"};

  private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'};

  private static final String[] frenchStartSymbols = {"ROOT"};

  @Override
  public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
    return FrenchTokenizer.ftbFactory();
  }
  
  @Override
  public String getEncoding() {
    return FTB_ENCODING;
  }
  
  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags
   */
  @Override
  public String[] punctuationTags() {
    return frenchPunctTags;
  }


  /**
   * Returns a String array of punctuation words for this treebank/language.
   *
   * @return The punctuation words
   */
  @Override
  public String[] punctuationWords() {
    return frenchPunctWords;
  }


  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  @Override
  public String[] sentenceFinalPunctuationTags() {
    return frenchSFPunctTags;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  public String[] sentenceFinalPunctuationWords() {
    return frenchSFPunctWords;
  }


  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that French treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SUBJ").  This would
   * be truncated to "NP" by the array containing '-'.
   *
   * @return An array of characters that set off label name suffixes
   */
  @Override
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }


  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols
   */
  @Override
  public String[] startSymbols() {
    return frenchStartSymbols;
  }


  /**
   * Returns the extension of treebank files for this treebank.
   */
  public String treebankFileExtension() {
    return "xml";
  }

  /** {@inheritDoc} */
  public HeadFinder headFinder() {
    return new FrenchHeadFinder(this);
  }
  
  /** {@inheritDoc} */
  public HeadFinder typedDependencyHeadFinder() {
    return new FrenchHeadFinder(this);
  }
  
  @Override
  public MorphoFeatureSpecification morphFeatureSpec() {
    return new FrenchMorphoFeatureSpecification();
  }

}