FTBDataset.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.international.french.pipeline; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.trees.treebank.AbstractDataset;
import edu.stanford.nlp.trees.treebank.ConfigParser;
import edu.stanford.nlp.trees.treebank.DefaultMapper;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.trees.international.french.FrenchTreebankLanguagePack;
import edu.stanford.nlp.trees.international.french.FrenchXMLTreeReaderFactory;
import edu.stanford.nlp.trees.tregex.TregexParseException;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.DataFilePaths;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;

/**
 * Produces the pre-processed version of the FTB used in the experiments of
 * Green et al. (2011).
 *
 * @author Spence Green
 *
 */
public class FTBDataset extends AbstractDataset  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(FTBDataset.class);

  private boolean CC_TAGSET = false;

  private Set<String> splitSet;

  public FTBDataset() {
    super();

    //Need to use a MemoryTreebank so that we can compute gross corpus
    //stats for MWE pre-processing
    // The treebank may be reset if setOptions changes CC_TAGSET
    treebank = new MemoryTreebank(new FrenchXMLTreeReaderFactory(CC_TAGSET), FrenchTreebankLanguagePack.FTB_ENCODING);
    treeFileExtension = "xml";
  }

  /**
   * Return the ID of this tree according to the Candito split files.
   */
  private String getCanditoTreeID(Tree t) {
    String canditoName = null;
    if (t.label() instanceof CoreLabel) {
      String fileName = ((CoreLabel) t.label()).docID();
      fileName = fileName.substring(0, fileName.lastIndexOf('.'));
      String ftbID = ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
      if (fileName != null && ftbID != null) {
        canditoName = fileName + "-" + ftbID;
      } else {
        throw new NullPointerException("fileName " + fileName + ", ftbID " + ftbID);
      }
    } else {
      throw new IllegalArgumentException("Trees constructed without CoreLabels! Can't extract metadata!");
    }
    return canditoName;
  }

  @Override
  public void build() {
    for(File path : pathsToData) {
      treebank.loadPath(path,treeFileExtension,false);
    }

    PrintWriter outfile = null;
    PrintWriter flatFile = null;
    try {
      outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8")));
      flatFile = (makeFlatFile) ? new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName),"UTF-8"))) : null;

      outputFileList.add(outFileName);

      if(makeFlatFile) {
        outputFileList.add(flatFileName);
        toStringBuffer.append(" Made flat files\n");
      }

      preprocessMWEs();

      List<TregexPattern> badTrees = new ArrayList<>();
      //These trees appear in the Candito training set
      //They are mangled by the TreeCorrector, so discard them ahead of time.
      badTrees.add(TregexPattern.compile("@SENT <: @PUNC"));
      badTrees.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __"));

      //wsg2011: This filters out tree #552 in the Candito test set. We saved this tree for the
      //EMNLP2011 paper, but since it consists entirely of punctuation, it won't be evaluated anyway.
      //Since we aren't doing the split in this data set, just remove the tree.
      badTrees.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC <3 @PUNC <4 @PUNC !<5 __"));

      for(Tree t : treebank) {
        //Filter out bad trees
        boolean skipTree = false;
        for(TregexPattern p : badTrees) {
          skipTree = p.matcher(t).find();
          if(skipTree) break;
        }
        if(skipTree) {
          log.info("Discarding tree: " + t.toString());
          continue;
        }

        // Filter out trees that aren't in this part of the split
        if (splitSet != null) {
          String canditoTreeID = getCanditoTreeID(t);
          if ( ! splitSet.contains(canditoTreeID)) {
            continue;
          }
        }

        if(customTreeVisitor != null)
          customTreeVisitor.visitTree(t);

        // outfile.printf("%s\t%s%n",treeName,t.toString());
        outfile.println(t.toString());

        if(makeFlatFile) {
          String flatString = (removeEscapeTokens) ?
              ATBTreeUtils.unEscape(ATBTreeUtils.flattenTree(t)) : ATBTreeUtils.flattenTree(t);
              flatFile.println(flatString);
        }
      }

    } catch (UnsupportedEncodingException e) {
      System.err.printf("%s: Filesystem does not support UTF-8 output%n", this.getClass().getName());
      e.printStackTrace();

    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open %s for writing%n", this.getClass().getName(), outFileName);

    } catch (TregexParseException e) {
      System.err.printf("%s: Could not compile Tregex expressions%n", this.getClass().getName());
      e.printStackTrace();

    } finally {
      if(outfile != null)
        outfile.close();
      if(flatFile != null)
        flatFile.close();
    }
  }

  /**
   * Corrects MWE annotations that lack internal POS labels.
   */
  private void preprocessMWEs() {

    TwoDimensionalCounter<String,String> labelTerm =
            new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String,String> termLabel =
            new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String,String> labelPreterm =
            new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String,String> pretermLabel =
            new TwoDimensionalCounter<>();

    TwoDimensionalCounter<String,String> unigramTagger =
            new TwoDimensionalCounter<>();

    for (Tree t : treebank) {
      MWEPreprocessor.countMWEStatistics(t, unigramTagger,
          labelPreterm, pretermLabel,
          labelTerm, termLabel);
    }

    for (Tree t : treebank) {
      MWEPreprocessor.traverseAndFix(t, pretermLabel, unigramTagger);
    }
  }


  @Override
  public boolean setOptions(Properties opts) {
    boolean ret = super.setOptions(opts);

    if (opts.containsKey(ConfigParser.paramSplit)) {
      String splitFileName = opts.getProperty(ConfigParser.paramSplit);
      splitSet = makeSplitSet(splitFileName);
    }

    CC_TAGSET = PropertiesUtils.getBool(opts, ConfigParser.paramCCTagset, false);
    treebank = new MemoryTreebank(new FrenchXMLTreeReaderFactory(CC_TAGSET), FrenchTreebankLanguagePack.FTB_ENCODING);

    if(lexMapper == null) {
      lexMapper = new DefaultMapper();
      lexMapper.setup(null, lexMapOptions.split(","));
    }

    if(pathsToMappings.size() != 0) {
      if(posMapper == null)
        posMapper = new DefaultMapper();
      for(File path : pathsToMappings)
        posMapper.setup(path);
    }

    return ret;
  }

  private Set<String> makeSplitSet(String splitFileName) {
    splitFileName = DataFilePaths.convert(splitFileName);
    Set<String> splitSet = Generics.newHashSet();
    LineNumberReader reader = null;
    try {
      reader = new LineNumberReader(new FileReader(splitFileName));
      for (String line; (line = reader.readLine()) != null;) {
        splitSet.add(line.trim());
      }
      reader.close();

    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      System.err.printf("%s: Error reading %s (line %d)%n", this.getClass().getName(), splitFileName, reader.getLineNumber());
      e.printStackTrace();
    }
    return splitSet;
  }
}