package edu.stanford.nlp.international.french.scripts;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;

import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.french.pipeline.FTBCorrector;
import edu.stanford.nlp.international.french.pipeline.MWEPreprocessor;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatures;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader;
import edu.stanford.nlp.trees.international.french.FrenchXMLTreeReaderFactory;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.Generics;

/**
 * Performs the pre-processing of raw (XML) FTB trees for the EMNLP2011 and CL2011 experiments.
 *
 * @author John Bauer
 * @author Spence Green
 *
 */
public final class SplitCanditoTrees {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(SplitCanditoTrees.class);

  /**
   * true -- mwetoolkit experiments, factored lexicon experiments
   * false -- basic parsing experiments
   */
  private static final boolean LEMMAS_AS_LEAVES = false;

  /**
   * true -- factored lexicon experiments
   * false -- mwetoolkit experiments, basic parsing experiments
   */
  private static final boolean ADD_MORPHO_TO_LEAVES = false;

  /**
   * true -- Use the CC tagset
   * false -- Use the default tagset
   */
  private static final boolean CC_TAGSET = true;

  /**
   * Output Morfette training files instead of PTB-style trees
   */
  private static final boolean MORFETTE_OUTPUT = false;

  // Statistics
  private static int nTokens = 0;
  private static int nMorphAnalyses = 0;

  private static final Integer[] fSizes = {1235, 1235, 9881, 10000000};
  private static final String[] fNames = {"candito.test", "candito.dev", "candito.train", "candito.train.extended"};

  private SplitCanditoTrees() {} // static main method only

  /** Reads the tree IDs (the first tab-separated field of each line) from the given file. */
  static List<String> readIds(String filename) throws IOException {
    List<String> ids = new ArrayList<>();
    BufferedReader fin = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "ISO8859_1"));
    String line;
    while ((line = fin.readLine()) != null) {
      String[] pieces = line.split("\t");
      ids.add(pieces[0].trim());
    }
    fin.close();
    return ids;
  }

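  /**
   * Reads all trees from the given FTB XML files. Each tree is keyed by
   * {@code <file basename>-<sentence id>}, where the sentence id comes from
   * the tree's {@link CoreAnnotations.SentenceIDAnnotation}.
   */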
  static Map<String, Tree> readTrees(String[] filenames) throws IOException {
    // TODO: perhaps we can just pass in CC_TAGSET and get rid of replacePOSTags
    // need to test that
    final TreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
    Map<String, Tree> treeMap = Generics.newHashMap();

    for (String filename : filenames) {
      File file = new File(filename);
      String canonicalFilename = file.getName().substring(0, file.getName().lastIndexOf('.'));

      FrenchXMLTreeReader tr = (FrenchXMLTreeReader)
        trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO8859_1")));

      Tree t = null;
      int numTrees;
      for (numTrees = 0; (t = tr.readTree()) != null; numTrees++) {
        String id = canonicalFilename + "-" + ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
        treeMap.put(id, t);
      }
      tr.close();

      System.err.printf("%s: %d trees%n", file.getName(), numTrees);
    }

    return treeMap;
  }

  /** Collects MWE statistics over all trees, then rewrites MWE subtrees in place. */
  static void preprocessMWEs(Map<String, Tree> treeMap) {
    TwoDimensionalCounter<String,String> labelTerm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String,String> termLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String,String> labelPreterm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String,String> pretermLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String,String> unigramTagger = new TwoDimensionalCounter<>();

    for (Tree t : treeMap.values()) {
      MWEPreprocessor.countMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
    }

    for (Tree t : treeMap.values()) {
      MWEPreprocessor.traverseAndFix(t, pretermLabel, unigramTagger);
    }
  }

  /**
   * Optionally replaces each leaf with its lemma and/or appends the lemma and
   * morphological analysis to the leaf.
   */
  public static void mungeLeaves(Tree tree, boolean lemmasAsLeaves, boolean addMorphoToLeaves) {
    List<Label> labels = tree.yield();
    for (Label label : labels) {
      ++nTokens;
      if (!(label instanceof CoreLabel)) {
        throw new IllegalArgumentException("Only works with CoreLabels trees");
      }
      CoreLabel coreLabel = (CoreLabel) label;
      String lemma = coreLabel.lemma();
      // PTB escaping since we're going to put this in the leaf
      if (lemma == null) {
        // No lemma, so just add the surface form
        lemma = coreLabel.word();
      } else if (lemma.equals("(")) {
        lemma = "-LRB-";
      } else if (lemma.equals(")")) {
        lemma = "-RRB-";
      }

      if (lemmasAsLeaves) {
        coreLabel.setWord(lemma);
        coreLabel.setValue(lemma);
        coreLabel.setLemma(lemma);
      }

      if (addMorphoToLeaves) {
        String morphStr = coreLabel.originalText();
        if (morphStr == null || morphStr.equals("")) {
          morphStr = MorphoFeatureSpecification.NO_ANALYSIS;
        } else {
          ++nMorphAnalyses;
        }

        // Normalize punctuation analyses
        if (morphStr.startsWith("PONCT")) {
          morphStr = "PUNC";
        }

        String newLeaf = String.format("%s%s%s%s%s", coreLabel.value(),
            MorphoFeatureSpecification.MORPHO_MARK,
            lemma,
            MorphoFeatureSpecification.LEMMA_MARK,
            morphStr);
        coreLabel.setValue(newLeaf);
        coreLabel.setWord(newLeaf);
      }
    }
  }

  /**
   * Replaces each preterminal tag with the alternative tag derived from the
   * token's morphological analysis (the CC tagset), when one is available.
   */
  private static void replacePOSTags(Tree tree) {
    List<Label> yield = tree.yield();
    List<Label> preYield = tree.preTerminalYield();
    assert yield.size() == preYield.size();

    MorphoFeatureSpecification spec = new FrenchMorphoFeatureSpecification();

    for (int i = 0; i < yield.size(); i++) {
      // Morphological analysis
      String morphStr = ((CoreLabel) yield.get(i)).originalText();
      if (morphStr == null || morphStr.equals("")) {
        morphStr = preYield.get(i).value();
        // POS subcategory
        String subCat = ((CoreLabel) yield.get(i)).category();
        if (subCat != null && !subCat.equals("")) {
          morphStr += "-" + subCat + "--";
        } else {
          morphStr += "---";
        }
      }
      MorphoFeatures feats = spec.strToFeatures(morphStr);
      if (feats.getAltTag() != null && !feats.getAltTag().equals("")) {
        CoreLabel cl = (CoreLabel) preYield.get(i);
        cl.setValue(feats.getAltTag());
        cl.setTag(feats.getAltTag());
      }
    }
  }

  /**
   * Right now this outputs trees in PTB format. It outputs one tree
   * at a time until we have output enough trees to fill the given
   * file, then moves on to the next file. Trees are output in the
   * order given in the <code>ids</code> list.
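   * <br>
   * The number of trees per split is taken from {@code fSizes} and the
   * corresponding output file names from {@code fNames}
   * (candito.test, candito.dev, candito.train, candito.train.extended).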
   * <br>
   * Trees have their words replaced with the words' lemmas, if those
   * lemmas exist.
   */
  public static void outputSplits(List<String> ids, Map<String, Tree> treeMap) throws IOException {
    Queue<Integer> fSizeQueue = new LinkedList<>(Arrays.asList(fSizes));
    Queue<String> fNameQueue = new LinkedList<>(Arrays.asList(fNames));

    TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
    TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");

    final TreeTransformer tt = new FTBCorrector();

    int size = fSizeQueue.remove();
    String filename = fNameQueue.remove();
    log.info("Outputting " + filename);
    PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));

    int outputCount = 0;
    for (String id : ids) {
      if (!treeMap.containsKey(id)) {
        log.info("Missing id: " + id);
        continue;
      }
      Tree tree = treeMap.get(id);
      TregexMatcher m = pBadTree.matcher(tree);
      TregexMatcher m2 = pBadTree2.matcher(tree);
      if (m.find() || m2.find()) {
        log.info("Discarding tree: " + tree.toString());
        continue;
      }

      // Punctuation normalization, etc.
      Tree backupCopy = tree.deepCopy();
      tree = tt.transformTree(tree);
      if (tree.firstChild().children().length == 0) {
        // Some trees have only punctuation. Tregex will mangle these. Don't throw those away.
        log.info("Saving tree: " + tree.toString());
        log.info("Backup: " + backupCopy.toString());
        tree = backupCopy;
      }

      if (LEMMAS_AS_LEAVES || ADD_MORPHO_TO_LEAVES) {
        mungeLeaves(tree, LEMMAS_AS_LEAVES, ADD_MORPHO_TO_LEAVES);
      }

      if (CC_TAGSET) {
        replacePOSTags(tree);
      }

      if (MORFETTE_OUTPUT) {
        writer.println(treeToMorfette(tree));
      } else {
        writer.println(tree.toString());
      }

      ++outputCount;
      if (outputCount == size) {
        outputCount = 0;
        size = fSizeQueue.remove();
        filename = fNameQueue.remove();
        log.info("Outputting " + filename);
        writer.close();
        writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
      }
    }
    writer.close();
  }

  /**
   * Converts a tree to the Morfette training format.
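   * <p>
   * Each token is written on its own line as {@code word lemma morph},
   * separated by single spaces. When a token has no morphological analysis,
   * the preterminal POS tag is used in the morph column; when it has no
   * lemma, the surface form is used instead.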
   */
  private static String treeToMorfette(Tree tree) {
    StringBuilder sb = new StringBuilder();
    List<Label> yield = tree.yield();
    List<Label> tagYield = tree.preTerminalYield();
    assert yield.size() == tagYield.size();
    int listLen = yield.size();
    for (int i = 0; i < listLen; ++i) {
      CoreLabel token = (CoreLabel) yield.get(i);
      CoreLabel tag = (CoreLabel) tagYield.get(i);
      String morphStr = token.originalText();
      if (morphStr == null || morphStr.equals("")) {
        morphStr = tag.value();
      }
      String lemma = token.lemma();
      if (lemma == null || lemma.equals("")) {
        lemma = token.value();
      }
      sb.append(String.format("%s %s %s%n", token.value(), lemma, morphStr));
    }
    return sb.toString();
  }

  /**
   * Sample command line:
   * <br>
   * java edu.stanford.nlp.international.french.scripts.SplitCanditoTrees
   *   projects/core/src/edu/stanford/nlp/international/french/pipeline/splits/ftb-uc-2010.id_mrg
   *   ../data/french/corpus-fonctions/*.xml
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      System.err.printf("Usage: java %s id_file [xml files]%n", SplitCanditoTrees.class.getName());
      System.exit(-1);
    }

    // first arg is expected to be the file of IDs
    // all subsequent args are .xml files with the trees in them
    List<String> ids = readIds(args[0]);
    log.info("Read " + ids.size() + " ids");

    String[] newArgs = new String[args.length - 1];
    for (int i = 1; i < args.length; ++i) {
      newArgs[i - 1] = args[i];
    }

    Map<String, Tree> treeMap = readTrees(newArgs);
    log.info("Read " + treeMap.size() + " trees");

    preprocessMWEs(treeMap);

    outputSplits(ids, treeMap);

    if (nTokens != 0) {
      log.info("CORPUS STATISTICS");
      System.err.printf("#tokens:\t%d%n", nTokens);
      System.err.printf("#with morph:\t%d%n", nMorphAnalyses);
    }
  }
}