BuildBinarizedDataset.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.sentiment; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.StringReader;
import java.util.List;
import java.util.Map;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
 * @author John Bauer
 * @author Richard Socher
 */
public class BuildBinarizedDataset  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(BuildBinarizedDataset.class);

  private BuildBinarizedDataset() {} // static methods only

  /**
   * Sets all of the labels on a tree to the given default value.
   */
  public static void setUnknownLabels(Tree tree, Integer defaultLabel) {
    if (tree.isLeaf()) {
      return;
    }

    for (Tree child : tree.children()) {
      setUnknownLabels(child, defaultLabel);
    }

    tree.label().setValue(defaultLabel.toString());
  }

  public static void setPredictedLabels(Tree tree) {
    if (tree.isLeaf()) {
      return;
    }

    for (Tree child : tree.children()) {
      setPredictedLabels(child);
    }

    tree.label().setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree)));
  }

  public static void extractLabels(Map<Pair<Integer, Integer>, String> spanToLabels, List<HasWord> tokens, String line) {
    String[] pieces = line.trim().split("\\s+");
    if (pieces.length == 0) {
      return;
    }
    if (pieces.length == 1) {
      String error = "Found line with label " + line + " but no tokens to associate with that line";
      throw new RuntimeException(error);
    }

    //TODO: BUG: The pieces are tokenized differently than the splitting, e.g., on possessive markers as in "actors' expenses"
    for (int i = 0; i < tokens.size() - pieces.length + 2; ++i) {
      boolean found = true;
      for (int j = 1; j < pieces.length; ++j) {
        if (!tokens.get(i + j - 1).word().equals(pieces[j])) {
          found = false;
          break;
        }
      }
      if (found) {
        spanToLabels.put(new Pair<>(i, i + pieces.length - 1), pieces[0]);
      }
    }
  }

  public static boolean setSpanLabel(Tree tree, Pair<Integer, Integer> span, String value) {
    if (!(tree.label() instanceof CoreLabel)) {
      throw new AssertionError("Expected CoreLabels");
    }
    CoreLabel label = (CoreLabel) tree.label();
    if (label.get(CoreAnnotations.BeginIndexAnnotation.class).equals(span.first) &&
        label.get(CoreAnnotations.EndIndexAnnotation.class).equals(span.second)) {
      label.setValue(value);
      return true;
    }
    if (label.get(CoreAnnotations.BeginIndexAnnotation.class) > span.first &&
        label.get(CoreAnnotations.EndIndexAnnotation.class) < span.second) {
      return false;
    }
    for (Tree child : tree.children()) {
      if (setSpanLabel(child, span, value)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Turns a text file into trees for use in a RNTN classifier such as
   * the treebank used in the Sentiment project.
   * <br>
   * The expected input file is one sentence per line, with sentences
   * separated by blank lines. The first line has the main label of the sentence together with the full sentence.
   * Lines after the first sentence line but before
   * the blank line will be treated as labeled sub-phrases.  The
   * labels should start with the label and then contain a list of
   * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
   *  For example:
   * <br>
   * <code>
   * 1 Today is not a good day.<br>
   * 3 good<br>
   * 3 good day <br>
   * 3 a good day <br>
   * <br>
   * (next block starts here) <br>
   * </code>
   * By default the englishPCFG parser is used.  This can be changed
   * with the <code>-parserModel</code> flag.  Specify an input file
   * with <code>-input</code>.
   * <br>
   * If a sentiment model is provided with -sentimentModel, that model
   * will be used to prelabel the sentences.  Any spans with given
   * labels will then be used to adjust those labels.
   */
  public static void main(String[] args) {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();

    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

    String inputPath = null;

    String sentimentModelPath = null;
    SentimentModel sentimentModel = null;

    for (int argIndex = 0; argIndex < args.length; ) {
      if (args[argIndex].equalsIgnoreCase("-input")) {
        inputPath = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
        parserModel = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
        sentimentModelPath = args[argIndex + 1];
        argIndex += 2;
      } else {
        log.info("Unknown argument " + args[argIndex]);
        System.exit(2);
      }
    }

    if (inputPath == null) {
      throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
      sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
      if (chunk.trim().isEmpty()) {
        continue;
      }
      // The expected format is that line 0 will be the text of the
      // sentence, and each subsequence line, if any, will be a value
      // followed by the sequence of tokens that get that value.

      // Here we take the first line and tokenize it as one sentence.
      String[] lines = chunk.trim().split("\\n");
      String sentence = lines[0];
      StringReader sin = new StringReader(sentence);
      DocumentPreprocessor document = new DocumentPreprocessor(sin);
      document.setSentenceFinalPuncWords(new String[] {"\n"});
      List<HasWord> tokens = document.iterator().next();
      Integer mainLabel = new Integer(tokens.get(0).word());
      //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
      tokens = tokens.subList(1, tokens.size());
      //log.info(tokens);

      Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
      for (int i = 1; i < lines.length; ++i) {
        extractLabels(spanToLabels, tokens, lines[i]);
      }

      // TODO: add an option which treats the spans as constraints when parsing

      Tree tree = parser.apply(tokens);
      Tree binarized = binarizer.transformTree(tree);
      Tree collapsedUnary = transformer.transformTree(binarized);

      // if there is a sentiment model for use in prelabeling, we
      // label here and then use the user given labels to adjust
      if (sentimentModel != null) {
        Trees.convertToCoreLabels(collapsedUnary);
        SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
        scorer.forwardPropagateTree(collapsedUnary);
        setPredictedLabels(collapsedUnary);
      } else {
        setUnknownLabels(collapsedUnary, mainLabel);
      }

      Trees.convertToCoreLabels(collapsedUnary);
      collapsedUnary.indexSpans();

      for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
        setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
      }

      System.out.println(collapsedUnary);
      //System.out.println();
    }
  } // end main

}