IETestUtils.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.ie.util;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.util.Pair;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Factor out some commonly used code (e.g., make a tree from a CoNLL spec)
 *
 * @author Gabor Angeli
 */
public class IETestUtils {

  /**
   * Create a dummy word, just with a given word at a given index.
   * Mostly useful for making semantic graphs.
   */
  public static CoreLabel mkWord(String gloss, int index) {
    CoreLabel w = new CoreLabel();
    w.setWord(gloss);
    w.setValue(gloss);
    if (index >= 0) {
      w.setIndex(index);
    }
    return w;
  }

  /**
   * Parse a CoNLL formatted string into a SemanticGraph.
   * This is useful for tests so that you don't need to load the model (and are robust to
   * model changes).
   *
   * @param conll The CoNLL format for the tree.
   * @return A semantic graph, as well as the flat tokens of the sentence.
   */
  public static Pair<SemanticGraph,List<CoreLabel>> parseCoNLL(String conll) {
    List<CoreLabel> sentence = new ArrayList<>();
    SemanticGraph tree = new SemanticGraph();
    for (String line : conll.split("\n")) {
      if (line.trim().equals("")) { continue; }
      String[] fields = line.trim().split("\\s+");
      int index = Integer.parseInt(fields[0]);
      String word = fields[1];
      CoreLabel label = mkWord(word, index);
      sentence.add(label);
      if (fields[2].equals("0")) {
        tree.addRoot(new IndexedWord(label));
      } else {
        tree.addVertex(new IndexedWord(label));
      }
      if (fields.length > 4) {
        label.setTag(fields[4]);
      }
      if (fields.length > 5) {
        label.setNER(fields[5]);
      }
      if (fields.length > 6) {
        label.setLemma(fields[6]);
      }
    }
    int i = 0;
    for (String line : conll.split("\n")) {
      if (line.trim().equals("")) { continue; }
      String[] fields = line.trim().split("\\s+");
      int parent = Integer.parseInt(fields[2]);
      String reln = fields[3];
      if (parent > 0) {
        tree.addEdge(
            new IndexedWord(sentence.get(parent - 1)),
            new IndexedWord(sentence.get(i)),
            new GrammaticalRelation(Language.UniversalEnglish, reln, null, null),
            1.0, false
        );
      }
      i += 1;
    }

    return Pair.makePair(tree, sentence);
  }


  /**
   * Create a sentence (list of CoreLabels) from a given text.
   * The resulting labels will have a word, lemma (guessed poorly), and
   * a part of speech if one is specified on the input.
   *
   * @param text The text to parse.
   *
   * @return A sentence corresponding to the text.
   */
  public static List<CoreLabel> parseSentence(String text) {
    return Arrays.asList(text.split("\\s+")).stream().map(w -> {
      CoreLabel token = new CoreLabel();
      if (w.contains("/")) {
        String[] fields = w.split("/");
        token.setWord(fields[0]);
        token.setTag(fields[1]);
      } else {
        token.setWord(w);
      }
      token.setValue(token.word());
      token.setLemma(token.word());
      if (token.word().equals("is") || token.word().equals("was") || token.word().equals("are")) {
        token.setLemma("be");
      }
      if (token.word().equals("has")) {
        token.setLemma("have");
      }
      if (token.word().equals("did") | token.word().equals("will") || token.word().equals("does")) {
        token.setLemma("do");
      }
      if (token.word().endsWith("ed")) {
        token.setLemma(token.word().substring(0, token.word().length() - 1));
      }
      if (token.word().endsWith("ing")) {
        token.setLemma(token.word().substring(0, token.word().length() - 3));
      }
      return token;
    }).collect(Collectors.toList());
  }
}