CoNLLOutputter.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * <p>Write a subset of our CoreNLP output in CoNLL format.</p>
 *
 * <p>The fields currently output are:</p>
 *
 * <table>
 *   <tr>
 *     <td>Field Number</td>
 *     <td>Field Name</td>
 *     <td>Description</td>
 *   </tr>
 *   <tr>
 *     <td>1</td>
 *     <td>ID</td>
 *     <td>Token Counter, starting at 1 for each new sentence.</td>
 *   </tr>
 *   <tr>
 *     <td>2</td>
 *     <td>FORM</td>
 *     <td>Word form or punctuation symbol.</td>
 *   </tr>
 *   <tr>
 *     <td>3</td>
 *     <td>LEMMA</td>
 *     <td>Lemma of word form, or an underscore if not available.</td>
 *   </tr>
 *   <tr>
 *     <td>4</td>
 *     <td>POSTAG</td>
 *     <td>Fine-grained part-of-speech tag, or underscore if not available.</td>
 *   </tr>
 *   <tr>
 *     <td>5</td>
 *     <td>NER</td>
 *     <td>Named Entity tag, or underscore if not available.</td>
 *   </tr>
 *   <tr>
 *     <td>6</td>
 *     <td>HEAD</td>
 *     <td>Head of the current token, which is either a value of ID or zero ('0').
 *         This is underscore if not available.</td>
 *   </tr>
 *   <tr>
 *     <td>7</td>
 *     <td>DEPREL</td>
 *     <td>Dependency relation to the HEAD, or underscore if not available.</td>
 *   </tr>
 * </table>
 *
 * @author Gabor Angeli
 */
public class CoNLLOutputter extends AnnotationOutputter {

  private static final String NULL_PLACEHOLDER = "_";

  public CoNLLOutputter() { }

  private static String orNull(String in) {
    if (in == null) {
      return NULL_PLACEHOLDER;
    } else {
      return in;
    }
  }

  /**
   * Produce a line of the CoNLL output.
   */
  private static String line(int index,
                      CoreLabel token,
                      int head, String deprel) {
    ArrayList<String> fields = new ArrayList<>(16);

    fields.add(Integer.toString(index)); // 1
    fields.add(orNull(token.word()));    // 2
    fields.add(orNull(token.lemma()));   // 3
    fields.add(orNull(token.tag()));     // 4
    fields.add(orNull(token.ner()));     // 5
    if (head >= 0) {
      fields.add(Integer.toString(head));  // 6
      fields.add(deprel);                  // 7
    } else {
      fields.add(NULL_PLACEHOLDER);
      fields.add(NULL_PLACEHOLDER);
    }

    return StringUtils.join(fields, "\t");
  }

  @Override
  public void print(Annotation doc, OutputStream target, Options options) throws IOException {
    PrintWriter writer = new PrintWriter(IOUtils.encodedOutputStreamWriter(target, options.encoding));

    // vv A bunch of nonsense to get tokens vv
    if (doc.get(CoreAnnotations.SentencesAnnotation.class) != null) {
      for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
        if (sentence.get(CoreAnnotations.TokensAnnotation.class) != null) {
          List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
          SemanticGraph depTree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
          for (int i = 0; i < tokens.size(); ++i) {
            // ^^ end nonsense to get tokens ^^

            // Newline if applicable
            if (i > 0) {
              writer.println();
            }

            // Try to get the incoming dependency edge
            int head = -1;
            String deprel = null;
            if (depTree != null) {
              Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());
              IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
              if (node != null) {
                List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
                if (!edgeList.isEmpty()) {
                  assert edgeList.size() == 1;
                  head = edgeList.get(0).getGovernor().index();
                  deprel = edgeList.get(0).getRelation().toString();
                } else if (rootSet.contains(i + 1)) {
                  head = 0;
                  deprel = "ROOT";
                }
              }
            }

            // Write the token
            writer.print(line(i + 1, tokens.get(i), head, deprel));
          }
        }
        writer.println();
        writer.println();
      }
    }
    writer.flush();
  }

  public static void conllPrint(Annotation annotation, OutputStream os) throws IOException {
    new CoNLLOutputter().print(annotation, os);
  }

  public static void conllPrint(Annotation annotation, OutputStream os, StanfordCoreNLP pipeline) throws IOException {
    new CoNLLOutputter().print(annotation, os, pipeline);
  }

  public static void conllPrint(Annotation annotation, OutputStream os, Options options) throws IOException {
    new CoNLLOutputter().print(annotation, os, options);
  }

}