CoNLLUDocumentWriter.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.trees.ud;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;

import java.io.PrintWriter;
import java.util.HashMap;
import java.util.regex.Pattern;

/**
 * @author Sebastian Schuster
 */
public class CoNLLUDocumentWriter {


    private static final String LRB_PATTERN = "(?i)-LRB-";
    private static final String RRB_PATTERN = "(?i)-RRB-";


    public String printSemanticGraph(SemanticGraph sg) {
        return printSemanticGraph(sg, true);
    }

    public String printSemanticGraph(SemanticGraph sg, boolean unescapeParenthesis) {


        StringBuilder sb = new StringBuilder();

        /* Print comments. */
        for (String comment : sg.getComments()) {
            sb.append(comment).append("\n");
        }

        for (IndexedWord token : sg.vertexListSorted()) {
            /* Check for multiword tokens. */
            if (token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
                IntPair tokenSpan = token.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
                if (tokenSpan.getSource() == token.index()) {
                    String range = String.format("%d-%d", tokenSpan.getSource(), tokenSpan.getTarget());
                    sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.originalText()));
                }
            }

            /* Try to find main governor and additional dependencies. */
            int govIdx = -1;
            GrammaticalRelation reln = null;
            HashMap<Integer, String> additionalDeps = new HashMap<>();
            for (IndexedWord parent : sg.getParents(token)) {
                SemanticGraphEdge edge = sg.getEdge(parent, token);
                if ( govIdx == -1 && ! edge.isExtra()) {
                    govIdx = parent.index();
                    reln = edge.getRelation();
                } else {
                    additionalDeps.put(parent.index(), edge.getRelation().toString());
                }
            }


            String additionalDepsString = CoNLLUUtils.toExtraDepsString(additionalDeps);
            String word = token.word();
            String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
            String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
            String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
            String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
            String lemma = token.getString(CoreAnnotations.LemmaAnnotation.class, "_");
            String relnName = reln == null ? "_" : reln.toString();

            /* Root. */
            if (govIdx == -1 && sg.getRoots().contains(token)) {
                govIdx = 0;
                relnName = GrammaticalRelation.ROOT.toString();
            }

            if (unescapeParenthesis) {
                word = word.replaceAll(LRB_PATTERN, "(");
                word = word.replaceAll(RRB_PATTERN, ")");
                lemma = lemma.replaceAll(LRB_PATTERN, "(");
                lemma = lemma.replaceAll(RRB_PATTERN, ")");
            }

            sb.append(String.format("%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s%n", token.index(), word,
                    lemma, upos, pos, featuresString, govIdx, relnName, additionalDepsString, misc));
        }
        sb.append("\n");

        return sb.toString();
    }

}