package edu.stanford.nlp.trees.ud;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.regex.Pattern;
/**
 * Renders a {@link SemanticGraph} as tab-separated text: any graph comments first, then one
 * ten-column line per token (preceded by a range line for multiword tokens), then a blank line.
 *
 * <p>All line breaks are emitted as a single {@code '\n'} and all integers are written without
 * locale-specific formatting, so the output does not depend on the platform line separator or
 * the default locale.
 *
 * @author Sebastian Schuster
 */
public class CoNLLUDocumentWriter {

  /** Matches the escaped left parenthesis {@code -LRB-}, case-insensitively. Compiled once. */
  private static final Pattern LRB_PATTERN = Pattern.compile("(?i)-LRB-");

  /** Matches the escaped right parenthesis {@code -RRB-}, case-insensitively. Compiled once. */
  private static final Pattern RRB_PATTERN = Pattern.compile("(?i)-RRB-");

  /**
   * Renders the graph with parenthesis unescaping enabled.
   *
   * @param sg the graph to render
   * @return the textual representation, terminated by an empty line
   */
  public String printSemanticGraph(SemanticGraph sg) {
    return printSemanticGraph(sg, true);
  }

  /**
   * Renders the graph as text.
   *
   * @param sg the graph to render
   * @param unescapeParenthesis if {@code true}, occurrences of {@code -LRB-} / {@code -RRB-}
   *     (any case) in the word and lemma columns are replaced by {@code (} / {@code )}
   * @return the textual representation, terminated by an empty line
   */
  public String printSemanticGraph(SemanticGraph sg, boolean unescapeParenthesis) {
    StringBuilder sb = new StringBuilder();

    /* Print comments. */
    for (String comment : sg.getComments()) {
      sb.append(comment).append('\n');
    }

    for (IndexedWord token : sg.vertexListSorted()) {
      appendMultiwordTokenLine(sb, token);

      /* Try to find main governor and additional dependencies. */
      int govIdx = -1;
      GrammaticalRelation reln = null;
      HashMap<Integer, String> additionalDeps = new HashMap<>();
      for (IndexedWord parent : sg.getParents(token)) {
        SemanticGraphEdge edge = sg.getEdge(parent, token);
        if (govIdx == -1 && !edge.isExtra()) {
          /* The first non-extra edge encountered supplies the governor columns. */
          govIdx = parent.index();
          reln = edge.getRelation();
        } else {
          /* Every other edge is reported in the additional-dependencies column. */
          additionalDeps.put(parent.index(), edge.getRelation().toString());
        }
      }

      String additionalDepsString = CoNLLUUtils.toExtraDepsString(additionalDeps);
      String word = token.word();
      String featuresString = CoNLLUUtils.toFeatureString(token.get(CoreAnnotations.CoNLLUFeats.class));
      /* NOTE(review): the "_" argument is presumably the fallback for a missing annotation. */
      String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class, "_");
      String upos = token.getString(CoreAnnotations.CoarseTagAnnotation.class, "_");
      String misc = token.getString(CoreAnnotations.CoNLLUMisc.class, "_");
      String lemma = token.getString(CoreAnnotations.LemmaAnnotation.class, "_");
      String relnName = reln == null ? "_" : reln.toString();

      /* Root: a token without a governor that is one of the graph's roots gets head 0. */
      if (govIdx == -1 && sg.getRoots().contains(token)) {
        govIdx = 0;
        relnName = GrammaticalRelation.ROOT.toString();
      }

      if (unescapeParenthesis) {
        word = unescapeParentheses(word);
        lemma = unescapeParentheses(lemma);
      }

      /* Append fields directly instead of String.format: no locale-dependent digits, no %n. */
      sb.append(token.index()).append('\t')
          .append(word).append('\t')
          .append(lemma).append('\t')
          .append(upos).append('\t')
          .append(pos).append('\t')
          .append(featuresString).append('\t')
          .append(govIdx).append('\t')
          .append(relnName).append('\t')
          .append(additionalDepsString).append('\t')
          .append(misc).append('\n');
    }
    sb.append('\n');

    return sb.toString();
  }

  /**
   * Appends the range line ({@code "first-last<TAB>text"} followed by eight {@code _} columns)
   * if the token carries a token-span annotation and is the first token of that span.
   */
  private static void appendMultiwordTokenLine(StringBuilder sb, IndexedWord token) {
    if (!token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
      return;
    }
    IntPair tokenSpan = token.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
    if (tokenSpan.getSource() == token.index()) {
      sb.append(tokenSpan.getSource()).append('-').append(tokenSpan.getTarget())
          .append('\t').append(token.originalText())
          .append("\t_\t_\t_\t_\t_\t_\t_\t_\n");
    }
  }

  /** Replaces {@code -LRB-} with {@code (} and {@code -RRB-} with {@code )}, ignoring case. */
  private static String unescapeParentheses(String text) {
    String unescaped = LRB_PATTERN.matcher(text).replaceAll("(");
    return RRB_PATTERN.matcher(unescaped).replaceAll(")");
  }
}