package edu.stanford.nlp.util;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.GrammaticalRelation;
import javax.json.*;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.regex.Pattern;
/**
 * A set of utilities for parsing TSV files into CoreMaps.
 *
 * <p>Fields are expected in the form a SQL dump produces them: a field may be wrapped in double
 * quotes with embedded quotes doubled, arrays are written as {@code {a,b,c}}, and dependency
 * trees are given either as CoNLL-like rows or as a JSON array of edges.</p>
 *
 * @author Gabor Angeli
 */
public class TSVUtils {

  /**
   * Matches the literal two-character sequence backslash + {@code n} (not a real line break);
   * this is what separates the rows of a CoNLL tree in {@link #parseTree(String, List)}.
   */
  private static final Pattern ESCAPED_NEWLINE = Pattern.compile("\\\\n");

  /**
   * Matches the literal two-character sequence backslash + {@code t} (not a real tab);
   * this is what separates the fields of a CoNLL row in {@link #parseTree(String, List)}.
   */
  private static final Pattern ESCAPED_TAB = Pattern.compile("\\\\t");

  /**
   * Undo SQL-style quoting of a single field: strip one pair of enclosing double quotes if present,
   * collapse every doubled quote into a single quote, and collapse every doubled backslash into a
   * single backslash.
   *
   * @param input The raw field. Must not be null.
   * @return The unescaped field.
   */
  static String unescapeSQL(String input) {
    // Require at least two characters: a lone '"' is both the first and the last character,
    // and stripping "both" quotes from it would ask for substring(1, 0).
    if (input.length() >= 2 && input.startsWith("\"") && input.endsWith("\"")) {
      input = input.substring(1, input.length() - 1);
    }
    return input.replace("\"\"", "\"").replace("\\\\", "\\");
  }

  /**
   * Parse an SQL array.
   *
   * <p>The input is first passed through {@link #unescapeSQL(String)}, then one pair of enclosing
   * braces is removed, and the remainder is split on commas that are not inside double quotes.
   * A backslash makes the following character literal. Quote characters and escaping backslashes
   * are not part of the returned elements.</p>
   *
   * <p>Empty elements are preserved, including an explicitly quoted empty element in last position.
   * An unquoted empty tail (a trailing comma, or a completely empty array) contributes no element.</p>
   *
   * @param array The array to parse. Must not be null.
   * @return The parsed array, as a list.
   */
  public static List<String> parseArray(String array) {
    array = unescapeSQL(array);
    if (array.startsWith("{") && array.endsWith("}")) {
      array = array.substring(1, array.length() - 1);
    }
    List<String> output = new ArrayList<>();
    StringBuilder elem = new StringBuilder();
    boolean inQuotes = false;
    boolean escaped = false;
    // True once the pending element has seen a complete "..." pair, so that an explicitly
    // quoted empty string at the very end is not mistaken for "no element".
    boolean quoted = false;
    for (int i = 0; i < array.length(); i++) {
      char c = array.charAt(i);
      if (escaped) {
        elem.append(c);
        escaped = false;
      } else if (c == '"') {
        inQuotes = !inQuotes;
        if (!inQuotes) {
          quoted = true;
        }
      } else if (c == '\\') {
        escaped = true;
      } else if (c == ',' && !inQuotes) {
        output.add(elem.toString());
        elem.setLength(0); // This is basically .clear()
        quoted = false;
      } else {
        elem.append(c);
      }
    }
    if (elem.length() > 0 || quoted) {
      output.add(elem.toString());
    }
    return output;
  }

  /**
   * Return the vertex for the given 1-based token index, creating and caching it on first use.
   * The caller is responsible for having checked that {@code 1 <= index <= tokens.size()}.
   */
  private static IndexedWord vertexAt(IndexedWord[] vertices, List<CoreLabel> tokens, int index) {
    if (vertices[index] == null) {
      vertices[index] = new IndexedWord(tokens.get(index - 1));
    }
    return vertices[index];
  }

  /** Whether {@code index} lies in the closed interval {@code [min, tokens.size()]}. */
  private static boolean inRange(int index, int min, List<CoreLabel> tokens) {
    return index >= min && index <= tokens.size();
  }

  /**
   * Add one dependency row to the tree. Both indices must already have been range-checked.
   * A governor index of 0 marks the dependent as a root; a relation named "ref" adds the two
   * vertices but no edge between them.
   */
  private static void addDependency(SemanticGraph tree, IndexedWord[] vertices, List<CoreLabel> tokens,
                                    int dependentIndex, int governorIndex, String relation) {
    IndexedWord dependent = vertexAt(vertices, tokens, dependentIndex);
    if (governorIndex == 0) {
      tree.addRoot(dependent);
      return;
    }
    IndexedWord governor = vertexAt(vertices, tokens, governorIndex);
    tree.addVertex(dependent);
    if (!tree.containsVertex(governor)) {
      tree.addVertex(governor);
    }
    if (!"ref".equals(relation)) {
      tree.addEdge(governor, dependent,
          GrammaticalRelation.valueOf(Language.English, relation),
          Double.NEGATIVE_INFINITY, false);
    }
  }

  /**
   * Parse a CoNLL formatted tree into a SemanticGraph.
   *
   * <p>Rows are separated by the literal characters backslash-n and the fields of a row by the
   * literal characters backslash-t. The first three fields of each row are read as: the 1-based
   * index of the dependent, the 1-based index of the governor (0 for the root), and the relation
   * name.</p>
   *
   * @param conll The CoNLL tree to parse. Null or empty yields an empty graph.
   * @param tokens The tokens of the sentence, to form the backing labels of the tree.
   * @return A semantic graph of the sentence, according to the given tree. If any row refers to
   *         a token index outside the sentence, an empty graph is returned instead.
   * @throws NumberFormatException If an index field is not an integer.
   */
  public static SemanticGraph parseTree(String conll, List<CoreLabel> tokens) {
    SemanticGraph tree = new SemanticGraph();
    if (conll == null || conll.isEmpty()) {
      return tree;
    }
    String[] treeLines = ESCAPED_NEWLINE.split(conll);
    // Slot 0 is unused so that the array can be addressed with 1-based token indices.
    IndexedWord[] vertices = new IndexedWord[tokens.size() + 1];
    // Add edges
    for (String line : treeLines) {
      // Parse row
      String[] fields = ESCAPED_TAB.split(line);
      int dependentIndex = Integer.parseInt(fields[0]);
      if (!inRange(dependentIndex, 1, tokens)) {
        // Bizarre mismatch in sizes; the malt parser seems to do this often
        return new SemanticGraph();
      }
      int governorIndex = Integer.parseInt(fields[1]);
      if (!inRange(governorIndex, 0, tokens)) {
        // Bizarre mismatch in sizes; the malt parser seems to do this often
        return new SemanticGraph();
      }
      String relation = fields[2];
      // Process row
      addDependency(tree, vertices, tokens, dependentIndex, governorIndex, relation);
    }
    return tree;
  }

  /**
   * Parse a JSON formatted tree into a SemanticGraph.
   *
   * <p>The string is handed to the JSON parser as-is (no SQL unescaping is applied here) and must
   * be an array of objects. From each object only the integer keys {@code dependent} and
   * {@code governor} (1-based token indices, governor 0 for the root) and the string key
   * {@code dep} (the relation name) are read; any other keys are ignored. For example:</p>
   *
   * <pre>{@code
   * [{"dependent": 7, "dep": "root",  "governorgloss": "root",  "governor": 0, "dependentgloss": "sport"},
   *  {"dependent": 1, "dep": "nsubj", "governorgloss": "sport", "governor": 7, "dependentgloss": "chess"},
   *  {"dependent": 2, "dep": "cop",   "governorgloss": "sport", "governor": 7, "dependentgloss": "is"}]
   * }</pre>
   *
   * @param jsonString The JSON string tree to parse. Null or empty yields an empty graph.
   * @param tokens The tokens of the sentence, to form the backing labels of the tree.
   * @return A semantic graph of the sentence, according to the given tree. If any entry refers to
   *         a token index outside the sentence, an empty graph is returned instead.
   */
  public static SemanticGraph parseJsonTree(String jsonString, List<CoreLabel> tokens) {
    SemanticGraph tree = new SemanticGraph();
    if (jsonString == null || jsonString.isEmpty()) {
      return tree;
    }
    JsonArray array;
    try (JsonReader json = Json.createReader(new StringReader(jsonString))) {
      array = json.readArray();
    }
    if (array == null || array.isEmpty()) {
      return tree;
    }
    // Slot 0 is unused so that the array can be addressed with 1-based token indices.
    IndexedWord[] vertices = new IndexedWord[tokens.size() + 1];
    // Add edges
    for (int i = 0; i < array.size(); i++) {
      JsonObject entry = array.getJsonObject(i);
      // Parse row
      int dependentIndex = entry.getInt("dependent");
      if (!inRange(dependentIndex, 1, tokens)) {
        // Bizarre mismatch in sizes; the malt parser seems to do this often
        return new SemanticGraph();
      }
      int governorIndex = entry.getInt("governor");
      if (!inRange(governorIndex, 0, tokens)) {
        // Bizarre mismatch in sizes; the malt parser seems to do this often
        return new SemanticGraph();
      }
      String relation = entry.getString("dep");
      // Process row
      addDependency(tree, vertices, tokens, dependentIndex, governorIndex, relation);
    }
    return tree;
  }

  /** Throw if {@code other} does not have exactly one entry per word. */
  private static void requireSameLength(List<String> words, List<String> other, Optional<String> sentenceid) {
    if (other.size() != words.size()) {
      throw new IllegalArgumentException("Array lengths don't match: " + words.size() + " vs " + other.size() + " (sentence " + sentenceid.orElse("???") +")");
    }
  }

  /**
   * Create an Annotation object (with a single sentence) from the given specification.
   *
   * <p>Token character offsets are synthesised by assuming exactly one character between
   * consecutive words; they are not derived from {@code gloss}. The graph produced by
   * {@code tree} is stored under the basic, collapsed and CC-processed dependency keys alike;
   * the graph produced by {@code maltTree} is stored as the alternative dependencies.</p>
   *
   * @throws IllegalArgumentException If lemmas, pos or ner differ in length from words.
   */
  private static Annotation parseSentence(Optional<String> docid, Optional<Integer> sentenceIndex, String gloss,
                                          Function<List<CoreLabel>,SemanticGraph> tree,
                                          Function<List<CoreLabel>,SemanticGraph> maltTree,
                                          List<String> words, List<String> lemmas, List<String> pos, List<String> ner,
                                          Optional<String> sentenceid) {
    // Error checks
    requireSameLength(words, lemmas, sentenceid);
    requireSameLength(words, pos, sentenceid);
    requireSameLength(words, ner, sentenceid);

    String docidOrDefault = docid.orElse("???");
    Integer sentenceIndexOrDefault = sentenceIndex.orElse(-1);

    // Create structure
    List<CoreLabel> tokens = new ArrayList<>(words.size());
    int beginChar = 0;
    for (int i = 0; i < words.size(); ++i) {
      String word = words.get(i);
      CoreLabel token = new CoreLabel(12);
      token.setWord(word);
      token.setValue(word);
      token.setBeginPosition(beginChar);
      token.setEndPosition(beginChar + word.length());
      beginChar += word.length() + 1;
      token.setLemma(lemmas.get(i));
      token.setTag(pos.get(i));
      token.setNER(ner.get(i));
      token.set(CoreAnnotations.DocIDAnnotation.class, docidOrDefault);
      token.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndexOrDefault);
      token.set(CoreAnnotations.IndexAnnotation.class, i + 1);
      token.set(CoreAnnotations.TokenBeginAnnotation.class, i);
      token.set(CoreAnnotations.TokenEndAnnotation.class, i + 1);
      tokens.add(token);
    }

    // The gloss carries line breaks and tabs as literal backslash escapes; restore them.
    gloss = gloss.replace("\\n", "\n").replace("\\t", "\t");

    CoreMap sentence = new ArrayCoreMap(16);
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    SemanticGraph graph = tree.apply(tokens);
    sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph);
    SemanticGraph maltGraph = maltTree.apply(tokens);
    sentence.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, maltGraph);
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docidOrDefault);
    sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndexOrDefault);
    sentence.set(CoreAnnotations.TextAnnotation.class, gloss);
    sentence.set(CoreAnnotations.TokenBeginAnnotation.class, 0);
    sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokens.size());

    Annotation doc = new Annotation(gloss);
    doc.set(CoreAnnotations.TokensAnnotation.class, tokens);
    doc.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
    doc.set(CoreAnnotations.DocIDAnnotation.class, docidOrDefault);
    doc.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndexOrDefault);
    return doc;
  }

  /**
   * Create an Annotation object (with a single sentence) from the given specification, as Postgres would output them.
   *
   * @param docid The document id; "???" is used when absent.
   * @param sentenceIndex The sentence index as a decimal string; -1 is used when absent.
   * @param gloss The sentence text, with line breaks and tabs written as backslash escapes.
   * @param dependencies The dependency tree, in the format accepted by {@link #parseTree(String, List)}.
   * @param maltDependencies The alternative dependency tree, in the same format.
   * @param words The words of the sentence, in the format accepted by {@link #parseArray(String)}.
   * @param lemmas The lemmas, one per word, in the same array format.
   * @param posTags The part-of-speech tags, one per word, in the same array format.
   * @param nerTags The named-entity tags, one per word, in the same array format.
   * @param sentenceid An identifier used only in error messages; "???" is used when absent.
   * @return A document annotation holding the tokens and the single sentence.
   * @throws NumberFormatException If {@code sentenceIndex} is present but not an integer.
   * @throws IllegalArgumentException If the lemma, tag or NER arrays differ in length from the words.
   */
  public static Annotation parseSentence(Optional<String> docid, Optional<String> sentenceIndex,
                                         String gloss, String dependencies, String maltDependencies,
                                         String words, String lemmas, String posTags, String nerTags,
                                         Optional<String> sentenceid) {
    return parseSentence(docid, sentenceIndex.map(Integer::parseInt), gloss,
        tokens -> parseTree(dependencies, tokens),
        tokens -> parseTree(maltDependencies, tokens),
        parseArray(words), parseArray(lemmas), parseArray(posTags), parseArray(nerTags), sentenceid);
  }
}