package edu.stanford.nlp.util; import edu.stanford.nlp.international.Language; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; import edu.stanford.nlp.trees.GrammaticalRelation; import javax.json.*; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.function.Function; import java.util.regex.Pattern; /** * A set of utilities for parsing TSV files into CoreMaps * * @author Gabor Angeli */ public class TSVUtils { static String unescapeSQL(String input) { // If the string is quoted if (input.startsWith("\"") && input.endsWith("\"")) { input = input.substring(1, input.length()-1); } return input.replace("\"\"","\"").replace("\\\\", "\\"); } /** * Parse an SQL array. * @param array The array to parse. * @return The parsed array, as a list. */ public static List<String> parseArray(String array) { array = unescapeSQL(array); if (array.startsWith("{") && array.endsWith("}")) array = array.substring(1, array.length()-1); char[] input = array.toCharArray(); List<String> output = new ArrayList<>(); StringBuilder elem = new StringBuilder(); boolean inQuotes = false; boolean escaped = false; for (char c : input) { if (escaped) { elem.append(c); escaped = false; } else if (c == '"') { inQuotes = !inQuotes; escaped = false; } else if (c == '\\') { escaped = true; } else { if (inQuotes) { elem.append(c); } else if (c == ',') { output.add(elem.toString()); elem.setLength(0); // This is basically .clear() } else { elem.append(c); } escaped = false; } } if (elem.length() > 0) { output.add(elem.toString()); } return output; } private static final Pattern newline = Pattern.compile("\\\\n"); private static final Pattern tab = Pattern.compile("\\\\t"); /** * Parse a CoNLL formatted tree into a SemanticGraph. * @param conll The CoNLL tree to parse. * @param tokens The tokens of the sentence, to form the backing labels of the tree. * @return A semantic graph of the sentence, according to the given tree. */ public static SemanticGraph parseTree(String conll, List<CoreLabel> tokens) { SemanticGraph tree = new SemanticGraph(); if (conll == null || conll.isEmpty()) { return tree; } String[] treeLines = newline.split(conll); IndexedWord[] vertices = new IndexedWord[tokens.size() + 2]; // Add edges for (String line : treeLines) { // Parse row String[] fields = tab.split(line); int dependentIndex = Integer.parseInt(fields[0]); if (vertices[dependentIndex] == null) { if (dependentIndex > tokens.size()) { // Bizarre mismatch in sizes; the malt parser seems to do this often return new SemanticGraph(); } vertices[dependentIndex] = new IndexedWord(tokens.get(dependentIndex - 1)); } IndexedWord dependent = vertices[dependentIndex]; int governorIndex = Integer.parseInt(fields[1]); if (governorIndex > tokens.size()) { // Bizarre mismatch in sizes; the malt parser seems to do this often return new SemanticGraph(); } if (vertices[governorIndex] == null && governorIndex > 0) { vertices[governorIndex] = new IndexedWord(tokens.get(governorIndex - 1)); } IndexedWord governor = vertices[governorIndex]; String relation = fields[2]; // Process row if (governorIndex == 0) { tree.addRoot(dependent); } else { tree.addVertex(dependent); if (!tree.containsVertex(governor)) { tree.addVertex(governor); } if (!"ref".equals(relation)) { tree.addEdge(governor, dependent, GrammaticalRelation.valueOf(Language.English, relation), Double.NEGATIVE_INFINITY, false); } } } return tree; } /** * Parse a JSON formatted tree into a SemanticGraph. * @param jsonString The JSON string tree to parse, e.g: * "[{\"\"dependent\"\": 7, \"\"dep\"\": \"\"root\"\", \"\"governorgloss\"\": \"\"root\"\", \"\"governor\"\": 0, \"\"dependentgloss\"\": \"\"sport\"\"}, {\"\"dependent\"\": 1, \"\"dep\"\": \"\"nsubj\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"chess\"\"}, {\"\"dependent\"\": 2, \"\"dep\"\": \"\"cop\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"is\"\"}, {\"\"dependent\"\": 3, \"\"dep\"\": \"\"neg\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"not\"\"}, {\"\"dependent\"\": 4, \"\"dep\"\": \"\"det\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"a\"\"}, {\"\"dependent\"\": 5, \"\"dep\"\": \"\"advmod\"\", \"\"governorgloss\"\": \"\"physical\"\", \"\"governor\"\": 6, \"\"dependentgloss\"\": \"\"predominantly\"\"}, {\"\"dependent\"\": 6, \"\"dep\"\": \"\"amod\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"physical\"\"}, {\"\"dependent\"\": 9, \"\"dep\"\": \"\"advmod\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"yet\"\"}, {\"\"dependent\"\": 10, \"\"dep\"\": \"\"nsubj\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"neither\"\"}, {\"\"dependent\"\": 11, \"\"dep\"\": \"\"cop\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"are\"\"}, {\"\"dependent\"\": 12, \"\"dep\"\": \"\"parataxis\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"shooting\"\"}, {\"\"dependent\"\": 13, \"\"dep\"\": \"\"cc\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"and\"\"}, {\"\"dependent\"\": 14, \"\"dep\"\": \"\"parataxis\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"curling\"\"}, {\"\"dependent\"\": 14, \"\"dep\"\": \"\"conj:and\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"curling\"\"}, {\"\"dependent\"\": 16, \"\"dep\"\": \"\"nsubjpass\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"which\"\"}, {\"\"dependent\"\": 18, \"\"dep\"\": \"\"case\"\", \"\"governorgloss\"\": \"\"fact\"\", \"\"governor\"\": 19, \"\"dependentgloss\"\": \"\"in\"\"}, {\"\"dependent\"\": 19, \"\"dep\"\": \"\"nmod:in\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"fact\"\"}, {\"\"dependent\"\": 21, \"\"dep\"\": \"\"aux\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"has\"\"}, {\"\"dependent\"\": 22, \"\"dep\"\": \"\"auxpass\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"been\"\"}, {\"\"dependent\"\": 23, \"\"dep\"\": \"\"dep\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"nicknamed\"\"}, {\"\"dependent\"\": 25, \"\"dep\"\": \"\"dobj\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"chess\"\"}, {\"\"dependent\"\": 26, \"\"dep\"\": \"\"case\"\", \"\"governorgloss\"\": \"\"ice\"\", \"\"governor\"\": 27, \"\"dependentgloss\"\": \"\"on\"\"}, {\"\"dependent\"\": 27, \"\"dep\"\": \"\"nmod:on\"\", \"\"governorgloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentgloss\"\": \"\"ice\"\"}, {\"\"dependent\"\": 29, \"\"dep\"\": \"\"amod\"\", \"\"governorgloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentgloss\"\": \"\"5\"\"}]"); * @param tokens The tokens of the sentence, to form the backing labels of the tree. * @return A semantic graph of the sentence, according to the given tree. */ public static SemanticGraph parseJsonTree(String jsonString, List<CoreLabel> tokens) { // Escape quoted string parts JsonReader json = Json.createReader(new StringReader(jsonString)); SemanticGraph tree = new SemanticGraph(); JsonArray array = json.readArray(); if (array == null || array.isEmpty()) { return tree; } IndexedWord[] vertices = new IndexedWord[tokens.size() + 2]; // Add edges for(int i = 0; i < array.size(); i++) { JsonObject entry = array.getJsonObject(i); // Parse row int dependentIndex = entry.getInt("dependent"); if (vertices[dependentIndex] == null) { if (dependentIndex > tokens.size()) { // Bizarre mismatch in sizes; the malt parser seems to do this often return new SemanticGraph(); } vertices[dependentIndex] = new IndexedWord(tokens.get(dependentIndex - 1)); } IndexedWord dependent = vertices[dependentIndex]; int governorIndex = entry.getInt("governor"); if (governorIndex > tokens.size()) { // Bizarre mismatch in sizes; the malt parser seems to do this often return new SemanticGraph(); } if (vertices[governorIndex] == null && governorIndex > 0) { vertices[governorIndex] = new IndexedWord(tokens.get(governorIndex - 1)); } IndexedWord governor = vertices[governorIndex]; String relation = entry.getString("dep"); // Process row if (governorIndex == 0) { tree.addRoot(dependent); } else { tree.addVertex(dependent); if (!tree.containsVertex(governor)) { tree.addVertex(governor); } if (!"ref".equals(relation)) { tree.addEdge(governor, dependent, GrammaticalRelation.valueOf(Language.English, relation), Double.NEGATIVE_INFINITY, false); } } } return tree; } /** Create an Annotation object (with a single sentence) from the given specification */ private static Annotation parseSentence(Optional<String> docid, Optional<Integer> sentenceIndex, String gloss, Function<List<CoreLabel>,SemanticGraph> tree, Function<List<CoreLabel>,SemanticGraph> maltTree, List<String> words, List<String> lemmas, List<String> pos, List<String> ner, Optional<String> sentenceid) { // Error checks if (lemmas.size() != words.size()) { throw new IllegalArgumentException("Array lengths don't match: " + words.size() + " vs " + lemmas.size() + " (sentence " + sentenceid.orElse("???") +")"); } if (pos.size() != words.size()) { throw new IllegalArgumentException("Array lengths don't match: " + words.size() + " vs " + pos.size() + " (sentence " + sentenceid.orElse("???") +")"); } if (ner.size() != words.size()) { throw new IllegalArgumentException("Array lengths don't match: " + words.size() + " vs " + ner.size() + " (sentence " + sentenceid.orElse("???") +")"); } // Create structure List<CoreLabel> tokens = new ArrayList<>(words.size()); int beginChar = 0; for (int i = 0; i < words.size(); ++i) { CoreLabel token = new CoreLabel(12); token.setWord(words.get(i)); token.setValue(words.get(i)); token.setBeginPosition(beginChar); token.setEndPosition(beginChar + words.get(i).length()); beginChar += words.get(i).length() + 1; token.setLemma(lemmas.get(i)); token.setTag(pos.get(i)); token.setNER(ner.get(i)); token.set(CoreAnnotations.DocIDAnnotation.class, docid.orElse("???")); token.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1)); token.set(CoreAnnotations.IndexAnnotation.class, i + 1); token.set(CoreAnnotations.TokenBeginAnnotation.class, i); token.set(CoreAnnotations.TokenEndAnnotation.class, i + 1); tokens.add(token); } gloss = gloss.replace("\\n", "\n").replace("\\t", "\t"); CoreMap sentence = new ArrayCoreMap(16); sentence.set(CoreAnnotations.TokensAnnotation.class, tokens); SemanticGraph graph = tree.apply(tokens); sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph); sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph); sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph); SemanticGraph maltGraph = maltTree.apply(tokens); sentence.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, maltGraph); sentence.set(CoreAnnotations.DocIDAnnotation.class, docid.orElse("???")); sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1)); sentence.set(CoreAnnotations.TextAnnotation.class, gloss); sentence.set(CoreAnnotations.TokenBeginAnnotation.class, 0); sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokens.size()); Annotation doc = new Annotation(gloss); doc.set(CoreAnnotations.TokensAnnotation.class, tokens); doc.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence)); doc.set(CoreAnnotations.DocIDAnnotation.class, docid.orElse("???")); doc.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1)); return doc; } /** Create an Annotation object (with a single sentence) from the given specification, as Postgres would output them */ public static Annotation parseSentence(Optional<String> docid, Optional<String> sentenceIndex, String gloss, String dependencies, String maltDependencies, String words, String lemmas, String posTags, String nerTags, Optional<String> sentenceid) { return parseSentence(docid, sentenceIndex.map(Integer::parseInt), gloss, tokens -> parseTree(dependencies, tokens), tokens -> parseTree(maltDependencies, tokens), parseArray(words), parseArray(lemmas), parseArray(posTags), parseArray(nerTags), sentenceid); } }