package edu.stanford.nlp.pipeline; import edu.stanford.nlp.coref.CorefCoreAnnotations; import edu.stanford.nlp.coref.data.CorefChain; import edu.stanford.nlp.ie.machinereading.structure.Span; import edu.stanford.nlp.ie.util.RelationTriple; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.StringOutputStream; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.naturalli.NaturalLogicAnnotations; import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; import edu.stanford.nlp.time.TimeAnnotations; import edu.stanford.nlp.time.Timex; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeCoreAnnotations; import edu.stanford.nlp.trees.TreePrint; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.Pointer; import java.io.*; import java.text.DecimalFormat; import java.util.*; import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.Stream; /** * Output an Annotation to human readable JSON. * This is not a lossless operation; for more strict serialization, * see {@link edu.stanford.nlp.pipeline.AnnotationSerializer}; e.g., * {@link edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer}. * * @author Gabor Angeli */ @SuppressWarnings("unused") public class JSONOutputter extends AnnotationOutputter { protected static final String INDENT_CHAR = " "; public static String cleanJSON(String s) { return s .replace("\\", "\\\\") .replace("\b", "\\b") .replace("\f", "\\f") .replace("\n", "\\n") .replace("\r", "\\r") .replace("\t", "\\t") .replace("\"", "\\\""); } /** {@inheritDoc} */ @SuppressWarnings("RedundantCast") // It's lying; we need the "redundant" casts (as of 2014-09-08) @Override public void print(Annotation doc, OutputStream target, Options options) throws IOException { PrintWriter writer = new PrintWriter(IOUtils.encodedOutputStreamWriter(target, options.encoding)); JSONWriter l0 = new JSONWriter(writer, options); l0.object(l1 -> { // Add annotations attached to a Document l1.set("docId", doc.get(CoreAnnotations.DocIDAnnotation.class)); l1.set("docDate", doc.get(CoreAnnotations.DocDateAnnotation.class)); l1.set("docSourceType", doc.get(CoreAnnotations.DocSourceTypeAnnotation.class)); l1.set("docType", doc.get(CoreAnnotations.DocTypeAnnotation.class)); l1.set("author", doc.get(CoreAnnotations.AuthorAnnotation.class)); l1.set("location", doc.get(CoreAnnotations.LocationAnnotation.class)); if (options.includeText) { l1.set("text", doc.get(CoreAnnotations.TextAnnotation.class)); } // Add sentences if (doc.get(CoreAnnotations.SentencesAnnotation.class) != null) { l1.set("sentences", doc.get(CoreAnnotations.SentencesAnnotation.class).stream().map(sentence -> (Consumer<Writer>) (Writer l2) -> { // Add a single sentence // (metadata) l2.set("id", sentence.get(CoreAnnotations.SentenceIDAnnotation.class)); l2.set("index", sentence.get(CoreAnnotations.SentenceIndexAnnotation.class)); l2.set("line", sentence.get(CoreAnnotations.LineNumberAnnotation.class)); // (constituency tree) StringWriter treeStrWriter = new StringWriter(); TreePrint treePrinter = options.constituentTreePrinter; if (treePrinter == AnnotationOutputter.DEFAULT_CONSTITUENT_TREE_PRINTER) { // note the '==' -- we're overwriting the default, but only if it was not explicitly set otherwise treePrinter = new TreePrint("oneline"); } treePrinter.printTree(sentence.get(TreeCoreAnnotations.TreeAnnotation.class), new PrintWriter(treeStrWriter, true)); String treeStr = treeStrWriter.toString().trim(); // strip the trailing newline if (!"SENTENCE_SKIPPED_OR_UNPARSABLE".equals(treeStr)) { l2.set("parse", treeStr); } // (dependency trees) l2.set("basicDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class))); l2.set("enhancedDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class))); l2.set("enhancedPlusPlusDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class))); // (sentiment) Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class); if (sentimentTree != null) { int sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree); String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class); l2.set("sentimentValue", Integer.toString(sentiment)); l2.set("sentiment", sentimentClass.replaceAll(" ", "")); } // (openie) Collection<RelationTriple> openIETriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class); if (openIETriples != null) { l2.set("openie", openIETriples.stream().map(triple -> (Consumer<Writer>) (Writer tripleWriter) -> { tripleWriter.set("subject", triple.subjectGloss()); tripleWriter.set("subjectSpan", Span.fromPair(triple.subjectTokenSpan())); tripleWriter.set("relation", triple.relationGloss()); tripleWriter.set("relationSpan", Span.fromPair(triple.relationTokenSpan())); tripleWriter.set("object", triple.objectGloss()); tripleWriter.set("objectSpan", Span.fromPair(triple.objectTokenSpan())); })); } // (kbp) Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class); if (kbpTriples != null) { l2.set("kbp", kbpTriples.stream().map(triple -> (Consumer<Writer>) (Writer tripleWriter) -> { tripleWriter.set("subject", triple.subjectGloss()); tripleWriter.set("subjectSpan", Span.fromPair(triple.subjectTokenSpan())); tripleWriter.set("relation", triple.relationGloss()); tripleWriter.set("relationSpan", Span.fromPair(triple.relationTokenSpan())); tripleWriter.set("object", triple.objectGloss()); tripleWriter.set("objectSpan", Span.fromPair(triple.objectTokenSpan())); })); } // (entity mentions) if (sentence.get(CoreAnnotations.MentionsAnnotation.class) != null) { Integer sentTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class); l2.set("entitymentions", sentence.get(CoreAnnotations.MentionsAnnotation.class).stream().map(m -> (Consumer<Writer>) (Writer l3) -> { Integer tokenBegin = m.get(CoreAnnotations.TokenBeginAnnotation.class); Integer tokenEnd = m.get(CoreAnnotations.TokenEndAnnotation.class); l3.set("docTokenBegin", tokenBegin); l3.set("docTokenEnd", tokenEnd); if (tokenBegin != null && sentTokenBegin != null) { l3.set("tokenBegin", tokenBegin - sentTokenBegin); } if (tokenEnd != null && sentTokenBegin != null) { l3.set("tokenEnd", tokenEnd - sentTokenBegin); } l3.set("text", m.get(CoreAnnotations.TextAnnotation.class)); //l3.set("originalText", m.get(CoreAnnotations.OriginalTextAnnotation.class)); //l3.set("lemma", m.get(CoreAnnotations.LemmaAnnotation.class)); l3.set("characterOffsetBegin", m.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); l3.set("characterOffsetEnd", m.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); //l3.set("pos", m.get(CoreAnnotations.PartOfSpeechAnnotation.class)); l3.set("ner", m.get(CoreAnnotations.NamedEntityTagAnnotation.class)); l3.set("normalizedNER", m.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class)); l3.set("entitylink", m.get(CoreAnnotations.WikipediaEntityAnnotation.class)); // Timex Timex time = m.get(TimeAnnotations.TimexAnnotation.class); if (time != null) { Timex.Range range = time.range(); l3.set("timex", (Consumer<Writer>) l4 -> { l4.set("tid", time.tid()); l4.set("type", time.timexType()); l4.set("value", time.value()); l4.set("altValue", time.altVal()); l4.set("range", (range != null)? (Consumer<Writer>) l5 -> { l5.set("begin", range.begin); l5.set("end", range.end); l5.set("duration", range.duration); } : null); }); } })); } // (add tokens) if (sentence.get(CoreAnnotations.TokensAnnotation.class) != null) { l2.set("tokens", sentence.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> (Consumer<Writer>) (Writer l3) -> { // Add a single token l3.set("index", token.index()); l3.set("word", token.word()); l3.set("originalText", token.originalText()); l3.set("lemma", token.lemma()); l3.set("characterOffsetBegin", token.beginPosition()); l3.set("characterOffsetEnd", token.endPosition()); l3.set("pos", token.tag()); l3.set("ner", token.ner()); l3.set("normalizedNER", token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class)); l3.set("speaker", token.get(CoreAnnotations.SpeakerAnnotation.class)); l3.set("truecase", token.get(CoreAnnotations.TrueCaseAnnotation.class)); l3.set("truecaseText", token.get(CoreAnnotations.TrueCaseTextAnnotation.class)); l3.set("before", token.get(CoreAnnotations.BeforeAnnotation.class)); l3.set("after", token.get(CoreAnnotations.AfterAnnotation.class)); l3.set("entitylink", token.get(CoreAnnotations.WikipediaEntityAnnotation.class)); // Timex Timex time = token.get(TimeAnnotations.TimexAnnotation.class); if (time != null) { Timex.Range range = time.range(); l3.set("timex", (Consumer<Writer>) l4 -> { l4.set("tid", time.tid()); l4.set("type", time.timexType()); l4.set("value", time.value()); l4.set("altValue", time.altVal()); l4.set("range", (range != null)? (Consumer<Writer>) l5 -> { l5.set("begin", range.begin); l5.set("end", range.end); l5.set("duration", range.duration); } : null); }); } })); } })); } else { if (doc.get(CoreAnnotations.TokensAnnotation.class) != null) { l1.set("tokens", doc.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> (Consumer<Writer>) (Writer l2) -> { l2.set("index", token.index()); l2.set("word", token.word()); l2.set("originalText", token.originalText()); l2.set("characterOffsetBegin", token.beginPosition()); l2.set("characterOffsetEnd", token.endPosition()); })); } } // Add coref values if (doc.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null) { Map<Integer, CorefChain> corefChains = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class); if (corefChains != null) { l1.set("corefs", (Consumer<Writer>) chainWriter -> { for (CorefChain chain : corefChains.values()) { CorefChain.CorefMention representative = chain.getRepresentativeMention(); chainWriter.set(Integer.toString(chain.getChainID()), chain.getMentionsInTextualOrder().stream().map(mention -> (Consumer<Writer>) (Writer mentionWriter) -> { mentionWriter.set("id", mention.mentionID); mentionWriter.set("text", mention.mentionSpan); mentionWriter.set("type", mention.mentionType); mentionWriter.set("number", mention.number); mentionWriter.set("gender", mention.gender); mentionWriter.set("animacy", mention.animacy); mentionWriter.set("startIndex", mention.startIndex); mentionWriter.set("endIndex", mention.endIndex); mentionWriter.set("headIndex", mention.headIndex); mentionWriter.set("sentNum", mention.sentNum); mentionWriter.set("position", Arrays.stream(mention.position.elems()).boxed().collect(Collectors.toList())); mentionWriter.set("isRepresentativeMention", mention == representative); })); } }); } } // quotes if (doc.get(CoreAnnotations.QuotationsAnnotation.class) != null) { List<CoreMap> quotes = QuoteAnnotator.gatherQuotes(doc); l1.set("quotes", quotes.stream().map(quote -> (Consumer<Writer>) (Writer l2) -> { l2.set("id", quote.get(CoreAnnotations.QuotationIndexAnnotation.class)); l2.set("text", quote.get(CoreAnnotations.TextAnnotation.class)); l2.set("beginIndex", quote.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); l2.set("endIndex", quote.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); l2.set("beginToken", quote.get(CoreAnnotations.TokenBeginAnnotation.class)); l2.set("endToken", quote.get(CoreAnnotations.TokenEndAnnotation.class)); l2.set("beginSentence", quote.get(CoreAnnotations.SentenceBeginAnnotation.class)); l2.set("endSentence", quote.get(CoreAnnotations.SentenceEndAnnotation.class)); })); } }); l0.flush(); // flush } /** * Convert a dependency graph to a format expected as input to {@link Writer#set(String, Object)}. */ @SuppressWarnings("RedundantCast") // It's lying; we need the "redundant" casts (as of 2014-09-08) private static Object buildDependencyTree(SemanticGraph graph) { if(graph != null) { return Stream.concat( // Roots graph.getRoots().stream().map( (IndexedWord root) -> (Consumer<Writer>) dep -> { dep.set("dep", "ROOT"); dep.set("governor", 0); dep.set("governorGloss", "ROOT"); dep.set("dependent", root.index()); dep.set("dependentGloss", root.word()); }), // Regular edges graph.edgeListSorted().stream().map( (SemanticGraphEdge edge) -> (Consumer<Writer>) (Writer dep) -> { dep.set("dep", edge.getRelation().toString()); dep.set("governor", edge.getGovernor().index()); dep.set("governorGloss", edge.getGovernor().word()); dep.set("dependent", edge.getDependent().index()); dep.set("dependentGloss", edge.getDependent().word()); }) ); } else { return null; } } public static String jsonPrint(Annotation annotation) throws IOException { StringOutputStream os = new StringOutputStream(); new JSONOutputter().print(annotation, os); return os.toString(); } public static void jsonPrint(Annotation annotation, OutputStream os) throws IOException { new JSONOutputter().print(annotation, os); } public static void jsonPrint(Annotation annotation, OutputStream os, StanfordCoreNLP pipeline) throws IOException { new JSONOutputter().print(annotation, os, pipeline); } public static void jsonPrint(Annotation annotation, OutputStream os, Options options) throws IOException { new JSONOutputter().print(annotation, os, options); } /** * Our very own little JSON writing class. * For usage, see the test cases in JSONOutputterTest. * * For the love of all that is holy, don't try to write JSON multithreaded. * It should go without saying that this is not threadsafe. */ public static class JSONWriter { private final PrintWriter writer; private final Options options; public JSONWriter(PrintWriter writer, Options options) { this.writer = writer; this.options = options; } @SuppressWarnings({"unchecked", "UnnecessaryBoxing"}) private void routeObject(int indent, Object value) { if (value instanceof String) { // Case: simple string (this is easy!) writer.write("\""); writer.write(cleanJSON(value.toString())); writer.write("\""); } else if (value instanceof Collection) { // Case: collection writer.write("["); newline(); Iterator<Object> elems = ((Collection<Object>) value).iterator(); while (elems.hasNext()) { indent(indent + 1); routeObject(indent + 1, elems.next()); if (elems.hasNext()) { writer.write(","); } newline(); } indent(indent); writer.write("]"); } else if (value instanceof Enum) { // Case: enumeration constant writer.write("\""); writer.write(cleanJSON(((Enum) value).name())); writer.write("\""); } else if (value instanceof Pair) { routeObject(indent, Arrays.asList(((Pair) value).first, ((Pair) value).second)); } else if (value instanceof Span) { writer.write("["); writer.write(Integer.toString(((Span) value).start())); writer.write(","); space(); writer.write(Integer.toString(((Span) value).end())); writer.write("]"); } else if (value instanceof Consumer) { object(indent, (Consumer<Writer>) value); } else if (value instanceof Stream) { routeObject(indent, ((Stream) value).collect(Collectors.toList())); } else if (value.getClass().isArray()) { // Arrays make life miserable in Java Class<?> componentType = value.getClass().getComponentType(); if (componentType.isPrimitive()) { if (int.class.isAssignableFrom(componentType)) { ArrayList<Integer> lst = new ArrayList<>(); for (int elem : ((int[]) value)) { lst.add(elem); } routeObject(indent, lst); } else if (short.class.isAssignableFrom(componentType)) { ArrayList<Short> lst = new ArrayList<>(); for (short elem : ((short[]) value)) { lst.add(elem); } routeObject(indent, lst); } else if (byte.class.isAssignableFrom(componentType)) { ArrayList<Byte> lst = new ArrayList<>(); for (byte elem : ((byte[]) value)) { lst.add(elem); } routeObject(indent, lst); } else if (long.class.isAssignableFrom(componentType)) { ArrayList<Long> lst = new ArrayList<>(); for (long elem : ((long[]) value)) { lst.add(elem); } routeObject(indent, lst); } else if (char.class.isAssignableFrom(componentType)) { ArrayList<Character> lst = new ArrayList<>(); for (char elem : ((char[]) value)) { lst.add(elem); } routeObject(indent, lst); } else if (float.class.isAssignableFrom(componentType)) { ArrayList<Float> lst = new ArrayList<>(); for (float elem : ((float[]) value)) { lst.add(elem); } routeObject(indent, lst); } else if (double.class.isAssignableFrom(componentType)) { ArrayList<Double> lst = new ArrayList<>(); for (double elem : ((double[]) value)) { lst.add(elem); } routeObject(indent, lst); } else if (boolean.class.isAssignableFrom(componentType)) { ArrayList<Boolean> lst = new ArrayList<>(); for (boolean elem : ((boolean[]) value)) { lst.add(elem); } routeObject(indent, lst); } else { throw new IllegalStateException("Unhandled primitive type in array: " + componentType); } } else { routeObject(indent, Arrays.asList((Object[]) value)); } } else if (value instanceof Integer) { writer.write(Integer.toString((Integer) value)); } else if (value instanceof Short) { writer.write(Short.toString((Short) value)); } else if (value instanceof Byte) { writer.write(Byte.toString((Byte) value)); } else if (value instanceof Long) { writer.write(Long.toString((Long) value)); } else if (value instanceof Character) { writer.write(Character.toString((Character) value)); } else if (value instanceof Float) { writer.write(new DecimalFormat("0.#######").format(value)); } else if (value instanceof Double) { writer.write(new DecimalFormat("0.##############").format(value)); } else if (value instanceof Boolean) { writer.write(Boolean.toString((Boolean) value)); } else if (int.class.isAssignableFrom(value.getClass())) { routeObject(indent, Integer.valueOf((int) value)); } else if (short.class.isAssignableFrom(value.getClass())) { routeObject(indent, Short.valueOf((short) value)); } else if (byte.class.isAssignableFrom(value.getClass())) { routeObject(indent, Byte.valueOf((byte) value)); } else if (long.class.isAssignableFrom(value.getClass())) { routeObject(indent, Long.valueOf((long) value)); } else if (char.class.isAssignableFrom(value.getClass())) { routeObject(indent, Character.valueOf((char) value)); } else if (float.class.isAssignableFrom(value.getClass())) { routeObject(indent, Float.valueOf((float) value)); } else if (double.class.isAssignableFrom(value.getClass())) { routeObject(indent, Double.valueOf((double) value)); } else if (boolean.class.isAssignableFrom(value.getClass())) { routeObject(indent, Boolean.valueOf((boolean) value)); } else { throw new RuntimeException("Unknown object to serialize: " + value); } } public void object(int indent, Consumer<Writer> callback) { writer.write("{"); final Pointer<Boolean> firstCall = new Pointer<>(true); callback.accept((key, value) -> { if (key != null && value != null) { // First call overhead if (!firstCall.dereference().orElse(false)) { writer.write(","); } firstCall.set(false); // Write the key newline(); indent(indent + 1); writer.write("\""); writer.write(cleanJSON(key)); writer.write("\":"); space(); // Write the value routeObject(indent + 1, value); } }); newline(); indent(indent); writer.write("}"); } public void object(Consumer<Writer> callback) { object(0, callback); } private void indent(int num) { if (options.pretty) { for (int i = 0; i < num; ++i) { writer.write(INDENT_CHAR); } } } public void flush() { writer.flush(); } private void space() { if (options.pretty) { writer.write(" "); } } private void newline() { if (options.pretty) { writer.write("\n"); } } public static String objectToJSON(Consumer<Writer> callback) { OutputStream os = new ByteArrayOutputStream(); PrintWriter out = new PrintWriter(os); new JSONWriter(out, new Options()).object(callback); out.close(); return os.toString(); } } /** * A tiny little functional interface for writing a (key, value) pair. * The key should always be a String, the value can be either a String, * a Collection of valid values, or a Callback taking a Writer (this is how * we represent objects while creating JSON). */ @FunctionalInterface public interface Writer { /** * Set a (key, value) pair in a JSON object. * Note that if either the key or the value is null, nothing will be set. * @param key The key of the object. * @param value The value of the object. */ void set(String key, Object value); } }