package edu.stanford.nlp.pipeline; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.trees.GrammaticalRelation; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.TwoDimensionalMap; public abstract class AnnotationSerializer { /** * Append a single object to this stream. Subsequent calls to append on the same stream must supply the returned * output stream; furthermore, implementations of this function must be prepared to handle * the same output stream being passed in as it returned on the previous write. * * @param corpus The document to serialize to the stream. * @param os The output stream to serialize to. * @return The output stream which should be closed when done writing, and which should be passed into subsequent * calls to write() on this serializer. * @throws IOException Thrown if the underlying output stream throws the exception. */ public abstract OutputStream write(Annotation corpus, OutputStream os) throws IOException; /** * Read a single object from this stream. Subsequent calls to read on the same input stream must supply the * returned input stream; furthermore, implementations of this function must be prepared to handle the same * input stream being passed to it as it returned on the previous read. * * @param is The input stream to read a document from. * @return A pair of the read document, and the implementation-specific input stream which it was actually read from. * This stream should be passed to subsequent calls to read on the same stream, and should be closed when reading * completes. * @throws IOException Thrown if the underlying stream throws the exception. * @throws ClassNotFoundException Thrown if an object was read that does not exist in the classpath. * @throws ClassCastException Thrown if the signature of a class changed in way that was incompatible with the serialized document. */ public abstract Pair<Annotation, InputStream> read(InputStream is) throws IOException, ClassNotFoundException, ClassCastException; public static class IntermediateNode { String docId; int sentIndex; int index; int copyAnnotation; boolean isRoot; public IntermediateNode(String docId, int sentIndex, int index, int copy, boolean isRoot) { this.docId = docId; this.sentIndex = sentIndex; this.index = index; this.copyAnnotation = copy; this.isRoot = isRoot; } } public static class IntermediateEdge { int source; int sourceCopy; int target; int targetCopy; String dep; boolean isExtra; public IntermediateEdge(String dep, int source, int sourceCopy, int target, int targetCopy, boolean isExtra) { this.dep = dep; this.source = source; this.sourceCopy = sourceCopy; this.target = target; this.targetCopy = targetCopy; this.isExtra = isExtra; } } public static class IntermediateSemanticGraph { public List<IntermediateNode> nodes; public List<IntermediateEdge> edges; public IntermediateSemanticGraph() { nodes = new ArrayList<>(); edges = new ArrayList<>(); } public IntermediateSemanticGraph(List<IntermediateNode> nodes, List<IntermediateEdge> edges) { this.nodes = new ArrayList<>(nodes); this.edges = new ArrayList<>(edges); } private static final Object LOCK = new Object(); public SemanticGraph convertIntermediateGraph(List<CoreLabel> sentence) { SemanticGraph graph = new SemanticGraph(); // First construct the actual nodes; keep them indexed by their index and copy count. // Sentences such as "I went over the river and through the woods" have // two copies for "went" in the collapsed dependencies. TwoDimensionalMap<Integer, Integer, IndexedWord> nodeMap = TwoDimensionalMap.hashMap(); for (IntermediateNode in: nodes){ CoreLabel token = sentence.get(in.index - 1); // index starts at 1! IndexedWord word; if (in.copyAnnotation > 0) { // TODO: if we make a copy wrapper CoreLabel, use it here instead word = new IndexedWord(new CoreLabel(token)); word.setCopyCount(in.copyAnnotation); } else { word = new IndexedWord(token); } // for backwards compatibility - new annotations should have // these fields set, but annotations older than August 2014 might not if (word.docID() == null && in.docId != null) { word.setDocID(in.docId); } if (word.sentIndex() < 0 && in.sentIndex >= 0) { word.setSentIndex(in.sentIndex); } if (word.index() < 0 && in.index >= 0) { word.setIndex(in.index); } nodeMap.put(word.index(), word.copyCount(), word); graph.addVertex(word); if (in.isRoot) { graph.addRoot(word); } } // add all edges to the actual graph for (IntermediateEdge ie: edges) { IndexedWord source = nodeMap.get(ie.source, ie.sourceCopy); if (source == null) { throw new RuntimeIOException("Failed to find node " + ie.source + "-" + ie.sourceCopy); } IndexedWord target = nodeMap.get(ie.target, ie.targetCopy); if (target == null) { throw new RuntimeIOException("Failed to find node " + ie.target + "-" + ie.targetCopy); } // assert(target != null); synchronized (LOCK) { // this is not thread-safe: there are static fields in GrammaticalRelation GrammaticalRelation rel = GrammaticalRelation.valueOf(ie.dep); graph.addEdge(source, target, rel, 1.0, ie.isExtra); } } // compute root nodes if they weren't stored in the graph if (!graph.isEmpty() && graph.getRoots().size() == 0){ graph.resetRoots(); } return graph; } } }