package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.*;

/**
 * Serializes Annotation objects using our own format.
 *
 * Note[gabor]: This is a lossy serialization! For similar performance and
 * lossless (or less lossy) serialization, see
 * {@link edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer}.
 *
 * @author Mihai
 */
public class CustomAnnotationSerializer extends AnnotationSerializer {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CustomAnnotationSerializer.class);

  /** Placeholder used to escape whitespace inside serialized fields */
  private static final String SPACE_HOLDER = "##";

  private final boolean compress;

  /**
   * If true, we also store/load the AntecedentAnnotation.
   * This annotation is used ONLY in our KBP annotation.
   * By default it is not needed, because we store the entire coref graph anyway.
   */
  private final boolean haveExplicitAntecedent;

  public CustomAnnotationSerializer() {
    this(true, false);
  }

  public CustomAnnotationSerializer(boolean compress, boolean haveAnte) {
    this.compress = compress;
    this.haveExplicitAntecedent = haveAnte;
  }

  private static IntermediateSemanticGraph loadDependencyGraph(BufferedReader reader) throws IOException {
    IntermediateSemanticGraph graph = new IntermediateSemanticGraph();

    // first line: list of nodes
    String line = reader.readLine().trim();
    if (line.length() > 0) {
      String[] bits = line.split("\t");
      if (bits.length < 3) {
        throw new RuntimeException("ERROR: Invalid dependency node line: " + line);
      }
      String docId = bits[0];
      if (docId.equals("-")) docId = "";
      int sentIndex = Integer.parseInt(bits[1]);
      for (int i = 2; i < bits.length; i++) {
        String bit = bits[i];
        String[] bbits = bit.split("-");
        int copyAnnotation = -1;
        boolean isRoot = false;
        if (bbits.length > 3) {
          throw new RuntimeException("ERROR: Invalid format for dependency graph: " + line);
        } else if (bbits.length == 2) {
          copyAnnotation = Integer.parseInt(bbits[1]);
        } else if (bbits.length == 3) {
          copyAnnotation = Integer.parseInt(bbits[1]);
          isRoot = bbits[2].equals("R");
        }
        int index = Integer.parseInt(bbits[0]);
        graph.nodes.add(new IntermediateNode(docId, sentIndex, index, copyAnnotation, isRoot));
      }
    }

    // second line: list of deps
    line = reader.readLine().trim();
    if (line.length() > 0) {
      String[] bits = line.split("\t");
      for (String bit : bits) {
        String[] bbits = bit.split(" ");
        if (bbits.length < 3 || bbits.length > 6) {
          throw new RuntimeException("ERROR: Invalid format for dependency graph: " + line);
        }
        String dep = bbits[0];
        int source = Integer.parseInt(bbits[1]);
        int target = Integer.parseInt(bbits[2]);
        // the three optional fields (isExtra, sourceCopy, targetCopy) are always
        // written together by saveDependencyGraph, so test for their presence
        // with > 3; the original test (== 4) could never match a full edge entry
        boolean isExtra = bbits.length > 3 && Boolean.parseBoolean(bbits[3]);
        int sourceCopy = (bbits.length > 4) ? Integer.parseInt(bbits[4]) : 0;
        int targetCopy = (bbits.length > 5) ? Integer.parseInt(bbits[5]) : 0;
        graph.edges.add(new IntermediateEdge(dep, source, sourceCopy, target, targetCopy, isExtra));
      }
    }

    return graph;
  }
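  // For illustration, a hypothetical serialized dependency graph for one sentence
  // (values invented; fields within each line are tab-separated):
  //
  //   nodes: docFoo <TAB> 0 <TAB> 1 <TAB> 2-0-R <TAB> 3 <TAB> 3-1
  //          i.e., docId ("-" if empty), sentence index, then index[-copy][-R] per node
  //   edges: det 3 1 <TAB> nsubj 2 3 <TAB> conj 2 3 true 0 1
  //          i.e., rel source target [isExtra sourceCopy targetCopy] per edge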
  /**
   * Saves the dependency graph on two lines: the first line contains the vertices,
   * the second the edges.
   *
   * @param graph the graph to save; may be null
   * @param pw the buffer
   */
  private static void saveDependencyGraph(SemanticGraph graph, PrintWriter pw) {
    if (graph == null) {
      pw.println();
      pw.println();
      return;
    }

    // first line: sentence index for all nodes; we recover the words from the
    // original tokens. the first two fields in this line are the docId and the
    // sentence index
    boolean outputHeader = false;
    for (IndexedWord node : graph.vertexSet()) {
      if (!outputHeader) {
        String docId = node.get(CoreAnnotations.DocIDAnnotation.class);
        if (docId != null && docId.length() > 0) pw.print(docId);
        else pw.print("-");
        pw.print("\t");
        pw.print(node.get(CoreAnnotations.SentenceIndexAnnotation.class));
        outputHeader = true;
      }

      pw.print("\t");
      pw.print(node.index());
      // copy annotations indicate copied (or virtual) nodes generated due to CCs
      // (see EnglishGrammaticalStructure); they are usually not set, so print
      // them only if necessary
      if (node.copyCount() > 0) {
        pw.print("-");
        pw.print(node.copyCount());
      }
      if (graph.getRoots().contains(node)) {
        if (node.copyCount() > 0) {
          pw.print("-R");
        } else {
          pw.print("-0-R");
        }
      }
    }
    pw.println();

    // second line: all edges
    boolean first = true;
    for (SemanticGraphEdge edge : graph.edgeIterable()) {
      if (!first) pw.print("\t");
      String rel = edge.getRelation().toString();
      // no spaces allowed in the relation name; note that they might occur due
      // to the tokenization of HTML/XML/RDF tags
      rel = rel.replaceAll("\\s+", "");
      pw.print(rel);
      pw.print(" ");
      pw.print(edge.getSource().index());
      pw.print(" ");
      pw.print(edge.getTarget().index());
      if (edge.isExtra() || edge.getSource().copyCount() > 0 || edge.getTarget().copyCount() > 0) {
        pw.print(" ");
        pw.print(edge.isExtra());
        pw.print(" ");
        pw.print(edge.getSource().copyCount());
        pw.print(" ");
        pw.print(edge.getTarget().copyCount());
      }
      first = false;
    }
    pw.println();
  }

  /**
   * Serializes the CorefChain objects.
   *
   * @param chains all clusters in a document
   * @param pw the buffer
   */
  private static void saveCorefChains(Map<Integer, CorefChain> chains, PrintWriter pw) {
    if (chains == null) {
      pw.println();
      return;
    }
    // how many clusters
    pw.println(chains.size());
    // save each cluster: its id, then all its mentions
    for (Map.Entry<Integer, CorefChain> entry : chains.entrySet()) {
      saveCorefChain(pw, entry.getKey(), entry.getValue());
    }
    // an empty line at the end
    pw.println();
  }

  private static int countMentions(CorefChain cluster) {
    int count = 0;
    for (Set<CorefChain.CorefMention> mentions : cluster.getMentionMap().values()) {
      count += mentions.size();
    }
    return count;
  }
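  // For illustration, a hypothetical coref section holding one cluster with two
  // mentions (values invented; see saveCorefChain below for the exact field order):
  //
  //   1
  //   5 2
  //   1 3 1 PROPER SINGULAR MALE ANIMATE 2 4 3 5 10 1 2 1 3 Barack##Obama
  //   2 2 0 PRONOMINAL SINGULAR MALE ANIMATE 1 2 1 5 11 2 2 2 1 he
  //
  // followed by the empty line that terminates the section.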
  /**
   * Serializes one coref cluster (i.e., one entity).
   *
   * @param pw the buffer
   * @param cid id of the cluster to save
   * @param cluster the cluster
   */
  public static void saveCorefChain(PrintWriter pw, int cid, CorefChain cluster) {
    // cluster id + number of mentions in the cluster
    pw.println(cid + " " + countMentions(cluster));
    // each mention saved on one line
    Map<IntPair, Set<CorefChain.CorefMention>> mentionMap = cluster.getMentionMap();
    for (Map.Entry<IntPair, Set<CorefChain.CorefMention>> entry : mentionMap.entrySet()) {
      // all mentions with the same head
      IntPair mentionIndices = entry.getKey();
      for (CorefChain.CorefMention mention : entry.getValue()) {
        // one mention per line
        pw.print(mentionIndices.getSource() + " " + mentionIndices.getTarget());
        if (mention == cluster.getRepresentativeMention()) pw.print(" " + 1);
        else pw.print(" " + 0);
        pw.print(" " + mention.mentionType);
        pw.print(" " + mention.number);
        pw.print(" " + mention.gender);
        pw.print(" " + mention.animacy);
        pw.print(" " + mention.startIndex);
        pw.print(" " + mention.endIndex);
        pw.print(" " + mention.headIndex);
        pw.print(" " + mention.corefClusterID);
        pw.print(" " + mention.mentionID);
        pw.print(" " + mention.sentNum);
        pw.print(" " + mention.position.length());
        for (int i = 0; i < mention.position.length(); i++) {
          pw.print(" " + mention.position.get(i));
        }
        pw.print(" " + escapeSpace(mention.mentionSpan));
        pw.println();
      }
    }
  }

  private static String escapeSpace(String s) {
    return s.replaceAll("\\s", SPACE_HOLDER);
  }

  private static String unescapeSpace(String s) {
    return s.replaceAll(SPACE_HOLDER, " ");
  }

  private static Dictionaries.MentionType parseMentionType(String s) {
    return Dictionaries.MentionType.valueOf(s);
  }

  private static Dictionaries.Number parseNumber(String s) {
    return Dictionaries.Number.valueOf(s);
  }

  private static Dictionaries.Gender parseGender(String s) {
    return Dictionaries.Gender.valueOf(s);
  }

  private static Dictionaries.Animacy parseAnimacy(String s) {
    return Dictionaries.Animacy.valueOf(s);
  }

  /**
   * Loads the CorefChain objects from the serialized buffer.
   *
   * @param reader the buffer
   * @return A map from cluster id to clusters, or null if no chains were serialized
   * @throws IOException
   */
  private static Map<Integer, CorefChain> loadCorefChains(BufferedReader reader) throws IOException {
    String line = reader.readLine().trim();
    if (line.isEmpty()) return null;
    int clusterCount = Integer.parseInt(line);
    Map<Integer, CorefChain> chains = Generics.newHashMap();

    // read each cluster
    for (int c = 0; c < clusterCount; c++) {
      line = reader.readLine().trim();
      String[] bits = line.split("\\s");
      int cid = Integer.parseInt(bits[0]);
      int mentionCount = Integer.parseInt(bits[1]);
      Map<IntPair, Set<CorefChain.CorefMention>> mentionMap = Generics.newHashMap();
      CorefChain.CorefMention representative = null;

      // read each mention in this cluster
      for (int m = 0; m < mentionCount; m++) {
        line = reader.readLine();
        bits = line.split("\\s");
        IntPair key = new IntPair(Integer.parseInt(bits[0]), Integer.parseInt(bits[1]));
        boolean rep = bits[2].equals("1");
        Dictionaries.MentionType mentionType = parseMentionType(bits[3]);
        Dictionaries.Number number = parseNumber(bits[4]);
        Dictionaries.Gender gender = parseGender(bits[5]);
        Dictionaries.Animacy animacy = parseAnimacy(bits[6]);
        int startIndex = Integer.parseInt(bits[7]);
        int endIndex = Integer.parseInt(bits[8]);
        int headIndex = Integer.parseInt(bits[9]);
        int clusterID = Integer.parseInt(bits[10]);
        int mentionID = Integer.parseInt(bits[11]);
        int sentNum = Integer.parseInt(bits[12]);
        int posLen = Integer.parseInt(bits[13]);
        int[] posElems = new int[posLen];
        for (int i = 0; i < posLen; i++) {
          posElems[i] = Integer.parseInt(bits[14 + i]);
        }
        IntTuple position = new IntTuple(posElems);
        String span = unescapeSpace(bits[14 + posLen]);
        CorefChain.CorefMention mention = new CorefChain.CorefMention(
            mentionType, number, gender, animacy,
            startIndex, endIndex, headIndex,
            clusterID, mentionID, sentNum, position, span);
        Set<CorefChain.CorefMention> mentionsWithThisHead =
            mentionMap.computeIfAbsent(key, k -> Generics.newHashSet());
        mentionsWithThisHead.add(mention);
        if (rep) representative = mention;
      }

      // construct the cluster
      CorefChain chain = new CorefChain(cid, mentionMap, representative);
      chains.put(cid, chain);
    }

    // consume the empty line that terminates the coref section
    reader.readLine();
    return chains;
  }
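  // Overall layout of a serialized document, as produced by write() below and
  // consumed by read():
  //   1. the coref chains (new format), terminated by an empty line
  //   2. the old-format coref graph, on one (possibly empty) line
  //   3. for each sentence: the parse tree on one line, then three dependency
  //      graphs (collapsed, basic, CC-processed; two lines each), then one line
  //      per token, then an empty line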
  @Override
  public OutputStream write(Annotation corpus, OutputStream os) throws IOException {
    if (compress && !(os instanceof GZIPOutputStream)) {
      os = new GZIPOutputStream(os);
    }
    PrintWriter pw = new PrintWriter(os);

    // save the coref graph in the new format
    Map<Integer, CorefChain> chains = corpus.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    saveCorefChains(chains, pw);

    // save the coref graph on one line
    // Note: this is the old format!
    List<Pair<IntTuple, IntTuple>> corefGraph = corpus.get(CorefCoreAnnotations.CorefGraphAnnotation.class);
    if (corefGraph != null) {
      boolean first = true;
      for (Pair<IntTuple, IntTuple> arc : corefGraph) {
        if (!first) pw.print(" ");
        pw.printf("%d %d %d %d", arc.first.get(0), arc.first.get(1), arc.second.get(0), arc.second.get(1));
        first = false;
      }
    }
    pw.println();

    // save sentences separated by an empty line
    List<CoreMap> sentences = corpus.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sent : sentences) {
      // save the parse tree first, on a single line
      Tree tree = sent.get(TreeCoreAnnotations.TreeAnnotation.class);
      if (tree != null) {
        String treeString = tree.toString();
        // no \n allowed in the parse tree string (might happen due to tokenization of HTML/XML/RDF tags)
        treeString = treeString.replaceAll("\n", " ");
        pw.println(treeString);
      } else {
        pw.println();
      }

      SemanticGraph collapsedDeps = sent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      saveDependencyGraph(collapsedDeps, pw);
      SemanticGraph uncollapsedDeps = sent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      saveDependencyGraph(uncollapsedDeps, pw);
      SemanticGraph ccDeps = sent.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
      saveDependencyGraph(ccDeps, pw);

      // save all sentence tokens
      List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
      if (tokens != null) {
        for (CoreLabel token : tokens) {
          saveToken(token, haveExplicitAntecedent, pw);
          pw.println();
        }
      }

      // add an empty line after every sentence
      pw.println();
    }
    pw.flush();
    return os;
  }
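  // A minimal round-trip sketch; `annotation` and "doc.ser" are hypothetical:
  //
  //   CustomAnnotationSerializer ser = new CustomAnnotationSerializer(true, false);
  //   ser.write(annotation, new FileOutputStream("doc.ser")).close();
  //   Pair<Annotation, InputStream> loaded = ser.read(new FileInputStream("doc.ser"));
  //   loaded.second.close();
  //   Annotation roundTripped = loaded.first;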
  @Override
  public Pair<Annotation, InputStream> read(InputStream is) throws IOException {
    if (compress && !(is instanceof GZIPInputStream)) {
      is = new GZIPInputStream(is);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    Annotation doc = new Annotation("");
    String line;

    // read the coref graph (new format)
    Map<Integer, CorefChain> chains = loadCorefChains(reader);
    if (chains != null) doc.set(CorefCoreAnnotations.CorefChainAnnotation.class, chains);

    // read the coref graph (old format)
    line = reader.readLine().trim();
    if (line.length() > 0) {
      String[] bits = line.split(" ");
      if (bits.length % 4 != 0) {
        throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
      }
      List<Pair<IntTuple, IntTuple>> corefGraph = new ArrayList<>();
      for (int i = 0; i < bits.length; i += 4) {
        IntTuple src = new IntTuple(2);
        IntTuple dst = new IntTuple(2);
        src.set(0, Integer.parseInt(bits[i]));
        src.set(1, Integer.parseInt(bits[i + 1]));
        dst.set(0, Integer.parseInt(bits[i + 2]));
        dst.set(1, Integer.parseInt(bits[i + 3]));
        corefGraph.add(new Pair<>(src, dst));
      }
      doc.set(CorefCoreAnnotations.CorefGraphAnnotation.class, corefGraph);
    }

    // read individual sentences
    List<CoreMap> sentences = new ArrayList<>();
    while ((line = reader.readLine()) != null) {
      CoreMap sentence = new Annotation("");

      // first line is the parse tree; construct it with CoreLabels in the Tree nodes
      Tree tree = new PennTreeReader(new StringReader(line),
          new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
      sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);

      // read the dependency graphs
      IntermediateSemanticGraph intermCollapsedDeps = loadDependencyGraph(reader);
      IntermediateSemanticGraph intermUncollapsedDeps = loadDependencyGraph(reader);
      IntermediateSemanticGraph intermCcDeps = loadDependencyGraph(reader);

      // the remaining lines, until the empty line, are tokens
      List<CoreLabel> tokens = new ArrayList<>();
      while ((line = reader.readLine()) != null) {
        if (line.length() == 0) break;
        CoreLabel token = loadToken(line, haveExplicitAntecedent);
        tokens.add(token);
      }
      sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);

      // convert the intermediate graphs to actual SemanticGraphs, now that we have the tokens
      SemanticGraph collapsedDeps = intermCollapsedDeps.convertIntermediateGraph(tokens);
      sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, collapsedDeps);
      SemanticGraph uncollapsedDeps = intermUncollapsedDeps.convertIntermediateGraph(tokens);
      sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
      SemanticGraph ccDeps = intermCcDeps.convertIntermediateGraph(tokens);
      sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);

      sentences.add(sentence);
    }
    doc.set(CoreAnnotations.SentencesAnnotation.class, sentences);

    return Pair.makePair(doc, is);
  }

  private static CoreLabel loadToken(String line, boolean haveExplicitAntecedent) {
    CoreLabel token = new CoreLabel();
    String[] bits = line.split("\t", -1);
    if (bits.length < 7) {
      throw new RuntimeIOException("ERROR: Invalid format for serialized token (only " +
          bits.length + " fields): " + line);
    }

    // word
    String word = bits[0].replaceAll(SPACE_HOLDER, " ");
    token.set(CoreAnnotations.TextAnnotation.class, word);
    token.set(CoreAnnotations.ValueAnnotation.class, word);
    // lemma
    if (bits[1].length() > 0 || bits[0].length() == 0) {
      String lemma = bits[1].replaceAll(SPACE_HOLDER, " ");
      token.set(CoreAnnotations.LemmaAnnotation.class, lemma);
    }
    // POS tag
    if (bits[2].length() > 0) token.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[2]);
    // NE tag
    if (bits[3].length() > 0) token.set(CoreAnnotations.NamedEntityTagAnnotation.class, bits[3]);
    // normalized NE tag
    if (bits[4].length() > 0) token.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, bits[4]);
    // character offsets
    if (bits[5].length() > 0) token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, Integer.parseInt(bits[5]));
    if (bits[6].length() > 0) token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, Integer.parseInt(bits[6]));

    if (haveExplicitAntecedent) {
      // this block is specific to KBP; we may have an AntecedentAnnotation
      if (bits.length > 7) {
        String aa = bits[7].replaceAll(SPACE_HOLDER, " ");
        if (aa.length() > 0) token.set(CoreAnnotations.AntecedentAnnotation.class, aa);
      }
    }
    return token;
  }
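  // Token lines are tab-separated with the following columns (a column is left
  // empty when the annotation is unset; spaces inside a field are escaped with
  // SPACE_HOLDER):
  //   word  lemma  pos  ner  normalizedNer  charBegin  charEnd  [antecedent]
  // e.g., a hypothetical token with no normalized NE tag:
  //   Obama <TAB> Obama <TAB> NNP <TAB> PERSON <TAB> <TAB> 0 <TAB> 5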
  /**
   * Saves one individual sentence token in a simple tabular format, in the style of CoNLL.
   *
   * @param token the token to save
   * @param haveExplicitAntecedent whether to save the KBP-specific AntecedentAnnotation
   * @param pw the buffer
   */
  private static void saveToken(CoreLabel token, boolean haveExplicitAntecedent, PrintWriter pw) {
    String word = token.get(CoreAnnotations.TextAnnotation.class);
    if (word == null) {
      word = token.get(CoreAnnotations.ValueAnnotation.class);
    }
    if (word != null) {
      word = word.replaceAll("\\s+", SPACE_HOLDER); // spaces are used for formatting
      pw.print(word);
    }
    pw.print("\t");

    String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
    if (lemma != null) {
      lemma = lemma.replaceAll("\\s+", SPACE_HOLDER); // spaces are used for formatting
      pw.print(lemma);
    }
    pw.print("\t");

    String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
    if (pos != null) pw.print(pos);
    pw.print("\t");

    String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
    if (ner != null) pw.print(ner);
    pw.print("\t");

    String normNer = token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
    if (normNer != null) pw.print(normNer);
    pw.print("\t");

    Integer charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    if (charBegin != null) pw.print(charBegin);
    pw.print("\t");

    Integer charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    if (charEnd != null) pw.print(charEnd);

    if (haveExplicitAntecedent) {
      // this block is specific to KBP:
      // in some cases where we know the entity in focus (i.e., web queries), an
      // AntecedentAnnotation is generated; save it as an optional, always-last field
      String aa = token.get(CoreAnnotations.AntecedentAnnotation.class);
      if (aa != null) {
        pw.print("\t");
        aa = aa.replaceAll("\\s+", SPACE_HOLDER); // spaces are used for formatting
        pw.print(aa);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    String file = props.getProperty("file");
    String loadFile = props.getProperty("loadFile");
    if (loadFile != null && !loadFile.isEmpty()) {
      CustomAnnotationSerializer ser = new CustomAnnotationSerializer(false, false);
      InputStream is = new FileInputStream(loadFile);
      Pair<Annotation, InputStream> pair = ser.read(is);
      pair.second.close();
      Annotation anno = pair.first;
      System.out.println(anno.toShorterString(StringUtils.EMPTY_STRING_ARRAY));
      is.close();
    } else if (file != null && !file.isEmpty()) {
      String text = edu.stanford.nlp.io.IOUtils.slurpFile(file);
      Annotation doc = new Annotation(text);
      pipeline.annotate(doc);
      CustomAnnotationSerializer ser = new CustomAnnotationSerializer(false, false);
      PrintStream os = new PrintStream(new FileOutputStream(file + ".ser"));
      ser.write(doc, os).close();
      log.info("Serialized annotation saved in " + file + ".ser");
    } else {
      log.info("usage: CustomAnnotationSerializer [-file file] [-loadFile file]");
    }
  }

}