package edu.stanford.nlp.pipeline; import edu.stanford.nlp.ie.NumberNormalizer; import edu.stanford.nlp.ie.machinereading.structure.EntityMention; import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject; import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations.*; import edu.stanford.nlp.ie.machinereading.structure.RelationMention; import edu.stanford.nlp.ie.machinereading.structure.Span; import edu.stanford.nlp.ie.util.RelationTriple; import edu.stanford.nlp.international.Language; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.ling.SegmenterCoreAnnotations; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.naturalli.*; import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; import edu.stanford.nlp.time.Timex; import edu.stanford.nlp.trees.GrammaticalRelation; import edu.stanford.nlp.trees.LabeledScoredTreeNode; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.util.*; import edu.stanford.nlp.ling.CoreAnnotations.*; import edu.stanford.nlp.trees.TreeCoreAnnotations.*; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.*; import edu.stanford.nlp.time.TimeAnnotations.*; import java.io.*; import java.util.*; import java.util.stream.Collectors; import edu.stanford.nlp.coref.CorefCoreAnnotations.*; import edu.stanford.nlp.coref.data.CorefChain; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.coref.data.SpeakerInfo; /** * <p> * A serializer using Google's protocol buffer format. * The files produced by this serializer, in addition to being language-independent, * are a little over 10% the size and 4x faster to read+write versus the default Java serialization * (see GenericAnnotationSerializer), when both files are compressed with gzip. * </p> * * <p> * Note that this handles only a subset of the possible annotations * that can be attached to a sentence. Nonetheless, it is guaranteed to be * lossless with the default set of named annotators you can create from a * {@link StanfordCoreNLP} pipeline, with default properties defined for each annotator. * Note that the serializer does not gzip automatically -- this must be done by passing in a GZIPOutputStream * and reading through a GZIPInputStream manually. For most Annotations, gzipping provides a notable decrease in size (~2.5x) * due to most of the data being raw Strings. * </p> * * <p> * To allow lossy serialization, use {@link ProtobufAnnotationSerializer#ProtobufAnnotationSerializer(boolean)}. * Otherwise, an exception is thrown if an unknown key appears in the annotation which would not be saved to the * protocol buffer. * If such keys exist, and are a part of the standard CoreNLP pipeline, please let us know! * If you would like to serialize keys in addition to those serialized by default (e.g., you are attaching * your own annotations), then you should do the following: * </p> * * <ol> * <li> * Create a .proto file which extends one or more of Document, Sentence, or Token. Each of these has fields * 100-255 left open for user extensions.
An example of such an extension is: * <pre> * package edu.stanford.nlp.pipeline; * * option java_package = "com.example.my.awesome.nlp.app"; * option java_outer_classname = "MyAppProtos"; * * import "CoreNLP.proto"; * * extend Sentence { * optional uint32 myNewField = 101; * } * </pre> * </li> * * <li> * Compile your .proto file with protoc. For example (from CORENLP_HOME): * <pre> * protoc -I=src/edu/stanford/nlp/pipeline/:/path/to/folder/containing/your/proto/file --java_out=/path/to/output/src/folder/ /path/to/proto/file * </pre> * </li> * * <li> * <p> * Extend {@link ProtobufAnnotationSerializer} to serialize and deserialize your field. * Generally, this entails overriding two functions -- one to write the proto and one to read it. * In both cases, you usually want to call the superclass' implementation of the function, and add on to it * from there. * In our running example, adding a field to the {@link CoreNLPProtos.Sentence} proto, you would override: * </p> * * <ul> * <li>{@link ProtobufAnnotationSerializer#toProtoBuilder(edu.stanford.nlp.util.CoreMap, java.util.Set)}</li> * <li>{@link ProtobufAnnotationSerializer#fromProtoNoTokens(edu.stanford.nlp.pipeline.CoreNLPProtos.Sentence)}</li> * </ul> * * <p> * Note, importantly, that for the serializer to be able to check for lossless serialization, all annotations added * to the proto must be registered as added by being removed from the set passed to * {@link ProtobufAnnotationSerializer#toProtoBuilder(edu.stanford.nlp.util.CoreMap, java.util.Set)} (and the analogous * functions for documents and tokens). * </p> * * <p> * Lastly, the new annotations must be registered in the original .proto file; this can be achieved by including * a static block in the overriding class: * </p> * <pre> * static { * ExtensionRegistry registry = ExtensionRegistry.newInstance(); * registry.add(MyAppProtos.myNewField); * CoreNLPProtos.registerAllExtensions(registry); * } * </pre> * </li> * </ol> * * * TODOs * <ul> * <li>In CoreNLP, the leaves of a tree are == to the tokens in a sentence. This is not the case for a deserialized proto.</li> * </ul> * * @author Gabor Angeli */ public class ProtobufAnnotationSerializer extends AnnotationSerializer { /** A global lock; necessary since dependency tree creation is not threadsafe */ private static final Object globalLock = "I'm a lock :)"; /** * An exception to denote that the serialization would be lossy. * This exception is thrown at serialization time. * * @see ProtobufAnnotationSerializer#enforceLosslessSerialization * @see ProtobufAnnotationSerializer#ProtobufAnnotationSerializer(boolean) */ public static class LossySerializationException extends RuntimeException { private LossySerializationException(String msg) { super(msg); } } /** * If true, serialization is guaranteed to be lossless or else a runtime exception is thrown * at serialization time. */ public final boolean enforceLosslessSerialization; /** * Create a new Annotation serializer outputting to a protocol buffer format. * This is guaranteed to either be a lossless compression, or throw an exception at * serialization time. */ public ProtobufAnnotationSerializer() { this(true); } /** * Create a new Annotation serializer outputting to a protocol buffer format. * * @param enforceLosslessSerialization If set to true, a {@link ProtobufAnnotationSerializer.LossySerializationException} * is thrown at serialization * time if the serialization would be lossy. If set to false, * these exceptions are ignored.
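*
* <p>
* A minimal usage sketch ({@code annotation} and the file name are illustrative placeholders, not
* part of this API; the GZIP streams come from {@code java.util.zip}, and the gzip wrapping is
* optional, as described in the class javadoc):
* </p>
* <pre>
*   ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(true);
*   try (OutputStream os = new GZIPOutputStream(new FileOutputStream("document.ser.gz"))) {
*     serializer.write(annotation, os);
*   }
*   try (InputStream is = new GZIPInputStream(new FileInputStream("document.ser.gz"))) {
*     Annotation readBack = serializer.read(is).first;
*   }
* </pre>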
* */ public ProtobufAnnotationSerializer(boolean enforceLosslessSerialization) { this.enforceLosslessSerialization = enforceLosslessSerialization; } /** {@inheritDoc} */ @Override public OutputStream write(Annotation corpus, OutputStream os) throws IOException { CoreNLPProtos.Document serialized = toProto(corpus); serialized.writeDelimitedTo(os); os.flush(); return os; } /** {@inheritDoc} */ @Override public Pair<Annotation, InputStream> read(InputStream is) throws IOException, ClassNotFoundException, ClassCastException { CoreNLPProtos.Document doc = CoreNLPProtos.Document.parseDelimitedFrom(is); return Pair.makePair(fromProto(doc), is); } /** * Read a single protocol buffer, which constitutes the entire stream. * This is in contrast to the default, where multiple buffers may come out of the stream, * and therefore each one is prepended by the length of the buffer to follow. * * @param in The file to read. * @return A parsed Annotation. * @throws IOException In case the stream cannot be read from. */ @SuppressWarnings({"UnusedDeclaration", "ThrowFromFinallyBlock"}) public Annotation readUndelimited(File in) throws IOException { FileInputStream undelimited = new FileInputStream(in); CoreNLPProtos.Document doc; // First try to read the file as a single undelimited document; if that fails, fall back to length-delimited parsing try (FileInputStream delimited = new FileInputStream(in)) { doc = CoreNLPProtos.Document.parseFrom(delimited); } catch (Exception e) { doc = CoreNLPProtos.Document.parseDelimitedFrom(undelimited); } finally { undelimited.close(); } return fromProto(doc); } /** * Get a particular key from a CoreMap, registering it as being retrieved. * @param map The CoreMap to retrieve the key from. * @param keysToRegister A set of keys to remove this key from, representing the keys which remain to be serialized. * @param key The key to retrieve. * @param <E> The class of the item which is being retrieved. * @return CoreMap.get(key) */ private static <E> E getAndRegister(CoreMap map, Set<Class<?>> keysToRegister, Class<? extends CoreAnnotation<E>> key) { keysToRegister.remove(key); return map.get(key); } /** * Create a CoreLabel proto from a CoreLabel instance. * This is not static, as it optionally throws an exception if the serialization is lossy. * @param coreLabel The CoreLabel to convert * @return A protocol buffer message corresponding to this CoreLabel */ public CoreNLPProtos.Token toProto(CoreLabel coreLabel) { Set<Class<?>> keysToSerialize = new HashSet<>(coreLabel.keySetNotNull()); CoreNLPProtos.Token.Builder builder = toProtoBuilder(coreLabel, keysToSerialize); // Completeness check if (enforceLosslessSerialization && !keysToSerialize.isEmpty()) { throw new LossySerializationException("Keys are not being serialized: " + StringUtils.join(keysToSerialize)); } return builder.build(); } /** * <p> * The method to extend by subclasses of the Protobuf Annotator if custom additions are added to Tokens. * In contrast to {@link ProtobufAnnotationSerializer#toProto(edu.stanford.nlp.ling.CoreLabel)}, this function * returns a builder that can be extended. * </p> * * @param coreLabel The token to save to a protocol buffer * @param keysToSerialize A set tracking which keys have been saved.
It's important to remove any keys added to the proto * from this set, as the code tracks annotations to ensure lossless serialization */ protected CoreNLPProtos.Token.Builder toProtoBuilder(CoreLabel coreLabel, Set<Class<?>> keysToSerialize) { CoreNLPProtos.Token.Builder builder = CoreNLPProtos.Token.newBuilder(); Set<Class<?>> keySet = coreLabel.keySetNotNull(); // Remove items serialized elsewhere from the required list keysToSerialize.remove(TextAnnotation.class); keysToSerialize.remove(SentenceIndexAnnotation.class); keysToSerialize.remove(DocIDAnnotation.class); keysToSerialize.remove(IndexAnnotation.class); keysToSerialize.remove(ParagraphAnnotation.class); // Remove items populated by number normalizer keysToSerialize.remove(NumericCompositeObjectAnnotation.class); keysToSerialize.remove(NumericCompositeTypeAnnotation.class); keysToSerialize.remove(NumericCompositeValueAnnotation.class); keysToSerialize.remove(NumericTypeAnnotation.class); keysToSerialize.remove(NumericValueAnnotation.class); // Remove items which were never supposed to be there in the first place keysToSerialize.remove(ForcedSentenceUntilEndAnnotation.class); keysToSerialize.remove(ForcedSentenceEndAnnotation.class); keysToSerialize.remove(HeadWordLabelAnnotation.class); keysToSerialize.remove(HeadTagLabelAnnotation.class); // Set the word (this may be null if the CoreLabel is storing a character, as in the case of the segmenter) if (coreLabel.word() != null) builder.setWord(coreLabel.word()); // Optional fields if (keySet.contains(PartOfSpeechAnnotation.class)) { builder.setPos(coreLabel.tag()); keysToSerialize.remove(PartOfSpeechAnnotation.class); } if (keySet.contains(ValueAnnotation.class)) { builder.setValue(coreLabel.value()); keysToSerialize.remove(ValueAnnotation.class); } if (keySet.contains(CategoryAnnotation.class)) { builder.setCategory(coreLabel.category()); keysToSerialize.remove(CategoryAnnotation.class); } if (keySet.contains(BeforeAnnotation.class)) { builder.setBefore(coreLabel.before()); keysToSerialize.remove(BeforeAnnotation.class); } if (keySet.contains(AfterAnnotation.class)) { builder.setAfter(coreLabel.after()); keysToSerialize.remove(AfterAnnotation.class); } if (keySet.contains(OriginalTextAnnotation.class)) { builder.setOriginalText(coreLabel.originalText()); keysToSerialize.remove(OriginalTextAnnotation.class); } if (keySet.contains(NamedEntityTagAnnotation.class)) { builder.setNer(coreLabel.ner()); keysToSerialize.remove(NamedEntityTagAnnotation.class); } if (keySet.contains(CharacterOffsetBeginAnnotation.class)) { builder.setBeginChar(coreLabel.beginPosition()); keysToSerialize.remove(CharacterOffsetBeginAnnotation.class); } if (keySet.contains(CharacterOffsetEndAnnotation.class)) { builder.setEndChar(coreLabel.endPosition()); keysToSerialize.remove(CharacterOffsetEndAnnotation.class); } if (keySet.contains(LemmaAnnotation.class)) { builder.setLemma(coreLabel.lemma()); keysToSerialize.remove(LemmaAnnotation.class); } if (keySet.contains(UtteranceAnnotation.class)) { builder.setUtterance(getAndRegister(coreLabel, keysToSerialize, UtteranceAnnotation.class)); } if (keySet.contains(SpeakerAnnotation.class)) { builder.setSpeaker(getAndRegister(coreLabel, keysToSerialize, SpeakerAnnotation.class)); } if (keySet.contains(BeginIndexAnnotation.class)) { builder.setBeginIndex(getAndRegister(coreLabel, keysToSerialize, BeginIndexAnnotation.class)); } if (keySet.contains(EndIndexAnnotation.class)) { builder.setEndIndex(getAndRegister(coreLabel, keysToSerialize, EndIndexAnnotation.class)); } if
(keySet.contains(TokenBeginAnnotation.class)) { builder.setTokenBeginIndex(getAndRegister(coreLabel, keysToSerialize, TokenBeginAnnotation.class)); } if (keySet.contains(TokenEndAnnotation.class)) { builder.setTokenEndIndex(getAndRegister(coreLabel, keysToSerialize, TokenEndAnnotation.class)); } if (keySet.contains(NormalizedNamedEntityTagAnnotation.class)) { builder.setNormalizedNER(getAndRegister(coreLabel, keysToSerialize, NormalizedNamedEntityTagAnnotation.class)); } if (keySet.contains(TimexAnnotation.class)) { builder.setTimexValue(toProto(getAndRegister(coreLabel, keysToSerialize, TimexAnnotation.class))); } if (keySet.contains(AnswerAnnotation.class)) { builder.setAnswer(getAndRegister(coreLabel, keysToSerialize, AnswerAnnotation.class)); } if (keySet.contains(WikipediaEntityAnnotation.class)) { builder.setWikipediaEntity(getAndRegister(coreLabel, keysToSerialize, WikipediaEntityAnnotation.class)); } if (keySet.contains(XmlContextAnnotation.class)) { builder.setHasXmlContext(true); builder.addAllXmlContext(getAndRegister(coreLabel, keysToSerialize, XmlContextAnnotation.class)); } else { builder.setHasXmlContext(false); } if (keySet.contains(CorefClusterIdAnnotation.class)) { builder.setCorefClusterID(getAndRegister(coreLabel, keysToSerialize, CorefClusterIdAnnotation.class)); } if (keySet.contains(NaturalLogicAnnotations.OperatorAnnotation.class)) { builder.setOperator(toProto(getAndRegister(coreLabel, keysToSerialize, NaturalLogicAnnotations.OperatorAnnotation.class))); } if (keySet.contains(NaturalLogicAnnotations.PolarityAnnotation.class)) { builder.setPolarity(toProto(getAndRegister(coreLabel, keysToSerialize, NaturalLogicAnnotations.PolarityAnnotation.class))); } if (keySet.contains(SpanAnnotation.class)) { IntPair span = getAndRegister(coreLabel, keysToSerialize, SpanAnnotation.class); builder.setSpan(CoreNLPProtos.Span.newBuilder().setBegin(span.getSource()).setEnd(span.getTarget()).build()); } if (keySet.contains(SentimentCoreAnnotations.SentimentClass.class)) { builder.setSentiment(getAndRegister(coreLabel, keysToSerialize, SentimentCoreAnnotations.SentimentClass.class)); } if (keySet.contains(QuotationIndexAnnotation.class)) { builder.setQuotationIndex(getAndRegister(coreLabel, keysToSerialize, QuotationIndexAnnotation.class)); } if (keySet.contains(CoNLLUFeats.class)) { builder.setConllUFeatures(toMapStringStringProto(getAndRegister(coreLabel, keysToSerialize, CoNLLUFeats.class))); } if (keySet.contains(CoNLLUTokenSpanAnnotation.class)) { IntPair span = getAndRegister(coreLabel, keysToSerialize, CoNLLUTokenSpanAnnotation.class); builder.setConllUTokenSpan(CoreNLPProtos.Span.newBuilder().setBegin(span.getSource()).setEnd(span.getTarget()).build()); } if (keySet.contains(CoNLLUMisc.class)) { builder.setConllUMisc(getAndRegister(coreLabel, keysToSerialize, CoNLLUMisc.class));} if (keySet.contains(CoarseTagAnnotation.class)) { builder.setCoarseTag(getAndRegister(coreLabel, keysToSerialize, CoarseTagAnnotation.class));} if (keySet.contains(CoNLLUSecondaryDepsAnnotation.class)) { builder.setConllUSecondaryDeps(toMapIntStringProto(getAndRegister(coreLabel, keysToSerialize, CoNLLUSecondaryDepsAnnotation.class)));} // Non-default annotators if (keySet.contains(GenderAnnotation.class)) { builder.setGender(getAndRegister(coreLabel, keysToSerialize, GenderAnnotation.class)); } if (keySet.contains(TrueCaseAnnotation.class)) { builder.setTrueCase(getAndRegister(coreLabel, keysToSerialize, TrueCaseAnnotation.class)); } if (keySet.contains(TrueCaseTextAnnotation.class)) { 
builder.setTrueCaseText(getAndRegister(coreLabel, keysToSerialize, TrueCaseTextAnnotation.class)); } // Chinese character related stuff if (keySet.contains(ChineseCharAnnotation.class)) { builder.setChineseChar(getAndRegister(coreLabel, keysToSerialize, ChineseCharAnnotation.class)); } if (keySet.contains(ChineseSegAnnotation.class)) { builder.setChineseSeg(getAndRegister(coreLabel, keysToSerialize, ChineseSegAnnotation.class)); } // Return return builder; } /** * Create a protobuf builder, rather than a compiled protobuf. * Useful for, e.g., the simple CoreNLP interface. * @param sentence The sentence to serialize. * @return A Sentence builder. */ @SuppressWarnings("unchecked") public CoreNLPProtos.Sentence.Builder toProtoBuilder(CoreMap sentence) { return toProtoBuilder(sentence, Collections.EMPTY_SET); } /** * Create a Sentence proto from a CoreMap instance. * This is not static, as it optionally throws an exception if the serialization is lossy. * @param sentence The CoreMap to convert. Note that it should not be a CoreLabel or an Annotation, * and should represent a sentence. * @return A protocol buffer message corresponding to this sentence * @throws IllegalArgumentException If the sentence is not a valid sentence (e.g., is a document or a word). */ public CoreNLPProtos.Sentence toProto(CoreMap sentence) { Set<Class<?>> keysToSerialize = new HashSet<>(sentence.keySet()); CoreNLPProtos.Sentence.Builder builder = toProtoBuilder(sentence, keysToSerialize); // Completeness check if (enforceLosslessSerialization && !keysToSerialize.isEmpty()) { throw new LossySerializationException("Keys are not being serialized: " + StringUtils.join(keysToSerialize)); } return builder.build(); } /** * <p> * The method to extend by subclasses of the Protobuf Annotator if custom additions are added to Sentences. * In contrast to {@link ProtobufAnnotationSerializer#toProto(edu.stanford.nlp.util.CoreMap)}, this function * returns a builder that can be extended. * </p> * * @param sentence The sentence to save to a protocol buffer * @param keysToSerialize A set tracking which keys have been saved. It's important to remove any keys added to the proto * from this set, as the code tracks annotations to ensure lossless serialization.
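*
* <p>
* For example, following the class javadoc's running example, a subclass serializing a hypothetical
* sentence-level key {@code MyKeyAnnotation} into the extension field {@code MyAppProtos.myNewField}
* might look roughly like this (both names are placeholders, not part of CoreNLP):
* </p>
* <pre>
*   {@literal @}Override
*   protected CoreNLPProtos.Sentence.Builder toProtoBuilder(CoreMap sentence, Set&lt;Class&lt;?&gt;&gt; keysToSerialize) {
*     CoreNLPProtos.Sentence.Builder builder = super.toProtoBuilder(sentence, keysToSerialize);
*     if (sentence.containsKey(MyKeyAnnotation.class)) {
*       builder.setExtension(MyAppProtos.myNewField, sentence.get(MyKeyAnnotation.class));
*       keysToSerialize.remove(MyKeyAnnotation.class);  // register the key as serialized
*     }
*     return builder;
*   }
* </pre>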
*/ @SuppressWarnings("deprecation") protected CoreNLPProtos.Sentence.Builder toProtoBuilder(CoreMap sentence, Set<Class<?>> keysToSerialize) { // Error checks if (sentence instanceof CoreLabel) { throw new IllegalArgumentException("CoreMap is actually a CoreLabel"); } CoreNLPProtos.Sentence.Builder builder = CoreNLPProtos.Sentence.newBuilder(); // Remove items serialized elsewhere from the required list keysToSerialize.remove(TextAnnotation.class); keysToSerialize.remove(NumerizedTokensAnnotation.class); // Required fields builder.setTokenOffsetBegin(getAndRegister(sentence, keysToSerialize, TokenBeginAnnotation.class)); builder.setTokenOffsetEnd(getAndRegister(sentence, keysToSerialize, TokenEndAnnotation.class)); // Get key set of CoreMap Set<Class<?>> keySet; if (sentence instanceof ArrayCoreMap) { keySet = ((ArrayCoreMap) sentence).keySetNotNull(); } else { keySet = new IdentityHashSet<>(sentence.keySet()); } // Tokens if (sentence.containsKey(TokensAnnotation.class)) { for (CoreLabel tok : sentence.get(TokensAnnotation.class)) { builder.addToken(toProto(tok)); } keysToSerialize.remove(TokensAnnotation.class); } // Characters if (sentence.containsKey(SegmenterCoreAnnotations.CharactersAnnotation.class)) { for (CoreLabel c : sentence.get(SegmenterCoreAnnotations.CharactersAnnotation.class)) { builder.addCharacter(toProto(c)); } keysToSerialize.remove(SegmenterCoreAnnotations.CharactersAnnotation.class); } // Optional fields if (keySet.contains(SentenceIndexAnnotation.class)) { builder.setSentenceIndex(getAndRegister(sentence, keysToSerialize, SentenceIndexAnnotation.class)); } if (keySet.contains(CharacterOffsetBeginAnnotation.class)) { builder.setCharacterOffsetBegin(getAndRegister(sentence, keysToSerialize, CharacterOffsetBeginAnnotation.class)); } if (keySet.contains(CharacterOffsetEndAnnotation.class)) { builder.setCharacterOffsetEnd(getAndRegister(sentence, keysToSerialize, CharacterOffsetEndAnnotation.class)); } if (keySet.contains(TreeAnnotation.class)) { builder.setParseTree(toProto(getAndRegister(sentence, keysToSerialize, TreeAnnotation.class))); } if (keySet.contains(BinarizedTreeAnnotation.class)) { builder.setBinarizedParseTree(toProto(getAndRegister(sentence, keysToSerialize, BinarizedTreeAnnotation.class))); } if (keySet.contains(KBestTreesAnnotation.class)) { for (Tree tree : sentence.get(KBestTreesAnnotation.class)) { builder.addKBestParseTrees(toProto(tree)); keysToSerialize.remove(KBestTreesAnnotation.class); } } if (keySet.contains(SentimentCoreAnnotations.SentimentAnnotatedTree.class)) { builder.setAnnotatedParseTree(toProto(getAndRegister(sentence, keysToSerialize, SentimentCoreAnnotations.SentimentAnnotatedTree.class))); } if (keySet.contains(SentimentCoreAnnotations.SentimentClass.class)) { builder.setSentiment(getAndRegister(sentence, keysToSerialize, SentimentCoreAnnotations.SentimentClass.class)); } if (keySet.contains(BasicDependenciesAnnotation.class)) { builder.setBasicDependencies(toProto(getAndRegister(sentence, keysToSerialize, BasicDependenciesAnnotation.class))); } if (keySet.contains(CollapsedDependenciesAnnotation.class)) { builder.setCollapsedDependencies(toProto(getAndRegister(sentence, keysToSerialize, CollapsedDependenciesAnnotation.class))); } if (keySet.contains(CollapsedCCProcessedDependenciesAnnotation.class)) { builder.setCollapsedCCProcessedDependencies(toProto(getAndRegister(sentence, keysToSerialize, CollapsedCCProcessedDependenciesAnnotation.class))); } if (keySet.contains(AlternativeDependenciesAnnotation.class)) { 
builder.setAlternativeDependencies(toProto(getAndRegister(sentence, keysToSerialize, AlternativeDependenciesAnnotation.class))); } if (keySet.contains(EnhancedDependenciesAnnotation.class)) { builder.setEnhancedDependencies(toProto(getAndRegister(sentence, keysToSerialize, EnhancedDependenciesAnnotation.class))); } if (keySet.contains(EnhancedPlusPlusDependenciesAnnotation.class)) { builder.setEnhancedPlusPlusDependencies(toProto(getAndRegister(sentence, keysToSerialize, EnhancedPlusPlusDependenciesAnnotation.class))); } if (keySet.contains(TokensAnnotation.class) && getAndRegister(sentence, keysToSerialize, TokensAnnotation.class).size() > 0 && getAndRegister(sentence, keysToSerialize, TokensAnnotation.class).get(0).containsKey(ParagraphAnnotation.class)) { builder.setParagraph(getAndRegister(sentence, keysToSerialize, TokensAnnotation.class).get(0).get(ParagraphAnnotation.class)); } if (keySet.contains(NumerizedTokensAnnotation.class)) { builder.setHasNumerizedTokensAnnotation(true); } else { builder.setHasNumerizedTokensAnnotation(false); } if (keySet.contains(NaturalLogicAnnotations.EntailedSentencesAnnotation.class)) { for (SentenceFragment entailedSentence : getAndRegister(sentence, keysToSerialize, NaturalLogicAnnotations.EntailedSentencesAnnotation.class)) { builder.addEntailedSentence(toProto(entailedSentence)); } } if (keySet.contains(NaturalLogicAnnotations.EntailedClausesAnnotation.class)) { for (SentenceFragment entailedClause : getAndRegister(sentence, keysToSerialize, NaturalLogicAnnotations.EntailedClausesAnnotation.class)) { builder.addEntailedClause(toProto(entailedClause)); } } if (keySet.contains(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) { for (RelationTriple triple : getAndRegister(sentence, keysToSerialize, NaturalLogicAnnotations.RelationTriplesAnnotation.class)) { builder.addOpenieTriple(toProto(triple)); } } if (keySet.contains(KBPTriplesAnnotation.class)) { for (RelationTriple triple : getAndRegister(sentence, keysToSerialize, KBPTriplesAnnotation.class)) { builder.addKbpTriple(toProto(triple)); } } // Non-default annotators if (keySet.contains(EntityMentionsAnnotation.class)) { builder.setHasRelationAnnotations(true); for (EntityMention entity : getAndRegister(sentence, keysToSerialize, EntityMentionsAnnotation.class)) { builder.addEntity(toProto(entity)); } } else { builder.setHasRelationAnnotations(false); } if (keySet.contains(RelationMentionsAnnotation.class)) { if (!builder.getHasRelationAnnotations()) { throw new IllegalStateException("Registered entity mentions without relation mentions"); } for (RelationMention relation : getAndRegister(sentence, keysToSerialize, RelationMentionsAnnotation.class)) { builder.addRelation(toProto(relation)); } } // add each of the mentions in the List<Mentions> for this sentence if (keySet.contains(CorefMentionsAnnotation.class)) { builder.setHasCorefMentionsAnnotation(true); for (Mention m : sentence.get(CorefMentionsAnnotation.class)) { builder.addMentionsForCoref(toProto(m)); } keysToSerialize.remove(CorefMentionsAnnotation.class); } // Entity mentions if (keySet.contains(MentionsAnnotation.class)) { for (CoreMap mention : sentence.get(MentionsAnnotation.class)) { builder.addMentions(toProtoMention(mention)); } keysToSerialize.remove(MentionsAnnotation.class); } // add a sentence id if it exists if (keySet.contains(SentenceIDAnnotation.class)) builder.setSentenceID(getAndRegister(sentence, keysToSerialize, SentenceIDAnnotation.class)); // add section index if (keySet.contains(SectionIndexAnnotation.class)) 
builder.setSectionIndex(getAndRegister(sentence, keysToSerialize, SectionIndexAnnotation.class)); // add section date if (keySet.contains(SectionDateAnnotation.class)) builder.setSectionDate(getAndRegister(sentence, keysToSerialize, SectionDateAnnotation.class)); // Return return builder; } /** * Create a Document proto from an Annotation instance. * This is not static, as it optionally throws an exception if the serialization is lossy. * @param doc The Annotation to convert. * @return A protocol buffer message corresponding to this document */ public CoreNLPProtos.Document toProto(Annotation doc) { Set<Class<?>> keysToSerialize = new HashSet<>(doc.keySet()); keysToSerialize.remove(TokensAnnotation.class); // note(gabor): tokens are saved in the sentence CoreNLPProtos.Document.Builder builder = toProtoBuilder(doc, keysToSerialize); // Completeness Check if (enforceLosslessSerialization && !keysToSerialize.isEmpty()) { throw new LossySerializationException("Keys are not being serialized: " + StringUtils.join(keysToSerialize)); } return builder.build(); } /** * Create a protobuf builder, rather than a compiled protobuf. * Useful for, e.g., the simple CoreNLP interface. * @param doc The document to serialize. * @return A Document builder. */ @SuppressWarnings("unchecked") public CoreNLPProtos.Document.Builder toProtoBuilder(Annotation doc) { return toProtoBuilder(doc, Collections.EMPTY_SET); } /** * <p> * The method to extend by subclasses of the Protobuf Annotator if custom additions are added to Documents. * In contrast to {@link ProtobufAnnotationSerializer#toProto(edu.stanford.nlp.pipeline.Annotation)}, this function * returns a builder that can be extended. * </p> * * @param doc The document to save to a protocol buffer * @param keysToSerialize A set tracking which keys have been saved. It's important to remove any keys added to the proto * from this set, as the code tracks annotations to ensure lossless serialization.
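*
* <p>
* The read side mirrors the write side. Continuing the hypothetical {@code MyAppProtos.myNewField} /
* {@code MyKeyAnnotation} example from above, a subclass would recover the value by overriding
* {@link ProtobufAnnotationSerializer#fromProtoNoTokens(edu.stanford.nlp.pipeline.CoreNLPProtos.Sentence)}:
* </p>
* <pre>
*   {@literal @}Override
*   protected CoreMap fromProtoNoTokens(CoreNLPProtos.Sentence proto) {
*     CoreMap sentence = super.fromProtoNoTokens(proto);
*     if (proto.hasExtension(MyAppProtos.myNewField)) {
*       sentence.set(MyKeyAnnotation.class, proto.getExtension(MyAppProtos.myNewField));
*     }
*     return sentence;
*   }
* </pre>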
*/ protected CoreNLPProtos.Document.Builder toProtoBuilder(Annotation doc, Set<Class<?>> keysToSerialize) { CoreNLPProtos.Document.Builder builder = CoreNLPProtos.Document.newBuilder(); // Required fields builder.setText(doc.get(TextAnnotation.class)); keysToSerialize.remove(TextAnnotation.class); // Optional fields if (doc.containsKey(SentencesAnnotation.class)) { for (CoreMap sentence : doc.get(SentencesAnnotation.class)) { builder.addSentence(toProto(sentence)); } keysToSerialize.remove(SentencesAnnotation.class); } else if (doc.containsKey(TokensAnnotation.class)) { for (CoreLabel token : doc.get(TokensAnnotation.class)) { builder.addSentencelessToken(toProto(token)); } } if (doc.containsKey(DocIDAnnotation.class)) { builder.setDocID(doc.get(DocIDAnnotation.class)); keysToSerialize.remove(DocIDAnnotation.class); } if (doc.containsKey(DocDateAnnotation.class)) { builder.setDocDate(doc.get(DocDateAnnotation.class)); keysToSerialize.remove(DocDateAnnotation.class); } if (doc.containsKey(CalendarAnnotation.class)) { builder.setCalendar(doc.get(CalendarAnnotation.class).toInstant().toEpochMilli()); keysToSerialize.remove(CalendarAnnotation.class); } if (doc.containsKey(CorefChainAnnotation.class)) { for (Map.Entry<Integer, CorefChain> chain : doc.get(CorefChainAnnotation.class).entrySet()) { builder.addCorefChain(toProto(chain.getValue())); } keysToSerialize.remove(CorefChainAnnotation.class); } if (doc.containsKey(QuotationsAnnotation.class)) { for (CoreMap quote : doc.get(QuotationsAnnotation.class)) { builder.addQuote(toProtoQuote(quote)); } keysToSerialize.remove(QuotationsAnnotation.class); } if (doc.containsKey(MentionsAnnotation.class)) { for (CoreMap mention : doc.get(MentionsAnnotation.class)) { builder.addMentions(toProtoMention(mention)); } keysToSerialize.remove(MentionsAnnotation.class); } // add character info from segmenter if (doc.containsKey(SegmenterCoreAnnotations.CharactersAnnotation.class)) { for (CoreLabel c : doc.get(SegmenterCoreAnnotations.CharactersAnnotation.class)) { builder.addCharacter(toProto(c)); } keysToSerialize.remove(SegmenterCoreAnnotations.CharactersAnnotation.class); } // add section info if (doc.containsKey(SectionsAnnotation.class)) { for (CoreMap section : doc.get(SectionsAnnotation.class)) { builder.addSections(toProtoSection(section)); } keysToSerialize.remove(SectionsAnnotation.class); } // Return return builder; } /** * Create a ParseTree proto from a Tree. If the Tree is a scored tree, the scores will * be preserved. * @param parseTree The parse tree to convert. * @return A protocol buffer message corresponding to this tree. */ public CoreNLPProtos.ParseTree toProto(Tree parseTree) { CoreNLPProtos.ParseTree.Builder builder = CoreNLPProtos.ParseTree.newBuilder(); // Required fields for (Tree child : parseTree.children()) { builder.addChild(toProto(child)); } // Optional fields IntPair span = parseTree.getSpan(); if (span != null) { builder.setYieldBeginIndex(span.getSource()); builder.setYieldEndIndex(span.getTarget()); } if (parseTree.label() != null) { builder.setValue(parseTree.label().value()); } if (!Double.isNaN(parseTree.score())) { builder.setScore(parseTree.score()); } Integer sentiment; if (parseTree.label() instanceof CoreMap && (sentiment = ((CoreMap) parseTree.label()).get(RNNCoreAnnotations.PredictedClass.class)) != null) { builder.setSentiment(CoreNLPProtos.Sentiment.valueOf(sentiment)); } // Return return builder.build(); } /** * Create a compact representation of the semantic graph for this dependency parse. 
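* <p>
* Nodes are stored by sentence index and 1-based token index (plus a copy count for copy nodes),
* and edges by the indices of their endpoints. As a rough sketch, a two-word sentence "Dogs run"
* with the single relation {@code run --nsubj--> Dogs} serializes as two nodes (index 1 and index 2),
* one edge with {@code source = 2}, {@code target = 1}, and {@code dep = "nsubj"}, and a root list
* containing index 2.
* </p>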
* @param graph The dependency graph to save. * @return A protocol buffer message corresponding to this parse. */ public static CoreNLPProtos.DependencyGraph toProto(SemanticGraph graph) { CoreNLPProtos.DependencyGraph.Builder builder = CoreNLPProtos.DependencyGraph.newBuilder(); // Roots (use a regular hash set: an identity set of boxed Integers would miss indices outside the small-integer cache) Set<Integer> rootSet = graph.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet()); // Nodes for (IndexedWord node : graph.vertexSet()) { // Register node CoreNLPProtos.DependencyGraph.Node.Builder nodeBuilder = CoreNLPProtos.DependencyGraph.Node.newBuilder() .setSentenceIndex(node.get(SentenceIndexAnnotation.class)) .setIndex(node.index()); if (node.copyCount() > 0) { nodeBuilder.setCopyAnnotation(node.copyCount()); } builder.addNode(nodeBuilder.build()); // Register root if (rootSet.contains(node.index())) { builder.addRoot(node.index()); } } // Edges for (SemanticGraphEdge edge : graph.edgeIterable()) { // Set edge builder.addEdge(CoreNLPProtos.DependencyGraph.Edge.newBuilder() .setSource(edge.getSource().index()) .setTarget(edge.getTarget().index()) .setDep(edge.getRelation().toString()) .setIsExtra(edge.isExtra()) .setSourceCopy(edge.getSource().copyCount()) .setTargetCopy(edge.getTarget().copyCount()) .setLanguage(toProto(edge.getRelation().getLanguage()))); } // Return return builder.build(); } /** * Create a CorefChain protocol buffer from the given coref chain. * @param chain The coref chain to convert. * @return A protocol buffer message corresponding to this chain. */ public CoreNLPProtos.CorefChain toProto(CorefChain chain) { CoreNLPProtos.CorefChain.Builder builder = CoreNLPProtos.CorefChain.newBuilder(); // Set ID builder.setChainID(chain.getChainID()); // Set mentions Map<CorefChain.CorefMention, Integer> mentionToIndex = new IdentityHashMap<>(); for (Map.Entry<IntPair, Set<CorefChain.CorefMention>> entry : chain.getMentionMap().entrySet()) { for (CorefChain.CorefMention mention : entry.getValue()) { mentionToIndex.put(mention, mentionToIndex.size()); builder.addMention(CoreNLPProtos.CorefChain.CorefMention.newBuilder() .setMentionID(mention.mentionID) .setMentionType(mention.mentionType.name()) .setNumber(mention.number.name()) .setGender(mention.gender.name()) .setAnimacy(mention.animacy.name()) .setBeginIndex(mention.startIndex - 1) .setEndIndex(mention.endIndex - 1) .setHeadIndex(mention.headIndex - 1) .setSentenceIndex(mention.sentNum - 1) .setPosition(mention.position.get(1)) ); } } // Set representative mention builder.setRepresentative(mentionToIndex.get(chain.getRepresentativeMention())); // Return return builder.build(); } /** * Create a Section protocol buffer from the given section CoreMap. * @param section The section CoreMap to serialize. * @return A protocol buffer message corresponding to this section. */ public CoreNLPProtos.Section toProtoSection(CoreMap section) { CoreNLPProtos.Section.Builder builder = CoreNLPProtos.Section.newBuilder(); // Set char start builder.setCharBegin(section.get(CharacterOffsetBeginAnnotation.class)); // Set char end builder.setCharEnd(section.get(CharacterOffsetEndAnnotation.class)); // Set author if (section.get(AuthorAnnotation.class) != null) builder.setAuthor(section.get(AuthorAnnotation.class)); // Set date time if (section.get(SectionDateAnnotation.class) != null) builder.setDatetime(section.get(SectionDateAnnotation.class)); // add the sentence indexes for the sentences in this section for (CoreMap sentence : section.get(SentencesAnnotation.class)) { int sentenceIndex = sentence.get(SentenceIndexAnnotation.class); builder.addSentenceIndexes(sentenceIndex); } return
builder.build(); } public CoreNLPProtos.IndexedWord createIndexedWordProtoFromIW(IndexedWord iw) { CoreNLPProtos.IndexedWord.Builder builder = CoreNLPProtos.IndexedWord.newBuilder(); if (iw == null) { builder.setSentenceNum(-1); builder.setTokenIndex(-1); } else { builder.setSentenceNum(iw.get(SentenceIndexAnnotation.class) - 1); builder.setTokenIndex(iw.get(IndexAnnotation.class) - 1); builder.setCopyCount(iw.copyCount()); } return builder.build(); } public CoreNLPProtos.IndexedWord createIndexedWordProtoFromCL(CoreLabel cl) { CoreNLPProtos.IndexedWord.Builder builder = CoreNLPProtos.IndexedWord.newBuilder(); if (cl == null) { builder.setSentenceNum(-1); builder.setTokenIndex(-1); } else { builder.setSentenceNum(cl.get(SentenceIndexAnnotation.class) - 1); builder.setTokenIndex(cl.get(IndexAnnotation.class) - 1); } return builder.build(); } public CoreNLPProtos.Mention toProto(Mention mention) { // create the builder CoreNLPProtos.Mention.Builder builder = CoreNLPProtos.Mention.newBuilder(); // set enums if (mention.mentionType != null) { builder.setMentionType(mention.mentionType.name()); } if (mention.gender != null) { builder.setGender(mention.gender.name()); } if (mention.number != null) { builder.setNumber(mention.number.name()); } if (mention.animacy != null) { builder.setAnimacy(mention.animacy.name()); } if (mention.person != null) { builder.setPerson(mention.person.name()); } if (mention.headString != null) { builder.setHeadString(mention.headString); } if (mention.nerString != null) { builder.setNerString(mention.nerString); } builder.setStartIndex(mention.startIndex); builder.setEndIndex(mention.endIndex); builder.setHeadIndex(mention.headIndex); builder.setMentionID(mention.mentionID); builder.setOriginalRef(mention.originalRef); builder.setGoldCorefClusterID(mention.goldCorefClusterID); builder.setCorefClusterID(mention.corefClusterID); builder.setMentionNum(mention.mentionNum); builder.setSentNum(mention.sentNum); builder.setUtter(mention.utter); builder.setParagraph(mention.paragraph); builder.setIsSubject(mention.isSubject); builder.setIsDirectObject(mention.isDirectObject); builder.setIsIndirectObject(mention.isIndirectObject); builder.setIsPrepositionObject(mention.isPrepositionObject); builder.setHasTwin(mention.hasTwin); builder.setGeneric(mention.generic); builder.setIsSingleton(mention.isSingleton); // handle the two sets of Strings if (mention.dependents != null) { mention.dependents.forEach(builder::addDependents); } if (mention.preprocessedTerms != null) { mention.preprocessedTerms.forEach(builder::addPreprocessedTerms); } // set IndexedWords by storing (sentence number, token index) pairs builder.setDependingVerb(createIndexedWordProtoFromIW(mention.dependingVerb)); builder.setHeadIndexedWord(createIndexedWordProtoFromIW(mention.headIndexedWord)); builder.setHeadWord(createIndexedWordProtoFromCL(mention.headWord)); //CoreLabel headWord = (mention.headWord != null) ? 
mention.headWord : null; //builder.setHeadWord(createCoreLabelPositionProto(mention.headWord)); // add positions for each CoreLabel in sentence if (mention.sentenceWords != null) { for (CoreLabel cl : mention.sentenceWords) { builder.addSentenceWords(createIndexedWordProtoFromCL(cl)); } } if (mention.originalSpan != null) { for (CoreLabel cl : mention.originalSpan) { builder.addOriginalSpan(createIndexedWordProtoFromCL(cl)); } } // flag if this Mention should get basicDependency, enhancedDependency, and contextParseTree or not builder.setHasBasicDependency((mention.basicDependency != null)); builder.setHasEnhancedDepenedncy((mention.enhancedDependency != null)); // (sic: matches the misspelled field name in CoreNLP.proto) builder.setHasContextParseTree((mention.contextParseTree != null)); // handle the sets of Mentions, just store mentionID if (mention.appositions != null) { for (Mention m : mention.appositions) { builder.addAppositions(m.mentionID); } } if (mention.predicateNominatives != null) { for (Mention m : mention.predicateNominatives) { builder.addPredicateNominatives(m.mentionID); } } if (mention.relativePronouns != null) { for (Mention m : mention.relativePronouns) { builder.addRelativePronouns(m.mentionID); } } if (mention.listMembers != null) { for (Mention m : mention.listMembers) { builder.addListMembers(m.mentionID); } } if (mention.belongToLists != null) { for (Mention m : mention.belongToLists) { builder.addBelongToLists(m.mentionID); } } if (mention.speakerInfo != null) { builder.setSpeakerInfo(toProto(mention.speakerInfo)); } return builder.build(); } public CoreNLPProtos.SpeakerInfo toProto(SpeakerInfo speakerInfo) { CoreNLPProtos.SpeakerInfo.Builder builder = CoreNLPProtos.SpeakerInfo.newBuilder(); builder.setSpeakerName(speakerInfo.getSpeakerName()); // mentionIDs should be set by MentionAnnotator for (Mention m : speakerInfo.getMentions()) { builder.addMentions(m.mentionID); } return builder.build(); } /** * Convert the given Timex object to a protocol buffer. * @param timex The timex to convert. * @return A protocol buffer corresponding to this Timex object. */ public CoreNLPProtos.Timex toProto(Timex timex) { CoreNLPProtos.Timex.Builder builder = CoreNLPProtos.Timex.newBuilder(); if (timex.value() != null) { builder.setValue(timex.value()); } if (timex.altVal() != null) { builder.setAltValue(timex.altVal()); } if (timex.text() != null) { builder.setText(timex.text()); } if (timex.timexType() != null) { builder.setType(timex.timexType()); } if (timex.tid() != null) { builder.setTid(timex.tid()); } if (timex.beginPoint() >= 0) { builder.setBeginPoint(timex.beginPoint()); } if (timex.endPoint() >= 0) { builder.setEndPoint(timex.endPoint()); } return builder.build(); } /** * Serialize the given entity mention to the corresponding protocol buffer. * @param ent The entity mention to serialize. * @return A protocol buffer corresponding to the serialized entity mention.
*/ public CoreNLPProtos.Entity toProto(EntityMention ent) { CoreNLPProtos.Entity.Builder builder = CoreNLPProtos.Entity.newBuilder(); // From ExtractionObject if (ent.getObjectId() != null) { builder.setObjectID(ent.getObjectId()); } if (ent.getExtent() != null) { builder.setExtentStart(ent.getExtent().start()).setExtentEnd(ent.getExtent().end()); } if (ent.getType() != null) { builder.setType(ent.getType()); } if (ent.getSubType() != null) { builder.setSubtype(ent.getSubType()); } // From Entity if (ent.getHead() != null) { builder.setHeadStart(ent.getHead().start()); builder.setHeadEnd(ent.getHead().end()); } if (ent.getMentionType() != null) { builder.setMentionType(ent.getMentionType()); } if (ent.getNormalizedName() != null) { builder.setNormalizedName(ent.getNormalizedName()); } if (ent.getSyntacticHeadTokenPosition() >= 0) { builder.setHeadTokenIndex(ent.getSyntacticHeadTokenPosition()); } if (ent.getCorefID() != null) { builder.setCorefID(ent.getCorefID()); } // Return return builder.build(); } /** * Serialize the given relation mention to the corresponding protocol buffer. * @param rel The relation mention to serialize. * @return A protocol buffer corresponding to the serialized relation mention. */ public CoreNLPProtos.Relation toProto(RelationMention rel) { CoreNLPProtos.Relation.Builder builder = CoreNLPProtos.Relation.newBuilder(); // From ExtractionObject if (rel.getObjectId() != null) { builder.setObjectID(rel.getObjectId()); } if (rel.getExtent() != null) { builder.setExtentStart(rel.getExtent().start()).setExtentEnd(rel.getExtent().end()); } if (rel.getType() != null) { builder.setType(rel.getType()); } if (rel.getSubType() != null) { builder.setSubtype(rel.getSubType()); } // From Relation if (rel.getArgNames() != null) { rel.getArgNames().forEach(builder::addArgName); } if (rel.getArgs() != null) { for (ExtractionObject arg : rel.getArgs()) { builder.addArg(toProto((EntityMention) arg)); } } // Return return builder.build(); } /** * Serialize a CoreNLP Language to a Protobuf Language. * @param lang The language to serialize. * @return The language in a Protobuf enum. */ public static CoreNLPProtos.Language toProto(Language lang) { switch (lang) { case Arabic: return CoreNLPProtos.Language.Arabic; case Chinese: return CoreNLPProtos.Language.Chinese; case UniversalChinese: return CoreNLPProtos.Language.UniversalChinese; case English: return CoreNLPProtos.Language.English; case UniversalEnglish: return CoreNLPProtos.Language.UniversalEnglish; case German: return CoreNLPProtos.Language.German; case French: return CoreNLPProtos.Language.French; case Hebrew: return CoreNLPProtos.Language.Hebrew; case Spanish: return CoreNLPProtos.Language.Spanish; case Unknown: return CoreNLPProtos.Language.Unknown; case Any: return CoreNLPProtos.Language.Any; default: throw new IllegalStateException("Unknown language: " + lang); } } /** * Return a Protobuf operator from an OperatorSpec (Natural Logic). */ public static CoreNLPProtos.Operator toProto(OperatorSpec op) { return CoreNLPProtos.Operator.newBuilder() .setName(op.instance.name()).setQuantifierSpanBegin(op.quantifierBegin).setQuantifierSpanEnd(op.quantifierEnd) .setSubjectSpanBegin(op.subjectBegin).setSubjectSpanEnd(op.subjectEnd) .setObjectSpanBegin(op.objectBegin).setObjectSpanEnd(op.objectEnd).build(); } /** * Return a Protobuf polarity from a CoreNLP Polarity (Natural Logic). 
*/ public static CoreNLPProtos.Polarity toProto(Polarity pol) { return CoreNLPProtos.Polarity.newBuilder() .setProjectEquivalence(CoreNLPProtos.NaturalLogicRelation.valueOf(pol.projectLexicalRelation(NaturalLogicRelation.EQUIVALENT).fixedIndex)) .setProjectForwardEntailment(CoreNLPProtos.NaturalLogicRelation.valueOf(pol.projectLexicalRelation(NaturalLogicRelation.FORWARD_ENTAILMENT).fixedIndex)) .setProjectReverseEntailment(CoreNLPProtos.NaturalLogicRelation.valueOf(pol.projectLexicalRelation(NaturalLogicRelation.REVERSE_ENTAILMENT).fixedIndex)) .setProjectNegation(CoreNLPProtos.NaturalLogicRelation.valueOf(pol.projectLexicalRelation(NaturalLogicRelation.NEGATION).fixedIndex)) .setProjectAlternation(CoreNLPProtos.NaturalLogicRelation.valueOf(pol.projectLexicalRelation(NaturalLogicRelation.ALTERNATION).fixedIndex)) .setProjectCover(CoreNLPProtos.NaturalLogicRelation.valueOf(pol.projectLexicalRelation(NaturalLogicRelation.COVER).fixedIndex)) .setProjectIndependence(CoreNLPProtos.NaturalLogicRelation.valueOf(pol.projectLexicalRelation(NaturalLogicRelation.INDEPENDENCE).fixedIndex)) .build(); } /** * Return a Protobuf SentenceFragment from a SentenceFragment. */ public static CoreNLPProtos.SentenceFragment toProto(SentenceFragment fragment) { return CoreNLPProtos.SentenceFragment.newBuilder() .setAssumedTruth(fragment.assumedTruth) .setScore(fragment.score) .addAllTokenIndex(fragment.words.stream().map(x -> x.index() - 1).collect(Collectors.toList())) .setRoot(fragment.parseTree.getFirstRoot().index() - 1) .build(); } /** * Return a Protobuf RelationTriple from a RelationTriple. */ public static CoreNLPProtos.RelationTriple toProto(RelationTriple triple) { CoreNLPProtos.RelationTriple.Builder builder = CoreNLPProtos.RelationTriple.newBuilder() .setSubject(triple.subjectGloss()) .setRelation(triple.relationGloss()) .setObject(triple.objectGloss()) .setConfidence(triple.confidence) .addAllSubjectTokens(triple.subject.stream().map(token -> CoreNLPProtos.TokenLocation.newBuilder() .setSentenceIndex(token.sentIndex()) .setTokenIndex(token.index() - 1) .build()) .collect(Collectors.toList())) .addAllRelationTokens( triple.relation.size() == 1 && triple.relation.get(0).get(IndexAnnotation.class) == null ? Collections.emptyList() // case: this is not a real relation token, but rather a placeholder relation : triple.relation.stream().map(token -> CoreNLPProtos.TokenLocation.newBuilder() .setSentenceIndex(token.sentIndex()) .setTokenIndex(token.index() - 1) .build()) .collect(Collectors.toList())) .addAllObjectTokens(triple.object.stream().map(token -> CoreNLPProtos.TokenLocation.newBuilder() .setSentenceIndex(token.sentIndex()) .setTokenIndex(token.index() - 1) .build()) .collect(Collectors.toList())); Optional<SemanticGraph> treeOptional = triple.asDependencyTree(); if (treeOptional.isPresent()) { builder.setTree(toProto(treeOptional.get())); } return builder.build(); } /** * Serialize a Map (from Strings to Strings) to a proto. * * @param map The map to serialize. * * @return A proto representation of the map. */ public static CoreNLPProtos.MapStringString toMapStringStringProto(Map<String,String> map) { CoreNLPProtos.MapStringString.Builder proto = CoreNLPProtos.MapStringString.newBuilder(); for (Map.Entry<String, String> entry : map.entrySet()) { proto.addKey(entry.getKey()); proto.addValue(entry.getValue()); } return proto.build(); } /** * Serialize a Map (from Integers to Strings) to a proto. * * @param map The map to serialize. * * @return A proto representation of the map.
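* <p>
* As with {@link ProtobufAnnotationSerializer#toMapStringStringProto(java.util.Map)}, the map is stored
* as two parallel lists; e.g., the map 1 -&gt; "acl", 2 -&gt; "nsubj" becomes {@code key: [1, 2]} and
* {@code value: ["acl", "nsubj"]}, in the map's iteration order.
* </p>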
*/ public static CoreNLPProtos.MapIntString toMapIntStringProto(Map<Integer,String> map) { CoreNLPProtos.MapIntString.Builder proto = CoreNLPProtos.MapIntString.newBuilder(); for (Map.Entry<Integer, String> entry : map.entrySet()) { proto.addKey(entry.getKey()); proto.addValue(entry.getValue()); } return proto.build(); } /** * Convert a quote object to a protocol buffer. */ public static CoreNLPProtos.Quote toProtoQuote(CoreMap quote) { CoreNLPProtos.Quote.Builder builder = CoreNLPProtos.Quote.newBuilder(); if (quote.get(TextAnnotation.class) != null) { builder.setText(quote.get(TextAnnotation.class)); } if (quote.get(DocIDAnnotation.class) != null) { builder.setDocid(quote.get(DocIDAnnotation.class)); } if (quote.get(CharacterOffsetBeginAnnotation.class) != null) { builder.setBegin(quote.get(CharacterOffsetBeginAnnotation.class)); } if (quote.get(CharacterOffsetEndAnnotation.class) != null) { builder.setEnd(quote.get(CharacterOffsetEndAnnotation.class)); } if (quote.get(SentenceBeginAnnotation.class) != null) { builder.setSentenceBegin(quote.get(SentenceBeginAnnotation.class)); } if (quote.get(SentenceEndAnnotation.class) != null) { builder.setSentenceEnd(quote.get(SentenceEndAnnotation.class)); } if (quote.get(TokenBeginAnnotation.class) != null) { builder.setTokenBegin(quote.get(TokenBeginAnnotation.class)); } if (quote.get(TokenEndAnnotation.class) != null) { builder.setTokenEnd(quote.get(TokenEndAnnotation.class)); } if (quote.get(QuotationIndexAnnotation.class) != null) { builder.setIndex(quote.get(QuotationIndexAnnotation.class)); } return builder.build(); } /** * Convert a mention object to a protocol buffer. */ public CoreNLPProtos.NERMention toProtoMention(CoreMap mention) { CoreNLPProtos.NERMention.Builder builder = CoreNLPProtos.NERMention.newBuilder(); if (mention.get(SentenceIndexAnnotation.class) != null) { builder.setSentenceIndex(mention.get(SentenceIndexAnnotation.class)); } if (mention.get(TokenBeginAnnotation.class) != null) { builder.setTokenStartInSentenceInclusive(mention.get(TokenBeginAnnotation.class)); } if (mention.get(TokenEndAnnotation.class) != null) { builder.setTokenEndInSentenceExclusive(mention.get(TokenEndAnnotation.class)); } if (mention.get(NamedEntityTagAnnotation.class) != null) { builder.setNer(mention.get(NamedEntityTagAnnotation.class)); } if (mention.get(NormalizedNamedEntityTagAnnotation.class) != null) { builder.setNormalizedNER(mention.get(NormalizedNamedEntityTagAnnotation.class)); } if (mention.get(EntityTypeAnnotation.class) != null) { builder.setEntityType(mention.get(EntityTypeAnnotation.class)); } if (mention.get(TimexAnnotation.class) != null) { builder.setTimex(toProto(mention.get(TimexAnnotation.class))); } if (mention.get(WikipediaEntityAnnotation.class) != null) { builder.setWikipediaEntity(mention.get(WikipediaEntityAnnotation.class)); } return builder.build(); } /** * Create a CoreLabel from its serialized counterpart. * Note that this is, by itself, a lossy operation. Fields like the docid (sentence index, etc.) are only known * from the enclosing document, and are not tracked in the protobuf. * @param proto The serialized protobuf to read the CoreLabel from. * @return A CoreLabel, missing the fields that are not stored in the CoreLabel protobuf. 
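*
* <p>
* For instance (a sketch; {@code serializer} is any instance of this class), a round trip preserves
* the token-level fields handled below, but the docid must be re-set from the enclosing document:
* </p>
* <pre>
*   CoreNLPProtos.Token proto = serializer.toProto(label);
*   CoreLabel copy = serializer.fromProto(proto);  // copy.docID() will be null here
* </pre>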
*/ public CoreLabel fromProto(CoreNLPProtos.Token proto) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } CoreLabel word = new CoreLabel(); // Required fields word.setWord(proto.getWord()); // Optional fields if (proto.hasPos()) { word.setTag(proto.getPos()); } if (proto.hasValue()) { word.setValue(proto.getValue()); } if (proto.hasCategory()) { word.setCategory(proto.getCategory()); } if (proto.hasBefore()) { word.setBefore(proto.getBefore()); } if (proto.hasAfter()) { word.setAfter(proto.getAfter()); } if (proto.hasOriginalText()) { word.setOriginalText(proto.getOriginalText()); } if (proto.hasNer()) { word.setNER(proto.getNer()); } if (proto.hasLemma()) { word.setLemma(proto.getLemma()); } if (proto.hasBeginChar()) { word.setBeginPosition(proto.getBeginChar()); } if (proto.hasEndChar()) { word.setEndPosition(proto.getEndChar()); } if (proto.hasSpeaker()) { word.set(SpeakerAnnotation.class, proto.getSpeaker()); } if (proto.hasUtterance()) { word.set(UtteranceAnnotation.class, proto.getUtterance()); } if (proto.hasBeginIndex()) { word.set(BeginIndexAnnotation.class, proto.getBeginIndex()); } if (proto.hasEndIndex()) { word.set(EndIndexAnnotation.class, proto.getEndIndex()); } if (proto.hasTokenBeginIndex()) { word.set(TokenBeginAnnotation.class, proto.getTokenBeginIndex()); } if (proto.hasTokenEndIndex()) { word.set(TokenEndAnnotation.class, proto.getTokenEndIndex()); } if (proto.hasNormalizedNER()) { word.set(NormalizedNamedEntityTagAnnotation.class, proto.getNormalizedNER()); } if (proto.hasTimexValue()) { word.set(TimexAnnotation.class, fromProto(proto.getTimexValue())); } if (proto.hasHasXmlContext() && proto.getHasXmlContext()) { word.set(XmlContextAnnotation.class, proto.getXmlContextList()); } if (proto.hasCorefClusterID()) { word.set(CorefClusterIdAnnotation.class, proto.getCorefClusterID()); } if (proto.hasAnswer()) { word.set(AnswerAnnotation.class, proto.getAnswer()); } if (proto.hasOperator()) { word.set(NaturalLogicAnnotations.OperatorAnnotation.class, fromProto(proto.getOperator())); } if (proto.hasPolarity()) { word.set(NaturalLogicAnnotations.PolarityAnnotation.class, fromProto(proto.getPolarity())); } if (proto.hasSpan()) { word.set(SpanAnnotation.class, new IntPair(proto.getSpan().getBegin(), proto.getSpan().getEnd())); } if (proto.hasSentiment()) { word.set(SentimentCoreAnnotations.SentimentClass.class, proto.getSentiment()); } if (proto.hasQuotationIndex()) { word.set(QuotationIndexAnnotation.class, proto.getQuotationIndex()); } if (proto.hasConllUFeatures()) { word.set(CoNLLUFeats.class, fromProto(proto.getConllUFeatures())); } if (proto.hasConllUMisc()) { word.set(CoNLLUMisc.class, proto.getConllUMisc()); } if (proto.hasCoarseTag()) { word.set(CoarseTagAnnotation.class, proto.getCoarseTag()); } if (proto.hasConllUTokenSpan()) { word.set(CoNLLUTokenSpanAnnotation.class, new IntPair(proto.getConllUTokenSpan().getBegin(), proto.getConllUTokenSpan().getEnd())); } if (proto.hasConllUSecondaryDeps()) { word.set(CoNLLUSecondaryDepsAnnotation.class, fromProto(proto.getConllUSecondaryDeps())); } if (proto.hasWikipediaEntity()) { word.set(WikipediaEntityAnnotation.class, proto.getWikipediaEntity()); } // Chinese char info if (proto.hasChineseChar()) { word.set(ChineseCharAnnotation.class, proto.getChineseChar()); } if (proto.hasChineseSeg()) { word.set(ChineseSegAnnotation.class, proto.getChineseSeg()); } // Non-default annotators if (proto.hasGender()) { word.set(GenderAnnotation.class, proto.getGender()); } if (proto.hasTrueCase()) {
word.set(TrueCaseAnnotation.class, proto.getTrueCase()); } if (proto.hasTrueCaseText()) { word.set(TrueCaseTextAnnotation.class, proto.getTrueCaseText()); } // Return return word; } /** * Create a CoreMap representing a sentence from this protocol buffer. * This should not be used if you are reading a whole document, as it populates the tokens independent of the * document tokens, which is not the behavior an {@link edu.stanford.nlp.pipeline.Annotation} expects. * * @param proto The protocol buffer to read from. * @return A CoreMap representing the sentence. */ @SuppressWarnings("deprecation") @Deprecated public CoreMap fromProto(CoreNLPProtos.Sentence proto) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } CoreMap lossySentence = fromProtoNoTokens(proto); // Add tokens -- missing by default as they're populated as sublists of the document tokens List<CoreLabel> tokens = proto.getTokenList().stream().map(this::fromProto).collect(Collectors.toList()); lossySentence.set(TokensAnnotation.class, tokens); // Add dependencies if (proto.hasBasicDependencies()) { lossySentence.set(BasicDependenciesAnnotation.class, fromProto(proto.getBasicDependencies(), tokens, null)); } if (proto.hasCollapsedDependencies()) { lossySentence.set(CollapsedDependenciesAnnotation.class, fromProto(proto.getCollapsedDependencies(), tokens, null)); } if (proto.hasCollapsedCCProcessedDependencies()) { lossySentence.set(CollapsedCCProcessedDependenciesAnnotation.class, fromProto(proto.getCollapsedCCProcessedDependencies(), tokens, null)); } if (proto.hasAlternativeDependencies()) { lossySentence.set(AlternativeDependenciesAnnotation.class, fromProto(proto.getAlternativeDependencies(), tokens, null)); } if (proto.hasEnhancedDependencies()) { lossySentence.set(EnhancedDependenciesAnnotation.class, fromProto(proto.getEnhancedDependencies(), tokens, null)); } if (proto.hasEnhancedPlusPlusDependencies()) { lossySentence.set(EnhancedPlusPlusDependenciesAnnotation.class, fromProto(proto.getEnhancedPlusPlusDependencies(), tokens, null)); } // Add entailed sentences if (proto.getEntailedSentenceCount() > 0) { List<SentenceFragment> entailedSentences = proto.getEntailedSentenceList().stream().map(frag -> fromProto(frag, lossySentence.get(CollapsedDependenciesAnnotation.class))).collect(Collectors.toList()); lossySentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, entailedSentences); } // Add entailed clauses if (proto.getEntailedClauseCount() > 0) { List<SentenceFragment> entailedClauses = proto.getEntailedClauseList().stream().map(frag -> fromProto(frag, lossySentence.get(CollapsedDependenciesAnnotation.class))).collect(Collectors.toList()); lossySentence.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, entailedClauses); } // Add relation triples if (proto.getOpenieTripleCount() > 0) { throw new IllegalStateException("Cannot deserialize OpenIE triples with this method!"); } if (proto.getKbpTripleCount() > 0) { throw new IllegalStateException("Cannot deserialize KBP triples with this method!"); } // Add chinese characters if (proto.getCharacterCount() > 0) { List<CoreLabel> sentenceCharacters = proto.getCharacterList().stream().map(c -> fromProto(c)).collect(Collectors.toList()); lossySentence.set(SegmenterCoreAnnotations.CharactersAnnotation.class, sentenceCharacters); } // Add text -- missing by default as it's populated from the Document lossySentence.set(TextAnnotation.class, recoverOriginalText(tokens, proto)); // Return return lossySentence; } /** * Create a CoreMap 
representing a sentence from this protocol buffer. * Note that the sentence is very lossy -- most glaringly, the tokens are missing; they must be filled in * later from the containing document. * @param proto The serialized protobuf to read the sentence from. * @return A CoreMap, representing a sentence as stored in the protocol buffer (and therefore missing some fields) */ protected CoreMap fromProtoNoTokens(CoreNLPProtos.Sentence proto) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } CoreMap sentence = new ArrayCoreMap(); // Required fields sentence.set(TokenBeginAnnotation.class, proto.getTokenOffsetBegin()); sentence.set(TokenEndAnnotation.class, proto.getTokenOffsetEnd()); // Optional fields if (proto.hasSentenceIndex()) { sentence.set(SentenceIndexAnnotation.class, proto.getSentenceIndex()); } if (proto.hasCharacterOffsetBegin()) { sentence.set(CharacterOffsetBeginAnnotation.class, proto.getCharacterOffsetBegin()); } if (proto.hasCharacterOffsetEnd()) { sentence.set(CharacterOffsetEndAnnotation.class, proto.getCharacterOffsetEnd()); } if (proto.hasParseTree()) { sentence.set(TreeAnnotation.class, fromProto(proto.getParseTree())); } if (proto.hasBinarizedParseTree()) { sentence.set(BinarizedTreeAnnotation.class, fromProto(proto.getBinarizedParseTree())); } if (proto.getKBestParseTreesCount() > 0) { List<Tree> trees = proto.getKBestParseTreesList().stream().map(this::fromProto).collect(Collectors.toCollection(LinkedList::new)); sentence.set(KBestTreesAnnotation.class, trees); } if (proto.hasAnnotatedParseTree()) { sentence.set(SentimentCoreAnnotations.SentimentAnnotatedTree.class, fromProto(proto.getAnnotatedParseTree())); } if (proto.hasSentiment()) { sentence.set(SentimentCoreAnnotations.SentimentClass.class, proto.getSentiment()); } // Non-default fields if (proto.hasHasRelationAnnotations() && proto.getHasRelationAnnotations()) { // set entities List<EntityMention> entities = proto.getEntityList().stream().map(entity -> fromProto(entity, sentence)).collect(Collectors.toList()); sentence.set(EntityMentionsAnnotation.class, entities); // set relations List<RelationMention> relations = proto.getRelationList().stream().map(relation -> fromProto(relation, sentence)).collect(Collectors.toList()); sentence.set(RelationMentionsAnnotation.class, relations); } // if there are mentions for this sentence, add them to the annotation loadSentenceMentions(proto, sentence); // Return return sentence; } protected void loadSentenceMentions(CoreNLPProtos.Sentence proto, CoreMap sentence) { // add all Mentions for this sentence if (proto.getHasCorefMentionsAnnotation()) { sentence.set(CorefMentionsAnnotation.class, new ArrayList<>()); } if (proto.getMentionsForCorefList().size() != 0) { HashMap<Integer, Mention> idToMention = new HashMap<>(); List<Mention> sentenceMentions = sentence.get(CorefMentionsAnnotation.class); // initial set up of all mentions for (CoreNLPProtos.Mention protoMention : proto.getMentionsForCorefList()) { Mention m = fromProtoNoTokens(protoMention); sentenceMentions.add(m); idToMention.put(m.mentionID, m); } // populate sets of Mentions for each Mention for (CoreNLPProtos.Mention protoMention : proto.getMentionsForCorefList()) { Mention m = idToMention.get(protoMention.getMentionID()); if (protoMention.getAppositionsList().size() != 0) { m.appositions = new HashSet<>(); m.appositions.addAll(protoMention.getAppositionsList().stream() .map(idToMention::get) .collect(Collectors.toList())); } if (protoMention.getPredicateNominativesList().size() != 0) {
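// resolve the serialized mention ids against the Mentions built above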
m.predicateNominatives = new HashSet<>(); m.predicateNominatives.addAll(protoMention.getPredicateNominativesList().stream() .map(idToMention::get) .collect(Collectors.toList())); } if (protoMention.getRelativePronounsList().size() != 0) { m.relativePronouns = new HashSet<>(); m.relativePronouns.addAll(protoMention.getRelativePronounsList().stream() .map(idToMention::get) .collect(Collectors.toList())); } if (protoMention.getListMembersList().size() != 0) { m.listMembers = new HashSet<>(); m.listMembers.addAll(protoMention.getListMembersList().stream() .map(idToMention::get) .collect(Collectors.toList())); } if (protoMention.getBelongToListsList().size() != 0) { m.belongToLists = new HashSet<>(); m.belongToLists.addAll(protoMention.getBelongToListsList().stream() .map(idToMention::get) .collect(Collectors.toList())); } } } } /** * Returns a complete document, intended to mimic a document passed as input to * {@link ProtobufAnnotationSerializer#toProto(Annotation)} as closely as possible. * That is, most common fields are serialized, but there is no guarantee that custom additions * will be saved and retrieved. * * @param proto The protocol buffer to read the document from. * @return An Annotation corresponding to the read protobuf. */ @SuppressWarnings("deprecation") public Annotation fromProto(CoreNLPProtos.Document proto) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } // Set text Annotation ann = new Annotation(proto.getText()); // if there are characters, add characters if (proto.getCharacterCount() > 0) { List<CoreLabel> docChars = new ArrayList<CoreLabel>(); for (CoreNLPProtos.Token c : proto.getCharacterList()) { docChars.add(fromProto(c)); } ann.set(SegmenterCoreAnnotations.CharactersAnnotation.class, docChars); } // Add tokens List<CoreLabel> tokens = new ArrayList<>(); if (proto.getSentenceCount() > 0) { // Populate the tokens from the sentence for (CoreNLPProtos.Sentence sentence : proto.getSentenceList()) { // It's conceivable that the sentences are not contiguous -- pad this with nulls while (sentence.hasTokenOffsetBegin() && tokens.size() < sentence.getTokenOffsetBegin()) { tokens.add(null); } // Read the sentence for (CoreNLPProtos.Token token : sentence.getTokenList()) { CoreLabel coreLabel = fromProto(token); // Set docid if (proto.hasDocID()) { coreLabel.setDocID(proto.getDocID()); } if (token.hasTokenBeginIndex() && token.hasTokenEndIndex()) { // This is usually true, if enough annotators are defined while (tokens.size() < sentence.getTokenOffsetEnd()) { tokens.add(null); } for (int i = token.getTokenBeginIndex(); i < token.getTokenEndIndex(); ++i) { tokens.set(i, coreLabel); } } else { // Assume this token spans a single token, and just add it to the tokens list tokens.add(coreLabel); } } } } else if (proto.getSentencelessTokenCount() > 0) { // Eek -- no sentences.
Try to recover tokens directly. for (CoreNLPProtos.Token token : proto.getSentencelessTokenList()) { CoreLabel coreLabel = fromProto(token); // Set docid if (proto.hasDocID()) { coreLabel.setDocID(proto.getDocID()); } tokens.add(coreLabel); } } if (!tokens.isEmpty()) { ann.set(TokensAnnotation.class, tokens); } // Add sentences List<CoreMap> sentences = new ArrayList<>(proto.getSentenceCount()); for (int sentIndex = 0; sentIndex < proto.getSentenceCount(); ++sentIndex) { CoreNLPProtos.Sentence sentence = proto.getSentence(sentIndex); CoreMap map = fromProtoNoTokens(sentence); if (!tokens.isEmpty() && sentence.hasTokenOffsetBegin() && sentence.hasTokenOffsetEnd() && map.get(TokensAnnotation.class) == null) { // Set tokens for sentence int tokenBegin = sentence.getTokenOffsetBegin(); int tokenEnd = sentence.getTokenOffsetEnd(); assert tokenBegin <= tokens.size() && tokenBegin <= tokenEnd; assert tokenEnd <= tokens.size(); map.set(TokensAnnotation.class, tokens.subList(tokenBegin, tokenEnd)); // Set sentence index + token index + paragraph index for (int i = tokenBegin; i < tokenEnd; ++i) { tokens.get(i).setSentIndex(sentIndex); tokens.get(i).setIndex(i - sentence.getTokenOffsetBegin() + 1); if (sentence.hasParagraph()) { tokens.get(i).set(ParagraphAnnotation.class, sentence.getParagraph()); } } // Set text int characterBegin = sentence.getCharacterOffsetBegin(); int characterEnd = sentence.getCharacterOffsetEnd(); if (characterEnd <= proto.getText().length()) { // The usual case -- get the text from the document text map.set(TextAnnotation.class, proto.getText().substring(characterBegin, characterEnd)); } else { // The document text is wrong -- guess the text from the tokens map.set(TextAnnotation.class, recoverOriginalText(tokens.subList(tokenBegin, tokenEnd), sentence)); } } // End iteration sentences.add(map); } if (!sentences.isEmpty()) { ann.set(SentencesAnnotation.class, sentences); } // Set DocID String docid = null; if (proto.hasDocID()) { docid = proto.getDocID(); ann.set(DocIDAnnotation.class, docid); } // Set reference time if (proto.hasDocDate()) { ann.set(DocDateAnnotation.class, proto.getDocDate()); } if (proto.hasCalendar()) { GregorianCalendar calendar = new GregorianCalendar(); calendar.setTimeInMillis(proto.getCalendar()); ann.set(CalendarAnnotation.class, calendar); } // Set coref chain Map<Integer, CorefChain> corefChains = new HashMap<>(); for (CoreNLPProtos.CorefChain chainProto : proto.getCorefChainList()) { CorefChain chain = fromProto(chainProto, ann); corefChains.put(chain.getChainID(), chain); } if (!corefChains.isEmpty()) { ann.set(CorefChainAnnotation.class, corefChains); } // hashes to access Mentions; later in this method we need to add speakerInfo to each Mention, // so we create id -> Mention and id -> CoreNLPProtos.Mention maps, since a SpeakerInfo could reference // any Mention in the doc HashMap<Integer, Mention> idToMention = new HashMap<>(); HashMap<Integer, CoreNLPProtos.Mention> idToProtoMention = new HashMap<>(); // Set things in the sentence that need a document context.
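// In this pass: (1) dependency graphs are rebuilt, since they need both the sentence's tokens and the
// document's docid; (2) OpenIE fragments and triples are rebuilt on top of those graphs (along with
// re-running number normalization); and (3) coref Mentions -- which serialize only token indices --
// get their CoreLabel and IndexedWord references re-attached now that the sentence tokens exist.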
for (int sentenceIndex = 0; sentenceIndex < proto.getSentenceCount(); ++sentenceIndex) { CoreNLPProtos.Sentence sentence = proto.getSentenceList().get(sentenceIndex); CoreMap map = sentences.get(sentenceIndex); List<CoreLabel> sentenceTokens = map.get(TokensAnnotation.class); // Set dependency graphs if (sentence.hasBasicDependencies()) { map.set(BasicDependenciesAnnotation.class, fromProto(sentence.getBasicDependencies(), sentenceTokens, docid)); } if (sentence.hasCollapsedDependencies()) { map.set(CollapsedDependenciesAnnotation.class, fromProto(sentence.getCollapsedDependencies(), sentenceTokens, docid)); } if (sentence.hasCollapsedCCProcessedDependencies()) { map.set(CollapsedCCProcessedDependenciesAnnotation.class, fromProto(sentence.getCollapsedCCProcessedDependencies(), sentenceTokens, docid)); } if (sentence.hasAlternativeDependencies()) { map.set(AlternativeDependenciesAnnotation.class, fromProto(sentence.getAlternativeDependencies(), sentenceTokens, docid)); } if (sentence.hasEnhancedDependencies()) { map.set(EnhancedDependenciesAnnotation.class, fromProto(sentence.getEnhancedDependencies(), sentenceTokens, docid)); } if (sentence.hasEnhancedPlusPlusDependencies()) { map.set(EnhancedPlusPlusDependenciesAnnotation.class, fromProto(sentence.getEnhancedPlusPlusDependencies(), sentenceTokens, docid)); } // Set entailed sentences if (sentence.getEntailedSentenceCount() > 0) { Set<SentenceFragment> entailedSentences = sentence.getEntailedSentenceList().stream().map(frag -> fromProto(frag, map.get(EnhancedPlusPlusDependenciesAnnotation.class))).collect(Collectors.toSet()); map.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, entailedSentences); } if (sentence.getEntailedClauseCount() > 0) { Set<SentenceFragment> entailedClauses = sentence.getEntailedClauseList().stream().map(frag -> fromProto(frag, map.get(CollapsedDependenciesAnnotation.class))).collect(Collectors.toSet()); map.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, entailedClauses); } // Set relation triples if (sentence.getOpenieTripleCount() > 0) { List<RelationTriple> triples = new ArrayList<>(); for (CoreNLPProtos.RelationTriple triple : sentence.getOpenieTripleList()) { triples.add(fromProto(triple, ann, sentenceIndex)); } map.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, triples); } // Redo some light annotation if ( map.containsKey(TokensAnnotation.class) && (!sentence.hasHasNumerizedTokensAnnotation() || sentence.getHasNumerizedTokensAnnotation())) { map.set(NumerizedTokensAnnotation.class, NumberNormalizer.findAndMergeNumbers(map)); } // add the CoreLabel and IndexedWord info to each mention // when Mentions are serialized, just storing the index in the sentence for CoreLabels and IndexedWords // this is the point where the de-serialized sentence has tokens int mentionInt = 0; for (CoreNLPProtos.Mention protoMention : sentence.getMentionsForCorefList()) { // get the mention Mention mentionToUpdate = map.get(CorefMentionsAnnotation.class).get(mentionInt); // store these in hash for more processing later in this method idToMention.put(mentionToUpdate.mentionID, mentionToUpdate); idToProtoMention.put(mentionToUpdate.mentionID, protoMention); // update the values int headIndexedWordIndex = protoMention.getHeadIndexedWord().getTokenIndex(); if (headIndexedWordIndex >= 0) { mentionToUpdate.headIndexedWord = new IndexedWord(sentenceTokens.get(protoMention.getHeadIndexedWord().getTokenIndex())); 
mentionToUpdate.headIndexedWord.setCopyCount(protoMention.getHeadIndexedWord().getCopyCount()); } int dependingVerbIndex = protoMention.getDependingVerb().getTokenIndex(); if (dependingVerbIndex >= 0) { mentionToUpdate.dependingVerb = new IndexedWord(sentenceTokens.get(protoMention.getDependingVerb().getTokenIndex())); mentionToUpdate.dependingVerb.setCopyCount(protoMention.getDependingVerb().getCopyCount()); } int headWordIndex = protoMention.getHeadWord().getTokenIndex(); if (headWordIndex >= 0) { mentionToUpdate.headWord = sentenceTokens.get(protoMention.getHeadWord().getTokenIndex()); } mentionToUpdate.sentenceWords = new ArrayList<>(); for (CoreNLPProtos.IndexedWord clp : protoMention.getSentenceWordsList()) { int ti = clp.getTokenIndex(); mentionToUpdate.sentenceWords.add(sentenceTokens.get(ti)); } mentionToUpdate.originalSpan = new ArrayList<>(); for (CoreNLPProtos.IndexedWord clp : protoMention.getOriginalSpanList()) { int ti = clp.getTokenIndex(); mentionToUpdate.originalSpan.add(sentenceTokens.get(ti)); } if (protoMention.getHasBasicDependency()) { mentionToUpdate.basicDependency = map.get(BasicDependenciesAnnotation.class); } if (protoMention.getHasEnhancedDepenedncy()) { mentionToUpdate.enhancedDependency = map.get(EnhancedDependenciesAnnotation.class); } if (protoMention.getHasContextParseTree()) { mentionToUpdate.contextParseTree = map.get(TreeAnnotation.class); } // move on to next mention mentionInt++; } } // Set quotes List<CoreMap> quotes = proto.getQuoteList().stream().map(quote -> fromProto(quote, tokens)).collect(Collectors.toList()); if (!quotes.isEmpty()) { ann.set(QuotationsAnnotation.class, quotes); } // Set NERmention List<CoreMap> mentions = proto.getMentionsList().stream().map(this::fromProto).collect(Collectors.toList()); if (!mentions.isEmpty()) { ann.set(MentionsAnnotation.class, mentions); } // add SpeakerInfo stuff to Mentions, this requires knowing all mentions in the document // also add all the Set<Mention> for (int mentionID : idToMention.keySet()) { // this is the Mention message corresponding to this Mention Mention mentionToUpdate = idToMention.get(mentionID); CoreNLPProtos.Mention correspondingProtoMention = idToProtoMention.get(mentionID); if (!correspondingProtoMention.hasSpeakerInfo()) { // keep speakerInfo null for this Mention if it didn't store a speakerInfo // so just continue to next Mention continue; } // if we're here we know a speakerInfo was stored SpeakerInfo speakerInfo = fromProto(correspondingProtoMention.getSpeakerInfo()); // go through all ids stored for the speakerInfo in its mentions list, and get the Mention // Mentions are stored by MentionID , MentionID should be set by MentionAnnotator // MentionID is ID in document, 0, 1, 2, etc... for (int speakerInfoMentionID : correspondingProtoMention.getSpeakerInfo().getMentionsList()) { speakerInfo.addMention(idToMention.get(speakerInfoMentionID)); } // now the SpeakerInfo for this Mention should be fully restored mentionToUpdate.speakerInfo = speakerInfo; } // add section info ann.set(SectionsAnnotation.class, new ArrayList<CoreMap>()); for (CoreNLPProtos.Section section : proto.getSectionsList()) { ann.get(SectionsAnnotation.class).add(fromProto(section, ann.get(SentencesAnnotation.class))); } // Return return ann; } /** * Retrieve a Tree object from a saved protobuf. * This is not intended to be used on its own, but it is safe (lossless) to do so and therefore it is * left visible. * * @param proto The serialized tree. * @return A Tree object corresponding to the saved tree. 
This will always be a {@link LabeledScoredTreeNode}. */ public Tree fromProto(CoreNLPProtos.ParseTree proto) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } LabeledScoredTreeNode node = new LabeledScoredTreeNode(); // Set label if (proto.hasValue()) { CoreLabel value = new CoreLabel(); value.setCategory(proto.getValue()); value.setValue(proto.getValue()); node.setLabel(value); // Set span if (proto.hasYieldBeginIndex() && proto.hasYieldEndIndex()) { IntPair span = new IntPair(proto.getYieldBeginIndex(), proto.getYieldEndIndex()); value.set(SpanAnnotation.class, span); } // Set sentiment if (proto.hasSentiment()) { value.set(RNNCoreAnnotations.PredictedClass.class, proto.getSentiment().getNumber()); } } // Set score if (proto.hasScore()) { node.setScore(proto.getScore()); } // Set children Tree[] children = new LabeledScoredTreeNode[proto.getChildCount()]; for (int i = 0; i < children.length; ++i) { children[i] = fromProto(proto.getChild(i)); } node.setChildren(children); // Return return node; } /** * Return a CoreNLP language from a Protobuf language */ public static Language fromProto(CoreNLPProtos.Language lang) { switch (lang) { case Arabic: return Language.Arabic; case Chinese: return Language.Chinese; case English: return Language.English; case German: return Language.German; case French: return Language.French; case Hebrew: return Language.Hebrew; case Spanish: return Language.Spanish; case UniversalChinese: return Language.UniversalChinese; case UniversalEnglish: return Language.UniversalEnglish; case Unknown: return Language.Unknown; case Any: return Language.Any; default: throw new IllegalStateException("Unknown language: " + lang); } } /** * Return a CoreNLP Operator (Natural Logic operator) from a Protobuf operator */ public static OperatorSpec fromProto(CoreNLPProtos.Operator operator) { String opName = operator.getName().toLowerCase(); Operator op = null; for (Operator candidate : Operator.values()) { if (candidate.name().toLowerCase().equals(opName)) { op = candidate; break; } } return new OperatorSpec(op, operator.getQuantifierSpanBegin(), operator.getQuantifierSpanEnd(), operator.getSubjectSpanBegin(), operator.getSubjectSpanEnd(), operator.getObjectSpanBegin(), operator.getObjectSpanEnd()); } /** * Return a CoreNLP Polarity (Natural Logic polarity) from a Protobuf polarity */ public static Polarity fromProto(CoreNLPProtos.Polarity polarity) { byte[] projectionFn = new byte[7]; projectionFn[0] = (byte) polarity.getProjectEquivalence().getNumber(); projectionFn[1] = (byte) polarity.getProjectForwardEntailment().getNumber(); projectionFn[2] = (byte) polarity.getProjectReverseEntailment().getNumber(); projectionFn[3] = (byte) polarity.getProjectNegation().getNumber(); projectionFn[4] = (byte) polarity.getProjectAlternation().getNumber(); projectionFn[5] = (byte) polarity.getProjectCover().getNumber(); projectionFn[6] = (byte) polarity.getProjectIndependence().getNumber(); return new Polarity(projectionFn); } /** * Deserialize a dependency tree, allowing for cross-sentence arcs. * This is primarily here for deserializing OpenIE triples.
* * @see ProtobufAnnotationSerializer#fromProto(CoreNLPProtos.DependencyGraph, List, String) */ private static SemanticGraph fromProto(CoreNLPProtos.DependencyGraph proto, List<CoreLabel> sentence, String docid, Optional<Annotation> document) { SemanticGraph graph = new SemanticGraph(); // first construct the actual nodes; keep them indexed by their index // This block is optimized as one of the places which take noticeable time // in datum caching int min = Integer.MAX_VALUE; int max = Integer.MIN_VALUE; for(CoreNLPProtos.DependencyGraph.Node in: proto.getNodeList()){ min = in.getIndex() < min ? in.getIndex() : min; max = in.getIndex() > max ? in.getIndex() : max; } TwoDimensionalMap<Integer, Integer, IndexedWord> nodes = TwoDimensionalMap.hashMap(); for(CoreNLPProtos.DependencyGraph.Node in: proto.getNodeList()){ CoreLabel token; if (document.isPresent()) { token = document.get().get(SentencesAnnotation.class).get(in.getSentenceIndex()).get(TokensAnnotation.class).get(in.getIndex() - 1); // token index starts at 1! } else { token = sentence.get(in.getIndex() - 1); // index starts at 1! } IndexedWord word; if (in.hasCopyAnnotation() && in.getCopyAnnotation() > 0) { // TODO: if we make a copy wrapper CoreLabel, use it here instead word = new IndexedWord(new CoreLabel(token)); word.setCopyCount(in.getCopyAnnotation()); } else { word = new IndexedWord(token); } // for backwards compatibility - new annotations should have // these fields set, but annotations older than August 2014 might not if (word.docID() == null && docid != null) { word.setDocID(docid); } if (word.sentIndex() < 0 && in.getSentenceIndex() >= 0) { word.setSentIndex(in.getSentenceIndex()); } if (word.index() < 0 && in.getIndex() >= 0) { word.setIndex(in.getIndex()); } assert in.getIndex() == word.index(); nodes.put(in.getIndex(), in.getCopyAnnotation(), word); graph.addVertex(word); } // add all edges to the actual graph for(CoreNLPProtos.DependencyGraph.Edge ie: proto.getEdgeList()){ IndexedWord source = nodes.get(ie.getSource(), ie.getSourceCopy()); assert(source != null); IndexedWord target = nodes.get(ie.getTarget(), ie.getTargetCopy()); assert(target != null); synchronized (globalLock) { // this is not thread-safe: there are static fields in GrammaticalRelation assert ie.hasDep(); GrammaticalRelation rel = GrammaticalRelation.valueOf(fromProto(ie.getLanguage()), ie.getDep()); graph.addEdge(source, target, rel, 1.0, ie.hasIsExtra() && ie.getIsExtra()); } } if (proto.getRootCount() > 0) { Collection<IndexedWord> roots = proto.getRootList().stream().map(rootI -> nodes.get(rootI, 0)).collect(Collectors.toList()); graph.setRoots(roots); } else { // Roots were not saved away // compute root nodes if non-empty if(!graph.isEmpty()){ graph.resetRoots(); } } return graph; } /** * Voodoo magic to convert a serialized dependency graph into a {@link SemanticGraph}. * This method is intended to be called only from the {@link ProtobufAnnotationSerializer#fromProto(CoreNLPProtos.Document)} * method. * * @param proto The serialized representation of the graph. This relies heavily on indexing into the original document. * @param sentence The raw sentence that this graph was saved from must be provided, as it is not saved in the serialized * representation. * @param docid A docid must be supplied, as it is not saved by the serialized representation. * @return A semantic graph corresponding to the saved object, on the provided sentence. 
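* <p>As a rough illustration, a usage sketch (the variable names here are hypothetical, not part of the API):</p>
* <pre>
*   List&lt;CoreLabel&gt; sentenceTokens = sentenceMap.get(TokensAnnotation.class);
*   SemanticGraph graph = fromProto(sentenceProto.getBasicDependencies(), sentenceTokens, docid);
* </pre>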
*/ public static SemanticGraph fromProto(CoreNLPProtos.DependencyGraph proto, List<CoreLabel> sentence, String docid) { return fromProto(proto, sentence, docid, Optional.empty()); } /** * Return a {@link RelationTriple} object from the serialized representation. * This requires a sentence and a document so that * (1) we have a docid with which the dependency tree can be accurately rebuilt, * and (2) we have references to the tokens to include in the relation triple. * * @param proto The serialized relation triple. * @param doc The document we are deserializing. This document should already * have a docid annotation set, if there is one. * @param sentenceIndex The index of the sentence this extraction should be attached to. * * @return A relation triple as a Java object, corresponding to the serialized proto. */ public static RelationTriple fromProto(CoreNLPProtos.RelationTriple proto, Annotation doc, int sentenceIndex) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } // Get the spans for the extraction List<CoreLabel> subject = proto.getSubjectTokensList().stream().map(loc -> doc.get(SentencesAnnotation.class).get(loc.getSentenceIndex()).get(TokensAnnotation.class).get(loc.getTokenIndex()) ).collect(Collectors.toList()); List<CoreLabel> relation; if (proto.getRelationTokensCount() == 0) { // If we don't have a real span for the relation, make a dummy word relation = Collections.singletonList(new CoreLabel(new Word(proto.getRelation()))); } else { relation = proto.getRelationTokensList().stream().map(loc -> doc.get(SentencesAnnotation.class).get(loc.getSentenceIndex()).get(TokensAnnotation.class).get(loc.getTokenIndex()) ).collect(Collectors.toList()); } List<CoreLabel> object = proto.getObjectTokensList().stream().map(loc -> doc.get(SentencesAnnotation.class).get(loc.getSentenceIndex()).get(TokensAnnotation.class).get(loc.getTokenIndex()) ).collect(Collectors.toList()); // Create the extraction RelationTriple extraction; double confidence = proto.getConfidence(); if (proto.hasTree()) { SemanticGraph tree = fromProto( proto.getTree(), doc.get(SentencesAnnotation.class).get(sentenceIndex).get(TokensAnnotation.class), doc.get(DocIDAnnotation.class), Optional.of(doc)); extraction = new RelationTriple.WithTree(subject, relation, object, tree, confidence); } else { extraction = new RelationTriple(subject, relation, object, confidence); } // Tweak the extraction if (proto.hasIstmod()) { extraction.istmod(proto.getIstmod()); } if (proto.hasPrefixBe()) { extraction.isPrefixBe(proto.getPrefixBe()); } if (proto.hasSuffixBe()) { extraction.isSuffixBe(proto.getSuffixBe()); } if (proto.hasSuffixOf()) { extraction.isSuffixOf(proto.getSuffixOf()); } // Return return extraction; } /** * Returns a sentence fragment from a given protocol buffer and an associated dependency graph. * * @param fragment The saved sentence fragment. * @param tree The dependency graph for the whole sentence. * * @return A {@link SentenceFragment} object corresponding to the saved proto.
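* <p>A minimal call-site sketch (hypothetical variables; any of the sentence's dependency graphs may be used):</p>
* <pre>
*   SemanticGraph wholeSentence = sentenceMap.get(EnhancedPlusPlusDependenciesAnnotation.class);
*   SentenceFragment fragment = fromProto(fragmentProto, wholeSentence);
* </pre>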
*/ public static SentenceFragment fromProto(CoreNLPProtos.SentenceFragment fragment, SemanticGraph tree) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } SemanticGraph fragmentTree = new SemanticGraph(tree); // Set the new root if (fragment.hasRoot()) { fragmentTree.resetRoots(); fragmentTree.vertexSet().stream() .filter(vertex -> vertex.index() - 1 == fragment.getRoot()) .forEach(fragmentTree::setRoot); } // Set the new vertices Set<Integer> keptIndices = new HashSet<>(fragment.getTokenIndexList()); tree.vertexSet().stream() .filter(vertex -> !keptIndices.contains(vertex.index() - 1)) .forEach(fragmentTree::removeVertex); // Apparently this sometimes screws up the tree fragmentTree.vertexSet().stream() .filter(vertex -> fragmentTree.getFirstRoot() != vertex && tree.getFirstRoot() != vertex && !fragmentTree.incomingEdgeIterable(vertex).iterator().hasNext()) .forEach(vertex -> { SemanticGraphEdge edge = tree.incomingEdgeIterable(vertex).iterator().next(); fragmentTree.addEdge(fragmentTree.getFirstRoot(), edge.getDependent(), edge.getRelation(), edge.getWeight(), edge.isExtra()); }); // Return the fragment //noinspection SimplifiableConditionalExpression return new SentenceFragment(fragmentTree, fragment.hasAssumedTruth() ? fragment.getAssumedTruth() : true, false) .changeScore(fragment.hasScore() ? fragment.getScore() : 1.0); } /** * Convert a serialized Map back into a Java Map. * * @param proto The serialized map. * * @return A Java Map corresponding to the serialized map. */ public static HashMap<String, String> fromProto(CoreNLPProtos.MapStringString proto) { HashMap<String, String> map = new HashMap<>(); for (int i = 0; i < proto.getKeyCount(); ++i) { map.put(proto.getKey(i), proto.getValue(i)); } return map; } /** * Convert a serialized Map back into a Java Map. * * @param proto The serialized map. * * @return A Java Map corresponding to the serialized map. */ public static HashMap<Integer, String> fromProto(CoreNLPProtos.MapIntString proto) { HashMap<Integer, String> map = new HashMap<>(); for (int i = 0; i < proto.getKeyCount(); ++i) { map.put(proto.getKey(i), proto.getValue(i)); } return map; } /** * Read a CorefChain from its serialized representation. * This is private due to the need for an additional partial document. Also, why on Earth are you trying to use * this on its own anyways? * @param proto The serialized representation of the coref chain, missing information on its mention span string. * @param partialDocument A partial document, which must contain {@link SentencesAnnotation} and {@link TokensAnnotation} in * order to fill in the mention span strings. * @return A coreference chain. 
*/ private CorefChain fromProto(CoreNLPProtos.CorefChain proto, Annotation partialDocument) { // Get chain ID int cid = proto.getChainID(); // Get mentions Map<IntPair, Set<CorefChain.CorefMention>> mentions = new HashMap<>(); CorefChain.CorefMention representative = null; for (int i = 0; i < proto.getMentionCount(); ++i) { if (Thread.interrupted()) { throw new RuntimeInterruptedException(); } CoreNLPProtos.CorefChain.CorefMention mentionProto = proto.getMention(i); // Create mention StringBuilder mentionSpan = new StringBuilder(); List<CoreLabel> sentenceTokens = partialDocument.get(SentencesAnnotation.class).get(mentionProto.getSentenceIndex()).get(TokensAnnotation.class); for (int k = mentionProto.getBeginIndex(); k < mentionProto.getEndIndex(); ++k) { mentionSpan.append(" ").append(sentenceTokens.get(k).word()); } // Set the coref cluster id for the token CorefChain.CorefMention mention = new CorefChain.CorefMention( Dictionaries.MentionType.valueOf(mentionProto.getMentionType()), Dictionaries.Number.valueOf(mentionProto.getNumber()), Dictionaries.Gender.valueOf(mentionProto.getGender()), Dictionaries.Animacy.valueOf(mentionProto.getAnimacy()), mentionProto.getBeginIndex() + 1, mentionProto.getEndIndex() + 1, mentionProto.getHeadIndex() + 1, cid, mentionProto.getMentionID(), mentionProto.getSentenceIndex() + 1, new IntTuple(new int[]{ mentionProto.getSentenceIndex() + 1, mentionProto.getPosition() }), mentionSpan.substring(mentionSpan.length() > 0 ? 1 : 0)); // Register mention IntPair key = new IntPair(mentionProto.getSentenceIndex() - 1, mentionProto.getHeadIndex() - 1); if (!mentions.containsKey(key)) { mentions.put(key, new HashSet<>()); } mentions.get(key).add(mention); // Check for representative if (proto.hasRepresentative() && i == proto.getRepresentative()) { representative = mention; } } // Return return new CorefChain(cid, mentions, representative); } private Mention fromProtoNoTokens(CoreNLPProtos.Mention protoMention) { Mention returnMention = new Mention(); // set enums if (protoMention.getMentionType() != null && !protoMention.getMentionType().equals("")) { returnMention.mentionType = Dictionaries.MentionType.valueOf(protoMention.getMentionType()); } if (protoMention.getNumber() != null && !protoMention.getNumber().equals("")) { returnMention.number = Dictionaries.Number.valueOf(protoMention.getNumber()); } if (protoMention.getGender() != null && !protoMention.getGender().equals("")) { returnMention.gender = Dictionaries.Gender.valueOf(protoMention.getGender()); } if (protoMention.getAnimacy() != null && !protoMention.getAnimacy().equals("")) { returnMention.animacy = Dictionaries.Animacy.valueOf(protoMention.getAnimacy()); } if (protoMention.getPerson() != null && !protoMention.getPerson().equals("")) { returnMention.person = Dictionaries.Person.valueOf(protoMention.getPerson()); } // TO DO: if the original Mention had "" for this field it will be lost, should deal with this problem if (!protoMention.getHeadString().equals("")) { returnMention.headString = protoMention.getHeadString(); } // TO DO: if the original Mention had "" for this field it will be lost, should deal with this problem if (!protoMention.getNerString().equals("")) { returnMention.nerString = protoMention.getNerString(); } returnMention.startIndex = protoMention.getStartIndex(); returnMention.endIndex = protoMention.getEndIndex(); returnMention.headIndex = protoMention.getHeadIndex(); returnMention.mentionID = protoMention.getMentionID(); returnMention.originalRef = protoMention.getOriginalRef(); 
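// the remaining scalar fields are copied over verbatim from the proto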
returnMention.goldCorefClusterID = protoMention.getGoldCorefClusterID(); returnMention.corefClusterID = protoMention.getCorefClusterID(); returnMention.mentionNum = protoMention.getMentionNum(); returnMention.sentNum = protoMention.getSentNum(); returnMention.utter = protoMention.getUtter(); returnMention.paragraph = protoMention.getParagraph(); returnMention.isSubject = protoMention.getIsSubject(); returnMention.isDirectObject = protoMention.getIsDirectObject(); returnMention.isIndirectObject = protoMention.getIsIndirectObject(); returnMention.isPrepositionObject = protoMention.getIsPrepositionObject(); returnMention.hasTwin = protoMention.getHasTwin(); returnMention.generic = protoMention.getGeneric(); returnMention.isSingleton = protoMention.getIsSingleton(); // handle the sets of Strings if (protoMention.getDependentsCount() != 0) { returnMention.dependents = new HashSet<>(); returnMention.dependents.addAll(protoMention.getDependentsList()); } if (protoMention.getPreprocessedTermsCount() != 0) { returnMention.preprocessedTerms = new ArrayList<>(); returnMention.preprocessedTerms.addAll(protoMention.getPreprocessedTermsList()); } return returnMention; } private SpeakerInfo fromProto(CoreNLPProtos.SpeakerInfo speakerInfo) { String speakerName = speakerInfo.getSpeakerName(); return new SpeakerInfo(speakerName); } /** * Create an internal Timex object from the serialized protocol buffer. * @param proto The serialized protocol buffer to read from. * @return A timex, with as much information filled in as was gleaned from the protocol buffer. */ private Timex fromProto(CoreNLPProtos.Timex proto) { return new Timex( proto.hasType() ? proto.getType() : null, proto.hasValue() ? proto.getValue() : null, proto.hasAltValue() ? proto.getAltValue() : null, proto.hasTid() ? proto.getTid() : null, proto.hasText() ? proto.getText() : null, proto.hasBeginPoint() ? proto.getBeginPoint() : -1, proto.hasEndPoint() ? proto.getEndPoint() : -1); } /** * Read an entity mention from its serialized form. Requires the containing sentence to be * passed in along with the protocol buffer. * @param proto The serialized entity mention. * @param sentence The sentence this mention is attached to. * @return The entity mention corresponding to the serialized object. */ private EntityMention fromProto(CoreNLPProtos.Entity proto, CoreMap sentence) { EntityMention rtn = new EntityMention( proto.hasObjectID() ? proto.getObjectID() : null, sentence, proto.hasHeadStart() ? new Span(proto.getHeadStart(), proto.getHeadEnd()) : null, proto.hasExtentStart() ? new Span(proto.getExtentStart(), proto.getExtentEnd()) : null, proto.hasType() ? proto.getType() : null, proto.hasSubtype() ? proto.getSubtype() : null, proto.hasMentionType() ? proto.getMentionType() : null ); if (proto.hasNormalizedName()) { rtn.setNormalizedName(proto.getNormalizedName()); } if (proto.hasHeadTokenIndex()) { rtn.setHeadTokenPosition(proto.getHeadTokenIndex()); } if (proto.hasCorefID()) { rtn.setCorefID(proto.getCorefID()); } return rtn; } /** * Read a relation mention from its serialized form. Requires the containing sentence to be * passed in along with the protocol buffer. * @param proto The serialized relation mention. * @param sentence The sentence this mention is attached to. * @return The relation mention corresponding to the serialized object.
*/ private RelationMention fromProto(CoreNLPProtos.Relation proto, CoreMap sentence) { List<ExtractionObject> args = proto.getArgList().stream().map(arg -> fromProto(arg, sentence)).collect(Collectors.toList()); RelationMention rtn = new RelationMention( proto.hasObjectID() ? proto.getObjectID() : null, sentence, proto.hasExtentStart() ? new Span(proto.getExtentStart(), proto.getExtentEnd()) : null, proto.hasType() ? proto.getType() : null, proto.hasSubtype() ? proto.getSubtype() : null, args); if (proto.hasSignature()) { rtn.setSignature(proto.getSignature()); } if (proto.getArgNameCount() > 0 || proto.getArgCount() == 0) { rtn.setArgNames(proto.getArgNameList()); } return rtn; } /** * Read a quote annotation from its serialized protocol buffer form. */ @SuppressWarnings("UnusedParameters") private static Annotation fromProto(CoreNLPProtos.Quote quote, List<CoreLabel> tokens) { List<CoreLabel> quotedTokens = null; // note[gabor]: This works, but apparently isn't the behavior of the quote annotator? // if (quote.hasTokenBegin() && quote.hasTokenEnd() && quote.getTokenBegin() >= 0 && quote.getTokenEnd() >= 0) { // quotedTokens = tokens.subList(quote.getTokenBegin(), quote.getTokenEnd()); // } @SuppressWarnings("ConstantConditions") Annotation ann = QuoteAnnotator.makeQuote( quote.hasText() ? quote.getText() : null, quote.hasBegin() ? quote.getBegin() : -1, quote.hasEnd() ? quote.getEnd() : -1, quotedTokens, quote.hasTokenBegin() ? quote.getTokenBegin() : -1, quote.hasSentenceBegin() ? quote.getSentenceBegin() : -1, quote.hasSentenceEnd() ? quote.getSentenceEnd() : -1, quote.hasDocid() ? quote.getDocid() : null); if (quote.hasIndex()) { ann.set(QuotationIndexAnnotation.class, quote.getIndex()); } if (quote.hasTokenBegin()) { ann.set(TokenBeginAnnotation.class, quote.getTokenBegin()); } if (quote.hasTokenEnd()) { ann.set(TokenEndAnnotation.class, quote.getTokenEnd()); } return ann; } /** * Read an NER mention from its serialized protocol buffer form. */ @SuppressWarnings("UnusedParameters") private CoreMap fromProto(CoreNLPProtos.NERMention mention) { CoreMap map = new ArrayCoreMap(); if (mention.hasSentenceIndex()) map.set(SentenceIndexAnnotation.class, mention.getSentenceIndex()); if (mention.hasTokenStartInSentenceInclusive()) map.set(TokenBeginAnnotation.class, mention.getTokenStartInSentenceInclusive()); if (mention.hasTokenEndInSentenceExclusive()) map.set(TokenEndAnnotation.class, mention.getTokenEndInSentenceExclusive()); if (mention.hasNer()) map.set(NamedEntityTagAnnotation.class, mention.getNer()); if (mention.hasNormalizedNER()) map.set(NormalizedNamedEntityTagAnnotation.class, mention.getNormalizedNER()); if (mention.hasEntityType()) map.set(EntityTypeAnnotation.class, mention.getEntityType()); if (mention.hasTimex()) map.set(TimexAnnotation.class, fromProto(mention.getTimex())); if (mention.hasWikipediaEntity()) map.set(WikipediaEntityAnnotation.class, mention.getWikipediaEntity()); return map; } /** * Read a section coremap from its serialized form. Requires the document's sentences to be * passed in along with the protocol buffer. * @param section The serialized section coremap * @param annotationSentences The sentences of the document, used to fill in this section's sentence list. * @return The section coremap corresponding to the serialized object.
*/ private CoreMap fromProto(CoreNLPProtos.Section section, List<CoreMap> annotationSentences) { CoreMap map = new ArrayCoreMap(); map.set(CharacterOffsetBeginAnnotation.class, section.getCharBegin()); map.set(CharacterOffsetEndAnnotation.class, section.getCharEnd()); if (section.hasAuthor()) map.set(AuthorAnnotation.class, section.getAuthor()); if (section.hasDatetime()) map.set(SectionDateAnnotation.class, section.getDatetime()); // go through the list of sentences and add them to this section's sentence list ArrayList<CoreMap> sentencesList = new ArrayList<>(); for (int sentenceIndex : section.getSentenceIndexesList()) { sentencesList.add(annotationSentences.get(sentenceIndex)); } map.set(SentencesAnnotation.class, sentencesList); return map; } /** * Recover the {@link edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation} field of a sentence * from the tokens. This is useful if the text was not set in the protocol buffer, and therefore * needs to be reconstructed from tokens. * * @param tokens The list of tokens representing this sentence. * @param sentence The serialized sentence this text belongs to (currently unused). * @return The original text of the sentence. */ protected String recoverOriginalText(List<CoreLabel> tokens, CoreNLPProtos.Sentence sentence) { StringBuilder text = new StringBuilder(); CoreLabel last = null; if (tokens.size() > 0) { CoreLabel token = tokens.get(0); if (token.originalText() != null) { text.append(token.originalText()); } else { text.append(token.word()); } last = tokens.get(0); } for (int i = 1; i < tokens.size(); ++i) { CoreLabel token = tokens.get(i); if (token.before() != null) { text.append(token.before()); assert last != null; int missingWhitespace = (token.beginPosition() - last.endPosition()) - token.before().length(); while (missingWhitespace > 0) { text.append(' '); missingWhitespace -= 1; } } if (token.originalText() != null) { text.append(token.originalText()); } else { text.append(token.word()); } last = token; } return text.toString(); } }