package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.*;
import java.util.function.Function;
/**
* Annotator that marks entity mentions in a document.
* Entity mentions are:
* <ul>
* <li> Named entities (identified by NER) </li>
* <li> Quantifiable entities
* <ul>
* <li> Times (identified by TimeAnnotator) </li>
* <li> Measurements (identified by ???) </li>
* </ul>
* </li>
* </ul>
*
* Each sentence is annotated with a list of the mentions
* (MentionsAnnotation as a list of CoreMap).
*
* @author Angel Chang
*/
public class EntityMentionsAnnotator implements Annotator {
// Currently relies on NER annotations being okay
// - Replace with calling NER classifiers and timeAnnotator directly
private final LabeledChunkIdentifier chunkIdentifier;
/**
* If true, heuristically search for organization acronyms, even if they are not marked
* explicitly by an NER tag.
* This is super useful (+20% recall) for KBP.
*/
private final boolean doAcronyms;
// TODO: Provide properties
public static PropertiesUtils.Property[] SUPPORTED_PROPERTIES = new PropertiesUtils.Property[]{};
/** the CoreAnnotation keys to use for this entity mentions annotator **/
private Class<? extends CoreAnnotation<String>> nerCoreAnnotationClass =
CoreAnnotations.NamedEntityTagAnnotation.class;
private Class<? extends CoreAnnotation<String>> nerNormalizedCoreAnnotationClass =
CoreAnnotations.NormalizedNamedEntityTagAnnotation.class;
private Class<? extends CoreAnnotation<List<CoreMap>>> mentionsCoreAnnotationClass =
CoreAnnotations.MentionsAnnotation.class;
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(EntityMentionsAnnotator.class);
public EntityMentionsAnnotator() {
// defaults
chunkIdentifier = new LabeledChunkIdentifier();
doAcronyms = false;
}
// note: used in annotate.properties
@SuppressWarnings({"UnusedDeclaration", "unchecked"})
public EntityMentionsAnnotator(String name, Properties props) {
// if the user has supplied custom CoreAnnotations for the ner tags and entity mentions override the default keys
try {
if (props.containsKey(name + ".nerCoreAnnotation")) {
nerCoreAnnotationClass =
(Class<? extends CoreAnnotation<String>>)
Class.forName(props.getProperty(name + ".nerCoreAnnotation"));
}
if (props.containsKey(name + ".nerNormalizedCoreAnnotation")) {
nerNormalizedCoreAnnotationClass =
(Class<? extends CoreAnnotation<String>>)
Class.forName(props.getProperty(name + ".nerNormalizedCoreAnnotation"));
}
if (props.containsKey(name + ".mentionsCoreAnnotation")) {
mentionsCoreAnnotationClass =
(Class<? extends CoreAnnotation<List<CoreMap>>>)
Class.forName(props.getProperty(name + ".mentionsCoreAnnotation"));
}
} catch (ClassNotFoundException e) {
log.error(e.getMessage());
}
chunkIdentifier = new LabeledChunkIdentifier();
doAcronyms = Boolean.parseBoolean(props.getProperty(name + ".acronyms", props.getProperty("acronyms", "false")));
}
private static boolean checkStrings(String s1, String s2) {
if (s1 == null || s2 == null) {
return Objects.equals(s1, s2);
} else {
return s1.equals(s2);
}
}
private static boolean checkNumbers(Number n1, Number n2) {
if (n1 == null || n2 == null) {
return Objects.equals(n1, n2);
} else {
return n1.equals(n2);
}
}
private final Function<Pair<CoreLabel,CoreLabel>, Boolean> IS_TOKENS_COMPATIBLE = in -> {
// First argument is the current token
CoreLabel cur = in.first;
// Second argument the previous token
CoreLabel prev = in.second;
if (cur == null || prev == null) {
return false;
}
// Get NormalizedNamedEntityTag and say two entities are incompatible if they are different
String v1 = cur.get(nerNormalizedCoreAnnotationClass);
String v2 = prev.get(nerNormalizedCoreAnnotationClass);
if ( ! checkStrings(v1,v2)) return false;
// This duplicates logic in the QuantifiableEntityNormalizer (but maybe we will get rid of that class)
String nerTag = cur.get(nerCoreAnnotationClass);
if ("NUMBER".equals(nerTag) || "ORDINAL".equals(nerTag)) {
// Get NumericCompositeValueAnnotation and say two entities are incompatible if they are different
Number n1 = cur.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
Number n2 = prev.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
if ( ! checkNumbers(n1,n2)) return false;
}
// Check timex...
if ("TIME".equals(nerTag) || "SET".equals(nerTag) || "DATE".equals(nerTag) || "DURATION".equals(nerTag)) {
Timex timex1 = cur.get(TimeAnnotations.TimexAnnotation.class);
Timex timex2 = prev.get(TimeAnnotations.TimexAnnotation.class);
String tid1 = (timex1 != null)? timex1.tid():null;
String tid2 = (timex2 != null)? timex2.tid():null;
if ( ! checkStrings(tid1,tid2)) return false;
}
return true;
};
@Override
public void annotate(Annotation annotation) {
List<CoreMap> allMentions = new ArrayList<>();
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
int sentenceIndex = 0;
for (CoreMap sentence : sentences) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
Integer annoTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
if (annoTokenBegin == null) {
annoTokenBegin = 0;
}
List<CoreMap> chunks = chunkIdentifier.getAnnotatedChunks(tokens, annoTokenBegin,
CoreAnnotations.TextAnnotation.class, nerCoreAnnotationClass, IS_TOKENS_COMPATIBLE);
sentence.set(mentionsCoreAnnotationClass, chunks);
// By now entity mentions have been annotated and TextAnnotation and NamedEntityAnnotation marked
// Some additional annotations
List<CoreMap> mentions = sentence.get(mentionsCoreAnnotationClass);
if (mentions != null) {
for (CoreMap mention : mentions) {
List<CoreLabel> mentionTokens = mention.get(CoreAnnotations.TokensAnnotation.class);
String name = (String) CoreMapAttributeAggregator.FIRST_NON_NIL.aggregate(
nerNormalizedCoreAnnotationClass, mentionTokens);
if (name == null) {
name = mention.get(CoreAnnotations.TextAnnotation.class);
} else {
mention.set(nerNormalizedCoreAnnotationClass, name);
}
//mention.set(CoreAnnotations.EntityNameAnnotation.class, name);
String type = mention.get(nerCoreAnnotationClass);
mention.set(CoreAnnotations.EntityTypeAnnotation.class, type);
// set sentence index annotation for mention
mention.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
// Take first non nil as timex for the mention
Timex timex = (Timex) CoreMapAttributeAggregator.FIRST_NON_NIL.aggregate(
TimeAnnotations.TimexAnnotation.class, mentionTokens);
if (timex != null) {
mention.set(TimeAnnotations.TimexAnnotation.class, timex);
}
// Set the entity link from the tokens
if (mention.get(CoreAnnotations.WikipediaEntityAnnotation.class) == null) {
for (CoreLabel token : mentionTokens) {
if ( (mention.get(CoreAnnotations.WikipediaEntityAnnotation.class) == null ||
"O".equals(mention.get(CoreAnnotations.WikipediaEntityAnnotation.class))) &&
( token.get(CoreAnnotations.WikipediaEntityAnnotation.class) != null &&
!"O".equals(token.get(CoreAnnotations.WikipediaEntityAnnotation.class))) ) {
mention.set(CoreAnnotations.WikipediaEntityAnnotation.class, token.get(CoreAnnotations.WikipediaEntityAnnotation.class));
}
}
}
}
}
if (mentions != null) {
allMentions.addAll(mentions);
}
sentenceIndex++;
}
// Post-process with acronyms
if (doAcronyms) {
addAcronyms(annotation, allMentions);
}
annotation.set(mentionsCoreAnnotationClass, allMentions);
}
private void addAcronyms(Annotation ann, List<CoreMap> mentions) {
// Find all the organizations in a document
List<List<CoreLabel>> organizations = new ArrayList<>();
for (CoreMap mention : mentions) {
if ("ORGANIZATION".equals(mention.get(nerCoreAnnotationClass))) {
organizations.add(mention.get(CoreAnnotations.TokensAnnotation.class));
}
}
// Skip very long documents
if (organizations.size() > 100) { return; }
// Iterate over tokens...
for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
Integer totalTokensOffset = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
for (int i = 0; i < tokens.size(); ++i) {
// ... that look like they might be an acronym and are not already a mention
CoreLabel token = tokens.get(i);
if ("O".equals(token.ner()) && token.word().toUpperCase().equals(token.word()) && token.word().length() >= 3) {
for (List<CoreLabel> org : organizations) {
// ... and actually are an acronym
if (AcronymMatcher.isAcronym(token.word(), org)) {
// ... and add them.
// System.out.println("found ACRONYM ORG");
token.setNER("ORGANIZATION");
CoreMap chunk = ChunkAnnotationUtils.getAnnotatedChunk(tokens,
i, i + 1, totalTokensOffset, null, null, null);
chunk.set(CoreAnnotations.NamedEntityTagAnnotation.class,"ORGANIZATION");
mentions.add(chunk);
}
}
}
}
}
}
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
//TODO(jb) for now not fully enforcing pipeline if user customizes keys
if (!nerCoreAnnotationClass.getCanonicalName().
equals(CoreAnnotations.NamedEntityTagAnnotation.class.getCanonicalName())) {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.SentencesAnnotation.class
)));
} else {
return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.NamedEntityTagAnnotation.class
)));
}
}
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return Collections.singleton(mentionsCoreAnnotationClass);
}
}