package edu.stanford.nlp.pipeline; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.time.TimeAnnotations; import edu.stanford.nlp.time.Timex; import edu.stanford.nlp.util.*; import edu.stanford.nlp.util.logging.Redwood; import java.util.*; import java.util.function.Function; /** * Annotator that marks entity mentions in a document. * Entity mentions are: * <ul> * <li> Named entities (identified by NER) </li> * <li> Quantifiable entities * <ul> * <li> Times (identified by TimeAnnotator) </li> * <li> Measurements (identified by ???) </li> * </ul> * </li> * </ul> * * Each sentence is annotated with a list of the mentions * (MentionsAnnotation as a list of CoreMap). * * @author Angel Chang */ public class EntityMentionsAnnotator implements Annotator { // Currently relies on NER annotations being okay // - Replace with calling NER classifiers and timeAnnotator directly private final LabeledChunkIdentifier chunkIdentifier; /** * If true, heuristically search for organization acronyms, even if they are not marked * explicitly by an NER tag. * This is super useful (+20% recall) for KBP. */ private final boolean doAcronyms; // TODO: Provide properties public static PropertiesUtils.Property[] SUPPORTED_PROPERTIES = new PropertiesUtils.Property[]{}; /** the CoreAnnotation keys to use for this entity mentions annotator **/ private Class<? extends CoreAnnotation<String>> nerCoreAnnotationClass = CoreAnnotations.NamedEntityTagAnnotation.class; private Class<? extends CoreAnnotation<String>> nerNormalizedCoreAnnotationClass = CoreAnnotations.NormalizedNamedEntityTagAnnotation.class; private Class<? extends CoreAnnotation<List<CoreMap>>> mentionsCoreAnnotationClass = CoreAnnotations.MentionsAnnotation.class; /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(EntityMentionsAnnotator.class); public EntityMentionsAnnotator() { // defaults chunkIdentifier = new LabeledChunkIdentifier(); doAcronyms = false; } // note: used in annotate.properties @SuppressWarnings({"UnusedDeclaration", "unchecked"}) public EntityMentionsAnnotator(String name, Properties props) { // if the user has supplied custom CoreAnnotations for the ner tags and entity mentions override the default keys try { if (props.containsKey(name + ".nerCoreAnnotation")) { nerCoreAnnotationClass = (Class<? extends CoreAnnotation<String>>) Class.forName(props.getProperty(name + ".nerCoreAnnotation")); } if (props.containsKey(name + ".nerNormalizedCoreAnnotation")) { nerNormalizedCoreAnnotationClass = (Class<? extends CoreAnnotation<String>>) Class.forName(props.getProperty(name + ".nerNormalizedCoreAnnotation")); } if (props.containsKey(name + ".mentionsCoreAnnotation")) { mentionsCoreAnnotationClass = (Class<? extends CoreAnnotation<List<CoreMap>>>) Class.forName(props.getProperty(name + ".mentionsCoreAnnotation")); } } catch (ClassNotFoundException e) { log.error(e.getMessage()); } chunkIdentifier = new LabeledChunkIdentifier(); doAcronyms = Boolean.parseBoolean(props.getProperty(name + ".acronyms", props.getProperty("acronyms", "false"))); } private static boolean checkStrings(String s1, String s2) { if (s1 == null || s2 == null) { return Objects.equals(s1, s2); } else { return s1.equals(s2); } } private static boolean checkNumbers(Number n1, Number n2) { if (n1 == null || n2 == null) { return Objects.equals(n1, n2); } else { return n1.equals(n2); } } private final Function<Pair<CoreLabel,CoreLabel>, Boolean> IS_TOKENS_COMPATIBLE = in -> { // First argument is the current token CoreLabel cur = in.first; // Second argument the previous token CoreLabel prev = in.second; if (cur == null || prev == null) { return false; } // Get NormalizedNamedEntityTag and say two entities are incompatible if they are different String v1 = cur.get(nerNormalizedCoreAnnotationClass); String v2 = prev.get(nerNormalizedCoreAnnotationClass); if ( ! checkStrings(v1,v2)) return false; // This duplicates logic in the QuantifiableEntityNormalizer (but maybe we will get rid of that class) String nerTag = cur.get(nerCoreAnnotationClass); if ("NUMBER".equals(nerTag) || "ORDINAL".equals(nerTag)) { // Get NumericCompositeValueAnnotation and say two entities are incompatible if they are different Number n1 = cur.get(CoreAnnotations.NumericCompositeValueAnnotation.class); Number n2 = prev.get(CoreAnnotations.NumericCompositeValueAnnotation.class); if ( ! checkNumbers(n1,n2)) return false; } // Check timex... if ("TIME".equals(nerTag) || "SET".equals(nerTag) || "DATE".equals(nerTag) || "DURATION".equals(nerTag)) { Timex timex1 = cur.get(TimeAnnotations.TimexAnnotation.class); Timex timex2 = prev.get(TimeAnnotations.TimexAnnotation.class); String tid1 = (timex1 != null)? timex1.tid():null; String tid2 = (timex2 != null)? timex2.tid():null; if ( ! checkStrings(tid1,tid2)) return false; } return true; }; @Override public void annotate(Annotation annotation) { List<CoreMap> allMentions = new ArrayList<>(); List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); int sentenceIndex = 0; for (CoreMap sentence : sentences) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); Integer annoTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class); if (annoTokenBegin == null) { annoTokenBegin = 0; } List<CoreMap> chunks = chunkIdentifier.getAnnotatedChunks(tokens, annoTokenBegin, CoreAnnotations.TextAnnotation.class, nerCoreAnnotationClass, IS_TOKENS_COMPATIBLE); sentence.set(mentionsCoreAnnotationClass, chunks); // By now entity mentions have been annotated and TextAnnotation and NamedEntityAnnotation marked // Some additional annotations List<CoreMap> mentions = sentence.get(mentionsCoreAnnotationClass); if (mentions != null) { for (CoreMap mention : mentions) { List<CoreLabel> mentionTokens = mention.get(CoreAnnotations.TokensAnnotation.class); String name = (String) CoreMapAttributeAggregator.FIRST_NON_NIL.aggregate( nerNormalizedCoreAnnotationClass, mentionTokens); if (name == null) { name = mention.get(CoreAnnotations.TextAnnotation.class); } else { mention.set(nerNormalizedCoreAnnotationClass, name); } //mention.set(CoreAnnotations.EntityNameAnnotation.class, name); String type = mention.get(nerCoreAnnotationClass); mention.set(CoreAnnotations.EntityTypeAnnotation.class, type); // set sentence index annotation for mention mention.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex); // Take first non nil as timex for the mention Timex timex = (Timex) CoreMapAttributeAggregator.FIRST_NON_NIL.aggregate( TimeAnnotations.TimexAnnotation.class, mentionTokens); if (timex != null) { mention.set(TimeAnnotations.TimexAnnotation.class, timex); } // Set the entity link from the tokens if (mention.get(CoreAnnotations.WikipediaEntityAnnotation.class) == null) { for (CoreLabel token : mentionTokens) { if ( (mention.get(CoreAnnotations.WikipediaEntityAnnotation.class) == null || "O".equals(mention.get(CoreAnnotations.WikipediaEntityAnnotation.class))) && ( token.get(CoreAnnotations.WikipediaEntityAnnotation.class) != null && !"O".equals(token.get(CoreAnnotations.WikipediaEntityAnnotation.class))) ) { mention.set(CoreAnnotations.WikipediaEntityAnnotation.class, token.get(CoreAnnotations.WikipediaEntityAnnotation.class)); } } } } } if (mentions != null) { allMentions.addAll(mentions); } sentenceIndex++; } // Post-process with acronyms if (doAcronyms) { addAcronyms(annotation, allMentions); } annotation.set(mentionsCoreAnnotationClass, allMentions); } private void addAcronyms(Annotation ann, List<CoreMap> mentions) { // Find all the organizations in a document List<List<CoreLabel>> organizations = new ArrayList<>(); for (CoreMap mention : mentions) { if ("ORGANIZATION".equals(mention.get(nerCoreAnnotationClass))) { organizations.add(mention.get(CoreAnnotations.TokensAnnotation.class)); } } // Skip very long documents if (organizations.size() > 100) { return; } // Iterate over tokens... for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); Integer totalTokensOffset = sentence.get(CoreAnnotations.TokenBeginAnnotation.class); for (int i = 0; i < tokens.size(); ++i) { // ... that look like they might be an acronym and are not already a mention CoreLabel token = tokens.get(i); if ("O".equals(token.ner()) && token.word().toUpperCase().equals(token.word()) && token.word().length() >= 3) { for (List<CoreLabel> org : organizations) { // ... and actually are an acronym if (AcronymMatcher.isAcronym(token.word(), org)) { // ... and add them. // System.out.println("found ACRONYM ORG"); token.setNER("ORGANIZATION"); CoreMap chunk = ChunkAnnotationUtils.getAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null); chunk.set(CoreAnnotations.NamedEntityTagAnnotation.class,"ORGANIZATION"); mentions.add(chunk); } } } } } } @Override public Set<Class<? extends CoreAnnotation>> requires() { //TODO(jb) for now not fully enforcing pipeline if user customizes keys if (!nerCoreAnnotationClass.getCanonicalName(). equals(CoreAnnotations.NamedEntityTagAnnotation.class.getCanonicalName())) { return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList( CoreAnnotations.TokensAnnotation.class, CoreAnnotations.SentencesAnnotation.class ))); } else { return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList( CoreAnnotations.TokensAnnotation.class, CoreAnnotations.SentencesAnnotation.class, CoreAnnotations.NamedEntityTagAnnotation.class ))); } } @Override public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() { return Collections.singleton(mentionsCoreAnnotationClass); } }