//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.coreference.impl.data; import java.util.HashSet; import java.util.List; import java.util.Set; import org.springframework.util.StringUtils; import uk.gov.dstl.baleen.resources.data.Gender; import uk.gov.dstl.baleen.resources.data.Multiplicity; import uk.gov.dstl.baleen.types.Base; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.types.language.Sentence; import uk.gov.dstl.baleen.types.language.WordToken; import uk.gov.dstl.baleen.types.semantic.Entity; /** * A Mention that may be coreferenced. */ public class Mention { /** The annotation. */ private final Base annotation; /** The type. */ private final MentionType type; /** The clusters. */ private final Set<Cluster> clusters = new HashSet<>(); private Set<String> acronyms; private WordToken headWordToken; private List<WordToken> words; private Person person = Person.UNKNOWN; private Animacy animacy = Animacy.UNKNOWN; private Gender gender = Gender.UNKNOWN; private Multiplicity multiplicity = Multiplicity.UNKNOWN; private int sentenceIndex = Integer.MIN_VALUE; private Sentence sentence = null; private Mention(Base annotation, MentionType type) { this.annotation = annotation; this.type = type; } /** * Instantiates a new mention, of type PRONOUN. */ public Mention(WordToken annotation) { this(annotation, MentionType.PRONOUN); } /** * Instantiates a new mention, of type ENTITY. */ public Mention(Entity annotation) { this(annotation, MentionType.ENTITY); } /** * Instantiates a new mention, of type NP. */ public Mention(PhraseChunk annotation) { this(annotation, MentionType.NP); } /** * Gets the annotation. * * @return the annotation */ public Base getAnnotation() { return annotation; } /** * Gets the type. * * @return the type */ public MentionType getType() { return type; } /** * Gets the clusters * * @return the clusters */ public Set<Cluster> getClusters() { return clusters; } /** * @return true if there are clusters */ public boolean hasClusters() { return !clusters.isEmpty(); } /** * Returns any cluster, or null if no clusters are present */ public Cluster getAnyCluster() { if (!clusters.isEmpty()) { return clusters.iterator().next(); } else { return null; } } /** * Adds the to cluster - use cluster.add(mention) as this will not update the cluster. * * @param cluster * the cluster */ public void addToCluster(Cluster cluster) { clusters.add(cluster); } /** * Clear clusters - should not be used outside Coreference (will not remove from the cluster) */ public void clearClusters() { clusters.clear(); } /** * Get the covered text */ public String getText() { return annotation.getCoveredText(); } /** * Sets the head word token of the mention */ public void setHeadWordToken(WordToken headWordToken) { this.headWordToken = headWordToken; } /** * Returns the head word token */ public WordToken getHeadWordToken() { return headWordToken; } /** * Returns the head word as a string */ public String getHead() { return getHeadWordToken() != null ? getHeadWordToken().getCoveredText() : null; } /** * Returns true is the covered text contains no whitespace and is entirely upper case */ public boolean isAcronym() { return !StringUtils.containsWhitespace(getText()) && org.apache.commons.lang3.StringUtils.isAllUpperCase(getText()); } /** * Sets the acronyms associated with this mention */ public void setAcronym(Set<String> acronyms) { this.acronyms = acronyms; } /** * Returns the acronyms associated with this mention */ public Set<String> getAcronyms() { return acronyms; } /** * Returns true if the provided mention overlaps with this mention */ public boolean overlaps(Mention mention) { final Base a = getAnnotation(); final Base b = mention.getAnnotation(); return !(a.getEnd() < b.getBegin() || b.getEnd() < a.getBegin()); } @Override public String toString() { return getText() + " [" + type + "]"; } /** * Set the multiplicity of this mention */ public void setMultiplicity(Multiplicity multiplicity) { this.multiplicity = multiplicity; } /** * Get the multiplicity of this mention */ public Multiplicity getMultiplicity() { return multiplicity; } /** * Set the words associated with this mention */ public void setWords(List<WordToken> words) { this.words = words; } /** * Get the words associated with this mention */ public List<WordToken> getWords() { return words; } /** * Set the person associated with this mention */ public void setPerson(Person person) { this.person = person; } /** * Get the person associated with this mention */ public Person getPerson() { return person; } /** * Set the animacy associated with this mention */ public void setAnimacy(Animacy animacy) { this.animacy = animacy; } /** * Get the animacy associated with this mention */ public Animacy getAnimacy() { return animacy; } /** * Set the gender associated with this mention */ public Gender getGender() { return gender; } /** * Get the gender associated with this mention */ public void setGender(Gender gender) { this.gender = gender; } /** * Set the sentence index associated with this mention */ public void setSentenceIndex(int index) { this.sentenceIndex = index; } /** * Get the sentence index associated with this mention */ public int getSentenceIndex() { return sentenceIndex; } /** * Set the sentence associated with this mention */ public void setSentence(Sentence sentence) { this.sentence = sentence; } /** * Get the sentence associated with this mention */ public Sentence getSentence() { return sentence; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + (annotation == null ? 0 : annotation.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final Mention other = (Mention) obj; if (annotation == null) { if (other.annotation != null) { return false; } } else if (!annotation.equals(other.annotation)) { return false; } return true; } /** * Returns true if the provided mention has compatible attributes with this mention */ public boolean isAttributeCompatible(Mention b) { // The paper also mentions NER labels, but I can't see how they could be (other than what is // down in people) // eg is Person entity we have already have it as a Animate so it won't match "it". if (getType() == MentionType.ENTITY && b.getType() == MentionType.ENTITY) { Class<? extends Base> aClass = getAnnotation().getClass(); Class<? extends Base> bClass = b.getAnnotation().getClass(); // Stop if they are different types semantically // That could still mean you consider an Entity (super type) to a Person (sub type) // so could be even more strict here and want aClass = bClass. if (!aClass.isAssignableFrom(bClass) && !bClass.isAssignableFrom(aClass)) { return false; } } // You can be more or less lenient here.. // gender is our worst dataset so I think its safer to be lenient return Gender.lenientEquals(getGender(), b.getGender()) && Animacy.strictEquals(getAnimacy(), b.getAnimacy()) && Multiplicity.strictEquals(getMultiplicity(), b.getMultiplicity()) && Person.strictEquals(getPerson(), b.getPerson()); } }