//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.uima.jcas.JCas;
import com.google.common.base.Splitter;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;
import uk.gov.dstl.baleen.resources.utils.StopwordUtils;
import uk.gov.dstl.baleen.types.Base;
/**
 * Sieve based on looser matching of head terms.
 *
 * <p>Every ordered pair of entity mentions (a, b), with a appearing before b in the list returned
 * by {@code getMentions(MentionType.ENTITY)}, is tested by {@link #shouldAddToCluster(Mention, Mention)};
 * pairs that pass are handed to {@code addToCluster}.
 */
public class RelaxedHeadMatchSieve extends AbstractCoreferenceSieve {

  /** Splits on single spaces, trimming each token and discarding empty ones. */
  private static final Splitter WHITESPACE_SPLITTER = Splitter.on(" ").omitEmptyStrings().trimResults();

  /** Compiled pattern used to strip stopwords from mention text. */
  private final Pattern stopwordsPattern;

  /**
   * Constructor for RelaxedHeadMatchSieve
   *
   * @param jCas the JCas, passed through to the parent sieve
   * @param clusters the clusters, passed through to the parent sieve
   * @param mentions the mentions, passed through to the parent sieve
   * @param stopwords the words to be ignored when comparing the words of two mentions
   */
  public RelaxedHeadMatchSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions, Collection<String> stopwords) {
    super(jCas, clusters, mentions);
    // NOTE(review): the 'false' argument presumably requests case-insensitive matching - confirm
    // against StopwordUtils.buildStopwordPattern
    this.stopwordsPattern = StopwordUtils.buildStopwordPattern(stopwords, false);
  }

  /**
   * Compares each entity mention with every entity mention that follows it in the list and adds
   * the pair to a cluster when the relaxed head match criteria are satisfied.
   */
  @Override
  public void sieve() {
    final List<Mention> mentions = getMentions(MentionType.ENTITY);

    for (int i = 0; i < mentions.size(); i++) {
      final Mention a = mentions.get(i);

      for (int j = i + 1; j < mentions.size(); j++) {
        final Mention b = mentions.get(j);

        if (shouldAddToCluster(a, b)) {
          addToCluster(a, b);
        }
      }
    }
  }

  /**
   * Tests whether the non-stopwords of b are all present in the non-stopwords of a.
   *
   * @param a the earlier mention
   * @param b the later mention
   * @return true if both mentions have at least one non-stopword and every non-stopword of b
   *     also appears in a
   */
  // TODO: This should be at a cluster level
  private boolean hasSubsetOfNonStopWords(Mention a, Mention b) {
    final List<String> aNonStop = getNonStopWords(a);
    final List<String> bNonStop = getNonStopWords(b);

    // TODO: This should not include the head word? See the paper for clarification.

    // NOTE: This is ordered, a is earlier than b and it is unusual to introduce more information
    // to an entity later in the document

    // NOTE: We enforce that the set isn't empty otherwise we aren't really testing anything
    return !aNonStop.isEmpty() && !bNonStop.isEmpty() && aNonStop.containsAll(bNonStop);
  }

  /**
   * Lower-cases the text of the mention, removes stopwords and splits what is left on spaces.
   *
   * @param a the mention
   * @return the remaining words, in order of appearance
   */
  private List<String> getNonStopWords(Mention a) {
    return WHITESPACE_SPLITTER.splitToList(clean(a.getText().toLowerCase()));
  }

  /**
   * Removes everything matched by the stopword pattern from the text.
   *
   * @param text the text to clean
   * @return the text with all stopword matches deleted
   */
  private String clean(String text) {
    // Match with the compiled pattern itself: String.replaceAll(pattern.pattern(), ...) would
    // recompile the regex on every call and discard any flags the pattern was compiled with.
    return stopwordsPattern.matcher(text).replaceAll("");
  }

  /**
   * Decides whether two mentions satisfy the relaxed head match criteria.
   *
   * @param a the earlier mention
   * @param b the later mention
   * @return true only if b has a head, the mentions do not overlap, the class of one annotation
   *     is assignable from the class of the other, the non-stopwords of b are a subset of those
   *     of a, and the text of a contains the head of b
   */
  private boolean shouldAddToCluster(Mention a, Mention b) {
    if (!hasHead(b)) {
      return false;
    }

    // Not i-within-i
    if (a.overlaps(b)) {
      return false;
    }

    // We have the same or at least semantically same type of entity
    final Class<? extends Base> aClazz = a.getAnnotation().getClass();
    final Class<? extends Base> bClazz = b.getAnnotation().getClass();
    if (!aClazz.isAssignableFrom(bClazz) && !bClazz.isAssignableFrom(aClazz)) {
      return false;
    }

    // Word inclusion
    if (!hasSubsetOfNonStopWords(a, b)) {
      return false;
    }

    // Do we contain the head word? (case-sensitive substring test on the raw text)
    return a.getText().contains(b.getHead());
  }
}