//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.coreference.impl.sieves; import java.util.Collection; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import org.apache.uima.jcas.JCas; import com.google.common.base.Splitter; import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster; import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention; import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType; import uk.gov.dstl.baleen.resources.utils.StopwordUtils; /** * Head matching sieve that has controllable parameters. */ public class StrictHeadMatchSieve extends AbstractCoreferenceSieve { private final boolean compatibleModifiers; private final boolean wordInclusion; private final Pattern stopwordsPattern; private static final Splitter WHITESPACE_SPLITTER = Splitter.on(" ").omitEmptyStrings().trimResults(); /** * Constructor for StrictHeadMatchSieve */ public StrictHeadMatchSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions, boolean compatibleModifiers, boolean wordInclusion, Collection<String> stopwords) { super(jCas, clusters, mentions); this.compatibleModifiers = compatibleModifiers; this.wordInclusion = wordInclusion; this.stopwordsPattern = StopwordUtils.buildStopwordPattern(stopwords, false); } @Override public void sieve() { // TODO: We really need to work over clusters for this to make sense! List<Mention> mentions = getMentionsWithHead(MentionType.ENTITY, MentionType.NP); for (int i = 0; i < mentions.size(); i++) { final Mention a = mentions.get(i); for (int j = i + 1; j < mentions.size(); j++) { final Mention b = mentions.get(j); if(shouldAddToCluster(a, b)) addToCluster(a, b); } } } private boolean haveSubsetOfSameModifier(Mention a, Mention b) { final Set<String> aModifiers = getModifiers(a); final Set<String> bModifiers = getModifiers(b); // NOTE: This is ordered, a is earlier than b and it is unusal to introduce more information // to an entity later in the document return !aModifiers.isEmpty() && !bModifiers.isEmpty() && aModifiers.containsAll(bModifiers); } // TODO: This should at a cluster level private boolean hasSubsetOfNonStopWords(Mention a, Mention b) { final List<String> aNonStop = getNonStopWords(a); final List<String> bNonStop = getNonStopWords(b); // TODO: This should not include the head word? See the paper for clarification. // NOTE: This is ordered, a is earlier than b and it is unusual to introduce more information // to an entity later in the document // NOTE: We enforce that the set isn't empty otherwise we aren't really testing anything return !aNonStop.isEmpty() && !bNonStop.isEmpty() && aNonStop.containsAll(bNonStop); } private List<String> getNonStopWords(Mention a) { return WHITESPACE_SPLITTER.splitToList(clean(a.getText().toLowerCase())); } private String clean(String text) { return text.replaceAll(stopwordsPattern.pattern(), ""); } private boolean shouldAddToCluster(Mention a, Mention b){ String aHead = a.getHead().toLowerCase(); String bHead = b.getHead().toLowerCase(); // Entity head match - does one head contain the others if (!aHead.contains(bHead) && !bHead.contains(aHead)) { return false; } // Word inclusion - stop words of the mention are in the cluster if (wordInclusion && !hasSubsetOfNonStopWords(a, b)) { return false; } // Compatible modifiers only - do the two candidate mentions have the same adject / // nouns if (compatibleModifiers && !haveSubsetOfSameModifier(a, b)) { return false; } // Not i-within-i // NOTE: We just check for overlap here, not if a sub-NP, which is a cheap test and // can come first (but not in the cluster based case since, then we need to find the // mentions to test first. if (a.overlaps(b)) { return false; } return true; } }