StrictHeadMatchSieve.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;

import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.uima.jcas.JCas;

import com.google.common.base.Splitter;

import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;
import uk.gov.dstl.baleen.resources.utils.StopwordUtils;

/**
 * Head matching sieve that has controllable parameters.
 * <p>
 * Two mentions are linked when one head word contains the other, the mentions do not
 * overlap, and any optional constraints enabled at construction time (word inclusion,
 * compatible modifiers) are satisfied.
 * <p>
 * The optional constraints are ordered: the first mention of a pair is treated as the
 * earlier one and must carry at least the information of the second. This assumes the
 * mention list supplied by the superclass is in document order.
 */
public class StrictHeadMatchSieve extends AbstractCoreferenceSieve {

	private static final Splitter WHITESPACE_SPLITTER = Splitter.on(" ").omitEmptyStrings().trimResults();

	private final boolean compatibleModifiers;
	private final boolean wordInclusion;
	private final Pattern stopwordsPattern;

	/**
	 * Constructor for StrictHeadMatchSieve
	 *
	 * @param jCas
	 *            the CAS being processed (passed to the superclass)
	 * @param clusters
	 *            the clusters to work with (passed to the superclass)
	 * @param mentions
	 *            the mentions to work with (passed to the superclass)
	 * @param compatibleModifiers
	 *            if true, the later mention's modifiers must all appear on the earlier mention
	 * @param wordInclusion
	 *            if true, the later mention's non-stop words must all appear in the earlier
	 *            mention
	 * @param stopwords
	 *            the words to strip from mention text before the word inclusion test
	 */
	public StrictHeadMatchSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions,
			boolean compatibleModifiers, boolean wordInclusion, Collection<String> stopwords) {
		super(jCas, clusters, mentions);

		this.compatibleModifiers = compatibleModifiers;
		this.wordInclusion = wordInclusion;
		this.stopwordsPattern = StopwordUtils.buildStopwordPattern(stopwords, false);
	}

	/**
	 * Compares every pair of ENTITY / NP mentions that have a head word and links the pairs
	 * that pass {@link #shouldAddToCluster(Mention, Mention)}.
	 */
	@Override
	public void sieve() {
		// TODO: We really need to work over clusters for this to make sense!

		final List<Mention> mentions = getMentionsWithHead(MentionType.ENTITY, MentionType.NP);

		for (int i = 0; i < mentions.size(); i++) {
			final Mention a = mentions.get(i);

			// Only look at later entries so each unordered pair is tested exactly once
			for (int j = i + 1; j < mentions.size(); j++) {
				final Mention b = mentions.get(j);

				if (shouldAddToCluster(a, b)) {
					addToCluster(a, b);
				}
			}
		}
	}

	/**
	 * Tests whether the modifiers of b are a subset of the modifiers of a.
	 *
	 * @param a
	 *            the earlier mention
	 * @param b
	 *            the later mention
	 * @return true if both mentions have modifiers and a has every modifier that b has
	 */
	private boolean haveSubsetOfSameModifier(Mention a, Mention b) {
		final Set<String> aModifiers = getModifiers(a);
		final Set<String> bModifiers = getModifiers(b);

		// NOTE: This is ordered, a is earlier than b and it is unusual to introduce more
		// information to an entity later in the document
		return !aModifiers.isEmpty() && !bModifiers.isEmpty() && aModifiers.containsAll(bModifiers);
	}

	/**
	 * Tests whether the non-stop words of b are a subset of the non-stop words of a.
	 *
	 * @param a
	 *            the earlier mention
	 * @param b
	 *            the later mention
	 * @return true if both mentions have non-stop words and a contains every one that b has
	 */
	// TODO: This should at a cluster level
	private boolean hasSubsetOfNonStopWords(Mention a, Mention b) {
		final List<String> aNonStop = getNonStopWords(a);
		final List<String> bNonStop = getNonStopWords(b);

		// TODO: This should not include the head word? See the paper for clarification.

		// NOTE: This is ordered, a is earlier than b and it is unusual to introduce more
		// information to an entity later in the document

		// NOTE: We enforce that the set isn't empty otherwise we aren't really testing anything
		return !aNonStop.isEmpty() && !bNonStop.isEmpty() && aNonStop.containsAll(bNonStop);
	}

	/**
	 * Lower cases the mention text, removes stop words and splits what is left on spaces.
	 *
	 * @param a
	 *            the mention to process
	 * @return the remaining words, in order, without empty entries
	 */
	private List<String> getNonStopWords(Mention a) {
		// Locale.ROOT keeps the result independent of the JVM default locale
		return WHITESPACE_SPLITTER.splitToList(clean(a.getText().toLowerCase(Locale.ROOT)));
	}

	/**
	 * Removes everything matched by the stop word pattern from the text.
	 *
	 * @param text
	 *            the text to clean
	 * @return the text with all stop word matches deleted
	 */
	private String clean(String text) {
		// Use the compiled pattern directly: String.replaceAll(pattern.pattern(), ...) would
		// recompile the regex on every call and lose any flags it was compiled with.
		return stopwordsPattern.matcher(text).replaceAll("");
	}

	/**
	 * Decides whether two mentions should be placed in the same cluster.
	 *
	 * @param a
	 *            the earlier mention
	 * @param b
	 *            the later mention
	 * @return true if the heads match, the mentions do not overlap and all enabled
	 *         constraints hold
	 */
	private boolean shouldAddToCluster(Mention a, Mention b) {
		final String aHead = a.getHead().toLowerCase(Locale.ROOT);
		final String bHead = b.getHead().toLowerCase(Locale.ROOT);

		// Entity head match - does one head contain the other
		if (!aHead.contains(bHead) && !bHead.contains(aHead)) {
			return false;
		}

		// Not i-within-i
		// NOTE: We just check for overlap here, not if a sub-NP. It is a cheap test so it is
		// done before the more expensive word / modifier comparisons (in a cluster based
		// version it could not come first, since the mentions to test must be found first).
		if (a.overlaps(b)) {
			return false;
		}

		// Word inclusion - non-stop words of the later mention are in the earlier mention
		if (wordInclusion && !hasSubsetOfNonStopWords(a, b)) {
			return false;
		}

		// Compatible modifiers only - do the two candidate mentions have the same adjectives /
		// nouns
		if (compatibleModifiers && !haveSubsetOfSameModifier(a, b)) {
			return false;
		}

		return true;
	}
}