// ProperHeadMatchSieve.java example

// Explorer
// baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.uima.jcas.JCas;

import com.google.common.primitives.Doubles;

import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;

/**
 * Sieve based on exact matching of the head word.
 */
public class ProperHeadMatchSieve extends AbstractCoreferenceSieve {
	private static final Pattern NUMBER = Pattern.compile("-?\\d+(,\\d+)*(\\.\\d+)?[k|m|b]?", Pattern.CASE_INSENSITIVE);

	private final Set<String> spatialModifiers = new HashSet<String>(
			Arrays.asList("northern", "southern", "western", "eastern", "south", "east", "north", "west",
					"central", "upper", "lower", "middle", "inner", "outer"));

	/**
	 * Constructor for ProperHeadMatchSieve
	 */
	public ProperHeadMatchSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions) {
		super(jCas, clusters, mentions);
	}

	@Override
	public void sieve() {
		// Note: Head must be proper nouns, but ours are by construction
		List<Mention> mentions = getMentionsWithHead(MentionType.ENTITY, MentionType.NP);
		
		for (int i = 0; i < mentions.size(); i++) {
			final Mention a = mentions.get(i);

			String aHead = a.getHead().toLowerCase();

			for (int j = i + 1; j < mentions.size(); j++) {
				final Mention b = mentions.get(j);

				String bHead = b.getHead().toLowerCase();

				if (aHead.equals(bHead) && shouldAddMentionsToCluster(a, b)){
					addToCluster(a, b);
				}
			}
		}
	}

	private boolean hasSameModifiers(Mention a, Mention b) {
		// TODO: The paper says location named entities, other proper nouns or other spatial
		// modifiers but since locations should be other proper nouns we ignore that clause. We
		// could look for Locations covered by the annotation.

		final Set<String> aModifiers = getSpatialAndPNModifier(a);
		final Set<String> bModifiers = getSpatialAndPNModifier(b);

		return aModifiers.size() == bModifiers.size() && aModifiers.containsAll(bModifiers);
	}

	private Set<String> getSpatialAndPNModifier(Mention a) {
		return a.getWords().stream()
				.filter(w -> w.getPartOfSpeech().startsWith("NP") || spatialModifiers.contains(w.getCoveredText()))
				.map(w -> w.getCoveredText().toLowerCase())
				.collect(Collectors.toSet());
	}

	// Asymetric
	private List<Double> extractNumbers(String text) {
		final List<Double> list = new LinkedList<>();
		final Matcher matcher = NUMBER.matcher(text);
		while (matcher.find()) {
			final Double d = Doubles.tryParse(matcher.group().replaceAll(",", ""));
			if (d != null) {
				list.add(d);
			}
		}
		return list;
	}

	// Asymetric
	private boolean hasSameNumbers(Collection<Double> aNumbers, Collection<Double> bNumbers) {

		for (final double b : bNumbers) {
			boolean found = false;
			for (final double a : aNumbers) {
				// 'Fuzzy match' the numbers
				if (Math.abs(a - b) < 0.01 * Math.max(Math.abs(a), Math.abs(a))) {
					found = true;
					break;
				}
			}

			if (!found) {
				return false;
			}
		}

		return true;
	}

	private boolean shouldAddMentionsToCluster(Mention a, Mention b){
		// Not i-within-i
		if (a.overlaps(b)) {
			return false;
		}

		// No modifier
		if (!hasSameModifiers(a, b)) {
			return false;
		}

		// No numerical mismatches
		final List<Double> aNumbers = extractNumbers(a.getText());
		final List<Double> bNumbers = extractNumbers(b.getText());

		if (!hasSameNumbers(aNumbers, bNumbers)) {
			return false;
		}

		return true;
	}
}