// AbstractCoreferenceSieve.java example
// Source: baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;

import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.uima.jcas.JCas;

import com.google.common.base.Strings;

import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;
import uk.gov.dstl.baleen.types.language.PhraseChunk;

/**
 * Base class for coreference sieves.
 * <p>
 * Provides helper functions to manage clusters, etc.
 */
public abstract class AbstractCoreferenceSieve implements CoreferenceSieve {

	private final JCas jCas;
	private final List<Cluster> clusters;
	private final List<Mention> mentions;

	protected AbstractCoreferenceSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions) {
		this.jCas = jCas;
		this.clusters = clusters;
		this.mentions = mentions;
	}

	protected List<Cluster> getClusters() {
		return clusters;
	}

	protected List<Mention> getMentions() {
		return mentions;
	}
	
	protected List<Mention> getMentions(MentionType mentionType) {
		return mentions.stream().filter(m -> m.getType().equals(mentionType)).collect(Collectors.toList());
	}
	
	protected List<Mention> getMentions(MentionType... mentionTypes) {
		List<MentionType> mentionTypesList = Arrays.asList(mentionTypes);
		return mentions.stream().filter(m -> mentionTypesList.contains(m.getType())).collect(Collectors.toList());
	}
	
	protected List<Mention> getMentionsWithHead(MentionType mentionType) {
		return mentions.stream().filter(m -> m.getType().equals(mentionType) && hasHead(m)).collect(Collectors.toList());
	}
	
	protected List<Mention> getMentionsWithHead(MentionType... mentionTypes) {
		List<MentionType> mentionTypesList = Arrays.asList(mentionTypes);

		return mentions.stream().filter(m -> mentionTypesList.contains(m.getType()) && hasHead(m)).collect(Collectors.toList());
	}

	protected JCas getJCas() {
		return jCas;
	}

	protected void addToCluster(Mention a, Mention b) {
		if (a.hasClusters()) {
			final Cluster cluster = a.getAnyCluster();
			cluster.add(b);
		} else if (b.hasClusters()) {
			final Cluster cluster = b.getAnyCluster();
			cluster.add(a);
		} else {
			clusters.add(new Cluster(a, b));
		}
	}

	protected void addPairwiseToCluster(Collection<Mention> a, Collection<Mention> b) {

		// Technically these will all need to end up in the same cluster

		final Cluster cluster = a.stream().map(x -> x.getAnyCluster()).filter(Objects::nonNull).findAny()
				.orElseGet(() -> b.stream().map(x -> x.getAnyCluster()).filter(Objects::nonNull).findAny()
						.orElseGet(() -> {
							final Cluster c = new Cluster();
							clusters.add(c);
							return c;
						}));

		cluster.addAll(a);
		cluster.addAll(b);
	}

	protected void addCoveredToCluster(PhraseChunk a, PhraseChunk b) {
		final List<Mention> aMentions = findMentionsExactly(a.getBegin(), a.getEnd());
		final List<Mention> bMentions = findMentionsExactly(b.getBegin(), b.getEnd());

		addPairwiseToCluster(aMentions, bMentions);
	}

	protected List<Mention> findMentionsExactly(int begin, int end) {
		return getMentions().stream()
				.filter(m -> begin == m.getAnnotation().getBegin() && m.getAnnotation().getEnd() == end)
				.collect(Collectors.toList());
	}

	protected List<Mention> findMentionsUnder(int begin, int end) {
		return getMentions().stream()
				.filter(m -> begin >= m.getAnnotation().getBegin() && m.getAnnotation().getEnd() <= end)
				.collect(Collectors.toList());
	}

	protected List<Mention> findMentionAbove(int begin, int end) {
		return getMentions().stream()
				.filter(m -> m.getAnnotation().getBegin() <= begin && end <= m.getAnnotation().getEnd())
				.collect(Collectors.toList());
	}

	protected Set<String> getModifiers(Mention a) {
		// In the paper they say N and J (adjective) but we need cardinal too otherwise "200 people"
		// discards the 200
		// TODO: Modifiers up to head word? See paper
		return a.getWords().stream()
				.filter(w -> w.getPartOfSpeech().startsWith("N") || w.getPartOfSpeech().startsWith("J")
						|| w.getPartOfSpeech().startsWith("CD"))
				.map(w -> w.getCoveredText().toLowerCase())
				.collect(Collectors.toSet());
	}
	
	protected static boolean hasHead(Mention m){
		return !Strings.isNullOrEmpty(m.getHead());
	}
}