// PreciseConstructsSieve.java example
//
// Explorer
// baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;

import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;

import org.apache.uima.jcas.JCas;

import com.google.common.collect.Sets;

import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.uima.grammar.ParseTree;
import uk.gov.dstl.baleen.uima.grammar.ParseTreeNode;

/**
 * Sieve based on very specific (precise) rules.
 * <p>
 * Includes acronyms and certain constructs such as appositives ("Prime Minister, Tony Blair"),
 * predicate nominatives and relative pronouns.
 * <p>
 * Our parser, OpenNlp, does not output (,) so the appositive rule inspects the document text
 * between two chunks manually.
 *
 */
public class PreciseConstructsSieve extends AbstractCoreferenceSieve {

	/** Chunk type of a noun phrase. */
	private static final String NOUN_PHRASE = "NP";

	/** Chunk type of a verb phrase. */
	private static final String VERB_PHRASE = "VP";

	/** Chunk type of a wh-noun phrase (relative pronoun). */
	private static final String WH_NOUN_PHRASE = "WHNP";

	/** Matches word tokens tagged as coordinating conjunctions. */
	private static final Predicate<WordToken> CONJUNCTION_FILTER = w -> "CC".equals(w.getPartOfSpeech());

	/** Matches the copular verbs "is" and "was" (case insensitive) used by the predicate nominative rule. */
	private static final Predicate<WordToken> COPULA_FILTER = w -> {
		final String text = w.getCoveredText();
		return "is".equalsIgnoreCase(text) || "was".equalsIgnoreCase(text);
	};

	/** Matches text which is a single comma, optionally surrounded by whitespace. */
	private static final Pattern COMMA = Pattern.compile("\\s*,\\s*");

	private final ParseTree parseTree;

	/**
	 * Constructor for PreciseConstructsSieve
	 *
	 * @param jCas
	 *            the JCas being processed (passed to the superclass)
	 * @param parseTree
	 *            the parse tree whose sibling chunks are inspected by {@link #sieve()}
	 * @param clusters
	 *            the clusters (passed to the superclass)
	 * @param mentions
	 *            the mentions (passed to the superclass)
	 */
	public PreciseConstructsSieve(JCas jCas, ParseTree parseTree, List<Cluster> clusters, List<Mention> mentions) {
		super(jCas, clusters, mentions);
		this.parseTree = parseTree;
	}

	/**
	 * Walk every list of sibling nodes in the parse tree and apply the appositive, predicate
	 * nominative and relative pronoun rules to each adjacent pair, then apply the acronym rule
	 * to all mentions.
	 */
	@Override
	public void sieve() {

		parseTree.traverseChildren(children -> {
			for (int i = 0; i < children.size() - 1; i++) {
				final ParseTreeNode a = children.get(i);

				// Every rule below requires the first of the pair to be a noun phrase
				if (!NOUN_PHRASE.equals(a.getChunk().getChunkType())) {
					continue;
				}

				final ParseTreeNode b = children.get(i + 1);
				final String nextType = b.getChunk().getChunkType();

				if (NOUN_PHRASE.equals(nextType)) {
					// Appositive - look for two NP chunks
					doAppositive(a, b);
				} else if (VERB_PHRASE.equals(nextType)) {
					// Predicate nominative - (NP VP(is / was)) then take the NP under the VP
					doPredicateNominative(a, b);
				} else if (WH_NOUN_PHRASE.equals(nextType)) {
					// Relative pronoun
					doRelativePronoun(a, b);
				}
			}
		});

		// TODO: Role appositive - slightly unclear how this is used. I guess it's the "The actress
		// Rachel is in the show. The actress plays a single role" case, which is an important
		// anaphoric relation. However in that example "actress" is not found as a NP / Entity in
		// baleen. Perhaps we should create a role annotation and then use that? (effectively look
		// for ROLE PERSON to fulfil this rule)

		// Acronym
		// The implementation here depends on the acronym generator
		doAcronym();

		// Demonyms: Nationality - Country
		// We are fortunate that we have Nationality and Location entities, and we already have the
		// existing NationalityToLocation annotator, so this is not required.

	}

	/**
	 * Appositive rule: two adjacent noun phrases separated only by a comma.
	 *
	 * @param a
	 *            the first noun phrase node
	 * @param b
	 *            the noun phrase node immediately following a
	 */
	private void doAppositive(ParseTreeNode a, ParseTreeNode b) {
		final int gapBegin = a.getChunk().getEnd();
		final int gapEnd = b.getChunk().getBegin();
		if (gapBegin > gapEnd) {
			// Overlapping (or out of order) chunks have no text between them to inspect, and
			// substring would throw
			return;
		}

		// Is there a comma between them, without AND/BUT/ETC
		final String between = getJCas().getDocumentText().substring(gapBegin, gapEnd);
		if (!COMMA.matcher(between).matches()) {
			return;
		}

		// Not in paper: Need to see if there's an AND in the larger noun phrase, eg
		// Police, Fire and Ambulance (will get police-fire at the moment)
		if (a.getParent().containsWord(CONJUNCTION_FILTER)) {
			return;
		}

		// Special case: "London, UK" would match but we don't want it to. We only reject the
		// pair when both a and b cover a location. Of course this depends on the quality of the
		// entity extraction. Checked last as it requires looking up mentions.
		if (coversLocation(a) && coversLocation(b)) {
			return;
		}

		addCoveredToCluster(a.getChunk(), b.getChunk());
	}

	/**
	 * Predicate nominative rule: a noun phrase followed by a verb phrase containing "is" or
	 * "was" is linked with the first noun phrase directly under that verb phrase.
	 *
	 * @param a
	 *            the subject noun phrase node
	 * @param b
	 *            the verb phrase node immediately following a
	 */
	private void doPredicateNominative(ParseTreeNode a, ParseTreeNode b) {
		if (b.getWords().stream().noneMatch(COPULA_FILTER)) {
			return;
		}

		b.getChildren().stream()
				.filter(n -> NOUN_PHRASE.equals(n.getChunk().getChunkType()))
				.findFirst()
				.ifPresent(np -> addCoveredToCluster(a.getChunk(), np.getChunk()));
	}

	/**
	 * Relative pronoun rule: link the mentions exactly spanning the noun phrase with the mentions
	 * exactly spanning the following WHNP.
	 *
	 * @param a
	 *            the noun phrase node
	 * @param b
	 *            the WHNP node immediately following a
	 */
	private void doRelativePronoun(ParseTreeNode a, ParseTreeNode b) {
		// The NP could be something that interests us, or it could be a subpart of a larger
		// NP, so only mentions matching the chunk offsets exactly are used.
		final List<Mention> mention = findMentionsExactly(a.getChunk().getBegin(), a.getChunk().getEnd());
		final List<Mention> pronoun = findMentionsExactly(b.getChunk().getBegin(), b.getChunk().getEnd());
		addPairwiseToCluster(mention, pronoun);
	}

	/**
	 * Acronym rule: link each pair of mentions where exactly one of the pair is an acronym and
	 * their acronym sets share at least one value.
	 */
	private void doAcronym() {
		final List<Mention> mentions = getMentions();

		for (int i = 0; i < mentions.size(); i++) {
			final Mention a = mentions.get(i);
			final Set<String> aAcronyms = a.getAcronyms();
			if (aAcronyms == null) {
				// Nothing can match this mention, so skip the inner loop entirely
				continue;
			}

			for (int j = i + 1; j < mentions.size(); j++) {
				final Mention b = mentions.get(j);
				final Set<String> bAcronyms = b.getAcronyms();

				if (bAcronyms != null && b.isAcronym() != a.isAcronym()
						&& !Sets.intersection(aAcronyms, bAcronyms).isEmpty()) {
					addToCluster(a, b);
				}
			}
		}
	}

	/**
	 * Check whether any mention found under the node's chunk is annotated as a Location.
	 *
	 * @param a
	 *            the node to check
	 * @return true if at least one mention under the chunk's span has a Location annotation
	 */
	private boolean coversLocation(ParseTreeNode a) {
		return findMentionsUnder(a.getChunk().getBegin(), a.getChunk().getEnd())
				.stream()
				.anyMatch(m -> m.getAnnotation() instanceof Location);
	}

}