//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.relations;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.annotators.relations.helpers.AbstractInteractionBasedSentenceRelationshipAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.Interaction;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Relation;
import uk.gov.dstl.baleen.uima.grammar.ParseTree;
import uk.gov.dstl.baleen.uima.grammar.ParseTreeNode;

/**
 * Unsupervised (originally Biomedical) Relationship Extractor.
 * <p>
 * A relationship extraction algorithm based on constituent parsing.
 * <p>
 * Given an interaction word we look for the covering VP, then look in the parse tree above that
 * VP for an NP+VP structure (i.e. a noun phrase followed by our VP). If one is found, we consider
 * the entities covered by the NP to be related to the entities covered by the VP via the
 * interaction word.
 * <p>
 * For example, in "John went to London" the interaction word is "went", the VP is "went to
 * London" and the NP is "John". The extracted relationship is "John", "went", "London".
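 * <p>
 * As a sketch (the exact tree depends on the constituency parser used, so this is illustrative
 * only), the parse of that sentence might look like:
 *
 * <pre>
 * (S (NP John)
 *    (VP went
 *        (PP to (NP London))))
 * </pre>
 *
 * The interaction word "went" is covered by the VP "went to London"; the NP immediately before
 * that VP covers "John", so the entity in the NP ("John") is related to the entity in the VP
 * ("London") via "went".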
 * <p>
 * Formally this is defined as two conditions from the paper:
 * <ul>
 * <li>RP1: Entity1 and Entity2 have an NP+VP phrase structure; one is covered by the NP and the
 * other by the VP.</li>
 * <li>RP2: The interaction word is in the VP (of the NP+VP).</li>
 * </ul>
 *
 * Our implementation approach is:
 * <ol>
 * <li>For each interaction word, find the immediate covering VP chunk.</li>
 * <li>Look through the parents to find whether any form an NP+VP structure.</li>
 * <li>If any do, output all the entities within the NP and all the entities under the original
 * covering VP chunk (from 1).</li>
 * </ol>
 *
 * Here step 3 is stricter than the original paper, which implies that you might use the VP from
 * the NP+VP structure. But that VP and NP can be huge and largely unrelated to the interaction
 * word that initiated the search; in many cases you might find an NP+VP for the whole sentence,
 * even though the interaction word which triggered the search was in a small subclause. By
 * focusing only on the immediate VP we include entities which at least have direct relevance to
 * the VP and hence to the interaction word.
 * <p>
 * Note this requires the following annotations: Sentence, WordToken, PhraseChunk, Entity,
 * Interaction.
 *
 * @baleen.javadoc
 */
public class UbmreConstituent extends AbstractInteractionBasedSentenceRelationshipAnnotator {

  /**
   * Limit the search to the first NP+VP structure.
   *
   * If set to false then the entire tree will be traversed looking for NP+VP matches, and thus
   * the number of relationships will be greater (although those relationships might be more
   * tenuous). If true then only the closest (lowest in the tree) NP+VP will be used.
   *
   * @baleen.config true
   */
  public static final String KEY_LIMIT = "limit";

  @ConfigurationParameter(name = KEY_LIMIT, defaultValue = "true")
  private Boolean limitedSearch;

  private Map<Interaction, Collection<WordToken>> interactionCoveringTokens;

  private ParseTree parseTree;

  @Override
  protected void preExtract(JCas jCas) {
    super.preExtract(jCas);

    parseTree = ParseTree.build(jCas);

    interactionCoveringTokens = JCasUtil.indexCovering(jCas, Interaction.class, WordToken.class);
  }

  @Override
  protected void postExtract(JCas jCas) {
    super.postExtract(jCas);

    parseTree = null;
  }

  @Override
  protected Stream<Relation> extract(JCas jCas, Sentence sentence,
      Collection<Interaction> interactions, Collection<Entity> entities) {

    final Stream<Relation> phrase = extractPhrases(jCas, interactions);

    return distinct(phrase);
  }

  /**
   * Extract phrase-based relations for each interaction word.
   *
   * @param jCas the JCas
   * @param interactions the interactions
   * @return the stream of relations
   */
  private Stream<Relation> extractPhrases(JCas jCas, Collection<Interaction> interactions) {
    return interactions.stream().flatMap(interaction -> extractPhrase(jCas, interaction));
  }

  private Stream<? extends Relation> extractPhrase(JCas jCas, Interaction i) {
    // NOTE: This still links entities high up (in a great-grandparent) with those below, which
    // have little to do with those at the lower levels. See the TODO comment below to address
    // this, at a cost to recall.

    final Collection<WordToken> tokens = interactionCoveringTokens.get(i);

    final Set<ParseTreeNode> nodes = tokens.stream().map(parseTree::getParent)
        .filter(Objects::nonNull)
        .collect(Collectors.toSet());

    if (nodes.isEmpty()) {
      // Very unlikely to arrive here - it would be a word without a covering chunk!
      return Stream.empty();
    } else {
      // NOTE: We only pick the first VP, but what if there is more than one?
      // (For better quality, though fewer results, we should pick the smallest VP... which will
      // be the first as used here, i.e. the nearest parent to the word in the parse tree.)
      final ParseTreeNode node = nodes.iterator().next();

      final List<Relation> relations = new ArrayList<>();
      node.traverseParent((parent, child) -> {
        final int childIndex = parent.getChildren().indexOf(child);
        if (childIndex > 0 && isVerbPhrase(child)) {
          final ParseTreeNode sibling = parent.getChildren().get(childIndex - 1);
          if (isNounPhrase(sibling)) {
            // We are in an NP+VP with an interaction word.
            // We add the entities covered by the NP and those under the original node
            addRelations(jCas, i, sibling.getChunk(), node).forEach(relations::add);

            // If we are limiting the search we stop now
            return !limitedSearch;
          }
        }
        // If we didn't find it keep looking
        return true;
      });
      return relations.stream();
    }
  }
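  /**
   * Create pairwise relations, linked by the interaction word, between the entities covered by
   * the noun phrase and the entities found under the verb phrase node (NP+VP structures nested
   * beneath the verb phrase are deliberately not descended into).
   *
   * @param jCas the JCas
   * @param interaction the interaction word relating the entities
   * @param nounPhrase the noun phrase chunk
   * @param verbNode the parse tree node of the covering verb phrase
   * @return the stream of relations
   */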
  private Stream<Relation> addRelations(JCas jCas, Interaction interaction,
      PhraseChunk nounPhrase, ParseTreeNode verbNode) {
    final List<Entity> nounEntities = JCasUtil.selectCovered(jCas, Entity.class, nounPhrase);

    final List<Entity> verbEntities = new ArrayList<>();

    // We depart from the paper again: we don't want to look inside NP+VP structures under our
    // VP. They are self-contained, and if we just grabbed their entities 'because they are
    // under the VP' we would pick up all sorts of unrelated subclauses.

    // We can't use verbNode.traverse here as it doesn't allow us to be selective about the
    // children we want to descend into
    extractEntitiesFromVerbPhrase(jCas, verbEntities, verbNode);

    return createPairwiseRelations(jCas, interaction, nounEntities, verbEntities, 1.0f);
  }

  private void extractEntitiesFromVerbPhrase(JCas jCas, List<Entity> verbEntities,
      ParseTreeNode node) {
    List<ParseTreeNode> children = node.getChildren();
    if (children == null || children.isEmpty()) {
      // We are ok to pull out any entities from here
      verbEntities.addAll(JCasUtil.selectCovered(jCas, Entity.class, node.getChunk()));
    } else {
      for (int i = 0; i < children.size(); i++) {
        // Check if we are an NP and the next is a VP
        if (isNounPhrase(children.get(i))
            && i + 1 < children.size()
            && isVerbPhrase(children.get(i + 1))) {
          // Don't go into NP+VP structures
          break;
        }
        extractEntitiesFromVerbPhrase(jCas, verbEntities, children.get(i));
      }
    }
  }

  private boolean isNounPhrase(ParseTreeNode node) {
    return "NP".equals(node.getChunk().getChunkType());
  }

  private boolean isVerbPhrase(ParseTreeNode node) {
    return "VP".equals(node.getChunk().getChunkType());
  }

  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(Sentence.class, WordToken.class, PhraseChunk.class, Interaction.class,
            Entity.class),
        ImmutableSet.of(Relation.class));
  }
}
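
/*
 * A minimal sketch of overriding the "limit" parameter when wiring this annotator up
 * programmatically with uimaFIT. It assumes the rest of the pipeline (tokenisation, chunking,
 * entity and interaction annotation) is already in place; AnalysisEngineFactory is the standard
 * org.apache.uima.fit.factory class.
 *
 *   AnalysisEngineDescription ubmre = AnalysisEngineFactory.createEngineDescription(
 *       UbmreConstituent.class,
 *       UbmreConstituent.KEY_LIMIT, false);
 *
 * In a normal Baleen deployment the same setting would instead be supplied as the annotator's
 * "limit" option in the pipeline configuration.
 */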