//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.relations;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.annotators.relations.helpers.AbstractInteractionBasedSentenceRelationshipAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.Interaction;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Relation;
import uk.gov.dstl.baleen.uima.grammar.ParseTree;
import uk.gov.dstl.baleen.uima.grammar.ParseTreeNode;
/**
* Unsupervised (originally Biomedical) Relationship Extractor.
* <p>
* A relationship extraction algorithm based on constituent parsing.
* <p>
* Given an interaction word we look for the covering VP. Then look in the parse tree above the VP
* for a NP+VP (ie Noun Phrase followed by our VP). If that is the case then we consider the
* entities covered by the NP to be related to the entities covered by the VP by the interaction
* word.
* <p>
* For example, in "John went to London." the interaction word is "went", the VP is "went to
* London" and the NP is "John". The relationship is "John", "went", "London".
* <p>
* Formally this is defined as two conditions from the paper:
* <ul>
* <li>RP1: Entity1 and Entity2 have a NP+VP phrase structure. One is covered by an NP and the
* other by a VP.</li>
* <li>RP2: Interaction word A is in the VP (of the NP+VP).</li>
* </ul>
*
* Our implementation approach is:
* <ol>
* <li>For each interaction word, find the immediate covering VP chunk.</li>
* <li>Look through the parents to find if any are a NP+VP relation.</li>
* <li>If any are, we output all the entities within the NP and all the entities under the
* original covering VP chunk (from step 1).</li>
* </ol>
*
* Here 3 seems to be stricter than the original paper. In the original they imply that you might
* use the VP from the NP+VP relation. But the VP and NP can be huge and frankly largely unrelated
* to the interaction word that initiated the search. In fact in many cases you might find a NP+VP
* for the whole sentence, even though the interaction word which triggered the NP+VP search was in
* a small subclause. By focusing only on the immediate VP we include entities which at least have
* direct relevance with the VP and hence the interaction words.
* <p>
* Note this requires the following annotations: Sentence, WordToken, PhraseChunk, Entity,
* Interaction.
*
* @baleen.javadoc
*
*/
public class UbmreConstituent extends AbstractInteractionBasedSentenceRelationshipAnnotator {

  /**
   * Limit the search to the first NP+VP structure.
   *
   * <p>If set to true (the default) the upward search stops at the first NP+VP structure found
   * when walking up from the interaction word. If set to false the walk continues through all
   * ancestors looking for further NP+VP matches, and thus the number of relationships will be
   * greater (although those relationships might be more tenuous).
   *
   * @baleen.config true
   */
  public static final String KEY_LIMIT = "limit";

  @ConfigurationParameter(name = KEY_LIMIT, defaultValue = "true")
  private Boolean limitedSearch;

  /** Per-document index from each interaction to the word tokens covering it. */
  private Map<Interaction, Collection<WordToken>> interactionCoveringTokens;

  /** Per-document parse tree, built in {@link #preExtract(JCas)}. */
  private ParseTree parseTree;

  /**
   * Builds the per-document state (parse tree and interaction-to-token index) used during
   * extraction.
   *
   * @param jCas the document being processed
   */
  @Override
  protected void preExtract(JCas jCas) {
    super.preExtract(jCas);

    parseTree = ParseTree.build(jCas);
    interactionCoveringTokens =
        JCasUtil.indexCovering(jCas, Interaction.class, WordToken.class);
  }

  /**
   * Releases all per-document state so that nothing from the processed document is retained by
   * this annotator between documents.
   *
   * @param jCas the document that was processed
   */
  @Override
  protected void postExtract(JCas jCas) {
    super.postExtract(jCas);

    parseTree = null;
    // The index holds references to the document's annotations, so drop it as well.
    interactionCoveringTokens = null;
  }

  /**
   * Extracts the de-duplicated relations for the supplied interactions.
   *
   * <p>Only {@code interactions} is used here; the sentence and entity collection are not
   * consulted because entities are looked up through the parse tree chunks instead.
   */
  @Override
  protected Stream<Relation> extract(JCas jCas, Sentence sentence,
      Collection<Interaction> interactions, Collection<Entity> entities) {
    return distinct(extractPhrases(jCas, interactions));
  }

  /**
   * Extract relations for every interaction.
   *
   * @param jCas the document being processed
   * @param interactions the interactions to search from
   * @return the stream of relations found for all interactions (may contain duplicates)
   */
  private Stream<Relation> extractPhrases(JCas jCas, Collection<Interaction> interactions) {
    return interactions.stream().flatMap(interaction -> extractPhrase(jCas, interaction));
  }

  /**
   * Extract relations for a single interaction by walking up the parse tree from the chunk
   * containing the interaction word, looking for NP+VP sibling structures.
   *
   * @param jCas the document being processed
   * @param i the interaction to search from
   * @return the relations found (empty if the interaction has no covering chunk)
   */
  private Stream<? extends Relation> extractPhrase(JCas jCas, Interaction i) {
    // NOTE: This still links entities high up (in a great-grandparent) with those below, even
    // though they may have little to do with those at the lower levels. Restricting this would
    // improve precision at the cost of recall.

    final Collection<WordToken> tokens = interactionCoveringTokens.get(i);
    if (tokens == null) {
      // Defensive: no covering tokens recorded for this interaction.
      return Stream.empty();
    }

    // We only use the first covering chunk (in token order). Taking it straight from the stream
    // keeps the choice deterministic; collecting into a hash-based set first would make "first"
    // depend on an unspecified iteration order.
    final ParseTreeNode node = tokens.stream()
        .map(parseTree::getParent)
        .filter(Objects::nonNull)
        .findFirst()
        .orElse(null);

    if (node == null) {
      // Very unlikely to arrive here - it would be a word without a covering chunk!
      return Stream.empty();
    }

    final List<Relation> relations = new ArrayList<>();

    node.traverseParent((parent, child) -> {
      final List<ParseTreeNode> siblings = parent.getChildren();
      final int childIndex = siblings.indexOf(child);

      if (childIndex > 0 && isVerbPhrase(child)) {
        final ParseTreeNode previous = siblings.get(childIndex - 1);

        if (isNounPhrase(previous)) {
          // We are in a NP+VP, with an interaction word.
          // We add the entities covered by the NP and those under the original node.
          addRelations(jCas, i, previous.getChunk(), node).forEach(relations::add);

          // If limited search we stop now (returning false ends the traversal)
          return !limitedSearch;
        }
      }

      // If we didn't find it keep looking
      return true;
    });

    return relations.stream();
  }

  /**
   * Create relations between the entities covered by the noun phrase and the entities found
   * under the verb node, using the interaction as the relationship.
   *
   * @param jCas the document being processed
   * @param interaction the interaction that links the entities
   * @param nounPhrase the NP chunk of the NP+VP structure
   * @param verbNode the parse tree node from which the search started
   * @return the pairwise relations between the noun-side and verb-side entities
   */
  private Stream<Relation> addRelations(JCas jCas, Interaction interaction,
      PhraseChunk nounPhrase, ParseTreeNode verbNode) {

    final List<Entity> nounEntities = JCasUtil.selectCovered(jCas, Entity.class, nounPhrase);
    final List<Entity> verbEntities = new ArrayList<>();

    // We depart from the paper again: we don't want to look in NP+VP structures under our VP.
    // They are self contained, and if we just grab their entities 'because they are under the
    // VP' we get all sorts of unrelated subclauses.

    // We can't use verbNode.traverse here as it doesn't allow us to be selective about the
    // children we want to descend into.
    extractEntitiesFromVerbPhrase(jCas, verbEntities, verbNode);

    return createPairwiseRelations(jCas, interaction, nounEntities, verbEntities, 1.0f);
  }

  /**
   * Recursively collect the entities covered by the leaf chunks under {@code node}, stopping at
   * the first nested NP+VP structure encountered among a node's children.
   *
   * @param jCas the document being processed
   * @param verbEntities the list to which found entities are appended
   * @param node the node to search under
   */
  private void extractEntitiesFromVerbPhrase(JCas jCas, List<Entity> verbEntities,
      ParseTreeNode node) {
    final List<ParseTreeNode> children = node.getChildren();

    if (children == null || children.isEmpty()) {
      // Leaf chunk: we are ok to pull out any entities from here
      verbEntities.addAll(JCasUtil.selectCovered(jCas, Entity.class, node.getChunk()));
      return;
    }

    for (int i = 0; i < children.size(); i++) {
      // Check if we are an NP and the next is a VP
      final boolean startsNounVerbPair = isNounPhrase(children.get(i))
          && i + 1 < children.size()
          && isVerbPhrase(children.get(i + 1));

      if (startsNounVerbPair) {
        // Don't go into NP+VP structures.
        // NOTE(review): this also skips every sibling after the NP+VP pair, not just the pair
        // itself - confirm that is intended.
        break;
      }

      extractEntitiesFromVerbPhrase(jCas, verbEntities, children.get(i));
    }
  }

  /** Returns true if the node's chunk has chunk type "NP". */
  private boolean isNounPhrase(ParseTreeNode node) {
    return "NP".equals(node.getChunk().getChunkType());
  }

  /** Returns true if the node's chunk has chunk type "VP". */
  private boolean isVerbPhrase(ParseTreeNode node) {
    return "VP".equals(node.getChunk().getChunkType());
  }

  /**
   * Declares the annotation types this annotator requires (Sentence, WordToken, PhraseChunk,
   * Interaction, Entity) and the type it produces (Relation).
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(
            Sentence.class,
            WordToken.class,
            PhraseChunk.class,
            Interaction.class,
            Entity.class),
        ImmutableSet.of(Relation.class));
  }
}