//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.relations; import java.util.Collection; import java.util.Collections; import java.util.Map; import java.util.Objects; import java.util.stream.Stream; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; import uk.gov.dstl.baleen.annotators.relations.helpers.AbstractInteractionBasedRelationshipAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.language.Dependency; import uk.gov.dstl.baleen.types.language.Interaction; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.types.language.Sentence; import uk.gov.dstl.baleen.types.language.WordToken; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.types.semantic.Relation; import uk.gov.dstl.baleen.uima.grammar.DependencyGraph; import uk.gov.dstl.baleen.uima.grammar.DependencyGraph.TraversePredicate; import uk.gov.dstl.baleen.uima.utils.AnnotationUtils; /** * Unsupervised (originally Biomedical) Relationship Extractor. * <p> * A relationship extraction algorithm based on dependency parsing. * <p> * The algorithm works by taking an interaction word and then looking for entities that are * connected to it (in the way of a dependency graph, within a specified distance). * <p> * Formally this is defined as two conditions in the paper: * * <ul> * <li>RD1: dependency distance between interaction word and each entity should be <= 4. * <li>RD2: if the interaction word is a verb it should be between the two entities (within the text * of the sentence) * </ul> * * We modified this approach as it links a lot of entities, because the dependency graph fully * connects all words. * <p> * We also use a dependency distance of 1 - that effectively produces a more generalised * subject-verb-object mapping. We suspect that a total distance subject-object of 3 would also * work, but the current implementation requires symmetry. * <p> * In a paper on Unsupervised Events, the dependency is divided into trees but looks for verbs that * are connected to other verbs. In a simple case this focuses to subject-(one or more * verb-verb)-object relations, rather than subject-verb-noun-verb-object. We found the latter * connected to many unrelated parts of the sentence, whereas the former at least provided something * akin to an actionable connection. * <p> * Note this requires the following annotations: Sentence, WordToken, PhraseChunk (required for * dependency), Dependency, Entity, Interaction * * @baleen.javadoc */ public class UbmreDependency extends AbstractInteractionBasedRelationshipAnnotator { /** * The maximum distance (dependency links) between an entity and an interaction word within * which they are considered connected. * * @baleen.config 3 */ public static final String KEY_DEPENDENCY_DISTANCE = "distance"; @ConfigurationParameter(name = KEY_DEPENDENCY_DISTANCE, defaultValue = "3") private Integer maxDependencyDistance; private DependencyGraph dependencyGraph; @Override protected void preExtract(JCas jCas) { super.preExtract(jCas); dependencyGraph = DependencyGraph.build(jCas); } @Override protected void postExtract(JCas jCas) { super.postExtract(jCas); dependencyGraph = null; } @Override protected void extract(JCas jCas) { final Map<WordToken, Collection<Interaction>> tokenToInteraction = JCasUtil.indexCovered(jCas, WordToken.class, Interaction.class); final Map<Entity, Collection<Dependency>> entityToDependency = JCasUtil.indexCovered(jCas, Entity.class, Dependency.class); final Map<Interaction, Collection<WordToken>> interactionToDependencies = JCasUtil.indexCovered(jCas, Interaction.class, WordToken.class); final Collection<Entity> entities = JCasUtil.select(jCas, Entity.class); // This is the complex part. We are looking to find all entities close to interaction words // in 'dependency space'. // We allow a entity to traverse the graph until they get to a a verb, then in effect they // can go verb to verb. If they want to go verb to noun that's ok (our interaction word // could be a noun) but they can't then go back to a verb (since that would joint two // disconnected verb trees). final Multimap<Interaction, Entity> interactionToEntities = HashMultimap.create(); for (final Entity entity : entities) { dependencyGraph.traverse(maxDependencyDistance, entityToDependency.getOrDefault(entity, Collections.emptyList()), traverseToVerb(tokenToInteraction, interactionToEntities, entity)); } // Now we can create all the relations final Stream<Relation> relations = interactionToEntities.asMap().entrySet().stream().flatMap(e -> { final Interaction i = e.getKey(); final boolean interactionIsVerb = interactionToDependencies.getOrDefault(i, Collections.emptyList()) .stream() .anyMatch(p -> p.getPartOfSpeech().startsWith("V")); final Collection<Entity> c = e.getValue(); return createMeshedRelations(jCas, i, c, 1.0f) .filter(Objects::nonNull) .filter(r -> // Filter applies RD2: If a verb then we interaction should be between the two // entities !interactionIsVerb || AnnotationUtils.isInBetween(r, r.getSource(), r.getTarget())); }); addRelationsToIndex(relations); } /** * Traverse within the verb structure. * * @param tokenToInteraction * the token to interaction * @param interactionToEntities * the interaction to entities * @param entity * the entity * @return the traverse predicate */ private TraversePredicate traverseToVerb(final Map<WordToken, Collection<Interaction>> tokenToInteraction, final Multimap<Interaction, Entity> interactionToEntities, final Entity entity) { return (d, f, t, h) -> { if (f == null) // When starting allow it to carry on return true; if ("punct".equalsIgnoreCase(d.getDependencyType())) // Don't traverse punctuation return false; final boolean previousVerb = h.stream().map(WordToken::getPartOfSpeech).anyMatch(s -> s.startsWith("V")); // Can we traverse to this node if (carryOn(previousVerb, t)) { final Collection<Interaction> interactions = tokenToInteraction.get(t); if (interactions != null && !interactions.isEmpty()) // We've reached an interaction word, store our connection to it interactions.forEach(i -> interactionToEntities.put(i, entity)); return true; }else{ return false; } }; } private boolean carryOn(boolean previousIsVerb, WordToken token){ // If you've not hit a verb you can go anywhere n-n-n or // If you have hit a verb you can only move to another non-verb // v-n-v is not allowed as its another verb subtree return !previousIsVerb || !token.getPartOfSpeech().startsWith("V"); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(Sentence.class, WordToken.class, PhraseChunk.class, Interaction.class, Entity.class, Dependency.class), ImmutableSet.of(Relation.class)); } }