//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.relations;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Relation;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Identifies relationships that have the NP-V(W*P)?-NP pattern,
* where NP is a Noun Phrase, V is a Verb, W is any word, and P is a preposition.
*
* Where one or more entities exists within the NP, these entities will be used to form the relationships.
* Where no entities exist within the NP, and assuming that onlyExisting is false,
* a new Entity will be created covering the whole NP.
*
* @baleen.javadoc
*/
public class NPVNP extends BaleenAnnotator {
/**
* Should we only find relations between existing entities (true),
* or should we add new entities where we find relations between
* entities that haven't already been extracted (false)
*
* @baleen.config false
*/
public static final String PARAM_ONLY_EXISTING = "onlyExisting";
@ConfigurationParameter(name = PARAM_ONLY_EXISTING, defaultValue="false")
private Boolean onlyExisting = false;
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
List<PhraseChunk> nounPhrases = JCasUtil.select(jCas, PhraseChunk.class).stream()
.filter(c -> "NP".equals(c.getChunkType()))
.collect(Collectors.toList());
for(int i = 0; i < nounPhrases.size() - 1; i++){
PhraseChunk np = nounPhrases.get(i);
PhraseChunk next = nounPhrases.get(i + 1);
processNounPhrase(jCas, np, next);
}
}
private void processNounPhrase(JCas jCas, PhraseChunk current, PhraseChunk next){
List<Entity> currentEntities = new ArrayList<>(JCasUtil.selectCovered(jCas, Entity.class, current));
List<Entity> nextEntities = new ArrayList<>(JCasUtil.selectCovered(jCas, Entity.class, next));
if(onlyExisting && (currentEntities.isEmpty() || nextEntities.isEmpty()))
return;
List<PhraseChunk> middle = JCasUtil.selectBetween(jCas, PhraseChunk.class, current, next);
if(middle.isEmpty() || !"VP".equals(middle.get(0).getChunkType()))
return;
if(middle.size() == 1 || "PP".equals(middle.get(middle.size() - 1).getChunkType())){
if(currentEntities.isEmpty()){
Entity e1 = new Entity(jCas, current.getBegin(), current.getEnd());
currentEntities.add(e1);
}
if(nextEntities.isEmpty()){
Entity e2 = new Entity(jCas, next.getBegin(), next.getEnd());
nextEntities.add(e2);
}
createRelations(jCas, currentEntities, nextEntities, jCas.getDocumentText().substring(current.getEnd(), next.getBegin()).trim());
}
}
private void createRelations(JCas jCas, List<Entity> source, List<Entity> target, String text){
for(Entity eSource : source){
for(Entity eTarget : target){
Relation relation = new Relation(jCas);
relation.setBegin(eSource.getBegin());
relation.setEnd(eTarget.getEnd());
relation.setValue(trimPunctuation(text));
relation.setSource(eSource);
relation.setTarget(eTarget);
relation.setRelationshipType("unknown");
addToJCasIndex(relation);
}
}
}
/**
* Trim punctuation (anything that isn't an alphanumeric character)
* from the start and end of a String
*/
public static String trimPunctuation(String s){
return s.replaceAll("^[^a-zA-Z0-9]*", "").replaceAll("[^a-zA-Z0-9]*$", "");
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(PhraseChunk.class, Entity.class), ImmutableSet.of(Relation.class));
}
}