//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.misc; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.regex.Pattern; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.common.Organisation; import uk.gov.dstl.baleen.types.common.Person; import uk.gov.dstl.baleen.types.semantic.Relation; import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator; import uk.gov.dstl.baleen.uima.data.TextBlock; /** * Identify organisations with an adjacent (or nested) person and create * a relationship of type ROLE between the two (from person to organisation). * * Allows common roles to appear between the entities, such as spokesperson. * * If no Organisation-Person match is found, it will also check that the role * hasn't been mis-extracted and included in the organisation. If it has, then a * new Person entity will be created and a relationship between the two made. * * @baleen.javadoc */ public class OrganisationPersonRole extends BaleenTextAwareAnnotator { private static final List<String> ROLES = Arrays.asList("'s", "spokesperson", "spokesman", "spokeswoman", "chair", "chairperson", "chairman", "chairwoman", "chair person", "chair man", "chair woman", "secretary", "secretary general", "leader", "chief executive", "ceo", "c.e.o.", "boss", "president", "commander", "officer", "adviser", "senior adviser", "advisor", "senior advisor", "minister", "member"); @Override protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException { Collection<Person> people = block.select(Person.class); for(Organisation org : block.select(Organisation.class)){ processOrganisation(block, org, people); } } private void processOrganisation(TextBlock block, Organisation org, Collection<Person> people){ if(findRole(block, org, people)) return; if(findAdjacent(block, org)) return; findNested(block, org); } private boolean findRole(TextBlock block, Organisation org, Collection<Person> people){ for(Person p : people){ if(p.getBegin() >= org.getEnd()){ String between = block.getDocumentText().substring(org.getEnd(), p.getBegin()).trim().toLowerCase(); if(between.isEmpty() || ROLES.contains(between) || ROLES.contains("'s " + between) || ROLES.contains("' " + between)){ Relation r = new Relation(block.getJCas(), org.getBegin(), p.getEnd()); r.setRelationshipType("ROLE"); r.setSource(p); r.setTarget(org); addToJCasIndex(r); return true; } }else if(p.getBegin() > org.getBegin() && p.getBegin() <= org.getEnd() && p.getEnd() >= org.getEnd()){ //Adjust boundary of organisation org.setEnd(p.getBegin()); while(org.getCoveredText().endsWith(" ") || org.getCoveredText().endsWith("'s") || org.getCoveredText().endsWith("'")) org.setEnd(org.getEnd() - 1); Relation r = new Relation(block.getJCas(), org.getBegin(), p.getEnd()); r.setRelationshipType("ROLE"); r.setSource(p); r.setTarget(org); addToJCasIndex(r); return true; } } return false; } private boolean findAdjacent(TextBlock block, Organisation org){ String longestMatch = ""; String subsequentText = block.getCoveredText().substring(block.toBlockOffset(org.getEnd())); for(String role : ROLES){ if("'s".equals(role) || role.length() <= longestMatch.length()) continue; Pattern p = Pattern.compile("('|'s)?\\s?"+Pattern.quote(role)+".*", Pattern.CASE_INSENSITIVE); if(p.matcher(subsequentText).matches()){ longestMatch = role; } } if(!longestMatch.isEmpty()){ Integer start = org.getEnd() + subsequentText.toLowerCase().indexOf(longestMatch); Person pers = new Person(block.getJCas(), start, start + longestMatch.length()); Relation r = new Relation(block.getJCas(), org.getBegin(), pers.getEnd()); r.setSource(org); r.setTarget(pers); r.setRelationshipType("ROLE"); addToJCasIndex(pers, r); return true; } return false; } private boolean findNested(TextBlock block, Organisation org){ //Didn't find a person, check that there's not an implied person/role nested in the organisation (e.g. bad extraction) String longestMatch = ""; for(String role : ROLES){ if("'s".equals(role) || role.length() <= longestMatch.length()) continue; Pattern p = Pattern.compile(".*\\b"+Pattern.quote(role), Pattern.CASE_INSENSITIVE); if(p.matcher(org.getCoveredText()).matches()){ longestMatch = role; } } if(!longestMatch.isEmpty()){ Person pers = new Person(block.getJCas(), org.getEnd() - longestMatch.length(), org.getEnd()); org.setEnd(org.getEnd() - longestMatch.length()); while(org.getCoveredText().endsWith(" ") || org.getCoveredText().endsWith("'s") || org.getCoveredText().endsWith("'")) org.setEnd(org.getEnd() - 1); Relation r = new Relation(block.getJCas(), org.getBegin(), pers.getEnd()); r.setSource(org); r.setTarget(pers); r.setRelationshipType("ROLE"); addToJCasIndex(pers, r); return true; } return false; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(Organisation.class, Person.class), ImmutableSet.of(Relation.class, Person.class)); } }