//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Locale;
import java.util.StringJoiner;
import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.annotators.grammatical.NPTitleEntity;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * Add title (mr, president, etc) information to previously found people.
 *
 * <p>Often with NLP models we find a person, e.g. John Smith, but omit the title information, e.g.
 * General John Smith, General Sir John Smith. This annotator adds that information back onto the
 * entity, thus improving the quality of person extraction and reducing the number of unannotated
 * words in a document.
 *
 * <p>This will also find cases where the title is included in the name, but has not been set as a
 * property.
 *
 * <p>This cleaner will overwrite existing titles.
 *
 * @baleen.javadoc
 */
public class AddTitleToPerson extends BaleenAnnotator {

  /** Splits a person's covered text into tokens on runs of periods and/or whitespace. */
  private static final Pattern TOKEN_SEPARATOR = Pattern.compile("[\\.\\s]+");

  /**
   * For every {@link Person} in the document, records any title already inside the annotation and
   * then repeatedly extends the annotation backwards over titles that immediately precede it.
   *
   * @param jCas the document being processed
   * @throws AnalysisEngineProcessException declared by the overridden method; not thrown directly
   *     by this implementation
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    // Take a snapshot: makeReplacement() moves the begin offset of each person, and begin is a key
    // of the annotation index, so iterating the collection returned by select() directly while
    // changing it is not safe.
    Collection<Person> people = new ArrayList<>(JCasUtil.select(jCas, Person.class));

    for (Person p : people) {
      // Find existing titles
      findExistingTitle(p);

      // Make as many replacements as possible, to capture things like Sir Major General Smith.
      boolean extended;
      do {
        extended = makeReplacement(jCas, p);
      } while (extended);
    }
  }

  /**
   * Sets the person's title from the leading tokens of its covered text.
   *
   * <p>Tokens are taken from the start of the covered text for as long as their lower-cased form is
   * contained in {@link NPTitleEntity#TITLES}; the first non-empty token that is not a title stops
   * the scan. The collected tokens are joined with single spaces. The title is always written, so
   * it becomes the empty string when no leading title is present.
   *
   * @param p the person to update
   */
  private void findExistingTitle(Person p) {
    StringJoiner leadingTitles = new StringJoiner(" ");

    for (String token : TOKEN_SEPARATOR.split(p.getCoveredText())) {
      if (token.isEmpty()) {
        // A leading separator produces an empty first token; skip it rather than stop.
        continue;
      }
      // Locale.ROOT keeps the lookup independent of the JVM default locale
      // (e.g. under a Turkish locale "SIR" would otherwise not lower-case to "sir").
      if (!NPTitleEntity.TITLES.contains(token.toLowerCase(Locale.ROOT))) {
        break;
      }
      leadingTitles.add(token);
    }

    p.setTitle(leadingTitles.toString());
  }

  /**
   * Makes one pass over all known titles, extending the person backwards over any title that sits
   * directly before it in the document text, separated from the person by exactly one character.
   *
   * <p>Both the bare form ({@code General Smith}) and the form with a trailing period ({@code Gen.
   * Smith}) are recognised, ignoring case. A candidate is only accepted when it starts at the
   * beginning of the document or is preceded by a character that is not a letter or digit, so that
   * the tail of a longer word (e.g. the "lord" in "Overlord") is not mistaken for a title.
   *
   * @param jCas the document containing the person
   * @param p the person to extend
   * @return {@code true} if at least one title was prepended during this pass
   */
  private boolean makeReplacement(JCas jCas, Person p) {
    String text = jCas.getDocumentText();
    boolean replacementMade = false;

    for (String title : NPTitleEntity.TITLES) {
      // Re-read the begin offset each time: an earlier title in this pass may have moved it.
      int separator = p.getBegin() - 1; // index of the single character between title and person
      int plainStart = separator - title.length();
      if (plainStart < 0) {
        continue; // not enough text before the person to hold this title
      }

      String plain = text.substring(plainStart, separator);
      if (title.equalsIgnoreCase(plain) && startsWord(text, plainStart)) {
        prependTitle(p, plainStart, plain);
        replacementMade = true;
      } else if (plainStart >= 1) {
        // Try the same title followed by a period, which needs one more character of room.
        int dottedStart = plainStart - 1;
        String dotted = text.substring(dottedStart, separator);
        if ((title + ".").equalsIgnoreCase(dotted) && startsWord(text, dottedStart)) {
          prependTitle(p, dottedStart, dotted);
          replacementMade = true;
        }
      }
    }

    return replacementMade;
  }

  /**
   * Tests whether the given offset is the start of a word.
   *
   * @param text the document text
   * @param start offset of the first character of the candidate word
   * @return {@code true} if {@code start} is 0 or the preceding character is not a letter or digit
   */
  private static boolean startsWord(String text, int start) {
    return start == 0 || !Character.isLetterOrDigit(text.charAt(start - 1));
  }

  /**
   * Moves the person's begin offset back to cover a matched title and adds that title in front of
   * the person's current title.
   *
   * @param p the person to update
   * @param newBegin the offset at which the matched title starts
   * @param matched the matched title text as it appears in the document
   */
  private void prependTitle(Person p, int newBegin, String matched) {
    p.setBegin(newBegin);
    p.setTitle(extendTitle(matched, p.getTitle()));
  }

  /**
   * Add the prefix to the existing title.
   *
   * @param prefix the prefix; any periods in it are removed
   * @param title the existing title, which may be {@code null} or empty
   * @return the cleaned prefix alone if there is no existing title, otherwise the cleaned prefix
   *     and the existing title separated by a single space
   */
  private String extendTitle(String prefix, String title) {
    String cleanedPrefix = prefix.replace(".", ""); // Get rid of periods
    if (title == null || title.isEmpty()) {
      return cleanedPrefix;
    }
    return cleanedPrefix + " " + title;
  }

  /**
   * Declares what this annotator works with.
   *
   * @return an {@link AnalysisEngineAction} built from the set containing {@link Person} and an
   *     empty set
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(ImmutableSet.of(Person.class), Collections.emptySet());
  }
}