//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.grammatical;
import java.util.Arrays;
import java.util.List;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Look for noun phrases (NP) that start with a known title, e.g. Mr, and annotate these as people
*
*
*/
public class NPTitleEntity extends BaleenAnnotator {
public static final List<String> TITLES = Arrays.asList("mr", "mrs", "ms", "miss", "master", //Common titles
"rev", "reverend", "fr", "father", "pope", "brother", "sister", "friar", "abbess", "abbott", "cardinal", "vicar", //Religious titles (Christian)
"mullah", "imam", "ayatollah", "druid", "lama", "buddha", "rabbi", "rebbe", //Religious titles (non-Christian)
"dr", "doctor", "prof", "professor", //Educational titles
"pres", "president", "governor", "senator", "ambassador", "mayor", "envoy", "prime minister", "minister", "councillor", "representative", "speaker", "mp", "emir", "chief", "sultan", "wali", "sheikh", "shaykh", //Political titles
"pvt", "cpl", "corporal", "sgt", "sergeant", "capt", "captain", "maj", "cmdr", "commander", "lt", "lieutenant", "lt col", "lieutenant colonel", "col", "colonel", "gen", "adm", "admiral", "cdre", "commodore", "officer", //Military titles
"hrh", "his royal highness", "his majesty", "her royal highness", "her majesty", "king", "queen", "prince", "princess", "emperor", "empress", "tsar", "tsarina", "tsaritsa", "shah", //Royal titles
"sir", "dame", "lord", "lady", "baron", "baroness", "count", "countess", "duke", "duchess", "earl", "viscount", "marquis", "marquess", "grand duke", "grand duchess", "archduke", "archduchess" //Nobility titles
);
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
for(PhraseChunk chunk : JCasUtil.select(jCas, PhraseChunk.class)){
if(!"NP".equals(chunk.getChunkType()))
continue;
String coveredText = chunk.getCoveredText().toLowerCase();
for(String title : TITLES){
if(coveredText.startsWith(title) && coveredText.length() > title.length() && coveredText.substring(title.length(), title.length() + 1).matches("[^a-z0-9]")){
Person p = new Person(jCas);
p.setBegin(chunk.getBegin());
p.setEnd(chunk.getEnd());
p.setTitle(chunk.getCoveredText().substring(0, title.length()));
addToJCasIndex(p);
break;
}
}
}
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(PhraseChunk.class), ImmutableSet.of(Person.class));
}
}