//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * From Person entities found in the document, extract surnames (taken to be the last
 * word in multi-word names) and identify other occurrences. If only one person has
 * that surname in the document, then coreference it.
 *
 * <p>Surnames are compared case-insensitively: they are lower-cased (locale-independently)
 * when collected and matched against the document text with a case-insensitive pattern.
 *
 * @baleen.javadoc
 */
public class Surname extends BaleenAnnotator {

    /** Separator between the words of a name: any run of whitespace. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Collects one surname per multi-word Person annotation, then annotates every other
     * occurrence of each surname in the document text.
     *
     * <p>The map value is the Person to coreference new annotations with, or {@code null}
     * when the surname has been seen on Persons that do not share a ReferenceTarget (the
     * surname is then ambiguous, so occurrences are annotated but not coreferenced).
     *
     * @param jCas the document being processed
     * @throws AnalysisEngineProcessException declared by the overridden method; not thrown
     *         directly by this implementation
     */
    @Override
    protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
        Map<String, Person> surnames = new HashMap<>();

        for (Person p : JCasUtil.select(jCas, Person.class)) {
            String surname = extractSurname(p.getCoveredText());
            if (surname == null) {
                // Single-word (or blank) name: there is no surname to extract
                continue;
            }

            if (surnames.containsKey(surname)) {
                // Existing surname, so ensure existing entity is removed so we don't do coreference
                // Unless the ReferenceTarget is the same, in which case no need to do anything
                Person pers = surnames.get(surname);
                if (pers == null) {
                    // Already marked as ambiguous
                    continue;
                }

                ReferenceTarget rt = pers.getReferent();
                if (rt == null || !rt.equals(p.getReferent())) {
                    surnames.put(surname, null);
                }
            } else {
                // New surname
                surnames.put(surname, p);
            }
        }

        // Annotations are only added after the selection loop above has finished, so the
        // Person index is not modified while it is being iterated.
        for (Entry<String, Person> entry : surnames.entrySet()) {
            findSurname(jCas, entry.getKey(), entry.getValue());
        }
    }

    /**
     * Returns the lower-cased last word of a multi-word name.
     *
     * <p>Leading and trailing whitespace is ignored and words may be separated by any run of
     * whitespace, so a padded single-word name is not mistaken for a surname and a
     * whitespace-only name does not cause an out-of-bounds array access.
     *
     * @param name the covered text of a Person annotation, may be {@code null}
     * @return the surname in lower case, or {@code null} if the name has fewer than two words
     */
    private static String extractSurname(String name) {
        if (name == null) {
            return null;
        }

        String[] nameParts = WHITESPACE.split(name.trim());
        if (nameParts.length < 2) {
            return null;
        }

        // Locale.ROOT keeps the key stable regardless of the JVM default locale
        // (e.g. a Turkish locale would otherwise map 'I' to a dotless 'ı').
        return nameParts[nameParts.length - 1].toLowerCase(Locale.ROOT);
    }

    /**
     * Creates a Person annotation for every whole-word, case-insensitive occurrence of the
     * surname that is not already covered by a Person annotation.
     *
     * @param jCas the document being processed
     * @param surname the surname to search for
     * @param original the Person to coreference new annotations with, or {@code null} to
     *        create the annotations without a referent
     */
    private void findSurname(JCas jCas, String surname, Person original) {
        String text = jCas.getDocumentText();
        if (text == null) {
            return;
        }

        // UNICODE_CASE is required for case-insensitive matching of non-ASCII letters;
        // without it a lower-cased key such as "özil" would never match "Özil".
        Pattern pSurname = Pattern.compile(
                "\\b" + Pattern.quote(surname) + "\\b",
                Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
        Matcher m = pSurname.matcher(text);

        while (m.find()) {
            if (!JCasUtil.selectCovering(jCas, Person.class, m.start(), m.end()).isEmpty()) {
                // Already inside an existing Person annotation
                continue;
            }

            Person p = new Person(jCas, m.start(), m.end());

            if (original != null) {
                ReferenceTarget rt = original.getReferent();
                if (rt == null) {
                    // NOTE(review): the new ReferenceTarget is only attached to the Person
                    // annotations, it is not explicitly added to the indexes here - confirm
                    // whether that is intended.
                    rt = new ReferenceTarget(jCas);
                    original.setReferent(rt);
                }

                p.setReferent(rt);
            }

            p.addToIndexes();
        }
    }

    /**
     * Declares that this annotator takes Person annotations as input and produces Person
     * annotations as output.
     *
     * @return the action describing the input and output types
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(ImmutableSet.of(Person.class), ImmutableSet.of(Person.class));
    }
}