//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.cleaners; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.google.common.base.Strings; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.resources.SharedGenderMultiplicityResource; import uk.gov.dstl.baleen.types.common.Person; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Add gender information to a person, using the SharedGenderMultiplicityResource * * Checks each part of the name (i.e. for John Smith, it would check John and Smith) to determine the gender. * If at least one part is Male, and the other parts are Unknown or Neutral, then the person is marked as Male. * If at least one part is Female, and the other parts are Unknown or Neutral, then the person is marked as Female. * Otherwise, the person is marked as Unknown. * * Entities that already have an assigned gender (ignoring Unknown) are not modified. * * @baleen.javadoc */ public class AddGenderToPerson extends BaleenAnnotator { /** * Access to Gender and Multiplicity Resource * * @baleen.resource uk.gov.dstl.baleen.resources.SharedGenderMultiplicityResource */ public static final String KEY_GENDER_MULTIPLICITY = "genderMultiplicity"; @ExternalResource(key = KEY_GENDER_MULTIPLICITY) private SharedGenderMultiplicityResource genderResource; @Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { for (Person p : JCasUtil.select(jCas, Person.class)) { if(Strings.isNullOrEmpty(p.getGender()) || "UNKNOWN".equalsIgnoreCase(p.getGender())){ String name = ""; if(!Strings.isNullOrEmpty(p.getValue())){ name = p.getValue(); }else{ name = p.getCoveredText(); } p.setGender(determineGender(name)); } } } private String determineGender(String name){ Integer mCount = 0; Integer fCount = 0; List<String> nameParts = new ArrayList<>(); nameParts.addAll(Arrays.asList(name.split("\\s+"))); for(String namePart : nameParts){ switch(genderResource.lookupGender(namePart)){ case M: mCount++; break; case F: fCount++; break; default: //Do nothing, if it's neutral or unknown we can ignore it for now } } if(mCount > 0 && fCount == 0){ return "MALE"; }else if(fCount > 0 && mCount == 0){ return "FEMALE"; } return "UNKNOWN"; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(Person.class), Collections.emptySet()); } }