//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Organisation;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
 * Annotate singular pronouns (e.g. he, she, I) as Person entities,
 * and plural pronouns (e.g. they, them) as Organisation entities.
 *
 * Matching is case-insensitive, except for the pronoun "I", which is only
 * annotated when it appears as an upper case letter. Every annotation is
 * created with a confidence of 1.0 and spans exactly the matched WordToken.
 *
 * This annotator does not need to be TextBlock aware, as we are
 * only dealing with a single word at a time.
 *
 * @baleen.javadoc
 */
public class Pronouns extends BaleenAnnotator {
  /** Lower case pronouns that are annotated as {@link Person}. */
  private static final ImmutableSet<String> SINGULAR_PRONOUNS = ImmutableSet.of(
      "i", "me", "myself", "you", "yourself", "he", "she", "him", "himself", "her", "herself");

  /** Lower case pronouns that are annotated as {@link Organisation}. */
  private static final ImmutableSet<String> PLURAL_PRONOUNS = ImmutableSet.of(
      "we", "they", "them", "ourselves", "yourselves", "themselves");

  /**
   * Look at every {@link WordToken} in the document and add a {@link Person} or
   * {@link Organisation} annotation over each token whose text is a known pronoun.
   *
   * @param jCas the document to read WordTokens from and add annotations to
   */
  @Override
  protected void doProcess(JCas jCas) {
    for (WordToken wt : JCasUtil.select(jCas, WordToken.class)) {
      String covered = wt.getCoveredText();
      // Locale.ROOT keeps the lookup independent of the JVM default locale; with a
      // Turkish default locale, "I" would otherwise lower case to a dotless "ı"
      // and never match the entry "i".
      String text = covered.toLowerCase(Locale.ROOT);

      if (SINGULAR_PRONOUNS.contains(text)) {
        // Special case for I, which must be uppercase
        if ("i".equals(text) && !"I".equals(covered)) {
          continue;
        }

        Person p = new Person(jCas);
        p.setBegin(wt.getBegin());
        p.setEnd(wt.getEnd());
        p.setConfidence(1.0);
        addToJCasIndex(p);
      } else if (PLURAL_PRONOUNS.contains(text)) {
        Organisation o = new Organisation(jCas);
        o.setBegin(wt.getBegin());
        o.setEnd(wt.getEnd());
        o.setConfidence(1.0);
        addToJCasIndex(o);
      }
    }
  }

  /**
   * Describe the annotation types this annotator works with: {@link WordToken} is
   * passed as the first set and {@link Person} and {@link Organisation} as the second
   * (presumably the required inputs and the produced outputs respectively; confirm
   * against {@link AnalysisEngineAction}).
   *
   * @return a new action built from those two sets of types
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(WordToken.class),
        ImmutableSet.of(Person.class, Organisation.class));
  }
}