//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc.helpers;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
 * Abstract annotator that identifies a root word indicative of an entity,
 * and then identifies modifiers/descriptive words prior to that root word
 * to build up a complete entity.
 *
 * <p>For example, it might identify 'car' as a vehicle, and then 'red' as a modifier
 * to produce an entity 'red car'.
 *
 * <p>Subclasses supply the vocabulary through {@link #isEntity(String)} and
 * {@link #isDescriptiveWord(String)}, and the concrete annotation through
 * {@link #createEntity(JCas)}.
 *
 * @param <T> the type of entity annotation created by this annotator
 */
public abstract class AbstractRootWordAnnotator<T extends Entity> extends BaleenTextAwareAnnotator {
    /** The annotation types read by this annotator: sentences and word tokens. */
    protected static final Set<Class<? extends Annotation>> requiredInputs = ImmutableSet.of(Sentence.class, WordToken.class);

    /**
     * Lower-case words that may appear between a descriptive word and a root word
     * without ending the phrase being built. Consulted by the default
     * {@link #isStopWord(String)}. The list is unmodifiable.
     */
    public static final List<String> STOPWORDS = Collections.unmodifiableList(Arrays.asList("and", "or"));

    /**
     * Processes a text block one sentence at a time, or as a single run of word
     * tokens when the block contains no {@link Sentence} annotations.
     *
     * @param textBlock the block of text to process
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    protected void doProcessTextBlock(TextBlock textBlock) throws AnalysisEngineProcessException {
        Collection<Sentence> sentences = textBlock.select(Sentence.class);
        if (sentences.isEmpty()) {
            // No sentences, so treat the whole text block as one unit
            processSentence(textBlock.getJCas(), textBlock.select(WordToken.class));
        } else {
            // Process each sentence in turn so that phrases never span sentences
            for (Sentence sentence : sentences) {
                processSentence(textBlock.getJCas(), JCasUtil.selectCovered(WordToken.class, sentence));
            }
        }
    }

    /**
     * Walks the word tokens in iteration order and creates an entity for every
     * root word found. The entity starts at the first descriptive word of the
     * current unbroken phrase (if there is one), otherwise at the root word
     * itself, and always ends at the end of the root word.
     *
     * <p>Each token's text is lower-cased with {@link Locale#ROOT} before it is
     * passed to {@link #isEntity(String)}, {@link #isDescriptiveWord(String)} and
     * {@link #isStopWord(String)}, so matching does not vary with the JVM's
     * default locale.
     *
     * @param jCas the JCas in which entities are created
     * @param wordTokens the word tokens to examine
     */
    protected void processSentence(JCas jCas, Collection<WordToken> wordTokens) {
        // First descriptive word of the phrase currently being built, if any
        WordToken firstDescriptor = null;

        for (WordToken token : wordTokens) {
            String word = token.getCoveredText().toLowerCase(Locale.ROOT);
            String entityType = isEntity(word);

            if (entityType != null) {
                // Root word: checked first, so it wins over a descriptive-word match
                T entity = createEntity(jCas);
                entity.setBegin(firstDescriptor == null ? token.getBegin() : firstDescriptor.getBegin());
                entity.setEnd(token.getEnd());
                entity.setSubType(entityType);
                addToJCasIndex(entity);

                firstDescriptor = null;
            } else if (isDescriptiveWord(word)) {
                // Only remember the first descriptor so the entity covers the whole phrase
                if (firstDescriptor == null) {
                    firstDescriptor = token;
                }
            } else if (!isStopWord(word)) {
                // Any other word breaks the phrase; stop words leave it intact
                firstDescriptor = null;
            }
        }
    }

    /**
     * Tests whether a word is a stop word, i.e. one that does not break a
     * phrase of descriptive words.
     *
     * @param word the lower-cased word to test
     * @return true if {@link #STOPWORDS} contains the word
     */
    protected boolean isStopWord(String word) {
        return STOPWORDS.contains(word);
    }

    /**
     * Tests whether a word is a root word indicating an entity.
     *
     * @param word the lower-cased word to test
     * @return the value to set as the entity's sub type, or null if the word is not a root word
     */
    protected abstract String isEntity(String word);

    /**
     * Tests whether a word is a descriptive word that may precede a root word.
     *
     * @param word the lower-cased word to test
     * @return true if the word is descriptive
     */
    protected abstract boolean isDescriptiveWord(String word);

    /**
     * Creates the entity annotation for a matched phrase. The caller sets the
     * begin, end and sub type, and adds the entity to the index.
     *
     * @param jCas the JCas in which to create the entity
     * @return a new entity annotation
     */
    protected abstract T createEntity(JCas jCas);
}