//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.grammatical;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
 * Annotator that creates {@link Location} annotations from a three-token part of speech
 * pattern: a token tagged VBD, immediately followed by a token tagged TO, immediately
 * followed by a token tagged NNP.
 *
 * The location starts at the first NNP token and is extended over any NNP tokens that
 * follow it without interruption. For example, in "Bob went to London" the word "London"
 * is annotated as a location.
 *
 * Only the begin and end offsets of the created Location are set by this class.
 *
 * @baleen.javadoc
 */
public class TOLocationEntity extends BaleenAnnotator {
  /** Part of speech tag required on the first token of the pattern. */
  private static final String TAG_VBD = "VBD";

  /** Part of speech tag required on the second token of the pattern. */
  private static final String TAG_TO = "TO";

  /** Part of speech tag required on the third token, and on every token the location is extended over. */
  private static final String TAG_NNP = "NNP";

  /**
   * Scans every window of three consecutive word tokens (in the order supplied by
   * {@code JCasUtil.select}) and adds a Location to the index for each window that
   * matches VBD, TO, NNP.
   *
   * @param jCas the document being processed
   * @throws AnalysisEngineProcessException declared by the overridden method; not thrown directly here
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    List<WordToken> words = new ArrayList<>(JCasUtil.select(jCas, WordToken.class));

    // A match needs two more tokens after the candidate verb, so stop two short of the end.
    for (int verbIndex = 0; verbIndex + 2 < words.size(); verbIndex++) {
      if (!hasTag(words.get(verbIndex), TAG_VBD)) {
        continue;
      }
      if (!hasTag(words.get(verbIndex + 1), TAG_TO)) {
        continue;
      }

      int nounIndex = verbIndex + 2;
      WordToken firstNoun = words.get(nounIndex);
      if (!hasTag(firstNoun, TAG_NNP)) {
        continue;
      }

      Location location = new Location(jCas);
      location.setBegin(firstNoun.getBegin());
      location.setEnd(findNNPEnd(words, nounIndex));
      addToJCasIndex(location);
    }
  }

  /**
   * Null-safe check (with respect to the token's tag) that a token carries the given tag.
   *
   * @param token the token to inspect
   * @param tag the expected part of speech tag
   * @return true if the token's part of speech equals {@code tag}
   */
  private static boolean hasTag(WordToken token, String tag) {
    return tag.equals(token.getPartOfSpeech());
  }

  /**
   * Finds the end offset of the run of NNP tokens that starts at {@code firstToken}.
   *
   * @param tokens all word tokens of the document
   * @param firstToken index of the first token of the run; its tag is not re-checked here
   * @return the end offset of the last token in the unbroken run, or of the token at
   *         {@code firstToken} itself when the next token is not NNP
   */
  private int findNNPEnd(List<WordToken> tokens, int firstToken) {
    int runEnd = tokens.get(firstToken).getEnd();

    // Advance only while the tokens stay NNP: a later NNP separated from this run by
    // any other token belongs to something else and must not stretch the location.
    int cursor = firstToken + 1;
    while (cursor < tokens.size() && hasTag(tokens.get(cursor), TAG_NNP)) {
      runEnd = tokens.get(cursor).getEnd();
      cursor++;
    }

    return runEnd;
  }

  /**
   * Declares the annotation types involved: WordToken is passed as the first argument
   * and Location as the second argument of the returned action.
   *
   * @return the action description for this annotator
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(WordToken.class),
        ImmutableSet.of(Location.class));
  }
}