//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.grammatical;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.atteo.evo.inflector.English;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
 * Finds noun phrases that contain a word which might indicate the phrase is a location
 * (for example "city", "river" or "airport") and annotates the whole phrase as a
 * {@link Location}.
 *
 * <p>Only {@link PhraseChunk} annotations whose chunk type is {@code "NP"} are considered.
 * Each keyword is matched as a whole word, case-insensitively, in both its listed form and
 * the plural form produced by {@link English#plural(String)}.
 */
public class NPLocation extends BaleenAnnotator {
  /** Words and short phrases whose presence in a noun phrase suggests a location. */
  private static final List<String> LOCATION_KEYWORDS = Arrays.asList(
      "continent", "america", "europe", "africa", "asia", "australasia", "oceania",
      "country", "nation", "county", "state", "province", "region", "area", "neighbourhood", "neighborhood", "territory",
      "capital", "city", "town", "village", "hamlet", "suburb",
      "river", "stream", "lake", "pond", "ocean", "sea", "beach", "coast",
      "mountain", "valley", "hill",
      "field", "farm",
      "church", "cathedral", "mosque", "synagogue", "synagog",
      "shop", "shopping centre", "shopping center", "mall", "market",
      "business park", "science park",
      "university", "college", "campus", "school", "nursery", "library",
      "street", "road", "motorway", "car park", "petrol station", "gas station",
      "bridge", "tunnel",
      "house", "home", "flat", "apartment", "compound", "castle", "hotel",
      "hospital", "surgery", "clinic",
      "station", "airport",
      "barracks", "check point", "checkpoint", "out post", "outpost", "front line",
      "building", "room",
      "restaurant", "cafe", "café", "diner",
      "park",
      "camp", "campsite", "tent",
      "factory", "warehouse", "facility"
  );

  /**
   * Flags used for every keyword pattern. CASE_INSENSITIVE removes the need to lower-case
   * the text first (which would depend on the default locale). UNICODE_CHARACTER_CLASS makes
   * case folding and the {@code \b} word boundary Unicode-aware, so non-ASCII letters such as
   * the final letter of "café" are treated as word characters on every JDK version.
   */
  private static final int KEYWORD_FLAGS =
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS;

  /**
   * Whole-word patterns for every keyword and its plural; compiled once in
   * {@link #doInitialize(UimaContext)}. Starts empty so that {@link #processNP(JCas, PhraseChunk)}
   * is a harmless no-op rather than a NullPointerException if it is called before initialisation.
   */
  private List<Pattern> locationPatterns = new ArrayList<>();

  /**
   * Compiles one whole-word pattern per keyword and one per plural form.
   *
   * @param aContext the UIMA context (not used by this annotator)
   * @throws ResourceInitializationException declared by the overridden method; not thrown here
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    List<Pattern> patterns = new ArrayList<>(LOCATION_KEYWORDS.size() * 2);
    for (String keyword : LOCATION_KEYWORDS) {
      patterns.add(wholeWordPattern(keyword));

      // Skip the plural when the inflector returns the word unchanged, to avoid
      // compiling and later evaluating a duplicate pattern.
      String plural = English.plural(keyword);
      if (!keyword.equals(plural)) {
        patterns.add(wholeWordPattern(plural));
      }
    }
    locationPatterns = patterns;
  }

  /**
   * Passes every noun-phrase chunk in the document to {@link #processNP(JCas, PhraseChunk)}.
   *
   * @param jCas the document being processed
   * @throws AnalysisEngineProcessException declared by the overridden method; not thrown here
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    for (PhraseChunk chunk : JCasUtil.select(jCas, PhraseChunk.class)) {
      if ("NP".equals(chunk.getChunkType())) {
        processNP(jCas, chunk);
      }
    }
  }

  /**
   * Process a noun phrase to find entities. If the phrase contains any location keyword as a
   * whole word, a single {@link Location} spanning the entire chunk is added to the index.
   *
   * @param jCas the document the chunk belongs to
   * @param chunk the noun phrase to inspect
   */
  public void processNP(JCas jCas, PhraseChunk chunk) {
    String coveredText = chunk.getCoveredText();
    for (Pattern pattern : locationPatterns) {
      if (pattern.matcher(coveredText).find()) {
        Location l = new Location(jCas);
        l.setBegin(chunk.getBegin());
        l.setEnd(chunk.getEnd());
        addToJCasIndex(l);
        return; // Phrase chunk shouldn't contain more than one location
      }
    }
  }

  /**
   * Describes this annotator's dependencies to the pipeline orderer: {@link PhraseChunk} is
   * passed as the first set and {@link Location} as the second (presumably the required inputs
   * and the produced outputs respectively).
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(ImmutableSet.of(PhraseChunk.class), ImmutableSet.of(Location.class));
  }

  /**
   * Builds a pattern matching {@code keyword} only as a whole word or phrase, so that it is
   * not matched when it is merely part of another word.
   */
  private static Pattern wholeWordPattern(String keyword) {
    // Quote the keyword so it is always treated literally, whatever characters it contains.
    return Pattern.compile("\\b" + Pattern.quote(keyword) + "\\b", KEYWORD_FLAGS);
  }
}