//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex.internals;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.SharedCountryResource;
import uk.gov.dstl.baleen.types.common.Nationality;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
* Extract nationality demonyms, e.g. French, from text
*
* <p>A list of demonyms is loaded from a JSON file, and each one is searched for in the text. An annotation is created for every match that is found.</p>
*
*
*/
public class NationalityRegex extends BaleenTextAwareAnnotator {
  /**
   * Connection to Country Resource
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedCountryResource
   */
  public static final String KEY_COUNTRY = "country";

  /**
   * Flags used for every demonym pattern. UNICODE_CASE makes the case-insensitive comparison apply
   * to non-ASCII letters as well, and UNICODE_CHARACTER_CLASS makes the {@code \b} word boundary
   * treat non-ASCII letters as word characters regardless of the JDK version in use.
   */
  private static final int PATTERN_FLAGS =
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.UNICODE_CHARACTER_CLASS;

  @ExternalResource(key = KEY_COUNTRY)
  private SharedCountryResource country;

  /**
   * Compiled demonym patterns grouped by the country code they map to. A list is kept per country
   * code so that several demonyms mapping to the same code are all retained.
   */
  private final Map<String, List<Pattern>> countryPatterns = new HashMap<>();

  /**
   * Builds one whole-word, case-insensitive pattern for each demonym supplied by the country
   * resource.
   *
   * <p>Demonyms are treated as literal text (they are quoted before being embedded in the regular
   * expression). Null or blank demonyms are skipped, because a pattern consisting only of word
   * boundaries would produce zero-length matches.
   *
   * @param aContext the UIMA context (not used by this method)
   * @throws ResourceInitializationException declared by the overridden method
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    // Start from a clean state in case initialisation is invoked more than once
    countryPatterns.clear();

    for (Entry<String, String> e : country.getDemonyms().entrySet()) {
      String demonym = e.getKey();
      if (demonym == null || demonym.trim().isEmpty()) {
        continue;
      }

      String countryCode = e.getValue();
      Pattern p = Pattern.compile("\\b" + Pattern.quote(demonym) + "\\b", PATTERN_FLAGS);

      List<Pattern> patterns = countryPatterns.get(countryCode);
      if (patterns == null) {
        patterns = new ArrayList<>();
        countryPatterns.put(countryCode, patterns);
      }
      patterns.add(p);
    }
  }

  /**
   * Searches the covered text of the block for every known demonym and adds a {@link Nationality}
   * annotation for each match.
   *
   * @param block the text block to search
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    String text = block.getCoveredText();

    for (Entry<String, List<Pattern>> e : countryPatterns.entrySet()) {
      String countryCode = e.getKey();
      for (Pattern pattern : e.getValue()) {
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
          addNationality(block, countryCode, matcher);
        }
      }
    }
  }

  /**
   * Creates a Nationality annotation for the current match of the matcher and adds it to the
   * index.
   *
   * @param block the text block the match was found in; used to set the annotation's begin and end
   * @param countryCode the country code associated with the matched demonym
   * @param matcher a matcher positioned on a successful match
   */
  private void addNationality(TextBlock block, String countryCode, Matcher matcher) {
    String matched = matcher.group(0);
    getMonitor().debug("Found nationality '{}' in text", matched);

    Nationality n = new Nationality(block.getJCas());
    n.setConfidence(1.0f);
    // Offsets from the matcher are relative to the block's covered text
    block.setBeginAndEnd(n, matcher.start(), matcher.end());
    n.setValue(matched);
    n.setCountryCode(countryCode);
    addToJCasIndex(n);
  }

  /**
   * Describes this annotator's inputs and outputs: an empty set of inputs, and {@link Nationality}
   * as the only output type.
   *
   * @return the action descriptor for this annotator
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Nationality.class));
  }
}