//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.geo.Coordinate;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
* Annotate MGRS coordinates within a document using regular expressions
*
* <p>
* Military Grid Reference System (MGRS) coordinates are extracted from the
* document content using the following regular expression:
* </p>
* <pre>\\b(GR\\h*)?([0-6]?[0-9]\\h*([C-HJ-NP-X])\\h*[A-HJ-NP-Z][A-HJ-NP-V]\\h*(([0-9]{5}\\h*[0-9]{5})|([0-9]{4}\\h*[0-9]{4})|([0-9]{3}\\h*[0-9]{3})|([0-9]{2}\\h*[0-9]{2})))\\b</pre>
* <p>Some date strings, e.g. 19MAR1968, are also valid MGRS coordinates. These can be ignored by setting the ignoreDates parameter.
* If ignoreDates is true, then the following regular expression is used to exclude dates:</p>
* <pre>([0-2]?[0-9]|3[01])\\h*(JAN|FEB|MAR|JUN|JUL|SEP|DEC)\\h*([0-9]{2}|[0-9]{4})</pre>
*
* @baleen.javadoc
*/
public class Mgrs extends BaleenTextAwareAnnotator {
  /**
   * Matches an MGRS grid reference, optionally preceded by a "GR" prefix.
   *
   * <p>Capture group 1 is the optional "GR" prefix, group 2 is the grid reference itself
   * (zone number, band letter, two square letters and an even number of digits between 4 and
   * 10), and group 3 is the band letter.
   *
   * <p>Compiled once for the class: {@link Pattern} instances are immutable and safe to share.
   *
   * <p>NOTE(review): the zone part {@code [0-6]?[0-9]} accepts 00 and 61-69 as well as 1-60.
   * Left unchanged here so that the set of matched text stays the same; confirm whether it
   * should be tightened.
   */
  private static final Pattern MGRS_PATTERN = Pattern.compile("\\b(GR\\h*)?([0-6]?[0-9]\\h*([C-HJ-NP-X])\\h*[A-HJ-NP-Z][A-HJ-NP-V]\\h*(([0-9]{5}\\h*[0-9]{5})|([0-9]{4}\\h*[0-9]{4})|([0-9]{3}\\h*[0-9]{3})|([0-9]{2}\\h*[0-9]{2})))\\b");

  /**
   * Matches a day / month abbreviation / year string such as 19MAR1968.
   *
   * <p>Only JAN, FEB, MAR, JUN, JUL, SEP and DEC are listed: the other month abbreviations
   * contain a letter that the band or square letter classes of {@link #MGRS_PATTERN} reject,
   * so they can never be matched as a grid reference in the first place.
   */
  private static final Pattern DATES_PATTERN = Pattern.compile("([0-2]?[0-9]|3[01])\\h*(JAN|FEB|MAR|JUN|JUL|SEP|DEC)\\h*([0-9]{2}|[0-9]{4})");

  /**
   * Should MGRS coordinates that may refer to dates be ignored?
   *
   * @baleen.config false
   */
  public static final String PARAM_IGNORE_DATES = "ignoreDates";

  @ConfigurationParameter(name = PARAM_IGNORE_DATES, defaultValue = "false")
  private boolean ignoreDates;

  /**
   * Finds every MGRS grid reference in the text covered by the block and adds a
   * {@link Coordinate} annotation with sub type "mgrs" for each one.
   *
   * <p>When {@code ignoreDates} is set, candidates that match the date pattern in full are
   * logged and skipped.
   *
   * @param block the text block to search
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    Matcher matcher = MGRS_PATTERN.matcher(block.getCoveredText());

    while (matcher.find()) {
      // Group 2 is the grid reference without any leading "GR" prefix
      String reference = matcher.group(2);

      if (ignoreDates && looksLikeDate(reference)) {
        getMonitor().info("Discarding possible MGRS coordinate '{}' as it resembles a date", reference);
        continue;
      }

      Coordinate loc = new Coordinate(block.getJCas());
      loc.setConfidence(1.0f);
      // The annotation span is the whole match (including a "GR" prefix if present), whereas
      // the value below is only the grid reference. Offsets are relative to the block text.
      block.setBeginAndEnd(loc, matcher.start(), matcher.end());
      loc.setValue(reference);
      loc.setSubType("mgrs");

      enhanceCoordinate(matcher, loc);
      addToJCasIndex(loc);
    }
  }

  /**
   * Tests whether the whole candidate string matches the date pattern.
   *
   * @param candidate the grid reference text to test
   * @return true if the candidate matches the date pattern in full
   */
  private boolean looksLikeDate(String candidate) {
    return DATES_PATTERN.matcher(candidate).matches();
  }

  /**
   * Describes this annotator to the pipeline orderer: constructed with an empty set and a set
   * containing {@link Coordinate} (presumably the required inputs and produced outputs
   * respectively; confirm against {@link AnalysisEngineAction}).
   *
   * @return the action description for this annotator
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Coordinate.class));
  }

  /**
   * Allows child classes to implement additional extraction to enhance the
   * coordinate (eg to add lat lon). Called once per accepted match, after the
   * span, value and sub type have been set and before the coordinate is added
   * to the index. The default implementation does nothing.
   *
   * @param matcher
   *            the matcher positioned on the current MGRS match
   * @param loc
   *            the coordinate annotation created for that match
   */
  protected void enhanceCoordinate(Matcher matcher, Coordinate loc) {
    // Do nothing
  }
}