//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ConfigUtils;
import uk.gov.dstl.baleen.types.geo.Coordinate;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
* Annotate Latitude-Longitude coordinates in decimal (DD) and degrees-minutes-seconds (DMS) format using regular expressions.
*
* <p><b>Decimal degree (DD)</b></p>
* <p>The document content is run through a regular expression matcher looking for latitude-longitude pairs in DD
* (e.g. 51.068787 -1.794472 or 51.068787° -1.794472° or 51.068787°N 1.794472°W).
* If the minimum number of decimal places required is 0, the following regular expressions are used:</p>
* <pre>(-?\\d{1,3}(\\.\\d+)?)(,\\h*|\\h+)(-?\\d{1,3}(\\.\\d+)?)</pre>
* <pre>(-?\\d{1,3}(\\.\\d+)?)°(,\\h*|\\h+)(-?\\d{1,3}(\\.\\d+)?)°</pre>
* <pre>\\b(\\d{1,3}(\\.\\d+)?)°( )?([NSEW])(,\\h*|\\h+)(\\d{1,3}(\\.\\d+)?)°( )?([NSEW])</pre>
*
* <p>If the minimum number of decimal places required is greater than 0, the following regular expressions are used
* where x is the minimum number of decimal places:</p>
* <pre>(-?\\d{1,3}(\\.\\d{x,}))(,\\h*|\\h+)(-?\\d{1,3}(\\.\\d{x,}))</pre>
* <pre>(-?\\d{1,3}(\\.\\d{x,}))°(,\\h*|\\h+)(-?\\d{1,3}(\\.\\d{x,}))°</pre>
* <pre>\\b(\\d{1,3}(\\.\\d{x,}))°( )?([NSEW])(,\\h*|\\h+)(\\d{1,3}(\\.\\d{x,}))°( )?([NSEW])</pre>
*
* <p>The latitude and longitude are extracted (optionally the user can specify it should be longitude and latitude instead)
* and a GeoJSON object is built representing this location.</p>
* <p>Coordinates that are preceded by a £, $ or € symbol are skipped as they are assumed to be monetary values rather than coordinates (e.g. £3,000).</p>
* <p>The regular expressions used when searching for decimal degree formats that use the minus sign for southern and western hemispheres do not
* include word boundaries, because a leading word boundary would prevent the first negative sign from being captured.
* As a consequence these expressions can also match number pairs that are not coordinates, such as currency strings (e.g. $40,000),
* which is why matches preceded by a currency symbol are discarded as described above.</p>
*
* <p><b>Degrees-minutes-seconds (DMS)</b></p>
* <p>The document content is run through a regular expression matcher looking for latitude-longitude pairs in DMS format
* (e.g. 51°4'7.6332"N 1°47'40.0992"W) that match the following regular expression:</p>
* <pre>\\b(\\d{1,3})°(\\d{1,2})'(\\d{1,2}(\\.\\d+)?)\"([NSEW])[,/\\h]*(\\d{1,3})°(\\d{1,2})'(\\d{1,2}(\\.\\d+)?)\"([NSEW])\\b</pre>
* <p>A similar regex is used to find pairs that have spaces instead of symbols in them (e.g. 10 12 14 N, 11 13 15 E),
* and another regex to find pairs with no spaces.</p>
* <p>The following regexes are also used to pick up other formats:</p>
* <ul>
* <li><pre>\\b((lat|latitude)\\h*)?(\\d{1,2})°\\h*(\\d{1,2}(\\.\\d+)?)'(\\h*(\\d{1,2}(\\.\\d+)?)\")?\\h*([NS])\\.?,?\\h*(lon|long|longitude)?\\h*(\\d{1,3})°\\h*(\\d{1,2}(\\.\\d+)?)'(\\h*(\\d{1,2}(\\.\\d+)?)\")?\\h*([EW])\\b</pre></li>
* <li><pre>\\b((lat|latitude)\\h*)?(\\d{1,2})°\\h*(\\d{1,2})'\\.(\\d+)\\h*([NS])\\.?,?\\h*(lon|long|longitude)?\\h*(\\d{1,3})°\\h*(\\d{1,2})'\\.(\\d+)\\h*([EW])\\b</pre></li>
* </ul>
* <p>Some validation is done on the extracted text, then the latitude and longitude are extracted and a GeoJSON object is
* built representing this location.</p>
*
*
* @baleen.javadoc
*/
public class LatLon extends BaleenTextAwareAnnotator {
/*
 * Fixed (configuration independent) patterns. They are constant and Pattern is immutable,
 * so they are compiled once per class rather than once per annotator instance.
 * The lowerCamelCase names are kept because the processing methods refer to them by name.
 */

/**
 * DMS with symbols, e.g. 51°4'7.6332"N 1°47'40.0992"W.
 * Groups 1, 2, 3 are degrees, minutes and seconds of the first value and group 5 its N/S/E/W letter;
 * groups 6, 7, 8 and 10 are the same for the second value (4 and 9 are the fractional seconds).
 */
private static final Pattern llDMSPattern = Pattern
        .compile("\\b(\\d{1,3})°(\\d{1,2})'(\\d{1,2}(\\.\\d+)?)\"([NSEW])[,/\\h]*(\\d{1,3})°(\\d{1,2})'(\\d{1,2}(\\.\\d+)?)\"([NSEW])\\b");
/**
 * DMS with single spaces instead of symbols, e.g. 10 12 14 N, 11 13 15 E.
 * Uses the same group numbering as {@link #llDMSPattern}.
 */
private static final Pattern llDMSSpacePattern = Pattern
        .compile("\\b(\\d{1,3}) (\\d{1,2}) (\\d{1,2}(\\.\\d+)?) ([NSEW])[,/\\h]*(\\d{1,3}) (\\d{1,2}) (\\d{1,2}(\\.\\d+)?) ([NSEW])\\b");
/**
 * DMS written as one run of digits with optional two-digit seconds.
 * Groups 1, 2, 3 (optional) and 5 describe the first value, 6, 7, 8 (optional) and 10 the second;
 * groups 4 and 9 are the optional space before the letter.
 */
private static final Pattern llDMSNumericPattern = Pattern
        .compile("\\b(\\d{2,3})(\\d{2})(\\d{2})?( )?([NSEW])[,/\\h]*(\\d{2,3})(\\d{2})(\\d{2})?( )?([NSEW])\\b");
/**
 * DMS with a hyphen after the degrees and a comma after the minutes.
 * Uses the same group numbering as {@link #llDMSNumericPattern}.
 */
private static final Pattern llDMSPunctuationPattern = Pattern
        .compile("\\b(\\d{2,3})-(\\d{2}),(\\d{2})?( )?([NSEW])[,/\\h]*(\\d{2,3})-(\\d{2}),(\\d{2})?( )?([NSEW])\\b");
/**
 * Case-insensitive latitude then longitude in degrees, (decimal) minutes and optional seconds,
 * each optionally introduced by a lat/latitude or lon/long/longitude label.
 * Latitude: group 3 degrees, 4 minutes, 7 optional seconds, 9 N/S.
 * Longitude: group 11 degrees, 12 minutes, 15 optional seconds, 17 E/W.
 */
private static final Pattern llDMSTextPattern = Pattern
        .compile("\\b((lat|latitude)\\h*)?(\\d{1,2})°\\h*(\\d{1,2}(\\.\\d+)?)'(\\h*(\\d{1,2}(\\.\\d+)?)\")?\\h*([NS])\\.?,?\\h*(lon|long|longitude)?\\h*(\\d{1,3})°\\h*(\\d{1,2}(\\.\\d+)?)'(\\h*(\\d{1,2}(\\.\\d+)?)\")?\\h*([EW])\\b", Pattern.CASE_INSENSITIVE);
/**
 * Case-insensitive latitude then longitude where the minutes mark is followed by a dot and more digits.
 * Latitude: group 3 degrees, 4 minutes, 5 digits after the dot, 6 N/S.
 * Longitude: group 8 degrees, 9 minutes, 10 digits after the dot, 11 E/W.
 */
private static final Pattern llDMTextPattern = Pattern
        .compile("\\b((lat|latitude)\\h*)?(\\d{1,2})°\\h*(\\d{1,2})'\\.(\\d+)\\h*([NS])\\.?,?\\h*(lon|long|longitude)?\\h*(\\d{1,3})°\\h*(\\d{1,2})'\\.(\\d+)\\h*([EW])\\b", Pattern.CASE_INSENSITIVE);
/** Message logged when the text matched by a pattern cannot be converted to numbers. */
private static final String COULD_NOT_PARSE = "Couldn't parse extracted coordinates - coordinate will be skipped";
/**
* Tell the annotator that coordinates are specified with the longitude first rather than the latitude.
* Only consulted for plain decimal degree pairs; formats that carry N/S/E/W letters take the order
* from those letters instead.
*
* @baleen.config false
*/
public static final String PARAM_LONLAT = "lonlat";
@ConfigurationParameter(name = PARAM_LONLAT, defaultValue = "false")
private boolean lonlat;
/**
* Use the coordinate translated into decimal as the string to store in the
* Value field of the Entity, allowing a normalised format for all located
* coordinates. This option will also format the values to a maximum of 7 d.p.
* The use of cardinal point strings and the ordering are defined
* by the storeCardinalPoint and storeLongitudeFirst parameters.
*
* @baleen.config false
*/
public static final String PARAM_STORE_DECIMAL = "storeDecimalValue";
@ConfigurationParameter(name = PARAM_STORE_DECIMAL, defaultValue = "false")
private boolean storeDecimalValue;
/**
* Store the appropriate cardinal points (N, S, E, W) with decimal degree
* values when normalising the text stored in the value field.
* Only has an effect when storeDecimalValue is enabled.
*
* @baleen.config false
*/
public static final String PARAM_STORE_CARDINAL = "storeCardinalPoint";
@ConfigurationParameter(name = PARAM_STORE_CARDINAL, defaultValue = "false")
private boolean storeCardinalPoint;
/**
* Store the decimal degree values with the longitude first instead
* of the default latitude first when normalising the text stored in
* the value field.
* Only has an effect when storeDecimalValue is enabled.
*
* @baleen.config false
*/
public static final String PARAM_STORE_LON_FIRST = "storeLongitudeFirst";
@ConfigurationParameter(name = PARAM_STORE_LON_FIRST, defaultValue = "false")
private boolean storeLongitudeFirst;
/**
* The minimum number of decimal places required when parsing Decimal Degrees.
* If 0, then any number of decimal places is accepted.
*
* @baleen.config 2
*/
public static final String PARAM_MIN_DP = "minDP";
@ConfigurationParameter(name = PARAM_MIN_DP, defaultValue = "2")
private String minDPString;
// The minDP config parameter is received as a String and parsed into this variable
// in doInitialize to avoid issues with parameter types
private int minDP;
/**
* Regular expression for plain decimal degree pairs (no degree symbol);
* compiled in doInitialize because it depends on minDP
*/
private Pattern llDDPattern;
/**
* Regular expression for decimal degree pairs where each value is followed by the degree symbol;
* compiled in doInitialize because it depends on minDP
*/
private Pattern llDDSymPattern;
/**
* Regular expression for decimal degree pairs where each value is followed by the degree symbol
* and an N, S, E or W letter; compiled in doInitialize because it depends on minDP
*/
private Pattern llDDCardPattern;
/**
* List of currency symbols to check for when excluding monetary values
*/
private final List<String> currencySymbols = Arrays.asList("£", "$", "€");
/**
* Set of already annotated spans, stored as "start,end" match offsets, used to stop different
* patterns annotating the same span twice. It is re-created at the start of every text block.
*/
private Set<String> found;
/**
 * Initialise the annotator by compiling the three decimal degree patterns for the
 * user specified minDP (minimum number of decimal places).
 *
 * <p>With minDP equal to 0 the fractional part of each number is optional; otherwise each
 * number must have at least minDP digits after the decimal point.</p>
 *
 * @param aContext the UIMA context (not used by this method)
 * @throws ResourceInitializationException if the configured minDP is negative, as no valid
 *         expression can be built for it
 */
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    // The second argument is presumably the fallback for unparseable input - confirm against ConfigUtils
    minDP = ConfigUtils.stringToInteger(minDPString, 2);
    if (minDP < 0) {
        // A negative bound would yield an invalid quantifier and an unchecked PatternSyntaxException
        throw new ResourceInitializationException(new IllegalArgumentException(
                "Parameter " + PARAM_MIN_DP + " must not be negative, but was " + minDP));
    }

    String fraction = minDP == 0 ? "(\\.\\d+)?" : "(\\.\\d{" + minDP + ",})";
    // No word boundary characters around signed numbers as that excludes negative signs
    String signed = "(-?\\d{1,3}" + fraction + ")";
    String unsigned = "(\\d{1,3}" + fraction + ")";
    String separator = "(,\\h*|\\h+)";

    llDDPattern = Pattern.compile(signed + separator + signed);
    llDDSymPattern = Pattern.compile(signed + "°" + separator + signed + "°");
    llDDCardPattern = Pattern.compile(
            "\\b" + unsigned + "°( )?([NSEW])" + separator + unsigned + "°( )?([NSEW])");
}
/**
* Extract coordinate pairs in all supported formats from a text block, and add validated
* coordinates to the CAS.
*
* The order of the calls matters: a span is only annotated once, so the first
* pattern that matches a given span decides how it is annotated.
*
* @param block the text block to search and annotate
*/
@Override
public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
// Start with an empty set of annotated spans for this block
found = new HashSet<>();
// Normalisation replaces single characters with single characters, so match offsets
// in the normalised text are also valid offsets in the block's text
String text = normalizeQuotesAndDots(block.getCoveredText());
processDD(block, text);
processDDCard(block, text);
processDMS(block, text);
processDMSText(block, text);
}
/**
* Builds the AnalysisEngineAction describing this annotator from an empty set and a set
* containing only the Coordinate type. Presumably the first set lists required input types
* and the second the types this annotator produces - TODO confirm against AnalysisEngineAction.
*/
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Coordinate.class));
}
/**
 * Searches the text for decimal degree strings.
 * This method handles decimal degree strings consisting of a
 * pair of decimal numbers separated by whitespace or a comma
 * and optionally with the degree symbol. The pair is assumed
 * to be in the order Latitude, Longitude unless the parameter
 * lonlat has been set to true.
 * Matches directly preceded by a currency symbol are skipped.
 *
 * @param block the text block that receives the created Coordinate
 *            annotations.
 * @param text the text string to search for coordinates.
 */
private void processDD(TextBlock block, String text) {
    Pattern[] patterns = new Pattern[] { llDDPattern, llDDSymPattern };
    for (Pattern p : patterns) {
        Matcher matcher = p.matcher(text);
        while (matcher.find()) {
            int start = matcher.start(1);
            // A match at offset 0 has no preceding character; guarding here avoids
            // a StringIndexOutOfBoundsException from substring(-1, 0)
            if (start > 0 && currencySymbols.contains(text.substring(start - 1, start))) {
                getMonitor()
                        .info("Skipping coordinate as it is preceded by a currency symbol");
                continue;
            }

            try {
                double first = Double.parseDouble(matcher.group(1));
                double second = Double.parseDouble(matcher.group(4));
                double lat = lonlat ? second : first;
                double lon = lonlat ? first : second;
                addCoordinate(block, matcher, lon, lat, "dd");
            } catch (NumberFormatException e) {
                getMonitor().warn(COULD_NOT_PARSE, e);
            }
        }
    }
}
/**
 * Finds decimal degree pairs in which every number carries a degree symbol and a
 * compass letter (N, S, E or W), for example 51.068787°N 1.794472°W.
 * The letters, not the position, decide which number is the latitude and which
 * is the longitude, and whether a value has to be negated (S and W).
 *
 * @param block the text block that receives the created Coordinate annotations.
 * @param text the text string to search for coordinates.
 */
private void processDDCard(TextBlock block, String text) {
    for (Matcher m = llDDCardPattern.matcher(text); m.find();) {
        String firstPoint = m.group(4);
        String secondPoint = m.group(9);
        // Only continue when one letter is N/S and the other is E/W
        if (isValidPair(firstPoint, secondPoint)) {
            try {
                double firstValue = Double.parseDouble(m.group(1));
                double secondValue = Double.parseDouble(m.group(6));

                // Latitude is expected first; an E/W letter on the first value means the opposite
                boolean longitudeFirst = "E".equals(firstPoint) || "W".equals(firstPoint);
                double lat = longitudeFirst ? secondValue : firstValue;
                double lon = longitudeFirst ? firstValue : secondValue;

                if (flipLon(firstPoint, secondPoint)) {
                    lon = -lon;
                }
                if (flipLat(firstPoint, secondPoint)) {
                    lat = -lat;
                }

                addCoordinate(block, m, lon, lat, "dd");
            } catch (NumberFormatException e) {
                getMonitor().warn(COULD_NOT_PARSE, e);
            }
        }
    }
}
/**
 * Finds degrees-minutes-seconds pairs using, in this order, the symbol, space,
 * numeric and punctuation patterns. A match is only used when its two compass
 * letters form a N/S plus E/W pair.
 *
 * @param block the text block that receives the created Coordinate annotations.
 * @param text the text string to search for coordinates.
 */
private void processDMS(TextBlock block, String text) throws AnalysisEngineProcessException {
    List<Pattern> dmsPatterns = Arrays.asList(
            llDMSPattern, llDMSSpacePattern, llDMSNumericPattern, llDMSPunctuationPattern);
    for (Pattern dmsPattern : dmsPatterns) {
        for (Matcher m = dmsPattern.matcher(text); m.find();) {
            if (isValidPair(m.group(5), m.group(10))) {
                try {
                    // Element 0 is the longitude, element 1 the latitude
                    double[] lonLat = determineLonLatDMS(m);
                    addCoordinate(block, m, lonLat[0], lonLat[1], "dms");
                } catch (NumberFormatException e) {
                    getMonitor().warn(COULD_NOT_PARSE, e);
                }
            }
        }
    }
}
/**
 * Finds latitude/longitude pairs written with optional "lat"/"lon" style labels.
 * Two patterns are applied in turn: degrees, (decimal) minutes and optional seconds,
 * followed by degrees and minutes where the minutes mark is followed by a dot and digits.
 * In both patterns the latitude comes first; S latitudes and W longitudes are negated.
 *
 * @param block the text block that receives the created Coordinate annotations.
 * @param text the text string to search for coordinates.
 */
private void processDMSText(TextBlock block, String text) throws AnalysisEngineProcessException {
    Matcher m = llDMSTextPattern.matcher(text);
    while (m.find()) {
        // Groups: 3 = degrees, 4 = minutes, 7 = optional seconds, 9 = N/S
        double lat = Double.parseDouble(m.group(3));
        lat += Double.parseDouble(m.group(4)) / 60;
        if (m.group(7) != null) {
            lat += Double.parseDouble(m.group(7)) / 3600;
        }
        if ("S".equalsIgnoreCase(m.group(9))) {
            lat = -lat;
        }

        // Groups: 11 = degrees, 12 = minutes, 15 = optional seconds, 17 = E/W
        double lon = Double.parseDouble(m.group(11));
        lon += Double.parseDouble(m.group(12)) / 60;
        if (m.group(15) != null) {
            lon += Double.parseDouble(m.group(15)) / 3600;
        }
        if ("W".equalsIgnoreCase(m.group(17))) {
            lon = -lon;
        }

        addCoordinate(block, m, lon, lat, "dms");
    }

    m = llDMTextPattern.matcher(text);
    while (m.find()) {
        // NOTE(review): groups 5 and 10 are the digits after the dot that follows the minutes
        // mark and are treated as whole seconds here; if they are meant to be the decimal
        // fraction of the minutes this conversion needs revisiting - confirm intended format.

        // Groups: 3 = degrees, 4 = minutes, 5 = digits after the dot, 6 = N/S
        double lat = Double.parseDouble(m.group(3));
        lat += Double.parseDouble(m.group(4)) / 60;
        lat += Double.parseDouble(m.group(5)) / 3600;
        if ("S".equalsIgnoreCase(m.group(6))) {
            lat = -lat;
        }

        // Groups: 8 = degrees, 9 = minutes, 10 = digits after the dot, 11 = E/W
        double lon = Double.parseDouble(m.group(8));
        lon += Double.parseDouble(m.group(9)) / 60;
        lon += Double.parseDouble(m.group(10)) / 3600;
        // Group 11 can only be E or W, so western longitudes are the ones to negate
        if ("W".equalsIgnoreCase(m.group(11))) {
            lon = -lon;
        }

        addCoordinate(block, m, lon, lat, "dms");
    }
}
/**
 * Converts a match of one of the DMS patterns into decimal degrees.
 * Groups 1-3 and 6-8 hold degrees, minutes and (possibly absent) seconds of the first
 * and second value; groups 5 and 10 hold their compass letters.
 *
 * @param matcher a matcher positioned on a successful DMS match
 * @return a two element array holding the longitude followed by the latitude
 */
private double[] determineLonLatDMS(Matcher matcher){
    String firstPoint = matcher.group(5);
    String secondPoint = matcher.group(10);

    double firstValue = dmsToDeg(Integer.parseInt(matcher.group(1)),
            Integer.parseInt(matcher.group(2)),
            parseOrNull(matcher.group(3)));
    double secondValue = dmsToDeg(Integer.parseInt(matcher.group(6)),
            Integer.parseInt(matcher.group(7)),
            parseOrNull(matcher.group(8)));

    // Latitude is expected first unless the first value carries an E/W letter
    boolean longitudeFirst = "E".equals(firstPoint) || "W".equals(firstPoint);
    double lat = longitudeFirst ? secondValue : firstValue;
    double lon = longitudeFirst ? firstValue : secondValue;

    if (flipLon(firstPoint, secondPoint)) {
        lon = -lon;
    }
    if (flipLat(firstPoint, secondPoint)) {
        lat = -lat;
    }

    return new double[] { lon, lat };
}
/**
 * Checks that the supplied compass letters contain at least one latitude
 * indicator (N or S) and at least one longitude indicator (E or W).
 * Letters are compared ignoring case; null and any other values are ignored.
 *
 * @param parameters the compass letters to inspect
 * @return true if both kinds of indicator are present
 */
private boolean isValidPair(String... parameters){
    boolean latitudeSeen = false;
    boolean longitudeSeen = false;

    for (int i = 0; i < parameters.length; i++) {
        String letter = parameters[i];
        latitudeSeen |= "N".equalsIgnoreCase(letter) || "S".equalsIgnoreCase(letter);
        longitudeSeen |= "E".equalsIgnoreCase(letter) || "W".equalsIgnoreCase(letter);
    }

    return latitudeSeen && longitudeSeen;
}
/**
 * Tells whether the latitude has to be negated, i.e. whether any of the
 * supplied compass letters is S (ignoring case).
 *
 * @param parameters the compass letters to inspect
 * @return true if one of the letters is S
 */
private boolean flipLat(String... parameters){
    boolean southern = false;
    for (int i = 0; i < parameters.length && !southern; i++) {
        southern = "S".equalsIgnoreCase(parameters[i]);
    }
    return southern;
}
/**
 * Tells whether the longitude has to be negated, i.e. whether any of the
 * supplied compass letters is W (ignoring case).
 *
 * @param parameters the compass letters to inspect
 * @return true if one of the letters is W
 */
private boolean flipLon(String... parameters){
    boolean western = false;
    for (int i = 0; i < parameters.length && !western; i++) {
        western = "W".equalsIgnoreCase(parameters[i]);
    }
    return western;
}
/**
 * Turns a degrees / minutes / seconds triple into a single decimal degree value.
 * No hemisphere handling happens here: the degrees are expected to be positive,
 * and negating the result for S latitudes or W longitudes is left to the caller.
 *
 * @param d number of degrees. It is assumed this is a positive value
 * @param m number of minutes
 * @param s number of seconds, or null if no seconds supplied
 * @return the decimal degree value for the degrees minutes seconds
 */
private double dmsToDeg(Integer d, Integer m, Double s) {
    final double minutesAsSeconds = m * 60.0;
    // Missing seconds contribute nothing
    final double totalSeconds = (s == null) ? minutesAsSeconds : minutesAsSeconds + s;
    return d + (totalSeconds / 3600);
}
/**
 * Parses a string as a double, passing a missing value through.
 *
 * @param s the text to parse, may be null
 * @return the parsed value, or null if s is null
 * @throws NumberFormatException if s is not null and is not a parsable number
 */
private Double parseOrNull(String s){
    return s == null ? null : Double.valueOf(s);
}
/**
 * Creates a Coordinate annotation for a match, provided the values are in range
 * (latitude within -90..90, longitude within -180..180) and the matched span has
 * not already been annotated in the current text block.
 *
 * @param block the text block the match was found in
 * @param matcher the matcher positioned on the match; supplies the span and the matched text
 * @param lon the longitude in decimal degrees
 * @param lat the latitude in decimal degrees
 * @param coordinateType the value to store as the annotation's sub type
 */
private void addCoordinate(TextBlock block, Matcher matcher, Double lon, Double lat, String coordinateType) {
    boolean inRange = lat >= -90 && lat <= 90 && lon >= -180 && lon <= 180;
    if (!inRange) {
        return;
    }

    int begin = matcher.start();
    int end = matcher.end();
    // The span doubles as the key used to skip matches already annotated by another pattern
    if (!found.add(begin + "," + end)) {
        return;
    }

    Coordinate loc = new Coordinate(block.getJCas());
    loc.setConfidence(1.0f);
    block.setBeginAndEnd(loc, begin, end);

    if (storeDecimalValue) {
        addNormalisedValue(lat, lon, loc);
    } else {
        loc.setValue(matcher.group(0));
    }

    String coords = "[" + lon + "," + lat + "]";
    loc.setGeoJson("{\"type\":\"Point\",\"coordinates\":"
            + coords + "}");
    loc.setCoordinateValue(lon + "," + lat);
    loc.setSubType(coordinateType);

    addToJCasIndex(loc);
}
/**
 * Formats the decimal degree values into a normalised form.
 * This method sets the Value field of the Coordinate to a normalised
 * string rather than a copy of the original text in the document, and
 * marks the Coordinate as normalised.
 * The string consists of the two decimal degree values separated by a
 * space, each with at most 7 decimal places and always using '.' as the
 * decimal separator. Whether cardinal point letters are appended and
 * which value comes first is controlled by the storeCardinalPoint and
 * storeLongitudeFirst parameters.
 *
 * @param lat the latitude as a decimal degree (negative is S)
 * @param lon the longitude as a decimal degree (negative is W)
 * @param loc the coordinate location to hold the normalised string
 */
private void addNormalisedValue(double lat, double lon, Coordinate loc) {
    // Fixed symbols: the default locale could otherwise supply a ',' decimal separator,
    // which would defeat the purpose of a normalised value
    DecimalFormat df = new DecimalFormat("###.#######", DecimalFormatSymbols.getInstance(Locale.ROOT));

    String latString;
    String lonString;
    if (storeCardinalPoint) {
        // The hemisphere is carried by the letter, so the magnitude is formatted unsigned
        latString = df.format(Math.abs(lat)) + (lat < 0 ? "S" : "N");
        lonString = df.format(Math.abs(lon)) + (lon < 0 ? "W" : "E");
    } else {
        latString = df.format(lat);
        lonString = df.format(lon);
    }

    String firstCoord = storeLongitudeFirst ? lonString : latString;
    String secondCoord = storeLongitudeFirst ? latString : lonString;

    loc.setValue(firstCoord + " " + secondCoord);
    loc.setIsNormalised(true);
}
/**
 * Maps typographic variants of quotes and dots onto their plain equivalents so the
 * coordinate expressions only have to deal with ", ' and '.'.
 * Curly double quotes, the double prime and similar marks become a double quote;
 * curly single quotes, the prime, the acute accent and the back tick become a
 * single quote; the middle dot becomes a full stop. Every replacement is one
 * character for one character, so the length and all offsets are unchanged.
 *
 * @param s the text to normalise, must not be null
 * @return the normalised text
 */
public static String normalizeQuotesAndDots(String s){
    StringBuilder normalised = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        switch (c) {
            case '\u201C':
            case '\u201D':
            case '\u2033':
            case '\u02BA':
            case '\u301E':
            case '\u3003':
                normalised.append('"');
                break;
            case '\u2018':
            case '\u2019':
            case '\u2032':
            case '\u00B4':
            case '\u02B9':
            case '`':
                normalised.append('\'');
                break;
            case '\u00B7':
                normalised.append('.');
                break;
            default:
                normalised.append(c);
                break;
        }
    }
    return normalised.toString();
}
}