//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex; import java.io.IOException; import java.io.InputStreamReader; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import org.apache.uima.UimaContext; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import com.google.common.collect.ImmutableSet; import com.opencsv.CSVReader; import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.geo.Coordinate; /** * Annotate UK postcodes using RegEx and geolocate them to a point * * <p>The following regular expression is used to find potential UK postcodes in the document:</p> * <pre>\\b(GIR 0AA)|((([A-Z-[QVX]][0-9][0-9]?)|(([A-Z-[QVX]][A-Z-[IJZ]][0-9][0-9]?)|(([A-Z-[QVX]][0-9][A-HJKSTUW])|([A-Z-[IJZ]][0-9][ABEHMNPRVWXY])))) [0-9][A-Z-[CIKMOV]]{2})\\b</pre> * <p>Once found, it is compared to a CSV of UK Postcodes to retrieve LatLon information (accurate to ~11 metres). Any postcodes that aren't in the CSV are ignored and assumed to be mistakes.</p> * * */ public class Postcode extends AbstractRegexAnnotator<Coordinate> { private static final String POSTCODE_REGEX = "\\b(GIR 0AA)|((([A-Z-[QVX]][0-9][0-9]?)|(([A-Z-[QVX]][A-Z-[IJZ]][0-9][0-9]?)|(([A-Z-[QVX]][0-9][A-HJKSTUW])|([A-Z-[IJZ]][0-9][ABEHMNPRVWXY])))) [0-9][A-Z-[CIKMOV]]{2})\\b"; private Map<String, String> postcodes = null; /** New instance. * */ public Postcode() { super(POSTCODE_REGEX, true, 1.0); } @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException { postcodes = new HashMap<String, String>(); try( CSVReader reader = new CSVReader(new InputStreamReader(getClass().getResourceAsStream("ukpostcodes.csv"))); ) { String[] line; while ((line = reader.readNext()) != null) { if(line.length < 3){ getMonitor().warn("Corrupt line found in ukpostcodes.csv - line will be skipped"); continue; } Double[] lonlat = parseLonLat(line[1],line[2]); if(lonlat.length == 0){ getMonitor().warn("Corrupt line found in ukpostcodes.csv - line will be skipped"); }else{ postcodes.put(line[0].toUpperCase(), lonlat[0]+","+lonlat[1]); } } getMonitor().debug(postcodes.size()+" postcodes loaded from CSV"); }catch(IOException e){ getMonitor().warn("Unable to load postcode data - geospatial data will not be available", e); } } private Double[] parseLonLat(String longitude, String latitude){ try{ Double lon = Double.parseDouble(longitude); Double lat = Double.parseDouble(latitude); return new Double[]{lon, lat}; }catch(NumberFormatException nfe){ getMonitor().warn("Unable to parse lon lat - line will be skipped", nfe); } return new Double[0]; } @Override protected Coordinate create(JCas jCas, Matcher matcher) { Coordinate loc = new Coordinate(jCas); String pcLonlat = postcodes.get(matcher.group(0).replaceAll(" ", "").toUpperCase()); if(pcLonlat != null){ loc.setGeoJson("{\"type\": \"Point\", \"coordinates\": ["+pcLonlat+"]}"); loc.setCoordinateValue(pcLonlat); loc.setSubType("postcode"); return loc; } else if(postcodes.isEmpty()){ return loc; } else { //Else skip as it's not valid and there are postcodes loaded in return null; } } @Override public void doDestroy(){ postcodes = null; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Coordinate.class)); } }