package com.bericotech.clavin.gazetteer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
* ---------------------------------------------------
*
* Copyright (C) 2012-2013 Berico Technologies
* http://clavin.bericotechnologies.com
*
* ====================================================================
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* ====================================================================
*
* BasicGeoName.java
*
*###################################################################*/
/**
* Data-rich representation of a named location, based on entries in
* the GeoNames gazetteer.
*
* TODO: link administrative subdivision code fields to the GeoName
* records they reference
*
*/
public class BasicGeoName implements GeoName {
/**
* The logger; used by this class to report rejected parent assignments
* and invalid ancestry paths.
*/
private static final Logger LOG = LoggerFactory.getLogger(BasicGeoName.class);
/**
* The regex used to extract the administrative division level for A:ADM[1-4]H? records.
* It is matched against the feature code's enum name; capture group 1 holds the
* single level digit, and a trailing "H" is accepted but ignored.
*/
private static final Pattern ADM_LEVEL_REGEX = Pattern.compile("^ADM(\\d)H?$");
/**
* The set of top-level feature codes. Within feature class A these are assigned
* administrative level 0, and a GeoName whose feature code is in this set reports
* itself as a top-level administrative division.
*/
private static final Set<FeatureCode> TOP_LEVEL_FEATURES = EnumSet.of(
FeatureCode.PCL,
FeatureCode.PCLD,
FeatureCode.PCLF,
FeatureCode.PCLI,
FeatureCode.PCLIX,
FeatureCode.PCLS,
FeatureCode.TERRI
);
/**
* The set of FeatureCodes that are valid administrative ancestors: the four
* ADM levels plus every top-level feature code. Only GeoNames of feature
* class A carrying one of these codes are accepted as a parent or produce
* an ancestry key of their own.
*/
private static final Set<FeatureCode> VALID_ADMIN_ANCESTORS = EnumSet.of(
FeatureCode.ADM1,
FeatureCode.ADM2,
FeatureCode.ADM3,
FeatureCode.ADM4,
FeatureCode.PCL,
FeatureCode.PCLD,
FeatureCode.PCLF,
FeatureCode.PCLI,
FeatureCode.PCLIX,
FeatureCode.PCLS,
FeatureCode.TERRI
);
// id of record in geonames database; also the sole input to equals() and hashCode()
private final int geonameID;
// name of geographical point (utf8)
private final String name;
// name of geographical point in plain ascii characters
private final String asciiName;
// list of alternate names for location; unmodifiable and never null (empty if none were supplied)
private final List<String> alternateNames;
// the preferred name of this GeoName; null when none is known, in which case
// getPreferredName() falls back to name
private final String preferredName;
// latitude in decimal degrees
private final double latitude;
// longitude in decimal degrees
private final double longitude;
// major feature category
// (see http://www.geonames.org/export/codes.html)
private final FeatureClass featureClass;
// specific feature type within the feature class; a TERR supplied to the
// constructor may be stored as TERRI (top-level territory)
// (see http://www.geonames.org/export/codes.html)
private final FeatureCode featureCode;
// ISO-3166 2-letter country code
private final CountryCode primaryCountryCode;
/**
 * Gets the country name associated with the primary country code.
 *
 * @return the name of the primary country, or an empty string if this GeoName
 *         was constructed without a primary country code
 */
@Override
public String getPrimaryCountryName() {
    // the constructor tolerates a null country code and treats its name as "";
    // mirror that here instead of failing with a NullPointerException
    return primaryCountryCode != null ? primaryCountryCode.name : "";
}
// list of alternate ISO-3166 2-letter country codes; unmodifiable and never null
private final List<CountryCode> alternateCountryCodes;
/* TODO: refactor the 4 fields below to link to the GeoName
* object that they refer to
*/
// Mostly FIPS codes. ISO codes are used for US, CH, BE and ME. UK
// and Greece are using an additional level between country and
// FIPS code.
private final String admin1Code;
// code for the second administrative division
// (e.g., a county in the US)
private final String admin2Code;
// code for third level administrative division
private final String admin3Code;
// code for fourth level administrative division
private final String admin4Code;
// total number of inhabitants
private final long population;
// in meters
private final int elevation;
// digital elevation model, srtm3 or gtopo30, average elevation of
// 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters,
// integer. srtm processed by cgiar/ciat.
private final int digitalElevationModel;
// timezone for geographical point; stored as a private clone, may be null
private final TimeZone timezone;
// date of last modification in GeoNames database; stored as a private copy, may be null
private final Date modificationDate;
// the GeoName ID of the parent of this GeoName
// NOTE(review): this field is never assigned or read in this class;
// getParentId() derives its value from the parent field instead
private Integer parentId;
// the parent of this GeoName; null until setParent() accepts a parent
private GeoName parent;
// the gazetteer record this GeoName was parsed from
private String gazetteerRecord;
/**
 * Sole constructor for {@link BasicGeoName} class.
 *
 * Encapsulates a gazetteer record from the GeoNames database. Mutable
 * arguments (lists, time zone, date) are defensively copied. A
 * {@code null} numeric argument is stored as {@code OUT_OF_BOUNDS}, the
 * same sentinel the record parser uses for values it cannot read.
 *
 * @param geonameID                 unique identifier
 * @param name                      name of this location
 * @param asciiName                 plain text version of name
 * @param alternateNames            list of alternate names, if any (null is treated as empty)
 * @param preferredName             the preferred name, if known; blank values are treated as unknown
 * @param latitude                  lat coord
 * @param longitude                 lon coord
 * @param featureClass              general type of feature (e.g., "Populated place")
 * @param featureCode               specific type of feature (e.g., "capital of a political entity");
 *                                  a TERR whose name matches the primary country name is stored as TERRI
 * @param primaryCountryCode        ISO country code
 * @param alternateCountryCodes     list of alternate country codes, if any (i.e., disputed territories)
 * @param admin1Code                FIPS code for first-level administrative subdivision (e.g., state or province)
 * @param admin2Code                second-level administrative subdivision (e.g., county)
 * @param admin3Code                third-level administrative subdivision
 * @param admin4Code                fourth-level administrative subdivision
 * @param population                number of inhabitants
 * @param elevation                 elevation in meters
 * @param digitalElevationModel     another way to measure elevation
 * @param timezone                  timezone for this location
 * @param modificationDate          date of last modification for the GeoNames record
 * @param gazetteerRecord           the gazetteer record
 */
public BasicGeoName(
        int geonameID,
        String name,
        String asciiName,
        List<String> alternateNames,
        String preferredName,
        Double latitude,
        Double longitude,
        FeatureClass featureClass,
        FeatureCode featureCode,
        CountryCode primaryCountryCode,
        List<CountryCode> alternateCountryCodes,
        String admin1Code,
        String admin2Code,
        String admin3Code,
        String admin4Code,
        Long population,
        Integer elevation,
        Integer digitalElevationModel,
        TimeZone timezone,
        Date modificationDate,
        String gazetteerRecord) {
    this.geonameID = geonameID;
    this.name = name;
    this.asciiName = asciiName;
    // defensive copy; ensure this is never null
    this.alternateNames = alternateNames != null
            ? Collections.unmodifiableList(new ArrayList<String>(alternateNames))
            : Collections.<String>emptyList();
    // the coordinates are boxed and may be null; avoid an unboxing NullPointerException
    this.latitude = latitude != null ? latitude.doubleValue() : OUT_OF_BOUNDS;
    this.longitude = longitude != null ? longitude.doubleValue() : OUT_OF_BOUNDS;
    this.primaryCountryCode = primaryCountryCode;
    String pccName = primaryCountryCode != null ? primaryCountryCode.name : "";
    // defensive copy; ensure this is never null
    this.alternateCountryCodes = alternateCountryCodes != null
            ? Collections.unmodifiableList(new ArrayList<CountryCode>(alternateCountryCodes))
            : Collections.<CountryCode>emptyList();
    this.featureClass = featureClass;

    boolean namedAfterCountry = isNamedAfterCountry(this.name, this.asciiName, this.alternateNames, pccName);
    // configure the feature code so top-level territories are distinguishable
    if (featureCode == FeatureCode.TERR) {
        this.featureCode = namedAfterCountry ? FeatureCode.TERRI : FeatureCode.TERR;
    } else {
        this.featureCode = featureCode;
    }
    // if this is a top level division, use the primary country name as the preferred name; otherwise
    // use the name provided or null. The test is made against the featureCode argument, so a TERR
    // that was just promoted to TERRI keeps the preferred name it was given.
    boolean usePcc = TOP_LEVEL_FEATURES.contains(featureCode) && namedAfterCountry;
    if (usePcc) {
        this.preferredName = pccName;
    } else {
        String trimmedPreferred = preferredName != null ? preferredName.trim() : "";
        this.preferredName = trimmedPreferred.isEmpty() ? null : trimmedPreferred;
    }
    this.admin1Code = admin1Code;
    this.admin2Code = admin2Code;
    this.admin3Code = admin3Code;
    this.admin4Code = admin4Code;
    // boxed numerics may be null; store the out-of-bounds sentinel instead of failing on unboxing
    this.population = population != null ? population.longValue() : OUT_OF_BOUNDS;
    this.elevation = elevation != null ? elevation.intValue() : OUT_OF_BOUNDS;
    this.digitalElevationModel = digitalElevationModel != null ? digitalElevationModel.intValue() : OUT_OF_BOUNDS;
    // defensive copies of the mutable arguments
    this.timezone = timezone != null ? (TimeZone) timezone.clone() : null;
    this.modificationDate = modificationDate != null ? new Date(modificationDate.getTime()) : null;
    this.gazetteerRecord = gazetteerRecord;
}
/**
 * Tests whether a location carries the name of its primary country, either as its
 * name, its ASCII name or one of its alternate names.
 *
 * @param name the location name; may be null
 * @param asciiName the ASCII location name; may be null
 * @param alternateNames the alternate names; must not be null
 * @param countryName the primary country name; a null or empty value never matches
 * @return <code>true</code> if any of the location's names equals the country name
 */
private static boolean isNamedAfterCountry(final String name, final String asciiName,
        final List<String> alternateNames, final String countryName) {
    // without a country name there is nothing to match against; this also keeps a blank
    // alternate name from being mistaken for the country name
    if (countryName == null || countryName.isEmpty()) {
        return false;
    }
    return countryName.equals(name) || countryName.equals(asciiName) || alternateNames.contains(countryName);
}
/**
 * Builds a {@link BasicGeoName} object based on a single gazetteer
 * record in the GeoNames geographical database, without a preferred name.
 *
 * @param inputLine single line of tab-delimited text representing one record from the GeoNames gazetteer
 * @return new GeoName object
 */
public static GeoName parseFromGeoNamesRecord(String inputLine) {
    return parseFromGeoNamesRecord(inputLine, null);
}
/**
 * Builds a {@link BasicGeoName} object based on a gazetteer record in the
 * GeoNames geographical database.
 *
 * The input may hold several newline-separated records. The first record is the
 * target GeoName; each following record is parsed and set as the parent of the
 * record before it. If a record is rejected as a parent, an error is logged and
 * the remaining records are ignored.
 *
 * @param inputLine tab-delimited GeoNames record, optionally followed by newline-separated ancestor records
 * @param preferredName the preferred name of this GeoName as indicated by the GeoNames alternate names table
 * @return new GeoName object, with as much of its ancestry attached as could be validated
 */
public static GeoName parseFromGeoNamesRecord(final String inputLine, final String preferredName) {
    final String[] records = inputLine.split("\n");
    final GeoName target = parseGeoName(records[0], preferredName);
    // any further records are assumed to be the ancestry of the target GeoName,
    // ordered from the nearest ancestor outwards
    GeoName child = target;
    for (int i = 1; i < records.length; i++) {
        final GeoName ancestor = parseGeoName(records[i], null);
        if (!child.setParent(ancestor)) {
            LOG.error("Invalid ancestry path for GeoName [{}]: {}", target, inputLine.replaceAll("\n", " |@| "));
            break;
        }
        child = ancestor;
    }
    return target;
}
/**
 * Parses one tab-delimited GeoNames record into a {@link BasicGeoName}.
 *
 * Fields 0-11 are always read. Fields 12-18 are only trusted when the record
 * splits into at least 19 tokens; otherwise they are replaced by defaults
 * (empty admin codes, OUT_OF_BOUNDS numerics, no time zone, epoch date).
 * Numeric fields that cannot be parsed become OUT_OF_BOUNDS, except the
 * record id, whose NumberFormatException propagates. Unknown feature or
 * country codes are passed to the enums' valueOf and are not caught here.
 *
 * @param inputLine single tab-delimited record
 * @param preferredName the preferred name to give the new GeoName; may be null
 * @return the parsed GeoName
 */
private static GeoName parseGeoName(final String inputLine, final String preferredName) {
    // GeoNames gazetteer entries are tab-delimited
    final String[] fields = inputLine.split("\t");

    final int geonameID = Integer.parseInt(fields[0]);
    final String name = fields[1];
    final String asciiName = fields[2];

    // better to pass an empty list than a list containing the empty String ""
    final List<String> alternateNames = fields[3].isEmpty()
            ? new ArrayList<String>()
            : Arrays.asList(fields[3].split(","));

    final double latitude = parseDoubleOrOutOfBounds(fields[4]);
    final double longitude = parseDoubleOrOutOfBounds(fields[5]);

    // an empty code means "not available"
    final FeatureClass featureClass = fields[6].isEmpty() ? FeatureClass.NULL : FeatureClass.valueOf(fields[6]);
    final FeatureCode featureCode = fields[7].isEmpty() ? FeatureCode.NULL : FeatureCode.valueOf(fields[7]);
    // an empty country code means No Man's Land
    final CountryCode primaryCountryCode = fields[8].isEmpty() ? CountryCode.NULL : CountryCode.valueOf(fields[8]);

    final List<CountryCode> alternateCountryCodes = new ArrayList<CountryCode>();
    if (!fields[9].isEmpty()) {
        for (final String code : fields[9].split(",")) {
            // skip empty entries left behind by malformed data
            if (!code.isEmpty()) {
                alternateCountryCodes.add(CountryCode.valueOf(code));
            }
        }
    }

    final String admin1Code = fields[10];
    final String admin2Code = fields[11];

    // check for dirty data: with fewer than 19 tokens the record format is
    // corrupted, so nothing after this point is trusted
    final boolean complete = fields.length >= 19;
    final String admin3Code = complete ? fields[12] : "";
    final String admin4Code = complete ? fields[13] : "";
    final long population = complete ? parseLongOrOutOfBounds(fields[14]) : OUT_OF_BOUNDS;
    final int elevation = complete ? parseIntOrOutOfBounds(fields[15]) : OUT_OF_BOUNDS;
    final int digitalElevationModel = complete ? parseIntOrOutOfBounds(fields[16]) : OUT_OF_BOUNDS;
    final TimeZone timezone = complete ? TimeZone.getTimeZone(fields[17]) : null;
    final Date modificationDate = complete ? parseDateOrEpoch(fields[18]) : new Date(0);

    return new BasicGeoName(geonameID, name, asciiName, alternateNames, preferredName,
            latitude, longitude, featureClass, featureCode,
            primaryCountryCode, alternateCountryCodes, admin1Code,
            admin2Code, admin3Code, admin4Code, population,
            elevation, digitalElevationModel, timezone,
            modificationDate, inputLine);
}
/** Parses a double, yielding OUT_OF_BOUNDS when the text is not a valid number. */
private static double parseDoubleOrOutOfBounds(final String text) {
    try {
        return Double.parseDouble(text);
    } catch (NumberFormatException e) {
        return OUT_OF_BOUNDS;
    }
}
/** Parses a long, yielding OUT_OF_BOUNDS when the text is not a valid number. */
private static long parseLongOrOutOfBounds(final String text) {
    try {
        return Long.parseLong(text);
    } catch (NumberFormatException e) {
        return OUT_OF_BOUNDS;
    }
}
/** Parses an int, yielding OUT_OF_BOUNDS when the text is not a valid number. */
private static int parseIntOrOutOfBounds(final String text) {
    try {
        return Integer.parseInt(text);
    } catch (NumberFormatException e) {
        return OUT_OF_BOUNDS;
    }
}
/** Parses a yyyy-MM-dd date, yielding the epoch (time 0) when the text cannot be parsed. */
private static Date parseDateOrEpoch(final String text) {
    try {
        return new SimpleDateFormat("yyyy-MM-dd").parse(text);
    } catch (ParseException e) {
        return new Date(0);
    }
}
/**
 * Determines the administrative level implied by a feature class and code.
 *
 * @param fClass the feature class
 * @param fCode the feature code
 * @return the administrative level; see the five-argument overload
 */
private static int getAdminLevel(final FeatureClass fClass, final FeatureCode fCode) {
    return getAdminLevel(fClass, fCode, null, null, null);
}
/**
 * Determines the administrative level implied by a feature class and code.
 * Only feature class A yields a level; everything else is
 * {@link Integer#MAX_VALUE}. Within class A: a missing code is -1, TERR and
 * PRSH are 1, top-level feature codes are 0, ADM codes yield their digit,
 * and any other code is {@link Integer#MAX_VALUE}.
 *
 * @param fClass the feature class
 * @param fCode the feature code
 * @param geoname not used by this method
 * @param altNames not used by this method
 * @param countryName not used by this method
 * @return the administrative level as described above
 */
private static int getAdminLevel(final FeatureClass fClass, final FeatureCode fCode, final String geoname, final List<String> altNames, final String countryName) {
    if (fClass != FeatureClass.A) {
        return Integer.MAX_VALUE;
    }
    if (fCode == null) {
        return -1;
    }
    if (fCode == FeatureCode.TERR || fCode == FeatureCode.PRSH) {
        return 1;
    }
    if (TOP_LEVEL_FEATURES.contains(fCode)) {
        return 0;
    }
    final Matcher admMatcher = ADM_LEVEL_REGEX.matcher(fCode.name());
    return admMatcher.matches() ? Integer.parseInt(admMatcher.group(1)) : Integer.MAX_VALUE;
}
/**
 * For pretty-printing. Produces
 * <code>preferredName (countryName, admin1Code) [pop: population] &lt;geonameID&gt;</code>.
 *
 * @return a human-readable summary of this GeoName
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append(getPreferredName());
    text.append(" (").append(getPrimaryCountryName()).append(", ").append(admin1Code).append(")");
    text.append(" [pop: ").append(population);
    text.append("] <").append(geonameID).append(">");
    return text.toString();
}
/**
 * Builds the ancestry key of this GeoName's parent, i.e. the key made of
 * the administrative divisions above this GeoName's own level.
 *
 * @return the parent's ancestry key, or null when no key could be built;
 *         an empty key means this is a top-level administrative component
 */
@Override
public String getParentAncestryKey() {
    final String parentKey = buildAncestryKey(FeatureCode.ADM4, false);
    if (parentKey.isEmpty()) {
        return null;
    }
    return parentKey;
}
/**
 * Builds the ancestry key identifying this GeoName as an administrative
 * division, including its own code.
 *
 * @return the ancestry key, or null if this GeoName is not a class-A record
 *         with a valid administrative feature code, if its own code for its
 *         level is missing or blank, or if no complete key could be built
 */
@Override
public String getAncestryKey() {
    // only administrative divisions that may act as ancestors own a key
    if (featureClass != FeatureClass.A || !VALID_ADMIN_ANCESTORS.contains(featureCode)) {
        return null;
    }
    // pick the code this GeoName contributes at its own level
    final String ownCode;
    switch (featureCode) {
        case ADM1:
            ownCode = admin1Code;
            break;
        case ADM2:
            ownCode = admin2Code;
            break;
        case ADM3:
            ownCode = admin3Code;
            break;
        case ADM4:
            ownCode = admin4Code;
            break;
        case PCL:
        case PCLD:
        case PCLF:
        case PCLI:
        case PCLIX:
        case PCLS:
        case TERRI:
            ownCode = primaryCountryCode != null ? primaryCountryCode.name() : null;
            break;
        default:
            ownCode = null;
            break;
    }
    if (ownCode == null || ownCode.trim().isEmpty()) {
        return null;
    }
    final String ancestryKey = buildAncestryKey(FeatureCode.ADM4, true).trim();
    return ancestryKey.isEmpty() ? null : ancestryKey;
}
/**
* Indicates whether this GeoName's feature code is one of the top-level
* feature codes (PCL, PCLD, PCLF, PCLI, PCLIX, PCLS or TERRI).
*
* @return <code>true</code> if the feature code is a top-level feature code
*/
@Override
public boolean isTopLevelAdminDivision() {
return TOP_LEVEL_FEATURES.contains(featureCode);
}
/**
* Indicates whether this GeoName is a top-level territory, i.e. its stored
* feature code is TERRI.
*
* @return <code>true</code> if the feature code is TERRI
*/
@Override
public boolean isTopLevelTerritory() {
return featureCode == FeatureCode.TERRI;
}
/**
 * Recursively builds the ancestry key for this GeoName, optionally including the
 * key for this GeoName's administrative division if requested and applicable. The
 * key has the form COUNTRY.ADM1.ADM2.ADM3.ADM4, truncated at the lowest level that
 * applies. Only divisions that have a non-empty code set in this GeoName will be
 * included in the key; a missing (null) code is treated like an empty one.
 * @param level the administrative division at the end of the key (e.g. ADM2 to build
 * the key COUNTRY.ADM1.ADM2); <code>null</code> marks the root and ends the recursion
 * @param includeSelf <code>true</code> to include this GeoName's code in the key
 * @return the generated ancestry key, or an empty string if no complete key can be built
 * @throws IllegalArgumentException if level is not one of PCL, ADM1, ADM2, ADM3, ADM4 or null
 */
private String buildAncestryKey(final FeatureCode level, final boolean includeSelf) {
    // if we have reached the root level, stop
    if (level == null) {
        return "";
    }
    String keyPart;
    FeatureCode nextLevel;
    switch (level) {
        case ADM4:
            keyPart = admin4Code;
            nextLevel = FeatureCode.ADM3;
            break;
        case ADM3:
            keyPart = admin3Code;
            nextLevel = FeatureCode.ADM2;
            break;
        case ADM2:
            keyPart = admin2Code;
            nextLevel = FeatureCode.ADM1;
            break;
        case ADM1:
            // territories will be considered level 1 if they have the same country code as their
            // parent but cannot contain descendants so there should be no keypart for this level;
            // all parishes are considered to be direct descendants of their containing country with
            // no descendants; they should not have a key part at this level
            keyPart = featureCode != FeatureCode.TERR && featureCode != FeatureCode.PRSH ? admin1Code : "";
            nextLevel = FeatureCode.PCL;
            break;
        case PCL:
            keyPart = primaryCountryCode != null && primaryCountryCode != CountryCode.NULL ? primaryCountryCode.name() : "";
            nextLevel = null;
            break;
        default:
            throw new IllegalArgumentException("Level must be one of [PCL, ADM1, ADM2, ADM3, ADM4]");
    }
    // the constructor stores admin codes as given, so they may be null; treat null as blank
    keyPart = keyPart != null ? keyPart.trim() : "";
    // every part except the country code is separated from its ancestors by a dot
    if (nextLevel != null && !keyPart.isEmpty()) {
        keyPart = "." + keyPart;
    }
    int keyLevel = getAdminLevel(FeatureClass.A, level);
    // the admin level depends only on the feature class and code; do not dereference
    // primaryCountryCode here, it may legitimately be null
    int nameLevel = getAdminLevel(featureClass, featureCode);
    String ancestorKey = buildAncestryKey(nextLevel, includeSelf);
    // if the requested key part is a larger administrative division than the level of the
    // geoname or, if we are including the geoname's key part and it is the requested part,
    // include it in the ancestry key (if not blank); otherwise, move to the next level
    boolean appendPart = (nameLevel > keyLevel || (includeSelf && keyLevel == nameLevel)) && !keyPart.isEmpty();
    String qualifiedKey = appendPart ? ancestorKey + keyPart : ancestorKey;
    // if any part of the key is missing once a lower-level component has been specified, we cannot
    // resolve the ancestry path and an empty string should be returned.
    if (qualifiedKey.startsWith(".") || qualifiedKey.contains("..") || qualifiedKey.endsWith(".")) {
        qualifiedKey = "";
    }
    return qualifiedKey;
}
/**
 * Indicates whether the given GeoName is this GeoName itself or can be
 * reached from it by following the chain of parents.
 *
 * @param geoname the potential ancestor; null never matches
 * @return <code>true</code> if geoname equals this GeoName or one of its ancestors
 */
@Override
public boolean isDescendantOf(final GeoName geoname) {
    if (geoname == null) {
        return false;
    }
    // walk up the parent chain until a match is found or the top is passed
    GeoName cursor = this;
    while (cursor != null && !cursor.equals(geoname)) {
        cursor = cursor.getParent();
    }
    return cursor != null;
}
/**
 * Indicates whether the given GeoName descends from this one, as decided by
 * the given GeoName's own {@link GeoName#isDescendantOf(GeoName)}.
 *
 * @param geoname the potential descendant; null never matches
 * @return <code>true</code> if geoname reports itself a descendant of this GeoName
 */
@Override
public boolean isAncestorOf(final GeoName geoname) {
    if (geoname == null) {
        return false;
    }
    return geoname.isDescendantOf(this);
}
/**
* Gets the parent of this GeoName.
*
* @return the parent most recently accepted by setParent(), or null if none has been set
*/
@Override
public GeoName getParent() {
return parent;
}
/**
 * Sets the administrative parent of this GeoName if the candidate is valid.
 * A candidate is rejected (and the current parent left untouched) when it is
 * null, when it is not a class-A record with a valid administrative ancestor
 * feature code, when both ancestry keys are known and this GeoName's parent
 * key does not start with the candidate's key, or when it equals this GeoName.
 * All rejections except null are logged.
 *
 * @param prnt the candidate parent
 * @return <code>true</code> if the parent was set; <code>false</code> if it was rejected
 */
@Override
public boolean setParent(final GeoName prnt) {
    final String expectedKey = this.getParentAncestryKey();
    if (prnt == null) {
        return false;
    }
    final String candidateKey = prnt.getAncestryKey();

    final boolean validParentType = prnt.getFeatureClass() == FeatureClass.A
            && VALID_ADMIN_ANCESTORS.contains(prnt.getFeatureCode());
    if (!validParentType) {
        LOG.error(String.format("Invalid administrative parent type [%s:%s] specified for GeoName [%s]; Parent [%s]",
                prnt.getFeatureClass(), prnt.getFeatureCode(), this, prnt));
        return false;
    }
    // keys are only comparable when both sides could build one
    if (expectedKey != null && candidateKey != null && !expectedKey.startsWith(candidateKey)) {
        LOG.error(String.format("Parent ancestry key [%s] does not match the expected key [%s] for GeoName [%s]; Parent [%s]",
                candidateKey, expectedKey, this, prnt));
        return false;
    }
    if (this.equals(prnt)) {
        LOG.warn("Attempted to set parent to self: {}", prnt);
        return false;
    }
    this.parent = prnt;
    return true;
}
/**
 * Gets the GeoName ID of this GeoName's parent.
 *
 * @return the parent's ID, or null if no parent has been set
 */
@Override
public Integer getParentId() {
    if (parent == null) {
        return null;
    }
    return parent.getGeonameID();
}
/**
 * Indicates whether the ancestry of this GeoName is resolved. A GeoName is
 * considered resolved if it is a top level administrative division (or has no
 * feature code within class A), if it is unresolvable because no parent
 * ancestry key can be built, or if all parents up to a top-level element have
 * been configured.
 *
 * @return <code>true</code> if the ancestry is resolved
 */
@Override
public boolean isAncestryResolved() {
    // the admin level depends only on the feature class and code, so use the two-argument
    // overload; reading primaryCountryCode.name here would throw for a GeoName that was
    // built without a primary country code
    if (getAdminLevel(featureClass, featureCode) <= 0) {
        return true;
    }
    return getParentAncestryKey() == null || (parent != null && parent.isAncestryResolved());
}
/**
* Gets the id of this record in the GeoNames database.
*
* @return the GeoName ID
*/
@Override
public int getGeonameID() {
return geonameID;
}
/**
* Gets the name of this location.
*
* @return the name
*/
@Override
public String getName() {
return name;
}
/**
* Gets the plain ASCII version of this location's name.
*
* @return the ASCII name
*/
@Override
public String getAsciiName() {
return asciiName;
}
/**
* Gets the alternate names of this location.
*
* @return the alternate names; the list is set up by the constructor to be
* unmodifiable and non-null, so it is returned without copying
*/
@Override
public List<String> getAlternateNames() {
return alternateNames;
}
/**
 * Gets the preferred name of this location, falling back to its plain
 * name when no preferred name is known.
 *
 * @return the preferred name if one is set, otherwise the name
 */
@Override
public String getPreferredName() {
    if (preferredName == null) {
        return name;
    }
    return preferredName;
}
/**
* Gets the latitude of this location.
*
* @return the latitude in decimal degrees
*/
@Override
public double getLatitude() {
return latitude;
}
/**
* Gets the longitude of this location.
*
* @return the longitude in decimal degrees
*/
@Override
public double getLongitude() {
return longitude;
}
/**
* Gets the major feature category of this location.
*
* @return the feature class
*/
@Override
public FeatureClass getFeatureClass() {
return featureClass;
}
/**
* Gets the feature code of this location as stored by the constructor,
* which may have replaced a supplied TERR with TERRI.
*
* @return the feature code
*/
@Override
public FeatureCode getFeatureCode() {
return featureCode;
}
/**
* Gets the primary country code of this location.
*
* @return the primary country code
*/
@Override
public CountryCode getPrimaryCountryCode() {
return primaryCountryCode;
}
/**
* Gets the alternate country codes of this location.
*
* @return the alternate country codes; the list is set up by the constructor
* to be unmodifiable and non-null, so it is returned without copying
*/
@Override
public List<CountryCode> getAlternateCountryCodes() {
return alternateCountryCodes;
}
/**
* Gets the code of the first-level administrative division.
*
* @return the admin1 code
*/
@Override
public String getAdmin1Code() {
return admin1Code;
}
/**
* Gets the code of the second-level administrative division.
*
* @return the admin2 code
*/
@Override
public String getAdmin2Code() {
return admin2Code;
}
/**
* Gets the code of the third-level administrative division.
*
* @return the admin3 code
*/
@Override
public String getAdmin3Code() {
return admin3Code;
}
/**
* Gets the code of the fourth-level administrative division.
*
* @return the admin4 code
*/
@Override
public String getAdmin4Code() {
return admin4Code;
}
/**
* Gets the number of inhabitants.
*
* @return the population
*/
@Override
public long getPopulation() {
return population;
}
/**
* Gets the elevation of this location.
*
* @return the elevation in meters
*/
@Override
public int getElevation() {
return elevation;
}
/**
* Gets the digital elevation model value of this location.
*
* @return the digital elevation model value, in meters
*/
@Override
public int getDigitalElevationModel() {
return digitalElevationModel;
}
/**
 * Gets the time zone of this location.
 *
 * @return a copy of the time zone, so callers cannot alter this GeoName's
 *         state; null if no time zone is set
 */
@Override
public TimeZone getTimezone() {
    if (timezone == null) {
        return null;
    }
    // defensive copy
    return (TimeZone) timezone.clone();
}
/**
 * Gets the date this record was last modified in the GeoNames database.
 *
 * @return a copy of the modification date, so callers cannot alter this
 *         GeoName's state; null if no date is set
 */
@Override
public Date getModificationDate() {
    if (modificationDate == null) {
        return null;
    }
    // defensive copy
    return new Date(modificationDate.getTime());
}
/**
* Gets the gazetteer record this GeoName was created with.
*
* @return the gazetteer record passed to the constructor
*/
@Override
public String getGazetteerRecord() {
return gazetteerRecord;
}
/**
 * Gets the gazetteer record of this GeoName followed by the records of
 * its ancestors, one record per line, nearest ancestor first.
 *
 * @return this GeoName's record alone if it has no parent; otherwise the
 *         record, a newline, and the parent's record with ancestry
 */
@Override
public String getGazetteerRecordWithAncestry() {
    if (parent == null) {
        return gazetteerRecord;
    }
    final String parentRecords = parent.getGazetteerRecordWithAncestry();
    return String.format("%s\n%s", gazetteerRecord, parentRecords);
}
/**
 * Computes a hash code from the GeoName ID alone, consistent with
 * {@link #equals(Object)}.
 *
 * @return the hash code
 */
@Override
public int hashCode() {
    final int seed = 3;
    final int multiplier = 83;
    return multiplier * seed + this.geonameID;
}
/**
 * Compares this GeoName with another object. Two BasicGeoName instances
 * are equal when they are of exactly the same class and share the same
 * GeoName ID; no other field is compared.
 *
 * @param obj the object to compare with
 * @return <code>true</code> if obj is a BasicGeoName with the same GeoName ID
 */
@Override
public boolean equals(Object obj) {
    if (obj == null || getClass() != obj.getClass()) {
        return false;
    }
    final BasicGeoName that = (BasicGeoName) obj;
    return this.geonameID == that.geonameID;
}
}