/* Copyright 2013 hbz, Pascal Christoph
* Licensed under the Eclipse Public License 1.0 */
package org.lobid.lodmill;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.RDFLanguages;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Closeables;
import com.google.zxing.WriterException;
import com.hp.hpl.jena.graph.Graph;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.NodeFactory;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.rdf.model.LiteralRequiredException;
import com.hp.hpl.jena.rdf.model.NodeIterator;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.util.ResourceUtils;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
/**
* Lookup in openstreetmap-Web-API to get geo locations. Caches these in
* serialized HashMap. Triplify these geo locations. Lookup in geonames dump and
* triplify matches. Create qr codes and triplify them.
*
* @TODO instead of doing everything (transformation of zdb-isil-file and
* enrichment) in one class it may be better to first transform into
* ntriples using @PipeEncodeTriples and use the output file as the input
* for another flux chain, serializing the new gained ntriples and merge
* the two files in the end. But then, this leads to greater redundancy.
*
* @author Pascal Christoph
*/
@Description("Lookup geo location data in OSM. Decodes triples as string. Predefined values for output are"
+ " 'RDF/XML', 'N-TRIPLE', 'TURTLE' (or 'TTL') and 'N3'. null represents the "
+ "default language, 'RDF/XML'. 'RDF/XML-ABBREV' is a synonym for 'RDF/XML'."
+ "Default output is NTriples.")
@In(StreamReceiver.class)
@Out(String.class)
public class PipeLobidOrganisationEnrichment extends PipeEncodeTriples {
// Classification URI that getOsmApiSearchType() maps to the OSM search term
// "museum".
private static final String HTTP_PURL_ORG_LOBID_LIBTYPE_N86 =
"http://purl.org/lobid/libtype#n86";
// Property whose first resource object getOsmApiSearchType() reads as the
// type of the organisation.
private static final String HTTP_WWW_W3_ORG_NS_ORG_CLASSIFICATION =
"http://www.w3.org/ns/org#classification";
/**
 * The vCard terms this class reads from the model of the current record.
 * Every constant carries the full URI of its term.
 */
private enum VcardNs {
  LOCALITY("http://www.w3.org/2006/vcard/ns#locality"),
  COUNTRY_NAME("http://www.w3.org/2006/vcard/ns#country-name"),
  STREET_ADDRESS("http://www.w3.org/2006/vcard/ns#street-address"),
  POSTAL_CODE("http://www.w3.org/2006/vcard/ns#postal-code"),
  EMAIL("http://www.w3.org/2006/vcard/ns#email"),
  VOICE("http://www.w3.org/2006/vcard/ns#Voice"),
  HOMEPAGE("http://www.w3.org/2006/vcard/ns#url");

  /** Full URI of the term; final because enum constants are shared. */
  final String uri;

  VcardNs(final String uri) {
    this.uri = uri;
  }
}
// Serialization used by endRecord() when writing the model; N-Triples unless
// changed via setSerialization().
private Lang serialization = Lang.NTRIPLES;
// Property from which createQrCodeText() takes the name.
private static final String FOAF_NAME = "http://xmlns.com/foaf/0.1/name";
// Namespace prefix; startEntity() remembers the resource of any entity whose
// name starts with it as the geo position node.
private static final String GEO_WGS84_POS =
"http://www.w3.org/2003/01/geo/wgs84_pos#";
// Literals whose name starts with this URI get their value replaced by a
// geonames link in literal().
private static final String GN_LOCATED_IN =
"http://www.geonames.org/ontology#locatedIn";
private static final String GEO_WGS84_POS_LONG = GEO_WGS84_POS + "long";
private static final String GEO_WGS84_POS_LAT = GEO_WGS84_POS + "lat";
// Name of the file the LAT_LON cache is read from (iniOsmApiLookup()) and
// written to (onCloseStream()).
private static final String LAT_LON_FILENAME = "latlon.ser";
private static final String OSM_LOOKUP_FORMAT_PARAMETER = "format=json";
private static final String OSM_API_BASE_URL =
"http://nominatim.openstreetmap.org/search";
// Two different API parameters are used, for example:
// [0]="http://nominatim.openstreetmap.org/search.php?q=germany+k%C3%B6ln+50679+library&format=json"
// [1]="http://nominatim.openstreetmap.org/search/95643/Tirschenreuth/bahnhofstr.?format=json"
// The array holds only the variable part of each URL; the entries also serve
// as keys of LAT_LON and LAT_LON_LOOKUP_NULL.
private String[] urlOsmLookupSearchParameters = new String[2];
// Set in startEntity(), re-entered in startOsmLookupEnrichment().
private Resource bnodeIDGeoPos;
// Address parts of the current record, as returned (URL-encoded) by
// getFirstLiteralOfProperty(); filled in startOsmLookupEnrichment().
private String countryName;
private String locality;
private String postalcode;
private String street;
// Cache: search parameter string -> { latitude, longitude }.
private static Map<String, Double[]> LAT_LON =
new HashMap<String, Double[]>();
// Search parameters for which the lookup failed. Kept in memory only; this
// set is never written to a file.
private static Set<String> LAT_LON_LOOKUP_NULL = new HashSet<String>();
// Filled by iniGeonamesDump(): regional id from the dump -> geonames id.
private static Map<String, Integer> GEONAMES_REGION_ID =
new HashMap<String, Integer>();
// The two lookup URLs built by makeUrlAndLookupIfCached(), same indices as
// urlOsmLookupSearchParameters.
private URL[] osmUrl = new URL[2];
// Coordinates of the current record; reset to null in startRecord().
private Double lat = null;
private Double lon = null;
private static final int URL_CONNECTION_TIMEOUT = 10000; // 10 secs
// Reader over the most recent API response; consumed by
// parseJsonAndStoreLatLon().
private BufferedReader osmApiLookupResult;
// Set once a new entry was put into LAT_LON; onCloseStream() only rewrites
// the cache file if this is true.
private boolean latLonChanged;
private static final Logger LOG =
LoggerFactory.getLogger(PipeLobidOrganisationEnrichment.class);
private static final QREncoder QRENCODER = new QREncoder();
// Prefix of the file names the QR codes are written to, see setQrFilePath().
private String qrFilePath = "tmp/";
private static final String LV_CONTACTQR =
"http://purl.org/lobid/lv#contactqr";
private static final String RDF_SYNTAX_NS_VALUE =
"http://www.w3.org/1999/02/22-rdf-syntax-ns#value";
// Prefix of the links built by createGeonameLink().
private static final String NS_GEONAMES = "http://sws.geonames.org/";
// Name of the geonames dump, set via setGeonameFilename(). Despite the
// constant-style name this is a mutable instance field.
private String GEONAMES_DE_FILENAME;
// Prefix of the QR code URIs added to the model in
// startQREncodeEnrichment().
private static final String QR_URI_PATH = "http://lobid.org/media/";
private static final String RDF_SYNTAX_NS_TYPE =
"http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
private static final String WGS84_POS_SPATIALTHING =
"http://www.w3.org/2003/01/geo/wgs84_pos#SpatialThing";
// Whether lookupLocation() may query the OSM API when nothing is cached, see
// setDoApiLookup().
boolean doApiLookup = false;
/**
 * Sets the serialization format of the outgoing String.
 *
 * @param serialization a language name known to
 *          {@link RDFLanguages#nameToLang(String)}, e.g. 'RDF/XML',
 *          'N-TRIPLE', 'TURTLE' (or 'TTL') or 'N3'. If the name is null or
 *          unknown, a warning is logged and the current serialization
 *          (N-Triples by default) is kept.
 */
public void setSerialization(final String serialization) {
  final Lang lang = RDFLanguages.nameToLang(serialization);
  if (lang == null) {
    // nameToLang yields null for unknown names; storing that would only
    // fail later when the model is written in endRecord()
    LOG.warn("Unknown serialization '" + serialization + "', keeping "
        + this.serialization);
    return;
  }
  this.serialization = lang;
}
/**
* Sets the name of the geonames dump. iniGeonamesDump() loads the file as a
* resource via the context class loader and splits every line at tabs.
*
* @param filename The resource name of the file
*/
public void setGeonameFilename(final String filename) {
this.GEONAMES_DE_FILENAME = filename;
}
/**
* Set if an online lookup should be made. Default is no. Locations that are
* already in the cache are used regardless of this setting, because
* lookupLocation() consults the cache first.
*
* @param lookup If true, make an online lookup at OSM API.
*/
public void setDoApiLookup(boolean lookup) {
this.doApiLookup = lookup;
}
/**
* Sets the file path to which the QR codes will be written. Default is
* "tmp/" . The path is prepended to the file name as is, so it should end
* with a path separator. onSetReceiver() creates the directory if it does
* not exist.
*
* @param path the path to where the QR codes will be written
*/
public void setQrFilePath(String path) {
qrFilePath = path;
}
/**
 * Clears the coordinates kept in this instance before delegating to the
 * parent class.
 *
 * @param identifier passed on unchanged to the parent class
 */
@Override
public void startRecord(final String identifier) {
  this.lon = null;
  this.lat = null;
  super.startRecord(identifier);
}
/**
 * Passes a literal on to the parent class. Null values are logged and
 * dropped, literals with an empty name are dropped silently. The value of a
 * literal whose name starts with the geonames "locatedIn" property is
 * replaced by the result of {@link #createGeonameLink(String)}.
 */
@Override
public void literal(final String name, final String value) {
  if (value == null) {
    LOG.warn("Value should not be null, ID " + "'" + super.subject + "'");
    return;
  }
  if (name.equals("")) {
    return;
  }
  final String object;
  if (name.startsWith(GN_LOCATED_IN)) {
    object = createGeonameLink(value);
  } else {
    object = value;
  }
  super.literal(name, object);
}
/**
 * Finishes the current record: runs the OSM and QR code enrichments, renames
 * the dummy subject to the real one, serializes the model and hands the
 * resulting string to the receiver. Records whose subject is still the dummy
 * subject are only logged.
 */
@Override
public void endRecord() {
  // compare by value: reference identity only holds as long as the subject
  // is the very same String instance as the constant
  if (PipeEncodeTriples.DUMMY_SUBJECT.equals(super.subject)) {
    LOG.info("Missing ISIL, thus ignoring that record.");
    LOG.debug("Record with missing ISIL:" + model.toString());
    return;
  }
  startOsmLookupEnrichment();
  startQREncodeEnrichment();
  ResourceUtils.renameResource(
      model.getResource(PipeEncodeTriples.DUMMY_SUBJECT), super.subject);
  final StringWriter tripleWriter = new StringWriter();
  RDFDataMgr.write(tripleWriter, model, this.serialization);
  getReceiver().process(tripleWriter.toString());
}
/**
 * Loads the coordinate cache and the geonames dump and makes sure the
 * directory for the QR codes exists. The QR path that is set at the time of
 * this call is the one that gets created.
 */
@Override
protected void onSetReceiver() {
  super.onSetReceiver();
  iniOsmApiLookup();
  iniGeonamesDump();
  final File qrDirectory = new File(qrFilePath);
  // mkdirs() also creates missing parent directories, mkdir() would fail
  // for a nested path such as "media/qr/"
  if (!qrDirectory.exists() && !qrDirectory.mkdirs()) {
    LOG.error("Could not create directory for QR codes: "
        + qrDirectory.getAbsolutePath());
  }
}
/**
 * After the parent class has closed the stream, writes the coordinate cache
 * to its file, but only if the cache is non-empty and got new entries.
 */
@Override
protected void onCloseStream() {
  super.onCloseStream();
  if (LAT_LON.isEmpty() || !latLonChanged) {
    return;
  }
  ObjectOutputStream cacheOut = null;
  try {
    final FileOutputStream cacheFile = new FileOutputStream(LAT_LON_FILENAME);
    cacheOut = new ObjectOutputStream(cacheFile);
    cacheOut.writeObject(LAT_LON);
  } catch (IOException e) {
    LOG.error(e.getMessage(), e);
  } finally {
    Closeables.closeQuietly(cacheOut);
  }
}
/**
 * Delegates to the parent class and, if the entity name starts with the
 * WGS84 namespace, remembers the resource on top of the parent's resource
 * stack as the geo position node.
 */
@Override
public void startEntity(final String name) {
  super.startEntity(name);
  final boolean isGeoPosition = name.startsWith(GEO_WGS84_POS);
  if (isGeoPosition) {
    this.bnodeIDGeoPos = super.resources.peek();
  }
}
/**
 * Takes the first triple that has the given URI as object and returns the
 * lexical form of the first rdf:value attached to the subject of that
 * triple.
 *
 * @param object the URI to look for in object position
 * @return the lexical form, or null if there is no such triple or the
 *         subject has no rdf:value
 */
private String getRdfsvalueOfSubjectHavingObject(final String object) {
  final Node wantedObject = NodeFactory.createURI(object);
  final Graph graph = this.model.getGraph();
  final ExtendedIterator<Triple> havingObject =
      graph.find(Node.ANY, Node.ANY, wantedObject);
  if (!havingObject.hasNext()) {
    return null;
  }
  final Node subject = havingObject.next().getSubject();
  final Node rdfValue = NodeFactory.createURI(RDF_SYNTAX_NS_VALUE);
  final ExtendedIterator<Triple> values =
      graph.find(subject, rdfValue, Node.ANY);
  return values.hasNext() ? values.next().getObject().getLiteralLexicalForm()
      : null;
}
/**
 * Creates a QR code image for the current record and links it to the subject
 * via the "contactqr" property. Nothing happens if postal code, street or
 * locality is missing. Failures are logged and leave the record without a QR
 * code.
 */
private void startQREncodeEnrichment() {
  if (this.postalcode == null || this.street == null
      || this.locality == null) {
    return;
  }
  final String qrCodeText = createQrCodeText();
  try {
    // the last path segment of the subject URI names the image file
    final String isil =
        (new URI(super.subject)).getPath().replaceAll("/.*/", "");
    QRENCODER.createQRImage(qrFilePath + isil, qrCodeText,
        (int) (java.lang.Math.sqrt(qrCodeText.length() * 10) + 20) * 2);
    this.model.add(this.model.createResource(super.subject),
        this.model.createProperty(LV_CONTACTQR),
        this.model.asRDFNode(NodeFactory.createURI(QR_URI_PATH + isil
            + QREncoder.FILE_SUFFIX + "." + QREncoder.FILE_TYPE)));
  } catch (URISyntaxException e) {
    LOG.error("Could not create QR code for " + super.subject, e);
  } catch (WriterException e) {
    LOG.error("Could not create QR code for " + super.subject, e);
  } catch (IOException e) {
    LOG.error("Could not create QR code for " + super.subject, e);
  }
}
/**
 * Builds the MECARD text for the QR code from name and address and, where
 * present, email, telephone and homepage.
 *
 * @return the text to encode
 */
private String createQrCodeText() {
  final StringBuilder mecard = new StringBuilder("MECARD:N:");
  mecard.append(getFirstLiteralOfProperty(FOAF_NAME));
  mecard.append(";").append("ADR:").append(this.street);
  mecard.append(",").append(this.locality);
  mecard.append(",").append(this.postalcode);
  final Resource mailbox = getFirstResourceOfProperty(VcardNs.EMAIL.uri);
  if (mailbox != null) {
    mecard.append(";EMAIL:")
        .append(mailbox.getURI().replaceAll("mailto:", ""));
  }
  final String phone = getRdfsvalueOfSubjectHavingObject(VcardNs.VOICE.uri);
  if (phone != null) {
    mecard.append(";TEL:").append(phone);
  }
  final Resource website = getFirstResourceOfProperty(VcardNs.HOMEPAGE.uri);
  if (website != null) {
    mecard.append(";URL:").append(website);
  }
  return mecard.append(";END;").toString();
}
/**
 * Translates a regional id into a geonames URI using the ids loaded by
 * {@link #iniGeonamesDump()}.
 *
 * @param value the regional id
 * @return the geonames URI, or null (after logging a warning) if the id is
 *         unknown
 */
private String createGeonameLink(final String value) {
  if (GEONAMES_REGION_ID.containsKey(value)) {
    return NS_GEONAMES + GEONAMES_REGION_ID.get(value);
  }
  LOG.warn(String.format(
      "Could not find geoname entry for value '%s' for subject '%s'", value,
      super.subject));
  return null;
}
/**
 * Reads the geonames dump from the classpath and remembers, for every ADM4
 * row with a numeric regional id, the mapping from that regional id (column
 * 13) to the geonames id (column 0). If no file name was set or the resource
 * cannot be found, a warning is logged and no ids are loaded.
 */
private void iniGeonamesDump() {
  if (this.GEONAMES_DE_FILENAME == null) {
    LOG.warn("No geonames file set, geonames links cannot be created.");
    return;
  }
  final java.io.InputStream dump = Thread.currentThread()
      .getContextClassLoader().getResourceAsStream(this.GEONAMES_DE_FILENAME);
  if (dump == null) {
    LOG.warn("Geonames file '" + this.GEONAMES_DE_FILENAME
        + "' not found on the classpath, geonames links cannot be created.");
    return;
  }
  // NOTE(review): UTF-8 is assumed for the dump - confirm for the file in use
  final Scanner geonamesDump = new Scanner(dump, "UTF-8");
  try {
    while (geonamesDump.hasNextLine()) {
      final String[] columns = geonamesDump.nextLine().split("\t");
      // split() drops trailing empty columns, so a line may be too short
      if (columns.length <= 13) {
        continue;
      }
      if (columns[13].matches("\\d+") && columns[7].equals("ADM4")) {
        GEONAMES_REGION_ID.put(columns[13], Integer.parseInt(columns[0]));
      }
    }
  } finally {
    geonamesDump.close();
  }
}
/**
 * Sets the HTTP user agent used for the OSM requests and loads the
 * coordinate cache from its file. If the file cannot be read, the cache in
 * memory stays as it is.
 */
private static void iniOsmApiLookup() {
  // see https://wiki.openstreetmap.org/wiki/DE:Nominatim#Nutzungsbedingungen
  System.setProperty("http.agent",
      "java.net.URLConnection, email=<semweb@hbz-nrw.de>");
  FileInputStream fis = null;
  ObjectInputStream ois = null;
  try {
    fis = new FileInputStream(LAT_LON_FILENAME);
    ois = new ObjectInputStream(fis);
    // the file is only ever written by onCloseStream(), which stores LAT_LON
    @SuppressWarnings("unchecked")
    final Map<String, Double[]> cached =
        (Map<String, Double[]>) ois.readObject();
    if (cached != null) {
      LAT_LON = cached;
    }
    LOG.info("Number of cached URLs in file " + LAT_LON_FILENAME + ":"
        + LAT_LON.size());
  } catch (IOException e) {
    // usually the file simply does not exist yet
    LOG.info("Could not read {} ({}), will create a new one if necessary.",
        LAT_LON_FILENAME, e.getMessage());
  } catch (ClassNotFoundException e) {
    LOG.error(e.getMessage(), e);
  } finally {
    Closeables.closeQuietly(ois);
    Closeables.closeQuietly(fis);
  }
}
/**
 * Adds latitude, longitude and the SpatialThing type to the geo position
 * node of the current record. The address parts are read from the model;
 * unless the model already holds parsable coordinates, the location is
 * looked up via {@link #lookupLocation()}. Nothing is added if no coordinates
 * were found.
 */
private void startOsmLookupEnrichment() {
  // activate the geo position bnode
  enterBnode(this.bnodeIDGeoPos);
  // forget URLs, search parameters and locality of the previous record
  for (int i = 0; i < 2; i++) {
    osmUrl[i] = null;
    urlOsmLookupSearchParameters[i] = null;
  }
  this.locality = null;
  final String firstLiteralOfProperty =
      getFirstLiteralOfProperty(VcardNs.LOCALITY.uri);
  if (firstLiteralOfProperty != null) {
    // OSM Api doesn't like e.g /Marburg%2FLahn/ but accepts /Marburg/.
    // Having also the postcode we will not encounter ambigous cities
    try {
      this.locality =
          URIUtil.encodeQuery((URIUtil.decode(firstLiteralOfProperty, "UTF-8")
              .replaceAll("(.*)\\p{Punct}.*", "$1")), "UTF-8");
    } catch (URIException e1) {
      this.locality = firstLiteralOfProperty;
      LOG.warn("Could not simplify locality of " + super.subject, e1);
    }
  }
  this.postalcode = getFirstLiteralOfProperty(VcardNs.POSTAL_CODE.uri);
  this.street = getFirstLiteralOfProperty(VcardNs.STREET_ADDRESS.uri);
  if (!doubles()) {
    this.countryName = getFirstLiteralOfProperty(VcardNs.COUNTRY_NAME.uri);
    if (makeOsmApiSearchParameters()) {
      lookupLocation(); // TODO check whats happening if geo data already in
      // source file
    }
  }
  if (this.lat != null && this.lon != null) {
    super.literal(GEO_WGS84_POS_LAT, String.valueOf(this.lat));
    super.literal(GEO_WGS84_POS_LONG, String.valueOf(this.lon));
    super.literal(RDF_SYNTAX_NS_TYPE, WGS84_POS_SPATIALTHING);
  }
}
/**
 * Tells whether the model already holds a latitude and a longitude that can
 * both be parsed as doubles.
 *
 * @return true if both values exist and are parsable, otherwise false
 */
private boolean doubles() {
  boolean bothParsable;
  try {
    final String latitude = getFirstLiteralOfProperty(GEO_WGS84_POS_LAT);
    Double.valueOf(latitude);
    final String longitude = getFirstLiteralOfProperty(GEO_WGS84_POS_LONG);
    Double.valueOf(longitude);
    bothParsable = true;
  } catch (Exception e) {
    // a missing value (null) ends up here as well
    bothParsable = false;
  }
  return bothParsable;
}
/**
 * Builds the two search parameter strings from the address parts of the
 * current record. Entry [0] is only set if a search type is known, entry [1]
 * only if a street is known.
 *
 * @return true if entry [1] was built; false if the street or one of
 *         country, locality and postal code is missing
 */
private boolean makeOsmApiSearchParameters() {
  final boolean incomplete = this.countryName == null
      || this.locality == null || this.postalcode == null;
  if (incomplete) {
    LOG.warn("One or more parameter needing by the OSM API is missing for "
        + super.subject + " : country=" + this.countryName + ",locality="
        + this.locality + ",postcode=" + this.postalcode);
    return false;
  }
  final String searchType = getOsmApiSearchType();
  if (searchType != null) {
    this.urlOsmLookupSearchParameters[0] = String
        .format(searchType + "+%s+%s", this.postalcode, this.locality);
  }
  if (this.street == null) {
    return false;
  }
  this.urlOsmLookupSearchParameters[1] = String.format("%s/%s/%s/%s",
      this.countryName, this.locality, this.postalcode, this.street);
  return true;
}
/**
 * Derives the OSM search term from the classification of the current
 * record: numbers below 85 after "#n" mean "library", the n86 type means
 * "museum".
 *
 * @return "library", "museum", or null if there is no classification, it
 *         maps to neither, or its number cannot be parsed (logged as a
 *         warning)
 */
private String getOsmApiSearchType() {
  final Resource classification =
      getFirstResourceOfProperty(HTTP_WWW_W3_ORG_NS_ORG_CLASSIFICATION);
  if (classification == null) {
    return null;
  }
  final String type = classification.toString();
  try {
    if (Integer.parseInt(type.replaceAll(".*#n", "")) < 85) {
      return "library";
    }
  } catch (NumberFormatException e) {
    // an unexpected classification must not abort the whole record
    LOG.warn("Cannot derive an OSM search type from classification '" + type
        + "' of " + super.subject);
    return null;
  }
  return type.equals(HTTP_PURL_ORG_LOBID_LIBTYPE_N86) ? "museum" : null;
}
/**
 * Builds both lookup URLs from the current search parameters and consults
 * the caches. On a hit in the coordinate cache, latitude and longitude are
 * set; if both search parameters are known to have failed, a warning is
 * logged.
 *
 * @return true if cached (successfully or as a known failure), otherwise
 *         false
 */
private boolean makeUrlAndLookupIfCached() {
  final String typeQuery = this.urlOsmLookupSearchParameters[0];
  final String addressQuery = this.urlOsmLookupSearchParameters[1];
  try {
    osmUrl[0] = new URL(OSM_API_BASE_URL + ".php?q=" + typeQuery + "&"
        + OSM_LOOKUP_FORMAT_PARAMETER);
    osmUrl[1] = new URL(OSM_API_BASE_URL + "/" + addressQuery + "?"
        + OSM_LOOKUP_FORMAT_PARAMETER);
  } catch (MalformedURLException e) {
    LOG.error(super.subject + " " + e.getMessage(), e);
  }
  boolean cached = false;
  // if both parameters are cached, the second one wins
  for (final String query : this.urlOsmLookupSearchParameters) {
    if (LAT_LON.containsKey(query)) {
      final Double[] latLon = LAT_LON.get(query);
      this.lat = latLon[0];
      this.lon = latLon[1];
      cached = true;
    }
  }
  final boolean knownFailure = LAT_LON_LOOKUP_NULL.contains(typeQuery)
      && LAT_LON_LOOKUP_NULL.contains(addressQuery);
  if (knownFailure) {
    LOG.warn("Could not generate geo location for " + super.subject
        + ". The URL is:" + this.osmUrl[1]);
    cached = true; // do not store anything
  }
  return cached;
}
/**
 * Index into osmUrl of the URL requested last by requestOsmUrl(int); -1 as
 * long as nothing was requested.
 */
private int requestedOsmUrlIndex = -1;

/**
 * Looks up the location of the current record. Does nothing if the caches
 * already answer the question or if API lookups are switched off. Otherwise
 * the type search URL is tried first (only if a search type is known), then
 * the address URL, then the address URL with an ever more abstract street
 * name. If all of that fails, the search parameters are remembered as
 * failed for the rest of the run.
 */
private void lookupLocation() {
  if (makeUrlAndLookupIfCached() || !this.doApiLookup) {
    return;
  }
  // remember the parameters as they are before the street gets sanitized
  final String typeQuery = this.urlOsmLookupSearchParameters[0];
  final String addressQuery = this.urlOsmLookupSearchParameters[1];
  // without a search type the first URL would only ask for "q=null"
  if (typeQuery != null && requestAndStoreLatLon(0)) {
    return;
  }
  if (requestAndStoreLatLon(1)) {
    return;
  }
  // TODO don't use exceptions as control structures
  // NOTE(review): a sanitizing step that leaves the street unchanged returns
  // without an exception, so the next step is not tried - confirm intent
  try {
    // "Albertus-Magnus-Pl. 23 (Zimmer 2)" => "Albertus-Magnus-Pl. 23"
    sanitizeStreetnameAndRetrieveOsmApiResultAndStoreLatLon(
        "(.*?\\d+){1}?.*");
    return;
  } catch (Exception e1) {
    LOG.debug("Lookup without address supplement failed for " + super.subject
        + ": " + e1);
  }
  try {
    // "Albertus-Magnus-Pl. 23 (Zimmer 2)" => "Albertus-Magnus-Pl."
    sanitizeStreetnameAndRetrieveOsmApiResultAndStoreLatLon("(.*?){1}\\ .*");
    return;
  } catch (Exception e2) {
    LOG.debug("Lookup without house number failed for " + super.subject
        + ": " + e2);
  }
  // failed definitely
  LOG.warn("Failed to generate geo location for " + super.subject
      + ". The URL is:" + this.osmUrl[1]);
  // makeUrlAndLookupIfCached() only reports a known failure if BOTH of its
  // parameters are in the set, so the type query has to be stored as well
  LAT_LON_LOOKUP_NULL.add(typeQuery);
  LAT_LON_LOOKUP_NULL.add(addressQuery);
  LAT_LON_LOOKUP_NULL.add(this.urlOsmLookupSearchParameters[1]);
}

/**
 * Requests the URL with the given index and parses the response.
 *
 * @param index index into osmUrl
 * @return true if coordinates were stored, false if anything went wrong
 */
private boolean requestAndStoreLatLon(final int index) {
  try {
    requestOsmUrl(index);
    parseJsonAndStoreLatLon();
    return true;
  } catch (Exception e) {
    LOG.debug("No usable result from " + this.osmUrl[index] + ": " + e);
    return false;
  }
}

/**
 * Opens the URL with the given index, after discarding any response that is
 * still around, and remembers which URL was requested.
 *
 * @param index index into osmUrl
 * @throws IOException if there is no such URL or it cannot be opened
 */
private void requestOsmUrl(final int index) throws IOException {
  Closeables.closeQuietly(this.osmApiLookupResult);
  this.osmApiLookupResult = null;
  this.requestedOsmUrlIndex = index;
  final URL url = this.osmUrl[index];
  if (url == null) {
    throw new IOException("No OSM URL available for " + super.subject);
  }
  this.osmApiLookupResult = getUrlContent(url);
}

/**
 * Applies the regex to the (decoded) street name, keeping group 1, and
 * repeats the address lookup if that changed the street. Note that the
 * sanitized street replaces the street of the current record.
 *
 * @param regex the pattern whose first group is the street name to keep
 * @throws Exception if the response to the new request has no usable result
 */
private void sanitizeStreetnameAndRetrieveOsmApiResultAndStoreLatLon(
    String regex) throws Exception {
  String tmp = "";
  try {
    tmp = URIUtil.encodeQuery(
        (URIUtil.decode(this.street, "UTF-8").replaceAll(regex, "$1")),
        "UTF-8");
  } catch (URIException e2) {
    LOG.warn("Could not sanitize street of " + super.subject, e2);
  }
  // make new request only if strings differ
  if (!tmp.equals(this.street)) {
    this.street = tmp;
    try {
      if (makeOsmApiSearchParameters()) {
        if (!makeUrlAndLookupIfCached()) {
          requestOsmUrl(1);
          parseJsonAndStoreLatLon();
        }
      }
    } catch (IOException e1) {
      LOG.error(super.subject + " " + e1.getLocalizedMessage());
    }
  }
}

/**
 * Reads the pending API response, closes it, and stores the coordinates of
 * the first result in this instance and in the cache (under the address
 * search parameter).
 *
 * @throws Exception if there is no pending response, it cannot be read, it
 *           holds no result, or a library search gave more than one result
 */
private void parseJsonAndStoreLatLon() throws Exception {
  final BufferedReader response = this.osmApiLookupResult;
  // a response can be consumed only once
  this.osmApiLookupResult = null;
  if (response == null) {
    throw new IllegalStateException(
        "No OSM API response to parse for " + super.subject);
  }
  final StringBuilder builder = new StringBuilder();
  try {
    String aux;
    while ((aux = response.readLine()) != null) {
      builder.append(aux);
    }
  } finally {
    Closeables.closeQuietly(response);
  }
  final Object obj = JSONValue.parse(builder.toString());
  if (!(obj instanceof JSONArray) || ((JSONArray) obj).isEmpty()) {
    throw new Exception("No result in OSM API response for " + super.subject);
  }
  final JSONArray osm = (JSONArray) obj;
  final URL requestedUrl = this.requestedOsmUrlIndex < 0 ? null
      : this.osmUrl[this.requestedOsmUrlIndex];
  // ignore library search results if result > 1
  // this may lead to wrong results, though, e. g. if there are two libraries
  // of which just one is tagged as library. See
  // http://lobid.org/organisation/DE-Tir1 (where the geo location is in fact
  // of http://lobid.org/organisation/DE-1445 )
  if (osm.size() > 1 && this.requestedOsmUrlIndex == 0
      && requestedUrl != null && requestedUrl.toString().contains("library")) {
    LOG.info("More than 1 result for " + super.subject + ", search "
        + requestedUrl);
    throw new Exception("More than 1 result for " + super.subject);
  }
  final JSONObject jo = (JSONObject) osm.get(0);
  // parse both values first so that a failure leaves no half-set state
  final Double latitude = Double.valueOf(jo.get("lat").toString());
  final Double longitude = Double.valueOf(jo.get("lon").toString());
  this.lat = latitude;
  this.lon = longitude;
  LAT_LON.put(this.urlOsmLookupSearchParameters[1],
      new Double[] { latitude, longitude });
  this.latLonChanged = true;
}
/**
 * Opens the given URL and returns a reader over the response. The caller has
 * to close the reader.
 *
 * @param url the URL to request
 * @return a reader decoding the response as UTF-8
 * @throws IOException if the connection cannot be established or times out
 */
private static BufferedReader getUrlContent(final URL url)
    throws IOException {
  final URLConnection urlConnection = url.openConnection();
  urlConnection.setConnectTimeout(URL_CONNECTION_TIMEOUT);
  // without a read timeout an unresponsive server would block forever
  urlConnection.setReadTimeout(URL_CONNECTION_TIMEOUT);
  LOG.debug("Lookup url:" + url);
  // NOTE(review): assumes the API answers in UTF-8 - confirm; the platform
  // default charset used so far depends on the machine
  return new BufferedReader(
      new InputStreamReader(urlConnection.getInputStream(), "UTF-8"));
}
/**
 * Returns the first object of the given property in the model as a
 * URL-encoded string.
 *
 * @param ns the URI of the property
 * @return the encoded lexical form, or null if the property has no object,
 *         the object is not a literal, or encoding fails (the latter two are
 *         logged)
 */
private String getFirstLiteralOfProperty(String ns) {
  final NodeIterator objects =
      this.model.listObjectsOfProperty(this.model.getProperty(ns));
  if (!objects.hasNext()) {
    return null;
  }
  String encoded = null;
  try {
    final String lexicalForm = objects.next().asLiteral().getLexicalForm();
    encoded = URIUtil.encodeQuery(lexicalForm, "UTF-8");
  } catch (URIException e) {
    LOG.error(super.subject + " " + e.getMessage(), e);
  } catch (LiteralRequiredException le) {
    LOG.warn(le.getMessage(), le);
  }
  return encoded;
}
/**
 * Returns the first object of the given property in the model as a
 * resource.
 *
 * @param ns the URI of the property
 * @return the resource, or null if the property has no object or the object
 *         cannot be viewed as a resource (logged as a warning)
 */
private Resource getFirstResourceOfProperty(String ns) {
  final NodeIterator it =
      this.model.listObjectsOfProperty(this.model.getProperty(ns));
  Resource res = null;
  try {
    if (it.hasNext()) {
      res = it.next().asResource();
    }
  } catch (Exception e) {
    // pass the exception itself: a plain argument without a {} placeholder
    // would be dropped by the logger
    LOG.warn("Exception with subject " + super.subject + ", property " + ns,
        e);
  }
  return res;
}
}