package nl.ipo.cds.etl.process; import static nl.ipo.cds.etl.process.MetadataHarvester.FeatureCollectionType.GML; import static nl.ipo.cds.etl.process.MetadataHarvester.FeatureCollectionType.WFS; import static nl.ipo.cds.etl.process.MetadataHarvester.FeatureCollectionType.XML; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.sql.Timestamp; import java.util.HashMap; import java.util.Map; import java.util.Properties; import javax.xml.namespace.QName; import nl.ipo.cds.etl.process.helpers.HttpGetUtil; import nl.ipo.cds.utils.AxiomUtils; import nl.ipo.cds.utils.DateTimeUtils; import org.apache.axiom.om.OMElement; import org.apache.axiom.om.xpath.AXIOMXPath; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.deegree.commons.utils.Pair; import org.jaxen.JaxenException; public class MetadataHarvester { public static enum FeatureCollectionType { WFS, GML, XML } public static class FeatureCollectionReference { public final FeatureCollectionType type; public final String url; public final String featureTypeName; //TODO uitbreiden met xsd (W3C:XSD) URL uit metadata doc public final String xsdUrl; public FeatureCollectionReference (final FeatureCollectionType type, final String url, final String featureTypeName, final String xsdUrl) { assert type != null; assert url != null; this.type = type; this.url = url; this.featureTypeName = featureTypeName; this.xsdUrl = xsdUrl; } } private static final Log technicalLog = LogFactory.getLog (MetadataHarvester.class); // developer log private final static String xpathCreationDate = "//*[local-name()='MD_Metadata']/*[local-name()='identificationInfo']/*[local-name()='MD_DataIdentification']/*[local-name()='citation']/*[local-name()='CI_Citation']/*[local-name()='date']/*[local-name()='CI_Date'][*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='creation']/*[local-name()='date']/*[local-name()='Date']"; private final static String xpathRevisionDate = "//*[local-name()='MD_Metadata']/*[local-name()='identificationInfo']/*[local-name()='MD_DataIdentification']/*[local-name()='citation']/*[local-name()='CI_Citation']/*[local-name()='date']/*[local-name()='CI_Date'][*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='revision']/*[local-name()='date']/*[local-name()='Date']"; private final static String xpathCreationDateOrRevisionDate = "//*[local-name()='MD_Metadata']/*[local-name()='identificationInfo']/*[local-name()='MD_DataIdentification']/*[local-name()='citation']/*[local-name()='CI_Citation']/*[local-name()='date']/*[local-name()='CI_Date'][(*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='creation') or (*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='revision')]/*[local-name()='date']/*[local-name()='Date']"; private final static String xpathCreationDateTime = "//*[local-name()='MD_Metadata']/*[local-name()='identificationInfo']/*[local-name()='MD_DataIdentification']/*[local-name()='citation']/*[local-name()='CI_Citation']/*[local-name()='date']/*[local-name()='CI_Date'][*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='creation']/*[local-name()='date']/*[local-name()='DateTime']"; private final static String xpathRevisionDateTime = "//*[local-name()='MD_Metadata']/*[local-name()='identificationInfo']/*[local-name()='MD_DataIdentification']/*[local-name()='citation']/*[local-name()='CI_Citation']/*[local-name()='date']/*[local-name()='CI_Date'][*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='revision']/*[local-name()='date']/*[local-name()='DateTime']"; private final static String xpathCreationDateTimeOrRevisionDateTime = "//*[local-name()='MD_Metadata']/*[local-name()='identificationInfo']/*[local-name()='MD_DataIdentification']/*[local-name()='citation']/*[local-name()='CI_Citation']/*[local-name()='date']/*[local-name()='CI_Date'][(*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='creation') or (*[local-name()='dateType']/*[local-name()='CI_DateTypeCode']/@codeListValue='revision')]/*[local-name()='date']/*[local-name()='DateTime']"; private final static String xpathWfs = "//*[local-name()='MD_Metadata']/*[local-name()='distributionInfo']/*[local-name()='MD_Distribution']/*[local-name()='transferOptions']/*[local-name()='MD_DigitalTransferOptions']/*[local-name()='onLine']/*[local-name()='CI_OnlineResource'][*[local-name()='protocol']/*[local-name()='CharacterString']='OGC:WFS']/*[local-name()='linkage']/*[local-name()='URL']"; private final static String xpathGmlFeatureCollection = "//*[local-name()='MD_Metadata']/*[local-name()='distributionInfo']/*[local-name()='MD_Distribution']/*[local-name()='transferOptions']/*[local-name()='MD_DigitalTransferOptions']/*[local-name()='onLine']/*[local-name()='CI_OnlineResource'][*[local-name()='protocol']/*[local-name()='CharacterString']='OGC:GML']/*[local-name()='linkage']/*[local-name()='URL']"; private final static String xpathXmlDataset = "//*[local-name()='MD_Metadata']/*[local-name()='distributionInfo']/*[local-name()='MD_Distribution']/*[local-name()='transferOptions']/*[local-name()='MD_DigitalTransferOptions']/*[local-name()='onLine']/*[local-name()='CI_OnlineResource'][*[local-name()='protocol']/*[local-name()='CharacterString']='W3C:XML']/*[local-name()='linkage']/*[local-name()='URL']"; private final static String xpathXsdUrl = "//*[local-name()='MD_Metadata']/*[local-name()='distributionInfo']/*[local-name()='MD_Distribution']/*[local-name()='transferOptions']/*[local-name()='MD_DigitalTransferOptions']/*[local-name()='onLine']/*[local-name()='CI_OnlineResource'][*[local-name()='protocol']/*[local-name()='CharacterString']='W3C:XSD']/*[local-name()='linkage']/*[local-name()='URL']"; private final static String xpathFeatureType = "//*[local-name()='MD_Metadata']/*[local-name()='distributionInfo']/*[local-name()='MD_Distribution']/*[local-name()='transferOptions']/*[local-name()='MD_DigitalTransferOptions']/*[local-name()='onLine']/*[local-name()='CI_OnlineResource'][*[local-name()='protocol']/*[local-name()='CharacterString']='OGC:WFS']/*[local-name()='name']/*[local-name()='CharacterString']"; private final static String xpathGmlFeatureType = "//*[local-name()='MD_Metadata']/*[local-name()='distributionInfo']/*[local-name()='MD_Distribution']/*[local-name()='transferOptions']/*[local-name()='MD_DigitalTransferOptions']/*[local-name()='onLine']/*[local-name()='CI_OnlineResource'][*[local-name()='protocol']/*[local-name()='CharacterString']='OGC:GML']/*[local-name()='name']/*[local-name()='CharacterString']"; private final static String xpathXmlFeatureType = "//*[local-name()='MD_Metadata']/*[local-name()='distributionInfo']/*[local-name()='MD_Distribution']/*[local-name()='transferOptions']/*[local-name()='MD_DigitalTransferOptions']/*[local-name()='onLine']/*[local-name()='CI_OnlineResource'][*[local-name()='protocol']/*[local-name()='CharacterString']='W3C:XML']/*[local-name()='name']/*[local-name()='CharacterString']"; private final static String xpathGetFeature = "//*[local-name()='OperationsMetadata']/*[local-name()='Operation'][@name='GetFeature']/*[local-name()='DCP']/*[local-name()='HTTP']/*[local-name()='Get']"; private static final String xpathGetRecordByIdResponse = "//*[local-name()='GetRecordByIdResponse']/*[local-name()='MD_Metadata']"; private static final String xpathValidMetadataDocument = "//*[local-name()='MD_Metadata']"; private final static Map<FeatureCollectionType, Pair<String, String>> featureCollectionReferenceMap = new HashMap<FeatureCollectionType, Pair<String,String>> (); static { featureCollectionReferenceMap.put (WFS, new Pair<String, String> (xpathWfs, xpathFeatureType)); featureCollectionReferenceMap.put (GML, new Pair<String, String> (xpathGmlFeatureCollection, xpathGmlFeatureType)); featureCollectionReferenceMap.put (XML, new Pair<String, String> (xpathXmlDataset, xpathXmlFeatureType)); } private final static String metaDataDateTimeFormat = "yyyy-MM-dd'T'HH:mm:ss.SSS"; private final static String metaDataDateFormat = "yyyy-MM-dd"; private final static Properties namespaces = new Properties (); static { namespaces.setProperty("csw", "http://www.opengis.net/cat/csw/2.0.2"); namespaces.setProperty("gmd", "http://www.isotc211.org/2005/gmd"); namespaces.setProperty("gco", "http://www.isotc211.org/2005/gco"); namespaces.setProperty("ows", "http://www.opengis.net/ows"); } private final String pgrBaseUrl; public MetadataHarvester (final String pgrBaseUrl) { this.pgrBaseUrl = pgrBaseUrl; } public DatasetMetadata parseMetadata (final String uuid) throws HarvesterException { if (uuid != null && (uuid.startsWith ("http://")||uuid.startsWith ("https://"))) { if (uuid.contains (";")) { return parseStaticGmlMetadata (uuid); } else { final HttpGetUtil metadataGetUtil = new HttpGetUtil (uuid); return parseMetadataFromUrl (uuid, metadataGetUtil); } } else { final String url = getMetadataUrl (uuid); technicalLog.debug("metadata URL to PGR: " + url); final HttpGetUtil pgrHttpGetUtil = new HttpGetUtil (url); return parseMetadataFromPgr (uuid, url, pgrHttpGetUtil); } } protected GmlMetadata parseStaticGmlMetadata (final String uuid) throws HarvesterException { final String[] parts = uuid.split (";"); if (parts.length != 3) { throw new HarvesterException (HarvesterMessageKey.METADATA_INVALID_IDENTIFIER, uuid); } return new GmlMetadata (parts[2], parts[0], parts[1]); } protected PgrMetadata parseMetadataFromUrl (final String url, final HttpGetUtil httpGetUtil) throws HarvesterException { try { final OMElement rootElement; if (!httpGetUtil.isValidResponse ()) { throw new HarvesterException (HarvesterMessageKey.PGR_HTTP_ERROR, url, String.valueOf (httpGetUtil.getStatusCode ())); } if ((rootElement = httpGetUtil.getEntityOMElement ()) == null) { throw new RuntimeException ("Metadata root not found"); } technicalLog.debug (" - metadata root: " + url); testValidMetadataDocument (url, rootElement); final Timestamp metadataDate = getMetadataDate (rootElement, url); final FeatureCollectionReference featureCollectionReference = getMetadataFeatureCollectionReference (rootElement, url); if(metadataDate == null || featureCollectionReference == null) { return null; } return new PgrMetadata (url, metadataDate, featureCollectionReference); } catch (HarvesterException e) { throw e; } catch (Exception e) { throw new HarvesterException (e, HarvesterMessageKey.PGR_EXCEPTION, url, getExceptionMessage (e)); } finally { httpGetUtil.close (); } } protected PgrMetadata parseMetadataFromPgr (final String uuid, final String url, final HttpGetUtil pgrHttpGetUtil) throws HarvesterException { try { final OMElement rootPGR; try { if(!pgrHttpGetUtil.isValidResponse()) { throw new HarvesterException (HarvesterMessageKey.PGR_HTTP_ERROR, url, String.valueOf (pgrHttpGetUtil.getStatusCode ())); } rootPGR = pgrHttpGetUtil.getEntityOMElement(); if (rootPGR == null){ // We should not get here throw new RuntimeException ("Root of PGR not found"); } else{ technicalLog.debug(" - root PGR: " + url); // Check if correct GetRecordByIdResponse testSuccessfulGetRecordByIdResponse (url, rootPGR); } } catch (HarvesterException e) { throw e; } catch (Exception e) { throw new HarvesterException (e, HarvesterMessageKey.PGR_EXCEPTION, url, getExceptionMessage (e)); } final Timestamp metadataDate = getMetadataDate(rootPGR, url); final FeatureCollectionReference featureCollectionReference = getMetadataFeatureCollectionReference (rootPGR, url); if(metadataDate == null || featureCollectionReference == null){ return null; } return new PgrMetadata (url, metadataDate, featureCollectionReference); } finally { pgrHttpGetUtil.close(); } } private String getExceptionMessage (final Exception exception) { if (exception.getLocalizedMessage () != null) { return exception.getLocalizedMessage (); } else { return exception.toString (); } } public String getMetadataUrl (final String uuid) { if (uuid.startsWith ("http://")) { if (uuid.contains (";")) { return null; } else { return uuid; } } // get metadata from PGR try { return pgrBaseUrl + java.net.URLEncoder.encode(uuid, "UTF-8"); } catch (UnsupportedEncodingException e1) { technicalLog.warn ("Unable to URLencode uuid: " + uuid); return pgrBaseUrl + uuid; } } private void testValidMetadataDocument (final String url, final OMElement rootElement) throws HarvesterException { testXPath (url, rootElement, xpathValidMetadataDocument); } private void testSuccessfulGetRecordByIdResponse (final String url, final OMElement rootPGR) throws HarvesterException { testXPath (url, rootPGR, xpathGetRecordByIdResponse); } private void testXPath (final String url, final OMElement rootElement, final String xpath) throws HarvesterException { final String queryXPathString = xpath; AXIOMXPath queryXPath; OMElement queryElement = null; try { queryXPath = new AXIOMXPath(queryXPathString); queryXPath.addNamespace("csw", "http://www.opengis.net/cat/csw/2.0.2"); queryXPath.addNamespace("gmd", "http://www.isotc211.org/2005/gmd"); queryXPath.addNamespace("gco", "http://www.isotc211.org/2005/gco"); queryXPath.addNamespace("ows", "http://www.opengis.net/ows"); queryElement = (OMElement)queryXPath.selectSingleNode(rootElement); if(queryElement == null){ throw new HarvesterException (HarvesterMessageKey.METADATA_NOT_FOUND, url); } } catch (JaxenException e) { throw new HarvesterException (e, HarvesterMessageKey.METADATA_NOT_FOUND_ERROR, url, getExceptionMessage (e)); } } /** * Get the date from metadata.<br> * It handles cases:<br> * <code><gco:DateTime>2012-05-15T00:00:00.000</gco:DateTime></code><br> * and<br> * <code><gco:Date>2009-03-06</gco:Date></code> * @param rootPGR the metadata * @param url used to retrieve the date from * @return Timestamp containing the date */ private Timestamp getMetadataDate (final OMElement rootPGR, final String url) throws HarvesterException { // get metadata date String xpathRevision = ""; String xpathCreation = ""; OMElement dateElement = null; try { xpathRevision = xpathRevisionDateTime; dateElement = AxiomUtils.getOMElementWithXPath(rootPGR, xpathRevision, namespaces); if (dateElement == null) { xpathRevision = xpathRevisionDate; dateElement = AxiomUtils.getOMElementWithXPath(rootPGR, xpathRevision, namespaces); if (dateElement == null) { xpathCreation = xpathCreationDateTime; dateElement = AxiomUtils.getOMElementWithXPath(rootPGR, xpathCreation, namespaces); if (dateElement == null) { xpathCreation = xpathCreationDate; dateElement = AxiomUtils.getOMElementWithXPath(rootPGR, xpathCreation, namespaces); if (dateElement == null) { throw new HarvesterException( HarvesterMessageKey.METADATA_DATE, url, xpathCreationDateOrRevisionDate, ""); } else { technicalLog.debug(" - creation date: " + dateElement.getText()); } } } else { technicalLog.debug(" - revision date: " + dateElement.getText()); } } } catch (Exception e) { throw new HarvesterException(e, HarvesterMessageKey.METADATA_DATE, url, xpathCreationDateTimeOrRevisionDateTime, getExceptionMessage(e)); } return parseMetadataDate(dateElement, xpathCreation.isEmpty() ? xpathRevision : xpathCreation, url); } /** * Check the date in dateElement against a certain pattern * @param dateElement element containing the date string * @param xpathRevisionDateTime * @param url the date is retrieved from * @return Timestamp containing the date. * @throws HarvesterException */ private Timestamp parseMetadataDate (final OMElement dateElement, final String xpathDate, final String url) throws HarvesterException { // parse date string into timestamp object Timestamp metadataUpdateDatum = null; try { metadataUpdateDatum = new Timestamp(DateTimeUtils.parseDate(dateElement.getText(), metaDataDateTimeFormat)); } catch (Exception e1) { try { metadataUpdateDatum = new Timestamp(DateTimeUtils.parseDate(dateElement.getText(), metaDataDateFormat)); } catch (Exception e2) { throw new HarvesterException (e2, HarvesterMessageKey.METADATA_DATEFORMAT, url, xpathDate, getExceptionMessage (e2), metaDataDateFormat); } } return metadataUpdateDatum; } /** * Get a Wfs url form metadata. * @param rootPGR the metadata * @param url used to retrieve WFS url from * @return String containing a Wfs Url * @throws HarvesterException */ private FeatureCollectionReference getMetadataFeatureCollectionReference (final OMElement rootElement, final String url) throws HarvesterException { for (final Map.Entry<FeatureCollectionType, Pair<String, String>> entry: featureCollectionReferenceMap.entrySet ()) { final String xpathFeatureCollectionUrl = entry.getValue ().first; final String xpathFeatureTypeName = entry.getValue ().second; final OMElement featureCollectionUrlElement; final OMElement featureTypeNameElement; try { featureCollectionUrlElement= AxiomUtils.getOMElementWithXPath (rootElement, xpathFeatureCollectionUrl, namespaces); } catch (Exception e) { throw new HarvesterException (e, HarvesterMessageKey.METADATA_WFSURL, url, xpathFeatureCollectionUrl, getExceptionMessage (e)); } try { featureTypeNameElement = AxiomUtils.getOMElementWithXPath (rootElement, xpathFeatureTypeName, namespaces); } catch (Exception e) { throw new HarvesterException (e, HarvesterMessageKey.METADATA_FEATURETYPE, url, xpathFeatureTypeName, getExceptionMessage (e)); } if (featureCollectionUrlElement == null) { continue; }else{ String fcUrl = featureCollectionUrlElement.getText().trim(); if (fcUrl.indexOf("?")>0){ fcUrl = fcUrl.substring(0, fcUrl.indexOf("?")); } if (!fcUrl.endsWith("?") && fcUrl.indexOf("?")>0){ throw new HarvesterException (HarvesterMessageKey.METADATA_WFSURL, url, xpathFeatureCollectionUrl, "Url not correct: [" + fcUrl+"]"); } } if (featureTypeNameElement == null){ throw new HarvesterException (HarvesterMessageKey.METADATA_FEATURETYPE, url, xpathFeatureTypeName, "featureTypeNameElement not found"); } technicalLog.debug (" - feature collection url: [" + featureCollectionUrlElement.getText ()+"]"); technicalLog.debug (" - feature type name : [" + featureTypeNameElement.getText ()+"]"); //expect an W3C:XSD url if xpathFeatureCollectionUrl is OGC:GML or W3C:XML String xsdUrl = null; if (xpathFeatureCollectionUrl.equals(xpathGmlFeatureCollection) || xpathFeatureCollectionUrl.equals(xpathXmlDataset)) { final OMElement xsdLocationElement; try { xsdLocationElement= AxiomUtils.getOMElementWithXPath (rootElement, xpathXsdUrl, namespaces); if (xsdLocationElement != null){ xsdUrl = xsdLocationElement.getText(); } } catch (Exception e) { System.err.println("XSD error: " + getExceptionMessage (e)); throw new HarvesterException (e, HarvesterMessageKey.METADATA_XSDURL, url, xpathXsdUrl, getExceptionMessage (e)); } } technicalLog.debug (" - feature xsd url : [" + xsdUrl+"]"); return new FeatureCollectionReference (entry.getKey (), featureCollectionUrlElement.getText ().trim(), featureTypeNameElement == null ? null : featureTypeNameElement.getText (), xsdUrl); } return null; } /** * Request capabilities from a wfs. * @param wfsUrl String containing wfs url * @return OMElement containing the getcapabilities document */ public String getFeatureCollectionUrl (final DatasetMetadata metadata) throws HarvesterException { if (!(metadata instanceof PgrMetadata)) { return metadata.getFeatureCollectionUrl (); } final PgrMetadata pgrMetadata = (PgrMetadata)metadata; FeatureCollectionType dataType = pgrMetadata.getFeatureCollectionReference ().type; if (dataType == GML || dataType == XML) { return pgrMetadata.getFeatureCollectionReference ().url; } OMElement urlElement = null; String capabilitiesUrl = createWfsGetCapabilitiesUrl(metadata.getFeatureCollectionUrl ()); technicalLog.debug("GetCapabilitiesUrl: " + capabilitiesUrl); HttpGetUtil capabilitiesHttpGetUtil = null; try { OMElement rootWFS = null; try { capabilitiesHttpGetUtil = new HttpGetUtil(capabilitiesUrl); if (!capabilitiesHttpGetUtil.isValidResponse()) { throw new HarvesterException (HarvesterMessageKey.METADATA_CAPABILITIES_HTTP_ERROR, capabilitiesUrl, ""+capabilitiesHttpGetUtil.getStatusCode()); } rootWFS = capabilitiesHttpGetUtil.getEntityOMElement(); if(rootWFS == null){ // We should not get here throw new RuntimeException("Not a valid GetCapabilitiesResponse"); } } catch (Exception e) { throw new HarvesterException (e, HarvesterMessageKey.METADATA_CAPABILITIES_EXCEPTION, capabilitiesUrl, getExceptionMessage (e)); } try { urlElement = AxiomUtils.getOMElementWithXPath(rootWFS, xpathGetFeature, namespaces); if (urlElement==null){ throw new RuntimeException("Url to GetFeature operation not found in capabilities-response document"); } else { technicalLog.debug(" - wfs url href: " + urlElement.getAttributeValue(new QName("http://www.w3.org/1999/xlink","href"))); } } catch (Exception e) { throw new HarvesterException (e, HarvesterMessageKey.METADATA_CAPABILITIES_WFSURL, capabilitiesUrl, xpathGetFeature, getExceptionMessage (e)); } } finally{ if(capabilitiesHttpGetUtil != null) { capabilitiesHttpGetUtil.close(); } } return urlElement == null ? null : urlElement.getAttributeValue(new QName("http://www.w3.org/1999/xlink","href")); } protected String createWfsGetCapabilitiesUrl(final String wfsUrl) { // some url's end with '?service=wfs', others do not String separator; String capabilitiesUrl = wfsUrl; int qmarkIndex = wfsUrl.indexOf("?"); boolean hasService = false, hasRequest = false; if(qmarkIndex != -1) { String query = wfsUrl.substring(qmarkIndex + 1).trim(); if(!query.isEmpty()) { String[] parameters = query.split("&"); for(String parameter : parameters) { String[] parameterSplit = parameter.split("="); if(parameterSplit.length == 2) { try { String key = URLDecoder.decode(parameterSplit[0], "utf-8").toLowerCase(); String value = URLDecoder.decode(parameterSplit[1], "utf-8"); hasService |= "service".equals(key) && value != null && value.toLowerCase().equals("wfs"); hasRequest |= "request".equals(key) && "GetCapabilities".equals(value); } catch (UnsupportedEncodingException e) {} } } separator = "&"; } else { separator = ""; } } else { separator = "?"; } if(!hasService) { capabilitiesUrl += separator + "service=WFS"; separator = "&"; } if(!hasRequest) { capabilitiesUrl += separator + "request=GetCapabilities"; } return capabilitiesUrl; } }