package eu.esdihumboldt.hale.io.codelist.inspire.reader; import java.io.IOException; import java.io.InputStream; import java.net.Proxy; import java.net.URI; import java.util.Locale; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathFactory; import org.apache.http.HttpEntity; import org.apache.http.HttpHeaders; import org.apache.http.HttpResponse; import org.apache.http.StatusLine; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpResponseException; import org.apache.http.client.ResponseHandler; import org.apache.http.client.fluent.Executor; import org.apache.http.client.fluent.Request; import org.apache.http.client.fluent.Response; import org.apache.http.entity.ContentType; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import eu.esdihumboldt.hale.common.codelist.CodeList; import eu.esdihumboldt.hale.common.codelist.CodeList.CodeEntry; import eu.esdihumboldt.hale.common.codelist.io.CodeListReader; import eu.esdihumboldt.hale.common.core.io.IOProviderConfigurationException; import eu.esdihumboldt.hale.common.core.io.ProgressIndicator; import eu.esdihumboldt.hale.common.core.io.impl.AbstractImportProvider; import eu.esdihumboldt.hale.common.core.io.report.IOReport; import eu.esdihumboldt.hale.common.core.io.report.IOReporter; import eu.esdihumboldt.hale.common.core.io.report.impl.IOMessageImpl; import eu.esdihumboldt.util.http.ProxyUtil; import eu.esdihumboldt.util.http.client.fluent.FluentProxyUtil; import eu.esdihumboldt.util.io.InputSupplier; import eu.esdihumboldt.util.resource.Resources; /** * Load XML code lists as provided by the INSPIRE registry. * * @author Kai Schwierczek */ public class INSPIRECodeListReader extends AbstractImportProvider implements CodeListReader { /** * The provider ID. */ public static final String PROVIDER_ID = "eu.esdihumboldt.hale.io.codelist.inspire.reader"; private CodeList codelist; @Override public boolean isCancelable() { return false; // TODO } /** * @see CodeListReader#getCodeList() */ @Override public CodeList getCodeList() { return codelist; } @Override protected IOReport execute(ProgressIndicator progress, IOReporter reporter) throws IOProviderConfigurationException, IOException { progress.begin("Loading code list.", ProgressIndicator.UNKNOWN); try { Document doc; URI loc = getSource().getLocation(); if (loc != null && (loc.getScheme().equals("http") || loc.getScheme().equals("https"))) { // load with HTTP client // and provide headers to retrieve correct format and language try { doc = loadXmlDocument(loc); } catch (Exception e) { // try local resources as fall-back InputSupplier<? extends InputStream> localInput = Resources.tryResolve(loc, Resources.RESOURCE_TYPE_XML_CODELIST); if (localInput != null) { try (InputStream is = localInput.getInput()) { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); doc = db.parse(is); } } else throw e; } } else { // just access stream try (InputStream is = getSource().getInput()) { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); doc = db.parse(is); } } reporter.setSuccess(parse(doc, loc, reporter)); } catch (Exception e) { throw new RuntimeException(e); } progress.setCurrentTask("Code list loaded."); return reporter; } /** * Load an XML document via HTTP, providing headers to request proper format * and language. * * @param loc the location * @return the XML document * @throws IOException if reading the document fails * @throws ClientProtocolException if retrieving the document fails */ public static Document loadXmlDocument(URI loc) throws ClientProtocolException, IOException { Response response = getResponse(loc); return response.handleResponse(new ResponseHandler<Document>() { @Override public Document handleResponse(HttpResponse response) throws ClientProtocolException, IOException { StatusLine statusLine = response.getStatusLine(); HttpEntity entity = response.getEntity(); if (statusLine.getStatusCode() >= 300) { throw new HttpResponseException(statusLine.getStatusCode(), statusLine.getReasonPhrase()); } if (entity == null) { throw new ClientProtocolException("Response contains no content"); } DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance(); try { DocumentBuilder docBuilder = dbfac.newDocumentBuilder(); return docBuilder.parse(entity.getContent()); } catch (ParserConfigurationException ex) { throw new IllegalStateException(ex); } catch (SAXException ex) { throw new ClientProtocolException("Malformed XML document", ex); } } }); } private boolean parse(Document doc, URI location, IOReporter reporter) throws Exception { XPath xpath = XPathFactory.newInstance().newXPath(); boolean directlyReferenced = location != null && location.toString().toLowerCase().endsWith(".xml"); String description = null; String namespace = null; namespace = (String) xpath.evaluate("codelist/@id", doc, XPathConstants.STRING); if (namespace == null) { reporter.error(new IOMessageImpl("No id attribute present in INSPIRE codelist.", null)); return false; } // use the last part of the id as name String name = namespace; int idxSlash = name.indexOf('/'); if (idxSlash >= 0 && idxSlash + 1 < name.length()) { name = name.substring(idxSlash); } if (directlyReferenced) { // if directly referenced use the label as name // (for backwards compatibility) NodeList labels = (NodeList) xpath.evaluate("codelist/label", doc, XPathConstants.NODESET); if (labels.getLength() > 0) { name = labels.item(0).getTextContent(); } } NodeList definitions = (NodeList) xpath.evaluate("codelist/definition", doc, XPathConstants.NODESET); if (definitions.getLength() > 0) description = definitions.item(0).getTextContent(); // XXX ignore descriptions for now // also ignore status, extensibility, register, applicationschema and // theme // don't use the name as identifier, as it is language dependent! INSPIRECodeList codelist = new INSPIRECodeList(namespace, name, description, location); NodeList entries = (NodeList) xpath.evaluate("codelist/containeditems/value", doc, XPathConstants.NODESET); for (int i = 0; i < entries.getLength(); i++) addEntry(entries.item(i), codelist, xpath, reporter); this.codelist = codelist; return true; } private void addEntry(Node item, INSPIRECodeList codelist, XPath xpath, IOReporter reporter) throws Exception { String name = null; String description = null; String identifier = null; String namespace = null; identifier = (String) xpath.evaluate("@id", item, XPathConstants.STRING); if (identifier == null) { reporter.warn(new IOMessageImpl( "No id attribute present in a value of the INSPIRE codelist. Skipping value.", null)); return; } // XXX what about multiple labels or definitions? NodeList labels = (NodeList) xpath.evaluate("label", item, XPathConstants.NODESET); if (labels.getLength() > 0) name = labels.item(0).getTextContent(); else { reporter.warn(new IOMessageImpl("No label present in a value of the INSPIRE codelist.", null)); name = identifier; } NodeList definitions = (NodeList) xpath.evaluate("definition", item, XPathConstants.NODESET); if (definitions.getLength() > 0) description = definitions.item(0).getTextContent(); // in schema no description, but in data; anyways, ignore it for now // also ignore status, register, applicationschema and theme namespace = (String) xpath.evaluate("codelist/@id", item, XPathConstants.STRING); // XXX I guess namespace has to be the same as the codelist. Check this? codelist.addEntry(new CodeEntry(name, description, identifier, namespace)); } @Override protected String getDefaultTypeName() { return "INSPIRE code list"; } /** * This method creates a fluent request for the given URI reference resource * location, adds header to accept application/xml content type. Sets the * proxy if proxy is configured. Executes the fluent request and returns the * fluent response * * @param uri uri reference of the resource location. * @return Executor, returns the executor for executing fluent request * @throws IOException throws if there are some interruption I/O operations * while executing the fluent request * @throws ClientProtocolException throws if it fails while executing the * request */ public static Response getResponse(URI uri) throws ClientProtocolException, IOException { Request request = Request.Get(uri) .addHeader(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()) .addHeader(HttpHeaders.ACCEPT_LANGUAGE, Locale.getDefault().getLanguage()); Proxy proxy = ProxyUtil.findProxy(uri); // If proxy is configured then set the proxy Executor executor = FluentProxyUtil.setProxy(request, proxy); return executor.execute(request); } }