/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.ctask.general;

import java.io.IOException;
import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.MetadataValue;
import org.dspace.core.Constants;
import org.dspace.curate.AbstractCurationTask;
import org.dspace.curate.Curator;
import org.dspace.curate.Mutative;
import org.dspace.curate.Suspendable;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * MetadataWebService task calls a web service using metadata from a
 * passed item to obtain data. Depending on configuration, this
 * data may be assigned to item metadata fields, or just recorded in the
 * task result string. The task succeeds if the web service call succeeds and
 * the configured updates occur; it fails if the task user is not authorized or
 * the item lacks the metadata needed to call the service; and it returns an
 * error in all other cases (except a skip status for non-item objects).
 * Intended use: as a cataloging tool in workflow and in general curation.
 * The task uses a URL 'template' to compose the service call, e.g.
 *
 * {@code http://www.sherpa.ac.uk/romeo/api29.php?issn=\{dc.identifier.issn\}}
 *
 * The task will substitute the value of the passed item's metadata field
 * in the {parameter} position. If multiple values are present in the
 * item field, the first value is used.
 *
 * The task uses another property (the datamap) to determine what data
 * to extract from the service response and how to use it, e.g.
 *
 * {@code //publisher/name=>dc.publisher,//romeocolour}
 *
 * The task will evaluate the left-hand side (or entire token) of each
 * comma-separated token in the property as an XPath 1.0 expression into
 * the response document, and if there is a mapping symbol (e.g. {@code '=>'}) and
 * a value, it will assign the response document value(s) to the named
 * metadata field in the passed item. If the response document contains
 * multiple values, they will all be assigned to the item field.
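 *
 * For illustration only (this response fragment is invented and does not reflect
 * any actual service's response format), the datamap above evaluated against a
 * response such as
 *
 * {@code <romeoapi><publisher><name>Open Books</name></publisher><romeocolour>green</romeocolour></romeoapi>}
 *
 * would assign 'Open Books' to dc.publisher (per the '=>' mapping rule described
 * below) and record 'romeocolour: green' in the task result string.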
 *
 * The mapping symbol governs the nature of the metadata field assignment:
 *
 * {@code '->'} mapping will add to any existing values in the item field
 * {@code '=>'} mapping will replace any existing values in the item field
 * {@code '~>'} mapping will add *only* if the item field has no existing values
 *
 * Unmapped data (without a mapping symbol) will simply be added to the task
 * result string, prepended by the XPath expression (a little prettified).
 * Each label/value pair in the result string is separated by a space,
 * unless the optional 'separator' property is defined.
 *
 * A very rudimentary facility for transformation of data is supported, e.g.
 *
 * {@code http://www.crossref.org/openurl/?id=\{doi:dc.relation.isversionof\}&format=unixref}
 *
 * The 'doi:' prefix will cause the task to look for a 'transform' with that
 * name, which is applied to the metadata value before parameter substitution
 * occurs. Transforms are defined in a task property such as the following:
 *
 * {@code transform.doi = match 10. trunc 60}
 *
 * This means: discard everything in the value before the first occurrence of
 * '10.', then truncate the result to 60 characters. The only transform
 * functions currently defined are:
 *
 * {@code 'cut' <number>} = remove the first <number> characters
 * {@code 'trunc' <number>} = truncate the value to <number> characters
 * {@code 'match' <pattern>} = start the value at the first occurrence of <pattern>
 * {@code 'text' <characters>} = append the literal characters (enclose in ' ' when whitespace is needed)
 *
 * If a transform results in an invalid state (e.g. cutting more characters
 * than are in the value), the condition will be logged and the
 * un-transformed value used.
 *
 * Transforms may also be used in datamaps, e.g.
 *
 * {@code //publisher/name=>shorten:dc.publisher,//romeocolour}
 *
 * which would apply the 'shorten' transform to the service response value(s)
 * prior to metadata field assignment.
 *
 * An optional property 'headers' may be defined to stipulate any HTTP headers
 * required in the service call. The property syntax is double-pipe separated headers:
 *
 * {@code Accept: text/xml||Cache-Control: no-cache}
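 *
 * Putting the pieces together, a complete task configuration assembled from the
 * examples above might look like the following (the property values are
 * illustrative only, not a tested configuration):
 *
 * {@code template = http://www.sherpa.ac.uk/romeo/api29.php?issn=\{dc.identifier.issn\}}
 * {@code datamap = //publisher/name=>dc.publisher,//romeocolour}
 * {@code headers = Accept: text/xml||Cache-Control: no-cache}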
 *
 * @author richardrodgers
 */
@Mutative
@Suspendable
public class MetadataWebService extends AbstractCurationTask implements NamespaceContext {
    /** log4j category */
    private static final Logger log = Logger.getLogger(MetadataWebService.class);

    // transform token parsing pattern
    protected Pattern ttPattern = Pattern.compile("\'([^\']*)\'|(\\S+)");
    // URL of web service with template parameters
    protected String urlTemplate = null;
    // template parameter
    protected String templateParam = null;
    // Item metadata field to use in service call
    protected String lookupField = null;
    // Optional transformation of lookupField
    protected String lookupTransform = null;
    // response data to map/record
    protected List<MetadataWebServiceDataInfo> dataList = null;
    // response document parsing tools
    protected DocumentBuilder docBuilder = null;
    // language for metadata fields assigned
    protected String lang = null;
    // field separator in result string
    protected String fieldSeparator = null;
    // optional XML namespace map
    protected Map<String, String> nsMap = new HashMap<String, String>();
    // optional HTTP headers
    protected Map<String, String> headers = new HashMap<String, String>();

    /**
     * Initializes task
     *
     * @param curator Curator object performing this task
     * @param taskId  the configured local name of the task
     */
    @Override
    public void init(Curator curator, String taskId) throws IOException {
        super.init(curator, taskId);
        lang = configurationService.getProperty("default.language");
        String fldSep = taskProperty("separator");
        fieldSeparator = (fldSep != null) ? fldSep : " ";
        urlTemplate = taskProperty("template");
        templateParam = urlTemplate.substring(urlTemplate.indexOf("{") + 1,
                                              urlTemplate.indexOf("}"));
        String[] parsed = parseTransform(templateParam);
        lookupField = parsed[0];
        lookupTransform = parsed[1];
        dataList = new ArrayList<>();
        for (String entry : taskArrayProperty("datamap")) {
            entry = entry.trim();
            String src = entry;
            String mapping = null;
            String field = null;
            int mapIdx = getMapIndex(entry);
            if (mapIdx > 0) {
                src = entry.substring(0, mapIdx);
                mapping = entry.substring(mapIdx, mapIdx + 2);
                field = entry.substring(mapIdx + 2);
            }
            int slIdx = src.lastIndexOf("/");
            String label = (slIdx > 0) ?
                src.substring(slIdx + 1) : src;
            dataList.add(new MetadataWebServiceDataInfo(this, src, label, mapping, field));
        }
        String hdrs = taskProperty("headers");
        if (hdrs != null) {
            for (String header : hdrs.split("\\|\\|")) {
                int split = header.indexOf(":");
                headers.put(header.substring(0, split).trim(), header.substring(split + 1).trim());
            }
        }
        // initialize response document parser
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        try {
            docBuilder = factory.newDocumentBuilder();
        } catch (ParserConfigurationException pcE) {
            log.error("caught exception: " + pcE);
            // no point in continuing
            throw new IOException(pcE.getMessage(), pcE);
        }
    }

    /**
     * Perform the curation task upon passed DSO
     *
     * @param dso the DSpace object
     * @throws IOException if IO error
     */
    @Override
    public int perform(DSpaceObject dso) throws IOException {
        int status = Curator.CURATE_SKIP;
        StringBuilder resultSb = new StringBuilder();
        if (dso.getType() == Constants.ITEM) {
            Item item = (Item) dso;
            String itemId = item.getHandle();
            if (itemId == null) {
                // we are still in workflow - no handle assigned - try title
                List<MetadataValue> titleDc = itemService.getMetadata(item, "dc", "title", null, Item.ANY);
                String title = (titleDc.size() > 0) ? titleDc.get(0).getValue()
                                                    : "untitled - dbId: " + item.getID();
                itemId = "Workflow item: " + title;
            } else {
                itemId = "handle: " + itemId;
            }
            resultSb.append(itemId);
            // Only proceed if item has a value for service template parameter
            List<MetadataValue> dcVals = itemService.getMetadataByMetadataString(item, lookupField);
            if (dcVals.size() > 0 && dcVals.get(0).getValue().length() > 0) {
                String value = transform(dcVals.get(0).getValue(), lookupTransform);
                status = callService(value, item, resultSb);
            } else {
                resultSb.append(" lacks metadata value required for service: ").append(lookupField);
                status = Curator.CURATE_FAIL;
            }
        } else {
            resultSb.append("Object skipped");
        }
        report(resultSb.toString());
        setResult(resultSb.toString());
        return status;
    }

    protected int callService(String value, Item item, StringBuilder resultSb) throws IOException {
        String callUrl = urlTemplate.replaceAll("\\{" + templateParam + "\\}", value);
        HttpClient client = new DefaultHttpClient();
        HttpGet req = new HttpGet(callUrl);
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            req.addHeader(entry.getKey(), entry.getValue());
        }
        HttpResponse resp = client.execute(req);
        int status = Curator.CURATE_ERROR;
        int statusCode = resp.getStatusLine().getStatusCode();
        if (statusCode == HttpStatus.SC_OK) {
            HttpEntity entity = resp.getEntity();
            if (entity != null) {
                // boiler-plate handling taken from Apache 4.1 javadoc
                InputStream instream = entity.getContent();
                try {
                    Document doc = docBuilder.parse(instream);
                    status = processResponse(doc, item, resultSb);
                } catch (SAXException saxE) {
                    log.error("caught exception: " + saxE);
                    resultSb.append(" unable to read response document");
                } catch (RuntimeException ex) {
                    // In case of an unexpected exception you may want to abort
                    // the HTTP request in order to shut down the underlying
                    // connection and release it back to the connection manager.
                    req.abort();
                    log.error("caught exception: " + ex);
                    throw ex;
                } finally {
                    // Closing the input stream will trigger connection release
                    instream.close();
                }
                // When HttpClient instance is no longer needed,
                // shut down the connection manager to ensure
                // immediate deallocation of all system resources
                client.getConnectionManager().shutdown();
            } else {
                log.error(" obtained no valid service response");
                resultSb.append("no service response");
            }
        } else {
            log.error("service returned non-OK status: " + statusCode);
            resultSb.append("no service response");
        }
        return status;
    }

    protected int processResponse(Document doc, Item item, StringBuilder resultSb) throws IOException {
        boolean update = false;
        int status = Curator.CURATE_ERROR;
        List<String> values = new ArrayList<String>();
        checkNamespaces(doc);
        try {
            for (MetadataWebServiceDataInfo info : dataList) {
                NodeList nodes = (NodeList) info.getExpr().evaluate(doc, XPathConstants.NODESET);
                values.clear();
                // if data found and we are mapping, check assignment policy
                if (nodes.getLength() > 0 && info.getMapping() != null) {
                    if ("=>".equals(info.getMapping())) {
                        itemService.clearMetadata(Curator.curationContext(), item, info.getSchema(),
                                                  info.getElement(), info.getQualifier(), Item.ANY);
                    } else if ("~>".equals(info.getMapping())) {
                        if (itemService.getMetadata(item, info.getSchema(), info.getElement(),
                                                    info.getQualifier(), Item.ANY).size() > 0) {
                            // there are values, so don't overwrite
                            continue;
                        }
                    } else {
                        for (MetadataValue dcVal : itemService.getMetadata(item, info.getSchema(),
                                                                           info.getElement(),
                                                                           info.getQualifier(), Item.ANY)) {
                            values.add(dcVal.getValue());
                        }
                    }
                }
                for (int i = 0; i < nodes.getLength(); i++) {
                    Node node = nodes.item(i);
                    String tvalue = transform(node.getFirstChild().getNodeValue(), info.getTransform());
                    // assign to metadata field if mapped && not present
                    if (info.getMapping() != null
                            && !values.contains(tvalue)) {
                        itemService.addMetadata(Curator.curationContext(), item, info.getSchema(),
                                                info.getElement(), info.getQualifier(), lang, tvalue);
                        update = true;
                    }
                    // add to result string in any case
                    resultSb.append(fieldSeparator).append(info.getLabel()).append(": ").append(tvalue);
                }
            }
            // update Item if it has changed
            if (update) {
                itemService.update(Curator.curationContext(), item);
            }
            status = Curator.CURATE_SUCCESS;
        } catch (AuthorizeException authE) {
            log.error("caught exception: " + authE);
            resultSb.append(" not authorized to update");
            status = Curator.CURATE_FAIL;
        } catch (SQLException sqlE) {
            log.error("caught exception: " + sqlE);
            resultSb.append(" error updating metadata");
        } catch (XPathExpressionException xpeE) {
            log.error("caught exception: " + xpeE);
            resultSb.append(" error reading response document");
        }
        return status;
    }

    protected String transform(String value, String transDef) {
        if (transDef == null) {
            return value;
        }
        String[] tokens = tokenize(transDef);
        String retValue = value;
        for (int i = 0; i < tokens.length; i += 2) {
            if ("cut".equals(tokens[i]) || "trunc".equals(tokens[i])) {
                int index = Integer.parseInt(tokens[i + 1]);
                if (retValue.length() > index) {
                    if ("cut".equals(tokens[i])) {
                        retValue = retValue.substring(index);
                    } else {
                        retValue = retValue.substring(0, index);
                    }
                } else if ("cut".equals(tokens[i])) {
                    log.error("requested cut: " + index + " exceeds value length");
                    return value;
                }
            } else if ("match".equals(tokens[i])) {
                int index2 = retValue.indexOf(tokens[i + 1]);
                if (index2 > 0) {
                    retValue = retValue.substring(index2);
                } else {
                    log.error("requested match: " + tokens[i + 1] + " failed");
                    return value;
                }
            } else if ("text".equals(tokens[i])) {
                retValue = retValue + tokens[i + 1];
            } else {
                log.error(" unknown transform operation: " + tokens[i]);
                return value;
            }
        }
        return retValue;
    }

    protected String[] tokenize(String text) {
        List<String> list = new ArrayList<String>();
        Matcher m = ttPattern.matcher(text);
        while (m.find()) {
            if (m.group(1) != null) {
                list.add(m.group(1));
            } else if (m.group(2) != null) {
                list.add(m.group(2));
            }
        }
        return list.toArray(new String[0]);
    }

    protected int getMapIndex(String mapping) {
        int index = mapping.indexOf("->");
        if (index == -1) {
            index = mapping.indexOf("=>");
        }
        if (index == -1) {
            index = mapping.indexOf("~>");
        }
        return index;
    }

    protected String[] parseTransform(String field) {
        String[] parsed = new String[2];
        parsed[0] = field;
        int txIdx = field.indexOf(":");
        if (txIdx > 0) {
            // transform specified
            String txName = field.substring(0, txIdx);
            parsed[1] = taskProperty("transform." + txName);
            if (parsed[1] == null) {
                log.error("no transform found for: " + txName);
            }
            parsed[0] = field.substring(txIdx + 1);
        }
        return parsed;
    }

    protected void checkNamespaces(Document document) throws IOException {
        // skip if already done
        if (dataList.get(0).getExpr() != null) {
            return;
        }
        try {
            XPath xpath = XPathFactory.newInstance().newXPath();
            String prefix = null;
            NamedNodeMap attrs = document.getDocumentElement().getAttributes();
            for (int i = 0; i < attrs.getLength(); i++) {
                Node n = attrs.item(i);
                String name = n.getNodeName();
                // remember if a namespace
                if (name.startsWith("xmlns")) {
                    if (!"xmlns".equals(name)) {
                        // it is a declared (non-default) namespace - capture prefix
                        nsMap.put(name.substring(name.indexOf(":") + 1), n.getNodeValue());
                    } else {
                        // it is the default name space - mint a unique prefix
                        prefix = "pre";
                        nsMap.put(prefix, n.getNodeValue());
                    }
                }
            }
            if (nsMap.size() > 0) {
                xpath.setNamespaceContext(this);
            }
            // now compile the XPath expressions
            for (MetadataWebServiceDataInfo info : dataList) {
                info.setExpr(xpath.compile(mangleExpr(info.getXpsrc(), prefix)));
            }
        } catch (XPathExpressionException xpeE) {
            log.error("caught exception: " + xpeE);
            // no point in continuing
            throw new IOException(xpeE.getMessage(), xpeE);
        }
    }

    protected String mangleExpr(String expr, String prefix) {
        if (prefix == null) {
            return expr;
        }
        // OK the drill is to prepend all node names with the prefix
        // *unless* the node name already has a prefix.
        StringBuilder sb = new StringBuilder();
        int i = 0;
        while (i < expr.length()) {
            if (expr.charAt(i) == '/') {
                sb.append("/");
                i++;
            } else {
                int next = expr.indexOf("/", i);
                String token = (next > 0) ? expr.substring(i, next) : expr.substring(i);
                if (!token.startsWith("@") && token.indexOf(":") < 0) {
                    sb.append(prefix).append(":");
                }
                sb.append(token);
                i += token.length();
            }
        }
        return sb.toString();
    }

    // ---- NamespaceContext methods ---- //

    @Override
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            throw new NullPointerException("Null prefix");
        } else if ("xml".equals(prefix)) {
            return XMLConstants.XML_NS_URI;
        }
        String nsURI = nsMap.get(prefix);
        return (nsURI != null) ? nsURI : XMLConstants.NULL_NS_URI;
    }

    @Override
    public String getPrefix(String uri) {
        throw new UnsupportedOperationException();
    }

    @Override
    public Iterator getPrefixes(String uri) {
        throw new UnsupportedOperationException();
    }
}