/**
* Copyright 2008 The University of North Carolina at Chapel Hill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.unc.lib.dl.cdr.services.techmd;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.FITS_NS;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.PREMIS_V2_NS;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.httpclient.util.URIUtil;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.Namespace;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.XMLOutputter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.unc.lib.dl.cdr.services.AbstractFedoraEnhancement;
import edu.unc.lib.dl.cdr.services.AbstractFedoraEnhancementService;
import edu.unc.lib.dl.cdr.services.AbstractIrodsObjectEnhancementService;
import edu.unc.lib.dl.cdr.services.exception.EnhancementException;
import edu.unc.lib.dl.cdr.services.exception.EnhancementException.Severity;
import edu.unc.lib.dl.cdr.services.model.EnhancementMessage;
import edu.unc.lib.dl.fedora.FedoraException;
import edu.unc.lib.dl.fedora.FileSystemException;
import edu.unc.lib.dl.fedora.NotFoundException;
import edu.unc.lib.dl.fedora.PID;
import edu.unc.lib.dl.util.ContentModelHelper;
import edu.unc.lib.dl.util.ContentModelHelper.CDRProperty;
import edu.unc.lib.dl.xml.FOXMLJDOMUtil;
import edu.unc.lib.dl.xml.JDOMNamespaceUtil;
/**
* Executes irods script which uses FITS to extract technical metadata features of objects with data file datastreams.
*
* @author Gregory Jansen
*
*/
public class TechnicalMetadataEnhancement extends AbstractFedoraEnhancement {
    // Package-visible handle on the FITS namespace, used for the FITS "identification" lookups below.
    Namespace ns = JDOMNamespaceUtil.FITS_NS;
    private static final Logger LOG = LoggerFactory.getLogger(TechnicalMetadataEnhancement.class);
    // Upper bound on the length of a file name suffix INCLUDING its leading dot, so the extension text itself
    // is at most MAX_EXTENSION_LENGTH - 1 characters.
    private static final int MAX_EXTENSION_LENGTH = 8;

    public TechnicalMetadataEnhancement(AbstractFedoraEnhancementService service, EnhancementMessage message) {
        super(service, message);
    }

    /**
     * Runs FITS against every source data datastream of the object, wraps the results in a PREMIS document,
     * stores that document as the MD_TECHNICAL datastream and makes sure the techData relationship exists. For
     * the DATA_FILE datastream the source mimetype and source file size properties are also written.
     *
     * @return always {@code null}; this enhancement produces no result element
     * @throws EnhancementException
     *            UNRECOVERABLE when a source datastream (or its content location) is missing, when the FITS
     *            output cannot be parsed or when Fedora reports the object as not found; FATAL on a
     *            FileSystemException; RECOVERABLE on any other FedoraException
     */
    @Override
    public Element call() throws EnhancementException {
        // check to see if the service is still active
        if (!this.service.isActive()) {
            LOG.debug("{} call method exited, service is not active.", this.getClass().getName());
            return null;
        }

        Map<String, Document> ds2FitsDoc = new HashMap<String, Document>();
        try {
            Document foxml = this.retrieveFoxml();

            // get sourceData data stream IDs
            List<String> srcDSURIs = this.getSourceData(foxml);
            Map<String, String> sourceMimetype = new HashMap<String, String>(srcDSURIs.size());
            // Digest per datastream, so that one datastream's checksum is never reported for another one
            Map<String, String> sourceChecksum = new HashMap<String, String>(srcDSURIs.size());

            for (String srcURI : srcDSURIs) { // for each source datastream
                LOG.debug("source data URI: {}", srcURI);
                String dsid = srcURI.substring(srcURI.lastIndexOf("/") + 1);
                LOG.debug("datastream ID: {}", dsid);

                // get current datastream version
                Element newestSourceDS = FOXMLJDOMUtil.getMostRecentDatastream(
                        ContentModelHelper.Datastream.getDatastream(dsid), foxml);
                if (newestSourceDS == null) {
                    throw new EnhancementException("Specified source datastream " + srcURI
                            + " was not found, the object " + this.pid.getPid() + " is most likely invalid",
                            Severity.UNRECOVERABLE);
                }

                sourceMimetype.put(dsid, newestSourceDS.getAttributeValue("MIMETYPE"));

                Element contentLocation = newestSourceDS.getChild("contentLocation", JDOMNamespaceUtil.FOXML_NS);
                if (contentLocation == null) {
                    // Without a content location there is no file to hand to FITS
                    throw new EnhancementException("Source datastream " + srcURI + " of object "
                            + this.pid.getPid() + " has no contentLocation, unable to locate its file",
                            Severity.UNRECOVERABLE);
                }
                String dsLocation = contentLocation.getAttributeValue("REF");
                String dsAltIds = newestSourceDS.getAttributeValue("ALT_IDS");

                Element dfDigest = newestSourceDS.getChild("contentDigest", JDOMNamespaceUtil.FOXML_NS);
                if (dfDigest != null) {
                    sourceChecksum.put(dsid, dfDigest.getAttributeValue("DIGEST"));
                }

                // get logical iRODS path for datastream version
                String dsIrodsPath = service.getManagementClient().getIrodsPath(dsLocation);

                // call fits via irods rule for the locations
                Document fits = null;
                try {
                    fits = runFITS(dsIrodsPath, dsAltIds);
                } catch (JDOMException e) {
                    // Rethrow JDOM exception as an unrecoverable enhancement exception
                    throw new EnhancementException(e, Severity.UNRECOVERABLE);
                } catch (Exception e) {
                    // Any other failure while running FITS is surfaced unchecked
                    throw new RuntimeException(e);
                }

                // put the FITS document in DS map
                ds2FitsDoc.put(dsid, fits);
            }

            // build a PREMIS document
            Document premisTech = new Document();
            Element p = new Element("premis", PREMIS_V2_NS);
            premisTech.addContent(p);

            for (Map.Entry<String, Document> entry : ds2FitsDoc.entrySet()) {
                String dsid = entry.getKey();
                Document fits = entry.getValue();
                Element fitsRoot = fits.getRootElement();

                // get key PREMIS data; size stays null when FITS reported no fileinfo/size
                Element fileinfo = fitsRoot.getChild("fileinfo", FITS_NS);
                String size = fileinfo == null ? null : fileinfo.getChildText("size", FITS_NS);

                String fedoraMimetype = sourceMimetype.get(dsid);
                String fitsMimetype = fedoraMimetype;
                // Format only provided if using FITS mimetype, since otherwise it may not match
                String format = null;

                // If fedora mimetype is not meaningful, then override with FITS generated value.
                // For example some pdfs come with the mime-type application/(x-)download from fedora, which is
                // unusable by browsers
                if (isUninformativeMimetype(fedoraMimetype)) {
                    Element identity = selectIdentity(fitsRoot.getChild("identification", ns));
                    if (identity != null) {
                        fitsMimetype = identity.getAttributeValue("mimetype");
                        format = identity.getAttributeValue("format");
                    } else {
                        format = "Unknown";
                        LOG.warn("FITS unable to conclusively identify file: {}/{}", pid, dsid);
                        // Only serialize the FITS document when it is actually going to be logged
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(new XMLOutputter().outputString(fits));
                        }
                    }
                }

                if ("DATA_FILE".equals(dsid)) {
                    String mimetype = "application/octet-stream";
                    if (fitsMimetype != null) {
                        // Throw away the encoding in the mimetype for now
                        int index = fitsMimetype.indexOf(';');
                        mimetype = index == -1 ? fitsMimetype : fitsMimetype.substring(0, index);
                    }
                    client.setExclusiveLiteral(pid, CDRProperty.hasSourceMimeType.getPredicate(),
                            CDRProperty.hasSourceMimeType.getNamespace(), mimetype, null);

                    try {
                        // Validation only: the property is skipped unless the size is a parsable long
                        Long.parseLong(size);
                        client.setExclusiveLiteral(pid, CDRProperty.hasSourceFileSize.getPredicate(),
                                CDRProperty.hasSourceFileSize.getNamespace(), size,
                                "http://www.w3.org/2001/XMLSchema#long");
                    } catch (NumberFormatException e) {
                        LOG.error("FITS produced a non-integer value for size: {}", size);
                    }
                }

                // The FITS root element is moved out of its document into the PREMIS extension element
                p.addContent(buildPremisObject(dsid, sourceChecksum.get(dsid), format, size,
                        fits.detachRootElement()));
            }

            // upload tech MD PREMIS XML
            String premisTechURL = service.getManagementClient().upload(premisTech);

            // Add or replace the MD_TECHNICAL datastream for the object
            if (FOXMLJDOMUtil.getDatastream(foxml, ContentModelHelper.Datastream.MD_TECHNICAL.getName()) == null) {
                LOG.debug("Adding FITS output to MD_TECHNICAL");
                String message = "Adding technical metadata derived by FITS";
                client.addManagedDatastream(pid,
                        ContentModelHelper.Datastream.MD_TECHNICAL.getName(), false, message, new ArrayList<String>(),
                        "PREMIS Technical Metadata", false, "text/xml", premisTechURL);
            } else {
                LOG.debug("Replacing MD_TECHNICAL with new FITS output");
                String message = "Replacing technical metadata derived by FITS";
                client.modifyDatastreamByReference(pid,
                        ContentModelHelper.Datastream.MD_TECHNICAL.getName(), false, message, new ArrayList<String>(),
                        "PREMIS Technical Metadata", "text/xml", null, null, premisTechURL);
            }

            LOG.debug("Adding techData relationship");
            PID newDSPID = new PID(pid.getPid() + "/" + ContentModelHelper.Datastream.MD_TECHNICAL.getName());
            Map<String, List<String>> rels = service.getTripleStoreQueryService().fetchAllTriples(pid);
            List<String> techrel = rels.get(ContentModelHelper.CDRProperty.techData.toString());
            // Only add the relationship when it is not already recorded
            if (techrel == null || !techrel.contains(newDSPID.getURI())) {
                client.addObjectRelationship(pid, CDRProperty.techData.getPredicate(),
                        CDRProperty.techData.getNamespace(), newDSPID);
            }

            LOG.debug("Finished MD_TECHNICAL updating for {}", pid.getPid());
        } catch (FileSystemException e) {
            throw new EnhancementException(e, Severity.FATAL);
        } catch (NotFoundException e) {
            throw new EnhancementException(e, Severity.UNRECOVERABLE);
        } catch (FedoraException e) {
            throw new EnhancementException(e, Severity.RECOVERABLE);
        }
        return null;
    }

    /**
     * Tells whether the mimetype recorded in Fedora is too generic to be useful, in which case the FITS
     * identification is consulted instead.
     *
     * @param mimetype mimetype from the datastream, may be null
     * @return true when the mimetype is null, blank, or contains "octet-stream" or "download"
     */
    private static boolean isUninformativeMimetype(String mimetype) {
        return mimetype == null || mimetype.trim().length() == 0 || mimetype.contains("octet-stream")
                || mimetype.contains("download");
    }

    /**
     * Picks the FITS identity to trust from an identification element.
     *
     * When the identification carries no status attribute its first identity is used. Otherwise the first
     * identity is chosen that either lists more than one tool, or whose single tool is not Exiftool and whose
     * mimetype is not application/x-symlink.
     *
     * @param identification the FITS identification element, may be null
     * @return the selected identity element, or null when none qualifies
     */
    private Element selectIdentity(Element identification) {
        if (identification == null) {
            return null;
        }
        // If there was no conflict, use the first identity
        if (identification.getAttributeValue("status") == null) {
            return identification.getChild("identity", ns);
        }
        for (Element candidate : identification.getChildren("identity", ns)) {
            List<Element> tools = candidate.getChildren("tool", ns);
            if (tools.size() > 1) {
                return candidate;
            }
            // An identity without any tool element is treated as "not identified by Exiftool"
            String toolName = tools.isEmpty() ? null : tools.get(0).getAttributeValue("toolname");
            if (!"Exiftool".equals(toolName)
                    && !"application/x-symlink".equals(candidate.getAttributeValue("mimetype"))) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Builds the PREMIS object element describing one datastream.
     *
     * @param dsid datastream ID, used as the object identifier value
     * @param checksum digest of the datastream, or null to omit the fixity element
     * @param format format name, or null to omit the format element
     * @param size file size text as reported by FITS, may be null
     * @param fitsRoot detached FITS root element, embedded as the characteristics extension
     * @return the populated PREMIS object element
     */
    private static Element buildPremisObject(String dsid, String checksum, String format, String size,
            Element fitsRoot) {
        Element objCharsEl = new Element("objectCharacteristics", PREMIS_V2_NS);
        if (checksum != null) {
            // NOTE(review): the digest is labelled MD5 without looking at the contentDigest TYPE attribute
            // - confirm that source datastreams always carry MD5 digests
            objCharsEl.addContent(new Element("fixity", PREMIS_V2_NS)
                    .addContent(new Element("messageDigestAlgorithm", PREMIS_V2_NS).setText("MD5"))
                    .addContent(new Element("messageDigest", PREMIS_V2_NS).setText(checksum)));
        }
        if (format != null) {
            objCharsEl.addContent(new Element("format", PREMIS_V2_NS)
                    .addContent(new Element("formatDesignation", PREMIS_V2_NS)
                            .addContent(new Element("formatName", PREMIS_V2_NS).setText(format))));
        }
        objCharsEl.addContent(new Element("compositionLevel", PREMIS_V2_NS).setText("0"));
        objCharsEl.addContent(new Element("size", PREMIS_V2_NS).setText(size));
        objCharsEl.addContent(new Element("objectCharacteristicsExtension", PREMIS_V2_NS).addContent(fitsRoot));

        Element identifier = new Element("objectIdentifier", PREMIS_V2_NS)
                .addContent(new Element("objectIdentifierType", PREMIS_V2_NS).setText("Fedora Datastream PID"))
                .addContent(new Element("objectIdentifierValue", PREMIS_V2_NS).setText(dsid));

        Element object = new Element("object", PREMIS_V2_NS);
        object.addContent(identifier);
        object.addContent(objCharsEl);
        object.setAttribute("type", PREMIS_V2_NS.getPrefix() + ":file", JDOMNamespaceUtil.XSI_NS);
        return object;
    }

    /**
     * Finds a file extension in a space separated list of alternate IDs.
     *
     * For each ID only the text after the last "/" is considered. The text after its last "." is returned for
     * the first ID where the dot is not the first character, the extension is not empty and the extension
     * including the dot is no longer than MAX_EXTENSION_LENGTH.
     *
     * @param altIds space separated alternate IDs, may be null
     * @return the extension without its dot, or null when no ID yields one
     */
    private static String extractExtension(String altIds) {
        if (altIds == null) {
            return null;
        }
        for (String altid : altIds.split(" ")) {
            if (altid.length() == 0) {
                continue;
            }
            // Narrow file name down to after the last /, including a slash in first position
            String name = altid.substring(altid.lastIndexOf('/') + 1);
            int dot = name.lastIndexOf('.');
            if (dot > 0 && dot < name.length() - 1 && name.length() - dot <= MAX_EXTENSION_LENGTH) {
                return name.substring(dot + 1);
            }
        }
        return null;
    }

    /**
     * Executes the fitsextract irods script for a datastream and parses the XML it prints.
     *
     * Output lines preceding the first line that starts with the XML declaration are treated as warnings and
     * logged; everything from the declaration onwards is parsed as the FITS document.
     *
     * @param dsIrodsPath
     *           iRODS path of the datastream content
     * @param altIds
     *           ALT_IDS of the datastream, used to pass a file name carrying the original extension to the
     *           script; may be null
     * @return FITS output XML Document
     * @throws JDOMException
     *            if the script output cannot be parsed as XML
     * @throws Exception
     *            on any other failure, such as an I/O error or an undecodable extension
     */
    private Document runFITS(String dsIrodsPath, String altIds) throws Exception {
        // try to extract file name from ALT_ID
        String extension = extractExtension(altIds);
        String filename = extension == null ? null : URIUtil.decode("linkedfile." + extension);

        // execute FITS
        LOG.debug("Run fits for {}", dsIrodsPath);
        AbstractIrodsObjectEnhancementService irodsService = (AbstractIrodsObjectEnhancementService) service;
        BufferedReader reader = null;
        String xmlstr = null;
        String errstr = null;
        try {
            if (filename == null) {
                reader = new BufferedReader(new InputStreamReader(
                        irodsService.remoteExecuteWithPhysicalLocation("fitsextract", dsIrodsPath)));
            } else {
                // NOTE(review): the decoded extension comes from ALT_IDS and is only wrapped in single quotes;
                // confirm the remote script cannot be broken by an extension containing a quote
                reader = new BufferedReader(new InputStreamReader(
                        irodsService.remoteExecuteWithPhysicalLocation("fitsextract", "'" + filename + "'",
                                dsIrodsPath)));
            }

            StringBuilder xml = new StringBuilder();
            StringBuilder err = new StringBuilder();
            boolean declareReached = false;
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (!declareReached && line.startsWith("<?xml")) {
                    declareReached = true;
                }
                if (declareReached) {
                    xml.append(line).append("\n");
                } else if (line.trim().length() > 0) {
                    err.append(line).append("\n");
                }
            }
            xmlstr = xml.toString();
            errstr = err.toString();
            if (errstr.length() > 0) {
                LOG.warn("FITS is warning for path: {}", dsIrodsPath);
                LOG.info(errstr);
            }
            return new SAXBuilder().build(new StringReader(xmlstr));
        } catch (JDOMException e) {
            LOG.warn("Failed to parse FITS output for path: {}", dsIrodsPath);
            LOG.info("FITS returned: \n" + xmlstr + "\n\n" + errstr);
            throw e;
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
}