/**
* Copyright 2008 The University of North Carolina at Chapel Hill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.unc.lib.deposit.normalize;
import static edu.unc.lib.deposit.work.DepositGraphUtils.cdrprop;
import static edu.unc.lib.deposit.work.DepositGraphUtils.dprop;
import static edu.unc.lib.dl.util.ContentModelHelper.CDRProperty.hasSourceMetadataProfile;
import static edu.unc.lib.dl.util.ContentModelHelper.CDRProperty.sourceMetadata;
import static edu.unc.lib.dl.util.ContentModelHelper.Datastream.MD_SOURCE;
import static edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship.hasDatastream;
import static edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship.mimetype;
import static edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship.stagingLocation;
import static edu.unc.lib.dl.util.MetadataProfileConstants.BIOMED_ARTICLE;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.EPDCX_NS;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.METS_NS;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import org.jdom2.Attribute;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.filter.Filters;
import org.jdom2.input.SAXBuilder;
import org.jdom2.input.sax.XMLReaders;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.jdom2.transform.JDOMResult;
import org.jdom2.transform.JDOMSource;
import org.jdom2.xpath.XPathExpression;
import org.jdom2.xpath.XPathFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.hp.hpl.jena.rdf.model.Bag;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.NodeIterator;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import edu.unc.lib.dl.fedora.PID;
import edu.unc.lib.dl.util.ContentModelHelper;
import edu.unc.lib.dl.util.ContentModelHelper.CDRProperty;
import edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship;
import edu.unc.lib.dl.util.DepositConstants;
import edu.unc.lib.dl.util.PackagingType;
import edu.unc.lib.dl.util.PremisEventLogger.Type;
import edu.unc.lib.dl.xml.JDOMNamespaceUtil;
import edu.unc.lib.dl.xml.METSProfile;
/**
* @author bbpennel
* @date Oct 28, 2015
*/
public class BioMedToN3BagJob extends AbstractMETS2N3BagJob {
private static final Logger log = LoggerFactory.getLogger(BioMedToN3BagJob.class);
private static final String fLocatHrefPath =
"/m:mets/m:fileSec/m:fileGrp/m:file[@ID = '%s']/m:FLocat/@xlink:href";
private static final Pattern mainArticlePattern = Pattern.compile(".*\\_Article\\_.*\\.[pP][dD][fF]");
private Transformer epdcx2modsTransformer = null;
public BioMedToN3BagJob(String uuid, String depositUUID) {
super(uuid, depositUUID);
}
public Transformer getEpdcx2modsTransformer() {
return epdcx2modsTransformer;
}
public void setEpdcx2modsTransformer(Transformer epdcx2modsTransformer) {
this.epdcx2modsTransformer = epdcx2modsTransformer;
}
@Override
public void runJob() {
validateMETS();
// Store a reference to the manifest file
addManifestURI();
validateProfile(METSProfile.DSPACE_SIP);
Document mets = loadMETS();
assignPIDs(mets); // assign any missing PIDs
saveMETS(mets); // manifest updated to have record of all PIDs
Model model = getWritableModel();
METSHelper helper = new METSHelper(mets);
// deposit RDF bag
Bag top = model.createBag(getDepositPID().getURI().toString());
// add aggregate work bag
Element aggregateEl = helper.mets.getRootElement().getChild("structMap", METS_NS).getChild("div", METS_NS);
List<Element> topChildren = new ArrayList<>();
String metadataFileName = retrieveChildrenMinusMetadata(aggregateEl, helper.mets, topChildren);
Resource rootResource = constructResources(model, aggregateEl, topChildren, helper);
top.add(rootResource);
if (topChildren.size() > 1) {
setDefaultWebObject(model, model.getBag(rootResource));
}
extractEPDCX(helper.mets, rootResource);
try {
addSourceMetadata(model, rootResource, metadataFileName);
} catch (JDOMException | IOException e) {
failJob(e, "Failed to add source metadata.");
}
recordDepositEvent(Type.NORMALIZATION, "Normalized deposit package from {0} to {1}", PackagingType.METS_DSPACE_SIP_1.getUri(), PackagingType.BAG_WITH_N3.getUri());
}
private String retrieveChildrenMinusMetadata(Element aggregateEl, Document mets, List<Element> topChildren) {
XPathFactory xFactory = XPathFactory.instance();
String metadataFileName = null;
// Get the list of children minus the metadata document if it exists
for (Element child : aggregateEl.getChildren("div", METS_NS)) {
// Detect the metadata file if it has not already been located
if (metadataFileName == null) {
// Find the filename for current div
String fileId = child.getChild("fptr", METS_NS).getAttributeValue("FILEID");
XPathExpression<Attribute> xPath = xFactory.compile(String.format(fLocatHrefPath, fileId),
Filters.attribute(), null, METS_NS, JDOMNamespaceUtil.XLINK_NS);
String fileName = xPath.evaluateFirst(mets).getValue();
// Is it the metadata document?
if (fileName.endsWith(".xml.Meta")) {
// Capture reference to the xml document
metadataFileName = fileName;
continue;
}
}
// Add all other children to the list
topChildren.add(child);
}
return metadataFileName;
}
private Resource constructResources(Model model, Element aggregateEl, List<Element> topChildren, METSHelper helper) {
Property hasModel = model.createProperty(ContentModelHelper.FedoraProperty.hasModel.getURI().toString());
Property fileLocation = model.createProperty(DepositRelationship.stagingLocation.toString());
if (topChildren.size() == 1) {
Resource rootResource = model.createResource(METSHelper.getPIDURI(topChildren.get(0)));
model.add(rootResource, hasModel, model.createResource(ContentModelHelper.Model.SIMPLE.getURI().toString()));
helper.addFileAssociations(model, true);
// Move properties for data to the root resource
String location = rootResource.getProperty(fileLocation).getString();
String filename = location.substring("data/".length()).toLowerCase();
model.add(rootResource, dprop(model, DepositRelationship.label), filename);
return rootResource;
}
Bag rootObject = model.createBag(METSHelper.getPIDURI(aggregateEl));
model.add(rootObject, hasModel, model.createResource(ContentModelHelper.Model.CONTAINER.getURI().toString()));
model.add(rootObject, hasModel, model.createResource(ContentModelHelper.Model.AGGREGATE_WORK.getURI().toString()));
for (Element childEl : topChildren) {
Resource child = model.createResource(METSHelper.getPIDURI(childEl));
rootObject.add(child);
}
helper.addFileAssociations(model, true);
// Add labels to aggregate children
NodeIterator children = rootObject.iterator();
try {
while (children.hasNext()) {
Resource child = children.nextNode().asResource();
String location = child.getProperty(fileLocation).getString();
String filename = location.substring("data/".length()).toLowerCase();
model.add(child, dprop(model, DepositRelationship.label), filename);
}
} finally {
children.close();
}
return rootObject;
}
private void extractEPDCX(Document mets, Resource rootResource) {
// extract EPDCX from mets
FileOutputStream fos = null;
try {
Element epdcxEl = mets.getRootElement().getChild("dmdSec", METS_NS).getChild("mdWrap", METS_NS)
.getChild("xmlData", METS_NS).getChild("descriptionSet", EPDCX_NS);
JDOMResult mods = new JDOMResult();
epdcx2modsTransformer.transform(new JDOMSource(epdcxEl), mods);
final File modsFolder = getDescriptionDir();
modsFolder.mkdir();
File modsFile = new File(modsFolder, new PID(rootResource.getURI()).getUUID()+".xml");
fos = new FileOutputStream(modsFile);
new XMLOutputter(Format.getPrettyFormat()).output(mods.getDocument(), fos);
} catch(NullPointerException ignored) {
log.debug("NPE", ignored);
// no embedded metadata
} catch (TransformerException | IOException e) {
failJob(e, "Failed during transform of EPDCX to MODS.");
}
}
private void addSourceMetadata(Model model, Resource rootResource, String metadataFileName)
throws JDOMException, IOException {
if (metadataFileName == null) {
return;
}
PID sourceMDPID = new PID(rootResource.getURI() + "/" + MD_SOURCE.getName());
Resource sourceMDResource = model.createResource(sourceMDPID.getURI());
model.add(rootResource, dprop(model, hasDatastream), sourceMDResource);
model.add(rootResource, cdrprop(model, sourceMetadata), sourceMDResource);
model.add(sourceMDResource, dprop(model, stagingLocation),
this.getDataDirectory().getName() + "/" + metadataFileName);
model.add(rootResource, cdrprop(model, hasSourceMetadataProfile), BIOMED_ARTICLE);
model.add(sourceMDResource, dprop(model, mimetype), "text/xml");
File modsFile = new File(getDescriptionDir(), new PID(rootResource.getURI()).getUUID() + ".xml");
SAXBuilder sb = new SAXBuilder(XMLReaders.NONVALIDATING);
sb.setFeature("http://xml.org/sax/features/validation", false);
sb.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
sb.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
Document existingModsDocument = null;
// Start from an existing MODS document if there is one
if (modsFile.exists()) {
existingModsDocument = sb.build(modsFile);
} else {
// Make sure the description directory exists since there was no MODS doc
File descriptionDir = new File(getDepositDirectory(), DepositConstants.DESCRIPTION_DIR);
if (!descriptionDir.exists())
descriptionDir.mkdir();
}
Document metadataDocument = sb.build(new File(this.getDataDirectory(), metadataFileName));
BioMedArticleHelper biohelper = new BioMedArticleHelper();
Document mods = biohelper.extractMODS(metadataDocument, existingModsDocument);
// Output the new MODS file, overwriting the existing one if it was present
try (FileOutputStream out = new FileOutputStream(modsFile, false)) {
new XMLOutputter(Format.getPrettyFormat()).output(mods, out);
}
}
private void setDefaultWebObject(Model model, Bag rootObject) {
Property fileLocation = model.createProperty(ContentModelHelper.DepositRelationship.stagingLocation.toString());
NodeIterator children = rootObject.iterator();
try {
// Find the main article file
while(children.hasNext()) {
Resource child = children.nextNode().asResource();
String location = child.getProperty(fileLocation).getString();
// filename will be the article ID, but not XML
if (!mainArticlePattern.matcher(location).matches()) {
continue;
}
log.debug("Found primary Biomed content document {}", location);
// If this is a main object, then designate it as a default web object for its parent container
Property defaultObject = model.getProperty(CDRProperty.defaultWebObject.getURI().toString());
model.add(rootObject, defaultObject, child);
return;
}
} finally {
children.close();
}
}
}