/**
* Copyright 2008 The University of North Carolina at Chapel Hill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.unc.lib.deposit.normalize;
import static edu.unc.lib.deposit.work.DepositGraphUtils.cdrprop;
import static edu.unc.lib.deposit.work.DepositGraphUtils.dprop;
import static edu.unc.lib.deposit.work.DepositGraphUtils.fprop;
import static edu.unc.lib.dl.util.ContentModelHelper.CDRProperty.dateCreated;
import static edu.unc.lib.dl.util.ContentModelHelper.CDRProperty.defaultWebObject;
import static edu.unc.lib.dl.util.ContentModelHelper.CDRProperty.embargoUntil;
import static edu.unc.lib.dl.util.ContentModelHelper.CDRProperty.hasSourceMetadataProfile;
import static edu.unc.lib.dl.util.ContentModelHelper.CDRProperty.sourceMetadata;
import static edu.unc.lib.dl.util.ContentModelHelper.Datastream.MD_SOURCE;
import static edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship.hasDatastream;
import static edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship.label;
import static edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship.mimetype;
import static edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship.stagingLocation;
import static edu.unc.lib.dl.util.ContentModelHelper.FedoraProperty.hasModel;
import static edu.unc.lib.dl.util.ContentModelHelper.Model.AGGREGATE_WORK;
import static edu.unc.lib.dl.util.ContentModelHelper.Model.CONTAINER;
import static edu.unc.lib.dl.util.MetadataProfileConstants.PROQUEST_ETD;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.MODS_V3_NS;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.UUID;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.jdom2.transform.JDOMResult;
import org.jdom2.transform.JDOMSource;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.util.UriUtils;
import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.rdf.model.Bag;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import edu.unc.lib.deposit.work.AbstractDepositJob;
import edu.unc.lib.dl.fedora.PID;
import edu.unc.lib.dl.util.DateTimeUtil;
import edu.unc.lib.dl.util.PackagingType;
import edu.unc.lib.dl.util.PremisEventLogger.Type;
import edu.unc.lib.dl.util.ZipFileUtil;
/**
* Normalizes a Proquest ETD deposit object into an N3 deposit structure.
*
* Expects to receive a deposit directory with the data directory containing the already expanded contents of the
* Proquest package.
*
* @author bbpennel
* @date Apr 23, 2014
*/
public class Proquest2N3BagJob extends AbstractDepositJob {
private static final Logger log = LoggerFactory.getLogger(Proquest2N3BagJob.class);
public static final String DATA_SUFFIX = "_DATA.xml";
private Transformer proquest2ModsTransformer = null;
public Proquest2N3BagJob() {
}
public Proquest2N3BagJob(String uuid, String depositUUID) {
super(uuid, depositUUID);
}
@Override
public void runJob() {
unzipPackages();
// deposit RDF bag
Model model = getWritableModel();
Bag depositBag = model.createBag(getDepositPID().getURI().toString());
File[] packageDirs = this.getDataDirectory().listFiles();
for (File packageDir : packageDirs) {
if (packageDir.isDirectory()) {
normalizePackage(packageDir, model, depositBag);
}
}
// Add normalization event to deposit record
recordDepositEvent(Type.NORMALIZATION, "Normalized deposit package from {0} to {1}",
PackagingType.PROQUEST_ETD.getUri(), PackagingType.BAG_WITH_N3.getUri());
}
private void normalizePackage(File packageDir, Model model, Bag depositBag) {
// Generate a uuid for the main object
PID primaryPID = new PID("uuid:" + UUID.randomUUID());
Resource primaryResource;
// Identify the important files from the deposit
File dataFile = null, contentFile = null, attachmentDir = null;
File[] files = packageDir.listFiles();
for (File file : files) {
if (file.isDirectory()) {
attachmentDir = file;
} else if (file.getName().endsWith(DATA_SUFFIX)) {
dataFile = file;
} else {
contentFile = file;
}
}
long lastModified = -1;
File zipFile = new File(packageDir.getAbsolutePath() + ".zip");
try (ZipFile zip = new ZipFile(zipFile)) {
ZipArchiveEntry entry = zip.getEntry(contentFile.getName());
lastModified = entry.getTime();
} catch (IOException e) {
log.error("Failed to read zip file located at {}.zip", packageDir.getAbsolutePath(), e);
}
if (lastModified == -1) {
lastModified = zipFile.lastModified();
}
DateTime modified = new DateTime(lastModified, DateTimeZone.UTC);
// Deserialize the data document
SAXBuilder builder = new SAXBuilder();
Element dataRoot = null;
Document mods = null;
try {
Document dataDocument = builder.build(dataFile);
dataRoot = dataDocument.getRootElement();
// Transform the data into MODS and store it to its final resting place
mods = extractMods(primaryPID, dataRoot, modified);
} catch (TransformerException e) {
failJob(e, "Failed to transform metadata to MODS.");
} catch (Exception e) {
failJob(e, "Unable to deserialize the metadata file.");
}
// Detect if there are any attachments
List<?> attachmentElements = dataRoot.getChild("DISS_content").getChildren("DISS_attachment");
if (attachmentElements == null || attachmentElements.size() == 0) {
// Simple object with the content as its source data
primaryResource = populateSimple(model, primaryPID, contentFile);
} else {
String title = mods.getRootElement().getChild("titleInfo", MODS_V3_NS).getChildText("title", MODS_V3_NS);
// Has attachments, so it is an aggregate
primaryResource = populateAggregate(model, primaryPID, attachmentElements, attachmentDir, contentFile, title);
}
// Store primary resource as child of the deposit
depositBag.add(primaryResource);
// Add the data file as a metadata datastream of the primary object
setSourceMetadata(model, primaryResource, dataFile);
// Capture other metadata, like embargoes
setEmbargoUntil(model, primaryResource, dataRoot);
// Creation date for the content file
model.add(primaryResource, cdrprop(model, dateCreated), modified.toString(), XSDDatatype.XSDdateTime);
}
private void unzipPackages() {
File dataDirectory = this.getDataDirectory();
File zipFiles[] = dataDirectory.listFiles(new FilenameFilter() {
@Override
public boolean accept(File directory, String fileName) {
return fileName.endsWith(".zip");
}
});
for (File packageFile : zipFiles) {
try {
String packageDir = packageFile.getName();
packageDir = packageDir.substring(0, packageDir.length() - 4);
ZipFileUtil.unzipToDir(packageFile, new File(dataDirectory, packageDir));
} catch (IOException e) {
throw new Error("Unable to unpack your deposit: " + getDepositPID().getUUID(), e);
}
}
}
/**
* Transform the given root element from the data document into MODS and stores it as the metadata for the object
* being ingested
*
* @param primaryPID
* @param dataRoot
* @param modified
* @throws TransformerException
* @throws FileNotFoundException
* @throws IOException
*/
private Document extractMods(PID primaryPID, Element dataRoot, DateTime modified) throws TransformerException,
FileNotFoundException,
IOException {
int month = modified.getMonthOfYear();
String gradSemester;
if (month >= 2 && month <= 6) {
gradSemester = "Spring";
} else if (month >= 7 && month <= 9) {
gradSemester = "Summer";
} else {
gradSemester = "Winter";
}
JDOMResult mods = new JDOMResult();
// Transform the metadata into MODS
synchronized (proquest2ModsTransformer) {
proquest2ModsTransformer.setParameter("graduationSemester", gradSemester + " " + modified.getYear());
proquest2ModsTransformer.transform(new JDOMSource(dataRoot), mods);
}
// Create the description folder and write the MODS out to it
final File modsFolder = getDescriptionDir();
modsFolder.mkdir();
File modsFile = new File(modsFolder, primaryPID.getUUID() + ".xml");
try (FileOutputStream fos = new FileOutputStream(modsFile)) {
new XMLOutputter(Format.getPrettyFormat()).output(mods.getDocument(), fos);
}
return mods.getDocument();
}
private Resource populateSimple(Model model, PID primaryPID, File contentFile) {
// Create the primary resource as a simple resource
Resource primaryResource = model.createResource(primaryPID.getURI());
// use the filename as the label
model.add(primaryResource, dprop(model, label), contentFile.getName());
// Reference the content file as the data file
model.add(primaryResource, dprop(model, stagingLocation), getRelativePath(contentFile));
return primaryResource;
}
private Resource populateAggregate(Model model, PID primaryPID, List<?> attachmentElements, File attachmentDir,
File contentFile, String title) {
Property labelP = dprop(model, label);
Property fileLocation = dprop(model, stagingLocation);
Property defaultWebObjectP = cdrprop(model, defaultWebObject);
Property hasModelP = fprop(model, hasModel);
// Create the primary resource as a bag
Bag primaryBag = model.createBag(primaryPID.getURI());
model.add(primaryBag, hasModelP, model.createResource(CONTAINER.getURI().toString()));
model.add(primaryBag, hasModelP, model.createResource(AGGREGATE_WORK.getURI().toString()));
// Assign title to the main object as a label
if (title.length() > 128)
model.add(primaryBag, labelP, title.substring(0, 128));
else
model.add(primaryBag, labelP, title);
// Create default web object child entry for the main document
PID defaultObjectPID = new PID("uuid:" + UUID.randomUUID());
Resource defaultObjectResource = model.createResource(defaultObjectPID.getURI());
primaryBag.add(defaultObjectResource);
// Store the main content on the child
model.add(defaultObjectResource, labelP, contentFile.getName());
model.add(defaultObjectResource, fileLocation, getRelativePath(contentFile));
// Store reference to content as the default web object
model.add(primaryBag, defaultWebObjectP, defaultObjectResource);
// Add the attachments as supplemental files
for (Object attachmentObj : attachmentElements) {
Element attachEl = (Element) attachmentObj;
String filename = attachEl.getChildText("DISS_file_name");
String description = attachEl.getChildText("DISS_file_descr");
// Make the child entry with a new uuid
PID pid = new PID("uuid:" + UUID.randomUUID());
Resource child = model.createResource(pid.getURI());
primaryBag.add(child);
// Use the description as a label if one was provided
if (description != null && description.trim().length() > 0)
model.add(child, labelP, description);
else
model.add(child, labelP, filename);
// Link the file to the child entry
model.add(child, fileLocation, getRelativePath(new File(attachmentDir, filename)));
}
return primaryBag;
}
private void setSourceMetadata(Model model, Resource primaryResource, File dataFile) {
// Add the data file as a metadata datastream of the primary object
PID sourceMDPID = new PID(primaryResource.getURI() + "/" + MD_SOURCE.getName());
Resource sourceMDResource = model.createResource(sourceMDPID.getURI());
model.add(primaryResource, dprop(model, hasDatastream), sourceMDResource);
model.add(primaryResource, cdrprop(model, sourceMetadata), sourceMDResource);
model.add(sourceMDResource, dprop(model, stagingLocation), getRelativePath(dataFile));
model.add(primaryResource, cdrprop(model, hasSourceMetadataProfile), PROQUEST_ETD);
model.add(sourceMDResource, dprop(model, mimetype), "text/xml");
}
private void setEmbargoUntil(Model model, Resource primaryResource, Element dataRoot) {
String embargoCode = dataRoot.getAttributeValue("embargo_code");
if (embargoCode != null) {
DateTime currentDate = new DateTime();
// Get the completion year and create a date time out of the end of the year, to make the most generous embargo possible
String compDateString = dataRoot.getChild("DISS_description").getChild("DISS_dates").getChildText("DISS_comp_date");
DateTime compDate = new DateTime(Integer.parseInt(compDateString), 12, 31, 0, 0, 0, 0);
// Embargo start time is the lowest of either the current date or the completion date
DateTime embargoEnd = currentDate.compareTo(compDate) < 0? currentDate : compDate;
if ("2".equals(embargoCode))
embargoEnd = embargoEnd.plusYears(1);
else if ("3".equals(embargoCode) || "4".equals(embargoCode))
embargoEnd = embargoEnd.plusYears(2);
else
embargoEnd = null;
// If the embargo end date isn't coming from comp_date then make sure it hasn't already expired
if (embargoEnd != null && embargoEnd != currentDate && embargoEnd.compareTo(currentDate) < 0) {
// Embargo has already expired, no need to set it
embargoEnd = null;
}
// Add the embargo end date as a triple
if (embargoEnd != null) {
model.add(primaryResource, cdrprop(model, embargoUntil),
DateTimeUtil.utcYMDFormatter.print(embargoEnd) + "T00:00:00",
XSDDatatype.XSDdateTime);
}
}
}
private String getRelativePath(File file) {
try {
return UriUtils.encodePath(
file.getAbsolutePath().substring(getDepositDirectory().getAbsolutePath().length() + 1), "UTF-8");
} catch (UnsupportedEncodingException e) {
log.error("Failed to encode file path", e);
return null;
}
}
public void setProquest2ModsTransformer(Transformer proquest2ModsTransformer) {
this.proquest2ModsTransformer = proquest2ModsTransformer;
}
}