package edu.unc.lib.deposit.fcrepo3;
import static edu.unc.lib.deposit.work.DepositGraphUtils.dprop;
import static edu.unc.lib.dl.util.ContentModelHelper.Datastream.DATA_FILE;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.ConnectException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.XMLOutputter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.ws.client.WebServiceTransportException;
import com.hp.hpl.jena.rdf.model.Bag;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import edu.unc.lib.deposit.work.AbstractDepositJob;
import edu.unc.lib.deposit.work.DepositGraphUtils;
import edu.unc.lib.dl.acl.util.AccessGroupSet;
import edu.unc.lib.dl.acl.util.GroupsThreadStore;
import edu.unc.lib.dl.fedora.AccessClient;
import edu.unc.lib.dl.fedora.FedoraException;
import edu.unc.lib.dl.fedora.FedoraTimeoutException;
import edu.unc.lib.dl.fedora.JobForwardingJMSListener;
import edu.unc.lib.dl.fedora.ListenerJob;
import edu.unc.lib.dl.fedora.ManagementClient;
import edu.unc.lib.dl.fedora.ManagementClient.Format;
import edu.unc.lib.dl.fedora.ObjectExistsException;
import edu.unc.lib.dl.fedora.ObjectIntegrityException;
import edu.unc.lib.dl.fedora.PID;
import edu.unc.lib.dl.fedora.ServiceException;
import edu.unc.lib.dl.fedora.types.Datastream;
import edu.unc.lib.dl.ingest.IngestException;
import edu.unc.lib.dl.reporting.ActivityMetricsClient;
import edu.unc.lib.dl.services.DigitalObjectManager;
import edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship;
import edu.unc.lib.dl.util.ContentModelHelper.Relationship;
import edu.unc.lib.dl.util.DepositConstants;
import edu.unc.lib.dl.util.DepositException;
import edu.unc.lib.dl.util.DepositStatusFactory;
import edu.unc.lib.dl.util.JMSMessageUtil;
import edu.unc.lib.dl.util.JMSMessageUtil.FedoraActions;
import edu.unc.lib.dl.util.PremisEventLogger;
import edu.unc.lib.dl.util.RedisWorkerConstants.DepositField;
import edu.unc.lib.dl.util.TripleStoreQueryService;
import edu.unc.lib.dl.xml.FOXMLJDOMUtil;
/**
* Ingests the contents of the deposit into the Fedora repository, along with a deposit record. Also performs updates to
* the destination container.
*
* @author bbpennel
* @author count0
*
*/
public class IngestDeposit extends AbstractDepositJob implements ListenerJob {
private static final Logger log = LoggerFactory.getLogger(IngestDeposit.class);
// Delay passed to Thread.sleep between checks for ingests that are still awaiting confirmation
private static long COMPLETE_CHECK_DELAY = 500L;
// Delay passed to Thread.sleep between checks for whether the repository has become available again
private static long CONNECT_EXCEPTION_DELAY = 30000L;
// JMS listener this job registers itself with while running, so that onEvent receives messages
@Autowired
private JobForwardingJMSListener listener;
// Used to upload files, ingest FOXML, read datastream details, write PREMIS events and check availability
@Autowired
private ManagementClient client;
// Used to add top level objects of the deposit to the destination container
@Autowired
private DigitalObjectManager digitalObjectManager;
// Used when resuming to check whether an uploaded but unconfirmed object is present in the repository
@Autowired
private AccessClient accessClient;
// Used to look up the original deposit recorded for an object that already exists
@Autowired
private TripleStoreQueryService tsqs;
// Receives the sizes of uploaded files and ingested FOXML as throughput metrics
@Autowired
private ActivityMetricsClient metricsClient;
// Number of objects being ingested, excluding the deposit record; reduced when a deposit is resumed
private int ingestObjectCount;
// URIs of the objects still to be ingested, in ingest order
private Queue<String> ingestPids;
// URIs of objects submitted for ingest for which no confirmation has been received yet.
// Created as a synchronized set; entries are removed by onEvent as well as by the ingest loop.
private Collection<String> ingestsAwaitingConfirmation;
// URIs of the objects which get added to the destination container
private List<String> topLevelPids;
// Container the deposit is being ingested into
private PID destinationPID;
// When true, the deposit record is not added to the queue of objects to ingest
private boolean excludeDepositRecord;
// Directory inside the deposit directory holding one FOXML file per object
private File foxmlDirectory;
// Deposit status fields, read once at the start of runJob
private Map<String, String> depositStatus;
/**
 * Creates a job through the no-argument constructor of the superclass.
 */
public IngestDeposit() {
    // the superclass no-argument constructor is invoked implicitly
}

/**
 * Creates a job, handing both identifiers on to the superclass constructor.
 *
 * @param uuid first identifier passed to the superclass
 * @param depositUUID second identifier passed to the superclass
 */
public IngestDeposit(String uuid, String depositUUID) {
    super(uuid, depositUUID);
}
/**
 * Callback invoked for Fedora messages. Messages for any action other than ingest are ignored. When the ingested
 * object is one this job is waiting on, it is taken off the awaiting-confirmation collection, one click of
 * progress is added and the pid is recorded as confirmed for the deposit.
 *
 * @param message Fedora APIM message
 */
@Override
public void onEvent(Document message) {
    String eventAction = JMSMessageUtil.getAction(message);
    if (FedoraActions.INGEST.getName().equals(eventAction)) {
        PID ingestedPid = new PID(JMSMessageUtil.getPid(message));
        // remove() only reports true for objects that belong to this job and are still pending
        if (ingestsAwaitingConfirmation.remove(ingestedPid.getURI())) {
            addClicks(1);
            getDepositStatusFactory().addConfirmedPID(getDepositUUID(), ingestedPid.getPid());
            log.debug("Notified that {} has finished ingesting as part of deposit {}",
                    ingestedPid.getPid(), this.getDepositUUID());
        }
    }
}
/**
 * Processes the structure of the deposit, storing the number of actions involved and retrieving a list of pids in
 * the correct order for ingest.
 * <p>
 * Initializes the FOXML directory, destination container, ingest queue, top level pid list and the collection of
 * ingests awaiting confirmation. When the deposit is being resumed, objects that were already ingested are taken
 * off the queue. The deposit model is always closed before this method exits, including when it fails.
 */
private void processDepositStructure() {
    // Store reference to the foxml directory
    foxmlDirectory = new File(getDepositDirectory(), DepositConstants.FOXML_DIR);
    // Retrieve the pid of the container this object is being ingested to
    destinationPID = new PID(depositStatus.get(DepositField.containerId.name()));
    excludeDepositRecord = Boolean.parseBoolean(depositStatus.get(DepositField.excludeDepositRecord.name()));

    Model model = getReadOnlyModel();
    try {
        ingestPids = new ArrayDeque<String>();
        topLevelPids = new ArrayList<String>();
        // Synchronized because entries are also removed from the listener callback
        ingestsAwaitingConfirmation = Collections.synchronizedSet(new HashSet<String>());

        String depositPid = getDepositPID().getURI();
        Bag depositBag = model.getBag(depositPid);

        // Capture number of objects and depth first list of pids for individual objects to be ingested
        // NOTE(review): the boolean presumably selects a recursive walk - confirm against DepositGraphUtils
        DepositGraphUtils.walkChildrenDepthFirst(depositBag, ingestPids, true);

        // Store the number of objects being ingested, excluding the deposit record
        ingestObjectCount = ingestPids.size();

        // Add the deposit pid to the list
        if (!excludeDepositRecord) {
            ingestPids.add(depositPid);
        }

        // Number of actions is the number of ingest objects plus deposit record
        setTotalClicks(ingestPids.size());

        // Deposit is restarting from part way through, reduce set of items for ingest
        boolean resuming = getDepositStatusFactory().isResumedDeposit(getDepositUUID());
        if (resuming) {
            ingestObjectCount -= removeAlreadyIngested();
        }

        // Capture the top level pids
        DepositGraphUtils.walkChildrenDepthFirst(depositBag, topLevelPids, false);
    } finally {
        // Release the model even when walking the deposit or checking the repository fails
        closeModel();
    }
    // TODO capture structure for ordered sequences instead of just bags
}
/**
 * Removes any pids confirmed, or uploaded and present in fedora, from the list of pids for ingest.
 * <p>
 * Uploaded objects without a confirmation are looked up in the repository; those found there are marked as
 * confirmed and one click of progress is added for each.
 *
 * @return number of objects actually taken off the ingest queue, not counting the deposit record. The caller
 *         subtracts this from a count which does not include the deposit record either.
 */
private int removeAlreadyIngested() {
    int numberRemoved = 0;
    String depositRecordUri = getDepositPID().getURI();

    // Prevent reingest of all items already confirmed to have been ingested.
    Set<String> confirmedSet = getDepositStatusFactory().getConfirmedUploads(getDepositUUID());
    for (String confirmed : confirmedSet) {
        String confirmedUri = new PID(confirmed).getURI();
        // Only count entries that really were queued, and never the deposit record
        if (ingestPids.remove(confirmedUri) && !depositRecordUri.equals(confirmedUri)) {
            numberRemoved++;
        }
    }

    // Check for any items that were uploaded but not confirmed, and check to see if they made it in
    Set<String> unconfirmedSet = getDepositStatusFactory().getUnconfirmedUploads(getDepositUUID());
    for (String unconfirmed : unconfirmedSet) {
        PID unconfirmedPID = new PID(unconfirmed);
        try {
            if (accessClient.getObjectProfile(unconfirmedPID, null) != null) {
                String unconfirmedUri = unconfirmedPID.getURI();
                boolean wasQueued = ingestPids.remove(unconfirmedUri);
                // Update status to indicate this item was actually confirmed
                addClicks(1);
                getDepositStatusFactory().addConfirmedPID(getDepositUUID(), unconfirmedPID.getPid());
                if (wasQueued && !depositRecordUri.equals(unconfirmedUri)) {
                    numberRemoved++;
                }
            }
        } catch (FedoraException e) {
            // Object wasn't found, so ingest must have failed. Should be retained for ingest
            log.debug("Object {} is not present in the repository, retaining it for ingest", unconfirmedPID, e);
        } catch (ServiceException e) {
            log.debug("Unexpected failure while checking for ingest of {}", unconfirmedPID, e);
        }
    }
    return numberRemoved;
}
/**
 * Ingests every queued object of the deposit, waits until no ingest is awaiting confirmation any more and then
 * records an ingestion event on the destination container.
 * <p>
 * The depositor's groups and username are stored in {@link GroupsThreadStore} and the job is registered with the
 * JMS listener while it runs; both are undone when this method exits, whether it completed or not. A
 * {@link DepositException} raised while ingesting fails the job. If the thread is interrupted while waiting for
 * confirmations, the interrupt flag is restored and the method returns without updating the destination.
 */
@Override
public void runJob() {
    depositStatus = getDepositStatus();
    try {
        // set up permission groups for forwarding
        String groups = depositStatus.get(DepositField.permissionGroups.name());
        AccessGroupSet ags = new AccessGroupSet(groups);
        GroupsThreadStore.storeGroups(ags);
        GroupsThreadStore.storeUsername(depositStatus.get(DepositField.depositorName.name()));

        // For a resubmitted deposit, assume that an "object exists" exception is confirmation
        // that the object exists, rather than an error.
        boolean confirmExisting = Boolean.parseBoolean(depositStatus.get(DepositField.isResubmit.name()));

        // Extract information about structure of the deposit
        processDepositStructure();

        // Register this job with the JMS listener prior to doing work
        listener.registerListener(this);

        DepositStatusFactory statusFactory = getDepositStatusFactory();

        // Begin ingest of individual objects in the deposit
        String ingestPid = null;
        try {
            // Ingest all deposit objects and record, start listening for them
            while ((ingestPid = ingestPids.poll()) != null) {
                addTopLevelToContainer(ingestPid);
                // Register pid as needing ingest confirmation before submitting it, so that a
                // notification arriving right away is not missed
                ingestsAwaitingConfirmation.add(ingestPid);
                ingestObject(ingestPid, confirmExisting);
                statusFactory.addUploadedPID(getDepositUUID(), new PID(ingestPid).getPid());
                statusFactory.incrIngestedObjects(getDepositUUID(), 1);
                // Verify that the job has not been interrupted before continuing
                verifyRunning();
            }
        } catch (DepositException e) {
            failJob(e, e.getLocalizedMessage(), ingestPid);
            return;
        }

        // listen to Fedora JMS to see when all objects are ingested
        try {
            while (ingestsAwaitingConfirmation.size() > 0) {
                verifyRunning();
                Thread.sleep(COMPLETE_CHECK_DELAY);
            }
            log.debug("Finished waiting for children of {} to be ingested", this.getDepositUUID());
        } catch (InterruptedException e) {
            // Thread.sleep cleared the flag; set it again so code further up the stack can see the interrupt
            Thread.currentThread().interrupt();
            log.info("Interrupted ingest of job {}", this.getJobUUID());
            return;
        }

        updateDestinationEvents();
    } finally {
        GroupsThreadStore.clearGroups();
        GroupsThreadStore.clearUsername();
        // Unregister self from the jms listener
        listener.unregisterListener(this);
    }
}
/**
 * Adds the object to the destination container when its pid is one of the top level pids of the deposit; does
 * nothing for any other pid. The call is repeated after a lost connection to the repository has been restored.
 *
 * @param pid URI of the object about to be ingested
 * @throws DepositException if the repository or ingest service rejects the addition
 */
private void addTopLevelToContainer(String pid) throws DepositException {
    if (topLevelPids.contains(pid)) {
        boolean added = false;
        while (!added) {
            try {
                digitalObjectManager.addChildrenToContainer(destinationPID, Arrays.asList(new PID(pid)));
                added = true;
            } catch (FedoraTimeoutException timeout) {
                // Timeouts are passed on untouched instead of being treated as a lost connection
                throw timeout;
            } catch (FedoraException | IngestException failure) {
                throw new DepositException("Failed to add object " + pid + " to destination " + destinationPID.getPid(), failure);
            } catch (ServiceException serviceFailure) {
                // Returns once the repository is reachable again so the loop retries; rethrows otherwise
                waitIfConnectionLostOrRethrow(serviceFailure);
            }
        }
    }
}
/**
 * Ingests an object and its referenced files into Fedora.
 * <p>
 * The object's FOXML is read from the FOXML directory, an ingestion event is logged for it, files referenced by
 * the FOXML are uploaded and finally the FOXML itself is ingested. The ingest call is repeated after a lost
 * connection to the repository has been restored. A Fedora timeout during the FOXML ingest is only logged, on
 * the expectation that the confirmation will still arrive.
 * <p>
 * If confirmExisting is true, we will consider an exception from Fedora
 * telling us the object already exists to be confirmation that it is already
 * ingested and remove it from the list of ingests awaiting confirmation. The same applies when the existing
 * object is found to be an acceptable duplicate. Otherwise, we will rethrow such exceptions.
 *
 * @param ingestPid URI of the object to ingest
 * @param confirmExisting whether an already existing object counts as a confirmed ingest
 * @throws DepositException if the FOXML cannot be parsed, the object already exists and is not accepted, the
 *             repository reports an integrity problem, or the ingest fails for any other reason
 */
private void ingestObject(String ingestPid, boolean confirmExisting) throws DepositException {
    PID pid = new PID(ingestPid);
    File foxml = new File(foxmlDirectory, pid.getUUID() + ".xml");

    // Load objects foxml
    SAXBuilder builder = new SAXBuilder();
    Document foxmlDoc;
    try {
        foxmlDoc = builder.build(foxml);
    } catch (Exception e) {
        throw new DepositException("Failed to parse FOXML for object " + pid.getPid() + ".", e);
    }

    // Add ingestion event to PREMIS log
    Element ingestEvent = getEventLog().logEvent(PremisEventLogger.Type.INGESTION, "ingested as PID:" + pid.getPid(),
            pid);
    appendDepositEvent(pid, ingestEvent);

    // Upload files included in this ingest and updates file references
    uploadIngestFiles(foxmlDoc, pid);

    // Ingest the object's FOXML
    try {
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        XMLOutputter xmlOutput = new XMLOutputter();
        xmlOutput.output(foxmlDoc, outputStream);
        // Serialize once: toByteArray() allocates a new copy per call, so it is kept out of the retry loop
        byte[] foxmlBytes = outputStream.toByteArray();

        while (true) {
            try {
                log.debug("Ingesting foxml for {}", ingestPid);
                client.ingestRaw(foxmlBytes, Format.FOXML_1_1, getDepositUUID());

                // Record FOXML throughput metrics
                metricsClient.incrDepositFileThroughput(getDepositUUID(), foxml.length());
                return;
            } catch (ServiceException e) {
                // Returns once the repository is reachable again so the loop retries; rethrows otherwise
                waitIfConnectionLostOrRethrow(e);
            }
        }
    } catch (FedoraTimeoutException e) {
        log.info("Fedora ingest timed out, awaiting ingest confirmation and proceeding with the remainder of the deposit: "
                + e.getLocalizedMessage());
    } catch (ObjectExistsException e) {
        if (confirmExisting || isDuplicateOkay(pid)) {
            // No ingest message will arrive for this object, so stop waiting for one
            if (ingestsAwaitingConfirmation.remove(ingestPid)) {
                addClicks(1);
            }
        } else {
            throw new DepositException("Object " + pid.getPid() + " already exists in the repository.", e);
        }
    } catch (ObjectIntegrityException e) {
        throw new DepositException("Checksum mismatch for object " + pid.getPid() + ".", e);
    } catch (Exception e) {
        throw new DepositException("Failed to ingest object " + pid.getPid() + " into Fedora.", e);
    }
    // TODO increment ingestedOctets
}
/**
 * Decides whether an object that already exists in the repository can be accepted as the result of this deposit.
 * <p>
 * The object is rejected when the triple store returns a list of original deposits which does not contain this
 * deposit. An object without a staging location in the deposit model is accepted without further checks.
 * Otherwise the size and, when present, the md5 checksum from the deposit model are compared to the data file
 * datastream held by the repository. The deposit model is closed before returning.
 *
 * @param pid object reported as already existing
 * @return true if the existing object is acceptable; false if it is not or the datastream could not be read
 */
private boolean isDuplicateOkay(PID pid) {
    // Get the deposit ID for the repository copy of pid
    List<String> deposits = tsqs.fetchBySubjectAndPredicate(pid, Relationship.originalDeposit.toString());
    // Ensure that the deposit id as recorded by fedora matches the current deposit. A null result is
    // treated as no deposit being recorded and lets the check continue.
    if (deposits != null && !deposits.contains(this.getDepositPID().getURI())) {
        return false;
    }

    Model model = getReadOnlyModel();
    try {
        Resource objectResc = model.getResource(pid.getURI());

        Property stagingLocation = dprop(model, DepositRelationship.stagingLocation);
        if (!objectResc.hasProperty(stagingLocation)) {
            // No staging location, no file, no reason to check further
            return true;
        }

        // Get information for copy in the repository
        Datastream ds = client.getDatastream(pid, DATA_FILE.getName());

        // Confirm that incoming file is the same size as the one in the repository
        Property filesizeProperty = dprop(model, DepositRelationship.size);
        if (objectResc.hasProperty(filesizeProperty)) {
            long incomingSize = Long.parseLong(objectResc.getProperty(filesizeProperty).getString());
            // A repository size of -1 together with an empty incoming file is accepted as a match
            if (incomingSize != ds.getSize() && !(ds.getSize() == -1 && incomingSize == 0)) {
                // File sizes didn't match, so this is not the correct file
                return false;
            }
        }

        // If a checksum is available, make sure it matches the one in the repository
        Property md5sum = dprop(model, DepositRelationship.md5sum);
        if (objectResc.hasProperty(md5sum)) {
            String incomingChecksum = objectResc.getProperty(md5sum).getString();
            // Compare from the incoming side so a missing repository checksum is a mismatch, not a failure
            return incomingChecksum.equals(ds.getChecksum());
        }

        return true;
    } catch (FedoraException e1) {
        log.debug("Failed to get datastream info while checking on duplicate for {}", pid, e1);
    } finally {
        closeModel();
    }

    return false;
}
/**
 * Uploads locally held files and PREMIS referenced by an objects FOXML. As a side effect, updates the FOXML
 * document's file references to point to the uploaded file paths in Fedora instead of the local file paths.
 * <p>
 * Only references without a scheme, or with a scheme containing "file", are uploaded; all others are left
 * untouched. Referenced paths are resolved against the deposit directory and must stay inside it. An upload is
 * repeated after a lost connection to the repository has been restored.
 *
 * @param foxml FOXML document whose file locators are processed and rewritten
 * @param pid object the FOXML belongs to, used for logging
 * @throws DepositException if a reference is not a valid URI, has no path, points outside of the deposit
 *             directory or refers to a file that does not exist
 */
private void uploadIngestFiles(Document foxml, PID pid) throws DepositException {
    for (Element cLocation : FOXMLJDOMUtil.getFileLocators(foxml)) {
        String ref = cLocation.getAttributeValue("REF");
        String newref = null;
        try {
            URI uri = new URI(ref);
            // Only local file references get uploaded
            if (uri.getScheme() != null && !uri.getScheme().contains("file")) {
                continue;
            }

            // Opaque URIs such as "file:name" have no path component
            String path = uri.getPath();
            if (path == null) {
                throw new DepositException("File reference has no path: " + ref);
            }
            File file = getDepositDirectory().toPath().resolve(path).toFile();

            // Make sure the file was inside the deposit directory. Both sides are normalized first, since
            // startsWith compares path elements literally and ".." segments would otherwise slip through.
            if (!file.toPath().toAbsolutePath().normalize()
                    .startsWith(getDepositDirectory().toPath().toAbsolutePath().normalize())) {
                throw new DepositException("File path was outside the deposit directory");
            }

            while (true) {
                try {
                    if (!file.exists()) {
                        throw new IOException("File not found: " + ref);
                    }

                    log.debug("uploading " + file.getPath());
                    newref = client.upload(file);

                    cLocation.setAttribute("REF", newref);

                    // Record throughput metrics
                    metricsClient.incrDepositFileThroughput(getDepositUUID(), file.length());
                    break;
                } catch (FedoraTimeoutException e) {
                    log.warn("Connection to Fedora lost while ingesting {}, halting ingest", ref);
                    throw e;
                } catch (IOException e) {
                    throw new DepositException("Data file missing: " + ref, e);
                } catch (ServiceException e) {
                    // Returns once the repository is reachable again so the loop retries; rethrows otherwise
                    waitIfConnectionLostOrRethrow(e);
                }
            }
        } catch (URISyntaxException e) {
            throw new DepositException("Bad URI syntax for file ref", e);
        }
        log.debug("uploaded " + ref + " to Fedora " + newref + " for " + pid);
    }
}
/**
 * Updates the destination container event log to include this ingest.
 * <p>
 * An ingestion event stating the number of added child objects is written to the destination container. The
 * write is repeated after a lost connection to the repository has been restored; a Fedora failure is logged and
 * ends the attempt.
 */
private void updateDestinationEvents() {
    // Record ingest event on parent. The status map is keyed by field name, so the enum constant itself
    // must not be used as the key.
    PremisEventLogger destinationPremis = new PremisEventLogger(
            getDepositStatus().get(DepositField.depositorName.name()));
    destinationPremis.logEvent(PremisEventLogger.Type.INGESTION,
            "added " + ingestObjectCount + " child object(s) to this container", destinationPID);
    while (true) {
        try {
            client.writePremisEventsToFedoraObject(destinationPremis, destinationPID);
            return;
        } catch (FedoraException e) {
            log.error("Failed to update PREMIS events after completing ingest to " + destinationPID.getPid(), e);
            return;
        } catch (ServiceException e) {
            // Returns once the repository is reachable again so the loop retries; rethrows otherwise
            waitIfConnectionLostOrRethrow(e);
        }
    }
}
/**
 * If the given exception indicates that it was caused by a lost connection, then wait until
 * the repository is available again. Otherwise, rethrow the exception.
 * <p>
 * A lost connection is recognized by the root cause being a {@link ConnectException} or a
 * {@link WebServiceTransportException}. While waiting, the job's running state is verified after every delay.
 *
 * @param e exception raised by a repository call
 * @throws ServiceException the given exception if it was not caused by a lost connection, or a new one if the
 *             wait was interrupted, in which case the thread's interrupt flag is set again
 */
private void waitIfConnectionLostOrRethrow(ServiceException e) throws ServiceException {
    Throwable rootCause = e.getRootCause();
    boolean connectionLost = rootCause instanceof ConnectException
            || rootCause instanceof WebServiceTransportException;
    if (!connectionLost) {
        throw e;
    }

    while (true) {
        log.warn("Unable to connect to Fedora repository, waiting before retrying.");
        try {
            Thread.sleep(CONNECT_EXCEPTION_DELAY);
        } catch (InterruptedException e1) {
            // Thread.sleep cleared the flag; set it again so code further up the stack can see the interrupt
            Thread.currentThread().interrupt();
            throw new ServiceException("Attempt to reconnect to Fedora was interrupted.");
        }
        verifyRunning();
        if (client.isRepositoryAvailable()) {
            return;
        }
    }
}
/**
 * @return the queue of object URIs still to be ingested; this is the job's own queue, not a copy
 */
public Queue<String> getIngestPids() {
    return this.ingestPids;
}
/**
 * @return the URIs of the objects which get added to the destination container; this is the job's own list,
 *         not a copy
 */
public List<String> getTopLevelPids() {
    return this.topLevelPids;
}
/**
 * @return the URIs of objects submitted for ingest which have not been confirmed yet; this is the job's own
 *         collection, not a copy
 */
public Collection<String> getIngestsAwaitingConfirmation() {
    return this.ingestsAwaitingConfirmation;
}
/**
 * Replaces the JMS listener this job registers itself with.
 *
 * @param listener listener to use
 */
public void setListener(JobForwardingJMSListener listener) {
this.listener = listener;
}
/**
 * @return the number of objects being ingested, as computed while processing the deposit structure
 */
public int getIngestObjectCount() {
    return this.ingestObjectCount;
}
}