package dk.kb.yggdrasil.preservation;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.UUID;

import org.jwat.common.ContentType;
import org.jwat.common.Uri;
import org.jwat.warc.WarcConcurrentTo;
import org.jwat.warc.WarcDigest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.kb.yggdrasil.config.RequestHandlerContext;
import dk.kb.yggdrasil.db.PreservationRequestState;
import dk.kb.yggdrasil.exceptions.ArgumentCheck;
import dk.kb.yggdrasil.exceptions.PreservationException;
import dk.kb.yggdrasil.exceptions.YggdrasilException;
import dk.kb.yggdrasil.json.preservation.Update;
import dk.kb.yggdrasil.warc.Digest;
import dk.kb.yggdrasil.warc.WarcWriterWrapper;
import dk.kb.yggdrasil.warc.YggdrasilWarcConstants;

/**
 * Class to manage the creation and upload of WARC files.
 * It will create one WARC record at the time, and new records will be added until one of following two
 * conditions are met:
 * The size of the WARC file is too large, or too long time has passed since the first record was added.
 * When one of these conditions is met, the WARC file is uploaded to the Bitrepository, the outcome
 * is reported for every request packaged in the file through the remote preservation state updater,
 * and the requests are removed from the state database.
 *
 * This manager only handles a specific preservation collection.
 */
public class PreservationPacker {
    /** Logging mechanism. */
    private final Logger logger = LoggerFactory.getLogger(this.getClass().getName());

    /** The context, containing settings, etc. */
    private final RequestHandlerContext context;
    /** The collection id for this manager.*/
    private final String collectionId;

    /** The writer of the WARC file. Is null when no WARC file is currently being packaged.*/
    private WarcWriterWrapper writer;
    /** The preservation requests which have been added to the current WARC file.*/
    private List<PreservationRequestState> metadataRequests;
    /** The date (in millis since epoch) for the current timeout. 0 until the writer is initialized. */
    private long currentTimeout = 0L;

    /**
     * Constructor.
     * @param context The context for the preservation
     * @param collectionId The id of the collection.
     */
    public PreservationPacker(RequestHandlerContext context, String collectionId) {
        this.context = context;
        this.collectionId = collectionId;
    }

    /**
     * Check the conditions, and upload if any of them has been met.
     * Does nothing when no WARC file is currently being packaged.
     */
    public synchronized void verifyConditions() {
        if (writer == null) {
            return;
        }

        boolean conditionsMet = false;
        if (writer.getWarcFileSize() > context.getConfig().getWarcSizeLimit()) {
            conditionsMet = true;
            logger.debug("WARC file size limit reached.");
        }
        if (new Date().getTime() > currentTimeout) {
            conditionsMet = true;
            logger.debug("Time limit reached.");
        }

        if (conditionsMet) {
            logger.info("Finished packaging WARC file. Uploading and cleaning up.");
            uploadWarcFile();
            cleanUp();
        }
    }

    /**
     * Write the content payload and the metadata payload of the preservation request as WARC records.
     * @param prs The record of the preservation request to write.
     * @throws YggdrasilException If it fails to write the preservation request state.
     * @throws PreservationException If it fails to perform the preservation.
     */
    public synchronized void writePreservationRecord(PreservationRequestState prs)
            throws YggdrasilException, PreservationException {
        checkInitialize();
        metadataRequests.add(prs);
        try {
            Uri resourceId = null;
            Digest digestor = new Digest("SHA-1");

            if (prs.getContentPayload() != null) {
                File resource = prs.getContentPayload();
                Long offsetStart = writer.getWarcFileSize();
                InputStream in = null;
                try {
                    in = new FileInputStream(resource);
                    WarcDigest blockDigest = digestor.getDigestOfFile(resource);
                    resourceId = writer.writeResourceRecord(in, resource.length(),
                            ContentType.parseContentType("application/binary"), blockDigest,
                            prs.getRequest().File_UUID);
                } finally {
                    if (in != null) {
                        in.close();
                    }
                }
                prs.setResourceWarcFile(writer.getWarcFile());
                Long offsetEnd = writer.getWarcFileSize();
                prs.setFileOffset(offsetStart, offsetEnd);
                context.getRemotePreservationStateUpdater().sendPreservationResponse(prs,
                        PreservationState.PRESERVATION_RESOURCES_PACKAGE_SUCCESS);
            }

            if (prs.getMetadataPayload() != null) {
                File metadata = prs.getMetadataPayload();
                Long offsetStart = writer.getWarcFileSize();
                InputStream in = null;
                try {
                    in = new FileInputStream(metadata);
                    WarcDigest blockDigest = digestor.getDigestOfFile(metadata);
                    // resourceId is null when no content payload was written above.
                    writer.writeMetadataRecord(in, metadata.length(),
                            ContentType.parseContentType("text/xml"), resourceId, blockDigest,
                            prs.getRequest().UUID);
                } finally {
                    // The stream is closed exactly once, here.
                    if (in != null) {
                        in.close();
                    }
                }
                prs.setMetadataWarcFile(writer.getWarcFile());
                Long offsetEnd = writer.getWarcFileSize();
                prs.setOffset(offsetStart, offsetEnd);
            }

            context.getRemotePreservationStateUpdater().sendPreservationResponse(prs,
                    PreservationState.PRESERVATION_PACKAGE_COMPLETE);
            prs.setMetadataWarcFile(writer.getWarcFile());
            context.getRemotePreservationStateUpdater().sendPreservationResponse(prs,
                    PreservationState.PRESERVATION_PACKAGE_WAITING_FOR_MORE_DATA);
        } catch (IOException e) {
            throw new PreservationException(PreservationState.PRESERVATION_METADATA_PACKAGED_FAILURE,
                    "Error while writing WARC record!", e);
        }
    }

    /**
     * Write the content payload and the metadata payload of the preservation update request as
     * WARC update records.
     * @param prs The record of the preservation update request to write.
     * @throws YggdrasilException If it fails to write the preservation request state.
     * @throws PreservationException If it fails to perform the preservation.
     */
    public synchronized void writeUpdateRecord(PreservationRequestState prs)
            throws YggdrasilException, PreservationException {
        checkInitialize();
        metadataRequests.add(prs);
        Update update = createUpdateElement(prs, writer.getWarcFile().getName());
        prs.setUpdatePreservation(update);
        try {
            Uri resourceId = null;
            Digest digestor = new Digest("SHA-1");

            if (prs.getContentPayload() != null) {
                File resource = prs.getContentPayload();
                Long offsetStart = writer.getWarcFileSize();
                InputStream in = null;
                try {
                    WarcConcurrentTo concurrentTo = new WarcConcurrentTo();
                    concurrentTo.warcConcurrentToStr = prs.getRequest().File_UUID;
                    in = new FileInputStream(resource);
                    WarcDigest blockDigest = digestor.getDigestOfFile(resource);
                    resourceId = writer.writeUpdateRecord(in, resource.length(),
                            ContentType.parseContentType("application/binary"), null,
                            Arrays.asList(concurrentTo), blockDigest, update.file_uuid);
                } finally {
                    if (in != null) {
                        in.close();
                    }
                }
                Long offsetEnd = writer.getWarcFileSize();
                prs.setFileOffset(offsetStart, offsetEnd);
                context.getRemotePreservationStateUpdater().sendPreservationResponse(prs,
                        PreservationState.PRESERVATION_RESOURCES_PACKAGE_SUCCESS);
            }

            if (prs.getMetadataPayload() != null) {
                File metadata = prs.getMetadataPayload();
                Long offsetStart = writer.getWarcFileSize();
                InputStream in = null;
                try {
                    WarcConcurrentTo concurrentTo = new WarcConcurrentTo();
                    concurrentTo.warcConcurrentToStr = prs.getRequest().UUID;
                    in = new FileInputStream(metadata);
                    WarcDigest blockDigest = digestor.getDigestOfFile(metadata);
                    // resourceId is null when no content payload was written above.
                    writer.writeUpdateRecord(in, metadata.length(),
                            ContentType.parseContentType("text/xml"), resourceId,
                            Arrays.asList(concurrentTo), blockDigest, update.uuid);
                } finally {
                    // Closes the stream that was handed to the writer. The stream must not be
                    // reassigned before this point, otherwise the original one is leaked.
                    if (in != null) {
                        in.close();
                    }
                }
                Long offsetEnd = writer.getWarcFileSize();
                prs.setOffset(offsetStart, offsetEnd);
            }

            context.getRemotePreservationStateUpdater().sendPreservationResponse(prs,
                    PreservationState.PRESERVATION_PACKAGE_COMPLETE);
            context.getRemotePreservationStateUpdater().sendPreservationResponse(prs,
                    PreservationState.PRESERVATION_PACKAGE_WAITING_FOR_MORE_DATA);
        } catch (IOException e) {
            throw new PreservationException(PreservationState.PRESERVATION_METADATA_PACKAGED_FAILURE,
                    "Error while writing WARC record!", e);
        }
    }

    /**
     * Initializes the WARC file if necessary.
     * Also performs the condition check, which may upload and close the current WARC file first.
     * @throws YggdrasilException If the new WARC file cannot be initialized.
     */
    private void checkInitialize() throws YggdrasilException {
        verifyConditions();

        if (writer == null) {
            currentTimeout = new Date().getTime() + context.getConfig().getUploadWaitLimit();
            metadataRequests = new ArrayList<PreservationRequestState>();
            initializeNewWarcFile();
            logger.debug("Initialising new WARC file: " + writer.getWarcFileId() + ", with size limit: "
                    + context.getConfig().getWarcSizeLimit() + ", date limit: " + new Date(currentTimeout));
        }
    }

    /**
     * Uploads the Warc file to the Bitrepository and reports the outcome for every request which
     * has been packaged in the file.
     * A failure to report or clean up one request is logged, and does not prevent the remaining
     * requests from being handled.
     */
    private void uploadWarcFile() {
        boolean success = context.getBitrepository().uploadFile(writer.getWarcFile(), collectionId);

        for (PreservationRequestState prs : metadataRequests) {
            try {
                if (success) {
                    updateRequestState(PreservationState.PRESERVATION_PACKAGE_UPLOAD_SUCCESS, prs);
                    logger.info("Upload to bitrepository for UUID '" + prs.getUUID()
                            + "' of package '" + writer.getWarcFileId() + "' was successful.");
                } else {
                    prs.resetUploadPackage(); // reset warcId to null
                    updateRequestState(PreservationState.PRESERVATION_PACKAGE_UPLOAD_FAILURE, prs);
                    logger.warn("Upload to bitrepository for UUID '" + prs.getUUID()
                            + "' of package '" + writer.getWarcFileId() + "' failed.");
                }
                prs.cleanup();
                context.getStateDatabase().delete(prs.getUUID());
            } catch (YggdrasilException e) {
                // Caught per request, so the remaining requests are still reported and cleaned up.
                logger.error("A error occured when reporting about bitrepository upload of the file '"
                        + writer.getWarcFileId() + "' to the collection '" + collectionId
                        + "'. Trying to continue.", e);
            }
        }
    }

    /**
     * Update the preservation state of the request, both locally and remote.
     * @param preservationState The new state.
     * @param prs The request to update.
     * @throws YggdrasilException If something goes wrong.
     */
    private void updateRequestState(PreservationState preservationState, PreservationRequestState prs)
            throws YggdrasilException {
        context.getRemotePreservationStateUpdater().sendPreservationResponse(prs, preservationState);
        context.getStateDatabase().putPreservationRecord(prs.getUUID(), prs);
    }

    /**
     * Cleans up the current warc writer.
     * Safe to call when no WARC file has been initialized yet.
     */
    protected void cleanUp() {
        // The list is only created when the first WARC file is initialized.
        if (metadataRequests != null) {
            metadataRequests.clear();
        }
        if (writer != null) {
            // Note: the WARC file itself is not deleted from disc here; only the writer is closed.
            try {
                writer.close();
            } catch (YggdrasilException e) {
                logger.warn("An issue occured when closing the current writer.", e);
            }
            writer = null;
        }
    }

    /**
     * Initializes the new WarcFile, with the WarcInfo.
     * @throws YggdrasilException If the writer cannot be created or the WARC info record cannot be written.
     */
    private void initializeNewWarcFile() throws YggdrasilException {
        UUID packageId = UUID.randomUUID();
        File writeDirectory = context.getConfig().getTemporaryDir();
        writer = WarcWriterWrapper.getWriter(writeDirectory, packageId.toString());

        try {
            Digest digestor = new Digest("SHA-1");
            String warcInfoPayload = YggdrasilWarcConstants.getWarcInfoPayload();
            byte[] warcInfoPayloadBytes = warcInfoPayload.getBytes("UTF-8");
            writer.writeWarcinfoRecord(warcInfoPayloadBytes,
                    digestor.getDigestOfBytes(warcInfoPayloadBytes));
        } catch (IOException e) {
            throw new YggdrasilException("Could not create the WARC info record.", e);
        }
    }

    /**
     * Creates the Update element for a preservation request state.
     * A new random UUID is generated for the file part and for the metadata part, for each
     * of the parts which has a payload.
     * @param prs The preservation request state to create the update element for.
     * Must have a content payload, a metadata payload, or both.
     * @param warcId The id of the WARC file, which the update is written to.
     * @return The Update element for the preservation request state.
     */
    private Update createUpdateElement(PreservationRequestState prs, String warcId) {
        ArgumentCheck.checkTrue(prs.getContentPayload() != null || prs.getMetadataPayload() != null,
                "Cannot create an update element with neither content nor metadata.");
        Update res = new Update();
        res.date = new Date().toString();

        if (prs.getContentPayload() != null) {
            res.file_uuid = UUID.randomUUID().toString();
            res.file_warc_id = warcId;
            if (prs.getFileOffset() != null) {
                res.file_warc_offset = prs.getFileOffset();
            }
        }

        if (prs.getMetadataPayload() != null) {
            res.uuid = UUID.randomUUID().toString();
            res.warc_id = warcId;
            if (prs.getOffset() != null) {
                res.warc_offset = prs.getOffset();
            }
        }

        return res;
    }
}