package dk.kb.yggdrasil.preservationimport;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.math.BigInteger;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.bitrepository.bitrepositoryelements.ChecksumType;
import org.bitrepository.bitrepositoryelements.FilePart;
import org.bitrepository.common.utils.ChecksumUtils;
import org.jwat.common.Uri;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import dk.kb.yggdrasil.config.RequestHandlerContext;
import dk.kb.yggdrasil.db.PreservationImportRequestState;
import dk.kb.yggdrasil.exceptions.ArgumentCheck;
import dk.kb.yggdrasil.exceptions.YggdrasilException;
import dk.kb.yggdrasil.json.JSONMessaging;
import dk.kb.yggdrasil.json.preservationimport.PreservationImportRequest;
import dk.kb.yggdrasil.json.preservationimport.Security;
import dk.kb.yggdrasil.messaging.MessageRequestHandler;
import dk.kb.yggdrasil.utils.TimeUtils;
/**
* The handler class for preservation import requests.
*/
public class PreservationImportRequestHandler extends MessageRequestHandler<PreservationImportRequest> {
/** Logging mechanism. */
private Logger logger = LoggerFactory.getLogger(this.getClass().getName());
/** Context for this preservation. */
private final RequestHandlerContext context;
/** The size of the buffer. */
private static final int BUFFER_SIZE = 16*1024;
/**
* Constructor.
* @param context The context for the preservation import.
*/
public PreservationImportRequestHandler(RequestHandlerContext context) {
ArgumentCheck.checkNotNull(context, "PreservationContext context");
this.context = context;
}
/**
* Handles the PreservationImportRequest.
* @param request The preservation import request to handle.
* @throws YggdrasilException if anything goes wrong.
*/
@Override
public void handleRequest(PreservationImportRequest request) throws YggdrasilException {
logger.info("Preservation request received.");
if (!request.isMessageValid()) {
logger.error("Skipping invalid message");
return;
}
PreservationImportRequestState state = new PreservationImportRequestState(request,
PreservationImportState.PRESERVATION_IMPORT_REQUEST_RECEIVED_AND_VALIDATED);
if(!validateRequest(state)) {
logger.warn("The request is invalid: " + request.toString());
return;
}
performImport(state);
}
@Override
public PreservationImportRequest extractRequest(byte[] b) throws YggdrasilException {
return JSONMessaging.getRequest(new PushbackInputStream(new ByteArrayInputStream(b), PUSHBACKBUFFERSIZE),
PreservationImportRequest.class);
}
/**
* Performing the import operation.
* @param state The state for handling the preservation import request.
* @throws YggdrasilException If it fails.
*/
public void performImport(PreservationImportRequestState state) throws YggdrasilException {
logger.info("Starting to import '" + state.getRequest().type + "' for uuid '" + state.getRequest().uuid + "'");
try {
retrieveData(state);
logger.info("Retrieved data for import of '" + state.getRequest().type + "' for uuid '"
+ state.getRequest().uuid + "'");
validateExtractedData(state);
validateTokenDate(state);
logger.info("Starting to deliver data for import '" + state.getRequest().type + "' for uuid '"
+ state.getRequest().uuid + "'");
deliverData(state);
// Send final success response.
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_FINISHED, null);
// cleanup
state.cleanup();
logger.info("Finished processing the preservation import request of '" + state.getRequest().type
+ "' for uuid '" + state.getRequest().uuid + "'");
} catch (YggdrasilException e) {
// Send failure, if it is not a fail-state.
if(state.getState().isOkState()) {
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_FAILURE, e.getMessage());
}
logger.error("Failure", e);
}
}
/**
* Validates the preservation import request.
* Currently only validates the preservation profile against the possible bitrepository-collections.
* @param request The preservation import request to validate.
* @return Whether or not the request is valid.
*/
protected boolean validateRequest(PreservationImportRequestState state) throws YggdrasilException {
List<String> errors = new ArrayList<String>();
// Add check about whether the profile is a known collectionID or not known
String preservationProfile = state.getRequest().preservation_profile;
List<String> possibleCollections = context.getBitrepository().getKnownCollections();
if (!possibleCollections.contains(preservationProfile)) {
String errMsg = "The given preservation profile '" + preservationProfile
+ "' does not match a known collection ID. Expected one of: " + possibleCollections;
logger.error(errMsg);
errors.add(errMsg);
}
// Check the type. Must be 'FILE'
// TODO Handle other types than FILE.
if(!state.getRequest().type.equalsIgnoreCase("FILE")) {
String errMsg = "The given preservation profile '" + preservationProfile
+ "' does not match a known collection ID. Expected one of: " + possibleCollections;
logger.error(errMsg);
errors.add(errMsg);
}
// validate the delivery URL
try {
new URL(state.getRequest().url);
} catch (MalformedURLException e) {
String errMsg = "Malformed URL: " + state.getRequest().url;
logger.error(errMsg, e);
errors.add(errMsg);
}
// validate checksum format: 'algorithm':'checksum'
if(state.getRequest().security != null && state.getRequest().security.checksum != null
&& !state.getRequest().security.checksum.isEmpty()) {
String checksum = state.getRequest().security.checksum;
if(!checksum.contains(":")) {
String errMsg = "The checksum in the request does not comply with definition. No algorithm";
logger.error(errMsg);
errors.add(errMsg);
} else {
try {
extractChecksumType(checksum);
} catch (YggdrasilException e) {
logger.error(e.getMessage());
errors.add(e.getMessage());
}
}
}
if(errors.isEmpty()) {
// Send update about success retrieval and validation.
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_REQUEST_RECEIVED_AND_VALIDATED, null);
return true;
} else {
// Send the update about validation failure.
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_REQUEST_VALIDATION_FAILURE, errors.toString());
return false;
}
}
/**
* Extracts the WARC data from the Bitrepository.
* @param request The request containing about which warc file to retrieve and which
* Bitrepository collection to retrieve the warc file from.
* @return The warc file.
* @throws YggdrasilException If retrieving the warc file from the Bitrepository fails.
*/
protected void retrieveData(PreservationImportRequestState state) throws YggdrasilException {
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_RETRIEVAL_FROM_BITREPOSITORY_INITIATED, null);
try {
if(state.getImportData() == null || !state.getImportData().isFile()) {
FilePart filePart = null;
PreservationImportRequest request = state.getRequest();
if(state.getRequest().warc.warc_offset != null && request.warc.warc_record_size != null) {
filePart = new FilePart();
filePart.setPartOffSet(BigInteger.valueOf(Long.parseLong(request.warc.warc_offset)));
filePart.setPartLength(BigInteger.valueOf(Long.parseLong(request.warc.warc_record_size)));
}
File warcFile = context.getBitrepository().getFile(request.warc.warc_file_id,
request.preservation_profile, filePart);
File record = extractData(warcFile, state);
state.setImportData(record);
logger.info("Retrieved data from Bitrepository for '" + state.getRequest().uuid + "'.");
} else {
logger.warn("Already having retrieved the data. This must be recovery from "
+ "failure or unexpected shutdown.");
}
} catch (YggdrasilException e) {
// Sending retrieval failure response.
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_RETRIEVAL_FROM_BITREPOSITORY_FAILURE, e.getMessage());
throw e;
}
}
/**
* Extracts the warc-record payload from the warc-file.
* @param warcFile The warc file.
* @param request The request containing information about which warc record to extract.
* @return A file containing the warc-record payload.
* @throws YggdrasilException If the extraction of the warc-record fails.
*/
protected File extractData(File warcFile, PreservationImportRequestState state) throws YggdrasilException {
ArgumentCheck.checkExistsNormalFile(warcFile, "File warcFile");
try (InputStream in = new FileInputStream(warcFile);) {
WarcRecord retrievedRecord = null;
Uri uuid = new Uri("urn:uuid:" + state.getRequest().warc.warc_record_id);
WarcReader reader = WarcReaderFactory.getReader( in );
WarcRecord record;
while (retrievedRecord == null && (record = reader.getNextRecord()) != null) {
if(record.header.warcRecordIdUri.equals(uuid)) {
retrievedRecord = record;
state.setWarcHeaderChecksum(record.header.warcBlockDigestStr);
}
}
if(retrievedRecord == null) {
String errMsg = "Did not find the record";
logger.warn(errMsg);
throw new YggdrasilException(errMsg);
}
return extractRecordPayloadAsFile(retrievedRecord);
} catch (IOException e) {
throw new YggdrasilException("Could not extract the data from the warc file.", e);
} catch (URISyntaxException e) {
throw new YggdrasilException("URI for the warc-record is invalid.", e);
}
}
/**
* Extracts the payload from the warc-record.
* @param record The warc record.
* @return A file containing the payload of the warc-record.
* @throws YggdrasilException If the extraction fails.
*/
private File extractRecordPayloadAsFile(WarcRecord record) throws YggdrasilException {
File res = new File(context.getConfig().getTemporaryDir(), "warc-record-" + new Date().getTime());
InputStream in = record.getPayloadContent();
try (FileOutputStream out = new FileOutputStream(res);){
byte[] read = new byte[BUFFER_SIZE];
int i;
while((i = in.read(read)) > -1) {
out.write(read, 0, i);
}
out.flush();
} catch (IOException e) {
throw new YggdrasilException("Could not extract warc record content into seperate file.", e);
}
return res;
}
/**
* Validates the extracted data against the optional security-checksum in the request (if it is there).
* @param state The preservation import request state.
* @throws YggdrasilException If the extracted data is not valid.
*/
private void validateExtractedData(PreservationImportRequestState state) throws YggdrasilException {
if(state.getRequest().security == null || state.getRequest().security.checksum == null
|| state.getRequest().security.checksum.isEmpty()) {
logger.debug("No checksum to validate ");
return;
}
ChecksumType csType = extractChecksumType(state.getRequest().security.checksum);
String deliveredChecksum = state.getRequest().security.checksum.split(":")[1];
String calculatedChecksum = ChecksumUtils.generateChecksum(state.getImportData(), csType);
// Validate against delivered checksum.
if(!calculatedChecksum.equalsIgnoreCase(deliveredChecksum)) {
String errMsg = "Inconsistent checksum between retrieved file ('" + calculatedChecksum
+ "') and the delivered checksum ('" + deliveredChecksum + "') in the algorithm '"
+ csType.name() + "'.";
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_FAILURE, errMsg);
throw new YggdrasilException(errMsg);
}
// Validate against warc-header checksum
if(state.getWarcHeaderChecksum() == null || state.getWarcHeaderChecksum().isEmpty()
|| !state.getWarcHeaderChecksum().contains(":")) {
logger.warn("Cannot validate against header fields. Continuing anyway.");
return;
}
ChecksumType headerCsType = extractChecksumType(state.getWarcHeaderChecksum());
String headerChecksum = state.getWarcHeaderChecksum().split(":")[1];
String checksumForHeader;
if(headerCsType == csType) {
checksumForHeader = calculatedChecksum;
} else {
checksumForHeader = ChecksumUtils.generateChecksum(state.getImportData(), headerCsType);
}
if(!headerChecksum.equalsIgnoreCase(checksumForHeader)){
String errMsg = "Inconsistent checksum between retrieved file ('" + checksumForHeader
+ "') and the header checksum ('" + headerChecksum + "') in the algorithm '"
+ headerCsType.name() + "'.";
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_FAILURE, errMsg);
throw new YggdrasilException(errMsg);
}
}
/**
* Validate the token timeout date.
* @param state The state containing the request with the token-timeout to validate.
* @throws YggdrasilException If the timeout has already been reached.
*/
private void validateTokenDate(PreservationImportRequestState state) throws YggdrasilException {
Security s = state.getRequest().security;
if(s != null && s.token != null && s.token_timeout != null) {
Date d = TimeUtils.parseDate(s.token_timeout);
if(d.getTime() < new Date().getTime()) {
throw new YggdrasilException("Token timeout (" + d.toString() + ") exceeded.");
}
} else {
logger.debug("No timeout of the token to validate.");
}
}
/**
* Sends the file to the given URL, though security demands a token, then also deliver the token.
* @param state The state of the preservation import request message handling.
* @throws YggdrasilException If the data fails to be delivered.
*/
private void deliverData(PreservationImportRequestState state) throws YggdrasilException {
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_DELIVERY_INITIATED, null);
MultipartEntityBuilder builder = MultipartEntityBuilder.create();
if(state.getRequest().security != null) {
String token = state.getRequest().security.token;
if(token != null && !token.isEmpty()) {
builder.addTextBody("token", token, ContentType.TEXT_PLAIN);
}
}
builder.addTextBody("uuid", state.getRequest().uuid);
builder.addTextBody("type", state.getRequest().type);
builder.addBinaryBody("file", state.getImportData());
HttpEntity multipart = builder.build();
boolean success = context.getHttpCommunication().post(state.getRequest().url, multipart);
if(success) {
logger.info("Successfully delivered data for '" + state.getRequest().uuid + "'");
} else {
// Failure. Send response telling about the error.
String errMsg = "Could not deliver the data to '" + state.getRequest().url;
context.getRemotePreservationStateUpdater().sendPreservationImportResponse(state,
PreservationImportState.PRESERVATION_IMPORT_DELIVERY_FAILURE, errMsg);
throw new YggdrasilException(errMsg);
}
}
/**
* Extracts the checksum type from a digestBlock.
* @param digestBlock The digestBlock in format 'algorithm':'checksum'.
* @return The checksum type.
*/
private ChecksumType extractChecksumType(String digestBlock) throws YggdrasilException {
if(!digestBlock.contains(":")) {
throw new YggdrasilException("The checksum in the request does not comply with definition. "
+ "No algorithm");
} else {
String checksumType = digestBlock.split(":")[0];
// Remove any '-' from the SHA algorithms, and makes it upper-case.
checksumType = checksumType.replaceFirst("-", "").toUpperCase();
try {
return ChecksumType.fromValue(checksumType);
} catch (IllegalArgumentException e) {
throw new YggdrasilException(checksumType + " is not supported.", e);
}
}
}
}