/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
*
* University Of Edinburgh (EDINA)
* Scotland
*
*
* File Name : BasePackageDetector.java
* Author : gwaller
* Approver : Gareth Waller
*
* Notes :
*
*
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* HISTORY
* -------
*
* $LastChangedRevision$
* $LastChangedDate$
* $LastChangedBy$
*/
package uk.ac.jorum.packager.detector;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.log4j.Logger;
import org.dspace.content.Bitstream;
import org.dspace.content.packager.PackageDetector;
import org.dspace.content.packager.PackageIngester;
import org.dspace.content.packager.PackageUtils;
import org.jdom.Document;
import uk.ac.jorum.packager.XMLManifest;
import uk.ac.jorum.submit.step.PackageDetectorStep;
import uk.ac.jorum.utils.ExceptionLogger;
import eu.medsea.mimeutil.MimeType;
import eu.medsea.mimeutil.MimeUtil;
/**
 * Base class for package detectors that examine a DSpace {@link Bitstream}.
 *
 * <p>It stores the bitstream under inspection and offers subclasses
 * {@link #containsManifest(String)}, which checks whether the bitstream looks like a
 * Zip archive (via MimeUtil) and, if so, parses a named top-level manifest entry
 * into a JDOM {@link Document}.</p>
 *
 * @author gwaller
 */
public abstract class BasePackageDetector implements PackageDetector {

    private static final Logger logger = Logger.getLogger(BasePackageDetector.class);

    // Initialise the MimeUtil class to use 2 detectors: one working from the file
    // name extension and one working from the magic header bytes.
    static {
        MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.ExtensionMimeDetector");
        MimeUtil.registerMimeDetector("eu.medsea.mimeutil.detector.MagicMimeMimeDetector");
    }

    // Read in 512 bytes in an attempt to detect the mime type via the magic header
    private static final int MIME_DETECTOR_HEADER_BUFFER_SIZE = 512;

    /** The bitstream being inspected; null until {@link #setBitstream(Bitstream)} is called. */
    private Bitstream b;

    /**
     * Package validity check; declared abstract here and supplied by each concrete detector.
     *
     * @return the subclass's verdict on the current bitstream
     */
    public abstract boolean isValidPackage();

    /**
     * Stores the bitstream that subsequent detection calls will inspect.
     *
     * @param b the bitstream to inspect
     */
    public void setBitstream(Bitstream b) {
        this.b = b;
    }

    /**
     * @return the bitstream previously supplied via {@link #setBitstream(Bitstream)}, or null if none was set
     */
    public Bitstream getBitstream() {
        return this.b;
    }

    /**
     * @return the {@link PackageIngester} class associated with this detector, as chosen by the subclass
     */
    public abstract Class<? extends PackageIngester> ingesterClass();

    /** Asks MimeUtil for the candidate MIME types of the supplied leading bytes. */
    private Collection<MimeType> getMimeTypes(byte[] byteArr) {
        return MimeUtil.getMimeTypes(byteArr);
    }

    /** Asks MimeUtil for the candidate MIME types of the supplied file name. */
    private Collection<MimeType> getMimeTypes(String filename) {
        return MimeUtil.getMimeTypes(filename);
    }

    /**
     * Tests whether ANY of the supplied MIME types denotes a Zip archive.
     *
     * <p>Valid Zip MIME types (from MimeUtil): application/zip, application/x-zip,
     * application/x-compressed, application/x-zip-compressed, multipart/x-zip.</p>
     *
     * @param types candidate MIME types; may be null or empty
     * @return true as soon as one type matches a Zip MIME type, false otherwise
     */
    private boolean isZip(Collection<MimeType> types) {
        if (types == null) {
            return false;
        }

        for (MimeType type : types) {
            logger.debug("Checking MIME type: " + type);

            String mediaType = type.getMediaType();
            String subType = type.getSubType();

            boolean matches = false;
            if ("application".equals(mediaType)) {
                matches = "zip".equals(subType)
                        || "x-zip".equals(subType)
                        || "x-compressed".equals(subType)
                        || "x-zip-compressed".equals(subType);
            } else if ("multipart".equals(mediaType)) {
                matches = "x-zip".equals(subType);
            }

            // Stop at the first match: a later non-zip candidate must not
            // overwrite an earlier positive result.
            if (matches) {
                return true;
            }
        }

        return false;
    }

    /**
     * Zip detection based on the leading bytes of the content.
     *
     * @param byteArr the first bytes read from the bitstream
     * @return true if a Zip MIME type was detected; false if not, or if detection failed (failure is logged)
     */
    private boolean isZip(byte[] byteArr) {
        boolean result = false;
        try {
            result = isZip(getMimeTypes(byteArr));
        } catch (Exception e) {
            ExceptionLogger.logException(logger, e);
        }
        return result;
    }

    /**
     * Zip detection based on the file name.
     *
     * @param filename the bitstream name
     * @return true if a Zip MIME type was detected; false if not, or if detection failed (failure is logged)
     */
    private boolean isZip(String filename) {
        boolean result = false;
        try {
            result = isZip(getMimeTypes(filename));
        } catch (Exception e) {
            ExceptionLogger.logException(logger, e);
        }
        return result;
    }

    /**
     * Reads up to {@link #MIME_DETECTOR_HEADER_BUFFER_SIZE} bytes from the stream.
     *
     * <p>Reading stops at end of stream, when the buffer is full, or on the first
     * I/O error (which is logged; the bytes read so far are still returned).</p>
     *
     * @param is the stream to read from; it is not closed by this method
     * @return an array holding exactly the bytes read - never padded with unread slots
     */
    private byte[] fillBuffer(InputStream is) {
        byte[] buffer = new byte[MIME_DETECTOR_HEADER_BUFFER_SIZE];
        int offset = 0;

        try {
            // A single read() may return fewer bytes than requested, so keep going
            // until the buffer is full or the stream is exhausted.
            while (offset < buffer.length) {
                int read = is.read(buffer, offset, buffer.length - offset);
                if (read == -1) {
                    // Got to end of stream
                    break;
                }
                offset += read;
            }
        } catch (IOException e) {
            logger.warn("I/O error while reading header bytes - continuing with the "
                    + offset + " byte(s) read so far", e);
        }

        if (offset == buffer.length) {
            return buffer;
        }

        // Fewer bytes than the buffer size were read: return a right-sized copy so
        // the caller only ever sees data that actually came from the stream.
        byte[] trimmed = new byte[offset];
        System.arraycopy(buffer, 0, trimmed, 0, offset);
        return trimmed;
    }

    /**
     * Closes the stream if it is non-null, never propagating a failure.
     * Catches Exception (not just IOException) so that cleanup can never mask
     * the result of the calling method.
     */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (Exception ignored) {
            logger.debug("Ignoring failure while closing stream", ignored);
        }
    }

    /**
     * This method retrieves the bitstream input stream, checks to see if it is a Zip file (by reading the first
     * MIME_DETECTOR_HEADER_BUFFER_SIZE number of bytes and using MimeUtil, falling back to the file name) and
     * if a zip is found, attempts to find the manifest file specified by manifestName and parses it using JDOM.
     * (Schema validation is based on the dspace config - see PackageDetectorStep)
     *
     * @param manifestName the name of the manifest file to look for at the top level of the zip
     * @return Document - a JDOM Document instance representing the parsed manifest, or null if none was found,
     *         no bitstream has been set, or an error occurred (errors are logged, not thrown)
     */
    protected Document containsManifest(String manifestName) {
        Document result = null;
        ZipInputStream zip = null;

        try {
            if (this.b == null) {
                logger.warn("No bitstream has been set - cannot look for manifest " + manifestName);
                return null;
            }

            /*
             * NOTE: The input stream returned is a GeneralFileInputStream i.e. either SRBFileInputStream or
             * LocalFileInputStream. Neither of these streams extend FileInputStream and they do not support the
             * mark or reset methods, so directly passing the stream to MimeUtil will result in the exception:
             *
             * Caught Exception: eu.medsea.mimeutil.MimeException: InputStream must support the mark() and reset() methods.
             * eu.medsea.mimeutil.MimeUtil2.getMimeTypes(MimeUtil2.java:478)
             * eu.medsea.mimeutil.MimeUtil2.getMimeTypes(MimeUtil2.java:455)
             *
             * Instead supply a byte array of the initial bytes in the stream, enough so that the magic mime
             * detector can make a good guess but not so much as to use a lot of memory - we cannot read the
             * whole file as it may be huge!
             *
             * A good compromise is to pass the byte array through the magic mime detector and see if a match is
             * found; if not, pass the file name through the extension detector as a last resort.
             */
            boolean foundZip = false;
            InputStream is = this.b.retrieve();
            if (is != null) {
                // Read some bytes and always close the stream, even if reading fails unexpectedly
                byte[] byteArr;
                try {
                    byteArr = fillBuffer(is);
                } finally {
                    closeQuietly(is);
                }

                logger.debug("Detecting file type using byte array approach ...");
                foundZip = isZip(byteArr);
                logger.debug("foundZip = " + foundZip);
            } else {
                logger.warn("Could not retrieve InputStream for bitstream " + this.b.getID());
            }

            if (!foundZip) {
                logger.debug("Detecting file type using file name approach ... filename = " + this.b.getName());
                // Try the extension detector now
                foundZip = isZip(this.b.getName());
                logger.debug("foundZip = " + foundZip);
            }

            if (foundZip) {
                // Now we need to iterate through the zip entries and find a manifest file.
                // NOTE: Must get the InputStream again - the detection above consumed (and closed) the first one.
                InputStream contentsStream = this.b.retrieve();

                // Shouldn't be null if we got here but check anyway just in case!
                if (contentsStream != null) {
                    zip = new ZipInputStream(contentsStream);
                    ZipEntry ze;
                    while ((ze = zip.getNextEntry()) != null) {
                        String fname = ze.getName();
                        // Manifest must be at top level directory - therefore shouldn't have a "/" in name
                        if (!fname.contains("/") && fname.equals(manifestName)) {
                            // Found manifest - now parse it and return as a Document.
                            // NOTE: validation set based on configuration - already read in PackageDetectorStep class.
                            // Don't want the parser to close the zip stream - that happens in the finally block,
                            // hence use of UnclosableInputStream.
                            result = XMLManifest.parseManifest(
                                    new PackageUtils.UnclosableInputStream(zip), PackageDetectorStep.validate);
                            break;
                        }
                    }
                } else {
                    logger.warn("Could not retrieve InputStream for bitstream " + this.b.getID());
                }
            }
        } catch (Exception e) {
            ExceptionLogger.logException(logger, e);
        } finally {
            // zip is still null when no archive was opened; closeQuietly copes with that
            closeQuietly(zip);
        }

        return result;
    }
}