/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
*
* University Of Edinburgh (EDINA)
* Scotland
*
*
* File Name : XMLManifest.java
* Author : gwaller
* Approver : Gareth Waller
*
* Notes :
*
*
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* HISTORY
* -------
*
* $LastChangedRevision$
* $LastChangedDate$
* $LastChangedBy$
*/
package uk.ac.jorum.packager;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.sql.SQLException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.Item;
import org.dspace.content.crosswalk.CrosswalkException;
import org.dspace.content.crosswalk.IngestionCrosswalk;
import org.dspace.content.crosswalk.MetadataValidationException;
import org.dspace.content.packager.PackageException;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Constants;
import org.dspace.core.PluginInstantiationException;
import org.dspace.core.PluginManager;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import uk.ac.jorum.utils.ExceptionLogger;
/**
* @author gwaller
*
*/
public abstract class XMLManifest {
/** log4j category */
private static Logger log = Logger.getLogger(XMLManifest.class);
// START GWaller 52/02/09 IssueID #175 Moved MetadataFormat to sep class. Add in element deletion list
private static XMLManifestElementDeletion[] ELEMENT_DELETION_LIST = {
// Remove the o-ex rights element - not dependant on Metadataformat
new XMLManifestElementDeletion(null, "//o-ex:rights", new Namespace[] { Namespace.getNamespace("o-ex",
"http://odrl.net/1.1/ODRL-EX") }),
// Remove the Technical section from LOM
new XMLManifestElementDeletion(MetadataFormat.LOM, "//" + MetadataFormat.JORUM_NAMESPACE_PREFIX
+ ":technical", null) };
// END GWaller 52/02/09 IssueID #175 Moved MetadataFormat to sep class. Add in element deletion list
protected static String localSchemas;
/** prefix of config lines identifying local XML Schema (XSD) files */
private final static String CONFIG_XSD_PREFIX = "ims.xsd.";
/** config element for regex to pull out licence URL from the XML element value text */
protected final static String CONFIG_LICENCE_URL_REGEX = "licence.url.regex";
/** config element specifying which group (i.e. what matches within round brackets) in the URL regex to use as the URL - numbers start from 1 */
protected final static String CONFIG_LICENCE_URL_REGEX_GROUP = "licence.url.regex.groupnum";
protected final String DEFAULT_CC_URL_REGEX = ".*(http:\\/\\/creativecommons\\.org[\\S]*).*$";
/** Prefix of DSpace configuration lines that map IMS metadata type to
* crosswalk plugin names.
*/
private final static String CONFIG_METADATA_PREFIX = "xml.submission.crosswalk.";
protected static final String MATCHED_KEY = "matched";
public static final String IDENTIFIER_REF_ATTR = "identifierref";
public static final String IDENTIFIER_ATTR = "identifier";
protected static final String ITEM_ID_PREFIX = "ITEM-";
protected static final String RESOURCE_ID_PREFIX = "RES-";
public static final String HREF_ATTR = "href";
protected static final String RESOURCE_ELEM = "resource";
protected static final String FILE_ELEM = "file";
protected Document manifestDocument;
protected Element manifestRoot;
protected MetadataFormat format = null;
protected String metadataPrefix = null;
protected Map<String, Map<String, String>> bitstreamInfoMap = null;
protected Bundle[] urlBundle = null;
protected Bundle[] relatedBundle = null;
protected Bundle[] contentBundle = null;
protected Bundle[] metadata = null;
protected Item item = null;
protected String exportMetadataFormat = null;
// Create list of local schemas at load time, since it depends only
// on the DSpace configuration.
static {
String dspace_dir = ConfigurationManager.getProperty("dspace.dir");
File xsdPath1 = new File(dspace_dir + "/config/schemas/");
File xsdPath2 = new File(dspace_dir + "/config/");
Enumeration pe = ConfigurationManager.propertyNames();
StringBuffer result = new StringBuffer();
while (pe.hasMoreElements()) {
// config lines have the format:
// mets.xsd.{identifier} = {namespace} {xsd-URL}
// e.g.
// mets.xsd.dc = http://purl.org/dc/elements/1.1/ dc.xsd
// (filename is relative to {dspace_dir}/config/schemas/)
String key = (String) pe.nextElement();
if (key.startsWith(CONFIG_XSD_PREFIX)) {
String spec = ConfigurationManager.getProperty(key);
String val[] = spec.trim().split("\\s+");
if (val.length == 2) {
File xsd = new File(xsdPath1, val[1]);
if (!xsd.exists())
xsd = new File(xsdPath2, val[1]);
if (!xsd.exists())
log.warn("Schema file not found for config entry=\"" + spec + "\"");
else {
try {
String u = xsd.toURL().toString();
if (result.length() > 0)
result.append(" ");
result.append(val[0]).append(" ").append(u);
} catch (java.net.MalformedURLException e) {
log.warn("Skipping badly formed XSD URL: " + e.toString());
}
}
} else
log.warn("Schema config entry has wrong format, entry=\"" + spec + "\"");
}
}
localSchemas = result.toString();
log.debug("Got local schemas = \"" + localSchemas + "\"");
}
public XMLManifest(Document manifestDoc) {
this.manifestDocument = manifestDoc;
this.manifestRoot = this.manifestDocument.getRootElement();
log.debug("XMLManifest constructor - root element is: " + this.manifestRoot.toString());
}
public XMLManifest() {
}
/**
* Create a new manifest object from a serialized manifest XML document.
* Parse document read from the input stream, optionally validating.
* NOTE: This method attempts to close the InputStream, is
* @param is input stream containing serialized XML
* @param validate if true, enable XML validation using schemas
* in document. Also validates any sub-documents.
* @throws MetadataValidationException if there is any error parsing
* or validating the METS.
* @return new Document object.
*/
public static Document parseManifest(InputStream is, boolean validate) throws IOException,
MetadataValidationException {
SAXBuilder builder = new SAXBuilder(validate);
// Set the SAX parser to expand entity references so no need to check in value strings
builder.setExpandEntities(true);
// Set validation feature
if (validate) {
builder.setFeature("http://apache.org/xml/features/validation/schema", true);
}
log.debug("Mainfest XML validation set to: " + validate);
// Tell the parser where local copies of schemas are, to speed up
// validation. Local XSDs are identified in the configuration file.
if (localSchemas.length() > 0)
builder.setProperty("http://apache.org/xml/properties/schema/external-schemaLocation", localSchemas);
// Parse the manifest file
Document manifestDocument;
try {
manifestDocument = builder.build(is);
// XXX for temporary debugging
XMLOutputter outputPretty = new XMLOutputter(Format.getPrettyFormat());
log.debug("Got IMS DOCUMENT:");
// log.debug(outputPretty.outputString(manifestDocument));
} catch (JDOMException je) {
throw new MetadataValidationException("Error validating IMS manifest in " + is.toString(), je);
} finally {
if (is != null) {
try {
is.close();
} catch (Exception e) {
e.printStackTrace();
ExceptionLogger.logException(log, e);
}
}
}
return manifestDocument;
}
/**
* Creates a brand new manifest for items that were not originally content packages
* @throws PackageException
* @throws PluginInstantiationException
* @throws CrosswalkException
* @throws IOException
* @throws SQLException
*/
public abstract void createNewManifest() throws PackageException, PluginInstantiationException, CrosswalkException,
IOException, SQLException;
/**
* Reconstruct original manifest for items submitted as content packages
*
* @param originalManifest
* @throws PackageException
* @throws PluginInstantiationException
* @throws CrosswalkException
* @throws IOException
* @throws SQLException
*/
public abstract void reconstructManifest(Document originalManifest) throws PackageException,
PluginInstantiationException, CrosswalkException, IOException, SQLException;
/**
* Set the metadata format for an export
* @param exportMetadataFormat
*/
public abstract void setExportMetadataFormat(String exportMetadataFormat);
/**
* Load original manifest and return it as a jDom document
* @param item
* @return
* @throws IOException
* @throws SQLException
* @throws PackageException
* @throws MetadataValidationException
*/
public abstract Document recreateOriginalManifestDocument(Item item) throws IOException, SQLException,
PackageException, MetadataValidationException;
/**
* Construct a LinkedHashMap containing maps of the url, content and related bitstream info (as appropriate) we need
* to construct the organisation section of the manifest
* @param contentBundle array of content bitstreams
* @param urlBundle array of url bitstreams
* @param relatedBundle TODO
* @return Map of Maps with bitstream info
*/
protected LinkedHashMap<String, Map<String, String>> getBitstreamInfoMap(Bundle[] contentBundle,
Bundle[] urlBundle, Bundle[] relatedBundle) {
LinkedHashMap<String, Map<String, String>> bsInfoMap = new LinkedHashMap<String, Map<String, String>>();
// The content bundle should always exists, although it might not contain any bitstreams
int contentBundleLength = contentBundle.length;
Bitstream[] contentBitstreams = null;
int contentBitstreamLength = 0;
if (contentBundleLength > 0) {
contentBitstreams = contentBundle[0].getBitstreams();
contentBitstreamLength = contentBitstreams.length;
}
int urlBundleLength = urlBundle.length;
int relatedBundleLength = relatedBundle.length;
if (contentBitstreamLength > 0 && urlBundleLength > 0) {
log.debug("Populate bsInfoMap with url and content bitstreams");
Bitstream[] urlBitstreams = urlBundle[0].getBitstreams();
int urlBitstreamLength = urlBitstreams.length;
Bitstream[] bitstreams = new Bitstream[contentBitstreamLength + urlBitstreamLength];
System.arraycopy(contentBitstreams, 0, bitstreams, 0, contentBitstreamLength);
System.arraycopy(urlBitstreams, 0, bitstreams, contentBitstreamLength, urlBitstreamLength);
populateBitstreamInfo(bsInfoMap, bitstreams);
} else if (contentBitstreamLength > 0 && urlBundleLength == 0) {
log.debug("Populate bsInfoMap with content bitstreams");
populateBitstreamInfo(bsInfoMap, contentBundle[0].getBitstreams());
} else if (contentBitstreamLength == 0 && urlBundleLength > 0) {
log.debug("Populate bsInfoMap with url bitstreams");
populateBitstreamInfo(bsInfoMap, urlBundle[0].getBitstreams());
} else if (relatedBundleLength > 0) {
log.debug("Populate bsInfoMap with related bitstreams");
populateBitstreamInfo(bsInfoMap, relatedBundle[0].getBitstreams());
}
return bsInfoMap;
}
/**
* Put details of each bitstream in a LinkedHashMap
* @param bsInfoMap
* @param bitstreams
*/
protected void populateBitstreamInfo(LinkedHashMap<String, Map<String, String>> bsInfoMap, Bitstream[] bitstreams) {
for (Bitstream bitstream : bitstreams) {
Map<String, String> bitstreamDetails = new HashMap<String, String>();
String checksum = bitstream.getChecksum();
String bsName = bitstream.getName();
bitstreamDetails.put(IDENTIFIER_ATTR, new StringBuilder(ITEM_ID_PREFIX).append(checksum).toString());
bitstreamDetails
.put(IDENTIFIER_REF_ATTR, new StringBuilder(RESOURCE_ID_PREFIX).append(checksum).toString());
String source = bitstream.getSource();
if (source != null) {
if (source.equals(Constants.RELATED_CONTENT_PACKAGE_BUNDLE)) {
// For related bundles, determine the url, rather than just the handle
log.debug("Related bitstream found - work out url");
bitstreamDetails.put(HREF_ATTR, new StringBuilder(ConfigurationManager.getProperty("dspace.url"))
.append(File.separator).append(bsName).toString());
} else {
bitstreamDetails.put(HREF_ATTR, bsName);
}
} else {
bitstreamDetails.put(HREF_ATTR, bsName);
}
bitstreamDetails.put(MATCHED_KEY, "false");
bsInfoMap.put(bsName, bitstreamDetails);
}
}
public String matchCCLicenceUrl(String licenceText) throws MetadataValidationException {
String url = null;
if (licenceText == null) {
// throw an excpetion - we can't get a licence if the text is null
throw new MetadataValidationException("Null licence text supplied");
}
String ccUrlRegex = ConfigurationManager.getProperty(CONFIG_LICENCE_URL_REGEX);
if (ccUrlRegex == null) {
log.warn("Configuration property '" + CONFIG_LICENCE_URL_REGEX + "' not set - using default regex '"
+ DEFAULT_CC_URL_REGEX + "'");
}
// Pull out the URL
log.debug("Attempting to pull out licence url from string " + licenceText);
log.debug("Using regex: " + ccUrlRegex);
// Obtain a Matcher instance from the regex - no flags tp Patter - than can be supplied as modifiers in the pattern eg (?s)
Matcher matcher = Pattern.compile(ccUrlRegex).matcher(licenceText);
// Now run the regex against the string
if (!matcher.matches()) {
// Couldn't match pattern
log.debug("Matcher returned false - pattern was not found");
} else {
int matchingGroups = matcher.groupCount();
log.debug("Matcher found " + matchingGroups + " groups");
String groupToMatch = ConfigurationManager.getProperty(CONFIG_LICENCE_URL_REGEX_GROUP);
int groupToMatchVal = 1;
if (groupToMatch == null) {
log.warn("Configuration property '" + CONFIG_LICENCE_URL_REGEX_GROUP
+ "' not set. Will therefore attempt to match on group 1");
} else {
try {
groupToMatchVal = Integer.parseInt(groupToMatch);
if (groupToMatchVal < 0) {
throw new NumberFormatException();
}
} catch (NumberFormatException e) {
log.error("Invalid number specified for configuration property '"
+ CONFIG_LICENCE_URL_REGEX_GROUP + "' - using 1 instead.");
ExceptionLogger.logException(log, e);
}
}
if (groupToMatchVal <= matchingGroups) {
// Pull out the url using the group specified - this can return null if no match found
url = matcher.group(groupToMatchVal);
}
}
if (url != null) {
log.debug("Found licence URL: " + url);
} else {
log.debug("Licence URL not found - regex either did not match or invalid group specified.");
}
return url;
}
/**
* @return the manifest
*/
public Element getRootElement() {
return manifestRoot;
}
/**
* @return the manifestDocument
*/
public Document getManifestDocument() {
return manifestDocument;
}
/** Find crosswalk for the indicated metadata type (e.g. "DC", "MODS")
* The crosswalk plugin name MAY be indirected in config file,
* through an entry like
* ims.submission.crosswalk.{mdType} = {pluginName}
* e.g.
* ims.submission.crosswalk.DC = mysite-QDC
*/
public IngestionCrosswalk getCrosswalk(String type) {
String xwalkName = ConfigurationManager.getProperty(CONFIG_METADATA_PREFIX + type);
if (xwalkName == null)
xwalkName = type;
return (IngestionCrosswalk) PluginManager.getNamedPlugin(IngestionCrosswalk.class, xwalkName);
}
// GWaller 3/2/10 IssueID #175 Added method to return the XML as an input stream
public InputStream getXmlAsStream() throws UnsupportedEncodingException {
InputStream stream = null;
XMLOutputter outputPretty = new XMLOutputter(Format.getPrettyFormat());
String xml = outputPretty.outputString(manifestDocument);
// GWaller IssueID #484 XML from the XMLOutputter should be treated as UTF-8 encoded
stream = new ByteArrayInputStream(xml.getBytes("UTF-8"));
return stream;
}
public abstract List<Element> getResources() throws MetadataValidationException;
public abstract List<Element> getMetadataElements() throws MetadataValidationException;
public abstract MetadataFormat getMetadataFormat() throws MetadataFormatException, MetadataValidationException;
public abstract List<Element> getMetadataElements(Element mdRootNode) throws MetadataValidationException;
public abstract MetadataFormat getMetadataFormat(Element mdRootNode) throws MetadataFormatException,
MetadataValidationException;
// GWaller 02/02/09 IssueID #175 Xpath selector for the IMS metadata element
public abstract String getRootMetadataElementXpathSelector(MetadataFormat mdFormat) throws Exception;
// GWaller 02/02/09 IssueID #175 Add method to return the prefix for metadata elements currently used in the manifest
public String getMetadataElementPrefix() {
if (this.metadataPrefix == null) {
try {
List<Element> mdElements = getMetadataElements();
if (mdElements != null && mdElements.size() > 0) {
this.metadataPrefix = mdElements.get(0).getNamespacePrefix();
}
} catch (Exception e) {
}
}
// Could not detect it - simply use a null string
if (this.metadataPrefix == null) {
this.metadataPrefix = "";
}
return this.metadataPrefix;
}
/**
* Decide whether to create a new manifest or re-populate the original manifest
*
* @throws PluginInstantiationException
* @throws PackageException
* @throws CrosswalkException
* @throws IOException
* @throws SQLException
*/
public void populate() throws PluginInstantiationException, PackageException, CrosswalkException, IOException,
SQLException {
if (metadata.length == 0) {
log.debug("Not originally a content package - create a new manifest");
createNewManifest();
} else {
log.debug("Was a content package - have to read original manifest");
// Reconstruct and check the original manifest
reconstructManifest(recreateOriginalManifestDocument(item));
}
}
}