/* * METSExport.java * * Version: $Revision: 3739 $ * * Date: $Date: 2009-04-27 22:26:36 +0000 (Mon, 27 Apr 2009) $ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.app.mets; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URLEncoder; import java.sql.SQLException; import java.util.Date; import java.util.Properties; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; import org.dspace.authorize.AuthorizeException; import org.dspace.authorize.AuthorizeManager; import org.dspace.content.Bitstream; import org.dspace.content.BitstreamFormat; import org.dspace.content.Bundle; import org.dspace.content.Collection; import org.dspace.content.DCValue; import org.dspace.content.DSpaceObject; import org.dspace.content.Item; import org.dspace.content.ItemIterator; import org.dspace.core.ConfigurationManager; import org.dspace.core.Constants; import org.dspace.core.Context; import org.dspace.core.Utils; import org.dspace.handle.HandleManager; import org.dspace.app.util.Util; import edu.harvard.hul.ois.mets.Agent; import edu.harvard.hul.ois.mets.AmdSec; import edu.harvard.hul.ois.mets.BinData; import edu.harvard.hul.ois.mets.Checksumtype; import edu.harvard.hul.ois.mets.Div; import edu.harvard.hul.ois.mets.DmdSec; import edu.harvard.hul.ois.mets.FLocat; import edu.harvard.hul.ois.mets.FileGrp; import edu.harvard.hul.ois.mets.FileSec; import edu.harvard.hul.ois.mets.Loctype; import edu.harvard.hul.ois.mets.MdWrap; import edu.harvard.hul.ois.mets.Mdtype; import edu.harvard.hul.ois.mets.Mets; import edu.harvard.hul.ois.mets.MetsHdr; import edu.harvard.hul.ois.mets.Name; import edu.harvard.hul.ois.mets.RightsMD; import edu.harvard.hul.ois.mets.Role; import edu.harvard.hul.ois.mets.StructMap; import edu.harvard.hul.ois.mets.Type; import edu.harvard.hul.ois.mets.XmlData; import edu.harvard.hul.ois.mets.helper.Base64; import edu.harvard.hul.ois.mets.helper.MetsException; import edu.harvard.hul.ois.mets.helper.MetsValidator; import edu.harvard.hul.ois.mets.helper.MetsWriter; import edu.harvard.hul.ois.mets.helper.PCData; import edu.harvard.hul.ois.mets.helper.PreformedXML; /** * Tool for exporting DSpace AIPs with the metadata serialised in METS format * * @author Robert Tansley * @version $Revision: 3739 $ */ public class METSExport { private static int licenseFormat = -1; private static Properties dcToMODS; public static void main(String[] args) throws Exception { Context context = new Context(); init(context); // create an options object and populate it CommandLineParser parser = new PosixParser(); Options options = new Options(); options.addOption("c", "collection", true, "Handle of collection to export"); options.addOption("i", "item", true, "Handle of item to export"); options.addOption("a", "all", false, "Export all items in the archive"); options.addOption("d", "destination", true, "Destination directory"); options.addOption("h", "help", false, "Help"); CommandLine line = parser.parse(options, args); if (line.hasOption('h')) { HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("metsexport", options); System.out .println("\nExport a collection: metsexport -c hdl:123.456/789"); System.out .println("Export an item: metsexport -i hdl:123.456/890"); System.out.println("Export everything: metsexport -a"); System.exit(0); } String dest = ""; if (line.hasOption('d')) { dest = line.getOptionValue('d'); // Make sure it ends with a file separator if (!dest.endsWith(File.separator)) { dest = dest + File.separator; } } if (line.hasOption('i')) { String handle = getHandleArg(line.getOptionValue('i')); // Exporting a single item DSpaceObject o = HandleManager.resolveToObject(context, handle); if ((o != null) && o instanceof Item) { writeAIP(context, (Item) o, dest); System.exit(0); } else { System.err.println(line.getOptionValue('i') + " is not a valid item Handle"); System.exit(1); } } ItemIterator items = null; try { if (line.hasOption('c')) { String handle = getHandleArg(line.getOptionValue('c')); // Exporting a collection's worth of items DSpaceObject o = HandleManager.resolveToObject(context, handle); if ((o != null) && o instanceof Collection) { items = ((Collection) o).getItems(); } else { System.err.println(line.getOptionValue('c') + " is not a valid collection Handle"); System.exit(1); } } if (line.hasOption('a')) { items = Item.findAll(context); } if (items == null) { System.err.println("Nothing to export specified!"); System.exit(1); } while (items.hasNext()) { writeAIP(context, items.next(), dest); } } finally { if (items != null) items.close(); } context.abort(); System.exit(0); } /** * Initialise various variables, read in config etc. * * @param context * DSpace context */ private static void init(Context context) throws SQLException, IOException { // Don't init again if initialised already if (licenseFormat != -1) { return; } // Find the License format BitstreamFormat bf = BitstreamFormat.findByShortDescription(context, "License"); licenseFormat = bf.getID(); // get path to DC->MODS map info file String configFile = ConfigurationManager.getProperty("dspace.dir") + File.separator + "config" + File.separator + "dc2mods.cfg"; // Read it in InputStream is = null; try { is = new FileInputStream(configFile); dcToMODS = new Properties(); dcToMODS.load(is); } finally { if (is != null) try { is.close(); } catch (IOException ioe) { } } } /** * Write out the AIP for the given item to the given directory. A new * directory will be created with the Handle (URL-encoded) as the directory * name, and inside, a mets.xml file written, together with the bitstreams. * * @param context * DSpace context to use * @param item * Item to write * @param dest * destination directory */ public static void writeAIP(Context context, Item item, String dest) throws SQLException, IOException, AuthorizeException, MetsException { System.out.println("Exporting item hdl:" + item.getHandle()); // Create aip directory java.io.File aipDir = new java.io.File(dest + URLEncoder.encode("hdl:" + item.getHandle(), "UTF-8")); if (!aipDir.mkdir()) { // Couldn't make the directory for some reason throw new IOException("Couldn't create " + aipDir.toString()); } // Write the METS file FileOutputStream out = new FileOutputStream(aipDir.toString() + java.io.File.separator + "mets.xml"); writeMETS(context, item, out, false); out.close(); // Write bitstreams Bundle[] bundles = item.getBundles(); for (int i = 0; i < bundles.length; i++) { Bitstream[] bitstreams = bundles[i].getBitstreams(); for (int b = 0; b < bitstreams.length; b++) { // Skip license bitstream and unauthorized resources if ((bitstreams[b].getFormat().getID() != licenseFormat) && AuthorizeManager.authorizeActionBoolean(context, bitstreams[b], Constants.READ)) { out = new FileOutputStream(aipDir.toString() + java.io.File.separator + bitstreams[b].getName()); InputStream in = bitstreams[b].retrieve(); Utils.bufferedCopy(in, out); out.close(); in.close(); } } } } /** * Write METS metadata corresponding to the metadata for an item * * @param context * DSpace context * @param item * DSpace item to create METS object for * @param os * A stream to write METS package to (UTF-8 encoding will be used) * @param fullURL * if <code>true</code>, the <FLocat> values for each * bitstream will be the full URL for that bitstream. Otherwise, * only the filename itself will be used. */ public static void writeMETS(Context context, Item item, OutputStream os, boolean fullURL) throws SQLException, IOException, AuthorizeException { try { init(context); // Create the METS file Mets mets = new Mets(); // Top-level stuff mets.setOBJID("hdl:" + item.getHandle()); mets.setLABEL("DSpace Item"); mets.setSchema("mods", "http://www.loc.gov/mods/v3", "http://www.loc.gov/standards/mods/v3/mods-3-0.xsd"); // MetsHdr MetsHdr metsHdr = new MetsHdr(); metsHdr.setCREATEDATE(new Date()); // FIXME: CREATEDATE is now: // maybe should be item create // date? // Agent Agent agent = new Agent(); agent.setROLE(Role.CUSTODIAN); agent.setTYPE(Type.ORGANIZATION); Name name = new Name(); name.getContent() .add( new PCData(ConfigurationManager .getProperty("dspace.name"))); agent.getContent().add(name); metsHdr.getContent().add(agent); mets.getContent().add(metsHdr); DmdSec dmdSec = new DmdSec(); dmdSec.setID("DMD_hdl_" + item.getHandle()); MdWrap mdWrap = new MdWrap(); mdWrap.setMDTYPE(Mdtype.MODS); XmlData xmlData = new XmlData(); createMODS(item, xmlData); mdWrap.getContent().add(xmlData); dmdSec.getContent().add(mdWrap); mets.getContent().add(dmdSec); // amdSec AmdSec amdSec = new AmdSec(); amdSec.setID("TMD_hdl_" + item.getHandle()); // FIXME: techMD here // License as <rightsMD><mdWrap><binData>base64encoded</binData>... InputStream licenseStream = findLicense(context, item); if (licenseStream != null) { RightsMD rightsMD = new RightsMD(); MdWrap rightsMDWrap = new MdWrap(); rightsMDWrap.setMIMETYPE("text/plain"); rightsMDWrap.setMDTYPE(Mdtype.OTHER); rightsMDWrap.setOTHERMDTYPE("TEXT"); BinData binData = new BinData(); Base64 base64 = new Base64(licenseStream); binData.getContent().add(base64); rightsMDWrap.getContent().add(binData); rightsMD.getContent().add(rightsMDWrap); amdSec.getContent().add(rightsMD); } // FIXME: History data???? Nooooo!!!! mets.getContent().add(amdSec); // fileSec FileSec fileSec = new FileSec(); boolean fileSecEmpty = true; Bundle[] bundles = item.getBundles(); for (int i = 0; i < bundles.length; i++) { Bitstream[] bitstreams = bundles[i].getBitstreams(); // Unusual condition, but if no bitstreams, skip this bundle if (bitstreams.length == 0) { continue; } // First: we skip the license bundle, since it's included // elsewhere if (bitstreams[0].getFormat().getID() == licenseFormat) { continue; } // Create a fileGrp FileGrp fileGrp = new FileGrp(); // Bundle name for USE attribute if ((bundles[i].getName() != null) && !bundles[i].getName().equals("")) { fileGrp.setUSE(bundles[i].getName()); } for (int bits = 0; bits < bitstreams.length; bits++) { // What's the persistent(-ish) ID? String bitstreamPID = ConfigurationManager .getProperty("dspace.url") + "/bitstream/" + item.getHandle() + "/" + bitstreams[bits].getSequenceID() + "/" + Util.encodeBitstreamName(bitstreams[bits].getName(), "UTF-8"); edu.harvard.hul.ois.mets.File file = new edu.harvard.hul.ois.mets.File(); /* * ID: we use the unique part of the persistent ID, i.e. the * Handle + sequence number, but with _'s instead of /'s so * it's a legal xsd:ID. */ String xmlIDstart = item.getHandle().replaceAll("/", "_") + "_"; file.setID(xmlIDstart + bitstreams[bits].getSequenceID()); String groupID = "GROUP_" + xmlIDstart + bitstreams[bits].getSequenceID(); /* * If we're in THUMBNAIL or TEXT bundles, the bitstream is * extracted text or a thumbnail, so we use the name to work * out which bitstream to be in the same group as */ if ((bundles[i].getName() != null) && (bundles[i].getName().equals("THUMBNAIL") || bundles[i] .getName().equals("TEXT"))) { // Try and find the original bitstream, and chuck the // derived // bitstream in the same group Bitstream original = findOriginalBitstream(item, bitstreams[bits]); if (original != null) { groupID = "GROUP_" + xmlIDstart + original.getSequenceID(); } } file.setGROUPID(groupID); file.setOWNERID(bitstreamPID); // FIXME: ADMID should point to appropriate TechMD section // above file .setMIMETYPE(bitstreams[bits].getFormat() .getMIMEType()); // FIXME: CREATED: no date file.setSIZE(bitstreams[bits].getSize()); file.setCHECKSUM(bitstreams[bits].getChecksum()); file.setCHECKSUMTYPE(Checksumtype.MD5); // FLocat: filename is as in records, or full URL // FIXME: Duplicate filenames and characters illegal to // local OS may cause problems FLocat flocat = new FLocat(); flocat.setLOCTYPE(Loctype.URL); if (fullURL) { flocat.setXlinkHref(bitstreamPID); } else { flocat.setXlinkHref(bitstreams[bits].getName()); } // Add FLocat to File, and File to FileGrp file.getContent().add(flocat); fileGrp.getContent().add(file); } // Add fileGrp to fileSec fileSec.getContent().add(fileGrp); fileSecEmpty = false; } // Add fileSec to document if (!fileSecEmpty) { mets.getContent().add(fileSec); } // FIXME: Add Structmap here, but it is empty and we won't use it now. StructMap structMap = new StructMap(); Div div = new Div(); structMap.getContent().add(div); mets.getContent().add(structMap); mets.validate(new MetsValidator()); mets.write(new MetsWriter(os)); } catch (MetsException e) { // We don't pass up a MetsException, so callers don't need to // know the details of the METS toolkit e.printStackTrace(); throw new IOException(e.getMessage()); } } /** * Utility to find the license bitstream from an item * * @param context * DSpace context * @param item * the item * @return the license as a string * * @throws IOException * if the license bitstream can't be read */ private static InputStream findLicense(Context context, Item item) throws SQLException, IOException, AuthorizeException { Bundle[] bundles = item.getBundles(); for (int i = 0; i < bundles.length; i++) { // Assume license will be in its own bundle Bitstream[] bitstreams = bundles[i].getBitstreams(); if (bitstreams.length > 0) { if (bitstreams[0].getFormat().getID() == licenseFormat) { // Read the license into a string return bitstreams[0].retrieve(); } } } // Oops! No license! return null; } /** * For a bitstream that's a thumbnail or extracted text, find the * corresponding bitstream in the ORIGINAL bundle * * @param item * the item we're dealing with * @param derived * the derived bitstream * * @return the corresponding original bitstream (or null) */ private static Bitstream findOriginalBitstream(Item item, Bitstream derived) throws SQLException { Bundle[] bundles = item.getBundles(); // Filename of original will be filename of the derived bitstream // minus the extension (last 4 chars - .jpg or .txt) String originalFilename = derived.getName().substring(0, derived.getName().length() - 4); // First find "original" bundle for (int i = 0; i < bundles.length; i++) { if ((bundles[i].getName() != null) && bundles[i].getName().equals("ORIGINAL")) { // Now find the corresponding bitstream Bitstream[] bitstreams = bundles[i].getBitstreams(); for (int bsnum = 0; bsnum < bitstreams.length; bsnum++) { if (bitstreams[bsnum].getName().equals(originalFilename)) { return bitstreams[bsnum]; } } } } // Didn't find it return null; } /** * Create MODS metadata from the DC in the item, and add to the given * XmlData METS object. * * @param item * the item * @param xmlData * xmlData to add MODS to. */ private static void createMODS(Item item, XmlData xmlData) { DCValue[] dc = item.getDC(Item.ANY, Item.ANY, Item.ANY); StringBuffer modsXML = new StringBuffer(); for (int i = 0; i < dc.length; i++) { // Get the property name - element[.qualifier] String propName = ((dc[i].qualifier == null) ? dc[i].element : (dc[i].element + "." + dc[i].qualifier)); String modsMapping = dcToMODS.getProperty(propName); if (modsMapping == null) { System.err.println("WARNING: No MODS mapping for " + propName); } else { String value = dc[i].value; // Replace all $'s with \$ so it doesn't trip up the replaceAll! if (value != null && value.length() > 0) { // RegExp note: Yes, there really does need to be this many backslashes! // To have \$ inserted in the replacement, both the backslash and the dollar // have to be escaped (backslash) - so the replacemenet string has to be // passed as \\\$. All of those backslashes then have to escaped in the literal // for them to be in string used!!! value = dc[i].value.replaceAll("\\$", "\\\\\\$"); } if (!(("description.provenance".equals(propName)) && ((ConfigurationManager.getBooleanProperty("oai.mets.hide-provenance", false))))) { // Replace '%s' with DC value (with entities encoded) modsXML.append(modsMapping.replaceAll("%s", Utils .addEntities(value))); modsXML.append("\n"); // For readability } } } PreformedXML pXML = new PreformedXML(modsXML.toString()); xmlData.getContent().add(pXML); } /** * Get the handle from the command line in the form 123.456/789. Doesn't * matter if incoming handle has 'hdl:' or 'http://hdl....' before it. * * @param original * Handle as passed in by user * @return Handle as can be looked up in our table */ private static String getHandleArg(String original) { if (original.startsWith("hdl:")) { return original.substring(4); } if (original.startsWith("http://hdl.handle.net/")) { return original.substring(22); } return original; } }