/* * Harvest.java * * Version: $Revision: 1 $ * * Date: $Date: 2007-11-28 15:07:34 -0600 (Wed, 28 Nov 2007) $ * * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts * Institute of Technology. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the Hewlett-Packard Company nor the name of the * Massachusetts Institute of Technology nor the names of their * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
*/ package org.dspace.app.harvest; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.FilenameFilter; import java.io.IOException; import java.io.PrintWriter; import java.sql.SQLException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.StringTokenizer; import java.util.Vector; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; import org.apache.xpath.XPathAPI; import org.dspace.app.itemimport.ItemImport; import org.dspace.authorize.AuthorizeException; import org.dspace.authorize.AuthorizeManager; import org.dspace.authorize.ResourcePolicy; import org.dspace.browse.IndexBrowse; import org.dspace.content.Bitstream; import org.dspace.content.BitstreamFormat; import org.dspace.content.Bundle; import org.dspace.content.Collection; import org.dspace.content.DSpaceObject; import org.dspace.content.FormatIdentifier; import org.dspace.harvest.HarvestedCollection; import org.dspace.content.InstallItem; import org.dspace.content.Item; import org.dspace.content.ItemIterator; import org.dspace.content.MetadataField; import org.dspace.content.MetadataSchema; import org.dspace.harvest.OAIHarvester; import org.dspace.content.WorkspaceItem; import org.dspace.harvest.OAIHarvester.HarvestingException; import org.dspace.content.crosswalk.CrosswalkException; import org.dspace.core.ConfigurationManager; import org.dspace.core.Constants; import org.dspace.core.Context; import 
org.dspace.eperson.EPerson;
import org.dspace.eperson.Group;
import org.dspace.handle.HandleManager;
import org.dspace.workflow.WorkflowManager;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Command-line test/administration tool for harvested collections.
 * <p>
 * Exactly one command flag selects the action: set a collection up for
 * harvesting (-s), run one harvest cycle (-r), start the harvest scheduler
 * (-S), reset the harvest status of all collections (-R), purge one collection
 * (-p) or purge every harvestable collection (-P). Run with -h for the full
 * usage text. If several command flags are given, the one checked last in
 * {@link #main(String[])} wins.
 *
 * @author Alexey Maslov
 */
public class Harvest
{
    /** DSpace context shared by every command; created in {@link #main(String[])}. */
    private static Context context;

    /**
     * Parses the command line, validates the arguments required by the
     * selected command and dispatches to the matching method.
     *
     * @param argv command-line arguments
     * @throws Exception if the command line cannot be parsed, the context
     *             cannot be created, or a command propagates a failure
     */
    public static void main(String[] argv) throws Exception
    {
        // create an options object and populate it
        CommandLineParser parser = new PosixParser();

        Options options = new Options();

        options.addOption("p", "purge", false, "delete all items in the collection");
        options.addOption("r", "run", false, "run the standard harvest procedure");
        options.addOption("g", "ping", false, "test the OAI server and set");
        options.addOption("o", "once", false, "run the harvest procedure with specified parameters");
        options.addOption("s", "setup", false, "Set the collection up for harvesting");
        options.addOption("S", "start", false, "start the harvest loop");
        options.addOption("R", "reset", false, "reset harvest status on all collections");
        // Must not reuse the long name "purge" that -p already owns, otherwise
        // "--purge" is ambiguous between purging one collection and all of them.
        options.addOption("P", "purgeAll", false, "purge all harvestable collections");

        options.addOption("e", "eperson", true, "eperson");
        options.addOption("c", "collection", true, "harvesting collection (handle or id)");
        options.addOption("t", "type", true, "type of harvesting (0 for none)");
        options.addOption("a", "address", true, "address of the OAI-PMH server");
        options.addOption("i", "oai_set_id", true, "id of the PMH set representing the harvested collection");
        options.addOption("m", "metadata_format", true, "the name of the desired metadata format for harvesting, resolved to namespace and crosswalk in dspace.cfg");

        options.addOption("h", "help", false, "help");

        CommandLine line = parser.parse(options, argv);

        String command = null;
        String eperson = null;
        String collection = null;
        String oaiSource = null;
        String oaiSetID = null;
        String metadataKey = null;
        int harvestType = 0;

        if (line.hasOption('h'))
        {
            HelpFormatter myhelp = new HelpFormatter();
            myhelp.printHelp("Harvest\n", options);
            // The OAI server address is passed with -a (-s selects the setup command).
            System.out.println("\nPING OAI server: Harvest -g -a oai_source -i oai_set_id");
            System.out.println("RUNONCE harvest with arbitrary options: Harvest -o -e eperson -c collection -t harvest_type -a oai_source -i oai_set_id -m metadata_format");
            System.out.println("SETUP a collection for harvesting: Harvest -s -c collection -t harvest_type -a oai_source -i oai_set_id -m metadata_format");
            System.out.println("RUN harvest once: Harvest -r -e eperson -c collection");
            System.out.println("START harvest scheduler: Harvest -S");
            System.out.println("RESET all harvest status: Harvest -R");
            System.out.println("PURGE a collection of items and settings: Harvest -p -e eperson -c collection");
            System.out.println("PURGE all harvestable collections: Harvest -P -e eperson");

            System.exit(0);
        }

        // Command flags; when several are present the last matching check wins.
        if (line.hasOption('s'))
        {
            command = "config";
        }
        if (line.hasOption('p'))
        {
            command = "purge";
        }
        if (line.hasOption('r'))
        {
            command = "run";
        }
        if (line.hasOption('g'))
        {
            command = "ping";
        }
        if (line.hasOption('o'))
        {
            command = "runOnce";
        }
        if (line.hasOption('S'))
        {
            command = "start";
        }
        if (line.hasOption('R'))
        {
            command = "reset";
        }
        if (line.hasOption('P'))
        {
            command = "purgeAll";
        }

        // Command arguments
        if (line.hasOption('e'))
        {
            eperson = line.getOptionValue('e');
        }
        if (line.hasOption('c'))
        {
            collection = line.getOptionValue('c');
        }
        if (line.hasOption('t'))
        {
            String typeValue = line.getOptionValue('t');
            try
            {
                harvestType = Integer.parseInt(typeValue);
            }
            catch (NumberFormatException nfe)
            {
                // Report bad input as a usage error instead of a raw stack trace.
                System.out.println("Error - the harvest type must be an integer, got '" + typeValue + "'");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }
        }
        if (line.hasOption('a'))
        {
            oaiSource = line.getOptionValue('a');
        }
        if (line.hasOption('i'))
        {
            oaiSetID = line.getOptionValue('i');
        }
        if (line.hasOption('m'))
        {
            metadataKey = line.getOptionValue('m');
        }

        // Reject an empty command before opening a context that would never be used.
        if (command == null)
        {
            System.out.println("Error - no parameters specified (run with -h flag for details)");
            System.exit(1);
        }

        // Instantiate our class
        Harvest harvester = new Harvest();
        context = new Context();

        // Run a single harvest cycle on a collection using saved settings.
        if (command.equals("run"))
        {
            if (collection == null || eperson == null)
            {
                System.out.println("Error - a target collection and eperson must be provided");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }

            harvester.runHarvest(collection, eperson);
        }
        // start the harvest loop
        else if (command.equals("start"))
        {
            startHarvester();
        }
        // reset harvesting status
        else if (command.equals("reset"))
        {
            resetHarvesting();
        }
        // purge all collections that are set up for harvesting (obviously for testing purposes only)
        else if (command.equals("purgeAll"))
        {
            if (eperson == null)
            {
                System.out.println("Error - an eperson must be provided");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }

            List<Integer> cids = HarvestedCollection.findAll(context);
            System.out.println("Purging the following collections (deleting items and resetting harvest status): " + cids.toString());
            for (Integer cid : cids)
            {
                harvester.purgeCollection(cid.toString(), eperson);
            }
            context.complete();
        }
        // Delete all items in a collection. Useful for testing fresh harvests.
        else if (command.equals("purge"))
        {
            if (collection == null || eperson == null)
            {
                System.out.println("Error - a target collection and eperson must be provided");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }

            harvester.purgeCollection(collection, eperson);
            context.complete();
        }
        // Configure a collection with the three main settings
        else if (command.equals("config"))
        {
            if (collection == null)
            {
                System.out.println("Error - a target collection must be provided");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }
            if (oaiSource == null || oaiSetID == null)
            {
                System.out.println("Error - both the OAI server address and OAI set id must be specified");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }
            if (metadataKey == null)
            {
                System.out.println("Error - a metadata key (commonly the prefix) must be specified for this collection");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }

            harvester.configureCollection(collection, harvestType, oaiSource, oaiSetID, metadataKey);
        }
        else if (command.equals("ping"))
        {
            if (oaiSource == null || oaiSetID == null)
            {
                System.out.println("Error - both the OAI server address and OAI set id must be specified");
                System.out.println(" (run with -h flag for details)");
                System.exit(1);
            }

            // Only the arguments are validated here; say so rather than exit silently.
            System.out.println("Warning - the 'ping' command is not implemented; no request was sent to " + oaiSource);
        }
        else
        {
            // Reached by "runOnce", which is advertised in the help but has no handler.
            System.out.println("Warning - the '" + command + "' command is not implemented; nothing was done");
        }
    }

    /**
     * Resolves the given ID to a collection and verifies that the collection
     * is set up for harvesting; terminates the JVM with status 1 if it is not.
     *
     * @param collectionID handle or database ID of the collection
     * @return the resolved collection
     */
    private Collection checkCollection(String collectionID)
    {
        Collection collection = resolveCollection(collectionID);

        try
        {
            HarvestedCollection hc = HarvestedCollection.find(context, collection.getID());
            // find() can come back empty (configureCollection handles that case too),
            // which also means the collection is not set up for harvesting.
            if (hc == null || !hc.isHarvestable())
            {
                System.out.println("Collection '" + collection.getName() + "' is not set up for harvesting");
                System.exit(1);
            }
        }
        catch (SQLException se)
        {
            se.printStackTrace();
        }

        return collection;
    }

    /**
     * Resolves an ID into a collection. An ID containing a '/' is treated as a
     * handle, anything else as an integer database ID. If the ID is null, is
     * not a valid integer, does not resolve to a collection, or the lookup
     * fails, an error is printed and the JVM is terminated with status 1, so
     * this method never returns null.
     *
     * @param collectionID handle or database ID of the collection
     * @return the resolved collection
     */
    private Collection resolveCollection(String collectionID)
    {
        Collection targetCollection = null;

        if (collectionID != null)
        {
            try
            {
                // is the ID a handle?
                if (collectionID.indexOf('/') != -1)
                {
                    // string has a / so it must be a handle - try and resolve it
                    DSpaceObject dso = HandleManager.resolveToObject(context, collectionID);

                    // resolved, now make sure it's a collection
                    if (dso != null && dso.getType() == Constants.COLLECTION)
                    {
                        targetCollection = (Collection) dso;
                    }
                }
                // not a handle, try and treat it as an integer collection database ID
                else
                {
                    int id = Integer.parseInt(collectionID);
                    System.out.println("Looking up by id: " + collectionID + ", parsed as '" + id + "', " + "in context: " + context);
                    targetCollection = Collection.find(context, id);
                }
            }
            catch (NumberFormatException nfe)
            {
                // Not a handle and not a number: reported as unresolvable below.
            }
            catch (SQLException se)
            {
                // Lookup failed: show the cause, then report as unresolvable below.
                se.printStackTrace();
            }
        }

        // was the collection valid?
        if (targetCollection == null)
        {
            System.out.println("Cannot resolve " + collectionID + " to collection");
            System.exit(1);
        }

        return targetCollection;
    }

    /**
     * Stores the harvest settings for a collection, creating its
     * HarvestedCollection record if none exists, marks it as ready and
     * completes the context. Terminates the JVM with status 1 on failure.
     *
     * @param collectionID handle or database ID of the collection
     * @param type harvest type
     * @param oaiSource address of the OAI-PMH server
     * @param oaiSetId id of the OAI set
     * @param mdConfigId metadata format key
     */
    private void configureCollection(String collectionID, int type, String oaiSource, String oaiSetId, String mdConfigId)
    {
        System.out.println("Running: configure collection");

        Collection collection = resolveCollection(collectionID);
        System.out.println(collection.getID());

        try
        {
            HarvestedCollection hc = HarvestedCollection.find(context, collection.getID());
            if (hc == null)
            {
                hc = HarvestedCollection.create(context, collection.getID());
            }

            context.turnOffAuthorisationSystem();
            try
            {
                hc.setHarvestParams(type, oaiSource, oaiSetId, mdConfigId);
                hc.setHarvestStatus(HarvestedCollection.STATUS_READY);
                hc.update();
            }
            finally
            {
                // Exactly one restore for the one turn-off above.
                context.restoreAuthSystemState();
            }

            context.complete();
        }
        catch (Exception e)
        {
            System.out.println("Changes could not be committed");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Purges a collection of all harvest-related data and settings. All items
     * in the collection will be deleted. Terminates the JVM with status 1 on
     * failure.
     *
     * @param collectionID handle or database ID of the collection
     * @param email email address of the EPerson to set as the current user
     */
    private void purgeCollection(String collectionID, String email)
    {
        System.out.println("Purging collection of all items and reseting last_harvested and harvest_message: " + collectionID);
        Collection collection = resolveCollection(collectionID);

        try
        {
            EPerson eperson = EPerson.findByEmail(context, email);
            context.setCurrentUser(eperson);

            context.turnOffAuthorisationSystem();
            try
            {
                ItemIterator it = collection.getAllItems();
                IndexBrowse ib = new IndexBrowse(context);
                int i = 0;
                while (it.hasNext())
                {
                    i++;
                    Item item = it.next();
                    System.out.println("Deleting: " + item.getHandle());
                    ib.itemRemoved(item);
                    collection.removeItem(item);
                    // commit every 50 items
                    if (i % 50 == 0)
                    {
                        context.commit();
                        i = 0;
                    }
                }

                HarvestedCollection hc = HarvestedCollection.find(context, collection.getID());
                if (hc != null)
                {
                    hc.setHarvestResult(null, "");
                    hc.setHarvestStatus(HarvestedCollection.STATUS_READY);
                    hc.setHarvestStartTime(null);
                    hc.update();
                }
            }
            finally
            {
                // Exactly one restore for the one turn-off above.
                context.restoreAuthSystemState();
            }

            context.commit();
        }
        catch (Exception e)
        {
            System.out.println("Changes could not be committed");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Run a single harvest cycle on the specified collection under the
     * authorization of the supplied EPerson. Terminates the JVM with status 1
     * if the harvester cannot be initialized, the EPerson cannot be found, or
     * the harvest fails.
     *
     * @param collectionID handle or database ID of the collection
     * @param email email address of the EPerson to harvest as
     */
    private void runHarvest(String collectionID, String email)
    {
        System.out.println("Running: a harvest cycle on " + collectionID);

        System.out.print("Initializing the harvester... ");
        OAIHarvester harvester = null;
        try
        {
            Collection collection = resolveCollection(collectionID);
            HarvestedCollection hc = HarvestedCollection.find(context, collection.getID());
            harvester = new OAIHarvester(context, collection, hc);
            System.out.println("success. ");
        }
        catch (HarvestingException hex)
        {
            System.out.print("failed. ");
            System.out.println(hex.getMessage());
            System.exit(1);
        }
        catch (SQLException se)
        {
            // Without a harvester there is nothing to run; stop here instead of
            // failing later on a null reference.
            System.out.println("failed. ");
            se.printStackTrace();
            System.exit(1);
        }

        try
        {
            // Harvest will not work for an anonymous user
            EPerson eperson = EPerson.findByEmail(context, email);
            if (eperson == null)
            {
                System.out.println("Error - no eperson found for '" + email + "'");
                System.exit(1);
            }

            System.out.println("Harvest started... ");
            context.setCurrentUser(eperson);
            harvester.runHarvest();
            context.complete();
        }
        catch (Exception e)
        {
            // Not much else we can do at this point
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println("Harvest complete. ");
    }

    /**
     * Resets harvest_status and harvest_start_time flags for all collections
     * that have a row in the harvested_collections table.
     */
    private static void resetHarvesting()
    {
        System.out.print("Resetting harvest status flag on all collections... ");

        try
        {
            List<Integer> cids = HarvestedCollection.findAll(context);
            for (Integer cid : cids)
            {
                HarvestedCollection hc = HarvestedCollection.find(context, cid);
                hc.setHarvestStartTime(null);
                hc.setHarvestStatus(HarvestedCollection.STATUS_READY);
                hc.update();
            }
            context.commit();
            System.out.println("success. ");
        }
        catch (Exception ex)
        {
            System.out.println("failed. ");
            ex.printStackTrace();
        }
    }

    /**
     * Starts up the harvest scheduler. Terminating this process will stop the
     * scheduler.
     */
    private static void startHarvester()
    {
        try
        {
            System.out.print("Starting harvest loop... ");
            OAIHarvester.startNewScheduler();
            System.out.println("running. ");
        }
        catch (Exception ex)
        {
            ex.printStackTrace();
        }
    }
}