/* * MediaFilterManager.java * * Version: $Revision: 4503 $ * * Date: $Date: 2009-11-05 02:31:03 +0000 (Thu, 05 Nov 2009) $ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.app.mediafilter; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Enumeration; import java.util.HashMap; import java.util.Map; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.MissingArgumentException; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; import org.dspace.authorize.AuthorizeManager; import org.dspace.content.Bitstream; import org.dspace.content.BitstreamFormat; import org.dspace.content.Bundle; import org.dspace.content.Collection; import org.dspace.content.Community; import org.dspace.content.DCDate; import org.dspace.content.DSpaceObject; import org.dspace.content.Item; import org.dspace.content.ItemIterator; import org.dspace.core.ConfigurationManager; import org.dspace.core.Constants; import org.dspace.core.Context; import org.dspace.core.PluginManager; import org.dspace.core.SelfNamedPlugin; import org.dspace.handle.HandleManager; import org.dspace.search.DSIndexer; /** * MediaFilterManager is the class that invokes the media/format filters over the * repository's content. a few command line flags affect the operation of the * MFM: -v verbose outputs all extracted text to STDOUT; -f force forces all * bitstreams to be processed, even if they have been before; -n noindex does not * recreate index after processing bitstreams; -i [identifier] limits processing * scope to a community, collection or item; and -m [max] limits processing to a * maximum number of items. */ public class MediaFilterManager { //key (in dspace.cfg) which lists all enabled filters by name public static String MEDIA_FILTER_PLUGINS_KEY = "filter.plugins"; //prefix (in dspace.cfg) for all filter properties public static String FILTER_PREFIX = "filter"; //suffix (in dspace.cfg) for input formats supported by each filter public static String INPUT_FORMATS_SUFFIX = "inputFormats"; public static boolean updateIndex = true; // default to updating index public static boolean isVerbose = false; // default to not verbose public static boolean isQuiet = false; // default is noisy public static boolean isForce = false; // default to not forced public static String identifier = null; // object scope limiter public static int max2Process = Integer.MAX_VALUE; // maximum number items to process public static int processed = 0; // number items processed private static Item currentItem = null; // current item being processed private static FormatFilter[] filterClasses = null; private static Map filterFormats = new HashMap(); private static List skipList = null; //list of identifiers to skip during processing //separator in filterFormats Map between a filter class name and a plugin name, //for MediaFilters which extend SelfNamedPlugin (\034 is "file separator" char) public static String FILTER_PLUGIN_SEPARATOR = "\034"; public static void main(String[] argv) throws Exception { // set headless for non-gui workstations System.setProperty("java.awt.headless", "true"); // create an options object and populate it CommandLineParser parser = new PosixParser(); int status = 0; Options options = new Options(); options.addOption("v", "verbose", false, "print all extracted text and other details to STDOUT"); options.addOption("q", "quiet", false, "do not print anything except in the event of errors."); options.addOption("f", "force", false, "force all bitstreams to be processed"); options.addOption("n", "noindex", false, "do NOT update the search index after filtering bitstreams"); options.addOption("i", "identifier", true, "ONLY process bitstreams belonging to identifier"); options.addOption("m", "maximum", true, "process no more than maximum items"); options.addOption("h", "help", false, "help"); //create a "plugin" option (to specify specific MediaFilter plugins to run) OptionBuilder.withLongOpt("plugins"); OptionBuilder.withValueSeparator(','); OptionBuilder.withDescription( "ONLY run the specified Media Filter plugin(s)\n" + "listed from '" + MEDIA_FILTER_PLUGINS_KEY + "' in dspace.cfg.\n" + "Separate multiple with a comma (,)\n" + "(e.g. MediaFilterManager -p \n\"Word Text Extractor\",\"PDF Text Extractor\")"); Option pluginOption = OptionBuilder.create('p'); pluginOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args options.addOption(pluginOption); //create a "skip" option (to specify communities/collections/items to skip) OptionBuilder.withLongOpt("skip"); OptionBuilder.withValueSeparator(','); OptionBuilder.withDescription( "SKIP the bitstreams belonging to identifier\n" + "Separate multiple identifiers with a comma (,)\n" + "(e.g. MediaFilterManager -s \n 123456789/34,123456789/323)"); Option skipOption = OptionBuilder.create('s'); skipOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args options.addOption(skipOption); CommandLine line = null; try { line = parser.parse(options, argv); } catch(MissingArgumentException e) { System.out.println("ERROR: " + e.getMessage()); HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(1); } if (line.hasOption('h')) { HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(0); } if (line.hasOption('v')) { isVerbose = true; } isQuiet = line.hasOption('q'); if (line.hasOption('n')) { updateIndex = false; } if (line.hasOption('f')) { isForce = true; } if (line.hasOption('i')) { identifier = line.getOptionValue('i'); } if (line.hasOption('m')) { max2Process = Integer.parseInt(line.getOptionValue('m')); if (max2Process <= 1) { System.out.println("Invalid maximum value '" + line.getOptionValue('m') + "' - ignoring"); max2Process = Integer.MAX_VALUE; } } String filterNames[] = null; if(line.hasOption('p')) { //specified which media filter plugins we are using filterNames = line.getOptionValues('p'); if(filterNames==null || filterNames.length==0) { //display error, since no plugins specified System.err.println("\nERROR: -p (-plugin) option requires at least one plugin to be specified.\n" + "(e.g. MediaFilterManager -p \"Word Text Extractor\",\"PDF Text Extractor\")\n"); HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(1); } } else { //retrieve list of all enabled media filter plugins! String enabledPlugins = ConfigurationManager.getProperty(MEDIA_FILTER_PLUGINS_KEY); filterNames = enabledPlugins.split(",\\s*"); } //initialize an array of our enabled filters List filterList = new ArrayList(); //set up each filter for(int i=0; i< filterNames.length; i++) { //get filter of this name & add to list of filters FormatFilter filter = (FormatFilter) PluginManager.getNamedPlugin(FormatFilter.class, filterNames[i]); if(filter==null) { System.err.println("\nERROR: Unknown MediaFilter specified (either from command-line or in dspace.cfg): '" + filterNames[i] + "'"); System.exit(1); } else { filterList.add(filter); String filterClassName = filter.getClass().getName(); String pluginName = null; //If this filter is a SelfNamedPlugin, //then the input formats it accepts may differ for //each "named" plugin that it defines. //So, we have to look for every key that fits the //following format: filter.<class-name>.<plugin-name>.inputFormats if( SelfNamedPlugin.class.isAssignableFrom(filter.getClass()) ) { //Get the plugin instance name for this class pluginName = ((SelfNamedPlugin) filter).getPluginInstanceName(); } //Retrieve our list of supported formats from dspace.cfg //For SelfNamedPlugins, format of key is: // filter.<class-name>.<plugin-name>.inputFormats //For other MediaFilters, format of key is: // filter.<class-name>.inputFormats String formats = ConfigurationManager.getProperty( FILTER_PREFIX + "." + filterClassName + (pluginName!=null ? "." + pluginName : "") + "." + INPUT_FORMATS_SUFFIX); //add to internal map of filters to supported formats if (formats != null) { //For SelfNamedPlugins, map key is: // <class-name><separator><plugin-name> //For other MediaFilters, map key is just: // <class-name> filterFormats.put(filterClassName + (pluginName!=null ? FILTER_PLUGIN_SEPARATOR + pluginName : ""), Arrays.asList(formats.split(",[\\s]*"))); } }//end if filter!=null }//end for //If verbose, print out loaded mediafilter info if(isVerbose) { System.out.println("The following MediaFilters are enabled: "); java.util.Iterator i = filterFormats.keySet().iterator(); while(i.hasNext()) { String filterName = (String) i.next(); System.out.println("Full Filter Name: " + filterName); String pluginName = null; if(filterName.contains(FILTER_PLUGIN_SEPARATOR)) { String[] fields = filterName.split(FILTER_PLUGIN_SEPARATOR); filterName=fields[0]; pluginName=fields[1]; } System.out.println(filterName + (pluginName!=null? " (Plugin: " + pluginName + ")": "")); } } //store our filter list into an internal array filterClasses = (FormatFilter[]) filterList.toArray(new FormatFilter[filterList.size()]); //Retrieve list of identifiers to skip (if any) String skipIds[] = null; if(line.hasOption('s')) { //specified which identifiers to skip when processing skipIds = line.getOptionValues('s'); if(skipIds==null || skipIds.length==0) { //display error, since no identifiers specified to skip System.err.println("\nERROR: -s (-skip) option requires at least one identifier to SKIP.\n" + "Make sure to separate multiple identifiers with a comma!\n" + "(e.g. MediaFilterManager -s 123456789/34,123456789/323)\n"); HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(0); } //save to a global skip list skipList = Arrays.asList(skipIds); } Context c = null; try { c = new Context(); // have to be super-user to do the filtering c.setIgnoreAuthorization(true); // now apply the filters if (identifier == null) { applyFiltersAllItems(c); } else // restrict application scope to identifier { DSpaceObject dso = HandleManager.resolveToObject(c, identifier); if (dso == null) { throw new IllegalArgumentException("Cannot resolve " + identifier + " to a DSpace object"); } switch (dso.getType()) { case Constants.COMMUNITY: applyFiltersCommunity(c, (Community)dso); break; case Constants.COLLECTION: applyFiltersCollection(c, (Collection)dso); break; case Constants.ITEM: applyFiltersItem(c, (Item)dso); break; } } // update search index? if (updateIndex) { if (!isQuiet) System.out.println("Updating search index:"); DSIndexer.updateIndex(c); } c.complete(); c = null; } catch (Exception e) { status = 1; } finally { if (c != null) { c.abort(); } } System.exit(status); } public static void applyFiltersAllItems(Context c) throws Exception { if(skipList!=null) { //if a skip-list exists, we need to filter community-by-community //so we can respect what is in the skip-list Community[] topLevelCommunities = Community.findAllTop(c); for(int i=0; i<topLevelCommunities.length; i++) applyFiltersCommunity(c, topLevelCommunities[i]); } else { //otherwise, just find every item and process ItemIterator i = Item.findAll(c); try { while (i.hasNext() && processed < max2Process) { applyFiltersItem(c, i.next()); } } finally { if (i != null) i.close(); } } } public static void applyFiltersCommunity(Context c, Community community) throws Exception { //only apply filters if community not in skip-list if(!inSkipList(community.getHandle())) { Community[] subcommunities = community.getSubcommunities(); for (int i = 0; i < subcommunities.length; i++) { applyFiltersCommunity(c, subcommunities[i]); } Collection[] collections = community.getCollections(); for (int j = 0; j < collections.length; j++) { applyFiltersCollection(c, collections[j]); } } } public static void applyFiltersCollection(Context c, Collection collection) throws Exception { //only apply filters if collection not in skip-list if(!inSkipList(collection.getHandle())) { ItemIterator i = collection.getItems(); try { while (i.hasNext() && processed < max2Process) { applyFiltersItem(c, i.next()); } } finally { if (i != null) i.close(); } } } public static void applyFiltersItem(Context c, Item item) throws Exception { //only apply filters if item not in skip-list if(!inSkipList(item.getHandle())) { //cache this item in MediaFilterManager //so it can be accessed by MediaFilters as necessary currentItem = item; if (filterItem(c, item)) { // commit changes after each filtered item c.commit(); // increment processed count ++processed; } // clear item objects from context cache and internal cache item.decache(); currentItem = null; } } /** * iterate through the item's bitstreams in the ORIGINAL bundle, applying * filters if possible * * @return true if any bitstreams processed, * false if none */ public static boolean filterItem(Context c, Item myItem) throws Exception { // get 'original' bundles Bundle[] myBundles = myItem.getBundles("ORIGINAL"); boolean done = false; for (int i = 0; i < myBundles.length; i++) { // now look at all of the bitstreams Bitstream[] myBitstreams = myBundles[i].getBitstreams(); for (int k = 0; k < myBitstreams.length; k++) { done |= filterBitstream(c, myItem, myBitstreams[k]); } } return done; } /** * Attempt to filter a bitstream * * An exception will be thrown if the media filter class cannot be * instantiated, exceptions from filtering will be logged to STDOUT and * swallowed. * * @return true if bitstream processed, * false if no applicable filter or already processed */ public static boolean filterBitstream(Context c, Item myItem, Bitstream myBitstream) throws Exception { boolean filtered = false; // iterate through filter classes. A single format may be actioned // by more than one filter for (int i = 0; i < filterClasses.length; i++) { //List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName()); String pluginName = null; //if this filter class is a SelfNamedPlugin, //its list of supported formats is different for //differently named "plugin" if( SelfNamedPlugin.class.isAssignableFrom(filterClasses[i].getClass()) ) { //get plugin instance name for this media filter pluginName = ((SelfNamedPlugin)filterClasses[i]).getPluginInstanceName(); } //Get list of supported formats for the filter (and possibly named plugin) //For SelfNamedPlugins, map key is: // <class-name><separator><plugin-name> //For other MediaFilters, map key is just: // <class-name> List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName() + (pluginName!=null ? FILTER_PLUGIN_SEPARATOR + pluginName : "")); if (fmts.contains(myBitstream.getFormat().getShortDescription())) { try { // only update item if bitstream not skipped if (processBitstream(c, myItem, myBitstream, filterClasses[i])) { myItem.update(); // Make sure new bitstream has a sequence // number filtered = true; } } catch (Exception e) { String handle = myItem.getHandle(); Bundle[] bundles = myBitstream.getBundles(); String name = myBitstream.getName(); long size = myBitstream.getSize(); String checksum = myBitstream.getChecksum() + " ("+myBitstream.getChecksumAlgorithm()+")"; int assetstore = myBitstream.getStoreNumber(); // Printout helpfull information to find the errored bistream. System.out.println("ERROR filtering, skipping bitstream:\n"); System.out.println("\tItem Handle: "+ handle); for (Bundle bundle : bundles) { System.out.println("\tBundle Name: " + bundle.getName()); } System.out.println("\tFile Size: " + size); System.out.println("\tChecksum: " + checksum); System.out.println("\tAsset Store: " + assetstore); System.out.println(e); e.printStackTrace(); } } } return filtered; } /** * processBitstream is a utility class that calls the virtual methods * from the current MediaFilter class. * It scans the bitstreams in an item, and decides if a bitstream has * already been filtered, and if not or if overWrite is set, invokes the * filter. * * @param c * context * @param item * item containing bitstream to process * @param source * source bitstream to process * @param formatFilter * FormatFilter to perform filtering * * @return true if new rendition is created, false if rendition already * exists and overWrite is not set */ public static boolean processBitstream(Context c, Item item, Bitstream source, FormatFilter formatFilter) throws Exception { //do pre-processing of this bitstream, and if it fails, skip this bitstream! if(!formatFilter.preProcessBitstream(c, item, source)) return false; boolean overWrite = MediaFilterManager.isForce; // get bitstream filename, calculate destination filename String newName = formatFilter.getFilteredName(source.getName()); Bitstream existingBitstream = null; // is there an existing rendition? Bundle targetBundle = null; // bundle we're modifying Bundle[] bundles = item.getBundles(formatFilter.getBundleName()); // check if destination bitstream exists if (bundles.length > 0) { // only finds the last match (FIXME?) for (int i = 0; i < bundles.length; i++) { Bitstream[] bitstreams = bundles[i].getBitstreams(); for (int j = 0; j < bitstreams.length; j++) { if (bitstreams[j].getName().equals(newName)) { targetBundle = bundles[i]; existingBitstream = bitstreams[j]; } } } } // if exists and overwrite = false, exit if (!overWrite && (existingBitstream != null)) { if (!isQuiet) System.out.println("SKIPPED: bitstream " + source.getID() + " (item: " + item.getHandle() + ") because '" + newName + "' already exists"); return false; } InputStream destStream = formatFilter.getDestinationStream(source.retrieve()); if (destStream == null) { if (!isQuiet) System.out.println("SKIPPED: bitstream " + source.getID() + " (item: " + item.getHandle() + ") because filtering was unsuccessful"); return false; } // create new bundle if needed if (bundles.length < 1) { targetBundle = item.createBundle(formatFilter.getBundleName()); } else { // take the first match targetBundle = bundles[0]; } Bitstream b = targetBundle.createBitstream(destStream); // Now set the format and name of the bitstream b.setName(newName); b.setSource("Written by FormatFilter " + formatFilter.getClass().getName() + " on " + DCDate.getCurrent() + " (GMT)."); b.setDescription(formatFilter.getDescription()); // Find the proper format BitstreamFormat bf = BitstreamFormat.findByShortDescription(c, formatFilter.getFormatString()); b.setFormat(bf); b.update(); //Inherit policies from the source bitstream //(first remove any existing policies) AuthorizeManager.removeAllPolicies(c, b); AuthorizeManager.inheritPolicies(c, source, b); // fixme - set date? // we are overwriting, so remove old bitstream if (existingBitstream != null) { targetBundle.removeBitstream(existingBitstream); } if (!isQuiet) System.out.println("FILTERED: bitstream " + source.getID() + " (item: " + item.getHandle() + ") and created '" + newName + "'"); //do post-processing of the generated bitstream formatFilter.postProcessBitstream(c, item, b); return true; } /** * Return the item that is currently being processed/filtered * by the MediaFilterManager * <p> * This allows FormatFilters to retrieve the Item object * in case they need access to item-level information for their format * transformations/conversions. * * @return current Item being processed by MediaFilterManager */ public static Item getCurrentItem() { return currentItem; } /** * Check whether or not to skip processing the given identifier * * @param identifier * identifier (handle) of a community, collection or item * * @return true if this community, collection or item should be skipped * during processing. Otherwise, return false. */ public static boolean inSkipList(String identifier) { if(skipList!=null && skipList.contains(identifier)) { if (!isQuiet) System.out.println("SKIP-LIST: skipped bitstreams within identifier " + identifier); return true; } else return false; } }