/** * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at * * http://www.dspace.org/license/ */ package org.dspace.app.mediafilter; import org.apache.commons.cli.*; import org.dspace.app.mediafilter.factory.MediaFilterServiceFactory; import org.dspace.app.mediafilter.service.MediaFilterService; import org.dspace.content.Collection; import org.dspace.content.Community; import org.dspace.content.DSpaceObject; import org.dspace.content.Item; import org.dspace.core.Constants; import org.dspace.core.Context; import org.dspace.core.SelfNamedPlugin; import org.dspace.core.factory.CoreServiceFactory; import org.dspace.handle.factory.HandleServiceFactory; import java.util.*; import org.apache.commons.lang.ArrayUtils; import org.dspace.services.factory.DSpaceServicesFactory; /** * MediaFilterManager is the class that invokes the media/format filters over the * repository's content. A few command line flags affect the operation of the * MFM: -v verbose outputs all extracted text to STDOUT; -f force forces all * bitstreams to be processed, even if they have been before; -n noindex does not * recreate index after processing bitstreams; -i [identifier] limits processing * scope to a community, collection or item; and -m [max] limits processing to a * maximum number of items. */ public class MediaFilterCLITool { //key (in dspace.cfg) which lists all enabled filters by name private static final String MEDIA_FILTER_PLUGINS_KEY = "filter.plugins"; //prefix (in dspace.cfg) for all filter properties private static final String FILTER_PREFIX = "filter"; //suffix (in dspace.cfg) for input formats supported by each filter private static final String INPUT_FORMATS_SUFFIX = "inputFormats"; public static void main(String[] argv) throws Exception { // set headless for non-gui workstations System.setProperty("java.awt.headless", "true"); // create an options object and populate it CommandLineParser parser = new PosixParser(); int status = 0; Options options = new Options(); options.addOption("v", "verbose", false, "print all extracted text and other details to STDOUT"); options.addOption("q", "quiet", false, "do not print anything except in the event of errors."); options.addOption("f", "force", false, "force all bitstreams to be processed"); options.addOption("i", "identifier", true, "ONLY process bitstreams belonging to identifier"); options.addOption("m", "maximum", true, "process no more than maximum items"); options.addOption("h", "help", false, "help"); //create a "plugin" option (to specify specific MediaFilter plugins to run) OptionBuilder.withLongOpt("plugins"); OptionBuilder.withValueSeparator(','); OptionBuilder.withDescription( "ONLY run the specified Media Filter plugin(s)\n" + "listed from '" + MEDIA_FILTER_PLUGINS_KEY + "' in dspace.cfg.\n" + "Separate multiple with a comma (,)\n" + "(e.g. MediaFilterManager -p \n\"Word Text Extractor\",\"PDF Text Extractor\")"); Option pluginOption = OptionBuilder.create('p'); pluginOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args options.addOption(pluginOption); //create a "skip" option (to specify communities/collections/items to skip) OptionBuilder.withLongOpt("skip"); OptionBuilder.withValueSeparator(','); OptionBuilder.withDescription( "SKIP the bitstreams belonging to identifier\n" + "Separate multiple identifiers with a comma (,)\n" + "(e.g. MediaFilterManager -s \n 123456789/34,123456789/323)"); Option skipOption = OptionBuilder.create('s'); skipOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args options.addOption(skipOption); boolean isVerbose = false; boolean isQuiet = false; boolean isForce = false; // default to not forced String identifier = null; // object scope limiter int max2Process = Integer.MAX_VALUE; Map<String, List<String>> filterFormats = new HashMap<>(); CommandLine line = null; try { line = parser.parse(options, argv); } catch(MissingArgumentException e) { System.out.println("ERROR: " + e.getMessage()); HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(1); } if (line.hasOption('h')) { HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(0); } if (line.hasOption('v')) { isVerbose = true; } isQuiet = line.hasOption('q'); if (line.hasOption('f')) { isForce = true; } if (line.hasOption('i')) { identifier = line.getOptionValue('i'); } if (line.hasOption('m')) { max2Process = Integer.parseInt(line.getOptionValue('m')); if (max2Process <= 1) { System.out.println("Invalid maximum value '" + line.getOptionValue('m') + "' - ignoring"); max2Process = Integer.MAX_VALUE; } } String filterNames[] = null; if(line.hasOption('p')) { //specified which media filter plugins we are using filterNames = line.getOptionValues('p'); if(filterNames==null || filterNames.length==0) { //display error, since no plugins specified System.err.println("\nERROR: -p (-plugin) option requires at least one plugin to be specified.\n" + "(e.g. MediaFilterManager -p \"Word Text Extractor\",\"PDF Text Extractor\")\n"); HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(1); } } else { //retrieve list of all enabled media filter plugins! filterNames = DSpaceServicesFactory.getInstance().getConfigurationService().getArrayProperty(MEDIA_FILTER_PLUGINS_KEY); } MediaFilterService mediaFilterService = MediaFilterServiceFactory.getInstance().getMediaFilterService(); mediaFilterService.setForce(isForce); mediaFilterService.setQuiet(isQuiet); mediaFilterService.setVerbose(isVerbose); mediaFilterService.setMax2Process(max2Process); //initialize an array of our enabled filters List<FormatFilter> filterList = new ArrayList<FormatFilter>(); //set up each filter for(int i=0; i< filterNames.length; i++) { //get filter of this name & add to list of filters FormatFilter filter = (FormatFilter) CoreServiceFactory.getInstance().getPluginService().getNamedPlugin(FormatFilter.class, filterNames[i]); if(filter==null) { System.err.println("\nERROR: Unknown MediaFilter specified (either from command-line or in dspace.cfg): '" + filterNames[i] + "'"); System.exit(1); } else { filterList.add(filter); String filterClassName = filter.getClass().getName(); String pluginName = null; //If this filter is a SelfNamedPlugin, //then the input formats it accepts may differ for //each "named" plugin that it defines. //So, we have to look for every key that fits the //following format: filter.<class-name>.<plugin-name>.inputFormats if( SelfNamedPlugin.class.isAssignableFrom(filter.getClass()) ) { //Get the plugin instance name for this class pluginName = ((SelfNamedPlugin) filter).getPluginInstanceName(); } //Retrieve our list of supported formats from dspace.cfg //For SelfNamedPlugins, format of key is: // filter.<class-name>.<plugin-name>.inputFormats //For other MediaFilters, format of key is: // filter.<class-name>.inputFormats String[] formats = DSpaceServicesFactory.getInstance().getConfigurationService().getArrayProperty( FILTER_PREFIX + "." + filterClassName + (pluginName!=null ? "." + pluginName : "") + "." + INPUT_FORMATS_SUFFIX); //add to internal map of filters to supported formats if (ArrayUtils.isNotEmpty(formats)) { //For SelfNamedPlugins, map key is: // <class-name><separator><plugin-name> //For other MediaFilters, map key is just: // <class-name> filterFormats.put(filterClassName + (pluginName!=null ? MediaFilterService.FILTER_PLUGIN_SEPARATOR + pluginName : ""), Arrays.asList(formats)); } }//end if filter!=null }//end for //If verbose, print out loaded mediafilter info if(isVerbose) { System.out.println("The following MediaFilters are enabled: "); Iterator<String> i = filterFormats.keySet().iterator(); while(i.hasNext()) { String filterName = i.next(); System.out.println("Full Filter Name: " + filterName); String pluginName = null; if(filterName.contains(MediaFilterService.FILTER_PLUGIN_SEPARATOR)) { String[] fields = filterName.split(MediaFilterService.FILTER_PLUGIN_SEPARATOR); filterName=fields[0]; pluginName=fields[1]; } System.out.println(filterName + (pluginName!=null? " (Plugin: " + pluginName + ")": "")); } } mediaFilterService.setFilterFormats(filterFormats); //store our filter list into an internal array mediaFilterService.setFilterClasses(filterList); //Retrieve list of identifiers to skip (if any) String skipIds[] = null; if(line.hasOption('s')) { //specified which identifiers to skip when processing skipIds = line.getOptionValues('s'); if(skipIds==null || skipIds.length==0) { //display error, since no identifiers specified to skip System.err.println("\nERROR: -s (-skip) option requires at least one identifier to SKIP.\n" + "Make sure to separate multiple identifiers with a comma!\n" + "(e.g. MediaFilterManager -s 123456789/34,123456789/323)\n"); HelpFormatter myhelp = new HelpFormatter(); myhelp.printHelp("MediaFilterManager\n", options); System.exit(0); } //save to a global skip list mediaFilterService.setSkipList(Arrays.asList(skipIds)); } Context c = null; try { c = new Context(); // have to be super-user to do the filtering c.turnOffAuthorisationSystem(); // now apply the filters if (identifier == null) { mediaFilterService.applyFiltersAllItems(c); } else // restrict application scope to identifier { DSpaceObject dso = HandleServiceFactory.getInstance().getHandleService().resolveToObject(c, identifier); if (dso == null) { throw new IllegalArgumentException("Cannot resolve " + identifier + " to a DSpace object"); } switch (dso.getType()) { case Constants.COMMUNITY: mediaFilterService.applyFiltersCommunity(c, (Community) dso); break; case Constants.COLLECTION: mediaFilterService.applyFiltersCollection(c, (Collection) dso); break; case Constants.ITEM: mediaFilterService.applyFiltersItem(c, (Item) dso); break; } } c.complete(); c = null; } catch (Exception e) { status = 1; } finally { if (c != null) { c.abort(); } } System.exit(status); } }