package uk.ac.rhul.cs.cl1.ui.cmdline;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Locale;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import uk.ac.rhul.cs.cl1.*;
import uk.ac.rhul.cs.cl1.io.ClusteringWriter;
import uk.ac.rhul.cs.cl1.io.CSVClusteringWriter;
import uk.ac.rhul.cs.cl1.io.ClusteringWriterFactory;
import uk.ac.rhul.cs.cl1.io.GraphReader;
import uk.ac.rhul.cs.cl1.io.GraphReaderFactory;
import uk.ac.rhul.cs.cl1.io.GraphReaderFactory.Format;
import uk.ac.rhul.cs.cl1.ui.ConsoleTaskMonitor;
import uk.ac.rhul.cs.graph.Graph;

/**
 * The command line interface to ClusterONE.
 *
 * <p>An instance parses the command line, loads a graph from a file or from
 * standard input, runs the clustering algorithm on it and writes the resulting
 * clustering to standard output. Diagnostic messages go to standard error.
 */
public class CommandLineApplication {
    /** Options object to describe the command line options accepted by ClusterONE. */
    protected Options options = null;

    /** Whether we are running in debug mode. */
    protected boolean debugMode = false;

    /** Whether we are running in profiling mode. */
    protected boolean profilingMode = false;

    /** Task monitor that shows the progress of the algorithm on the console. */
    protected TaskMonitor taskMonitor = new ConsoleTaskMonitor();

    /**
     * Constructor of the command line entry point to ClusterONE.
     *
     * <p>Calls {@link #initOptions()} to populate {@link #options}.
     */
    public CommandLineApplication() {
        initOptions();
    }

    /**
     * Parses the command line options and then executes the main program.
     *
     * @param args the command line arguments
     * @return the exit code of the program: 0 on success (also when only the
     *         usage or version information was printed), 1 on any error
     */
    public int run(String[] args) {
        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        ClusterONEAlgorithmParameters params = new ClusterONEAlgorithmParameters();
        String inputFormatSpec = null;
        String outputFormatSpec = null;
        GraphReaderFactory.Format inputFormat = null;
        ClusteringWriterFactory.Format outputFormat;
        ClusteringWriter outputWriter;

        params.setRejectSeedsWithOnlyUsedNodes(true);

        try {
            cmd = parser.parse(this.options, args);

            if (cmd.hasOption("version")) {
                showVersion();
                return 0;
            }

            if (cmd.hasOption("debug")) {
                debugMode = true;
            }
            if (cmd.hasOption("profile")) {
                profilingMode = true;
            }
            if (cmd.hasOption("fluff")) {
                params.setFluffClusters(true);
            }
            if (cmd.hasOption("keep-initial-seeds")) {
                params.setKeepInitialSeeds(true);
            }
            if (cmd.hasOption("haircut")) {
                params.setHaircutThreshold(Double.parseDouble(cmd.getOptionValue("haircut")));
            }
            if (cmd.hasOption("k-core")) {
                params.setKCoreThreshold(Integer.parseInt(cmd.getOptionValue("k-core")));
            }
            if (cmd.hasOption("input-format")) {
                inputFormatSpec = cmd.getOptionValue("input-format");
            }
            if (cmd.hasOption("max-overlap")) {
                params.setOverlapThreshold(Double.parseDouble(cmd.getOptionValue("max-overlap")));
            }
            if (cmd.hasOption("merge-method")) {
                params.setMergingMethodName(cmd.getOptionValue("merge-method"));
            }
            if (cmd.hasOption("min-density")) {
                // A missing value or "auto" is passed on as null
                String value = cmd.getOptionValue("min-density");
                if (value == null || value.equalsIgnoreCase("auto")) {
                    params.setMinDensity(null);
                } else {
                    params.setMinDensity(Double.parseDouble(value));
                }
            }
            if (cmd.hasOption("min-size")) {
                params.setMinSize(Integer.parseInt(cmd.getOptionValue("min-size")));
            }
            if (cmd.hasOption("no-fluff")) {
                params.setFluffClusters(false);
            }
            if (cmd.hasOption("no-keep-initial-seeds")) {
                params.setKeepInitialSeeds(false);
            }
            if (cmd.hasOption("no-merge")) {
                params.setMergingMethodName("none");
            }
            if (cmd.hasOption("output-format")) {
                outputFormatSpec = cmd.getOptionValue("output-format");
            }
            if (cmd.hasOption("penalty")) {
                params.setNodePenalty(Double.parseDouble(cmd.getOptionValue("penalty")));
            }
            if (cmd.hasOption("seed-method")) {
                // Handle legacy unused_nodes specification here because setSeedGenerator()
                // does not understand it any more.
                // TODO: show deprecation warning
                if ("unused_nodes".equals(cmd.getOptionValue("seed-method"))) {
                    params.setSeedGenerator("nodes");
                    params.setRejectSeedsWithOnlyUsedNodes(true);
                } else {
                    params.setSeedGenerator(cmd.getOptionValue("seed-method"));
                    params.setRejectSeedsWithOnlyUsedNodes(false);
                }
            }
            if (cmd.hasOption("similarity")) {
                params.setSimilarityFunction(cmd.getOptionValue("similarity"));
            }
            if (cmd.hasOption("num-threads")) {
                // "auto" is mapped to a thread count of zero
                String numThreadsValue = cmd.getOptionValue("num-threads");
                if ("auto".equals(numThreadsValue)) {
                    params.setNumThreads(0);
                } else {
                    params.setNumThreads(Integer.parseInt(numThreadsValue));
                }
            }
        } catch (ParseException ex) {
            System.err.println("Failed to parse command line options. Reason: " + ex.getMessage());
            return 1;
        } catch (NumberFormatException ex) {
            // A malformed number given to a numeric option is a user error, so report
            // it like any other option error instead of dying with a stack trace.
            System.err.println("Invalid numeric value in command line options. Reason: " + ex.getMessage());
            return 1;
        } catch (InstantiationException ex) {
            // NOTE(review): the message assumes that the seed-method option was the
            // culprit -- confirm that no other setter above throws this exception.
            System.err.println("Failed to interpret string: " + cmd.getOptionValue("seed-method"));
            ex.printStackTrace();
            return 1;
        }

        // Check if we have an input file name or if we have the -h option
        if (cmd.getArgList().size() == 0 || cmd.hasOption('h')) {
            showUsage();
            return 0;
        }

        // Check if we have more than one input file
        if (cmd.getArgList().size() > 1) {
            System.err.println("Only a single input file is supported");
            return 1;
        }

        // Process the options. The format names are upper-cased with a fixed locale
        // so that the enum lookup does not depend on the default locale (with a
        // Turkish default locale, "sif" would become "SİF" and fail to match).
        if (inputFormatSpec != null) {
            try {
                inputFormat = GraphReaderFactory.Format.valueOf(inputFormatSpec.toUpperCase(Locale.ENGLISH));
            } catch (IllegalArgumentException ex) {
                System.err.println("Unknown input file format: " + inputFormatSpec);
                return 1;
            }
        }

        if (outputFormatSpec != null) {
            try {
                outputFormat = ClusteringWriterFactory.Format.valueOf(outputFormatSpec.toUpperCase(Locale.ENGLISH));
            } catch (IllegalArgumentException ex) {
                System.err.println("Unknown output file format: " + outputFormatSpec);
                return 1;
            }
        } else {
            outputFormat = ClusteringWriterFactory.Format.PLAIN;
        }

        outputWriter = ClusteringWriterFactory.fromFormat(outputFormat);
        if (outputFormat == ClusteringWriterFactory.Format.CSV) {
            ((CSVClusteringWriter) outputWriter).setQualityFunction(params.getQualityFunction());
        }

        // Pause if profiling
        pauseDuringProfiling("Press Enter to start reading the input file...");

        // Read the input file
        Graph graph;
        long startTime = System.currentTimeMillis();
        try {
            graph = loadGraph(cmd.getArgs()[0], inputFormat);
        } catch (IOException ex) {
            System.err.println("IO error while reading input file: " + ex.getMessage());
            return 1;
        }
        System.err.println("Loaded graph with " + graph.getNodeCount() + " nodes and "
                + graph.getEdgeCount() + " edges");
        if (profilingMode) {
            System.err.println("Loading took " + (System.currentTimeMillis() - startTime) + " ms");
        }

        // Pause if profiling
        pauseDuringProfiling("Press Enter to start the algorithm...");

        // Start the algorithm
        ClusterONE algorithm = new ClusterONE(params);
        algorithm.setDebugMode(debugMode);
        algorithm.setTaskMonitor(taskMonitor);
        try {
            algorithm.runOnGraph(graph);
        } catch (ClusterONEException ex) {
            System.err.println("Error while executing the clustering algorithm: ");
            System.err.println(ex.getMessage());
            return 1;
        }

        System.err.println("Detected " + algorithm.getResults().size() + " complexes");
        try {
            outputWriter.writeClustering(algorithm.getResults(), System.out);
        } catch (IOException ex) {
            System.err.println("IO error while printing the results: ");
            System.err.println(ex.getMessage());
            return 1;
        }

        return 0;
    }

    /**
     * Initializes the Options object that describes the command line options
     * accepted by ClusterONE.
     */
    @SuppressWarnings("static-access")
    protected void initOptions() {
        options = new Options();

        /* help option */
        options.addOption("h", "help", false, "shows this help message");

        /* version option */
        options.addOption("v", "version", false, "shows the version number");

        /* input format override option */
        options.addOption(OptionBuilder.withLongOpt("input-format")
                .withDescription("specifies the format of the input file (sif or edge_list)")
                .withType(String.class).hasArg().create("f"));

        /* output format override option */
        options.addOption(OptionBuilder.withLongOpt("output-format")
                .withDescription("specifies the format of the output file (plain, genepro or csv)")
                .withType(String.class).hasArg().create("F"));

        /* minimum size option */
        options.addOption(OptionBuilder.withLongOpt("min-size")
                .withDescription("specifies the minimum size of clusters")
                .withType(Integer.class).hasArg().create("s"));

        /* minimum density option */
        options.addOption(OptionBuilder.withLongOpt("min-density")
                .withDescription("specifies the minimum density of clusters (default: auto)")
                .withType(Float.class).hasArg().create("d"));

        /* maximum overlap option (advanced) */
        options.addOption(OptionBuilder.withLongOpt("max-overlap")
                .withDescription("specifies the maximum allowed overlap between two clusters")
                .withType(Float.class).hasArg().create());

        /* haircut threshold option (advanced) */
        options.addOption(OptionBuilder.withLongOpt("haircut")
                .withDescription("specifies the haircut threshold for clusters")
                .withType(Float.class).hasArg().create());

        /* penalty scores of nodes (advanced) */
        options.addOption(OptionBuilder.withLongOpt("penalty")
                .withDescription("set the node penalty value")
                .withType(Float.class).hasArg().create());

        /* k-core threshold (advanced) */
        options.addOption(OptionBuilder.withLongOpt("k-core")
                .withDescription("specifies the minimum k-core index of clusters")
                .withType(Integer.class).hasArg().create());

        /* fluffing option (advanced) */
        OptionGroup fluffGroup = new OptionGroup();
        fluffGroup.addOption(OptionBuilder.withLongOpt("fluff")
                .withDescription("fluffs the clusters")
                .withType(Boolean.class).create());
        fluffGroup.addOption(OptionBuilder.withLongOpt("no-fluff")
                .withDescription("don't fluff the clusters (default)")
                .withType(Boolean.class).create());
        options.addOptionGroup(fluffGroup);

        /* keep initial seeds option (advanced) */
        OptionGroup keepInitialSeedsGroup = new OptionGroup();
        keepInitialSeedsGroup.addOption(OptionBuilder.withLongOpt("keep-initial-seeds")
                .withDescription("always keep the initial seed nodes in the cluster")
                .withType(Boolean.class).create());
        keepInitialSeedsGroup.addOption(OptionBuilder.withLongOpt("no-keep-initial-seeds")
                .withDescription("allow the initial seed nodes to leave the cluster if needed (default)")
                .withType(Boolean.class).create());
        options.addOptionGroup(keepInitialSeedsGroup);

        /* merging method option (advanced) */
        options.addOption(OptionBuilder.withLongOpt("merge-method")
                .withDescription("specifies the cluster merging method to use (single or multi)")
                .withType(String.class).hasArg().create());

        /* seeding method option (advanced) */
        options.addOption(OptionBuilder.withLongOpt("seed-method")
                .withDescription("specifies the seed generation method to use")
                .withType(String.class).hasArg().create());

        /* similarity function option (advanced) */
        options.addOption(OptionBuilder.withLongOpt("similarity")
                .withDescription("specifies the similarity function to use (match, simpson, jaccard or dice)")
                .withType(String.class).hasArg().create());

        /* any other parameter (advanced) */
        /*
        options.addOption(OptionBuilder.withLongOpt("param")
                .withDescription("specifies the value of an advanced named parameter of the algorithm")
                .withArgName("name=value").hasArgs(2).withValueSeparator().create("p"));
        */

        /* debug mode option (advanced) */
        options.addOption(OptionBuilder.withLongOpt("debug")
                .withDescription("turns on the debug mode").withType(Boolean.class).create());

        /* profiling mode option (advanced) */
        options.addOption(OptionBuilder.withLongOpt("profile")
                .withDescription("turns on the profiling mode").withType(Boolean.class).create());

        /* number of threads (advanced) */
        options.addOption(OptionBuilder.withLongOpt("num-threads")
                .withDescription("specifies the number of threads to use during the growth process (default=auto)")
                .withType(String.class).hasArg().create());

        /* skip the merging phase (useful for debugging only) */
        options.addOption(OptionBuilder.withLongOpt("no-merge")
                .withDescription("don't merge highly overlapping clusters")
                .create());
    }

    /**
     * Shows the usage instructions: the version line, an empty line and the
     * option summary generated from {@link #options}, all on standard output.
     */
    public void showUsage() {
        HelpFormatter formatter = new HelpFormatter();
        showVersion();
        System.out.println("");
        formatter.printHelp("cl1", options, true);
    }

    /**
     * Shows the version information.
     */
    public void showVersion() {
        System.out.println(ClusterONE.applicationName + " " + ClusterONE.version);
    }

    /**
     * Loads a graph from an input file.
     *
     * <p>A filename of {@code "-"} means standard input; in that case the edge
     * list format is used when no format is given. A file opened by this method
     * is closed before the method returns; standard input is left open.
     *
     * @param filename  name of the file to be loaded, or {@code "-"} for standard input
     * @param format    the format of the file, null means autodetection based on
     *                  the file name
     * @return the graph that was read
     * @throws IOException if the file cannot be opened or reading fails
     */
    public Graph loadGraph(String filename, Format format) throws IOException {
        boolean readFromStdin = "-".equals(filename);
        GraphReader reader;

        if (format != null) {
            reader = GraphReaderFactory.fromFormat(format);
        } else if (readFromStdin) {
            reader = GraphReaderFactory.fromFormat(Format.EDGE_LIST);
        } else {
            reader = GraphReaderFactory.fromFilename(filename);
        }

        if (reader instanceof TaskMonitorSupport) {
            ((TaskMonitorSupport) reader).setTaskMonitor(taskMonitor);
        }

        InputStream stream = readFromStdin ? System.in : new FileInputStream(filename);
        try {
            return reader.readGraph(new InputStreamReader(stream, "utf-8"));
        } finally {
            // Release the file handle, but never close standard input because
            // pauseDuringProfiling() may still want to read from it.
            if (!readFromStdin) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // The stream was only read from, so a failed close loses no data
                    // and must not mask an exception thrown by readGraph().
                }
            }
        }
    }

    /**
     * Prints a message and blocks until input is available on standard input
     * (for example, after the Enter key is pressed).
     *
     * <p>Does nothing unless profiling mode is on. Before printing the message,
     * a garbage collection is requested and the heap statistics are printed.
     *
     * @param message  the message to print
     */
    public void pauseDuringProfiling(String message) {
        if (!profilingMode) {
            return;
        }

        System.gc();
        printHeapStatistics();

        System.err.println(message);
        try {
            System.in.read();
        } catch (IOException ex) {
            System.err.println("IOException while waiting for the Enter key.");
        }
    }

    /**
     * Prints some basic statistics of the heap usage to standard error: the
     * number of bytes used, the maximum heap size and the ratio of the two as a
     * percentage rounded to two decimal places.
     */
    public void printHeapStatistics() {
        Runtime runtime = Runtime.getRuntime();
        long usedHeapSpace = runtime.totalMemory() - runtime.freeMemory();
        long maxHeapSpace = runtime.maxMemory();
        // Floating-point arithmetic is required here: with integer division the
        // two decimal places would always be truncated away.
        double percentFull = Math.round(usedHeapSpace * 10000.0 / maxHeapSpace) / 100.0;
        System.err.println("Heap usage: " + usedHeapSpace + " bytes used out of "
                + maxHeapSpace + " bytes; " + percentFull + "% full");
    }

    /**
     * Starts the command line version of ClusterONE and exits the JVM with the
     * code returned by {@link #run(String[])}.
     *
     * @param args  the command line arguments
     */
    public static void main(String[] args) {
        CommandLineApplication app = new CommandLineApplication();
        System.exit(app.run(args));
    }
}