package ivory.lsh.driver;

import ivory.core.RetrievalEnvironment;
import ivory.lsh.projection.ComputeSignaturesMinhash;
import ivory.lsh.projection.ComputeSignaturesRandom;
import ivory.lsh.projection.ComputeSignaturesSimhash;
import ivory.lsh.projection.WriteRandomVectors;

import java.util.Locale;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * A Hadoop task to compute signatures from document vectors.
 *
 * <p>Command-line options: {@code -index <path>}, {@code -num_bits <integer>},
 * {@code -type <random|simhash|minhash>} (all required), plus the optional
 * {@code -batch_size} and {@code -libjars}.
 *
 * @author ferhanture
 */
public class RunComputeSignatures extends PwsimEnvironment implements Tool {
  public static final String[] RequiredParameters = {};

  private static final Logger sLogger = Logger.getLogger(RunComputeSignatures.class);

  // Names of the supported command-line options.
  private static final String INDEX_OPTION = "index";
  private static final String SIZE_OPTION = "num_bits";
  private static final String TYPE_OPTION = "type";
  private static final String BATCH_OPTION = "batch_size";
  private static final String LIBJARS_OPTION = "libjars";

  // Accepted values of the "type" option, compared after lower-casing.
  private static final String TYPE_RANDOM = "random";
  private static final String TYPE_SIMHASH = "simhash";
  private static final String TYPE_MINHASH = "minhash";

  // The only signature length accepted for Simhash signatures.
  private static final int SIMHASH_NUM_BITS = 64;

  // Populated by parseArgs(); read by printUsage().
  private static Options options;

  /**
   * Prints the command-line help for this tool.
   *
   * <p>This method only prints. The caller decides the return/exit code, so that invoking
   * {@link #run(String[])} programmatically never terminates the JVM.
   */
  private static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("RunComputeSignatures", options);
  }

  /**
   * Parses the arguments, fills in the job configuration and runs the signature computation
   * that matches the requested signature type.
   *
   * @param args command-line arguments (see the class description)
   * @return 0 once the selected task(s) have been run; -1 if the arguments cannot be parsed,
   *         the signature type is not one of random|simhash|minhash, or Simhash is requested
   *         with a size other than 64 bits
   * @throws Exception if a numeric option is not a valid integer, or if reading the index or
   *         running a task fails
   */
  @Override
  public int run(String[] args) throws Exception {
    CommandLine cmdline = parseArgs(args);
    if (cmdline == null) {
      printUsage();
      return -1;
    }

    numOfBits = Integer.parseInt(cmdline.getOptionValue(SIZE_OPTION));
    // Locale.ROOT keeps the normalization independent of the JVM default locale
    // (e.g. under a Turkish locale "SIMHASH" would not lower-case to "simhash").
    signatureType = cmdline.getOptionValue(TYPE_OPTION).toLowerCase(Locale.ROOT);

    boolean isRandom = TYPE_RANDOM.equals(signatureType);
    boolean isSimhash = TYPE_SIMHASH.equals(signatureType);
    boolean isMinhash = TYPE_MINHASH.equals(signatureType);

    // Reject unknown (or empty) types instead of silently falling through to Minhash.
    if (!isRandom && !isSimhash && !isMinhash) {
      System.err.println("Unknown signature type: " + signatureType);
      printUsage();
      return -1;
    }

    // Validate before touching the file system; report failure with a non-zero code.
    if (isSimhash && numOfBits != SIMHASH_NUM_BITS) {
      sLogger.error("Simhash signatures need to be 64 bits! Quitting...");
      return -1;
    }

    String dir = cmdline.getOptionValue(INDEX_OPTION);

    Configuration config = getConf();
    FileSystem fs = FileSystem.get(config);

    config.set("Ivory.IndexPath", dir);
    config.setInt("Ivory.NumOfBits", numOfBits);

    RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
    String collName = env.readCollectionName();
    config.set("Ivory.CollectionName", collName);

    PwsimEnvironment.setClassTypes(signatureType, config);

    if (cmdline.hasOption(BATCH_OPTION)) {
      int batchSize = Integer.parseInt(cmdline.getOptionValue(BATCH_OPTION));
      // A non-positive batch size is ignored, i.e. "NumBatch" is left unset.
      if (batchSize > 0) {
        int numDocs = env.readCollectionDocumentCount();
        // Ceiling division: a partially filled last batch still needs its own file.
        numBatchFiles = numDocs / batchSize;
        if (numDocs % batchSize > 0) {
          numBatchFiles++;
        }
        System.out.println("numBatchFiles: " + numBatchFiles);
        config.setInt("NumBatch", numBatchFiles);
      }
    }

    // NOTE(review): whatever the tasks' run() methods return is not inspected here, so this
    // method returns 0 whenever they complete without throwing -- confirm that is intended.
    if (isRandom) {
      // Random signatures: write the random vectors first, then compute the signatures.
      WriteRandomVectors writeRandomTask = new WriteRandomVectors(config);
      writeRandomTask.run();

      ComputeSignaturesRandom computeSignaturesTask = new ComputeSignaturesRandom(config);
      computeSignaturesTask.run();
    } else if (isSimhash) {
      ComputeSignaturesSimhash computeSignaturesTask = new ComputeSignaturesSimhash(config);
      computeSignaturesTask.run();
    } else {
      // minhash
      ComputeSignaturesMinhash computeSignaturesTask = new ComputeSignaturesMinhash(config);
      computeSignaturesTask.run();
    }

    return 0;
  }

  /**
   * Command-line entry point: runs this tool through {@link ToolRunner} and exits the JVM with
   * the code returned by {@link #run(String[])}.
   *
   * @param args command-line arguments
   * @throws Exception if the tool throws
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new RunComputeSignatures(), args);
    System.exit(res);
  }

  /**
   * Builds the option set (stored in {@link #options} so that usage can be printed later) and
   * parses the given arguments against it.
   *
   * @param args command-line arguments
   * @return the parsed command line, or {@code null} if parsing failed (the parse error is
   *         printed to standard error)
   */
  @SuppressWarnings("static-access")
  private static CommandLine parseArgs(String[] args) {
    options = new Options();
    options.addOption(OptionBuilder.withDescription("path to collection index")
        .withArgName("path").hasArg().isRequired().create(INDEX_OPTION));
    options.addOption(OptionBuilder.withDescription("number of bits per signature")
        .withArgName("integer").hasArg().isRequired().create(SIZE_OPTION));
    options.addOption(OptionBuilder.withDescription("type of signature")
        .withArgName("random|simhash|minhash").hasArg().isRequired().create(TYPE_OPTION));
    options.addOption(OptionBuilder.withDescription("batch size")
        .withArgName("number of signatures in one output file").hasArg().create(BATCH_OPTION));
    options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars")
        .withArgName("jar packages").hasArg().create(LIBJARS_OPTION));

    CommandLineParser parser = new GnuParser();
    try {
      return parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return null;
    }
  }
}