package ivory.lsh.eval; import ivory.core.RetrievalEnvironment; import ivory.lsh.driver.PwsimEnvironment; import ivory.lsh.eval.SampleSignatures.mapoutput; import java.io.IOException; import java.net.URI; import java.util.Iterator; import java.util.Map.Entry; import java.util.SortedMap; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.util.LineReader; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Level; import org.apache.log4j.Logger; import tl.lin.data.map.HMapII; import tl.lin.data.map.HMapIIW; import tl.lin.data.map.HMapStFW; import edu.umd.cloud9.io.SequenceFileUtils; /** * <p> * A program that samples from a collection of key,value pairs according to a given frequency. * </p> * * <ul> * <li>[input] path to the collection * <li>[output-dir] path of the output file containing sample * <li>[num-mappers] number of mappers to run * <li>[sample-frequency] if entered N, then every Nth <key,value> pair is sampled. N=1 is * equivalent to sampling everything. * </ul> * * <p> * User needs to modify the source file to change the key and value class type. Change input and * output class type of the mapper, and modify the 3 static fields accordingly. * </p> * <p> * Here's a sample invocation: * </p> * * <blockquote> * * <pre> * hadoop jar ivory.jar ivory.util.SampleDocVectors * /umd-lin/fture/pwsim/medline/wt-term-doc-vectors * /umd-lin/fture/pwsim/medline/wt-term-doc-vectors-sample * 100 * </pre> * * <p> * If there is a text file containing docnos to be sampled (one docno per line), this should be * specified as the fifth and last argument. In this case, the sample frequency argument can be * anything since it will be ignored. * </p> * * </blockquote> * * * usage: [input] [output-dir] [number-of-mappers] [sample-freq] ([sample-docnos-path]) * * @author ferhanture * */ @SuppressWarnings("deprecation") public class SampleTermDocVectors extends Configured implements Tool { @SuppressWarnings("unchecked") static Class keyClass = IntWritable.class, valueClass = HMapStFW.class, inputFormat = SequenceFileInputFormat.class; private static final Logger sLogger = Logger.getLogger(SampleTermDocVectors.class); private void printUsage() { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp( this.getClass().getCanonicalName(), options ); } private static class MyMapper extends MapReduceBase implements Mapper<IntWritable, HMapStFW, IntWritable, HMapStFW> { private int sampleFreq; private HMapII samplesMap = null; private String getFilename(String s) { return s.substring(s.lastIndexOf("/") + 1); } private HMapIIW readSamplesFromCache(String samplesFile, JobConf conf) throws IOException { Path[] localFiles = DistributedCache.getLocalCacheFiles(conf); HMapIIW samplesMap = null; for (Path localFile : localFiles) { if (localFile.toString().contains(samplesFile)) { samplesMap = new HMapIIW(); LineReader reader = new LineReader(FileSystem.getLocal(conf).open(localFile)); Text t = new Text(); while (reader.readLine(t) != 0) { int docno = Integer.parseInt(t.toString()); sLogger.info(docno + " --> sample"); samplesMap.put(docno, 1); } reader.close(); sLogger.info(samplesMap.size() + " sampled"); } } if (samplesMap == null) throw new RuntimeException("Not found in local cache: " + samplesFile); return samplesMap; } public void configure(JobConf conf) { sLogger.setLevel(Level.INFO); sampleFreq = conf.getInt("SampleFrequency", -1); // read doc ids of sample into vectors String samplesFile = conf.get("Ivory.SampleFile"); if (samplesFile != null) { try { samplesMap = readSamplesFromCache(getFilename(samplesFile), conf); } catch (NumberFormatException e) { e.printStackTrace(); throw new RuntimeException("Incorrect format in " + samplesFile); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException("I/O error in " + samplesFile); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException("Error reading sample file!"); } } } public void map(IntWritable key, HMapStFW val, OutputCollector<IntWritable, HMapStFW> output, Reporter reporter) throws IOException { if (samplesMap != null) { if (samplesMap.containsKey(key.get())) { reporter.incrCounter(mapoutput.count, 1); output.collect(key, val); } } else { int randInt = (int) (Math.random() * sampleFreq); // integer in [0,sampleFrq) if (randInt == 0) { output.collect(key, val); } } } } public static class MyReducer extends MapReduceBase implements Reducer<IntWritable, HMapStFW, IntWritable, HMapStFW> { @Override public void reduce(IntWritable key, Iterator<HMapStFW> values, OutputCollector<IntWritable, HMapStFW> output, Reporter reporter) throws IOException { output.collect(key, values.next()); } } @SuppressWarnings("unchecked") public int run(String[] args) throws Exception { sLogger.setLevel(Level.INFO); if ( parseArgs(args) < 0 ) { printUsage(); System.exit(-1); } JobConf job = new JobConf(getConf(), SampleTermDocVectors.class); FileSystem fs = FileSystem.get(job); inputPath = (inputPath == null) ? PwsimEnvironment.getTermDocvectorsFile(workDir, fs) : inputPath; outputPath = (outputPath == null) ? PwsimEnvironment.getTermDocvectorsFile(workDir, fs, sampleSize) : outputPath; if (!fs.exists(new Path(inputPath))) { throw new RuntimeException("Error, input path does not exist!"); } job.setJobName(getClass().getName()); // if sample docnos path provided and frequency not provided if (sampleDocnosFile != null && fs.exists(new Path(sampleDocnosFile))) { job.set("Ivory.SampleFile", sampleDocnosFile); DistributedCache.addCacheFile(new URI(sampleDocnosFile), job); } else if (sampleSize != -1) { RetrievalEnvironment env = new RetrievalEnvironment(workDir, fs); int collectionSize = env.readCollectionDocumentCount(); sampleFreq = collectionSize / (float) sampleSize; job.setInt("SampleFrequency", (int) sampleFreq); } else { throw new RuntimeException("Either provide sample frequency with " + "option -" + SAMPLESIZE_OPTION+ " or existing sample docnos with option -" + SAMPLEDOCNOS_OPTION); } int numMappers = 100; int numReducers = 1; if (!fs.exists(new Path(inputPath))) { throw new RuntimeException("Error, input path does not exist!"); } sLogger.setLevel(Level.INFO); fs.delete(new Path(outputPath), true); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, false); job.setJarByClass(SampleTermDocVectors.class); job.set("mapred.child.java.opts", "-Xmx2048m"); job.setInt("mapred.map.max.attempts", 100); job.setInt("mapred.reduce.max.attempts", 100); job.setInt("mapred.task.timeout", 600000000); sLogger.info("Running job " + job.getJobName()); sLogger.info("Input directory: " + inputPath); sLogger.info("Output directory: " + outputPath); sLogger.info("Sample frequency: " + sampleFreq); sLogger.info("Sample docnos: " + job.get("Ivory.SampleFile")); job.setNumMapTasks(numMappers); job.setNumReduceTasks(numReducers); job.setInputFormat(inputFormat); job.setMapOutputKeyClass(keyClass); job.setMapOutputValueClass(valueClass); job.setOutputKeyClass(keyClass); job.setOutputValueClass(valueClass); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setOutputFormat(SequenceFileOutputFormat.class); JobClient.runJob(job); if (sampleDocnosFile != null && !fs.exists(new Path(sampleDocnosFile))) { sLogger.info("Extracting sample docnos from sampled vectors..."); SortedMap<WritableComparable, Writable> docno2DocVectors; try{ docno2DocVectors = SequenceFileUtils.readFileIntoMap(new Path(outputPath+"/part-00000")); FSDataOutputStream out = fs.create(new Path(sampleDocnosFile)); for(Entry<WritableComparable, Writable> entry : docno2DocVectors.entrySet()){ int docno = ((IntWritable) entry.getKey()).get(); out.writeBytes(docno+"\n"); } out.close(); } catch (Exception e) { throw new RuntimeException(e.toString()); } } return 0; } private Options options; private String sampleDocnosFile, inputPath, outputPath, workDir; private int sampleSize; private float sampleFreq; private static final String WORKDIR_PATH_OPTION = "index"; private static final String INPUT_PATH_OPTION = "input"; private static final String OUTPUT_PATH_OPTION = "output"; private static final String SAMPLEDOCNOS_OPTION = "docnos"; private static final String SAMPLESIZE_OPTION = "size"; private static final String LIBJARS_OPTION = "libjars"; @SuppressWarnings("static-access") private int parseArgs(String[] args) { options = new Options(); options.addOption(OptionBuilder.withDescription("path to directory with weighted term doc vectors").withArgName("path").hasArg().isRequired().create(WORKDIR_PATH_OPTION)); options.addOption(OptionBuilder.withDescription("path to weighted term doc vectors").withArgName("path").hasArg().create(INPUT_PATH_OPTION)); options.addOption(OptionBuilder.withDescription("path to sampled weighted term doc vectors").withArgName("path").hasArg().create(OUTPUT_PATH_OPTION)); options.addOption(OptionBuilder.withDescription("only keep pairs that match these docnos").withArgName("path to sample docnos file").hasArg().create(SAMPLEDOCNOS_OPTION)); options.addOption(OptionBuilder.withDescription("sample a document with probability = number-of-docs/N").withArgName("N").hasArg().create(SAMPLESIZE_OPTION)); options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars").withArgName("jar packages").hasArg().create(LIBJARS_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } workDir = cmdline.getOptionValue(WORKDIR_PATH_OPTION); inputPath = cmdline.hasOption(INPUT_PATH_OPTION) ? cmdline.getOptionValue(INPUT_PATH_OPTION) : null; outputPath = cmdline.hasOption(OUTPUT_PATH_OPTION) ? cmdline.getOptionValue(OUTPUT_PATH_OPTION) : null; sampleSize = cmdline.hasOption(SAMPLESIZE_OPTION) ? Integer.parseInt(cmdline.getOptionValue(SAMPLESIZE_OPTION)) : -1; sampleDocnosFile = cmdline.hasOption(SAMPLEDOCNOS_OPTION) ? cmdline.getOptionValue(SAMPLEDOCNOS_OPTION) : null; return 0; } public static void main(String[] args) throws Exception { ToolRunner.run(new SampleIntDocVectors(), args); return; } }