package ivory.lsh.eval;

import ivory.core.util.CLIRUtils;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import tl.lin.data.array.ArrayListOfIntsWritable;
import tl.lin.data.map.HMapIIW;
import tl.lin.data.map.HMapIV;
import tl.lin.data.pair.PairOfIntString;
import tl.lin.data.pair.PairOfInts;
import edu.umd.cloud9.collection.wikipedia.WikipediaPage;

/**
 * Converts pairs of similar docnos (the output of the pwsim algorithm) into pairs of
 * Wikipedia article titles, by joining each docno pair against the source- and
 * target-language collections.
 *
 * @author ferhanture
 */
public class Docnos2Titles extends Configured implements Tool {
  private static final Logger sLogger = Logger.getLogger(Docnos2Titles.class);
  private static Options options;

  private static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("Docnos2Titles", options);
    System.exit(-1);
  }

  static enum Pairs {
    COUNT, COUNT2, COUNT3, COUNTE, COUNTF, COUNT4, COUNT3x;
  }

  /**
   * Candidate generation.
   *
   * Map: (edocno, eWikiPage) --> (<fdocno, edocno>, <E, eTitle>)
   * Map: (fdocno, fWikiPage) --> (<fdocno, edocno>, <F, fTitle>)
   *
   * Input is the union of the source and target collections.
   *
   * @author ferhanture
   */
  private static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, WikipediaPage, PairOfInts, PairOfIntString> {
    private HMapIV<ArrayListOfIntsWritable> pwsimMapping; // mapping for pwsim pairs
    private JobConf mJob;
    private ArrayListOfIntsWritable similarDocnos;
    private String srcLang;
    private PairOfIntString valOut;
    private PairOfInts keyOut;
    private HMapIIW samplesMap = null;

    public void configure(JobConf job) {
      sLogger.setLevel(Level.INFO);
      srcLang = job.get("fLang");
      mJob = job;
      pwsimMapping = new HMapIV<ArrayListOfIntsWritable>();
      valOut = new PairOfIntString();
      keyOut = new PairOfInts();

      // read docnos of the sample, if one was provided
      String samplesFile = job.get("Ivory.SampleFile");
      if (samplesFile != null) {
        try {
          samplesMap = readSamplesFromCache(getFilename(samplesFile), job);
        } catch (NumberFormatException e) {
          e.printStackTrace();
          throw new RuntimeException("Incorrect format in " + samplesFile);
        } catch (IOException e) {
          e.printStackTrace();
          throw new RuntimeException("I/O error in " + samplesFile);
        } catch (Exception e) {
          e.printStackTrace();
          throw new RuntimeException("Error reading sample file: " + samplesFile);
        }
      }
    }

    private static String getFilename(String s) {
      return s.substring(s.lastIndexOf("/") + 1);
    }
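    // Sketch of the data loaded by loadPairs() below (the 6127/1000000074 pair appears
    // in the debug check; the other docnos are hypothetical). The pwsim output is a
    // SequenceFile of (PairOfInts, IntWritable) records whose key packs
    // (eDocno, fDocno + 1000000000); loadPairs() inverts it into an adjacency list
    // keyed on whichever side this mapper is processing:
    //
    //   records: (6127, 1000000074), (6127, 1000000071), (42, 1000000074)
    //   langID == CLIRUtils.E  =>  {6127 -> [1000000074, 1000000071], 42 -> [1000000074]}
    //   langID == CLIRUtils.F  =>  {1000000074 -> [6127, 42], 1000000071 -> [6127]}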
    private static void loadPairs(HMapIV<ArrayListOfIntsWritable> pwsimMapping,
        int langID, JobConf job, Reporter reporter) {
      try {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
        String pwsimFile = job.get("PwsimPairs");
        for (Path localFile : localFiles) {
          if (localFile.toString().contains(getFilename(pwsimFile))) {
            SequenceFile.Reader reader =
                new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job);

            PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance();
            IntWritable value = (IntWritable) reader.getValueClass().newInstance();
            int cnt = 0;
            while (reader.next(key, value)) {
              int fDocno = key.getRightElement();
              int eDocno = key.getLeftElement();

              // leftover debug logging for two specific pairs
              if ((eDocno == 6127 && fDocno == 1000000074)
                  || (eDocno == 6127 && fDocno == 1000000071)) {
                sLogger.info(key);
              }

              // we add 1000000000 to foreign docnos to distinguish them during the pwsim algorithm
              if (langID == CLIRUtils.E) {
                if (!pwsimMapping.containsKey(eDocno)) {
                  pwsimMapping.put(eDocno, new ArrayListOfIntsWritable());
                }
                pwsimMapping.get(eDocno).add(fDocno);
              } else {
                if (!pwsimMapping.containsKey(fDocno)) {
                  pwsimMapping.put(fDocno, new ArrayListOfIntsWritable());
                }
                pwsimMapping.get(fDocno).add(eDocno);
              }
              cnt++;
              key = (PairOfInts) reader.getKeyClass().newInstance();
              value = (IntWritable) reader.getValueClass().newInstance();
            }
            reader.close();
            sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile);
          }
        }
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }

    private HMapIIW readSamplesFromCache(String samplesFile, JobConf conf) throws IOException {
      Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
      HMapIIW samplesMap = null;
      for (Path localFile : localFiles) {
        if (localFile.toString().contains(samplesFile)) {
          samplesMap = new HMapIIW();
          LineReader reader = new LineReader(FileSystem.getLocal(conf).open(localFile));
          Text t = new Text();
          while (reader.readLine(t) != 0) {
            int docno = Integer.parseInt(t.toString());
            sLogger.info(docno + " --> sample");
            samplesMap.put(docno, 1);
          }
          reader.close();
          sLogger.info(samplesMap.size() + " sampled");
        }
      }
      if (samplesMap == null) {
        throw new RuntimeException("Not found in local cache: " + samplesFile);
      }
      return samplesMap;
    }
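    // The sample docnos file parsed above is plain text with one docno per line. Both
    // membership checks in map() below probe foreign docnos, so the file is expected
    // to hold foreign docnos carrying the +1000000000 offset (hypothetical values):
    //
    //   1000000074
    //   1000000071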
    public void map(IntWritable docnoKey, WikipediaPage p,
        OutputCollector<PairOfInts, PairOfIntString> output, Reporter reporter)
        throws IOException {
      int docno = docnoKey.get();
      String title = p.getTitle();
      String lang = p.getLanguage();
      int langID = lang.equals(srcLang) ? CLIRUtils.F : CLIRUtils.E;

      if (langID == CLIRUtils.F) {
        docno += 1000000000;
        if (samplesMap != null && !samplesMap.containsKey(docno)) {
          return;
        }
      }

      // We load the mapping only once, during the first map() call of a mapper.
      // This works because an input split never spans files, so all key-value pairs
      // seen by one mapper come from the same collection and share a language ID.
      if (pwsimMapping.isEmpty()) {
        loadPairs(pwsimMapping, langID, mJob, reporter);
        sLogger.info("Mapping loaded: " + pwsimMapping.size());
      }

      // if there are no similar docs for this docno, we are done
      if (pwsimMapping.containsKey(docno)) {
        similarDocnos = pwsimMapping.get(docno);
      } else {
        return;
      }

      for (int similarDocno : similarDocnos) {
        if (langID == CLIRUtils.E) {
          if (samplesMap != null && !samplesMap.containsKey(similarDocno)) {
            continue;
          }
          keyOut.set(similarDocno, docno);
        } else {
          keyOut.set(docno, similarDocno);
        }
        valOut.set(langID, title);
        output.collect(keyOut, valOut);
      }
    }
  }

  private static class MyReducer extends MapReduceBase implements
      Reducer<PairOfInts, PairOfIntString, Text, Text> {
    private Text fTitle, eTitle;

    public void configure(JobConf job) {
      fTitle = new Text();
      eTitle = new Text();
    }

    @Override
    public void reduce(PairOfInts docnoPair, Iterator<PairOfIntString> titles,
        OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
      eTitle.clear();
      fTitle.clear();
      sLogger.info(docnoPair);
      int cnt = 0;
      while (titles.hasNext()) {
        PairOfIntString title = titles.next();
        sLogger.info(title);
        if (title.getLeftElement() == CLIRUtils.E) {
          eTitle.set(title.getRightElement());
          cnt++;
        } else if (title.getLeftElement() == CLIRUtils.F) {
          fTitle.set(title.getRightElement());
          cnt++;
        } else {
          throw new RuntimeException("Unknown language ID: " + title.getLeftElement());
        }
      }
      // emit the title pair only if both sides were seen
      if (cnt == 2) {
        output.collect(fTitle, eTitle);
      } else {
        sLogger.info("Incomplete data for " + docnoPair + ": " + fTitle + "," + eTitle);
      }
    }
  }
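  // Toy walkthrough of the join (hypothetical titles): for key (1000000074, 6127),
  // the reducer might receive <F, "Anarchismus"> from the foreign-side mapper and
  // <E, "Anarchism"> from the English-side mapper, and would emit the line
  // "Anarchismus<TAB>Anarchism". With only one side present, nothing is emitted.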
  /**
   * Runs this tool.
   */
  @SuppressWarnings("deprecation")
  public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf(), Docnos2Titles.class);

    // Read command-line arguments
    CommandLine cmdline = parseArgs(args);
    if (cmdline == null) {
      printUsage();
    }

    String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
    String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
    String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
    String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
    String eLang = cmdline.getOptionValue(ELANG_OPTION);
    String fLang = cmdline.getOptionValue(FLANG_OPTION);
    String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);

    job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);

    FileInputFormat.addInputPaths(job, eCollectionPath);
    FileInputFormat.addInputPaths(job, fCollectionPath);
    FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
    DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);
    // the sample file is optional; guard against a null URI/configuration value
    if (samplesFile != null) {
      DistributedCache.addCacheFile(new URI(samplesFile), job);
      job.set("Ivory.SampleFile", samplesFile);
    }
    job.set("eLang", eLang);
    job.set("fLang", fLang);
    job.set("PwsimPairs", pwsimOutputPath);

    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "3000");
    job.set("mapreduce.map.java.opts", "-Xmx3000m");
    job.setBoolean("mapred.map.tasks.speculative.execution", false);
    job.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setNumMapTasks(100);
    job.setNumReduceTasks(1);
    job.setInt("mapred.min.split.size", 2000000000);
    job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(PairOfIntString.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    sLogger.info("Running job " + job.getJobName() + "...");
    sLogger.info("E-collection path: " + eCollectionPath);
    sLogger.info("F-collection path: " + fCollectionPath);
    sLogger.info("Pwsim output path: " + pwsimOutputPath);
    sLogger.info("Output path: " + titlePairsPath);
    sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

    long startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    System.out.println("Job finished in "
        + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }

  private static final String FCOLLECTION_OPTION = "f_collection";
  private static final String ECOLLECTION_OPTION = "e_collection";
  private static final String FLANG_OPTION = "f_lang";
  private static final String ELANG_OPTION = "e_lang";
  private static final String PWSIM_OPTION = "pwsim_output";
  private static final String OUTPUT_PATH_OPTION = "output";
  private static final String SAMPLEDOCNOS_OPTION = "docnos";
  private static final String LIBJARS_OPTION = "libjars";

  @SuppressWarnings("static-access")
  private CommandLine parseArgs(String[] args) throws Exception {
    options = new Options();
    options.addOption(OptionBuilder.withDescription("path to output of pwsim algorithm")
        .withArgName("path").hasArg().isRequired().create(PWSIM_OPTION));
    options.addOption(OptionBuilder.withDescription("path to output")
        .withArgName("path").hasArg().isRequired().create(OUTPUT_PATH_OPTION));
    options.addOption(OptionBuilder.withDescription("source-side raw collection path")
        .withArgName("path").hasArg().isRequired().create(FCOLLECTION_OPTION));
    options.addOption(OptionBuilder.withDescription("target-side raw collection path")
        .withArgName("path").hasArg().isRequired().create(ECOLLECTION_OPTION));
    options.addOption(OptionBuilder.withDescription("two-letter code for f-language")
        .withArgName("en|de|tr|cs|zh|ar|es").hasArg().isRequired().create(FLANG_OPTION));
    options.addOption(OptionBuilder.withDescription("two-letter code for e-language")
        .withArgName("en|de|tr|cs|zh|ar|es").hasArg().isRequired().create(ELANG_OPTION));
    options.addOption(OptionBuilder.withDescription("only keep pairs that match these docnos")
        .withArgName("path to sample docnos file").hasArg().create(SAMPLEDOCNOS_OPTION));
    options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars")
        .withArgName("jar packages").hasArg().create(LIBJARS_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return null;
    }
    return cmdline;
  }

  /**
   * Dispatches command-line arguments to the tool via the
   * <code>ToolRunner</code>.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Docnos2Titles(), args);
    System.exit(res);
  }
}
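// Example invocation (a sketch; the jar name and paths are hypothetical, while the
// option names match the constants defined above):
//
//   hadoop jar ivory.jar ivory.lsh.eval.Docnos2Titles \
//     -e_collection /wikipedia/enwiki.seq -f_collection /wikipedia/dewiki.seq \
//     -e_lang en -f_lang de \
//     -pwsim_output /pwsim/similar-pairs -output /pwsim/title-pairs \
//     -docnos /pwsim/sample-docnos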