package ivory.core.util; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.LineReader; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Level; import org.apache.log4j.Logger; import edu.umd.cloud9.collection.clue.ClueWarcForwardIndex; import edu.umd.cloud9.collection.clue.ClueWarcRecord; import edu.umd.cloud9.mapred.NullInputFormat; import edu.umd.cloud9.mapred.NullMapper; import edu.umd.cloud9.mapred.NullOutputFormat; public class AnnotateClueRunAllWithURLs extends Configured implements Tool { private static final Logger sLogger = Logger.getLogger(AnnotateClueRunAllWithURLs.class); static { Logger.getLogger(edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.class).setLevel( Level.WARN); } private static enum MyCounter { Count, Time }; private static class MyMapper extends NullMapper { public void run(JobConf conf, Reporter reporter) throws IOException { String inputFile = conf.get("InputFile"); String outputFile = conf.get("OutputFile"); ClueWarcForwardIndex[] indexes = new ClueWarcForwardIndex[10]; indexes[0] = new ClueWarcForwardIndex(); indexes[0].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.01.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[1] = new ClueWarcForwardIndex(); indexes[1].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.02.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[2] = new ClueWarcForwardIndex(); indexes[2].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.03.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[3] = new ClueWarcForwardIndex(); indexes[3].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.04.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[4] = new ClueWarcForwardIndex(); indexes[4].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.05.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[5] = new ClueWarcForwardIndex(); indexes[5].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.06.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[6] = new ClueWarcForwardIndex(); indexes[6].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.07.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[7] = new ClueWarcForwardIndex(); indexes[7].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.08.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[8] = new ClueWarcForwardIndex(); indexes[8].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.09.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); indexes[9] = new ClueWarcForwardIndex(); indexes[9].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.10.dat"), new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf)); FileSystem fs = FileSystem.get(conf); sLogger.info("reading " + inputFile); LineReader reader = new LineReader(fs.open(new Path(inputFile))); FSDataOutputStream writer = fs.create(new Path(outputFile), true); Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); String docid = arr[2]; int rank = Integer.parseInt(arr[3]); long start = System.currentTimeMillis(); ClueWarcRecord doc = null; for (int i = 0; i < 10; i++) { doc = indexes[i].getDocument(docid); if (doc != null) break; } String url = doc.getHeaderMetadataItem("WARC-Target-URI"); long duration = System.currentTimeMillis() - start; reporter.incrCounter(MyCounter.Count, 1); reporter.incrCounter(MyCounter.Time, duration); if (rank == 1 || rank % 100 == 0) sLogger.info(line + " " + url + " (" + duration + "ms)"); writer.write(new String(line + " " + url + "\n").getBytes()); } reader.close(); writer.close(); } } /** * Creates an instance of this tool. */ public AnnotateClueRunAllWithURLs() { } private static int printUsage() { System.out.println("usage: [input-file] [output-file]"); ToolRunner.printGenericCommandUsage(System.out); return -1; } /** * Runs this tool. */ public int run(String[] args) throws Exception { if (args.length != 2) { printUsage(); return -1; } String inputFile = args[0]; String outputFile = args[1]; sLogger.info("Tool name: AnnotateClueRunWithURLs"); sLogger.info(" - input file: " + inputFile); sLogger.info(" - output file: " + outputFile); JobConf conf = new JobConf(AnnotateClueRunAllWithURLs.class); conf.setJobName("AnnotateClueRunWithURLs"); conf.setSpeculativeExecution(false); conf.setNumMapTasks(1); conf.setNumReduceTasks(0); conf.setInputFormat(NullInputFormat.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.set("InputFile", inputFile); conf.set("OutputFile", outputFile); JobClient.runJob(conf); return 0; } /** * Dispatches command-line arguments to the tool via the * <code>ToolRunner</code>. */ public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new AnnotateClueRunAllWithURLs(), args); System.exit(res); } }