package edu.umd.cloud9.example.hits;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import tl.lin.data.array.ArrayListOfIntsWritable;

/**
 * <p>
 * Driver program that merges the output of HFormatterWG and AFormatterWG into a
 * single set of output files. It takes five command line arguments:
 * </p>
 *
 * <ul>
 * <li>[hub-input-path]: input directory containing output of HFormatterWG</li>
 * <li>[auth-input-path]: input directory containing output of AFormatterWG</li>
 * <li>[output-path]: output directory</li>
 * <li>[num-mappers]: number of mappers to use (may be overridden by Hadoop)</li>
 * <li>[num-reducers]: number of reducers to use, also the number of output files</li>
 * </ul>
 *
 * @see HFormatterWG
 * @see AFormatterWG
 * @author Mike McGrath
 */
public class MergeFormattedRecords extends Configured implements Tool {

  private static final Logger sLogger = Logger.getLogger(MergeFormattedRecords.class);

  /**
   * Reducer that folds every record sharing a node id into a single
   * {@code TYPE_NODE_COMPLETE} node. Hub records contribute the hub rank and
   * outlinks; authority records contribute the authority rank and inlinks.
   * A side that never appears keeps its default (rank 0, empty link list).
   */
  private static class MergeReducer extends MapReduceBase implements
      Reducer<IntWritable, HITSNode, IntWritable, HITSNode> {

    /**
     * Merges all records for {@code key} and emits exactly one combined node.
     *
     * @param key node id shared by all incoming values
     * @param values hub and/or authority records for this node
     * @param output collector receiving the merged node, keyed by {@code key}
     * @param reporter progress reporter (unused)
     * @throws IOException if the collector fails to write the record
     */
    @Override
    public void reduce(IntWritable key, Iterator<HITSNode> values,
        OutputCollector<IntWritable, HITSNode> output, Reporter reporter) throws IOException {
      // Start from an empty, complete node so that a missing hub or authority
      // record still yields well-defined ranks and link lists.
      HITSNode nodeOut = new HITSNode();
      nodeOut.setType(HITSNode.TYPE_NODE_COMPLETE);
      nodeOut.setARank(0);
      nodeOut.setInlinks(new ArrayListOfIntsWritable());
      nodeOut.setHRank(0);
      nodeOut.setOutlinks(new ArrayListOfIntsWritable());
      nodeOut.setNodeId(key.get());

      while (values.hasNext()) {
        HITSNode nodeIn = values.next();

        // The link lists are copied rather than shared: the framework may reuse
        // the value instance across iterations, so holding a reference is unsafe.
        if (nodeIn.getType() == HITSNode.TYPE_HUB_COMPLETE) {
          nodeOut.setHRank(nodeIn.getHRank());
          nodeOut.setOutlinks(new ArrayListOfIntsWritable(nodeIn.getOutlinks()));
        }
        if (nodeIn.getType() == HITSNode.TYPE_AUTH_COMPLETE) {
          nodeOut.setARank(nodeIn.getARank());
          nodeOut.setInlinks(new ArrayListOfIntsWritable(nodeIn.getInlinks()));
        }
      }

      output.collect(key, nodeOut);
    }
  }

  /**
   * Prints the expected command line arguments followed by Hadoop's generic
   * option help.
   *
   * @return always {@code -1}, suitable for use as a failing exit code
   */
  private static int printUsage() {
    System.out
        .println("usage: [hub-input-path] [auth-input-path] [output-path] [num-mappers] [num-reducers]");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * Configures and runs the merge job.
   *
   * @param args the five positional arguments described in the class documentation
   * @return {@code 0} when the job completes, {@code -1} when the argument count is wrong
   * @throws Exception if a numeric argument cannot be parsed or the job fails
   */
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 5) {
      return printUsage();
    }

    String hInputPath = args[0];
    String aInputPath = args[1];
    String outputPath = args[2];
    int mapTasks = Integer.parseInt(args[3]);
    int reduceTasks = Integer.parseInt(args[4]);

    sLogger.info("Tool: MergeFormattedRecords");
    sLogger.info(" - input paths: " + hInputPath + " " + aInputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    // Seed the job from the configuration injected by ToolRunner so that
    // generic options (-D, -fs, -jt, ...) are honoured. Fall back to a fresh
    // configuration when run() is invoked without one having been set.
    Configuration baseConf = getConf();
    JobConf conf = (baseConf == null)
        ? new JobConf(MergeFormattedRecords.class)
        : new JobConf(baseConf, MergeFormattedRecords.class);
    conf.setJobName("HAMergeFormattedRecords");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    Path outputDir = new Path(outputPath);

    FileInputFormat.setInputPaths(conf, new Path(hInputPath));
    FileInputFormat.addInputPath(conf, new Path(aInputPath));
    FileOutputFormat.setOutputPath(conf, outputDir);
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MergeReducer.class);

    // Delete the output directory if it exists already. Resolve the file system
    // from the path itself so an output path on a non-default file system works.
    FileSystem outputFs = outputDir.getFileSystem(conf);
    outputFs.delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0
        + " seconds");

    return 0;
  }

  /**
   * Command line entry point; delegates to {@link ToolRunner} and exits with
   * the tool's return code.
   *
   * @param args command line arguments, including any Hadoop generic options
   * @throws Exception if the tool fails
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new MergeFormattedRecords(), args);
    System.exit(res);
  }
}