// MergeFormattedRecords.java example
//
// Explorer
// Cloud9-master
// - src
package edu.umd.cloud9.example.hits;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import tl.lin.data.array.ArrayListOfIntsWritable;

/**
 * <p>
 * Driver program that merges the output of HFormatterWG and AFormatterWG into a
 * single set of output files. It takes five command line arguments:
 * </p>
 *
 * <ul>
 * <li>[hub-input-path]: input directory containing output of HFormatterWG</li>
 * <li>[auth-input-path]: input directory containing output of AFormatterWG</li>
 * <li>[output-path]: output directory</li>
 * <li>[num-mappers]: number of mappers to use (may be overridden by Hadoop)</li>
 * <li>[num-reducers]: number of reducers to use, also the number of output files</li>
 * </ul>
 *
 * <p>
 * When launched through {@link ToolRunner}, the configuration it injects
 * (including generic options such as <code>-D key=value</code>) is used as the
 * basis of the job configuration.
 * </p>
 *
 * @see HFormatterWG
 * @see AFormatterWG
 * @author Mike McGrath
 */
public class MergeFormattedRecords extends Configured implements Tool {

	private static final Logger sLogger = Logger.getLogger(MergeFormattedRecords.class);

	/**
	 * Reducer that collapses every record sharing a node id into one record of
	 * type {@code HITSNode.TYPE_NODE_COMPLETE}. The hub rank and outlinks are
	 * taken from records of type {@code TYPE_HUB_COMPLETE}, the authority rank
	 * and inlinks from records of type {@code TYPE_AUTH_COMPLETE}. A side for
	 * which no record arrives keeps rank 0 and an empty adjacency list.
	 */
	private static class MergeReducer extends MapReduceBase implements
			Reducer<IntWritable, HITSNode, IntWritable, HITSNode> {

		/**
		 * Merges all records for one node id and emits a single combined node.
		 *
		 * @param key      node id shared by all incoming records
		 * @param values   records emitted for this node id
		 * @param output   collector receiving the pair (key, merged node)
		 * @param reporter progress reporter (not used)
		 * @throws IOException if the collector fails to write the merged node
		 */
		public void reduce(IntWritable key, Iterator<HITSNode> values,
				OutputCollector<IntWritable, HITSNode> output, Reporter reporter)
				throws IOException {
			// Start from a complete node with neutral defaults; each side is
			// overwritten below only if a matching record is present.
			HITSNode nodeOut = new HITSNode();

			nodeOut.setType(HITSNode.TYPE_NODE_COMPLETE);
			nodeOut.setARank(0);
			nodeOut.setInlinks(new ArrayListOfIntsWritable());
			nodeOut.setHRank(0);
			nodeOut.setOutlinks(new ArrayListOfIntsWritable());
			nodeOut.setNodeId(key.get());

			while (values.hasNext()) {
				HITSNode nodeIn = values.next();
				// The adjacency lists are copied rather than aliased because the
				// framework may reuse the value object between next() calls.
				if (nodeIn.getType() == HITSNode.TYPE_HUB_COMPLETE) {
					nodeOut.setHRank(nodeIn.getHRank());
					nodeOut.setOutlinks(new ArrayListOfIntsWritable(nodeIn.getOutlinks()));
				}
				if (nodeIn.getType() == HITSNode.TYPE_AUTH_COMPLETE) {
					nodeOut.setARank(nodeIn.getARank());
					nodeOut.setInlinks(new ArrayListOfIntsWritable(nodeIn.getInlinks()));
				}
			}
			output.collect(key, nodeOut);
		}
	}

	/**
	 * Prints the expected command line followed by Hadoop's generic options.
	 *
	 * @return -1, the exit code for a usage error
	 */
	private static int printUsage() {
		System.out
				.println("usage: [hub-input-path] [auth-input-path] [output-path] [num-mappers] [num-reducers]");
		ToolRunner.printGenericCommandUsage(System.out);
		return -1;
	}

	/**
	 * Configures and runs the merge job, blocking until it finishes. Any
	 * existing output directory is deleted first.
	 *
	 * @param args hub input path, authority input path, output path, number of
	 *             mappers, number of reducers (exactly five entries)
	 * @return 0 once the job has run, -1 if the argument count is wrong
	 * @throws Exception propagated from argument parsing, file system access
	 *                   or job execution
	 */
	public int run(String[] args) throws Exception {
		if (args.length != 5) {
			return printUsage();
		}

		String hInputPath = args[0];
		String aInputPath = args[1];
		String outputPath = args[2];

		int mapTasks = Integer.parseInt(args[3]);
		int reduceTasks = Integer.parseInt(args[4]);

		sLogger.info("Tool: MergeFormattedRecords");
		sLogger.info(" - input paths: " + hInputPath + " " + aInputPath);
		sLogger.info(" - output path: " + outputPath);
		sLogger.info(" - number of mappers: " + mapTasks);
		sLogger.info(" - number of reducers: " + reduceTasks);

		// Build the job on top of the configuration injected by ToolRunner so
		// that generic options (-D, -conf, -fs, ...) are not silently dropped.
		// Fall back to a default configuration when run() is called directly
		// and no configuration has been set.
		Configuration baseConf = getConf();
		JobConf conf = (baseConf == null)
				? new JobConf(MergeFormattedRecords.class)
				: new JobConf(baseConf, MergeFormattedRecords.class);
		conf.setJobName("HAMergeFormattedRecords");

		conf.setNumMapTasks(mapTasks);
		conf.setNumReduceTasks(reduceTasks);

		Path outputDir = new Path(outputPath);

		FileInputFormat.setInputPaths(conf, new Path(hInputPath));
		FileInputFormat.addInputPath(conf, new Path(aInputPath));
		FileOutputFormat.setOutputPath(conf, outputDir);
		FileOutputFormat.setCompressOutput(conf, false);

		conf.setInputFormat(SequenceFileInputFormat.class);
		conf.setOutputKeyClass(IntWritable.class);
		conf.setOutputValueClass(HITSNode.class);
		conf.setOutputFormat(SequenceFileOutputFormat.class);

		// Records pass through the map phase untouched; all merging happens
		// in the reducer.
		conf.setMapperClass(IdentityMapper.class);
		conf.setReducerClass(MergeReducer.class);

		// Delete the output directory if it exists already
		FileSystem.get(conf).delete(outputDir, true);

		long startTime = System.currentTimeMillis();
		JobClient.runJob(conf);
		sLogger.info("Job Finished in "
				+ (System.currentTimeMillis() - startTime) / 1000.0
				+ " seconds");

		return 0;
	}

	/**
	 * Command line entry point: runs the tool through {@link ToolRunner} and
	 * exits the JVM with the tool's return code.
	 *
	 * @param args command line arguments, see the class description
	 * @throws Exception if the tool fails
	 */
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(),
				new MergeFormattedRecords(), args);
		System.exit(res);
	}

}