package edu.umd.cloud9.util; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapred.lib.IdentityMapper; import org.apache.hadoop.mapred.lib.IdentityReducer; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Level; import org.apache.log4j.Logger; /** * Combine a number of sequence files into a smaller number of sequence files. <p> * <p> * Input: Any number of sequence files containing key-value pairs, each of a certain type.<p> * Output: N sequence or text files containing all key-value pairs given as input.<p> * <p> * Given the number of desired sequence files, say N, map over a number of sequence files and partition all key-value pairs into N.<p> * <p> * Usage: [input] [output-dir] [number-of-mappers] [number-of-reducers] [key-class-name] [value-class-name] [seqeuence|text]<p> *<p> * @author ferhanture * */ public class CombineSequenceFiles extends Configured implements Tool{ private static final Logger LOG = Logger.getLogger(CombineSequenceFiles.class); public CombineSequenceFiles(){ } private static int printUsage() { System.out.println("usage: [input] [output-dir] [number-of-mappers] [number-of-reducers] [key-class-name] [value-class-name] [seqeuence|text]"); return -1; } @SuppressWarnings("unchecked") public int run(String[] args) throws Exception { if (args.length != 7 && args.length!=8) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int numMappers = Integer.parseInt(args[2]); int numReducers = Integer.parseInt(args[3]); String keyClassName = args[4]; String valueClassName = args[5]; Class<? extends Writable> keyClass, valueClass; try { keyClass = (Class<? extends Writable>) Class.forName(keyClassName); valueClass = (Class<? extends Writable>) Class.forName(valueClassName); } catch (ClassNotFoundException e) { e.printStackTrace(); throw new RuntimeException("Class not found: "+keyClassName + "," + valueClassName); } JobConf job = new JobConf(CombineSequenceFiles.class); job.setJobName("CombineSequenceFiles"); FileSystem fs = FileSystem.get(job); fs.delete(new Path(outputPath), true); fs.delete(new Path(inputPath + "/_SUCCESS"), true); FileStatus[] stat = fs.listStatus(new Path(inputPath)); for (int i = 0; i < stat.length; ++i) { FileInputFormat.addInputPath(job, stat[i].getPath()); LOG.info("Added: "+stat[i].getPath()); } FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, false); job.set("mapred.child.java.opts", "-Xmx2048m"); job.setInt("mapred.map.max.attempts", 100); job.setInt("mapred.reduce.max.attempts", 100); job.setInt("mapred.task.timeout", 600000000); if(args.length==8){ job.set("mapred.job.tracker", "local"); job.set("fs.default.name", "file:///"); } LOG.setLevel(Level.INFO); LOG.info("Running job "+job.getJobName()); LOG.info("Input directory: "+inputPath); LOG.info("Output directory: "+outputPath); LOG.info("Number of mappers: "+numMappers); LOG.info("Number of reducers: "+numReducers); LOG.info("Key class: "+keyClass.getName()); LOG.info("Value class: "+valueClass.getName()); job.setNumMapTasks(numMappers); job.setNumReduceTasks(numReducers); job.setInputFormat(SequenceFileInputFormat.class); job.setMapOutputKeyClass(keyClass); job.setMapOutputValueClass(valueClass); job.setOutputKeyClass(keyClass); job.setOutputValueClass(valueClass); job.setMapperClass(IdentityMapper.class); job.setReducerClass(IdentityReducer.class); if (args[6].equals("sequence")) { job.setOutputFormat(SequenceFileOutputFormat.class); }else if (args[6].equals("text")) { job.setOutputFormat(TextOutputFormat.class); }else { throw new RuntimeException("Unknown output format: "+args[6]); } JobClient.runJob(job); return 0; } public static void main(String[] args) throws Exception{ ToolRunner.run(new Configuration(), new CombineSequenceFiles(), args); return; } }