package edu.isi.karma.mapreduce.driver;
import java.io.File;
import java.io.FileInputStream;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import edu.isi.karma.mapreduce.inputformat.ZIPInputFormat;
public class ZipFileProcessor extends Configured implements Tool{
public Job configure(Properties p ) throws Exception
{
Configuration conf = getConf();
if(p.getProperty("fs.default.name") != null)
{
conf.setIfUnset("fs.default.name", p.getProperty("fs.default.name"));
}
if(p.getProperty("mapred.job.tracker")!= null)
{
conf.setIfUnset("mapred.job.tracker", p.getProperty("mapred.job.tracker"));
}
Job job = Job.getInstance(conf);
job.setInputFormatClass(ZIPInputFormat.class);
job.setJarByClass(ZipFileProcessor.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setMapperClass(Mapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
String[] paths = p.getProperty("input.directory").split(",");
Path[] array = new Path[paths.length];
int i = 0;
for (String path : paths) {
array[i++] = new Path(path);
}
FileInputFormat.setInputPaths(job, array);
FileOutputFormat.setOutputPath(job, new Path(p.getProperty("output.directory")));
job.setNumReduceTasks(0);
return job;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new Configuration(), new ZipFileProcessor(), args));
}
public int run(String[] args) throws Exception {
Properties p = new Properties();
p.load(new FileInputStream(new File(args[0])));
Job job = configure(p);
if(!job.waitForCompletion(true))
{
System.err.println("Unable to finish job");
return -1;
}
return 0;
}
}