package org.archive.wayback.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapRunner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class HTTPImportJob extends Configured implements Tool {
Configuration conf = null;
public final static String HTTP_IMPORT_TARGET = "http-import.target";
public Configuration getConf() {
return conf;
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public int run(String[] args) throws Exception {
Job job = new Job(getConf(), "http-import");
Configuration conf = job.getConfiguration();
job.setJarByClass(HTTPImportJob.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(HTTPImportMapper.class);
int i = 0;
int numMaps = 10;
while(i < args.length -1) {
if(args[i].equals("-m")) {
i++;
numMaps = Integer.parseInt(args[i]);
i++;
} else {
break;
}
}
if(args.length - 3 != i) {
throw new IllegalArgumentException("wrong number of args...");
}
Path inputPath = new Path(args[i]);
Path outputPath = new Path(args[i+1]);
Path targetPath = new Path(args[i+2]);
TextInputFormat.addInputPath(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
conf.set(HTTP_IMPORT_TARGET, targetPath.toString());
conf.setBoolean("mapred.map.tasks.speculative.execution", false);
FileSystem fs = inputPath.getFileSystem(conf);
FileStatus inputStatus = fs.getFileStatus(inputPath);
long inputLen = inputStatus.getLen();
long bytesPerMap = (int) inputLen / numMaps;
FileInputFormat.setMaxInputSplitSize(job, bytesPerMap);
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new HTTPImportJob(), args);
System.exit(res);
}
}