package org.archive.hadoop.jobs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.hadoop.mapreduce.WATExtractorMapper;
/**
 * Map/reduce driver that extracts a set of WARC/ARC files into WAT files in HDFS.
 *
 * <p>Input is a text file of lines of either {@code FilePath} or
 * {@code BASENAME<SPACE>FilePath}; the actual extraction is done by
 * {@link WATExtractorMapper}. Run via {@link ToolRunner} (see {@link #main}).
 */
public class WATExtractorJob extends Configured implements Tool {

    /** Tool configuration, injected by {@link ToolRunner} through {@link #setConf}. */
    Configuration conf = null;

    public final static String TOOL_NAME = "WATExtractor";
    // Fixed grammar in the user-visible description ("extract" -> "extracts").
    public final static String TOOL_DESCRIPTION =
        "A map/reduce program that extracts a bunch of WARC files into WAT files into HDFS.";

    /** Configuration key: target directory the generated WAT files are written under. */
    public final static String WAT_EXTRACT_TARGET = "wat-extractor.target";
    /** Configuration key: overwrite existing WAT files instead of skipping them. */
    public final static String WAT_EXTRACTOR_OVERRIDE = "wat-extractor.override";

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Prints command-line usage to stdout.
     *
     * @return -1, so callers can simply {@code return printUsage();}
     */
    static int printUsage() {
        System.out.println("WATExtractor [OPTIONS] <input> <outputdir> <importTarget>");
        System.out.println("\tOPTIONS can be:");
        System.out.println("\t\t-m NUM - try to run with approximately NUM map tasks");
        System.out.println("\t\t--override - to override existent WAT files with the same name, the default is to skip the extracted files.");
        System.out.println("\tThe input file contains lines of the form:");
        System.out.println("\t\t\tFilePath");
        System.out.println("\tOR");
        System.out.println("\t\t\tBASENAME<SPACE>FilePath");
        System.out.println("\tif only FilePath is specified, then the target will be <importTarget>/<BASENAME of FilePath>");
        System.out.println("\totherwise the target will be <importTarget>/<BASENAME>");
        // Was System.err in the original; keep all usage text on stdout for consistency.
        System.out.println("\tFilePath is HTTP or HDFS URL to an arc, warc, arc.gz, or warc.gz.");
        System.out.println();
        return -1;
    }

    /**
     * Configures and submits the extraction job.
     *
     * @param args {@code [OPTIONS] <input> <outputdir> <importTarget>} — see {@link #printUsage()}
     * @return 0 on successful job completion, non-zero on failure or bad arguments
     * @throws Exception on job submission/execution errors
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf(), "wat-extractor");
        Configuration conf = job.getConfiguration();
        job.setJarByClass(WATExtractorJob.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(WATExtractorMapper.class);

        // Option parsing: consume leading flags, stop at the first positional arg.
        int i = 0;
        int numMaps = 10;
        while (i < args.length - 1) {
            if (args[i].equals("-m")) {
                i++;
                numMaps = Integer.parseInt(args[i]);
                i++;
            } else if (args[i].equals("--override")) {
                WATExtractorMapper.setOverride(conf, true);
                i++;
            } else {
                break;
            }
        }

        // Exactly three positional arguments must remain. The original fell
        // through after printUsage() and crashed below with
        // ArrayIndexOutOfBoundsException; bail out instead.
        if (args.length - 3 != i) {
            return printUsage();
        }
        // Guard against "-m 0" / negative values: would divide by zero below.
        if (numMaps <= 0) {
            return printUsage();
        }

        Path inputPath = new Path(args[i]);
        Path outputPath = new Path(args[i + 1]);
        Path targetPath = new Path(args[i + 2]);
        TextInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        WATExtractorMapper.setTargetDir(conf, targetPath.toString());

        // Each input line triggers an expensive, non-idempotent extraction;
        // speculative duplicates of a map task would redo (or clobber) work.
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);

        // Size splits so the input list fans out to approximately numMaps tasks.
        FileSystem fs = inputPath.getFileSystem(conf);
        FileStatus inputStatus = fs.getFileStatus(inputPath);
        long inputLen = inputStatus.getLen();
        // The original computed "(int) inputLen / numMaps", truncating the file
        // length to 32 bits before dividing — wrong for inputs over 2GB. Also
        // clamp to at least 1 byte so we never set a zero max split size.
        long bytesPerMap = Math.max(1L, inputLen / numMaps);
        FileInputFormat.setMaxInputSplitSize(job, bytesPerMap);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; delegates to {@link ToolRunner}.
     *
     * @param args command-line arguments, passed through to {@link #run}
     * @throws Exception on job failure
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new WATExtractorJob(), args);
        System.exit(res);
    }
}