package uk.bl.wa.hadoop.recrawl; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import uk.bl.wa.hadoop.ArchiveFileInputFormat; import uk.bl.wa.hadoop.TextOutputFormat; @SuppressWarnings("deprecation") public class PersistLogBuilder extends Configured implements Tool { private static final String CLI_USAGE = "[-i <input file>] [-o <output dir>] [-r <number of reducers>]"; private static final String CLI_HEADER = "PersistLogBuilder - MapReduce method for building persistlog data from WARCS."; private String input; private String output; private int reducers = 1; public int run(String[] args) throws IOException, ParseException { JobConf conf = new JobConf(getConf(), PersistLogBuilder.class); String line = null; setup(args); BufferedReader br = new BufferedReader(new FileReader(this.input)); int lineCount = 0; while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); System.out.print("Added " + ++lineCount + " input paths.\r"); } System.out.println(); FileOutputFormat.setOutputPath(conf, new Path(this.output)); conf.setJobName(this.input + "_" + System.currentTimeMillis()); conf.setInputFormat(ArchiveFileInputFormat.class); conf.setMapperClass(PersistLogMapper.class); conf.setMapOutputValueClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setOutputFormat(TextOutputFormat.class); conf.setNumReduceTasks(reducers); this.setProperties(conf); JobClient client = new JobClient(conf); client.submitJob(conf); return 0; } private void setup(String[] args) throws ParseException { Options options = new Options(); options.addOption("i", true, "input file list"); options.addOption("o", true, "output directory"); options.addOption("r", true, "number of reducers"); CommandLineParser parser = new PosixParser(); CommandLine cmd = parser.parse(options, args); if (!cmd.hasOption("i") || !cmd.hasOption("o")) { HelpFormatter helpFormatter = new HelpFormatter(); helpFormatter.setWidth(80); helpFormatter.printHelp(CLI_USAGE, CLI_HEADER, options, ""); System.exit(1); } this.input = cmd.getOptionValue("i"); this.output = cmd.getOptionValue("o"); if (cmd.hasOption("r")) { reducers = Integer.parseInt(cmd.getOptionValue("r")); } } public static void main(String[] args) throws Exception { int ret = ToolRunner.run(new PersistLogBuilder(), args); System.exit(ret); } private void setProperties(JobConf conf) throws IOException { conf.set("mapred.reduce.tasks.speculative.execution", "false"); conf.set("mapred.output.compress", "true"); conf.set("mapred.compress.map.output", "true"); conf.setClass("mapred.output.compression.codec", GzipCodec.class, CompressionCodec.class); } }