package org.archive.hadoop.jobs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.hadoop.io.MergeClusterRangesInputFormat;
import org.archive.hadoop.mapreduce.SimpleTextMapper;
import org.archive.hadoop.mapreduce.ZipNumOutputFormat;

public class MergeClusterRangesJob implements Tool {
    public final static String TOOL_NAME = "merge-clusters";
    public static final String TOOL_DESCRIPTION =
        "A tool for merging and re-partitioning CDX clusters";

    Configuration conf = null;

    /**
     * As hard-coded into the Text RecordWriter
     */
    public static String TEXT_OUTPUT_DELIM_CONFIG = "mapred.textoutputformat.separator";

    static int printUsage() {
        System.out.println("merge-clusters [OPTIONS] SPLIT_PATH OUTPUT_DIRECTORY CLUSTER_PATH ...");
        System.out.println();
        System.out.println("Merge and possibly re-partition multiple clusters into a new cluster at OUTPUT_DIRECTORY.");
        System.out.println("SPLIT_PATH is the HDFS URL of a file containing lines of the form:");
        System.out.println("\t\tNAME<tab>START<tab>END");
        System.out.println();
        System.out.println("The output cluster will contain one .gz file for each line,");
        System.out.println("containing records between START (inclusive) and END (exclusive).");
        System.out.println();
        System.out.println("CLUSTER_PATH is a series of directories containing existing clusters, each of which must have an ALL.summary file.");
        System.out.println("\tOPTIONS can be:");
        System.out.println("\t\t--max-map-attempts NUM - retry map (merge) tasks up to NUM times (otherwise the Hadoop default applies)");
        System.out.println("\t\t--zip-num-output NUM - compress output files with ZipNum, into blocks with NUM lines (default is 3000)");
//      System.out.println("\t\t--day-limit NUM - only allow NUM captures in a given day - discard extras");
//      System.out.println("\t\t--delimiter DELIM - assume DELIM delimiter for input and output, instead of default <SPACE>");
//      System.out.println("\t\t--global-cdx - perform special filtering for the global Wayback CDX:");
//      System.out.println("\t\t\t. if lines have 10 columns, assume column 8 is HTML meta info - omit those with 'A'");
//      System.out.println("\t\t\t. reduce digest field 6 to 3-digits");
//      System.out.println("\t\t\t. omit records with non numeric HTTP response code field 5");
//      System.out.println("\t\t\t. omit records with non numeric file offset field -2");
//      System.out.println("\t\t\t. omit records which are 502/504 live web ARCs");
//      System.out.println("\t\t\t. only allow 111 records per url-day");
//      System.out.println("\t\t--map-global - use the GLOBAL CDX map function, which implies:");
//      System.out.println("\t\t\t. extra trailing field indicating HTML meta NOARCHIVE data, which should be omitted, result lines do not include the last field");
//      System.out.println("\t\t\t. truncating digest field to 3 digits");
//      System.out.println("\t\t\t. column 0 is original URL (identity CDX files)");
        System.out.println();
//      ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * The main driver for the merge job. Invoke this method to submit the
     * map/reduce job.
     *
     * @throws IOException
     *             When there are communication problems with the job tracker.
     */
    public int run(String[] args) throws Exception {
        int zipNumLines = -1;
        List<String> otherArgs = new ArrayList<String>();
        Job job = new Job(getConf(), "merge-clusters");
        Configuration conf = job.getConfiguration();

        // Separate options from positional arguments:
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("--zip-num-output".equals(args[i])) {
                    zipNumLines = Integer.parseInt(args[++i]);
                } else if ("--max-map-attempts".equals(args[i])) {
                    conf.setInt("mapred.map.max.attempts", Integer.parseInt(args[++i]));
                } else {
                    otherArgs.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }

        // Make sure there are at least 3 positional parameters left:
        // SPLIT_PATH, OUTPUT_DIRECTORY, and one or more CLUSTER_PATHs.
        if (otherArgs.size() < 3) {
            System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size()
                    + " (expected at least 3).");
            return printUsage();
        }

        MergeClusterRangesInputFormat.setSplitPath(conf, otherArgs.get(0));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

        // The remaining arguments are the existing clusters to merge:
        String[] clusters = new String[otherArgs.size() - 2];
        for (int i = 2; i < otherArgs.size(); i++) {
            clusters[i - 2] = otherArgs.get(i);
        }
        MergeClusterRangesInputFormat.setClusterPaths(conf, clusters);

        job.setInputFormatClass(MergeClusterRangesInputFormat.class);
        job.setJarByClass(MergeClusterRangesJob.class);
        job.setMapperClass(SimpleTextMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        if (zipNumLines != -1) {
            System.err.format("INFO: zipnum count: %d\n", zipNumLines);
            ZipNumOutputFormat.setZipNumLineCount(conf, zipNumLines);
        }
        job.setOutputFormatClass(ZipNumOutputFormat.class);

        // Map-only job: each map task merges one split range directly into one output file.
        job.setNumReduceTasks(0);

        return (job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MergeClusterRangesJob(), args);
        System.exit(res);
    }

    public Configuration getConf() {
        return conf;
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
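
/*
 * Example invocation (a sketch for illustration only; the jar name and all HDFS
 * paths below are hypothetical, but the argument order follows printUsage()
 * above and the tool is launched through ToolRunner as in main()):
 *
 *   hadoop jar archive-hadoop-jobs.jar org.archive.hadoop.jobs.MergeClusterRangesJob \
 *       --zip-num-output 3000 \
 *       hdfs://namenode/user/example/split.txt \
 *       hdfs://namenode/user/example/merged-cluster \
 *       hdfs://namenode/user/example/cluster-a \
 *       hdfs://namenode/user/example/cluster-b
 *
 * Each line of split.txt names one output partition and its key range,
 * tab-separated (START inclusive, END exclusive), for example:
 *
 *   part-00000<tab>com,example)/<tab>org,example)/
 */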