package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.KeyBasedSequenceFileIndex;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;

@SuppressWarnings("static-access")
public class CrawlDBIndexWriter {

  static final Log LOG = LogFactory.getLog(CrawlDBIndexWriter.class);

  static Options options = new Options();

  static {
    options.addOption(OptionBuilder.withArgName("input").hasArg(true).isRequired().withDescription("Input Path").create("input"));
    options.addOption(OptionBuilder.withArgName("output").hasArg(true).isRequired().withDescription("Output Path").create("output"));
    options.addOption(OptionBuilder.withArgName("shards").hasArg(true).withDescription("Shard Count (test only)").create("shards"));
  }

  public static void main(String[] args) throws Exception {
    CommandLineParser parser = new GnuParser();
    try {
      // parse the command line arguments
      CommandLine cmdLine = parser.parse(options, args);
      // build the index...
      buildIndex(cmdLine);
    } catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("CrawlDBIndexWriter", options);
      throw e;
    }
  }

  static void buildIndex(CommandLine commandLine) throws IOException {
    Path inputPath = new Path(commandLine.getOptionValue("input"));
    Path outputPath = new Path(commandLine.getOptionValue("output"));

    int shardCount = CrawlDBCommon.NUM_SHARDS;
    if (commandLine.hasOption("shards")) {
      shardCount = Integer.parseInt(commandLine.getOptionValue("shards"));
    }

    Configuration conf = new Configuration();

    // construct input paths ...
    ArrayList<Path> inputPaths = new ArrayList<Path>();
    inputPaths.add(inputPath);

    JobConf jobConf = new JobBuilder("Index Builder", conf)
        .inputs(inputPaths)
        .inputFormat(MultiFileMergeUtils.MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(NullOutputFormat.class)
        .reducer(CrawlDBIndexWriterReducer.class, false)
        .partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
        .numReducers(shardCount)
        .speculativeExecution(true)
        .output(outputPath)
        .compressMapOutput(true)
        .compressor(CompressionType.BLOCK, GzipCodec.class)
        .maxMapAttempts(10)
        .maxReduceAttempts(3)
        .maxReduceTaskFailures(5)
        .reuseJVM(1)
        .build();

    LOG.info("Starting JOB:" + jobConf);
    try {
      JobClient.runJob(jobConf);
      LOG.info("Finished JOB:" + jobConf);
    } catch (Exception e) {
      LOG.info("JOB Exec Failed for:" + jobConf);
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }

  public static class CrawlDBIndexWriterReducer implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

    JobConf _conf;

    @Override
    public void configure(JobConf job) {
      LOG.info("Configuring");
      _conf = job;
    }

    @Override
    public void close() throws IOException {
      // no per-task state to release
    }

    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
      NUMBER_FORMAT.setMinimumIntegerDigits(5);
      NUMBER_FORMAT.setGroupingUsed(false);
    }

    @Override
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {

      LOG.info("Shard:" + key.get());

      // collect all incoming paths first
      Vector<Path> incomingPaths = new Vector<Path>();
      Set<String> fsType = new HashSet<String>();

      while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
        // convert to uri ...
        URI uri = new Path(path).toUri();
        // get scheme if present ...
        String scheme = uri.getScheme();
        if (scheme == null || scheme.length() == 0) {
          fsType.add("default");
        } else {
          fsType.add(scheme);
        }
      }

      if (fsType.size() != 1) {
        throw new IOException("Only One Input Scheme at a time supported!");
      }

      // pick filesystem based on path ...
      FileSystem fs = FileSystem.get(incomingPaths.get(0).toUri(), _conf);
      LOG.info("FileSystem is:" + fs.toString());

      // create output path...
      Path indexFilePath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-" + NUMBER_FORMAT.format(key.get()));

      // create the key-based index writer for this shard ...
      KeyBasedSequenceFileIndex.IndexWriter<TextBytes, TextBytes> indexWriter =
          new KeyBasedSequenceFileIndex.IndexWriter<TextBytes, TextBytes>(_conf, indexFilePath);

      try {
        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

        // read the single sharded file ...
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, incomingPaths.get(0), _conf);
        try {
          DataOutputBuffer keyBuffer = new DataOutputBuffer();
          // walk raw keys, recording the pre-read offset whenever the position advances ...
          long preReadPos = reader.getPosition();
          while (reader.nextRawKey(keyBuffer) != -1) {
            long postReadPos = reader.getPosition();
            if (postReadPos != preReadPos) {
              indexWriter.indexItem(keyBuffer.getData(), 0, keyBuffer.getLength(), null, 0, 0, preReadPos);
            }
            preReadPos = postReadPos;
            reporter.progress();
            keyBuffer.reset();
          }
        } finally {
          reader.close();
        }
      } finally {
        indexWriter.close();
      }
    }
  }
}
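
/*
 * Usage sketch (not part of the original source). The class and option names
 * are real; the jar name and S3 paths are hypothetical placeholders. On a
 * Hadoop cluster the tool would typically be launched along the lines of:
 *
 *   hadoop jar commoncrawl-tools.jar \
 *     org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBIndexWriter \
 *     -input  s3n://example-bucket/crawldb/merged \
 *     -output s3n://example-bucket/crawldb/index \
 *     -shards 10
 *
 * The equivalent programmatic invocation simply forwards the same flags to
 * main(), as the minimal driver below shows.
 */
class CrawlDBIndexWriterDriverExample {
  public static void main(String[] args) throws Exception {
    CrawlDBIndexWriter.main(new String[] {
        "-input",  "s3n://example-bucket/crawldb/merged", // hypothetical input path
        "-output", "s3n://example-bucket/crawldb/index",  // hypothetical output path
        "-shards", "10"                                   // optional, test-only shard override
    });
  }
}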
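
/*
 * Lookup sketch (an assumption, not part of the original source). The reducer
 * above records a key's pre-read byte offset only when getPosition() has
 * advanced, i.e. at seekable boundaries of the (possibly block-compressed)
 * sequence file. A reader can therefore seek to an offset obtained from the
 * index and scan forward to the wanted key, assuming the merged crawl-db
 * shards carry TextBytes keys and values as the job's output schema suggests.
 * The shard path, offset, and target key are hypothetical parameters.
 */
class CrawlDBIndexSeekExample {
  static TextBytes findRecord(FileSystem fs, Path shardPath, Configuration conf,
                              long indexedOffset, String targetKey) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, shardPath, conf);
    try {
      // jump to the boundary offset recorded by CrawlDBIndexWriterReducer ...
      reader.seek(indexedOffset);
      TextBytes keyOut = new TextBytes();
      TextBytes valueOut = new TextBytes();
      // ... then scan forward until we hit the record we actually want.
      while (reader.next(keyOut, valueOut)) {
        if (keyOut.toString().equals(targetKey)) {
          return valueOut;
        }
      }
      return null; // key not found past this offset
    } finally {
      reader.close();
    }
  }
}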