package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;

import javax.annotation.Nullable;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.hadoop.util.TextDatumInputSplit;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergeSortReducer.RawValueIterator;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.KeyBasedSequenceFileIndex;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;

/**
 * Although a 10000 shard index (the default when generating the crawldb)
 * is good for merge parallelism, it is unwieldy when trying to run
 * queries against it. This job shrinks the number of shards down to a more
 * manageable level and also builds an index against the resulting database.
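 *
 * Example invocation (the jar name, paths, and shard count shown here are
 * illustrative assumptions, not values taken from this source):
 *
 *   hadoop jar commoncrawl.jar org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCompactor \
 *     -input s3n://example-bucket/crawldb/merged \
 *     -output s3n://example-bucket/crawldb/compacted \
 *     -shards 500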
 *
 * @author rana
 *
 */
@SuppressWarnings("static-access")
public class CrawlDBCompactor {

  static final Log LOG = LogFactory.getLog(CrawlDBCompactor.class);

  static Options options = new Options();

  static {
    options.addOption(OptionBuilder.withArgName("input").hasArg(true).isRequired().withDescription("Input Path").create("input"));
    options.addOption(OptionBuilder.withArgName("output").hasArg(true).isRequired().withDescription("Output Path").create("output"));
    options.addOption(OptionBuilder.withArgName("shards").hasArg(true).isRequired().withDescription("Desired Output Shard Count").create("shards"));
    options.addOption(OptionBuilder.withArgName("sample").hasArg(true).withDescription("Optional Sample Size").create("sample"));
  }

  public static void main(String[] args) throws Exception {
    CommandLineParser parser = new GnuParser();

    try {
      // parse the command line arguments
      CommandLine cmdLine = parser.parse(options, args);
      // build the index...
      compactDB(cmdLine);
    }
    catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("CrawlDBCompactor", options);
      throw e;
    }
  }

  static void compactDB(CommandLine commandLine) throws IOException {
    Configuration conf = new Configuration();

    Path inputPath = new Path(commandLine.getOptionValue("input"));
    Path outputPath = new Path(commandLine.getOptionValue("output"));
    int targetShardCount = Integer.parseInt(commandLine.getOptionValue("shards"));
    int sampleSize = (commandLine.hasOption("sample") ? Integer.parseInt(commandLine.getOptionValue("sample")) : -1);

    FileSystem inputFS = FileSystem.get(inputPath.toUri(), conf);

    // collect shards from input
    FileStatus shards[] = inputFS.globStatus(new Path(inputPath, "part-*"));

    // restrict shard count to sample size if so desired
    if (sampleSize != -1) {
      shards = Arrays.copyOfRange(shards, 0, sampleSize);
    }

    if (shards.length % targetShardCount != 0) {
      throw new IOException("input shard count:" + shards.length
          + " not evenly divisible by target shard count:" + targetShardCount);
    }

    // transform to paths
    Iterator<Path> pathIterator = Iterators.transform(Iterators.forArray(shards), new Function<FileStatus, Path>() {

      @Override
      @Nullable
      public Path apply(@Nullable FileStatus arg0) {
        return arg0.getPath();
      }
    });

    // partition ...
    final List<List<Path>> partitions = Lists.partition(Lists.newArrayList(pathIterator), shards.length / targetShardCount);

    // set the partition info into the conf
    CustomInputFormat.writePartitions(partitions, conf);

    // setup job conf
    JobConf jobConf = new JobBuilder("Index Builder", conf)
        .inputFormat(CustomInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(NullOutputFormat.class)
        .reducer(CrawlDBCompactingReducer.class, false)
        .partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
        .numReducers(targetShardCount)
        .speculativeExecution(true)
        .output(outputPath)
        .compressMapOutput(true)
        .compressor(CompressionType.BLOCK, GzipCodec.class)
        .maxMapAttempts(10)
        .maxReduceAttempts(3)
        .maxReduceTaskFailures(5)
        .reuseJVM(1)
        .build();

    LOG.info("Starting JOB:" + jobConf);
    try {
      JobClient.runJob(jobConf);
      LOG.info("Finished JOB:" + jobConf);
    }
    catch (Exception e) {
      LOG.info("JOB Exec Failed for:" + jobConf);
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }
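  /*
   * Sizing note (the command line values are illustrative assumptions, not
   * taken from this source): with the default 10000 input shards and
   * "-shards 500", each of the 500 reduce tasks receives one partition of
   * 10000 / 500 = 20 input part files and merges them into a single output
   * part file plus its index. The optional "-sample N" switch simply
   * truncates the globbed shard list to the first N files before
   * partitioning, which is why the shard-count divisibility check in
   * compactDB() runs against the (possibly truncated) list.
   */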
  /**
   * Merge a bunch of shards into a single output shard.
   * Also, create an index of the resulting shard ...
   *
   * @author rana
   *
   */
  public static class CrawlDBCompactingReducer implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
      NUMBER_FORMAT.setMinimumIntegerDigits(5);
      NUMBER_FORMAT.setGroupingUsed(false);
    }

    JobConf _conf;

    @Override
    public void configure(JobConf job) {
      LOG.info("Configuring");
      _conf = job;
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {

      // collect all incoming paths first
      List<Path> incomingPaths = Lists.newArrayList();
      Set<String> fsType = new HashSet<String>();

      while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
        // convert to uri ...
        URI uri = new Path(path).toUri();
        // get scheme if present ...
        String scheme = uri.getScheme();
        if (scheme == null || scheme.length() == 0) {
          fsType.add("default");
        }
        else {
          fsType.add(scheme);
        }
      }

      if (fsType.size() != 1) {
        throw new IOException("Only One Input Scheme at a time supported!");
      }

      // figure out output path ...
      Path outputPath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-" + NUMBER_FORMAT.format(key.get()));
      Path indexOutputPath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "index-" + NUMBER_FORMAT.format(key.get()));

      // set up merge attributes
      Configuration localMergeConfig = new Configuration(_conf);
      // we don't want to use a grouping comparator because we are using the reducer code from the intermediate
      // merge
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, CrawlDBKey.LinkKeyComparator.class, RawComparator.class);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class);

      // setup big buffer sizes for merge sort
      localMergeConfig.setInt(MergeSortSpillWriter.SPILL_INDEX_BUFFER_SIZE_PARAM, 250000000);
      localMergeConfig.setInt(SequenceFileSpillWriter.SPILL_WRITER_BUFFER_SIZE_PARAM, 250000000);
      // set small queue size so as to not run out of RAM
      localMergeConfig.setInt(SequenceFileSpillWriter.SPILL_WRITER_BUFFER_QUEUE_SIZE_PARAM, 1);
      // set codec ...
      localMergeConfig.set(SequenceFileSpillWriter.SPILL_WRITER_COMPRESSION_CODEC, GzipCodec.class.getName());

      // create index writer ...
      KeyBasedSequenceFileIndex.IndexWriter<TextBytes, TextBytes> indexWriter
          = new KeyBasedSequenceFileIndex.IndexWriter<TextBytes, TextBytes>(localMergeConfig, indexOutputPath);

      // spill writer ...
      SequenceFileSpillWriter<TextBytes, TextBytes> spillWriter = new SequenceFileSpillWriter<TextBytes, TextBytes>(
          FileSystem.get(outputPath.toUri(), localMergeConfig),
          localMergeConfig,
          outputPath,
          TextBytes.class,
          TextBytes.class,
          indexWriter,
          true);

      try {
        // pick filesystem based on path ...
        FileSystem mergefs = getFileSystemForMergePath(incomingPaths.get(0), localMergeConfig);
        // initialize reader ...
        LOG.info("FileSystem is:" + mergefs.toString());
        MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(mergefs, incomingPaths, localMergeConfig);
        try {
          RawValueIterator rawValueIterator = new RawValueIterator();
          Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
          // walk tuples and write the raw records straight to the spill writer ...
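          // Explanatory note (not in the original source): the loop below never
          // deserializes the records. MultiFileInputReader merges the input shards
          // in the order defined by the configured CrawlDBKey.LinkKeyComparator and,
          // for each key, hands back the raw key bytes plus the raw value bytes, and
          // spillRawRecord() copies those bytes straight into the compacted
          // SequenceFile. The attached KeyBasedSequenceFileIndex.IndexWriter
          // presumably records the keys it sees so the resulting shard can later be
          // queried without a full scan.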
          while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
            for (RawRecordValue rawValue : nextItem.e1) {
              spillWriter.spillRawRecord(
                  nextItem.e0._keyData.getData(), 0, nextItem.e0._keyData.getLength(),
                  rawValue.data.getData(), 0, rawValue.data.getLength());
            }
            reporter.progress();
          }
        }
        finally {
          multiFileInputReader.close();
        }
      }
      finally {
        spillWriter.close();
      }
    }

    private static FileSystem getFileSystemForMergePath(Path path, Configuration conf) throws IOException {
      // override S3N (null-safe: paths without a scheme fall through to the default filesystem)
      if ("s3n".equalsIgnoreCase(path.toUri().getScheme())) {
        FileSystem fs = new S3NFileSystem();
        fs.initialize(path.toUri(), conf);
        return fs;
      }
      // conf.setClass("fs.s3n.impl", S3NFileSystem.class,FileSystem.class);
      return FileSystem.get(path.toUri(), conf);
    }
  }

  /**
   * An InputFormat that groups paths by shard id
   *
   * @author rana
   *
   */
  public static class CustomInputFormat implements InputFormat<IntWritable, Text> {

    public static final String PARTITION_COUNT_TEXT = "CustomFF.ParitionCount";
    public static final String PARTITION_ID_PREFIX = "CustomFF.ParitionID";

    public static void writePartitions(List<List<Path>> partitions, Configuration conf) {
      conf.setInt(PARTITION_COUNT_TEXT, partitions.size());
      for (int partIndex = 0; partIndex < partitions.size(); ++partIndex) {
        conf.set(PARTITION_ID_PREFIX + partIndex, Joiner.on(',').join(partitions.get(partIndex)).toString());
      }
    }

    @Override
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      int numPartitions = job.getInt(PARTITION_COUNT_TEXT, -1);
      InputSplit splits[] = new InputSplit[numPartitions];
      for (int i = 0; i < numPartitions; ++i) {
        // each split carries its partition id followed by the comma separated list of paths
        splits[i] = new TextDatumInputSplit(Integer.toString(i) + "," + job.get(PARTITION_ID_PREFIX + i));
      }
      return splits;
    }

    @Override
    public RecordReader<IntWritable, Text> getRecordReader(final InputSplit split, JobConf job, Reporter reporter) throws IOException {

      final ArrayList<String> parts = Lists.newArrayList(Splitter.on(',')
          .trimResults()
          .omitEmptyStrings()
          .split(((TextDatumInputSplit) split).getDatum()));

      // first token is the partition id, the remainder are the shard paths to merge
      final int partitionId = Integer.parseInt(parts.remove(0));

      return new RecordReader<IntWritable, Text>() {

        int index = 0;

        @Override
        public boolean next(IntWritable key, Text value) throws IOException {
          if (index < parts.size()) {
            key.set(partitionId);
            value.set(parts.get(index));
            index++;
            return true;
          }
          return false;
        }

        @Override
        public IntWritable createKey() {
          return new IntWritable();
        }

        @Override
        public Text createValue() {
          return new Text();
        }

        @Override
        public long getPos() throws IOException {
          return 0;
        }

        @Override
        public void close() throws IOException {
        }

        @Override
        public float getProgress() throws IOException {
          return 0;
        }
      };
    }
  }
}