package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawDataSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.TextBytes;

import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;

/**
 * The link graph data in the intermediate segments doesn't seem to be sorted properly, and
 * regenerating it would be costly, so to avoid that we re-sort and re-merge each shard
 * individually. ... Temporary Fix :-(
 *
 * @author rana
 *
 */
public class CrawlDBResortFinalJob implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

  static final Log LOG = LogFactory.getLog(CrawlDBResortFinalJob.class);

  public static void main(String[] args) throws IOException {
    Path existingMergeDBPath = new Path(args[0]);
    Path fixedMergedDBPath = new Path(args[1]);

    Configuration conf = new Configuration();

    // spin up the resort job ...
    JobConf jobConf = new JobBuilder("Resort Final Merge Shards", conf)
      .inputs(Lists.newArrayList(existingMergeDBPath))
      .inputFormat(MultiFileMergeUtils.MultiFileMergeInputFormat.class)
      .mapperKeyValue(IntWritable.class, Text.class)
      .outputKeyValue(TextBytes.class, TextBytes.class)
      .outputFormat(SequenceFileOutputFormat.class)
      .reducer(CrawlDBResortFinalJob.class, false)
      .partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
      .numReducers(CrawlDBCommon.NUM_SHARDS)
      .speculativeExecution(true)
      .output(fixedMergedDBPath)
      .compressMapOutput(true)
      .compressor(CompressionType.BLOCK, GzipCodec.class)
      .maxMapAttempts(10)
      .maxReduceAttempts(4)
      .maxMapTaskFailures(1)
      .reuseJVM(1)
      .build();

    LOG.info("Starting JOB:" + jobConf);
    try {
      JobClient.runJob(jobConf);
      LOG.info("Finished JOB:" + jobConf);
    } catch (IOException e) {
      LOG.error("Failed to Execute JOB:" + jobConf + " Exception:\n" + CCStringUtils.stringifyException(e));
    }
  }

  JobConf _conf;

  @Override
  public void configure(JobConf job) {
    _conf = job;
  }

  @Override
  public void close() throws IOException {
    // no cleanup required
  }

  // declare a key/value comparator that delegates to the CrawlDBKey link key comparator ...
  static class Comparator implements RawKeyValueComparator<TextBytes, TextBytes> {

    CrawlDBKey.LinkKeyComparator comparator = new CrawlDBKey.LinkKeyComparator();

    @Override
    public int compare(TextBytes key1, TextBytes value1, TextBytes key2, TextBytes value2) {
      return comparator.compare(key1, key2);
    }

    @Override
    public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
        byte[] key2Data, int key2Offset, int key2Length,
        byte[] value1Data, int value1Offset, int value1Length,
        byte[] value2Data, int value2Offset, int value2Length) throws IOException {
      return comparator.compare(key1Data, key1Offset, key1Length, key2Data, key2Offset, key2Length);
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public void reduce(IntWritable key, final Iterator<Text> values,
      final OutputCollector<TextBytes, TextBytes> collector, final Reporter reporter) throws IOException {

    // we expect a single path per shard here ...

    // construct the CrawlDBWriter (the merging reducer instance we're going to delegate to)
    final CrawlDBMergingReducer crawlDBWriter = new CrawlDBMergingReducer();
    crawlDBWriter.configure(_conf);

    // construct a raw data spill writer (required by the merger) that delegates to the merging reducer
    RawDataSpillWriter<TextBytes, TextBytes> spillWriter = new RawDataSpillWriter<TextBytes, TextBytes>() {

      TextBytes _key = new TextBytes();
      TextBytes _value = new TextBytes();
      DataInputBuffer _buffer = new DataInputBuffer();

      @Override
      public void spillRecord(TextBytes key, TextBytes value) throws IOException {
        // ok, spill this to the final collector ...
        crawlDBWriter.reduce(key, Iterators.forArray(value), collector, reporter);
      }

      @Override
      public void close() throws IOException {
        // flush the writer ...
        crawlDBWriter.close();
      }

      @Override
      public void spillRawRecord(byte[] keyData, int keyOffset, int keyLength,
          byte[] valueData, int valueOffset, int valueLength) throws IOException {
        // we want to avoid any memory allocations here ...
        // read the vint-encoded key and value lengths, and reconstitute the key/value objects in place.
        _buffer.reset(keyData, keyOffset, keyLength);
        int realLength = WritableUtils.readVInt(_buffer);
        _key.set(keyData, _buffer.getPosition(), realLength);
        _buffer.reset(valueData, valueOffset, valueLength);
        realLength = WritableUtils.readVInt(_buffer);
        _value.set(valueData, _buffer.getPosition(), realLength);
        // delegate to the typed spill method
        spillRecord(_key, _value);
      }
    };

    // we need a custom config for the merger, since we want to use really big buffers
    // to accommodate really big key/value pairs
    Configuration sortConf = new Configuration(_conf);
    // set up big buffer sizes for the merge ...
    sortConf.setInt(MergeSortSpillWriter.SPILL_INDEX_BUFFER_SIZE_PARAM, 250000000);
    sortConf.setInt(SequenceFileSpillWriter.SPILL_WRITER_BUFFER_SIZE_PARAM, 250000000);

    // spawn the merge sorter
    // it will sort incoming data in chunks and spill each chunk to temp,
    // then merge sort all chunks and spill to the final output (spillWriter)
    @SuppressWarnings("rawtypes")
    MergeSortSpillWriter merger = new MergeSortSpillWriter<TextBytes, TextBytes>(
        sortConf,
        spillWriter,
        FileSystem.getLocal(_conf),
        new Path("/mnt/tmp/"),
        null,
        new Comparator(),
        TextBytes.class,
        TextBytes.class,
        true,
        null);

    try {
      Path inputPath = new Path(Iterators.getNext(values, null).toString());

      // read the unsorted file and feed its data to the merger ...
      SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(inputPath.toUri(), _conf), inputPath, _conf);
      try {
        TextBytes inputKey = new TextBytes();
        TextBytes inputValue = new TextBytes();

        while (reader.next(inputKey, inputValue)) {
          merger.spillRecord(inputKey, inputValue);
        }
      } finally {
        reader.close();
      }
    } finally {
      merger.close();
      spillWriter.close();
    }
  }
}
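
// Usage sketch: main() takes two positional arguments, the existing merged crawldb path
// and the destination path for the re-sorted shards. The jar name and HDFS paths below are
// illustrative assumptions, not part of this class:
//
//   hadoop jar commoncrawl-jobs.jar \
//     org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBResortFinalJob \
//     /crawldb/merged/latest /crawldb/merged/latest-resorted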