package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawDataSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.TextBytes;

import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;

/**
 * The link graph data in the intermediate segments doesn't seem to be sorted properly, and
 * regenerating it would be costly, so to avoid that we re-sort and re-merge each shard
 * individually. ... Temporary Fix :-(
 *
 * @author rana
 *
 */
public class CrawlDBResortFinalJob implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

  static final Log LOG = LogFactory.getLog(CrawlDBResortFinalJob.class);

  public static void main(String[] args) throws IOException {
    Path existingMergeDBPath = new Path(args[0]);
    Path fixedMergedDBPath = new Path(args[1]);

    Configuration conf = new Configuration();

    // spin up the resort job ...
    JobConf jobConf = new JobBuilder("Resort Final Merge Shards", conf)
      .inputs(Lists.newArrayList(existingMergeDBPath))
      .inputFormat(MultiFileMergeUtils.MultiFileMergeInputFormat.class)
      .mapperKeyValue(IntWritable.class, Text.class)
      .outputKeyValue(TextBytes.class, TextBytes.class)
      .outputFormat(SequenceFileOutputFormat.class)
      .reducer(CrawlDBResortFinalJob.class, false)
      .partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
      .numReducers(CrawlDBCommon.NUM_SHARDS)
      .speculativeExecution(true)
      .output(fixedMergedDBPath)
      .compressMapOutput(true)
      .compressor(CompressionType.BLOCK, GzipCodec.class)
      .maxMapAttempts(10)
      .maxReduceAttempts(4)
      .maxMapTaskFailures(1)
      .reuseJVM(1)
      .build();

    LOG.info("Starting JOB:" + jobConf);
    try {
      JobClient.runJob(jobConf);
      LOG.info("Finished JOB:" + jobConf);
    } catch (IOException e) {
      LOG.error("Failed to Execute JOB:" + jobConf + " Exception:\n" + CCStringUtils.stringifyException(e));
    }
  }

  JobConf _conf;

  @Override
  public void configure(JobConf job) {
    _conf = job;
  }

  @Override
  public void close() throws IOException {
    // no cleanup required
  }

  // declare a key/value comparator that delegates to the CrawlDBKey link key comparator ...
  static class Comparator implements RawKeyValueComparator<TextBytes, TextBytes> {

    CrawlDBKey.LinkKeyComparator comparator = new CrawlDBKey.LinkKeyComparator();

    @Override
    public int compare(TextBytes key1, TextBytes value1, TextBytes key2, TextBytes value2) {
      return comparator.compare(key1, key2);
    }

    @Override
    public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
        byte[] key2Data, int key2Offset, int key2Length,
        byte[] value1Data, int value1Offset, int value1Length,
        byte[] value2Data, int value2Offset, int value2Length) throws IOException {
      return comparator.compare(key1Data, key1Offset, key1Length, key2Data, key2Offset, key2Length);
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public void reduce(IntWritable key, final Iterator<Text> values,
      final OutputCollector<TextBytes, TextBytes> collector, final Reporter reporter) throws IOException {

    // we expect a single path per shard here ...

    // construct the CrawlDBWriter (the merging reducer instance we're going to delegate to)
    final CrawlDBMergingReducer crawlDBWriter = new CrawlDBMergingReducer();
    crawlDBWriter.configure(_conf);

    // construct a raw data spill writer (required by the merger) that delegates to the merging reducer
    RawDataSpillWriter<TextBytes, TextBytes> spillWriter = new RawDataSpillWriter<TextBytes, TextBytes>() {

      TextBytes _key = new TextBytes();
      TextBytes _value = new TextBytes();
      DataInputBuffer _buffer = new DataInputBuffer();

      @Override
      public void spillRecord(TextBytes key, TextBytes value) throws IOException {
        // ok, spill this to the final collector ...
        crawlDBWriter.reduce(key, Iterators.forArray(value), collector, reporter);
      }

      @Override
      public void close() throws IOException {
        // flush the writer ...
        crawlDBWriter.close();
      }

      @Override
      public void spillRawRecord(byte[] keyData, int keyOffset, int keyLength,
          byte[] valueData, int valueOffset, int valueLength) throws IOException {
        // we want to avoid any memory allocations here ...
        // read the vint-encoded key and value lengths, and reconstitute the key/value objects in place.
        _buffer.reset(keyData, keyOffset, keyLength);
        int realLength = WritableUtils.readVInt(_buffer);
        _key.set(keyData, _buffer.getPosition(), realLength);
        _buffer.reset(valueData, valueOffset, valueLength);
        realLength = WritableUtils.readVInt(_buffer);
        _value.set(valueData, _buffer.getPosition(), realLength);
        // delegate to the typed spill method
        spillRecord(_key, _value);
      }
    };

    // we need a custom config for the merger, since we want to use really big buffers
    // to accommodate really big key/value pairs
    Configuration sortConf = new Configuration(_conf);
    // set up big buffer sizes for the merge ...
    sortConf.setInt(MergeSortSpillWriter.SPILL_INDEX_BUFFER_SIZE_PARAM, 250000000);
    sortConf.setInt(SequenceFileSpillWriter.SPILL_WRITER_BUFFER_SIZE_PARAM, 250000000);

    // spawn the merge sorter
    // it will sort incoming data in chunks and spill each chunk to temp,
    // then merge sort all chunks and spill to the final output (spillWriter)
    @SuppressWarnings("rawtypes")
    MergeSortSpillWriter merger = new MergeSortSpillWriter<TextBytes, TextBytes>(
        sortConf,
        spillWriter,
        FileSystem.getLocal(_conf),
        new Path("/mnt/tmp/"),
        null,
        new Comparator(),
        TextBytes.class,
        TextBytes.class,
        true,
        null);

    try {
      Path inputPath = new Path(Iterators.getNext(values, null).toString());

      // read the unsorted file and feed its data to the merger ...
      SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(inputPath.toUri(), _conf), inputPath, _conf);
      try {
        TextBytes inputKey = new TextBytes();
        TextBytes inputValue = new TextBytes();

        while (reader.next(inputKey, inputValue)) {
          merger.spillRecord(inputKey, inputValue);
        }
      } finally {
        reader.close();
      }
    } finally {
      merger.close();
      spillWriter.close();
    }
  }
}
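
// Usage sketch: main() takes two positional arguments, the existing merged crawldb path
// and the destination path for the re-sorted shards. The jar name and HDFS paths below are
// illustrative assumptions, not part of this class:
//
//   hadoop jar commoncrawl-jobs.jar \
//     org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBResortFinalJob \
//     /crawldb/merged/latest /crawldb/merged/latest-resorted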