SortPartitionData.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import static org.mockito.Mockito.mock;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.InputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Counters.Counter;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawDataSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.util.TextBytes;

import com.google.common.collect.Iterators;

/**
 * Command-line utility that reads a {@code TextBytes}/{@code TextBytes} sequence file, pushes
 * every record through a {@link MergeSortSpillWriter} ordered by
 * {@link CrawlDBKey.LinkKeyComparator}, hands each record coming out of the merge to a
 * {@link CrawlDBMergingReducer}, and appends whatever the reducer emits to a block-compressed
 * (gzip) output sequence file.
 *
 * <p>Usage: {@code SortPartitionData <inputSequenceFile> <outputSequenceFile>}
 */
public class SortPartitionData {

  /** Value used for both the spill index buffer and the spill writer buffer size settings. */
  private static final int SPILL_BUFFER_SIZE = 100000000;

  /**
   * Orders records by key only, delegating to {@link CrawlDBKey.LinkKeyComparator}. Values are
   * ignored in both the object and the raw-bytes comparison.
   */
  static class Comparator implements RawKeyValueComparator<TextBytes, TextBytes> {

    CrawlDBKey.LinkKeyComparator comparator = new CrawlDBKey.LinkKeyComparator();

    /** Compares two deserialized records by key; the values do not take part. */
    @Override
    public int compare(TextBytes key1, TextBytes value1, TextBytes key2,
        TextBytes value2) {
      return comparator.compare(key1, key2);
    }

    /** Compares two serialized records by their raw key bytes; the value bytes are ignored. */
    @Override
    public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
        byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data,
        int value1Offset, int value1Length, byte[] value2Data,
        int value2Offset, int value2Length) throws IOException {
      return comparator.compare(key1Data, key1Offset, key1Length, key2Data, key2Offset, key2Length);
    }
  }

  /**
   * Entry point.
   *
   * @param args {@code args[0]} is the input sequence file path, {@code args[1]} the output
   *        sequence file path
   * @throws IOException if reading the input, sorting/spilling, or writing the output fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      System.err.println("Usage: SortPartitionData <inputSequenceFile> <outputSequenceFile>");
      System.exit(1);
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    Configuration conf = new Configuration();

    final SequenceFile.Writer outputWriter = SequenceFile.createWriter(
        FileSystem.get(outputPath.toUri(), conf),
        conf,
        outputPath,
        TextBytes.class,
        TextBytes.class,
        CompressionType.BLOCK,
        new GzipCodec());

    final CrawlDBMergingReducer crawlDBWriter = new CrawlDBMergingReducer();
    crawlDBWriter.configure(new JobConf(conf));

    // Everything the reducer emits goes straight into the output sequence file.
    final OutputCollector<TextBytes, TextBytes> collector = new OutputCollector<TextBytes, TextBytes>() {

      @Override
      public void collect(TextBytes key, TextBytes value) throws IOException {
        outputWriter.append(key, value);
      }
    };

    // Sink for the merge sorter: each record is passed to the reducer as a single-value group.
    RawDataSpillWriter<TextBytes, TextBytes> spillWriter = new RawDataSpillWriter<TextBytes, TextBytes>() {

      // Reused across calls to avoid allocating per record.
      TextBytes _key = new TextBytes();
      TextBytes _value = new TextBytes();
      DataInputBuffer _buffer = new DataInputBuffer();

      @Override
      public void spillRecord(TextBytes key, TextBytes value) throws IOException {
        // Reporter.NULL is Hadoop's built-in do-nothing reporter; no need to build a mock
        // object for every record.
        crawlDBWriter.reduce(key, Iterators.forArray(value), collector, Reporter.NULL);
      }

      @Override
      public void close() throws IOException {
        try {
          crawlDBWriter.close();
        } finally {
          // Close the output file even if the reducer fails to close cleanly.
          outputWriter.close();
        }
      }

      @Override
      public void spillRawRecord(byte[] keyData, int keyOffset, int keyLength,
          byte[] valueData, int valueOffset, int valueLength) throws IOException {
        // Each raw field is a vint length prefix followed by that many payload bytes.
        // DataInputBuffer.getPosition() is an absolute index into the backing array
        // (reset() starts it at the supplied offset), so it already includes the offset
        // and must not be added to keyOffset/valueOffset a second time.
        _buffer.reset(keyData, keyOffset, keyLength);
        int realLength = WritableUtils.readVInt(_buffer);
        _key.set(keyData, _buffer.getPosition(), realLength);

        _buffer.reset(valueData, valueOffset, valueLength);
        realLength = WritableUtils.readVInt(_buffer);
        _value.set(valueData, _buffer.getPosition(), realLength);

        spillRecord(_key, _value);
      }
    };

    // Set after the JobConf copy above was taken, so only the merge sorter sees these values.
    conf.setInt(MergeSortSpillWriter.SPILL_INDEX_BUFFER_SIZE_PARAM, SPILL_BUFFER_SIZE);
    conf.setInt(SequenceFileSpillWriter.SPILL_WRITER_BUFFER_SIZE_PARAM, SPILL_BUFFER_SIZE);

    MergeSortSpillWriter<TextBytes, TextBytes> merger
        = new MergeSortSpillWriter<TextBytes, TextBytes>(
            conf,
            spillWriter,
            FileSystem.getLocal(conf),
            new Path("/tmp"),
            null,
            new Comparator(),
            TextBytes.class,
            TextBytes.class,
            true,
            null);

    try {
      SequenceFile.Reader reader
          = new SequenceFile.Reader(FileSystem.get(inputPath.toUri(), conf), inputPath, conf);
      try {
        TextBytes key = new TextBytes();
        TextBytes value = new TextBytes();

        while (reader.next(key, value)) {
          merger.spillRecord(key, value);
        }
      } finally {
        // The input reader was previously never closed.
        reader.close();
      }
    } finally {
      try {
        merger.close();
      } finally {
        // Always release the reducer and output file, even if the merge itself fails.
        spillWriter.close();
      }
    }
  }
}