package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;

import static org.mockito.Mockito.mock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.S3SeekableResilientInputStream;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.google.common.collect.Lists;

/**
 * Final Merge Step is done using a non-shuffle reduce since all input segments
 * have been pre-sorted and pre-sharded with the proper shard count.
 *
 * @author rana
 */
public class CrawlDBMergeSortReducer implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

  static final Log LOG = LogFactory.getLog(CrawlDBMergeSortReducer.class);

  JobConf _conf;
  AmazonS3Client _s3Client;

  @Override
  public void configure(JobConf job) {
    _conf = job;
    _s3Client = new AmazonS3Client(
        new BasicAWSCredentials(
            _conf.get("fs.s3n.awsAccessKeyId"),
            _conf.get("fs.s3n.awsSecretAccessKey")));
  }

  @Override
  public void close() throws IOException {
    _s3Client.shutdown();
  }

  static class RawValueIterator implements Iterator<TextBytes> {

    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer keyInputBuffer = new DataInputBuffer();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    Iterator<RawRecordValue> rawIterator;

    void reset(Iterable<RawRecordValue> rawIterable) {
      this.rawIterator = rawIterable.iterator();
    }

    @Override
    public boolean hasNext() {
      return rawIterator.hasNext();
    }

    @Override
    public TextBytes next() {
      try {
        RawRecordValue nextRawValue = rawIterator.next();
        // read in text bytes key ...
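        // each RawRecordValue exposes the raw key and value buffers as they were
        // written to the underlying segment files; both payloads are VInt
        // length-prefixed, so read each length first and then point the reusable
        // TextBytes instances at the corresponding span of the raw buffer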
        keyInputBuffer.reset(nextRawValue.key.getData(), 0, nextRawValue.key.getLength());
        inputBuffer.reset(nextRawValue.data.getData(), 0, nextRawValue.data.getLength());
        int valueTextLen = WritableUtils.readVInt(inputBuffer);
        valueBytes.set(nextRawValue.data.getData(), inputBuffer.getPosition(), valueTextLen);
        int keyTextLen = WritableUtils.readVInt(keyInputBuffer);
        keyBytes.set(nextRawValue.key.getData(), keyInputBuffer.getPosition(), keyTextLen);
        System.out.println("NextKey:" + keyBytes.toString() + " Source:" + nextRawValue.source);
        return valueBytes;
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException("remove");
    }
  }

  /**
   * Minimal read-only FileSystem stub over s3n: only open() (backed by the
   * resilient seekable S3 stream) and getFileStatus() (backed by S3 object
   * metadata) are implemented; all other operations are no-op stubs.
   */
  private FileSystem getS3NFileSystem() throws IOException {
    return new FileSystem() {

      @Override
      public URI getUri() {
        return null;
      }

      @Override
      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
        return new FSDataInputStream(
            new S3SeekableResilientInputStream(
                f.toUri(),
                _conf.get("fs.s3n.awsAccessKeyId"),
                _conf.get("fs.s3n.awsSecretAccessKey"),
                bufferSize,
                100));
      }

      @Override
      public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize,
          short replication, long blockSize, Progressable progress) throws IOException {
        return null;
      }

      @Override
      public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
        return null;
      }

      @Override
      public boolean rename(Path src, Path dst) throws IOException {
        return false;
      }

      @Override
      @Deprecated
      public boolean delete(Path f) throws IOException {
        return false;
      }

      @Override
      public boolean delete(Path f, boolean recursive) throws IOException {
        return false;
      }

      @Override
      public FileStatus[] listStatus(Path f) throws IOException {
        return null;
      }

      @Override
      public void setWorkingDirectory(Path new_dir) {
      }

      @Override
      public Path getWorkingDirectory() {
        return null;
      }

      @Override
      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
        return false;
      }

      @Override
      public FileStatus getFileStatus(Path f) throws IOException {
        // get uri from path ...
        URI uri = f.toUri();
        // convert to s3 path ..
        String key = uri.getPath().substring(1);
        System.out.println("***uri path:" + key);
        ObjectMetadata metadata = _s3Client.getObjectMetadata(uri.getHost(), key);
        if (metadata != null) {
          FileStatus fileStatus = new FileStatus(metadata.getContentLength(), false, 1, 0,
              metadata.getLastModified().getTime(), 0, FsPermission.getDefault(), "", "", f);
          return fileStatus;
        }
        return null;
      }
    };
  }

  /**
   * The incoming values are paths to pre-sorted, identically sharded segment
   * files; they are merge-iterated via MultiFileInputReader and fed to the
   * actual reducer (CrawlDBMergingReducer), so no shuffle/sort is required.
   */
  @Override
  public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    Set<String> fsType = new HashSet<String>();

    while (values.hasNext()) {
      String path = values.next().toString();
      LOG.info("Found Incoming Path:" + path);
      incomingPaths.add(new Path(path));
      // convert to uri ...
      URI uri = new Path(path).toUri();
      // get scheme if present ...
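      // (every incoming segment must be readable through a single FileSystem
      // instance, so record each path's scheme here and verify below that
      // exactly one scheme is in play)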
      String scheme = uri.getScheme();
      if (scheme == null || scheme.length() == 0) {
        fsType.add("default");
      } else {
        fsType.add(scheme);
      }
    }

    if (fsType.size() != 1) {
      throw new IOException("Only One Input Scheme at a time supported!");
    }

    boolean isS3N = fsType.contains("s3n");

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_conf);
    // we don't want to use a grouping comparator because we are using the reducer code from the intermediate
    // merge
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, CrawlDBKey.LinkKeyComparator.class,
        RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class);

    // spawn merger
    // pick filesystem based on path ...
    FileSystem fs = null;
    if (!isS3N) {
      fs = FileSystem.get(incomingPaths.get(0).toUri(), _conf);
    } else {
      // use our custom s3n stub
      fs = getS3NFileSystem();
    }
    LOG.info("FileSystem is:" + fs.toString());

    MultiFileInputReader<TextBytes> multiFileInputReader =
        new MultiFileInputReader<TextBytes>(fs, incomingPaths, localMergeConfig);

    // create crawl db writer, which is the actual reducer we want to use ...
    CrawlDBMergingReducer crawlDBWriter = new CrawlDBMergingReducer();
    crawlDBWriter.configure(_conf);

    RawValueIterator rawValueIterator = new RawValueIterator();
    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

    // walk tuples and feed them to the actual reducer ...
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
      System.out.println("PKey:" + nextItem.e0._keyObject);
      rawValueIterator.reset(nextItem.e1);
      // output to reducer ...
      crawlDBWriter.reduce(nextItem.e0._keyObject, rawValueIterator, output, reporter);
      reporter.progress();
    }
    // flush output
    crawlDBWriter.close();
  }

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();

  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  private static List<Path> getIntermediateSegmentPaths(AmazonS3Client s3Client) throws IOException {
    ArrayList<Path> listOut = Lists.newArrayList();

    ObjectListing response = s3Client.listObjects(new ListObjectsRequest()
        .withBucketName("aws-publicdatasets")
        .withPrefix("common-crawl/crawl-db/merge/intermediate/")
        .withDelimiter("/"));

    do {
      LOG.info("Response Common Prefixes:" + response.getCommonPrefixes());
      for (String entry : response.getCommonPrefixes()) {
        try {
          Path s3nPath = new Path("s3n", "aws-publicdatasets", "/" + entry);
          //long timestamp = Long.parseLong(s3nPath.getName());
          listOut.add(s3nPath);
        } catch (Exception e) {
        }
      }
      if (response.isTruncated()) {
        response = s3Client.listNextBatchOfObjects(response);
      } else {
        break;
      }
    } while (true);

    return listOut;
  }

  /**
   * do a merge on a single shard for test purposes
   * @param args
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    String s3AccessKey = args[0];
    String s3Secret = args[1];
    int partNumber = Integer.parseInt(args[2]);

    AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(s3AccessKey, s3Secret));
    List<Path> segments = getIntermediateSegmentPaths(s3Client);

    String partName = "part-" + NUMBER_FORMAT.format(partNumber);

    List<Text> transformedPaths = Lists.newArrayList();
    for (Path path : segments) {
      transformedPaths.add(new Text(new Path(path, partName).toUri().toString()));
      if (transformedPaths.size() >= 3)
        break;
    }

    CrawlDBMergeSortReducer finalMerge = new CrawlDBMergeSortReducer();

    JobConf conf = new JobConf();
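
    // configure() reads both s3n credential keys from the JobConf to construct
    // its AmazonS3Client, so populate them before configure() is invoked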
    conf.set("fs.s3n.awsAccessKeyId", s3AccessKey);
    conf.set("fs.s3n.awsSecretAccessKey", s3Secret);
    finalMerge.configure(conf);

    finalMerge.reduce(new IntWritable(1), transformedPaths.iterator(),
        new OutputCollector<TextBytes, TextBytes>() {

          long lastValue = Long.MIN_VALUE;

          @Override
          public void collect(TextBytes key, TextBytes value) throws IOException {
            long domainHash = CrawlDBKey.getLongComponentFromKey(key,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);
            // sanity check: output keys must arrive in non-decreasing domain hash order
            if (domainHash < lastValue) {
              throw new IOException("LastValue:" + lastValue + " CurrentValue:" + domainHash + " " + value.toString());
            }
            lastValue = domainHash;
            System.out.println("OutputKey:" + key.toString());
          }
        }, mock(Reporter.class));
  }
}