package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;

import javax.annotation.Nullable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.CrawlDBKeyPartitioner;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.LinkKeyComparator;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.TextBytes;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

/**
 * merge raw link-graph segment output into the crawl database: first run
 * intermediate merges over small groups of unmerged segments, then (when
 * invoked with --final) fold the completed intermediate merges and the
 * previous full merge into a new full merge.
 */
public class CrawlDBMergeJob {

  static final Log LOG = LogFactory.getLog(CrawlDBMergeJob.class);

  ///////////////////////////////////////////////////////////////////////////
  // CONSTANTS
  ///////////////////////////////////////////////////////////////////////////

  // EC2 PATHS
  static final String S3N_BUCKET_PREFIX = "s3n://aws-publicdatasets";
  static final String GRAPH_DATA_OUTPUT_PATH = "/common-crawl/crawl-db/intermediate/";
  static final String INTERMEDIATE_MERGE_PATH = "/common-crawl/crawl-db/merge/intermediate";
  static final String FULL_MERGE_PATH = "/common-crawl/crawl-db/merge/full";
  static final String MULTIPART_SEGMENT_FILE = "MULTIPART.txt";

  // Default Max Partitions per Run
  static final int DEFAULT_MAX_PARTITIONS_PER_RUN = 5;

  // max simultaneous intermediate merge jobs ...
  static final int MAX_SIMULTANEOUS_JOBS = 1;

  static LinkedBlockingQueue<QueueItem> jobQueue = new LinkedBlockingQueue<QueueItem>();

  /** work item for the job queue; a null segment list signals shutdown */
  static class QueueItem {

    QueueItem() {
      segmentIds = null;
    }

    QueueItem(FileSystem fs, Configuration conf, List<Long> segmentIds) {
      this.conf = conf;
      this.fs = fs;
      this.segmentIds = segmentIds;
    }

    public Configuration conf;
    public FileSystem fs;
    public List<Long> segmentIds;
    public boolean finalMergeJob;
  }

  /** worker that drains the job queue, running one intermediate merge per item */
  static class QueueTask implements Runnable {

    Semaphore jobTaskSemaphore = null;

    public QueueTask(Semaphore jobTaskSemaphore) {
      this.jobTaskSemaphore = jobTaskSemaphore;
    }

    @Override
    public void run() {
      while (true) {
        LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Running");
        try {
          QueueItem item = jobQueue.take();

          if (item.segmentIds != null) {
            LOG.info("Queue Thread:" + Thread.currentThread().getId() + " got segments:" + item.segmentIds);
            LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Starting Job");
            try {
              runIntermediateMerge(item.fs, item.conf, item.segmentIds);
            } catch (IOException e) {
              LOG.error("Queue Thread:" + Thread.currentThread().getId() + " threw exception:"
                  + CCStringUtils.stringifyException(e));
            }
          } else {
            LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Got Shutdown Queue Item - EXITING");
            break;
          }
        } catch (InterruptedException e) {
          // interrupted while waiting on the queue; retry the take
        }
      }
      LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Released Semaphore");
      jobTaskSemaphore.release();
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(new URI(S3N_BUCKET_PREFIX), conf);

    // TODO: Read Partition Size from Command Line
    int partitionSize = DEFAULT_MAX_PARTITIONS_PER_RUN;

    boolean finalMerge = (args.length != 0 && args[0].equalsIgnoreCase("--final"));

    // find latest full merge timestamp ...
    long latestFullMergeTS = findLatestTimestamp(fs, conf, FULL_MERGE_PATH);
    LOG.info("Latest Full Merge Timestamp is:" + latestFullMergeTS);

    if (!finalMerge) {
      // find list of completed merge candidates ...
      SortedSet<Long> completedCandidateList = filterAndSortRawInputs(fs, conf, INTERMEDIATE_MERGE_PATH,
          latestFullMergeTS, true);
      // find list of raw merge candidates ...
      SortedSet<Long> rawCandidateList = filterAndSortRawInputs(fs, conf, GRAPH_DATA_OUTPUT_PATH,
          latestFullMergeTS, true);
      // find list of raw candidates that still need an intermediate merge ...
      Set<Long> rawUnmergedSet = Sets.difference(rawCandidateList, completedCandidateList);
      // convert to list
      List<Long> unmergedList = Lists.newArrayList(rawUnmergedSet);
      // redundant sort (Sets.difference iterates in the TreeSet's sorted order), but harmless
      Collections.sort(unmergedList);
      // partition into groups
      List<List<Long>> partitions = Lists.partition(unmergedList, partitionSize);

      if (partitions.size() != 0) {
        // completion semaphore: each worker thread releases exactly once on exit,
        // so start with -(threadCount - 1) permits and wait for a single acquire
        Semaphore mergeCompletionSemaphore = new Semaphore(-(MAX_SIMULTANEOUS_JOBS - 1));

        // ok queue intermediate merges ...
        for (List<Long> partitionIds : partitions) {
          jobQueue.put(new QueueItem(fs, conf, partitionIds));
        }
        // queue shutdown items
        for (int i = 0; i < MAX_SIMULTANEOUS_JOBS; ++i) {
          jobQueue.put(new QueueItem());
        }

        // startup threads ..
        LOG.info("Starting Threads");
        for (int i = 0; i < MAX_SIMULTANEOUS_JOBS; ++i) {
          Thread thread = new Thread(new QueueTask(mergeCompletionSemaphore));
          thread.start();
        }

        // wait for completion ...
LOG.info("Waiting for intermediate merge completion"); mergeCompletionSemaphore.acquireUninterruptibly(); } } else { // find list of final merge candidates Set<Long> finalMergeCandidates = filterAndSortRawInputs(fs, conf, INTERMEDIATE_MERGE_PATH, latestFullMergeTS,false); LOG.info("final Merge Candidates:" + finalMergeCandidates); if (finalMergeCandidates.size() != 0) { LOG.info("Starting Potential Final Merge"); // run final merge ... (if necessary) runFinalMerge(fs,conf,Lists.newArrayList(finalMergeCandidates),latestFullMergeTS); } } } /** * run the crawldb merge on a list of graph data segments * @param fs * @param conf * @param partitionIds * @throws IOException */ static void runIntermediateMerge(FileSystem fs,Configuration conf,List<Long> partitionIds)throws IOException { long maxTimestamp = Iterators.getLast(partitionIds.iterator(),(long)-1).longValue(); if (maxTimestamp == -1) { throw new IOException("No Valid Partitions Found in List:" + partitionIds); } // construct a final output path ... Path finalOutputPath = new Path(S3N_BUCKET_PREFIX + INTERMEDIATE_MERGE_PATH,Long.toString(maxTimestamp)); if (fs.exists(finalOutputPath)) { LOG.info("Deleting Existing Output at:" + finalOutputPath); fs.delete(finalOutputPath, true); } LOG.info("Starting Intermeidate Merge for Paritions:" + partitionIds + " OutputPath is:" + finalOutputPath); // construct input paths ... ArrayList<Path> inputPaths = new ArrayList<Path>(); for (long segmentId : partitionIds) { inputPaths.add(new Path(S3N_BUCKET_PREFIX + GRAPH_DATA_OUTPUT_PATH,Long.toString(segmentId))); } JobConf jobConf = new JobBuilder("Intermediate Merge for Segments:" + partitionIds, conf) .inputs(inputPaths) .inputFormat(SequenceFileInputFormat.class) .mapperKeyValue(TextBytes.class, TextBytes.class) .outputKeyValue(TextBytes.class, TextBytes.class) .outputFormat(SequenceFileOutputFormat.class) .reducer(CrawlDBMergingReducer.class,true) .partition(CrawlDBKeyPartitioner.class) .sort(LinkKeyComparator.class) .numReducers(CrawlDBCommon.NUM_SHARDS) .speculativeExecution(true) .output(finalOutputPath) .compressMapOutput(true) .maxMapAttempts(4) .maxReduceAttempts(3) .maxMapTaskFailures(1) .compressor(CompressionType.BLOCK, SnappyCodec.class) .build(); LOG.info("Starting JOB:" + jobConf); try { JobClient.runJob(jobConf); LOG.info("Finished JOB:" + jobConf); // write multipart candidate list if (partitionIds.size() != 1) { LOG.info("Writing Multipart Segment List for OutputSegment:"+ maxTimestamp); listToTextFile(partitionIds,fs,new Path(finalOutputPath,MULTIPART_SEGMENT_FILE)); LOG.info("Successfully Wrote Multipart Segment List for OutputSegment:"+ maxTimestamp); } } catch (IOException e) { LOG.error("Failed to Execute JOB:" + jobConf + " Exception:\n" + CCStringUtils.stringifyException(e)); fs.delete(finalOutputPath, true); } } /** * run the final crawldb merge on a list intermediate merge candidates * @param fs * @param conf * @param partitionIds * @throws IOException */ static void runFinalMerge(FileSystem fs,Configuration conf,List<Long> partitionIds, long latestFinalMergeTS)throws IOException { long maxTimestamp = Iterators.getLast(partitionIds.iterator(),(long)-1).longValue(); if (maxTimestamp == -1) { throw new IOException("No Valid Partitions Found in List:" + partitionIds); } // construct a final output path ... 
    Path finalOutputPath = new Path(S3N_BUCKET_PREFIX + FULL_MERGE_PATH, Long.toString(maxTimestamp));

    LOG.info("Starting Final Merge for Partitions:" + partitionIds + " and FinalMerge TS:" + latestFinalMergeTS
        + " OutputPath is:" + finalOutputPath);

    // construct input paths ...
    ArrayList<Path> inputPaths = new ArrayList<Path>();
    for (long segmentId : partitionIds) {
      inputPaths.add(new Path(S3N_BUCKET_PREFIX + INTERMEDIATE_MERGE_PATH, Long.toString(segmentId)));
    }
    if (latestFinalMergeTS != -1) {
      inputPaths.add(new Path(S3N_BUCKET_PREFIX + FULL_MERGE_PATH, Long.toString(latestFinalMergeTS)));
    }

    // ok we will need to run this job as a shuffle free reduce
    JobConf jobConf = new JobBuilder("Final Merge for Segments:" + partitionIds, conf)
        .inputs(inputPaths)
        .inputFormat(MultiFileMergeUtils.MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(SequenceFileOutputFormat.class)
        .reducer(CrawlDBMergeSortReducer.class, false)
        .partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
        .numReducers(CrawlDBCommon.NUM_SHARDS)
        .speculativeExecution(true)
        .output(finalOutputPath)
        .compressMapOutput(true)
        .compressor(CompressionType.BLOCK, GzipCodec.class)
        .maxMapAttempts(10)
        .maxReduceAttempts(4)
        .maxMapTaskFailures(1)
        .reuseJVM(1)
        .build();

    LOG.info("Starting JOB:" + jobConf);
    try {
      JobClient.runJob(jobConf);
      LOG.info("Finished JOB:" + jobConf);
    } catch (IOException e) {
      LOG.error("Failed to Execute JOB:" + jobConf + " Exception:\n" + CCStringUtils.stringifyException(e));
    }
  }

  /**
   * scan the given prefix path and find the latest output
   *
   * @param fs
   * @param conf
   * @return
   * @throws IOException
   */
  static long findLatestTimestamp(FileSystem fs, Configuration conf, String searchPath) throws IOException {
    long timestampOut = -1L;

    FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + searchPath, "[0-9]*"));
    for (FileStatus candidate : files) {
      Path successPath = new Path(candidate.getPath(), "_SUCCESS");
      if (fs.exists(successPath)) {
        long timestamp = Long.parseLong(candidate.getPath().getName());
        timestampOut = Math.max(timestamp, timestampOut);
      }
    }
    return timestampOut;
  }

  /** read a text file into a list of lines, skipping blank lines and # comments */
  static List<String> textFileToList(FileSystem fs, Path path) throws IOException {
    ImmutableList.Builder<String> builder = new ImmutableList.Builder<String>();
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path), Charset.forName("UTF-8")));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.length() != 0 && !line.startsWith("#"))
          builder.add(line);
      }
    } finally {
      reader.close();
    }
    return builder.build();
  }

  /** write one item per line to a text file */
  static void listToTextFile(List<? extends Object> objects, FileSystem fs, Path path) throws IOException {
    Writer writer = new OutputStreamWriter(fs.create(path), Charset.forName("UTF-8"));
    try {
      for (Object obj : objects) {
        writer.write(obj.toString());
        writer.append("\n");
      }
      writer.flush();
    } finally {
      writer.close();
    }
  }

  /**
   * if the given segment has a MULTIPART.txt manifest, return the list of source
   * segment ids it covers; otherwise return null
   */
  static List<Long> scanForMultiPartList(FileSystem fs, Configuration conf, Path rootSegmentPath) throws IOException {
    Path multiPartFilePath = new Path(rootSegmentPath, MULTIPART_SEGMENT_FILE);

    if (fs.exists(multiPartFilePath)) {
      return Lists.transform(textFileToList(fs, multiPartFilePath), new Function<String, Long>() {

        @Override
        @Nullable
        public Long apply(@Nullable String arg0) {
          return Long.parseLong(arg0);
        }
      });
    } else {
      return null;
    }
  }

  /**
   * iterate the intermediate link graph data and extract the unmerged set ...
   *
   * @param fs
   * @param conf
   * @param searchPath
   * @param latestMergeDBTimestamp
   * @param processMultipartFiles
   * @return
   * @throws IOException
   */
  static SortedSet<Long> filterAndSortRawInputs(FileSystem fs, Configuration conf, String searchPath,
      long latestMergeDBTimestamp, boolean processMultipartFiles) throws IOException {
    TreeSet<Long> set = new TreeSet<Long>();

    FileStatus candidates[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + searchPath, "[0-9]*"));
    for (FileStatus candidate : candidates) {
      LOG.info("Found Merge Candidate:" + candidate.getPath());
      long candidateTimestamp = Long.parseLong(candidate.getPath().getName());

      if (candidateTimestamp > latestMergeDBTimestamp) {
        Path successPath = new Path(candidate.getPath(), "_SUCCESS");
        if (fs.exists(successPath)) {
          // scan for a multipart result
          List<Long> multipartResult = scanForMultiPartList(fs, conf, candidate.getPath());
          if (processMultipartFiles && multipartResult != null) {
            LOG.info("Merge Candidate Completed with Multipart Result:" + multipartResult);
            set.addAll(multipartResult);
          } else {
            set.add(candidateTimestamp);
          }
        } else {
          LOG.info("Rejected Merge Candidate:" + candidate.getPath());
        }
      }
    }
    return set;
  }
}
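
// Usage sketch: only the class name and the --final flag come from the code above; the jar
// name and launcher invocation are illustrative assumptions, not taken from this file.
//
//   # run intermediate merges for any graph-data segments newer than the last full merge
//   hadoop jar commoncrawl.jar org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergeJob
//
//   # fold completed intermediate merges (plus the previous full merge, if any) into a new full merge
//   hadoop jar commoncrawl.jar org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergeJob --final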