package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;

import javax.annotation.Nullable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.CrawlDBKeyPartitioner;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.LinkKeyComparator;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.TextBytes;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

/**
 * merge raw link-graph segment output into the crawl database: first run
 * intermediate merges over small groups of unmerged segments, then (when
 * invoked with --final) fold the completed intermediate merges and the
 * previous full merge into a new full merge.
 */
public class CrawlDBMergeJob {

  static final Log LOG = LogFactory.getLog(CrawlDBMergeJob.class);

  ///////////////////////////////////////////////////////////////////////////
  // CONSTANTS
  ///////////////////////////////////////////////////////////////////////////

  // EC2 PATHS
  static final String S3N_BUCKET_PREFIX = "s3n://aws-publicdatasets";
  static final String GRAPH_DATA_OUTPUT_PATH = "/common-crawl/crawl-db/intermediate/";
  static final String INTERMEDIATE_MERGE_PATH = "/common-crawl/crawl-db/merge/intermediate";
  static final String FULL_MERGE_PATH = "/common-crawl/crawl-db/merge/full";
  static final String MULTIPART_SEGMENT_FILE = "MULTIPART.txt";

  // Default Max Partitions per Run
  static final int DEFAULT_MAX_PARTITIONS_PER_RUN = 5;

  // max simultaneous intermediate merge jobs ...
  static final int MAX_SIMULTANEOUS_JOBS = 1;

  static LinkedBlockingQueue<QueueItem> jobQueue = new LinkedBlockingQueue<QueueItem>();

  /** work item for the job queue; a null segment list signals shutdown */
  static class QueueItem {

    QueueItem() {
      segmentIds = null;
    }

    QueueItem(FileSystem fs, Configuration conf, List<Long> segmentIds) {
      this.conf = conf;
      this.fs = fs;
      this.segmentIds = segmentIds;
    }

    public Configuration conf;
    public FileSystem fs;
    public List<Long> segmentIds;
    public boolean finalMergeJob;
  }

  /** worker that drains the job queue, running one intermediate merge per item */
  static class QueueTask implements Runnable {

    Semaphore jobTaskSemaphore = null;

    public QueueTask(Semaphore jobTaskSemaphore) {
      this.jobTaskSemaphore = jobTaskSemaphore;
    }

    @Override
    public void run() {
      while (true) {
        LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Running");
        try {
          QueueItem item = jobQueue.take();

          if (item.segmentIds != null) {
            LOG.info("Queue Thread:" + Thread.currentThread().getId() + " got segments:" + item.segmentIds);
            LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Starting Job");
            try {
              runIntermediateMerge(item.fs, item.conf, item.segmentIds);
            } catch (IOException e) {
              LOG.error("Queue Thread:" + Thread.currentThread().getId() + " threw exception:"
                  + CCStringUtils.stringifyException(e));
            }
          } else {
            LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Got Shutdown Queue Item - EXITING");
            break;
          }
        } catch (InterruptedException e) {
          // interrupted while waiting on the queue; retry the take
        }
      }
      LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Released Semaphore");
      jobTaskSemaphore.release();
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(new URI(S3N_BUCKET_PREFIX), conf);

    // TODO: Read Partition Size from Command Line
    int partitionSize = DEFAULT_MAX_PARTITIONS_PER_RUN;

    boolean finalMerge = (args.length != 0 && args[0].equalsIgnoreCase("--final"));

    // find latest full merge timestamp ...
    long latestFullMergeTS = findLatestTimestamp(fs, conf, FULL_MERGE_PATH);
    LOG.info("Latest Full Merge Timestamp is:" + latestFullMergeTS);

    if (!finalMerge) {
      // find list of completed merge candidates ...
      SortedSet<Long> completedCandidateList = filterAndSortRawInputs(fs, conf, INTERMEDIATE_MERGE_PATH,
          latestFullMergeTS, true);
      // find list of raw merge candidates ...
      SortedSet<Long> rawCandidateList = filterAndSortRawInputs(fs, conf, GRAPH_DATA_OUTPUT_PATH,
          latestFullMergeTS, true);
      // find list of raw candidates that still need an intermediate merge ...
      Set<Long> rawUnmergedSet = Sets.difference(rawCandidateList, completedCandidateList);
      // convert to list
      List<Long> unmergedList = Lists.newArrayList(rawUnmergedSet);
      // redundant sort (Sets.difference iterates in the TreeSet's sorted order), but harmless
      Collections.sort(unmergedList);
      // partition into groups
      List<List<Long>> partitions = Lists.partition(unmergedList, partitionSize);

      if (partitions.size() != 0) {
        // completion semaphore: each worker thread releases exactly once on exit,
        // so start with -(threadCount - 1) permits and wait for a single acquire
        Semaphore mergeCompletionSemaphore = new Semaphore(-(MAX_SIMULTANEOUS_JOBS - 1));

        // ok queue intermediate merges ...
        for (List<Long> partitionIds : partitions) {
          jobQueue.put(new QueueItem(fs, conf, partitionIds));
        }
        // queue shutdown items
        for (int i = 0; i < MAX_SIMULTANEOUS_JOBS; ++i) {
          jobQueue.put(new QueueItem());
        }

        // startup threads ..
        LOG.info("Starting Threads");
        for (int i = 0; i < MAX_SIMULTANEOUS_JOBS; ++i) {
          Thread thread = new Thread(new QueueTask(mergeCompletionSemaphore));
          thread.start();
        }

        // wait for completion ...
LOG.info("Waiting for intermediate merge completion"); mergeCompletionSemaphore.acquireUninterruptibly(); } } else { // find list of final merge candidates Set<Long> finalMergeCandidates = filterAndSortRawInputs(fs, conf, INTERMEDIATE_MERGE_PATH, latestFullMergeTS,false); LOG.info("final Merge Candidates:" + finalMergeCandidates); if (finalMergeCandidates.size() != 0) { LOG.info("Starting Potential Final Merge"); // run final merge ... (if necessary) runFinalMerge(fs,conf,Lists.newArrayList(finalMergeCandidates),latestFullMergeTS); } } } /** * run the crawldb merge on a list of graph data segments * @param fs * @param conf * @param partitionIds * @throws IOException */ static void runIntermediateMerge(FileSystem fs,Configuration conf,List<Long> partitionIds)throws IOException { long maxTimestamp = Iterators.getLast(partitionIds.iterator(),(long)-1).longValue(); if (maxTimestamp == -1) { throw new IOException("No Valid Partitions Found in List:" + partitionIds); } // construct a final output path ... Path finalOutputPath = new Path(S3N_BUCKET_PREFIX + INTERMEDIATE_MERGE_PATH,Long.toString(maxTimestamp)); if (fs.exists(finalOutputPath)) { LOG.info("Deleting Existing Output at:" + finalOutputPath); fs.delete(finalOutputPath, true); } LOG.info("Starting Intermeidate Merge for Paritions:" + partitionIds + " OutputPath is:" + finalOutputPath); // construct input paths ... ArrayList<Path> inputPaths = new ArrayList<Path>(); for (long segmentId : partitionIds) { inputPaths.add(new Path(S3N_BUCKET_PREFIX + GRAPH_DATA_OUTPUT_PATH,Long.toString(segmentId))); } JobConf jobConf = new JobBuilder("Intermediate Merge for Segments:" + partitionIds, conf) .inputs(inputPaths) .inputFormat(SequenceFileInputFormat.class) .mapperKeyValue(TextBytes.class, TextBytes.class) .outputKeyValue(TextBytes.class, TextBytes.class) .outputFormat(SequenceFileOutputFormat.class) .reducer(CrawlDBMergingReducer.class,true) .partition(CrawlDBKeyPartitioner.class) .sort(LinkKeyComparator.class) .numReducers(CrawlDBCommon.NUM_SHARDS) .speculativeExecution(true) .output(finalOutputPath) .compressMapOutput(true) .maxMapAttempts(4) .maxReduceAttempts(3) .maxMapTaskFailures(1) .compressor(CompressionType.BLOCK, SnappyCodec.class) .build(); LOG.info("Starting JOB:" + jobConf); try { JobClient.runJob(jobConf); LOG.info("Finished JOB:" + jobConf); // write multipart candidate list if (partitionIds.size() != 1) { LOG.info("Writing Multipart Segment List for OutputSegment:"+ maxTimestamp); listToTextFile(partitionIds,fs,new Path(finalOutputPath,MULTIPART_SEGMENT_FILE)); LOG.info("Successfully Wrote Multipart Segment List for OutputSegment:"+ maxTimestamp); } } catch (IOException e) { LOG.error("Failed to Execute JOB:" + jobConf + " Exception:\n" + CCStringUtils.stringifyException(e)); fs.delete(finalOutputPath, true); } } /** * run the final crawldb merge on a list intermediate merge candidates * @param fs * @param conf * @param partitionIds * @throws IOException */ static void runFinalMerge(FileSystem fs,Configuration conf,List<Long> partitionIds, long latestFinalMergeTS)throws IOException { long maxTimestamp = Iterators.getLast(partitionIds.iterator(),(long)-1).longValue(); if (maxTimestamp == -1) { throw new IOException("No Valid Partitions Found in List:" + partitionIds); } // construct a final output path ... 
    Path finalOutputPath = new Path(S3N_BUCKET_PREFIX + FULL_MERGE_PATH, Long.toString(maxTimestamp));

    LOG.info("Starting Final Merge for Partitions:" + partitionIds + " and FinalMerge TS:" + latestFinalMergeTS
        + " OutputPath is:" + finalOutputPath);

    // construct input paths ...
    ArrayList<Path> inputPaths = new ArrayList<Path>();
    for (long segmentId : partitionIds) {
      inputPaths.add(new Path(S3N_BUCKET_PREFIX + INTERMEDIATE_MERGE_PATH, Long.toString(segmentId)));
    }
    if (latestFinalMergeTS != -1) {
      inputPaths.add(new Path(S3N_BUCKET_PREFIX + FULL_MERGE_PATH, Long.toString(latestFinalMergeTS)));
    }

    // ok we will need to run this job as a shuffle free reduce
    JobConf jobConf = new JobBuilder("Final Merge for Segments:" + partitionIds, conf)
        .inputs(inputPaths)
        .inputFormat(MultiFileMergeUtils.MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(SequenceFileOutputFormat.class)
        .reducer(CrawlDBMergeSortReducer.class, false)
        .partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
        .numReducers(CrawlDBCommon.NUM_SHARDS)
        .speculativeExecution(true)
        .output(finalOutputPath)
        .compressMapOutput(true)
        .compressor(CompressionType.BLOCK, GzipCodec.class)
        .maxMapAttempts(10)
        .maxReduceAttempts(4)
        .maxMapTaskFailures(1)
        .reuseJVM(1)
        .build();

    LOG.info("Starting JOB:" + jobConf);
    try {
      JobClient.runJob(jobConf);
      LOG.info("Finished JOB:" + jobConf);
    } catch (IOException e) {
      LOG.error("Failed to Execute JOB:" + jobConf + " Exception:\n" + CCStringUtils.stringifyException(e));
    }
  }

  /**
   * scan the given prefix path and find the latest output
   *
   * @param fs
   * @param conf
   * @return
   * @throws IOException
   */
  static long findLatestTimestamp(FileSystem fs, Configuration conf, String searchPath) throws IOException {
    long timestampOut = -1L;

    FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + searchPath, "[0-9]*"));
    for (FileStatus candidate : files) {
      Path successPath = new Path(candidate.getPath(), "_SUCCESS");
      if (fs.exists(successPath)) {
        long timestamp = Long.parseLong(candidate.getPath().getName());
        timestampOut = Math.max(timestamp, timestampOut);
      }
    }
    return timestampOut;
  }

  /** read a text file into a list of lines, skipping blank lines and # comments */
  static List<String> textFileToList(FileSystem fs, Path path) throws IOException {
    ImmutableList.Builder<String> builder = new ImmutableList.Builder<String>();
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path), Charset.forName("UTF-8")));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.length() != 0 && !line.startsWith("#"))
          builder.add(line);
      }
    } finally {
      reader.close();
    }
    return builder.build();
  }

  /** write one item per line to a text file */
  static void listToTextFile(List<? extends Object> objects, FileSystem fs, Path path) throws IOException {
    Writer writer = new OutputStreamWriter(fs.create(path), Charset.forName("UTF-8"));
    try {
      for (Object obj : objects) {
        writer.write(obj.toString());
        writer.append("\n");
      }
      writer.flush();
    } finally {
      writer.close();
    }
  }

  /**
   * if the given segment has a MULTIPART.txt manifest, return the list of source
   * segment ids it covers; otherwise return null
   */
  static List<Long> scanForMultiPartList(FileSystem fs, Configuration conf, Path rootSegmentPath) throws IOException {
    Path multiPartFilePath = new Path(rootSegmentPath, MULTIPART_SEGMENT_FILE);

    if (fs.exists(multiPartFilePath)) {
      return Lists.transform(textFileToList(fs, multiPartFilePath), new Function<String, Long>() {

        @Override
        @Nullable
        public Long apply(@Nullable String arg0) {
          return Long.parseLong(arg0);
        }
      });
    } else {
      return null;
    }
  }

  /**
   * iterate the intermediate link graph data and extract the unmerged set ...
   *
   * @param fs
   * @param conf
   * @param searchPath
   * @param latestMergeDBTimestamp
   * @param processMultipartFiles
   * @return
   * @throws IOException
   */
  static SortedSet<Long> filterAndSortRawInputs(FileSystem fs, Configuration conf, String searchPath,
      long latestMergeDBTimestamp, boolean processMultipartFiles) throws IOException {
    TreeSet<Long> set = new TreeSet<Long>();

    FileStatus candidates[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + searchPath, "[0-9]*"));
    for (FileStatus candidate : candidates) {
      LOG.info("Found Merge Candidate:" + candidate.getPath());
      long candidateTimestamp = Long.parseLong(candidate.getPath().getName());

      if (candidateTimestamp > latestMergeDBTimestamp) {
        Path successPath = new Path(candidate.getPath(), "_SUCCESS");
        if (fs.exists(successPath)) {
          // scan for a multipart result
          List<Long> multipartResult = scanForMultiPartList(fs, conf, candidate.getPath());
          if (processMultipartFiles && multipartResult != null) {
            LOG.info("Merge Candidate Completed with Multipart Result:" + multipartResult);
            set.addAll(multipartResult);
          } else {
            set.add(candidateTimestamp);
          }
        } else {
          LOG.info("Rejected Merge Candidate:" + candidate.getPath());
        }
      }
    }
    return set;
  }
}
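
// Usage sketch: only the class name and the --final flag come from the code above; the jar
// name and launcher invocation are illustrative assumptions, not taken from this file.
//
//   # run intermediate merges for any graph-data segments newer than the last full merge
//   hadoop jar commoncrawl.jar org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergeJob
//
//   # fold completed intermediate merges (plus the previous full merge, if any) into a new full merge
//   hadoop jar commoncrawl.jar org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergeJob --final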