/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.ec2.parser;
import java.io.IOException;
import java.net.URI;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.commoncrawl.protocol.ParseOutput;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
/**
* EC2 ParserTask
*
* Spawns the EMR Job that processes CrawlLogs.
*
 * First in a sequence of jobs that are part of the migration of data processing
 * from the internal cluster to EC2. This job is designed to run on EMR. It
 * utilizes spot instances to help reduce costs, and thus currently runs
 * map-only jobs (zero reducers) to make the job more resilient to machine
 * failures as well as to dynamic (spot) task tracker availability.
 *
* @author rana
*
*/
@SuppressWarnings("static-access")
public class EC2ParserTask extends EC2TaskDataAwareTask {
public static final Log LOG = LogFactory.getLog(EC2ParserTask.class);
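  // maximum number of crawl logs to batch into a single parse job (segment)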
static final int LOGS_PER_ITERATION = 1000;
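  // crawl log filenames embed a timestamp; capture group 1 is used to sort candidates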
static final Pattern CRAWL_LOG_REG_EXP = Pattern.compile("CrawlLog_ccc[0-9]{2}-[0-9]{2}_([0-9]*)");
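  // number of concurrent job threads consuming from the work queue in production mode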
static final int MAX_SIMULTANEOUS_JOBS = 100;
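  // batches of crawl log paths are handed to job threads via this queue;
  // an item with a null pathList acts as a shutdown sentinel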
LinkedBlockingQueue<QueueItem> _queue = new LinkedBlockingQueue<QueueItem>();
Semaphore jobThreadSemaphore = null;
int maxSimultaneousJobs = MAX_SIMULTANEOUS_JOBS;
static Options options = new Options();
static {
options.addOption(
OptionBuilder.withArgName("testMode").hasArg(false).withDescription("Test Mode").create("testMode"));
options.addOption(
OptionBuilder.withArgName("checkpoint").hasArg(false).withDescription("Create Checkpoint").create("checkpoint"));
}
public EC2ParserTask(Configuration conf)throws Exception {
super(conf);
if (!conf.getBoolean(CONF_PARAM_TEST_MODE, false)) {
conf.set(VALID_SEGMENTS_PATH_PROPERTY,VALID_SEGMENTS_PATH);
conf.set(SEGMENT_PATH_PROPERTY,SEGMENTS_PATH);
conf.set(JOB_LOGS_PATH_PROPERTY, JOB_LOGS_PATH);
conf.set(CHECKPOIINTS_PATH_PROPERTY,CHECKPOINTS_PATH);
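      // start the semaphore at -(MAX_SIMULTANEOUS_JOBS-1) so the single acquire
      // in run() only succeeds once every job thread has released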
jobThreadSemaphore = new Semaphore(-(MAX_SIMULTANEOUS_JOBS-1));
}
else {
conf.set(VALID_SEGMENTS_PATH_PROPERTY,TEST_VALID_SEGMENTS_PATH);
conf.set(SEGMENT_PATH_PROPERTY,TEST_SEGMENTS_PATH);
conf.set(JOB_LOGS_PATH_PROPERTY, TEST_JOB_LOGS_PATH);
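      // a single job thread in test mode, so a zero-permit semaphore suffices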
jobThreadSemaphore = new Semaphore(0);
maxSimultaneousJobs = 1;
}
FileSystem fs = FileSystem.get(new URI("s3n://aws-publicdatasets"),conf);
LOG.info("FileSystem is:" + fs.getUri() +" Scanning for candidates at path:" + CRAWL_LOG_INTERMEDIATE_PATH);
TreeSet<Path> candidateSet = buildCandidateList(fs, new Path(CRAWL_LOG_INTERMEDIATE_PATH));
LOG.info("Scanning for completed segments");
List<Path> processedLogs = scanForCompletedSegments(fs,conf);
LOG.info("Found " + processedLogs.size() + " processed logs");
// remove processed from candidate set ...
candidateSet.removeAll(processedLogs);
// ok we are ready to go ..
LOG.info("There are: " + candidateSet.size() + " logs in need of parsing");
    while (!candidateSet.isEmpty()) {
ImmutableList.Builder<Path> pathBuilder = new ImmutableList.Builder<Path>();
Iterator<Path> iterator = Iterators.limit(candidateSet.iterator(),LOGS_PER_ITERATION);
while (iterator.hasNext()) {
pathBuilder.add(iterator.next());
iterator.remove();
}
LOG.info("Queueing Parse");
queue(fs,conf,pathBuilder.build());
LOG.info("Queued Parse");
// in test mode, queue only a single segment's worth of data
if (conf.getBoolean(CONF_PARAM_TEST_MODE, false)) {
LOG.info("Test Mode - Queueing only a single Item");
break;
}
}
// queue shutdown items
for (int i=0;i<maxSimultaneousJobs;++i) {
_queue.put(new QueueItem());
}
}
void run() {
LOG.info("Starting Threads");
// startup threads ..
for (int i=0;i<maxSimultaneousJobs;++i) {
Thread thread = new Thread(new QueueTask());
thread.start();
}
// ok wait for them to die
LOG.info("Waiting for Queue Threads to Die");
jobThreadSemaphore.acquireUninterruptibly();
LOG.info("Queue Threads Dead. Exiting");
}
static class QueueItem {
QueueItem() {
pathList = null;
}
QueueItem(FileSystem fs,Configuration conf,ImmutableList<Path> pathList) {
this.conf = conf;
this.fs = fs;
this.pathList = pathList;
}
public Configuration conf;
public FileSystem fs;
public ImmutableList<Path> pathList;
}
private void queue(FileSystem fs,Configuration conf,ImmutableList<Path> paths) {
try {
_queue.put(new QueueItem(fs,conf,paths));
} catch (InterruptedException e) {
      // log and preserve the interrupt status instead of swallowing it
      LOG.error(CCStringUtils.stringifyException(e));
      Thread.currentThread().interrupt();
}
}
class QueueTask implements Runnable {
@Override
public void run() {
while (true) {
LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Running");
try {
QueueItem item = _queue.take();
if (item.pathList != null) {
LOG.info("Queue Thread:" + Thread.currentThread().getId() + " got item with Paths:" + item.pathList);
LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Starting Job");
try {
parse(item.fs,item.conf,item.pathList);
} catch (IOException e) {
LOG.error("Queue Thread:" + Thread.currentThread().getId() + " threw exception:" + CCStringUtils.stringifyException(e));
}
}
else {
LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Got Shutdown Queue Item - EXITING");
break;
}
        } catch (InterruptedException e) {
          // interrupted while blocked on the queue; loop and try again
        }
}
LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Released Semaphore");
jobThreadSemaphore.release();
}
}
public static void main(String[] args)throws Exception {
Configuration conf = new Configuration();
conf.addResource(new Path("/home/hadoop/conf/core-site.xml"));
conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml"));
CommandLineParser parser = new GnuParser();
try {
// parse the command line arguments
CommandLine line = parser.parse( options, args );
boolean testMode = line.hasOption("testMode");
if (testMode) {
LOG.info("Running in Test Mode");
conf.setBoolean(CONF_PARAM_TEST_MODE,true);
}
else {
LOG.info("Running in Prod Mode");
}
EC2ParserTask task = new EC2ParserTask(conf);
task.run();
task.shutdown();
System.exit(0);
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
System.exit(1);
}
private static void parse(FileSystem fs,Configuration conf,ImmutableList<Path> paths)throws IOException {
LOG.info("Need to Parse:" + paths.toString());
    // the segment id is the job launch timestamp; it names the output path,
    // the job logs path, and the segment's manifest directory
    long segmentId = System.currentTimeMillis();
String segmentPathPrefix = conf.get(SEGMENT_PATH_PROPERTY);
Path outputPath = new Path(S3N_BUCKET_PREFIX + segmentPathPrefix + Long.toString(segmentId));
LOG.info("Starting Map-Reduce Job. SegmentId:" + segmentId + " OutputPath:" + outputPath);
// run job...
JobConf jobConf = new JobBuilder("parse job",conf)
.inputs(paths)
.inputFormat(SequenceFileInputFormat.class)
.keyValue(Text.class, ParseOutput.class)
.mapRunner(ParserMapRunner.class)
.mapper(ParserMapper.class)
// allow two attempts to process the split
// after that, we will pick it up in post processing step
.maxMapAttempts(2)
.maxMapTaskFailures(1000)
.speculativeExecution(true)
.numReducers(0)
.outputFormat(ParserOutputFormat.class)
.output(outputPath)
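        // 512MB (4 x 128MB) minimum split size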
.minSplitSize(134217728*4)
.reuseJVM(1000)
.build();
Path jobLogsPath = new Path(S3N_BUCKET_PREFIX + conf.get(JOB_LOGS_PATH_PROPERTY) + Long.toString(segmentId));
jobConf.set("hadoop.job.history.user.location", jobLogsPath.toString());
jobConf.set("fs.default.name", S3N_BUCKET_PREFIX);
jobConf.setLong("cc.segmet.id", segmentId);
// set task timeout to 20 minutes
jobConf.setInt("mapred.task.timeout", 20 * 60 * 1000);
// set mapper runtime to max 45 minutes .....
jobConf.setLong(ParserMapper.MAX_MAPPER_RUNTIME_PROPERTY, 45 * 60 * 1000);
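    // resolves to the package-local OutputCommitter, not Hadoop's abstract class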
jobConf.setOutputCommitter(OutputCommitter.class);
// allow lots of failures per tracker per job
jobConf.setMaxTaskFailuresPerTracker(Integer.MAX_VALUE);
initializeTaskDataAwareJob(jobConf,segmentId);
JobClient.runJob(jobConf);
LOG.info("Job Finished. Writing Segments Manifest Files");
writeSegmentManifestFile(fs,conf,segmentId,paths);
String validSegmentPathPrefix = conf.get(VALID_SEGMENTS_PATH_PROPERTY);
Path manifestOutputPath = new Path(validSegmentPathPrefix+Long.toString(segmentId));
fs.mkdirs(manifestOutputPath);
finalizeJob(fs,conf,jobConf,manifestOutputPath,segmentId);
}
private static List<Path> scanForCompletedSegments(FileSystem fs,Configuration conf) throws IOException {
ImmutableList.Builder<Path> pathListBuilder = new ImmutableList.Builder<Path>();
String validSegmentPathPrefix = conf.get(VALID_SEGMENTS_PATH_PROPERTY);
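    // valid segment directories are named by their numeric (timestamp) segment id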
for (FileStatus fileStatus : fs.globStatus(new Path(validSegmentPathPrefix+"[0-9]*"))) {
pathListBuilder.addAll(scanSegmentManifestFile(fs,fileStatus.getPath()));
}
return pathListBuilder.build();
}
private static List<Path> scanSegmentManifestFile(FileSystem fs,Path segmentPath)throws IOException {
LOG.info("Scanning Segment Manifest for segment at path:" + segmentPath);
Path manifestPath = new Path(segmentPath,SEGMENT_MANIFEST_FILE);
ImmutableList.Builder<Path> pathListBuilder = new ImmutableList.Builder<Path>();
for (String pathStr : textFileToList(fs, manifestPath)) {
pathListBuilder.add(new Path(pathStr));
}
return pathListBuilder.build();
}
private static void writeSegmentManifestFile(FileSystem fs,Configuration conf,long segmentTimestamp,List<Path> logsInSegment) throws IOException {
LOG.info("Writing Segment Manifest for Segment: " + segmentTimestamp + " itemCount:" + logsInSegment.size());
ImmutableList.Builder<String> listBuilder = new ImmutableList.Builder<String>();
String validSegmentPathPrefix = conf.get(VALID_SEGMENTS_PATH_PROPERTY);
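    // strip the s3n bucket prefix so manifest entries are bucket-relative paths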
for (Path logPath : logsInSegment) {
listBuilder.add(logPath.toString().substring(S3N_BUCKET_PREFIX.length()));
}
listToTextFile(listBuilder.build(), fs, new Path(validSegmentPathPrefix+Long.toString(segmentTimestamp)+"/"+SEGMENT_MANIFEST_FILE));
}
/** build a list of parse candidates sorted by timestamp
*
* @param fs
* @param logFilePath
* @return a Set of Candidates
* @throws IOException
*/
private static TreeSet<Path> buildCandidateList(FileSystem fs,Path logFilePath)throws IOException {
TreeSet<Path> candidateList = new TreeSet<Path>(new Comparator<Path>() {
@Override
public int compare(Path p1, Path p2) {
String n1 = p1.getName();
String n2 = p2.getName();
        Matcher m1 = CRAWL_LOG_REG_EXP.matcher(n1);
        Matcher m2 = CRAWL_LOG_REG_EXP.matcher(n2);
        // matches() must be called (and checked) before group(); fall back to
        // a plain name comparison for any file that doesn't fit the pattern
        if (!m1.matches() || !m2.matches()) {
          return n1.compareTo(n2);
        }
        Long v1 = Long.parseLong(m1.group(1));
        Long v2 = Long.parseLong(m2.group(1));
        return v1.compareTo(v2);
}
});
LOG.info("Scanning for Log Files at:" + logFilePath);
FileStatus candidateItems[] = fs.globStatus(new Path(logFilePath,"CrawlLog*"));
for (FileStatus candidate : candidateItems) {
candidateList.add(candidate.getPath());
}
return candidateList;
}
static void printUsage() {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp( "EC2Launcher", options );
}
}