package org.commoncrawl.mapred.segmenter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.SegmentGeneratorBundleKey;
import org.commoncrawl.mapred.SegmentGeneratorItem;
import org.commoncrawl.mapred.SegmentGeneratorItemBundle;
import org.commoncrawl.protocol.CrawlDatumAndMetadata;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.TextBytes;
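/**
 * Reducer that packs SegmentGeneratorItemBundles into per-crawler, per-bucket
 * crawl segment files. Each bundle is converted into a CrawlSegmentHost record
 * and appended to a numbered SequenceFile under crawlerName/bucket; a new
 * segment is started once the active one reaches Segmenter.SEGMENT_SIZE_MAX
 * urls. Nothing is emitted through the normal output collector.
 */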
public class SegmenterReducer implements Reducer<SegmentGeneratorBundleKey,SegmentGeneratorItemBundle,NullWritable,NullWritable> {
static final Log LOG = LogFactory.getLog(SegmenterReducer.class);
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
enum Counters {
BAD_URL_DURING_HOST_EXTRACTION
}
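// write every bundle for this key directly into the active segment file;
// the OutputCollector itself is never used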
@Override
public void reduce(SegmentGeneratorBundleKey key,Iterator<SegmentGeneratorItemBundle> values,
OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
throws IOException {
while (values.hasNext()) {
writeBundle(values.next(),reporter);
}
}
// work/debug output paths for this (crawler, bucket) partition
Path _workOutputPath = null;
Path _debugOutputPath = null;
// id of the segment currently being written (-1 until the first writer is created)
int _activeSegmentId = -1;
// writer for the active segment file, created lazily via ensureWriter
SequenceFile.Writer _activeWriter = null;
// number of urls appended to the active segment so far
int _activeSegmentURLCount = -1;
FileSystem _fs = null;
JobConf _conf;
// crawler names extracted from the job configuration
String[] crawlerNames = null;
// reduce task partition number and the (crawler, bucket) it maps to
int taskNumber;
int crawlerIndex = -1;
int bucketIndex = -1;
String crawlerName = null;
//Writer _urlDebugURLWriter;
//FSDataOutputStream _debugURLStream;
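// derive the (crawler, bucket) this reduce partition is responsible for and
// set up a clean work directory for its segment files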
@Override
public void configure(JobConf job) {
String crawlers = job.get(CrawlEnvironment.PROPERTY_CRAWLERS);
// get buckets per crawler ...
int bucketsPerCrawler = job.getInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, 8);
// extract crawler names ...
crawlerNames = crawlers.split(",");
// get the local task index ...
taskNumber = job.getInt("mapred.task.partition", 0);
// compute crawler index based on num crawlers
crawlerIndex = taskNumber / bucketsPerCrawler;
// compute bucket id
bucketIndex = taskNumber % bucketsPerCrawler;
// get the crawler name ..
crawlerName = crawlerNames[crawlerIndex];
// calculate work path ...
_workOutputPath = new Path(FileOutputFormat.getOutputPath(job),crawlerName +"/"+NUMBER_FORMAT.format(bucketIndex));
_debugOutputPath = new Path(FileOutputFormat.getOutputPath(job),"debug/" + crawlerName +"/"+NUMBER_FORMAT.format(bucketIndex));
try {
_fs = FileSystem.get(_workOutputPath.toUri(),job);
_fs.delete(_workOutputPath, true);
_fs.delete(_debugOutputPath, true);
LOG.info("Making Directory:" + _workOutputPath);
_fs.mkdirs(_workOutputPath);
// _fs.mkdirs(_debugOutputPath);
_conf = job;
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
@Override
public void close() throws IOException {
flushActiveWriter();
}
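// reusable scratch objects (datumReaderStream and datum are currently unused)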
DataInputBuffer datumReaderStream = new DataInputBuffer();
CrawlDatumAndMetadata datum = new CrawlDatumAndMetadata();
TextBytes urlBytes = new TextBytes();
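// converts a bundle of urls for a single host into a CrawlSegmentHost record
// and appends it to the active segment, rolling to a new segment once full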
void writeBundle(SegmentGeneratorItemBundle bundle,Reporter reporter)throws IOException {
reporter.incrCounter("", "GOT_BUNDLE", 1);
CrawlSegmentHost host = null;
int originalPosition = 0;
for (SegmentGeneratorItem item : bundle.getUrls()) {
if (host == null) {
urlBytes.set(item.getUrlAsBuffer().getReadOnlyBytes(),0,item.getUrlAsBuffer().getCount());
GoogleURL urlObject = new GoogleURL(item.getUrl().toString());
String hostName = urlObject.getHost();
if (hostName == null || hostName.length() == 0) {
reporter.incrCounter(Counters.BAD_URL_DURING_HOST_EXTRACTION, 1);
continue;
}
else {
reporter.incrCounter("", "GENERATED_SEGMENT_HOST", 1);
host = new CrawlSegmentHost();
host.setHostFP(bundle.getHostFP());
host.setHostName(hostName);
//LOG.info("Allocated new CrawlSegmentHost:" + hostName);
}
}
// create url item object ...
CrawlSegmentURL urlObjectOut = new CrawlSegmentURL();
urlObjectOut.getUrlAsTextBytes().set(item.getUrlAsTextBytes());
urlObjectOut.setFieldDirty(CrawlSegmentURL.Field_URL);
urlObjectOut.setUrlFP(item.getUrlFP());
// (the crawl segment id is assigned below, once ensureWriter has picked the active segment)
urlObjectOut.setOriginalPosition(originalPosition++);
if (item.isFieldDirty(SegmentGeneratorItem.Field_LASTMODIFIEDTIME)) {
urlObjectOut.setLastModifiedTime(item.getLastModifiedTime());
}
if (item.isFieldDirty(SegmentGeneratorItem.Field_ETAG)) {
urlObjectOut.setEtag(item.getEtag());
}
// debug
//urlBytes.set(item.getUrlAsBuffer().getReadOnlyBytes(),0,item.getUrlAsBuffer().getCount());
//LOG.info("Added URL:" + urlBytes.toString());
// add to host ...
host.getUrlTargets().add(urlObjectOut);
}
if (host != null) {
// sort the targets in ascending order ...
//Collections.sort(host.getUrlTargets());
// access writer (this may roll the active segment id forward) ...
SequenceFile.Writer writer = ensureWriter(reporter);
// stamp each url with the segment it is actually being written into
for (CrawlSegmentURL target : host.getUrlTargets()) {
target.setCrawlSegmentId(_activeSegmentId);
}
reporter.incrCounter("", "APPENDED_HOST_TO_FILE", 1);
// and spit the host out into the file ...
writer.append(new LongWritable(host.getHostFP()),host);
/*
_urlDebugURLWriter.append("\nHost:" + host.getHostName());
for (CrawlSegmentURL urlObject : host.getUrlTargets()) {
_urlDebugURLWriter.append(" " + urlObject.getUrl());
}
*/
// ok increment url count for active segment
_activeSegmentURLCount += host.getUrlTargets().size();
//LOG.info("Wrote Host:" + host.getHostName() + " URLCount:" + host.getUrlTargets().size() + " Segment URLCount:"+ _activeSegmentURLCount);
// now see if need a new segment
if (_activeSegmentURLCount >= Segmenter.SEGMENT_SIZE_MAX) {
//LOG.info("Flushing Active Segment");
flushActiveWriter();
}
}
}
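// closes the active segment file (if any) so the next host starts a new segment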
void flushActiveWriter() throws IOException {
if (_activeWriter != null) {
// close out the current segment file and reset the url count
_activeWriter.close();
_activeWriter = null;
_activeSegmentURLCount = 0;
}
/*
if (_debugURLStream != null) {
_urlDebugURLWriter.flush();
_debugURLStream.close();
_urlDebugURLWriter = null;
_debugURLStream = null;
}
*/
}
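// lazily creates the SequenceFile writer for the next segment id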
SequenceFile.Writer ensureWriter(Reporter reporter)throws IOException {
if (_activeWriter == null) {
// increment segment id
_activeSegmentId++;
// create path
Path outputPath = new Path(_workOutputPath,Integer.toString(_activeSegmentId));
//Path debugPath = new Path(_debugOutputPath,Integer.toString(_activeSegmentId));
reporter.incrCounter("", "CREATED_WRITER", 1);
LOG.info("Creating Writer at:" + outputPath);
_activeWriter = SequenceFile.createWriter(
_fs,
_conf,
outputPath,
LongWritable.class,
CrawlSegmentHost.class,
SequenceFileOutputFormat.getOutputCompressionType(_conf),
reporter);
//_debugURLStream = _fs.create(debugPath);
//_urlDebugURLWriter = new OutputStreamWriter(_debugURLStream,Charset.forName("UTF-8"));
_activeSegmentURLCount = 0;
}
return _activeWriter;
}
}