/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.pipelineV3.crawllistgen;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.SegmentGeneratorBundleKey;
import org.commoncrawl.mapred.SegmentGeneratorItem;
import org.commoncrawl.mapred.SegmentGeneratorItemBundle;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.Tuples.Pair;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
*
* @author rana
*
*/
public class GenBundlesStep extends CrawlPipelineStep implements
Reducer<IntWritable, Text, SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> {
enum Counters {
GOT_RECORD, GOT_CRAWLSTATS, GOT_HOMEPAGE_DATA, GOT_BLOGPROBE_DATA, GOT_CRAWLURL_DATA, EMITTED_URL_RECORD,
EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, EMITTING_URL_RECORD_WITH_DOMINSTATS, EMITTED_RECORD_HAD_CRAWLSTATUS,
EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, GOT_FEEDURL_DATA, DOMAIN_WITH_GT_10MILLION_URLS, DOMAIN_WITH_GT_1MILLION_URLS,
DOMAIN_WITH_GT_100K_URLS, DOMAIN_WITH_GT_10K_URLS, DOMAIN_WITH_GT_50K_URLS, DOMAIN_WITH_GT_1K_URLS,
DOMAIN_WITH_GT_100_URLS, DOMAIN_WITH_GT_10_URLS, DOMAIN_WITH_LT_10_URLS, DOMAIN_WITH_1_URL, GOT_REDIRECT_DATA,
SKIPPING_REDIRECTED_URL, SKIPPING_ALREADY_FETCHED, ALLOWING_HOMEPAGE_OR_FEEDURL, SKIPPING_BLOGPROBE_URL,
RECRAWLING_BLOGPROBE_URL, SKIPPING_INVALID_URL, SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY,
SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, SKIPPING_BAD_DOMAIN_URL, SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL,
SKIPPING_QUERY_URL, NULL_FP_FOR_URL, SKIPPING_ALREADY_EMITTED_URL, FLUSHED_BLOOMFILTER, SKIPPING_BLOCKED_DOMAIN,
LET_THROUGH_QUERY_URL, HIT_QUERY_CHECK_CONDITION, SKIPPING_INVALID_LENGTH_URL, TRANSITIONING_DOMAIN,
SPILLED_1_MILLION_SKIPPED_REST, SKIPPING_IP_ADDRESS, NO_SOURCE_URL_IN_JSON, INVALID_URL_OBJECT, INVALID_SCHEME
}
private static final Log LOG = LogFactory.getLog(GenBundlesStep.class);
public static final String OUTPUT_DIR_NAME = "bundlesGenerator";
static final int NUM_BITS = 11;
static final int NUM_ELEMENTS = 1 << 28;
static final int FLUSH_THRESHOLD = 1 << 23;
public static final int SPILL_THRESHOLD = 250;
private static void rawValueToTextBytes(DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer, TextBytes textOut)
throws IOException {
inputBuffer.reset(dataBuffer.getData(), dataBuffer.getLength());
int newLength = WritableUtils.readVInt(inputBuffer);
textOut.set(inputBuffer.getData(), inputBuffer.getPosition(), newLength);
}
private static void rawValueToWritable(RawRecordValue rawValue, DataInputBuffer inputBuffer, Writable typeOut)
throws IOException {
inputBuffer.reset(rawValue.data.getData(), rawValue.data.getLength());
typeOut.readFields(inputBuffer);
}
JsonParser parser = new JsonParser();
public static final int HAS_HOMEPAGE_URLDATA = 2;
public static final int HAS_BLOGPROBE_URLDATA = 4;
public static final int HAS_FEED_URLDATA = 8;
public static final int HAS_CRAWL_STATUS = 16;
public static final int HAS_REDIRECT_DATA = 32;
static final int NUM_HASH_FUNCTIONS = 10;
int _flags = 0;
boolean _skipDomain = false;
boolean _skipEverythingButHomepage = false;
TextBytes _newDomainBytes = new TextBytes();
TextBytes _contextURLBytes = new TextBytes();
TextBytes _newURLBytes = new TextBytes();
BooleanWritable _blogURLSkipFlag = new BooleanWritable(true);
TextBytes tempTextBuffer = new TextBytes();
DataInputBuffer tempBuffer = new DataInputBuffer();
JsonObject _domainStats = null;
double _domainRank = 0.0;
JsonObject _crawlStatus = null;
URLFPBloomFilter _emittedURLSFilter;
long _emittedURLSInFilter = 0;
// spill state ...
ArrayList<SegmentGeneratorItem> items = new ArrayList<SegmentGeneratorItem>();
long _currentDomainId = -1;
String currentDomainName = "";
int currentDomainURLCount = 0;
int currentDomainSpilledItemCount = 0;
int currentDomainCrawlIdx = -1;
SegmentGeneratorItemBundle currentBundle = null;
int currentBundleId = 0;
OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> _collector = null;
int crawlerCount = 0;
Pattern ipAddressRegExPattern = Pattern.compile("[0-9]*\\.[0-9]*\\.[0-9]*\\.[0-9]*");
URLFPV2 fpTest = new URLFPV2();
JobConf _jobConf = null;
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
Writer urlDebugURLWriter;
FSDataOutputStream debugURLStream;
int partitionNumber;
public GenBundlesStep() {
super(null, null, null);
}
public GenBundlesStep(CrawlPipelineTask task) {
super(task, "Generate Bundles", OUTPUT_DIR_NAME);
}
@Override
public void close() throws IOException {
flushDomain(null);
urlDebugURLWriter.flush();
debugURLStream.close();
}
public void configure(JobConf job) {
_jobConf = job;
crawlerCount = job.getInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, CrawlEnvironment.CRAWLERS.length);
partitionNumber = job.getInt("mapred.task.partition", -1);
try {
FileSystem fs = FileSystem.get(job);
Path workPath = FileOutputFormat.getOutputPath(job);
debugURLStream = fs.create(new Path(workPath, "debugURLS-" + NUMBER_FORMAT.format(partitionNumber)));
urlDebugURLWriter = new OutputStreamWriter(debugURLStream, Charset.forName("UTF-8"));
_emittedURLSFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw new RuntimeException(e);
}
}
/** potentially reset state based on domain id transition **/
private void domainTransition(long newDomainFP, String newDomainName, Reporter reporter) throws IOException {
if (_currentDomainId != -1) {
flushDomain(reporter);
}
_flags = 0;
_domainStats = null;
_domainRank = 0.0;
_skipDomain = false;
_skipEverythingButHomepage = false;
// zero out item count ...
items.clear();
// reset domain id
_currentDomainId = newDomainFP;
currentDomainCrawlIdx = (((int) _currentDomainId & Integer.MAX_VALUE) % crawlerCount);
// reset current domain url count
currentDomainURLCount = 0;
currentDomainName = newDomainName;
// and reset last bundle id
currentBundleId = 0;
// reset spill count for domain
currentDomainSpilledItemCount = 0;
if (BlockedDomainList.blockedDomains.contains(newDomainFP)) {
reporter.incrCounter(Counters.SKIPPING_BLOCKED_DOMAIN, 1);
LOG.info("Skipping Blocked Domain:" + newDomainName);
_skipDomain = true;
}
if (ipAddressRegExPattern.matcher(currentDomainName.trim()).matches()) {
reporter.incrCounter(Counters.SKIPPING_IP_ADDRESS, 1);
_skipDomain = true;
}
}
void emitLastRecord(Reporter reporter) throws IOException {
if (_flags != 0) {
if (_domainStats == null) {
reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1);
} else {
reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1);
}
if (_crawlStatus != null) {
reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1);
} else {
reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1);
}
}
if (_contextURLBytes.getLength() >= 4097) {
reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
} else {
GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString());
if (!skipRecord(urlObject, reporter)) {
if (urlObject.has_query()) {
reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
}
URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
if (fp != null) {
if (_emittedURLSFilter.isPresent(fp)) {
reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
} else {
_emittedURLSFilter.add(fp);
_emittedURLSInFilter++;
SegmentGeneratorItem itemValue = new SegmentGeneratorItem();
itemValue.setDomainFP(fp.getDomainHash());
itemValue.setRootDomainFP(fp.getRootDomainHash());
itemValue.setUrlFP(fp.getUrlHash());
itemValue.setUrl(urlObject.getCanonicalURL());
itemValue.setPageRank(0);
itemValue.setModifiedStatus((byte) 0);
items.add(itemValue);
if (items.size() >= SPILL_THRESHOLD)
spillItems(reporter);
}
} else {
reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
}
}
}
// reset stuff
_flags = 0;
_crawlStatus = null;
_contextURLBytes.clear();
_blogURLSkipFlag.set(true);
}
/** flush the currently active bundle **/
void flushCurrentBundle(Reporter reporter) throws IOException {
if (currentBundle != null && currentBundle.getUrls().size() != 0) {
int crawlerIndex = ((int) currentBundle.getHostFP() & Integer.MAX_VALUE) % crawlerCount;
// generate a bundle key
SegmentGeneratorBundleKey bundleKey = new SegmentGeneratorBundleKey();
bundleKey.setRecordType(0);
bundleKey.setCrawlerId(crawlerIndex);
bundleKey.setDomainFP(_currentDomainId);
// and increment bundle id ...
bundleKey.setBundleId(currentBundleId++);
bundleKey.setAvgPageRank((float) _domainRank);
if (reporter != null) {
reporter.incrCounter("CRAWLER", Long.toString(crawlerIndex), 1);
}
// ok spill bundle ...
_collector.collect(bundleKey, currentBundle);
}
// current bundle is now null
currentBundle = null;
}
private void flushDomain(Reporter reporter) throws IOException {
if (_currentDomainId != -1) {
if (items.size() != 0) {
spillItems(reporter);
}
if (reporter != null) {
if (currentDomainSpilledItemCount >= 10000000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_10MILLION_URLS, 1);
} else if (currentDomainSpilledItemCount >= 1000000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_1MILLION_URLS, 1);
} else if (currentDomainSpilledItemCount >= 100000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_100K_URLS, 1);
} else if (currentDomainSpilledItemCount >= 50000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_50K_URLS, 1);
} else if (currentDomainSpilledItemCount >= 10000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_10K_URLS, 1);
} else if (currentDomainSpilledItemCount >= 1000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_1K_URLS, 1);
} else if (currentDomainSpilledItemCount >= 100) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_100_URLS, 1);
} else if (currentDomainSpilledItemCount >= 10) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_10_URLS, 1);
} else if (currentDomainSpilledItemCount > 1) {
reporter.incrCounter(Counters.DOMAIN_WITH_LT_10_URLS, 1);
} else if (currentDomainSpilledItemCount == 1) {
reporter.incrCounter(Counters.DOMAIN_WITH_1_URL, 1);
}
}
_currentDomainId = -1;
currentDomainCrawlIdx = -1;
currentDomainName = "";
currentDomainSpilledItemCount = 0;
currentDomainURLCount = 0;
}
}
/** generate a bundle from the given list of items and simultaneously flush it **/
void generateABundle(long domainFP, List<SegmentGeneratorItem> items, Reporter reporter) throws IOException {
SegmentGeneratorItemBundle bundle = getBundleForDomain(domainFP);
// LOG.info("Generating Bundle:" + currentBundleId + " for DH:" + domainFP);
float maxPageRank = 0.0f;
for (SegmentGeneratorItem item : items) {
// LOG.info("URL:" + item.getUrl() + " Status:" +
// CrawlDatum.getStatusName(item.getStatus()) +" PR:" +
// item.getMetadata().getPageRank());
bundle.getUrls().add(item);
currentDomainURLCount++;
maxPageRank = Math.max(maxPageRank, item.getPageRank());
if (currentDomainURLCount <= 200) {
urlDebugURLWriter.append(item.getUrl() + "\t" + item.getModifiedStatus() + "\t" + item.getPageRank() + "\n");
}
}
// LOG.info("Done Generating Bunlde - PR is:" + maxPageRank);
// set page rank for bundle
bundle.setMaxPageRank(maxPageRank);
flushCurrentBundle(reporter);
}
/** helper method **/
private SegmentGeneratorItemBundle getBundleForDomain(long domainFP) throws IOException {
currentBundle = new SegmentGeneratorItemBundle();
currentBundle.setHostFP(domainFP);
return currentBundle;
}
@Override
public Log getLogger() {
return LOG;
}
void iterateItems(MultiFileInputReader<TextBytes> multiFileInputReader, Reporter reporter) throws IOException {
Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
int iterationCount = 0;
while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
reporter.incrCounter(Counters.GOT_RECORD, 1);
int type = PartitionUtils.getTypeGivenPartitionKey(nextItem.e0._keyObject);
PartitionUtils.getDomainGivenPartitionKey(nextItem.e0._keyObject, _newDomainBytes);
PartitionUtils.getURLGivenPartitionKey(nextItem.e0._keyObject, _newURLBytes);
if (_newURLBytes.compareTo(_contextURLBytes) != 0) {
emitLastRecord(reporter);
}
long newDomainFP = SuperDomainList.domainFingerprintGivenName(_newDomainBytes.toString());
if (newDomainFP != _currentDomainId) {
reporter.incrCounter(Counters.TRANSITIONING_DOMAIN, 1);
domainTransition(newDomainFP, _newDomainBytes.toString(), reporter);
}
RawRecordValue valueRaw = Iterables.getFirst(nextItem.e1, null);
switch (type) {
case CrawlListGeneratorTask.KEY_TYPE_CRAWLSTATS: {
reporter.incrCounter(Counters.GOT_CRAWLSTATS, 1);
setDomainStats(rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer), reporter);
}
break;
case CrawlListGeneratorTask.KEY_TYPE_HOMEPAGE_URL: {
reporter.incrCounter(Counters.GOT_HOMEPAGE_DATA, 1);
rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
_flags |= HAS_HOMEPAGE_URLDATA;
}
break;
case CrawlListGeneratorTask.KEY_TYPE_BLOGPROBE_URL: {
reporter.incrCounter(Counters.GOT_BLOGPROBE_DATA, 1);
rawValueToWritable(valueRaw, tempBuffer, _blogURLSkipFlag);
_contextURLBytes.set(_newURLBytes);
_flags |= HAS_BLOGPROBE_URLDATA;
}
break;
case CrawlListGeneratorTask.KEY_TYPE_FEED_URL: {
reporter.incrCounter(Counters.GOT_FEEDURL_DATA, 1);
rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
_flags |= HAS_FEED_URLDATA;
}
break;
case CrawlListGeneratorTask.KEY_TYPE_REDIRECT_RECORD: {
reporter.incrCounter(Counters.GOT_REDIRECT_DATA, 1);
_contextURLBytes.set(_newURLBytes);
_flags |= HAS_REDIRECT_DATA;
}
break;
case CrawlListGeneratorTask.KEY_TYPE_CRAWLDATA: {
reporter.incrCounter(Counters.GOT_CRAWLURL_DATA, 1);
_contextURLBytes.set(_newURLBytes);
_crawlStatus = rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer);
_flags |= HAS_CRAWL_STATUS;
}
break;
}
}
// flush trailing record ...
emitLastRecord(reporter);
flushDomain(reporter);
}
private JsonObject rawValueToJsonObject(DataOutputBuffer dataBuffer, DataInputBuffer stream, TextBytes tempTextBuffer)
throws IOException {
rawValueToTextBytes(dataBuffer, stream, tempTextBuffer);
try {
return parser.parse(tempTextBuffer.toString()).getAsJsonObject();
} catch (Exception e) {
throw new IOException("Exception Building Json from String:" + tempTextBuffer.toString());
}
}
@Override
public void reduce(IntWritable key, Iterator<Text> values,
OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> output, Reporter reporter)
throws IOException {
// collect all incoming paths first
Vector<Path> incomingPaths = new Vector<Path>();
while (values.hasNext()) {
String path = values.next().toString();
LOG.info("Found Incoming Path:" + path);
incomingPaths.add(new Path(path));
}
// set up merge attributes
Configuration localMergeConfig = new Configuration(_jobConf);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, TextBytes.Comparator.class,
TextBytes.Comparator.class);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class);
// ok now spawn merger
MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
FileSystem.get(_jobConf), incomingPaths, localMergeConfig);
// save a reference to the collector
_collector = output;
iterateItems(multiFileInputReader, reporter);
}
@Override
public void runStep(Path outputPathLocation) throws IOException {
ImmutableList<Path> pathList = ImmutableList.of(getOutputDirForStep(PartitionCrawlDBStep.class),
getOutputDirForStep(GenFeedUrlsStep.class), getOutputDirForStep(GenHomepageUrlsStep.class),
getOutputDirForStep(GenBlogPlatformUrlsStep.class), getOutputDirForStep(PartitionCrawlStatsStep.class),
getOutputDirForStep(PartitionRedirectDataStep.class));
JobConf jobConf = new JobBuilder("BundleWriter Step", getConf())
.inputs(pathList)
.inputFormat(MultiFileMergeInputFormat.class)
.mapperKeyValue(IntWritable.class, Text.class)
.outputKeyValue(SegmentGeneratorBundleKey.class, SegmentGeneratorItemBundle.class)
.outputFormat(SequenceFileOutputFormat.class)
.reducer(GenBundlesStep.class, false)
.partition(MultiFileMergePartitioner.class)
.numReducers(CrawlListGeneratorTask.NUM_SHARDS)
.speculativeExecution(false)
.output(outputPathLocation)
.compressMapOutput(false)
.compressor(CompressionType.BLOCK, SnappyCodec.class)
.build();
LOG.info("Starting JOB");
JobClient.runJob(jobConf);
LOG.info("Finsihed JOB");
}
void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {
_domainStats = domainStats;
if (_domainStats.has("dR")) {
_domainRank = _domainStats.get("dR").getAsDouble();
} else {
_domainRank = 0.0;
}
if (_domainStats.has("urls")) {
int urlCount = _domainStats.get("urls").getAsInt();
int crawledCount = _domainStats.get("crawled").getAsInt();
int Http200Count = (_domainStats.has("200")) ? _domainStats.get("200").getAsInt() : 0;
if (urlCount != 0 && crawledCount != 0 && Http200Count == 0) {
reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
LOG.info("Skipping Everything But Homepage for Domain:" + _newDomainBytes.toString() + " CrawledCount:"
+ crawledCount + " HTTP200Count:" + Http200Count + " URLCount:" + urlCount);
_skipEverythingButHomepage = true;
} else if (urlCount > 25000 && urlCount < 100000) {
if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 3.0) {
LOG.info("Skipping Domain:" + _newDomainBytes.toString());
reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
_skipDomain = true;
}
} else if (urlCount > 250000 && urlCount < 1000000) {
if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 4.0) {
LOG.info("Skipping Domain:" + _newDomainBytes.toString());
reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
_skipDomain = true;
}
} else if (urlCount > 1000000) {
if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 5.0) {
LOG.info("Skipping Domain:" + _newDomainBytes.toString());
reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
_skipDomain = true;
}
}
}
if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
_emittedURLSFilter.clear();
_emittedURLSInFilter = 0;
reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
}
}
boolean skipRecord(GoogleURL urlObject, Reporter reporter) {
if (_skipDomain) {
reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_URL, 1);
return true;
}
if (!urlObject.isValid()) {
reporter.incrCounter(Counters.SKIPPING_INVALID_URL, 1);
return true;
} else if (urlObject.has_query()) {
reporter.incrCounter(Counters.HIT_QUERY_CHECK_CONDITION, 1);
if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) == 0) {
reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
return true;
}
} else {
// if redirect ... skip
if ((_flags & HAS_REDIRECT_DATA) != 0) {
reporter.incrCounter(Counters.SKIPPING_REDIRECTED_URL, 1);
return true;
}
if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) != 0) {
if (!_skipEverythingButHomepage || ((_flags & HAS_HOMEPAGE_URLDATA) != 0)) {
reporter.incrCounter(Counters.ALLOWING_HOMEPAGE_OR_FEEDURL, 1);
return false;
}
}
if (_skipEverythingButHomepage) {
reporter.incrCounter(Counters.SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL, 1);
return true;
}
if (_crawlStatus != null) {
if (_crawlStatus.has("crawl_status")) {
JsonObject realCrawlStatus = _crawlStatus.get("crawl_status").getAsJsonObject();
if (realCrawlStatus.has("http_result")) {
int httpResult = realCrawlStatus.get("http_result").getAsInt();
if (httpResult == 200 || httpResult == 404) {
if ((_flags & HAS_BLOGPROBE_URLDATA) != 0) {
if (_blogURLSkipFlag.get()) {
reporter.incrCounter(Counters.SKIPPING_BLOGPROBE_URL, 1);
return true;
} else {
reporter.incrCounter(Counters.RECRAWLING_BLOGPROBE_URL, 1);
return false;
}
} else {
reporter.incrCounter(Counters.SKIPPING_ALREADY_FETCHED, 1);
return true;
}
}
}
}
}
}
return false;
}
/** spill cached items **/
void spillItems(Reporter reporter) throws IOException {
// if item count exceeds spill threshold .. or we ran out of data ...
if (items.size() != 0) {
// LOG.info("Spilling Bundle:" + currentBundleId + " for DH:" +
// currentDomain + " ItemCount:" + subList.size());
// flush items
generateABundle(_currentDomainId, items, reporter);
if (reporter != null) {
reporter.progress();
}
// ok, increment counts ...
currentDomainSpilledItemCount += items.size();
if (currentDomainSpilledItemCount >= 1000000) {
reporter.incrCounter(Counters.SPILLED_1_MILLION_SKIPPED_REST, 1);
LOG.info("Skipping Remaining URLS for Domain:" + currentDomainName);
_skipDomain = true;
}
}
// reset list ...
items.clear();
}
}