/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.ec2.postprocess.crawldb;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.ComponentId;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import static org.commoncrawl.util.JSONUtils.*;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
* Map-reduce job that produces a crawl database from the link graph / crawl status
* data emitted by both the LinkGraphDataEmitter job and previous runs of the
* CrawlDBWriter itself.
*
* @author rana
*
*/
public class CrawlDBMergingReducer implements Reducer<TextBytes, TextBytes, TextBytes, TextBytes>, CrawlDBCommon {
static final Log LOG = LogFactory.getLog(CrawlDBMergingReducer.class);
// The crawldb job emits data in the form of a JSON data structure.
// The top-level JSON object optionally contains a link_status object, a summary
// object, and a source_url string.
// The summary object has the properties defined by the SUMMARYRECORD_ prefixed
// constants.
// The link_status object has the properties defined by the LINKSTATUS_ prefixed
// constants.
// The summary object can contain zero to N CrawlDetail objects, one for each
// crawl attempt. The properties of a CrawlDetail object are defined by the
// CRAWLDETAIL_ prefixed constants.
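//
// A hedged sketch of the shape of an emitted top-level record; the property
// names shown are illustrative stand-ins for the CrawlDBCommon constants,
// and all values are made up:
//
//   {
//     "source_url"  : "http://example.com/page",
//     "summary"     : {                 // SUMMARYRECORD_ properties
//       "attempt_count" : 2,
//       "http_result"   : 200,
//       "crawl_detail"  : [ { /* CRAWLDETAIL_ properties */ }, ... ]
//     },
//     "link_status" : {                 // LINKSTATUS_ properties
//       "intradomain_sources" : 12,
//       "extradomain_sources" : 3
//     }
//   }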
///////////////////////////////////////////////////////////////////////////
// EC2 PATHS
///////////////////////////////////////////////////////////////////////////
static final String S3N_BUCKET_PREFIX = "s3n://aws-publicdatasets";
static final String MERGE_INTERMEDIATE_OUTPUT_PATH = "/common-crawl/crawl-db/intermediate/";
static final String MERGE_DB_PATH = "/common-crawl/crawl-db/mergedDB/";
///////////////////////////////////////////////////////////////////////////
// CONSTANTS
///////////////////////////////////////////////////////////////////////////
static final int MAX_TYPE_SAMPLES = 5;
static final int DEFAULT_OUTGOING_URLS_BUFFER_SIZE = 1 << 18; // 262K
static final int DEFAULT_OUTGOING_URLS_BUFFER_PAD_AMOUNT = 16384;
static final int DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_SIZE = 1 << 27; // 134 MB
static final int DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT = 16384;
static final int MAX_EXTERNALLY_REFERENCED_URLS = 100;
//private int OUTGOING_URLS_BUFFER_SIZE = DEFAULT_OUTGOING_URLS_BUFFER_SIZE;
//private int OUTGOING_URLS_BUFFER_PAD_AMOUNT =DEFAULT_OUTGOING_URLS_BUFFER_PAD_AMOUNT;
private int EXT_SOURCE_SAMPLE_BUFFER_SIZE = DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_SIZE;
private int EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT = DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT;
///////////////////////////////////////////////////////////////////////////
// Counters
///////////////////////////////////////////////////////////////////////////
enum Counters {
FAILED_TO_GET_LINKS_FROM_HTML,
NO_HREF_FOR_HTML_LINK,
EXCEPTION_IN_MAP,
GOT_HTML_METADATA,
GOT_FEED_METADATA,
EMITTED_ATOM_LINK,
EMITTED_HTML_LINK,
EMITTED_RSS_LINK,
GOT_PARSED_AS_ATTRIBUTE,
GOT_LINK_OBJECT,
NULL_CONTENT_OBJECT,
NULL_LINKS_ARRAY,
FP_NULL_IN_EMBEDDED_LINK,
SKIPPED_ALREADY_EMITTED_LINK,
FOUND_HTTP_DATE_HEADER,
FOUND_HTTP_AGE_HEADER,
FOUND_HTTP_LAST_MODIFIED_HEADER,
FOUND_HTTP_EXPIRES_HEADER,
FOUND_HTTP_CACHE_CONTROL_HEADER,
FOUND_HTTP_PRAGMA_HEADER,
REDUCER_GOT_LINK,
REDUCER_GOT_STATUS,
ONE_REDUNDANT_LINK_IN_REDUCER,
TWO_REDUNDANT_LINKS_IN_REDUCER,
THREE_REDUNDANT_LINKS_IN_REDUCER,
GT_THREE_REDUNDANT_LINKS_IN_REDUCER,
ONE_REDUNDANT_STATUS_IN_REDUCER,
TWO_REDUNDANT_STATUS_IN_REDUCER,
THREE_REDUNDANT_STATUS_IN_REDUCER,
GT_THREE_REDUNDANT_STATUS_IN_REDUCER,
GOT_RSS_FEED,
GOT_ATOM_FEED,
GOT_ALTERNATE_LINK_FOR_ATOM_ITEM,
GOT_CONTENT_FOR_ATOM_ITEM,
GOT_ITEM_LINK_FROM_RSS_ITEM,
GOT_TOP_LEVEL_LINK_FROM_RSS_ITEM,
GOT_TOP_LEVEL_LINK_FROM_ATOM_ITEM,
EMITTED_REDIRECT_RECORD,
DISCOVERED_NEW_LINK,
GOT_LINK_FOR_ITEM_WITH_STATUS,
FAILED_TO_GET_SOURCE_HREF,
GOT_CRAWL_STATUS_RECORD,
GOT_EXTERNAL_DOMAIN_SOURCE,
NO_SOURCE_URL_FOR_CRAWL_STATUS,
OUTPUT_KEY_FROM_INTERNAL_LINK,
OUTPUT_KEY_FROM_EXTERNAL_LINK,
GOT_HTTP_200_CRAWL_STATUS,
GOT_REDIRECT_CRAWL_STATUS,
BAD_REDIRECT_URL,
GOT_MERGED_RECORD,
MERGED_OBJECT_FIRST_OBJECT,
ADOPTED_SOURCE_SUMMARY_RECORD,
MERGED_SOURCE_SUMMARY_RECORD_INTO_DEST,
ADOPTED_SOURCE_LINKSUMMARY_RECORD,
MERGED_SOURCE_LINKSUMMARY_RECORD_INTO_DEST,
ALLOCATED_TOP_LEVEL_OBJECT_IN_FLUSH,
ENCOUNTERED_EXISTING_TOP_LEVEL_OBJECT_IN_FLUSH,
ENCOUNTERED_SUMMARY_RECORD_IN_FLUSH,
ENCOUNTERED_LINKSUMMARY_RECORD_IN_FLUSH,
EMITTED_SOURCEINPUTS_RECORD,
GOT_NULL_REDIRECT_URL,
INTERDOMAIN_LINKS_LTEQ_100,
INTERDOMAIN_LINKS_LTEQ_1000,
INTERDOMAIN_LINKS_GT_1000,
EMITTED_SOURCEINPUTS_DATA_BYTES_EMITTED,
INPUT_RECORD_COUNT,
ADOPTED_NEW_BLEKKO_METADATA_RECORD,
BLEKKO_METADATA_WITH_NO_SOURCE_CC_RECORD,
MERGE_RECORD_HAS_BLEKKO_METADATA,
EMITTED_RECORD_WITH_BLEKKO_METADATA,
BLEKKO_RECORD_ALREADY_IN_DATABASE,
BLEKKO_CRAWLED_CC_CRAWLED,
BLEKKO_NOT_CRAWLED_CC_CRAWLED
}
///////////////////////////////////////////////////////////////////////////
// Data Members
///////////////////////////////////////////////////////////////////////////
public static final int NUM_HASH_FUNCTIONS = 10;
public static final int NUM_BITS = 11;
public static final int NUM_ELEMENTS = 1 << 26;
public static final int FLUSH_INTERVAL = 1 << 17;
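// Sizing note (an estimate, assuming the URLFPBloomFilter constructor takes
// bits-per-element as its third argument): 2^26 elements * 11 bits is roughly
// 92 MB of filter state; the filter is cleared every FLUSH_INTERVAL (131,072)
// flushed urls to bound the false-positive rate.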
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
// parser
JsonParser _parser = new JsonParser();
// the top level object
JsonObject _topLevelJSONObject;
// the current summary record ...
JsonObject _summaryRecord = null;
// the current link summary record
JsonObject _linkSummaryRecord = null;
// collection of types detected for current url
HashSet<String> _types = new HashSet<String>();
// collection of external references urls in current document
HashSet<String> _extHrefs = new HashSet<String>();
// the url string to use as the output key ...
String _outputKeyString = null;
// freeze url key ...
boolean _urlKeyFrozen = false;
// url object representing the current key
GoogleURL _outputKeyURLObj = null;
// source inputs tracking bloomfilter
URLFPBloomFilter _sourceInputsTrackingFilter;
// a count of the number of urls processed
long _urlsProcessed = 0;
// key used to test bloomfilter
URLFPV2 _bloomFilterKey = new URLFPV2();
// captured job conf
JobConf _conf;
// file system
FileSystem _fs;
// partition id
int _partitionId;
//SequenceFile.Writer _redirectWriter = null;
// input buffer used to collect referencing urls
DataOutputBuffer _sourceInputsBuffer;
// count of referencing domains
int _sourceSampleSize = 0;
// current input key
URLFPV2 _currentKey = null;
// temporary key used to transition input keys
URLFPV2 _tempKey = new URLFPV2();
// cached collector pointer ...
OutputCollector<TextBytes, TextBytes> _outputCollector;
Reporter _reporter;
@Override
public void reduce(TextBytes keyBytes, Iterator<TextBytes> values,OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException {
if (_outputCollector == null) {
_outputCollector = output;
_reporter = reporter;
}
// potentially transition to new url
readFPCheckForTransition(keyBytes,output,reporter);
// extract link type ..
long linkType = CrawlDBKey.getLongComponentFromKey(keyBytes,CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
while (values.hasNext()) {
reporter.incrCounter(Counters.INPUT_RECORD_COUNT, 1);
TextBytes valueBytes = values.next();
//LOG.debug("ValueBytes:"+ valueBytes.toString());
if (linkType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
reporter.incrCounter(Counters.GOT_MERGED_RECORD, 1);
JsonObject mergedObject = _parser.parse(valueBytes.toString()).getAsJsonObject();
if (mergedObject != null) {
setSourceURLFromJSONObject(mergedObject,linkType);
processMergedRecord(mergedObject,_currentKey,reporter);
}
}
else if (linkType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
reporter.incrCounter(Counters.GOT_CRAWL_STATUS_RECORD,1);
try {
JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
if (object != null) {
// update url key if necessary ...
setSourceURLFromJSONObject(object,linkType);
// emit a redirect record if necessary ...
JsonElement redirectObject = object.get("redirect_from");
if (redirectObject != null) {
emitRedirectRecord(object, redirectObject.getAsJsonObject(),output, reporter);
}
// get latest crawl time
long latestCrawlTime = (_summaryRecord != null) ? safeGetLong(_summaryRecord,SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY) : -1;
long attemptTime = safeGetLong(object, "attempt_time");
// if this is the latest crawl event, then we want to track the links associated with this crawl status ...
HashSet<String> extHrefs = (attemptTime > latestCrawlTime) ? _extHrefs : null;
// create a crawl detail record from incoming JSON
JsonObject crawlDetail = crawlDetailRecordFromCrawlStatusRecord(object,_currentKey,extHrefs,reporter);
// add to our list of crawl detail records ...
safeAddCrawlDetailToSummaryRecord(crawlDetail);
// ok, now update summary stats based on incoming crawl detail record ...
updateSummaryRecordFromCrawlDetailRecord(crawlDetail,_currentKey,reporter);
}
}
catch (Exception e) {
LOG.error("Error Parsing JSON:" + valueBytes.toString());
throw new IOException(e);
}
// stop after the first crawl status value for this key; any further
// values under the same key would be duplicates of the same record
break;
}
else if (linkType >= CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() && linkType <= CrawlDBKey.Type.KEY_TYPE_RSS_LINK.ordinal()) {
JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
if (object != null) {
setSourceURLFromJSONObject(object,linkType);
// LOG.debug("Got LinkData:" + JSONUtils.prettyPrintJSON(object));
// ok this is a link ...
updateLinkStatsFromLinkJSONObject(object,_currentKey,reporter);
}
}
else if (linkType == CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE.ordinal()) {
importLinkSourceData(_currentKey, valueBytes);
}
reporter.progress();
}
}
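// Note on ordering: each reduce invocation sees values for a single
// CrawlDBKey (hence one linkType), while readFPCheckForTransition stitches
// together the successive invocations that share a url fingerprint. The
// MERGED_OBJECT_FIRST_OBJECT counter suggests the key sort is expected to
// deliver a previously merged record ahead of fresh status/link records for
// the same url; this is an inference from the counters, not enforced here.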
@Override
public void configure(JobConf job) {
_sourceInputsBuffer = new DataOutputBuffer(EXT_SOURCE_SAMPLE_BUFFER_SIZE);
_sourceInputsTrackingFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
_conf = job;
try {
_fs = FileSystem.get(_conf);
_partitionId = _conf.getInt("mapred.task.partition", 0);
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void close() throws IOException {
flushCurrentRecord(_outputCollector,_reporter);
}
/**
* internal helper - emit a redirect record given a source crawl status record
*
* @param jsonObject
* @param redirectObj
* @param output
* @param reporter
* @throws IOException
*/
void emitRedirectRecord(JsonObject jsonObject,JsonObject redirectObj,OutputCollector<TextBytes, TextBytes> output,Reporter reporter)throws IOException {
// ok first things first, generate a fingerprint for redirect SOURCE
URLFPV2 redirectFP = URLUtils.getURLFPV2FromURL(redirectObj.get("source_url").getAsString());
if (redirectFP == null) {
reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
}
else {
int httpResult = redirectObj.get("http_result").getAsInt();
JsonObject redirectJSON = new JsonObject();
redirectJSON.addProperty("disposition","SUCCESS");
redirectJSON.addProperty("http_result",httpResult);
redirectJSON.addProperty("server_ip",redirectObj.get("server_ip").getAsString());
redirectJSON.addProperty("attempt_time",jsonObject.get("attempt_time").getAsLong());
redirectJSON.addProperty("target_url",jsonObject.get("source_url").getAsString());
redirectJSON.addProperty("source_url",redirectObj.get("source_url").getAsString());
// ok emit the redirect record ...
TextBytes key = CrawlDBKey.generateKey(redirectFP,CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS,jsonObject.get("attempt_time").getAsLong());
LOG.debug("!!!!!!Emitting Redirect Record:" + redirectJSON.toString());
output.collect(key, new TextBytes(redirectJSON.toString()));
reporter.incrCounter(Counters.EMITTED_REDIRECT_RECORD, 1);
//_redirectWriter.append(new TextBytes(redirectObj.get("source_url").getAsString()), new TextBytes(redirectJSON.toString()));
}
}
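// Illustrative sketch for emitRedirectRecord above (all values made up):
// an incoming crawl status record such as
//
//   { "source_url"    : "http://example.com/new",
//     "attempt_time"  : 1325376000000,
//     "redirect_from" : { "source_url"  : "http://example.com/old",
//                         "http_result" : 301,
//                         "server_ip"   : "10.0.0.1" } }
//
// yields a synthetic crawl status record, keyed by the fingerprint of the
// redirect source:
//
//   { "disposition" : "SUCCESS", "http_result" : 301,
//     "server_ip"   : "10.0.0.1", "attempt_time" : 1325376000000,
//     "target_url"  : "http://example.com/new",
//     "source_url"  : "http://example.com/old" }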
/**
* grab date headers and incorporate them into the crawl detail object
*
* @param jsonObject
* @param crawlStatsJSON
*/
static void populateDateHeadersFromJSONObject(JsonObject jsonObject,JsonObject crawlStatsJSON) {
JsonObject headers = jsonObject.getAsJsonObject("http_headers");
if (headers != null) {
JsonElement httpDate = headers.get("date");
JsonElement age = headers.get("age");
JsonElement lastModified = headers.get("last-modified");
JsonElement expires = headers.get("expires");
JsonElement cacheControl = headers.get("cache-control");
JsonElement pragma = headers.get("pragma");
JsonElement etag = headers.get("etag");
if (httpDate != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_DATE_PROPERTY, HttpHeaderInfoExtractor.getTime(httpDate.getAsString()));
}
if (age != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_AGE_PROPERTY, age);
}
if (lastModified != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_LAST_MODIFIED_PROPERTY, HttpHeaderInfoExtractor.getTime(lastModified.getAsString()));
}
if (expires != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_EXPIRES_PROPERTY, HttpHeaderInfoExtractor.getTime(expires.getAsString()));
}
if (cacheControl != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_CACHE_CONTROL_PROPERTY, cacheControl);
}
if (pragma != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_PRAGMA_PROPERTY, pragma);
}
if (etag != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_ETAG_PROPERTY, etag);
}
}
}
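// Example for populateDateHeadersFromJSONObject above (hedged; the header
// values are made up and the CRAWLDETAIL_ names are paraphrased): given
//
//   "http_headers" : { "date"          : "Tue, 01 May 2012 00:00:00 GMT",
//                      "cache-control" : "max-age=3600",
//                      "etag"          : "\"abc123\"" }
//
// the date header is parsed to epoch millis via
// HttpHeaderInfoExtractor.getTime() (as are last-modified and expires),
// while age, cache-control, pragma, and etag are copied through as-is.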
/**
* compute the min/max publication times and item count across feed items and
* record them on the given crawl stats record
*
* @param contentObj
* @param crawlStatsJSON
*/
static void addMinMaxFeedItemTimes(JsonObject contentObj,JsonObject crawlStatsJSON) {
JsonArray items = contentObj.getAsJsonArray("items");
if (items != null) {
long minPubDate = -1L;
long maxPubDate = -1L;
int itemCount = 0;
for (JsonElement item : items) {
long pubDateValue = -1;
JsonElement pubDate = item.getAsJsonObject().get("published");
if (pubDate != null) {
pubDateValue = pubDate.getAsLong();
}
JsonElement updateDate = item.getAsJsonObject().get("updated");
if (updateDate != null) {
if (updateDate.getAsLong() > pubDateValue) {
pubDateValue = updateDate.getAsLong();
}
}
if (minPubDate == -1L || pubDateValue < minPubDate) {
minPubDate = pubDateValue;
}
if (maxPubDate == -1L || pubDateValue > maxPubDate) {
maxPubDate = pubDateValue;
}
itemCount++;
}
crawlStatsJSON.addProperty(RSS_MIN_PUBDATE_PROPERTY,minPubDate);
crawlStatsJSON.addProperty(RSS_MAX_PUBDATE_PROPERTY,maxPubDate);
crawlStatsJSON.addProperty(RSS_ITEM_COUNT_PROPERTY,itemCount);
}
}
/**
* we need to extract the source url from the JSON because it is not available
* via the key
*
* @param jsonObject
* @param keyType
*/
void setSourceURLFromJSONObject(JsonObject jsonObject, long keyType) {
if (!_urlKeyFrozen) {
JsonElement sourceElement = jsonObject.get("source_url");
if (keyType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
_outputKeyString = sourceElement.getAsString();
_outputKeyURLObj = new GoogleURL(_outputKeyString);
JsonElement httpResultElem = jsonObject.get("http_result");
if (httpResultElem != null) {
int httpResult = httpResultElem.getAsInt();
// a successful (2xx) fetch carries an authoritative source url, so
// freeze the key once it parses as a valid url
if (httpResult >= 200 && httpResult <= 299 && _outputKeyURLObj.isValid()) {
_urlKeyFrozen = true;
}
}
}
else if (keyType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
_outputKeyString = sourceElement.getAsString();
_outputKeyURLObj = new GoogleURL(_outputKeyString);
_urlKeyFrozen = true;
}
else if (keyType >= CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() && keyType <= CrawlDBKey.Type.KEY_TYPE_RSS_LINK.ordinal()) {
if (_outputKeyString == null) {
JsonElement hrefElement = jsonObject.get("href");
if (sourceElement != null && hrefElement != null) {
GoogleURL hrefSource = new GoogleURL(sourceElement.getAsString());
if (hrefSource.isValid()) {
_outputKeyString = hrefElement.getAsString();
_outputKeyURLObj = new GoogleURL(_outputKeyString);
}
}
}
}
}
}
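// Key selection precedence implemented above: a previously merged record
// freezes the output key immediately; a crawl status record always supplies
// a key and freezes it on a valid 2xx fetch; link records only supply a key
// (the link href) when nothing else has set one yet.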
void mergeBlekkoMetadata(JsonObject newBlekkoMetadata,JsonObject existingTopLevelObj,Reporter reporter) {
if (newBlekkoMetadata != null) {
if (!existingTopLevelObj.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
existingTopLevelObj.add(TOPLEVEL_BLEKKO_METADATA_PROPERTY,newBlekkoMetadata);
}
else {
JsonObject existingBlekkoMetadata = existingTopLevelObj.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY);
long existingTimestamp = existingBlekkoMetadata.get(BLEKKO_METADATA_TIMESTAMP_PROPERTY).getAsLong();
long newTimestamp = newBlekkoMetadata.get(BLEKKO_METADATA_TIMESTAMP_PROPERTY).getAsLong();
if (newTimestamp > existingTimestamp){
existingTopLevelObj.add(TOPLEVEL_BLEKKO_METADATA_PROPERTY, newBlekkoMetadata);
reporter.incrCounter(Counters.ADOPTED_NEW_BLEKKO_METADATA_RECORD, 1);
}
}
}
}
void mergeLinkRecords(JsonObject sourceRecord,JsonObject topLevelJSONObject,Reporter reporter) {
JsonElement destRecord = topLevelJSONObject.get(TOPLEVEL_LINKSTATUS_PROPERTY);
if (destRecord == null) {
if (sourceRecord != null) {
reporter.incrCounter(Counters.ADOPTED_SOURCE_LINKSUMMARY_RECORD, 1);
topLevelJSONObject.add(TOPLEVEL_LINKSTATUS_PROPERTY,sourceRecord);
JsonArray typeAndRels = sourceRecord.getAsJsonArray(LINKSTATUS_TYPEANDRELS_PROPERTY);
if (typeAndRels != null) {
for (JsonElement typeAndRel : typeAndRels) {
_types.add(typeAndRel.getAsString());
}
}
}
}
else {
if (sourceRecord != null) {
reporter.incrCounter(Counters.MERGED_SOURCE_LINKSUMMARY_RECORD_INTO_DEST, 1);
safeIncrementJSONCounter(destRecord.getAsJsonObject(),LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY,sourceRecord.get(LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY));
safeIncrementJSONCounter(destRecord.getAsJsonObject(),LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY,sourceRecord.get(LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY));
safeSetMinLongValue(destRecord.getAsJsonObject(),LINKSTATUS_EARLIEST_DATE_PROPERTY,sourceRecord.get(LINKSTATUS_EARLIEST_DATE_PROPERTY));
safeSetMaxLongValue(destRecord.getAsJsonObject(),LINKSTATUS_LATEST_DATE_PROPERTY,sourceRecord.get(LINKSTATUS_LATEST_DATE_PROPERTY));
JsonArray typeAndRels = sourceRecord.getAsJsonArray(LINKSTATUS_TYPEANDRELS_PROPERTY);
if (typeAndRels != null) {
for (JsonElement typeAndRel : typeAndRels) {
_types.add(typeAndRel.getAsString());
}
}
}
}
}
/**
* merge two crawl summary records
* @param incomingRecord
* @param topLevelJSONObject
* @param reporter
* @throws IOException
*/
void mergeSummaryRecords(JsonObject incomingRecord,JsonObject topLevelJSONObject,Reporter reporter)throws IOException {
JsonObject destinationSummaryRecord = topLevelJSONObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
if (destinationSummaryRecord == null) {
if (incomingRecord != null) {
reporter.incrCounter(Counters.ADOPTED_SOURCE_SUMMARY_RECORD, 1);
// adopt source ...
topLevelJSONObject.add(TOPLEVEL_SUMMARYRECORD_PROPRETY,incomingRecord);
_summaryRecord = incomingRecord;
}
}
else {
if (incomingRecord != null) {
reporter.incrCounter(Counters.MERGED_SOURCE_SUMMARY_RECORD_INTO_DEST, 1);
// walk crawl detail records in incoming record and merge them into destination record ...
JsonElement crawlStatsArray = incomingRecord.get(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
if (crawlStatsArray != null) {
for (JsonElement crawlDetail : crawlStatsArray.getAsJsonArray()) {
// add to our list of crawl detail records ...
safeAddCrawlDetailToSummaryRecord(crawlDetail.getAsJsonObject());
// ok, now update summary stats based on incoming crawl detail record ...
updateSummaryRecordFromCrawlDetailRecord(crawlDetail.getAsJsonObject(),_currentKey,reporter);
}
}
}
}
}
/**
* for the current url, merge the currently accumulated information with a previously generated crawl summary record
* @param jsonObject
* @param destFP
* @param reporter
* @throws IOException
*/
void processMergedRecord(JsonObject jsonObject,URLFPV2 destFP,Reporter reporter)throws IOException {
if (jsonObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
reporter.incrCounter(Counters.MERGE_RECORD_HAS_BLEKKO_METADATA, 1);
}
if (_topLevelJSONObject == null) {
reporter.incrCounter(Counters.MERGED_OBJECT_FIRST_OBJECT, 1);
_topLevelJSONObject = jsonObject;
_summaryRecord = jsonObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
_linkSummaryRecord = jsonObject.getAsJsonObject(TOPLEVEL_LINKSTATUS_PROPERTY);
if (_linkSummaryRecord != null) {
// read in type and rels collection ...
safeJsonArrayToStringCollection(_linkSummaryRecord,LINKSTATUS_TYPEANDRELS_PROPERTY, _types);
}
// and ext hrefs ..
if (_summaryRecord != null) {
safeJsonArrayToStringCollection(_summaryRecord, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS,_extHrefs);
}
// special blekko import stats
if (_topLevelJSONObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
if (_summaryRecord == null && _linkSummaryRecord == null) {
reporter.incrCounter(Counters.BLEKKO_METADATA_WITH_NO_SOURCE_CC_RECORD, 1);
}
}
}
else {
mergeSummaryRecords(jsonObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY),_topLevelJSONObject,reporter);
mergeLinkRecords(jsonObject.getAsJsonObject(TOPLEVEL_LINKSTATUS_PROPERTY),_topLevelJSONObject,reporter);
mergeBlekkoMetadata(jsonObject.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY),_topLevelJSONObject,reporter);
}
}
/**
* given an incoming link record, track the link source, update stats, and
* capture document type information (if available via the href).
*
* @param jsonObject
* @param destFP
* @param reporter
* @throws IOException
*/
void updateLinkStatsFromLinkJSONObject(JsonObject jsonObject,URLFPV2 destFP,Reporter reporter) throws IOException {
JsonElement sourceElement = jsonObject.get("source_url");
JsonElement hrefElement = jsonObject.get("href");
if (sourceElement != null && hrefElement != null) {
//LOG.info("source:" + sourceElement.getAsString() + " href:" + hrefElement.getAsString());
GoogleURL sourceURLObj = new GoogleURL(sourceElement.getAsString());
if (sourceURLObj.isValid()) {
if (_linkSummaryRecord == null) {
_linkSummaryRecord = new JsonObject();
}
// ok, first compare known host name with incoming link host name ...
// if not a match then ...
if (!_outputKeyURLObj.getHost().equals(sourceURLObj.getHost())) {
// ok now deeper check ...
URLFPV2 sourceFP = URLUtils.getURLFPV2FromURLObject(sourceURLObj);
if (sourceFP != null) {
reporter.incrCounter(Counters.GOT_EXTERNAL_DOMAIN_SOURCE, 1);
// increment external source count
safeIncrementJSONCounter(_linkSummaryRecord,LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY);
//LOG.info("sourceFP:" + sourceFP.getKey() + " hrefFP:" + destFP.getKey());
// ok track sources if from a different root domain (for now)
if (sourceFP.getRootDomainHash() != destFP.getRootDomainHash()) {
trackPotentialLinkSource(sourceFP,sourceElement.getAsString(),destFP);
}
}
}
// otherwise, count it as an internal link
else {
// internal for sure ...
safeIncrementJSONCounter(_linkSummaryRecord,LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY);
}
JsonObject sourceHeaders = jsonObject.getAsJsonObject("source_headers");
if (sourceHeaders != null) {
long httpDate = safeGetHttpDate(sourceHeaders, "date");
long lastModified = safeGetHttpDate(sourceHeaders, "last-modified");
// take the earlier of the date / last-modified headers, falling back to
// last-modified when no date header is present
if (lastModified != -1 && (httpDate == -1 || lastModified < httpDate))
httpDate = lastModified;
if (httpDate != -1L) {
safeSetMinLongValue(_linkSummaryRecord, LINKSTATUS_EARLIEST_DATE_PROPERTY, httpDate);
safeSetMaxLongValue(_linkSummaryRecord, LINKSTATUS_LATEST_DATE_PROPERTY, httpDate);
}
}
JsonElement typeElement = jsonObject.get("type");
JsonElement relElement = jsonObject.get("rel");
String sourceTypeAndRel = jsonObject.get("source_type").getAsString() + ":";
if (typeElement != null) {
sourceTypeAndRel += typeElement.getAsString();
}
if (relElement != null) {
sourceTypeAndRel += ":" + relElement.getAsString();
}
if (_types.size() < MAX_TYPE_SAMPLES)
_types.add(sourceTypeAndRel);
}
}
}
/**
* take linking href data and add it to our list of incoming hrefs
* (used during the intermediate merge process)
*
* @param destFP
* @param inputData
* @throws IOException
*/
void importLinkSourceData(URLFPV2 destFP,TextBytes inputData) throws IOException {
TextBytes urlText = new TextBytes();
int curpos = inputData.getOffset();
int endpos = inputData.getOffset() + inputData.getLength();
byte lfPattern[] = { 0xA };
byte tabPattern[] = { 0x9 };
while (curpos != endpos) {
int tabIndex = ByteArrayUtils.indexOf(inputData.getBytes(), curpos, endpos - curpos, tabPattern);
if (tabIndex == -1) {
break;
}
else {
int lfIndex = ByteArrayUtils.indexOf(inputData.getBytes(), tabIndex + 1, endpos - (tabIndex + 1), lfPattern);
if (lfIndex == -1) {
break;
}
else {
long sourceDomainHash = ByteArrayUtils.parseLong(inputData.getBytes(),curpos, tabIndex-curpos, 10);
urlText.set(inputData.getBytes(),tabIndex + 1,lfIndex - (tabIndex + 1));
URLFPV2 bloomKey = sourceKeyFromSourceAndDest(sourceDomainHash,destFP.getUrlHash());
if (!_sourceInputsTrackingFilter.isPresent(bloomKey)) {
// if not, check to see that we are not about to overflow sample buffer ...
if (_sourceInputsBuffer.getLength() < EXT_SOURCE_SAMPLE_BUFFER_SIZE - EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT) {
_sourceInputsBuffer.write(inputData.getBytes(),curpos,(lfIndex + 1) - curpos);
_sourceSampleSize++;
}
}
curpos = lfIndex + 1;
}
}
}
}
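// Wire format of an incoming-urls sample, as parsed above and as written by
// trackPotentialLinkSource below: newline-delimited records of the form
//
//   <source domain hash (decimal long)> TAB <source url (UTF-8)> LF
//
// e.g. (made-up values): "-1234567890123456789\thttp://example.org/linker\n"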
/**
* given an incoming link for a given url, store it in an accumulation buffer
* IFF we have not seen a url from the given domain before
*
* @param sourceFP
* @param sourceURL
* @param destFP
* @throws IOException
*/
void trackPotentialLinkSource(URLFPV2 sourceFP,String sourceURL,URLFPV2 destFP) throws IOException {
URLFPV2 bloomKey = sourceKeyFromSourceAndDest(sourceFP.getDomainHash(),destFP.getUrlHash());
// check to see if we have collected a sample for this source domain / destination url combo or not ...
if (!_sourceInputsTrackingFilter.isPresent(bloomKey)) {
LOG.debug("sourceFP:" + sourceFP.getKey() + " passed BloomFilter Test");
// if not, check to see that we are not about to overflow sample buffer ...
if (_sourceInputsBuffer.getLength() < EXT_SOURCE_SAMPLE_BUFFER_SIZE - EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT) {
// ok store the external reference sample ...
// write source domain hash
_sourceInputsBuffer.write(Long.toString(sourceFP.getDomainHash()).getBytes());
// delimiter
_sourceInputsBuffer.write(0x09);// TAB
// and source url ...
_sourceInputsBuffer.write(sourceURL.getBytes(Charset.forName("UTF-8")));
_sourceInputsBuffer.write(0x0A);// LF
_sourceSampleSize++;
// add to bloom filter ...
_sourceInputsTrackingFilter.add(bloomKey);
}
}
else {
LOG.debug("sourceFP:" + sourceFP.getKey() + " failed BloomFilter Test");
}
}
/**
* construct a (hacked) fingerprint key combining a source domain hash and a
* destination url hash, used solely for setting bits in a bloomfilter
*
* @param sourceDomain
* @param destURLHash
* @return
*/
private URLFPV2 sourceKeyFromSourceAndDest(long sourceDomain,long destURLHash) {
_bloomFilterKey.setDomainHash(sourceDomain);
_bloomFilterKey.setUrlHash(destURLHash);
return _bloomFilterKey;
}
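// NOTE: sourceKeyFromSourceAndDest reuses the single _bloomFilterKey
// instance and only populates its domain and url hash fields, so callers
// must consume the returned key immediately and never retain it across
// calls.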
/**
* construct a crawl detail record from an incoming crawl status JSON record
*
* @param jsonObject
* @param fpSource
* @param extHRefs
* @param reporter
* @return
* @throws IOException
*/
static JsonObject crawlDetailRecordFromCrawlStatusRecord(JsonObject jsonObject,URLFPV2 fpSource,HashSet<String> extHRefs,Reporter reporter)throws IOException {
String disposition = jsonObject.get("disposition").getAsString();
long attemptTime = jsonObject.get("attempt_time").getAsLong();
// inject all the details into a JSONObject
JsonObject crawlStatsJSON = new JsonObject();
crawlStatsJSON.addProperty(CRAWLDETAIL_ATTEMPT_TIME_PROPERTY, attemptTime);
if (disposition.equals("SUCCESS")) {
// basic stats ... starting with crawl time ...
int httpResult = jsonObject.get("http_result").getAsInt();
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTPRESULT_PROPERTY,httpResult);
crawlStatsJSON.addProperty(CRAWLDETAIL_SERVERIP_PROPERTY, jsonObject.get("server_ip").getAsString());
//populate date headers ...
populateDateHeadersFromJSONObject(jsonObject,crawlStatsJSON);
// if http 200 ...
if (httpResult >= 200 && httpResult <= 299) {
reporter.incrCounter(Counters.GOT_HTTP_200_CRAWL_STATUS,1);
crawlStatsJSON.addProperty(CRAWLDETAIL_CONTENTLEN_PROPERTY,jsonObject.get("content_len").getAsInt());
if (jsonObject.get("mime_type") != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_MIMETYPE_PROPERTY,jsonObject.get("mime_type").getAsString());
}
if (jsonObject.get("md5") != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_MD5_PROPERTY,jsonObject.get("md5").getAsString());
}
if (jsonObject.get("text_simhash") != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_TEXTSIMHASH_PROPERTY,jsonObject.get("text_simhash").getAsLong());
}
JsonElement parsedAs = jsonObject.get("parsed_as");
if (parsedAs != null) {
// populate some info based on type ...
crawlStatsJSON.addProperty(CRAWLDETAIL_PARSEDAS_PROPERTY,parsedAs.getAsString());
String parsedAsString = parsedAs.getAsString();
// if html ...
if (parsedAsString.equals("html")) {
JsonObject content = jsonObject.get("content").getAsJsonObject();
if (content != null) {
JsonElement titleElement = content.get("title");
JsonElement metaElement = content.get("meta_tags");
if (titleElement != null) {
crawlStatsJSON.add(CRAWLDETAIL_TITLE_PROPERTY, titleElement);
}
if (metaElement != null) {
crawlStatsJSON.add(CRAWLDETAIL_METATAGS_PROPERTY, metaElement);
}
// collect link stats for json ...
updateLinkStatsFromHTMLContent(crawlStatsJSON,jsonObject,extHRefs,fpSource,reporter);
}
}
// if feed ...
else if (parsedAsString.equals("feed")) {
// get content ...
JsonObject content = jsonObject.get("content").getAsJsonObject();
JsonElement titleElement = content.get("title");
if (titleElement != null) {
crawlStatsJSON.add(CRAWLDETAIL_TITLE_PROPERTY, titleElement);
}
// set update time ...
long updateTime = safeGetLong(content, "updated");
if (updateTime != -1) {
crawlStatsJSON.addProperty(CRAWLDETAIL_UPDATED_PROPERTY, updateTime);
}
addMinMaxFeedItemTimes(content,crawlStatsJSON);
}
}
}
// redirect ...
else if (httpResult >=300 && httpResult <= 399) {
reporter.incrCounter(Counters.GOT_REDIRECT_CRAWL_STATUS,1);
// get the target url ...
JsonElement targetURL = jsonObject.get("target_url");
if (targetURL != null) {
// redirect details ...
crawlStatsJSON.addProperty(CRAWLDETAIL_REDIRECT_URL, targetURL.getAsString());
}
else {
reporter.incrCounter(Counters.GOT_NULL_REDIRECT_URL, 1);
}
}
}
else {
// inject all the details into a JSONObject
// basic stats ... starting with crawl time ...
crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE,true);
crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE_REASON,safeGetStringFromElement(jsonObject,"failure_reason"));
crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE_DETAIL,safeGetStringFromElement(jsonObject,"failure_detail"));
}
return crawlStatsJSON;
}
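// Hedged example of the detail record built above for a successful html
// fetch (the CRAWLDETAIL_ property names are paraphrased, values made up):
//
//   { "attempt_time" : 1325376000000, "http_result" : 200,
//     "server_ip" : "10.0.0.1", "content_len" : 53812,
//     "mime_type" : "text/html", "parsed_as" : "html",
//     "title" : "Example Page",
//     "intradomain_links" : 40, "intraroot_links" : 5,
//     "interdomain_links" : 12 }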
/**
* given a crawl detail json record, update summary record stats
*
* @param crawlDetailRecord
* @param fpSource
* @param reporter
* @throws IOException
*/
void updateSummaryRecordFromCrawlDetailRecord(JsonObject crawlDetailRecord,URLFPV2 fpSource,Reporter reporter) throws IOException {
if (_summaryRecord == null) {
_summaryRecord = new JsonObject();
}
boolean failure = safeGetBoolean(crawlDetailRecord,CRAWLDETAIL_FAILURE);
long attemptTime = crawlDetailRecord.get(CRAWLDETAIL_ATTEMPT_TIME_PROPERTY).getAsLong();
// set latest attempt time ...
long latestAttemptTime = safeSetMaxLongValue(_summaryRecord,SUMMARYRECORD_LATEST_ATTEMPT_PROPERTY,attemptTime);
// increment attempt count
safeIncrementJSONCounter(_summaryRecord,SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY);
// if this is the latest attempt ...
if (latestAttemptTime == attemptTime) {
// add latest http result to summary
if (!failure && crawlDetailRecord.has(CRAWLDETAIL_HTTPRESULT_PROPERTY)) {
int httpResult = crawlDetailRecord.get(CRAWLDETAIL_HTTPRESULT_PROPERTY).getAsInt();
// set last http result
_summaryRecord.addProperty(SUMMARYRECORD_HTTP_RESULT_PROPERTY,httpResult);
if (httpResult >= 200 && httpResult <= 299) {
// update the crawl timestamp
_summaryRecord.addProperty(SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY,attemptTime);
// and the crawl count ....
safeIncrementJSONCounter(_summaryRecord,SUMMARYRECORD_CRAWLCOUNT_PROPERTY);
// update parsed as
if (crawlDetailRecord.has(CRAWLDETAIL_PARSEDAS_PROPERTY)) {
_summaryRecord.addProperty(SUMMARYRECORD_PARSEDAS_PROPERTY, safeGetStringFromElement(crawlDetailRecord,CRAWLDETAIL_PARSEDAS_PROPERTY));
}
}
else if (httpResult >=300 && httpResult <= 399) {
if (crawlDetailRecord.has(CRAWLDETAIL_REDIRECT_URL)) {
_summaryRecord.addProperty(SUMMARYRECORD_REDIRECT_URL_PROPERTY, safeGetStringFromElement(crawlDetailRecord,CRAWLDETAIL_REDIRECT_URL));
}
}
}
}
}
/**
* given html content (as a json object), extract out-of-domain hrefs, cache
* them, and update stats
* @param crawlStats
* @param incomingJSONObject
* @param extHRefs
* @param fpSource
* @param reporter
*/
static void updateLinkStatsFromHTMLContent(JsonObject crawlStats,JsonObject incomingJSONObject,HashSet<String> extHRefs,URLFPV2 fpSource,Reporter reporter) {
JsonArray links = incomingJSONObject.getAsJsonArray("links");
if (links == null) {
reporter.incrCounter(Counters.NULL_LINKS_ARRAY, 1);
}
else {
// clear our snapshot of externally referenced urls
// we only want to capture this information from
// the links extracted via the latest content
if (extHRefs != null)
extHRefs.clear();
int intraDomainLinkCount = 0;
int intraRootLinkCount = 0;
int interDomainLinkCount = 0;
for (JsonElement link : links) {
JsonObject linkObj = link.getAsJsonObject();
if (linkObj != null && linkObj.has("href")) {
String href = linkObj.get("href").getAsString();
GoogleURL urlObject = new GoogleURL(href);
if (urlObject.isValid()) {
URLFPV2 linkFP = URLUtils.getURLFPV2FromURLObject(urlObject);
if (linkFP != null) {
if (linkFP.getRootDomainHash() == fpSource.getRootDomainHash()) {
if (linkFP.getDomainHash() == fpSource.getDomainHash()) {
intraDomainLinkCount ++;
}
else {
intraRootLinkCount ++;
}
}
else {
interDomainLinkCount++;
// track domains we link to
if (extHRefs != null) {
if (extHRefs.size() <= MAX_EXTERNALLY_REFERENCED_URLS) {
extHRefs.add(urlObject.getCanonicalURL());
}
}
}
}
}
}
}
// update counts in crawl stats data structure ...
crawlStats.addProperty(CRAWLDETAIL_INTRADOMAIN_LINKS, intraDomainLinkCount);
crawlStats.addProperty(CRAWLDETAIL_INTRAROOT_LINKS, intraRootLinkCount);
crawlStats.addProperty(CRAWLDETAIL_INTERDOMAIN_LINKS, interDomainLinkCount);
if (interDomainLinkCount <= 100) {
reporter.incrCounter(Counters.INTERDOMAIN_LINKS_LTEQ_100, 1);
}
else if (interDomainLinkCount <= 1000) {
reporter.incrCounter(Counters.INTERDOMAIN_LINKS_LTEQ_1000, 1);
}
else {
reporter.incrCounter(Counters.INTERDOMAIN_LINKS_GT_1000, 1);
}
}
}
/**
* flush currently accumulated JSON record
*
* @param output
* @param reporter
* @throws IOException
*/
private void flushCurrentRecord(OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException {
_urlsProcessed++;
if (_outputKeyString == null || !_outputKeyURLObj.isValid()) {
if (reporter != null) {
reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
}
}
else {
if (_topLevelJSONObject != null || _summaryRecord != null || _linkSummaryRecord != null) {
if (_topLevelJSONObject == null) {
reporter.incrCounter(Counters.ALLOCATED_TOP_LEVEL_OBJECT_IN_FLUSH, 1);
_topLevelJSONObject = new JsonObject();
_topLevelJSONObject.addProperty(TOPLEVEL_SOURCE_URL_PROPRETY,_outputKeyString);
}
else {
reporter.incrCounter(Counters.ENCOUNTERED_EXISTING_TOP_LEVEL_OBJECT_IN_FLUSH, 1);
}
if (_summaryRecord != null) {
_summaryRecord.remove(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS);
_summaryRecord.remove(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS_TRUNCATED);
if (_extHrefs.size() != 0) {
// output links in the top level object ...
stringCollectionToJsonArrayWithMax(_summaryRecord, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, _extHrefs,MAX_EXTERNALLY_REFERENCED_URLS);
if (_extHrefs.size() > MAX_EXTERNALLY_REFERENCED_URLS) {
_summaryRecord.addProperty(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS_TRUNCATED,true);
}
}
reporter.incrCounter(Counters.ENCOUNTERED_SUMMARY_RECORD_IN_FLUSH, 1);
_topLevelJSONObject.add(TOPLEVEL_SUMMARYRECORD_PROPRETY, _summaryRecord);
}
if (_linkSummaryRecord != null) {
reporter.incrCounter(Counters.ENCOUNTERED_LINKSUMMARY_RECORD_IN_FLUSH, 1);
if (_types != null && _types.size() != 0) {
stringCollectionToJsonArray(_linkSummaryRecord,LINKSTATUS_TYPEANDRELS_PROPERTY,_types);
}
_topLevelJSONObject.add(TOPLEVEL_LINKSTATUS_PROPERTY, _linkSummaryRecord);
}
//System.out.println("Emitting Key:" + CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, 0));
if (_topLevelJSONObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
JsonObject blekkoMetadata = _topLevelJSONObject.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY);
reporter.incrCounter(Counters.EMITTED_RECORD_WITH_BLEKKO_METADATA, 1);
if (_linkSummaryRecord != null || _summaryRecord != null ) {
reporter.incrCounter(Counters.BLEKKO_RECORD_ALREADY_IN_DATABASE, 1);
if (_summaryRecord != null) {
if (_summaryRecord.has(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY)
&& _summaryRecord.get(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt() != 0) {
String status = blekkoMetadata.get(BLEKKO_METADATA_STATUS).getAsString();
if (status.equalsIgnoreCase("crawled")) {
reporter.incrCounter(Counters.BLEKKO_CRAWLED_CC_CRAWLED, 1);
}
else {
reporter.incrCounter(Counters.BLEKKO_NOT_CRAWLED_CC_CRAWLED, 1);
}
}
}
}
}
// output top level record ...
output.collect(CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, 0),new TextBytes(_topLevelJSONObject.toString()));
// if there is link status available ...
if (_sourceSampleSize != 0) {
reporter.incrCounter(Counters.EMITTED_SOURCEINPUTS_RECORD, 1);
TextBytes sourceInputsText = new TextBytes();
sourceInputsText.set(_sourceInputsBuffer.getData(),0,_sourceInputsBuffer.getLength());
//System.out.println("Emitting Key:" + CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE, 0));
output.collect(CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE, 0),sourceInputsText);
reporter.incrCounter(Counters.EMITTED_SOURCEINPUTS_DATA_BYTES_EMITTED, sourceInputsText.getLength());
}
}
if (_urlsProcessed % FLUSH_INTERVAL == 0) {
_sourceInputsTrackingFilter.clear();
}
}
_sourceInputsBuffer.reset();
_sourceSampleSize = 0;
_topLevelJSONObject = null;
_summaryRecord = null;
_linkSummaryRecord = null;
_types.clear();
_extHrefs.clear();
_outputKeyString = null;
_urlKeyFrozen = false;
_outputKeyURLObj = null;
}
/**
* Extract the fingerprint from the incoming key and potentially trigger a flush if it is indicative of a
* primary key transition
* @param key
* @param output
* @param reporter
* @throws IOException
*/
private void readFPCheckForTransition(TextBytes key,OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException {
if (_tempKey == null) {
_tempKey = new URLFPV2();
}
_tempKey.setRootDomainHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
_tempKey.setDomainHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.DOMAIN_HASH_COMPONENT_ID));
_tempKey.setUrlHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.URL_HASH_COMPONENT_ID));
if (_currentKey == null) {
_currentKey = _tempKey;
_tempKey = null;
}
else {
// check for key transition ...
if (_currentKey.compareTo(_tempKey) != 0) {
// transition
flushCurrentRecord(output,reporter);
// swap keys ...
URLFPV2 oldKey = _currentKey;
_currentKey = _tempKey;
_tempKey = oldKey;
}
}
}
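// Illustrative flow for readFPCheckForTransition above (fingerprints
// abbreviated): given successive reduce keys
//
//   (fpA, KEY_TYPE_MERGED_RECORD), (fpA, KEY_TYPE_CRAWL_STATUS),
//   (fpA, KEY_TYPE_HTML_LINK), (fpB, KEY_TYPE_CRAWL_STATUS)
//
// the first three share fingerprint fpA and accumulate into the current
// record; the fpB key triggers flushCurrentRecord() and the key swap.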
/**
* add a crawl detail to the summary record, constructing a summary record if none exists ...
*
* @param crawlStatsJSON
*/
void safeAddCrawlDetailToSummaryRecord(JsonObject crawlStatsJSON) {
if (_summaryRecord == null) {
_summaryRecord = new JsonObject();
}
// construct crawl stats array if necessary
JsonArray crawlStatsArray = _summaryRecord.getAsJsonArray(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
if (crawlStatsArray == null) {
crawlStatsArray = new JsonArray();
_summaryRecord.add(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY, crawlStatsArray);
}
// add crawl stats to it
crawlStatsArray.add(crawlStatsJSON);
}
/**
* scan the merge db path and find the latest crawl database timestamp
*
* @param fs
* @param conf
* @return
* @throws IOException
*/
static long findLatestMergeDBTimestamp(FileSystem fs,Configuration conf)throws IOException {
long timestampOut = -1L;
FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_DB_PATH,"[0-9]*"));
for (FileStatus candidate : files) {
Path successPath = new Path(candidate.getPath(),"_SUCCESS");
if (fs.exists(successPath)) {
long timestamp = Long.parseLong(candidate.getPath().getName());
timestampOut = Math.max(timestamp, timestampOut);
}
}
return timestampOut;
}
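// Expected layout, sketched from the glob and _SUCCESS check above
// (timestamps are made-up example values):
//
//   s3n://aws-publicdatasets/common-crawl/crawl-db/mergedDB/1335000000000/_SUCCESS
//   s3n://aws-publicdatasets/common-crawl/crawl-db/mergedDB/1336000000000/_SUCCESS
//
// findLatestMergeDBTimestamp would return 1336000000000 here.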
/**
* iterate the intermediate link graph data and extract the unmerged set ...
*
* @param fs
* @param conf
* @param latestMergeDBTimestamp
* @return
* @throws IOException
*/
static List<Path> filterMergeCandidates(FileSystem fs, Configuration conf, long latestMergeDBTimestamp) throws IOException {
ArrayList<Path> list = new ArrayList<Path>();
FileStatus candidates[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH,"[0-9]*"));
for (FileStatus candidate : candidates) {
LOG.info("Found Merge Candidate:" + candidate.getPath());
long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
if (candidateTimestamp > latestMergeDBTimestamp) {
Path successPath = new Path(candidate.getPath(),"_SUCCESS");
if (fs.exists(successPath)) {
list.add(candidate.getPath());
}
else {
LOG.info("Rejected Merge Candidate:" + candidate.getPath());
}
}
}
return list;
}
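// Hypothetical driver usage of the two helpers above (a sketch; the actual
// job setup lives outside this class):
//
//   FileSystem fs = FileSystem.get(conf);
//   long latest = findLatestMergeDBTimestamp(fs, conf);
//   List<Path> inputs = filterMergeCandidates(fs, conf, latest);
//   // inputs now holds every completed intermediate output newer than the
//   // last merged database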
///////////////////////////////////////////////////////////////////////////
// TEST CODE
///////////////////////////////////////////////////////////////////////////
/*
// PARK THIS CODE FOR NOW SINCE WE ARE TRANSFERRING DATA PROCESSING TO EC2
if (_skipPartition)
return;
// collect all incoming paths first
Vector<Path> incomingPaths = new Vector<Path>();
while(values.hasNext()){
String path = values.next().toString();
LOG.info("Found Incoming Path:" + path);
incomingPaths.add(new Path(path));
}
FlexBuffer scanArray[] = LinkKey.allocateScanArray();
// set up merge attributes
Configuration localMergeConfig = new Configuration(_conf);
localMergeConfig.setClass(
MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
LinkKeyGroupingComparator.class, RawComparator.class);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS,
TextBytes.class, WritableComparable.class);
// ok now spawn merger
MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
_fs, incomingPaths, localMergeConfig);
TextBytes keyBytes = new TextBytes();
TextBytes valueBytes = new TextBytes();
DataInputBuffer inputBuffer = new DataInputBuffer();
int processedKeysCount = 0;
Pair<KeyAndValueData<TextBytes>,Iterable<RawRecordValue>> nextItem = null;
while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
urlsProcessed++;
_sourceInputsBuffer.reset();
_sourceSampleSize = 0;
summaryRecord = null;
linkSummaryRecord = null;
types.clear();
outputKeyString = null;
outputKeyFromInternalLink = false;
outputKeyURLObj = null;
extLinkedDomains.clear();
int statusCount = 0;
int linkCount = 0;
// scan key components
LinkKey.scanForComponents(nextItem.e0._keyObject, ':',scanArray);
// pick up source fp from key ...
URLFPV2 fpSource = new URLFPV2();
fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,LinkKey.ComponentId.URL_HASH_COMPONENT_ID));
for (RawRecordValue rawValue: nextItem.e1) {
inputBuffer.reset(rawValue.key.getData(),0,rawValue.key.getLength());
int length = WritableUtils.readVInt(inputBuffer);
keyBytes.set(rawValue.key.getData(),inputBuffer.getPosition(),length);
inputBuffer.reset(rawValue.data.getData(),0,rawValue.data.getLength());
length = WritableUtils.readVInt(inputBuffer);
valueBytes.set(rawValue.data.getData(),inputBuffer.getPosition(),length);
*/
}