/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.crawler;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import javax.servlet.jsp.JspWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.DailyRollingFileAppender;
import org.apache.log4j.Layout;
import org.apache.log4j.spi.LoggingEvent;
import org.commoncrawl.util.SuffixStringMatcher;
import org.commoncrawl.async.Callback;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIODNSQueryResult;
import org.commoncrawl.io.NIODNSCache;
import org.commoncrawl.io.NIODNSQueryClient;
import org.commoncrawl.io.NIODNSResolver;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlSegment;
import org.commoncrawl.protocol.CrawlSegmentDetail;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentStatus;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.NullMessage;
import org.commoncrawl.rpc.base.shared.RPCException;
import org.commoncrawl.service.crawler.CrawlLog.CheckpointCompletionCallback;
import org.commoncrawl.service.crawler.CrawlLog.LogFlusherStopActionCallback;
import org.commoncrawl.service.crawler.CrawlSegmentLog.CrawlSegmentFPMap;
import org.commoncrawl.service.crawler.SegmentLoader.LoadProgressCallback;
import org.commoncrawl.service.crawler.filters.Filter.FilterResult;
import org.commoncrawl.service.crawler.util.URLFPBloomFilter;
import org.commoncrawl.service.statscollector.CrawlerStats;
import org.commoncrawl.service.statscollector.LogCrawlStatsRequest;
import org.commoncrawl.util.AsyncAppender;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CustomLogger;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.JVMStats;
import org.commoncrawl.util.MovingAverage;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.SessionIDURLNormalizer;
import org.commoncrawl.util.SubDomainComparator;
import org.commoncrawl.util.URLUtils;
/**
* Class that manages the crawler process state
*
* @author rana
*
*/
public final class CrawlerEngine implements SegmentLoader.CancelOperationCallback {
/** database keys **/
static final String CRAWLER_CRAWL_SEGMENT_TYPE_PARENT_KEY = "TWUnit";
static final String CrawlSegmentKeyPrefix = "CSInfo_";
static final int DEFAULT_MAX_SIMULTANEOUS_CONNECTIONS = 500;
static final int HOSTID_CACHE_SIZE = 10000;
static final int DEFAULT_DNS_TIMEOUT = 30000;
static final int STATS_COLLECTION_INTERVAL = 500;
static final int DEFAULT_MAX_ACTIVE_URLS = 2000000000;
static final int LOADER_OVERFLOW_ALLOWED = 100000;
static final int MAX_PENDING_URLS = 100000;
static final int DEFAULT_MAX_ACTIVE_HOSTS = 10000;
static final int DEFAULT_LOCAL_BLOOMFILTER_NUM_ELEMENTS = 500000000;
static final int DEFAULT_LOCAL_BLOOMFILTER_BITS_PER_ELEMENT = 11;
static final int DEFAULT_LOCAL_BLOOMFILTER_BITS_NUM_HASH_FUNCTIONS = 10;
// the loader will stall if memory utilization reaches or exceeds the specified ratio ..
static final float DEFAULT_LOADER_STALL_MEMORY_UTILIZATION_RATIO = .70f;
// once stalled, the loader will not resume until memory utilization drops back below the specified ratio ..
static final float DEFAULT_LOADER_RESUME_MEMORY_UTILIZATION_RATIO = .55f;
// default list id
static final long DEFAULT_LIST_ID = 1;
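// Illustrative sketch (not engine code): the stall/resume ratios above form a
// hysteresis band so the loader does not oscillate around a single threshold.
// With the defaults, loading stalls once heap utilization reaches 70% and only
// resumes after utilization has fallen back below 55%, roughly:
//
//   if (JVMStats.getHeapUtilizationRatio() >= DEFAULT_LOADER_STALL_MEMORY_UTILIZATION_RATIO) {
//     // stalled ... poll until we drop back under the resume bound
//     while (JVMStats.getHeapUtilizationRatio() >= DEFAULT_LOADER_RESUME_MEMORY_UTILIZATION_RATIO) {
//       Thread.sleep(5000);
//     }
//   }
//
// see createLoadProgressCallback() below for the actual stall loop.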
/** logging **/
private static final Log LOG = LogFactory.getLog("org.commoncrawl.crawler.CrawlEngine");
/** database id of crawler **/
String _databaseId;
/** back pointer to crawl server **/
CrawlerServer _server;
/** http crawl queue **/
CrawlQueue _httpCrawlQueue;
/** crawl interface list **/
InetSocketAddress[] _crawlInterfaces;
/** max tcp connect sockets **/
int _maxTCPSockets = -1;
/** max active urls **/
int _maxActiveURLS = DEFAULT_MAX_ACTIVE_URLS;
/** max active hosts **/
int _maxActiveHosts = DEFAULT_MAX_ACTIVE_HOSTS;
/** custom logger **/
CustomLogger _failureLog;
CustomLogger _SuccessLog;
CustomLogger _GETLog;
CustomLogger _DNSSuccessLog;
CustomLogger _DNSFailureLog;
CustomLogger _RobotsLog;
CustomLogger _CrawlLogLog;
CustomLogger _CookieLogger;
SimpleDateFormat robotsLogDateFormat = new SimpleDateFormat("yyyy.MM.dd hh:mm:ss.SSS");
/** the crawl log **/
CrawlLog _crawlLog;
/** stats collector **/
RuntimeStatsCollector _stats = new RuntimeStatsCollector();
/** stats collector (remote) stats **/
CrawlerStats _crawlerStats = new CrawlerStats();
/** last stats upload time **/
private long _lastCrawlerStatsUploadTime = -1;
/** crawler stats flush interval **/
private static final int CRAWLER_STATS_FLUSH_INTERVAL = 5 * 60 * 1000; // 5 minutes
/** stats collector timer **/
private Timer _statsCollectionTimer = null;
/** startup time **/
private long _startupTime = System.currentTimeMillis();
/** failure/ success counts **/
long _totalAvailableURLCount = 0;
long _totalProcessedURLCount = 0;
long _failedURLCount = 0;
long _successURLCount = 0;
long _loadCount = 0;
/** queue stats **/
int _pendingCount = 0;
int _queuedCount = 0;
/** queue stats **/
int _activeHosts = 0;
/** segment load counts **/
int _activeLoadCount = 0;
int _segmentScanPending = 0;
/** crawl active flag **/
boolean _crawlActive = false;
/** crawl was stopped **/
boolean _crawlStopped = false;
/** shutdown flag **/
boolean _shutdownFlag = false;
/** local crawl history bloom filter **/
URLFPBloomFilter _localBloomFilter;
ReentrantLock _loaderStalledLock = new ReentrantLock();
Condition _loaderStalledCondition = _loaderStalledLock.newCondition();
Condition _loaderStalledEvent = null;
enum LoaderStallReason {
None,
Memory, // stalled due to low memory
MaxURLS, // loader stalled due to max urls allowed in queue
ActiveHostCount // load stalled due to too many active hosts in queue
}
LoaderStallReason _loaderStallReason = LoaderStallReason.None;
Queue<CrawlSegmentStatus> _segmentLoadQueue = new PriorityQueue<CrawlSegmentStatus>(100,
new Comparator<CrawlSegmentStatus>() {
public int compare(CrawlSegmentStatus o1, CrawlSegmentStatus o2) {
if (o1.getLoadHint() > o2.getLoadHint()) {
return 1;
}
else if (o1.getLoadHint() < o2.getLoadHint()) {
return -1;
}
return 0;
}
});
//Queue<CrawlSegmentStatus> _segmentLoadQueue = new LinkedList<CrawlSegmentStatus>();
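// note: the priority queue orders by ascending load hint, and potentiallyLoadNextSegment()
// removes the queue head, so segments queued via queueSegment(segment, loadPosition++) are
// loaded lowest-hint first. Illustrative ordering (hypothetical segments):
//   queueSegment(segA, 0.0f);  // loaded first
//   queueSegment(segB, 1.0f);  // loaded second
//   queueSegment(segC, 2.0f);  // loaded last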
/** dns stats **/
MovingAverage _dnsProcessResultsTime = new MovingAverage(25);
/** work unit status map **/
private Map<Long,CrawlSegmentStatus> _statusMap = new LinkedHashMap<Long,CrawlSegmentStatus>() ;
/** DNS Resolver Thread Pool **/
private int _hostQueuedForResolution = 0;
private int _urlsPendingResolution = 0;
private int _dnsHighWaterMark;
private int _dnsLowWaterMark;
private boolean _highWaterMarkHit = false;
private long _cycleTime = -1;
private LinkedList<CrawlSegmentDetail> _dnsDeferedSegments = new LinkedList<CrawlSegmentDetail>();
private LinkedList<CrawlSegmentHost> _dnsDeferedHosts = new LinkedList<CrawlSegmentHost>();
/** Various Lists **/
private static SuffixStringMatcher _blackListedHostsMatcher;
private static SessionIDURLNormalizer _sessionIDNormalizer = new SessionIDURLNormalizer();
/** inverse cache **/
private static NIODNSCache _badDomainCache = new NIODNSCache();
/** shared SubDomain Comparator instance **/
SubDomainComparator _subDomainComparator = new SubDomainComparator();
/** the active list id we are operating on **/
int _activeListId = -1;
/** bloom filter size **/
public static int BLOOM_FILTER_SIZE = DEFAULT_LOCAL_BLOOMFILTER_NUM_ELEMENTS;
/** constructor **/
public CrawlerEngine(CrawlerServer server,int maxSockets,int dnsHighWaterMark,int dnsLowWaterMark,long cycleTime, int activeListId) {
_maxTCPSockets = maxSockets;
_server = server;
_dnsHighWaterMark = dnsHighWaterMark;
_dnsLowWaterMark = dnsLowWaterMark;
_cycleTime = cycleTime;
_activeListId = activeListId;
}
private static class CustomLoggerLayout extends Layout {
StringBuffer sbuf = new StringBuffer(1024);
@Override
public String format(LoggingEvent event) {
sbuf.setLength(0);
sbuf.append(event.getRenderedMessage());
sbuf.append(LINE_SEP);
return sbuf.toString();
}
@Override
public boolean ignoresThrowable() {
return true;
}
public void activateOptions() {
}
}
private Appender createLogFileAppender(String logFileName) throws IOException {
DailyRollingFileAppender drfaAppender = new DailyRollingFileAppender(new CustomLoggerLayout(),_server.getLogDirectory() + "/" + logFileName,"yyyy-MM-dd");
AsyncAppender asyncAppender = new AsyncAppender(8192);
asyncAppender.addAppender(drfaAppender);
return asyncAppender;
}
/** initialization **/
public boolean initialize(InetSocketAddress[] crawlInterfaceList) {
_crawlInterfaces = crawlInterfaceList;
_failureLog = new CustomLogger("CrawlerFailureLog");
_GETLog = new CustomLogger("GETLog");
_SuccessLog = new CustomLogger("SuccessLog");
_DNSSuccessLog = new CustomLogger("DNSSuccessLog");
_DNSFailureLog = new CustomLogger("DNSFailureLog");
_RobotsLog = new CustomLogger("RobotsLog");
_CrawlLogLog = new CustomLogger("CrawlLogLog");
_CookieLogger = new CustomLogger("CookieLogger");
NIOHttpConnection.setCookieLogger(_CookieLogger);
try {
_failureLog.addAppender(createLogFileAppender("crawlerFailures.log"));
_GETLog.addAppender(createLogFileAppender("crawlerGETs.log"));
_SuccessLog.addAppender(createLogFileAppender("crawlerSuccess.log"));
_DNSSuccessLog.addAppender(createLogFileAppender("crawlerDNS.log"));
_DNSFailureLog.addAppender(createLogFileAppender("crawlerDNSFailures.log"));
_RobotsLog.addAppender(createLogFileAppender("robotsFetchLog.log"));
_CrawlLogLog.addAppender(createLogFileAppender("crawlLog.log"));
_CookieLogger.addAppender(createLogFileAppender("cookieLog.log"));
} catch (IOException e) {
e.printStackTrace();
}
LOG.info("Allocating BloomFilter");
_localBloomFilter
= new URLFPBloomFilter(
BLOOM_FILTER_SIZE,
DEFAULT_LOCAL_BLOOMFILTER_BITS_NUM_HASH_FUNCTIONS,
DEFAULT_LOCAL_BLOOMFILTER_BITS_PER_ELEMENT);
try {
// register ourselves as log sink for dns events ...
// NIODNSLocalResolver.setLogger(this);
if (Environment.detailLogEnabled())
LOG.info("initialize - Recordstore says Database Id is: " + _databaseId);
if (Environment.detailLogEnabled())
LOG.info("initialize - Loading State");
loadState();
if (Environment.detailLogEnabled())
LOG.info("initialize http CrawlQueue");
if (_maxTCPSockets == -1) {
if (Environment.detailLogEnabled())
LOG.info("Max TCP Sockets Unspecified. Defaulting to:" + DEFAULT_MAX_SIMULTANEOUS_CONNECTIONS);
_maxTCPSockets = DEFAULT_MAX_SIMULTANEOUS_CONNECTIONS;
}
else {
if (Environment.detailLogEnabled())
LOG.info("Max TCP Sockets is:" + _maxTCPSockets);
}
LOG.info("Starting CrawlDomain Disk Queueing Thread");
CrawlList.startDiskQueueingThread(this.getEventLoop(),getServer().getDomainQueueDir());
LOG.info("Initialize HTTP Crawl Queue");
HttpFetcher fetcher = new HttpFetcher(_maxTCPSockets,crawlInterfaceList,getServer().getHostName());
_httpCrawlQueue = new CrawlQueue(CrawlQueue.Protocol.HTTP,fetcher);
if (getServer().enableCrawlLog()) {
if (Environment.detailLogEnabled())
LOG.info("initializing crawl engine log ... ");
try {
_crawlLog = new CrawlLog(this);
}
catch (IOException e) {
LOG.fatal("Exception thrown while initializing CrawlLog:" + CCStringUtils.stringifyException(e));
return false;
}
}
if (Environment.detailLogEnabled())
LOG.info("loading Crawl Segments");
if (getServer().externallyManageCrawlSegments() && !CrawlEnvironment.inUnitTestMode()) {
kickOffCrawl();
}
return true;
}
catch (IOException e) {
LOG.fatal(e);
return false;
}
}
/** return the address associated with a crawl interface
*
* @param index the crawl interface index
* @return the InetAddress associated with that index, or null if the index is out of range
*/
public InetAddress getCrawlInterfaceGivenIndex(int index) {
if (index != -1 && index < _crawlInterfaces.length) {
return _crawlInterfaces[index].getAddress();
}
return null;
}
private boolean potentiallyAddToParseQueue(CrawlURL urlObject) throws IOException {
boolean enqueued = false;
if (urlObject.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) {
NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(urlObject.getHeaders());
if (finalHeaders != null && finalHeaders.getValue(0) != null) {
int httpResult = HttpHeaderInfoExtractor.parseStatusLine(finalHeaders.getValue(0)).e0;
if (httpResult == 200) {
// queue it up for parsing ... (parse-queue hand-off not implemented here, so enqueued remains false)
}
}
}
return enqueued;
}
/** kick off crawl segment loader **/
public void loadCrawlSegments() {
try {
loadCrawlSegments( new Callback() {
// this callback is executed when the last segment has been successfully loaded
public void execute() {
LOG.info("loadCrawlSegment Completion Callback Excecuted - Checking to see if Checkpoint Possilbe");
long currentTime = System.currentTimeMillis();
if (getServer().enableCrawlLog() && _crawlLog.isCheckpointPossible(currentTime)) {
LOG.info("Delaying Crawl Startup - Checkpointing Logs to HDFS FIRST...");
// start the checkpoint ...
_crawlLog.checkpoint(currentTime,new CheckpointCompletionCallback() {
public void checkpointComplete(long checkpointId,Vector<Long> completedSegmentList) {
LOG.info("CrawlLog Checkpoint:" + checkpointId + " completed");
if (completedSegmentList != null) {
// walk completed segments ... updating their crawl state ...
for (long packedSegmentId : completedSegmentList) {
// notify crawler engine of status change ...
crawlSegmentComplete(packedSegmentId);
}
}
// now kick off the crawl ...
kickOffCrawl();
}
public void checkpointFailed(long checkpointId,Exception e) {
LOG.error("Checkpoint Failed for Checkpoint:" + checkpointId + " With Exception:" + CCStringUtils.stringifyException(e));
throw new RuntimeException("Checkpoint Failed During Startup With Exception:" + CCStringUtils.stringifyException(e));
}
}, currentTime);
}
else {
// kick off crawl immediately ..
kickOffCrawl();
}
}
});
}
catch (IOException e) {
LOG.fatal("Caught IOException while loadCrawlSegments! Exception:" +CCStringUtils.stringifyException(e));
}
}
public void shutdown() {
LOG.info("Shuting down crawl engine");
_shutdownFlag = true;
if (_crawlActive) {
stopCrawl(null);
while (_crawlActive) {
if (Thread.currentThread() == getEventLoop().getEventThread()) {
LOG.info("Polling Selector while waiting for crawl to stop");
try {
getEventLoop().waitForIO(1000);
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
else {
LOG.info("Waiting for crawl to stop");
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
}
}
}
}
// clear crawl queue
_httpCrawlQueue.shutdown();
_httpCrawlQueue = null;
System.gc();
// wait for loader / dns threads to exit
while (_activeLoadCount != 0 || _urlsPendingResolution != 0) {
if (Thread.currentThread() == getServer().getEventLoop().getEventThread()) {
try {
LOG.info("Polling Event Thread Selector while waiting for loadcount / resolution count to go to zero. LoadCount:"
+ _activeLoadCount + " PendingResolutionCount:" + _urlsPendingResolution);
getEventLoop().waitForIO(1000);
} catch (IOException e) {
e.printStackTrace();
}
}
else {
LOG.info("Waiting for loadcount / resolution count to go to zero");
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
}
}
}
LOG.info("load count / resolution count went to zero!");
_dnsDeferedSegments.clear();
_dnsDeferedHosts.clear();
// null out crawl log
_crawlLog = null;
}
public void stopCrawlerCleanly() {
LOG.info("Clean Shutdown - Stopping Crawl");
if (_crawlActive) {
stopCrawl(new CrawlStopCallback() {
public void crawlStopped() {
LOG.info("Clean Shutdown - Crawl Stopped. Stopping Server");
_server.stop();
LOG.info("Clean Shutdown - Stopping Hadoop");
try {
FileSystem.closeAll();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
LOG.info("Clean Shutdown - Exiting App");
System.exit(1);
}
});
}
}
public void kickOffCrawl() {
if (_cycleTime != -1 && !_server.disableCycleTimer()) {
SimpleDateFormat formatter = new SimpleDateFormat("yyyy.MM.dd hh:mm:ss z");
LOG.info("Cycle Time is Set. Will AutoShutdown Crawler at:" + formatter.format(new Date(_cycleTime)));
long delay = _cycleTime - System.currentTimeMillis();
Timer cycleTimer = new Timer(delay,false,new Timer.Callback() {
public void timerFired(Timer timer) {
stopCrawlerCleanly();
}
});
getEventLoop().setTimer(cycleTimer);
}
LOG.info("kicking off crawl - Loading Crawl Segment");
// now try to load a segment ...
potentiallyLoadNextSegment();
LOG.info("Starting Crawl");
// and then start the crawl ...
startCrawl();
// finally, if in unit test mode ...
if (CrawlEnvironment.inUnitTestMode()) {
LOG.info("UnitTest Mode Detected - Runing Test...");
}
}
private static final int MS_IN_AN_SECOND = 1000;
private static final int MS_IN_AN_MINUTE = MS_IN_AN_SECOND * 60;
private static final int MS_IN_AN_HOUR = MS_IN_AN_MINUTE * 60;
private void startStatsCollector() {
_lastCrawlerStatsUploadTime = System.currentTimeMillis();
_statsCollectionTimer = new Timer(STATS_COLLECTION_INTERVAL,true,new Timer.Callback() {
public void timerFired(Timer timer) {
synchronized(_stats) {
//async stats ...
//getServer().getEventLoop().collectStats(_stats);
// engine stats ...
// synchronize on the engine instance (plain "this" here would lock the anonymous callback)
synchronized(CrawlerEngine.this) {
long msSinceStartup = System.currentTimeMillis() - _startupTime;
int hours = (int) (msSinceStartup / MS_IN_AN_HOUR);
msSinceStartup -= (hours * MS_IN_AN_HOUR);
int minutes = (int)(msSinceStartup / MS_IN_AN_MINUTE);
msSinceStartup -= (minutes * MS_IN_AN_MINUTE);
int seconds = (int)(msSinceStartup / MS_IN_AN_SECOND);
String upTimeString = String.format("%1$d Hours %2$d Minutes %3$d Seconds",hours,minutes,seconds);
_stats.setStringValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_UpTime,upTimeString);
_stats.setStringValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_BuildTime,System.getProperty("commoncrawl.build.date"));
_stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_TotalProcessedURLCount,_totalProcessedURLCount);
//_crawlerStats.setUrlsProcessed(_crawlerStats.getUrlsProcessed() + _)
_stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_FetchFailedCount,_failedURLCount);
_stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_FetchSucceededCount,_successURLCount);
_stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_ActiveLoadCount,_activeLoadCount);
_stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_DeferredLoadCount,_segmentLoadQueue.size());
_stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_PendingCount,_pendingCount);
_stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_QueuedCount,_queuedCount);
// resolver stats ...
_stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.DNS_TotalDNSQueries, getServer().getEventLoop().getResolver().getQueryCount());
_stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.DNS_TotalCacheHits, getServer().getEventLoop().getResolver().getCacheHitCount());
_stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.DNS_TotalCacheMisses, getServer().getEventLoop().getResolver().getCacheMissCount());
//_stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_DNSAddToCacheTime,NIODNSResolver.getDNSCache()._dnsAddToCacheTime.getAverage());
//_stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_DNSLookupFromCacheTime,NIODNSResolver.getDNSCache()._dnsLookupFromCacheTime.getAverage());
// _stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_DNSCacheNodeCount,NIODNSLocalResolver.getDNSCache().getActiveNodeCount());
_stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.CrawlerEngine_DNSProcessResultTime,_dnsProcessResultsTime.getAverage());
}
// server stats ...
getServer().collectStats(_stats);
// crawlqueue / fetcher stats ...
_httpCrawlQueue.collectStats(_crawlerStats,_stats);
if (getServer().enableCrawlLog()) {
// crawl log stats ...
_crawlLog.collectStats(_stats);
}
if (System.currentTimeMillis() - _lastCrawlerStatsUploadTime >= CRAWLER_STATS_FLUSH_INTERVAL) {
synchronized(_crawlerStats) {
_crawlerStats.setUrlsInFetcherQueue(_pendingCount);
_crawlerStats.setUrlsInLoaderQueue(_queuedCount);
_crawlerStats.setActiveDNSRequests(_hostQueuedForResolution);
_crawlerStats.setQueuedDNSRequests(_dnsDeferedHosts.size());
_crawlerStats.setCrawlerMemoryUsedRatio(JVMStats.getHeapUtilizationRatio());
}
CrawlerStats statsOut = null;
// clone the stats
try {
statsOut = (CrawlerStats) _crawlerStats.clone();
statsOut.setTimestamp(System.currentTimeMillis());
} catch (CloneNotSupportedException e) {
}
// clear collector stats ...
_crawlerStats.clear();
// reset stats collection timer
_lastCrawlerStatsUploadTime = System.currentTimeMillis();
// send the stats out ...
if (getServer().getStatsCollectorStub() != null) {
LogCrawlStatsRequest requestOut = new LogCrawlStatsRequest();
requestOut.setCrawlerName(getServer().getHostName());
requestOut.setCrawlerStats(statsOut);
try {
getServer().getStatsCollectorStub().logCrawlerStats(requestOut, new AsyncRequest.Callback<LogCrawlStatsRequest, NullMessage>() {
@Override
public void requestComplete(AsyncRequest<LogCrawlStatsRequest, NullMessage> request) {
}
});
} catch (RPCException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
}
}
});
getServer().getEventLoop().setTimer(_statsCollectionTimer);
}
private void stopStatsCollector() {
if (_statsCollectionTimer != null) {
getServer().getEventLoop().cancelTimer(_statsCollectionTimer);
}
}
RuntimeStatsCollector getStats() { return _stats; }
public boolean isBlackListedHost(String hostName) {
if (getServer().getDomainBlackListFilter() != null || getServer().getTemporaryBlackListFilter() != null) {
String rootDomain = URLUtils.extractRootDomainName(hostName);
if (rootDomain != null) {
if (getServer().getDomainBlackListFilter() != null){
if (getServer().getDomainBlackListFilter().filterItem(rootDomain,hostName, "", null, null) == FilterResult.Filter_Reject) {
LOG.info("### FILTER Flagged:" + hostName + " as a BlackListed Host");
return true;
}
}
if (getServer().getTemporaryBlackListFilter() != null) {
if (getServer().getTemporaryBlackListFilter().filterItem(rootDomain,hostName, "", null, null) == FilterResult.Filter_Reject) {
LOG.info("### FILTER Flagged:" + hostName + " as a Temporarily BlackListed Host");
return true;
}
}
}
else {
LOG.warn("Got Invalid HostName during isBlackListedHost:" + hostName);
return true;
}
}
return false;
}
private static CrawlURLMetadata ipTestMetadata = new CrawlURLMetadata();
private boolean isBlackListedIPAddress(int ipAddress) {
if (getServer().getIPAddressFilter() != null) {
ipTestMetadata.setServerIP(ipAddress);
if (getServer().getIPAddressFilter().filterItem(null,null, null, ipTestMetadata, null) == FilterResult.Filter_Reject) {
LOG.info("### FILTER IPAddress:" + IPAddressUtils.IntegerToIPAddressString(ipAddress) + " as a BlackListed IP");
return true;
}
}
return false;
}
/** get the database uuid **/
public String getDatabaseId() { return _databaseId; }
/** get access to the server object **/
public CrawlerServer getServer() { return _server; }
/** get at the event loop **/
public EventLoop getEventLoop() { return _server.getEventLoop(); }
/** get access to the specialized crawler failure log **/
public CustomLogger getFailureLog() { return _failureLog; }
/** get access to the specialized crawler failure log **/
public CustomLogger getGETLog() { return _GETLog; }
/** get access to the specialized crawler success log **/
public CustomLogger getSuccessLog() { return _SuccessLog; }
/** get access to the specialized crawl log log **/
public CustomLogger getCrawlLogLog() { return _CrawlLogLog; }
/** get cookie log **/
public CustomLogger getCookieLog() { return _CookieLogger; }
/** get local bloom filter **/
public URLFPBloomFilter getLocalBloomFilter() { return _localBloomFilter; }
/** get access to the crawler stats data structure **/
public CrawlerStats getCrawlerStats() { return _crawlerStats; }
/** get access to the dns logger **/
//public CustomLogger getDNSLog() { return _DNSSuccessLog; }
/** get access to the dns failure logger **/
//public CustomLogger getDNSFailureLog() { return _DNSFailureLog; }
/** get the subdomain comparator **/
public SubDomainComparator getSubDomainComparator() { return _subDomainComparator; }
/** get pending url count **/
public synchronized int getPendingURLCount() { return _pendingCount ; }
/** get the total number of active urls in the system (pending + queued) **/
public synchronized int getActiveURLCount() { return _pendingCount + _queuedCount; }
/** get the number of active hosts in the queue **/
public synchronized int getActiveHosts() { return _activeHosts; }
/** increment / decrement active host count **/
public void incDecActiveHostCount(int incDecAmount) {
boolean loaderStalled = false;
int activeCount = 0;
synchronized(this) {
_activeHosts += incDecAmount;
activeCount = _activeHosts;
//LOG.info("### ACTIVEHOST Count:" + _activeHosts);
}
_loaderStalledLock.lock();
if (_loaderStalledEvent != null && _loaderStallReason == LoaderStallReason.ActiveHostCount) {
loaderStalled = true;
}
_loaderStalledLock.unlock();
if (loaderStalled) {
if (activeCount < getMaxActiveHostsThreshold()) {
LOG.info("### LOADER Event Thread Acquiring Lock to Loader");
// grab loader lock
_loaderStalledLock.lock();
_loaderStallReason = LoaderStallReason.None;
// at this point the event may have been nulled out ...
if (_loaderStalledEvent != null) {
// trigger event ... thus releasing loader thread (to continue loading the active segment)...
LOG.info("### LOADER Event Thread Signalling Stall Event (activeHosts < MaxActiveThreshold)");
_loaderStalledEvent.signal();
// clear the event ...
_loaderStalledEvent = null;
}
//release loader lock
_loaderStalledLock.unlock();
LOG.info("Releasing Lock - Signaled Loader");
}
}
}
private long _lastLoaderStalledDebugEvt = 0;
/** increment decrement pending count **/
public void incDecPendingQueuedURLCount(int pendingAmount,int queuedAmount) {
int activeCount = 0;
boolean loaderStalled = false;
// atomically increment queue counts ...
synchronized (this) {
_pendingCount += pendingAmount;
_queuedCount += queuedAmount;
// and safely calculate queued count ..
activeCount = _pendingCount + _queuedCount;
}
_loaderStalledLock.lock();
if (_loaderStalledEvent != null && _loaderStallReason == LoaderStallReason.MaxURLS) {
loaderStalled = true;
}
_loaderStalledLock.unlock();
if (loaderStalled) {
if (_lastLoaderStalledDebugEvt == 0 || (System.currentTimeMillis() - _lastLoaderStalledDebugEvt) >= 60000) {
_lastLoaderStalledDebugEvt = System.currentTimeMillis();
if (activeCount >= getMaxActiveThreshold()) {
LOG.info("### LOADER Loader Event Set but will not trigger because Active URL Count: " + activeCount + " >= " + getMaxActiveThreshold());
}
else if (_activeLoadCount != 1) {
LOG.info("### LOADER Event Set but will not trigger because Load Count: " + _activeLoadCount + " != 1");
}
}
}
// if the loader is waiting on the queue and active url count is less than threshold ...
if (_activeLoadCount == 1 && loaderStalled && activeCount < getMaxActiveThreshold()) {
LOG.info("### LOADER Event Thread Acquiring Lock to Loader");
// grab loader lock
_loaderStalledLock.lock();
_loaderStallReason = LoaderStallReason.None;
// at this point the event may have been nulled out ...
if (_loaderStalledEvent != null) {
// trigger event ... thus releasing loader thread (to continue loading the active segment)...
LOG.info("### LOADER Event Thread Signalling Stall Event (activeCount < MaxActiveThreshold)");
_loaderStalledEvent.signal();
// clear the event ...
_loaderStalledEvent = null;
}
//release loader lock
_loaderStalledLock.unlock();
LOG.info("Releasing Lock - Signaled Loader");
}
}
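/** for reference, a minimal sketch (not engine code) of the loader stall / wake handshake
* used above: the loader thread parks on _loaderStalledCondition under _loaderStalledLock,
* and the event thread (via incDecPendingQueuedURLCount / incDecActiveHostCount) signals it
* once counts fall back under threshold:
*
*   // loader thread:
*   _loaderStalledLock.lock();
*   try {
*     _loaderStalledEvent = _loaderStalledCondition;
*     _loaderStallReason = LoaderStallReason.MaxURLS;
*     while (!_shutdownFlag && !_loaderStalledEvent.await(5000, TimeUnit.MILLISECONDS));
*   } finally {
*     _loaderStalledEvent = null;
*     _loaderStalledLock.unlock();
*   }
*
*   // event thread, once activeCount < getMaxActiveThreshold():
*   _loaderStalledLock.lock();
*   if (_loaderStalledEvent != null) {
*     _loaderStalledEvent.signal();
*     _loaderStalledEvent = null;
*   }
*   _loaderStalledLock.unlock();
**/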
/** get set the max active url threshold **/
public void setMaxActiveURLThreshold(int thresholdValue) {
_maxActiveURLS = thresholdValue;
}
public int getMaxActiveThreshold() {
return _maxActiveURLS;
}
public int getMaxActiveHostsThreshold() {
return _maxActiveHosts;
}
/** load state **/
private void loadState() throws IOException {
}
/** internal helper routine to load crawl segment metadata given list id **/
private List<CrawlSegment> populateCrawlSegmentsFromHDFS() throws IOException {
ArrayList<CrawlSegment> crawlSegments = new ArrayList<CrawlSegment>();
// get segment path for host ...
Path basePath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory(),_server.getHostName());
// get file system based on path
FileSystem segmentFS = FileSystem.get(basePath.toUri(),CrawlEnvironment.getHadoopConfig());
LOG.info("Loading Crawl Segments Using:" + segmentFS + " from Path:" + basePath);
// iterate lists in given path ...
FileStatus[] subDirs = segmentFS.globStatus(new Path(basePath,"[0-9]*"));
for (FileStatus subDir : subDirs) {
LOG.info("Found SubDir: " + subDir.getPath().getName());
// extract list id from sub dir name
int listId = Integer.parseInt(subDir.getPath().getName());
// search for segments in sub directory
Path searchPath = new Path(subDir.getPath(),"[0-9]*");
LOG.info("Searching for crawl segments for list id:" + listId + " with search path:"+ searchPath);
// scan hdfs for matching files ...
FileStatus fileStatusArray[] = segmentFS.globStatus(searchPath);
LOG.info("Found:" + fileStatusArray.length + " segments at path:"+ searchPath);
// now walk matched set
for (FileStatus fileStatus : fileStatusArray) {
// segment id is the parent path name of the matched file
String segmentName = fileStatus.getPath().getName();
int segmentId = Integer.parseInt(segmentName);
//now populate crawl segment information
CrawlSegment crawlSegment = new CrawlSegment();
crawlSegment.setListId(listId);
crawlSegment.setSegmentId(segmentId);
LOG.info("adding crawl segment:"+crawlSegment.getSegmentId() + " for List:" + listId);
crawlSegments.add(crawlSegment);
}
}
return crawlSegments;
}
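/** directory layout assumed by populateCrawlSegmentsFromHDFS (example path hypothetical):
*
*   <crawlSegmentDataDirectory>/<hostName>/<listId>/<segmentId>
*   e.g. .../segments/crawler01/1/42  ->  CrawlSegment(listId=1, segmentId=42)
*
* both <listId> and <segmentId> must be numeric, since they are matched with the
* "[0-9]*" glob and parsed via Integer.parseInt().
**/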
/** internal load work unit routine **/
private void loadCrawlSegments(final Callback completionCallback) throws IOException {
LOG.info("Loading Crawl Segments");
List<CrawlSegment> crawlSegments = populateCrawlSegmentsFromHDFS();
LOG.info("defer loading lists");
float loadPosition = 0.0f;
// now sort the list by segment id
Collections.sort(crawlSegments,new Comparator<CrawlSegment>() {
@Override
public int compare(CrawlSegment o1, CrawlSegment o2) {
int result = (o1.getListId() < o2.getListId()) ? -1 : (o1.getListId() > o2.getListId()) ? 1 : 0;
if (result == 0) {
result = (o1.getSegmentId() < o2.getSegmentId()) ? -1 : (o1.getSegmentId() > o2.getSegmentId()) ? 1 : 0;
}
return result;
}
});
// now queue up load requests ...
for (CrawlSegment crawlSegment : crawlSegments) {
// if the segment has not been marked as completed but it is marked as crawling (we loaded the last time we ran)
if (!crawlSegment.getIsComplete()) {
LOG.info("Delay Loading CrawlSegment:" + crawlSegment.getSegmentId() + " for List:" + crawlSegment.getListId());
// delay load this segment ...
queueSegment(crawlSegment,loadPosition++);
}
else {
if (Environment.detailLogEnabled())
LOG.info("skipping already completed segment:" + crawlSegment.getSegmentId() + " during load");
}
}
// call outer completion callback ...
LOG.info("Last Segment Loaded. Calling Completion Callback");
completionCallback.execute();
}
public static CrawlSegment crawlSegmentFromCrawlSegmentStatus(CrawlSegmentStatus status) {
CrawlSegment segment = new CrawlSegment();
segment.setListId(status.getListId());
segment.setSegmentId(status.getSegmentId());
return segment;
}
/** load the specified crawl segment
*
* @param crawlSegment
* @param loadPosition
* @return
*/
public CrawlSegmentStatus queueSegment(final CrawlSegment crawlSegment,final float loadPosition) {
final CrawlSegmentStatus status = new CrawlSegmentStatus();
// create a log object
final CrawlSegmentLog log = new CrawlSegmentLog(getServer().getDataDirectory(),crawlSegment.getListId(),crawlSegment.getSegmentId(),getServer().getHostName());
status.setListId(crawlSegment.getListId());
status.setSegmentId(crawlSegment.getSegmentId());
status.setLoadStatus(CrawlSegmentStatus.LoadStatus.LOAD_PENDING);
status.setCrawlStatus(CrawlSegmentStatus.CrawlStatus.UNKNOWN);
status.setLoadHint(loadPosition);
status.setUrlCount(0);
status.setUrlsComplete(0);
status.setIsDirty(true);
_statusMap.put(CrawlLog.makeSegmentLogId(crawlSegment.getListId(), crawlSegment.getSegmentId()),status);
if (getServer().enableCrawlLog()) {
// activate the segment log ...
activateSegmentLog(log);
}
LOG.info("Adding Segment:" + status.getSegmentId() +" to DelayLoad Queue with Hint:" + status.getLoadHint());
// add item to load queue
_segmentLoadQueue.add(status);
// and finally try to load the next segment if possible ...
if (_segmentScanPending == 0) {
potentiallyLoadNextSegment();
}
return status;
}
private LoadProgressCallback createLoadProgressCallback(final CrawlSegmentStatus status) {
return new LoadProgressCallback() {
public boolean hostAvailable(final CrawlSegmentHost host,final int originalURLCount,final int completedURLCount) {
if (_shutdownFlag == true) {
return false;
}
if (Environment.detailLogEnabled())
LOG.info("### LOADER hostAvailable called on for host:" + host.getHostName());
// check bad hosts table
if (isBadDomain(host.getHostName())) {
if (CrawlEnvironment.detailLoggingEnabled)
LOG.info("### LOADER Ignoring Bad Host during Segment Load. HostName:" + host.getHostName());
return true;
}
final int availableCount = originalURLCount - completedURLCount;
// increment a separate load count (how many urls have been loaded to date)
_loadCount += availableCount;
// check memory utilization ... if it has reached target threshold..
if (!_shutdownFlag && (JVMStats.getHeapUtilizationRatio() >= DEFAULT_LOADER_STALL_MEMORY_UTILIZATION_RATIO || (getPendingURLCount() + availableCount) > MAX_PENDING_URLS)) {
if (JVMStats.getHeapUtilizationRatio() >= DEFAULT_LOADER_STALL_MEMORY_UTILIZATION_RATIO)
LOG.info("### LOADER Stalling ... Memory Utilization Reached or Exceeded Target Ratio:" + DEFAULT_LOADER_STALL_MEMORY_UTILIZATION_RATIO);
else
LOG.info("### LOADER Stalling ... Pending URL Count:" + (getPendingURLCount() +availableCount) + " Exceeds Max Allowed:" + MAX_PENDING_URLS);
long waitStartTime = System.currentTimeMillis();
long waitInterval = 60000;
while (!_shutdownFlag && (JVMStats.getHeapUtilizationRatio() >= DEFAULT_LOADER_RESUME_MEMORY_UTILIZATION_RATIO || (getPendingURLCount() + availableCount) > MAX_PENDING_URLS)) {
try {
Thread.sleep(5000);
if (System.currentTimeMillis() - waitStartTime >= waitInterval) {
if (JVMStats.getHeapUtilizationRatio() >= DEFAULT_LOADER_RESUME_MEMORY_UTILIZATION_RATIO) {
LOG.info("### LOADER Doing Full GC To Try and Reclaim Memory");
System.gc();
waitInterval *= 2;
}
waitStartTime = System.currentTimeMillis();
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
LOG.info("### LOADER Resuming Load. Memory Utilization:" + JVMStats.getHeapUtilizationRatio() + " Pending URL Count:" + (getPendingURLCount() + availableCount));
}
int pendingDiskOperationCount = CrawlList.getPendingDiskOperationCount();
if (!_shutdownFlag && pendingDiskOperationCount > 10000) {
do {
LOG.info("### LOADER Disk Queue: Waiting for pendingDiskOperationCount to drop below threshold - " + pendingDiskOperationCount);
try {
Thread.sleep(1);
} catch (InterruptedException e) {
e.printStackTrace();
}
pendingDiskOperationCount = CrawlList.getPendingDiskOperationCount();
}
while (pendingDiskOperationCount != 0 && !_shutdownFlag);
LOG.info("### LOADER Disk Queue: pendingDiskOperationCount drop below threshold .. continuing load ... ");
}
// if loading this host would push us past the max active URL threshold (plus allowed overflow) ...
if (!_shutdownFlag && getActiveURLCount() >= (getMaxActiveThreshold() + LOADER_OVERFLOW_ALLOWED)) {
LOG.info("### LOADER Exceeded Max Allowed Active URLS Threshold. Going to Sleep...");
// acquire the loader stalled lock ...
_loaderStalledLock.lock();
try {
while (!_shutdownFlag && getActiveURLCount() >= (getMaxActiveThreshold() + LOADER_OVERFLOW_ALLOWED)) {
// set the stall event ...
_loaderStalledEvent = _loaderStalledCondition;
_loaderStallReason = LoaderStallReason.MaxURLS;
LOG.info("## LOADER Stalling on MaxURLS. Waiting on Event...");
// and release lock and wait for the condition to be set ...
try {
while (!_shutdownFlag && !_loaderStalledEvent.await(5000, TimeUnit.MILLISECONDS));
} catch (InterruptedException e) {
}
}
LOG.info("## LOADER Woke Up from Sleep - Continuing Load ...");
}
finally {
// null out loader stalled event ...
_loaderStalledEvent = null;
// we have to unlock the lock once we meet the criteria of our wait state condition ...
_loaderStalledLock.unlock();
}
}
// if the active host count has already reached the max active hosts threshold ...
if (!_shutdownFlag && getActiveHosts() >= getMaxActiveHostsThreshold()) {
LOG.info("### LOADER Active Hosts Count:" + getActiveHosts() +" Exceeds Threshold:" + getMaxActiveHostsThreshold() + " Going to Sleep...");
// acquire the loader stalled lock ...
_loaderStalledLock.lock();
try {
// set the stall event ...
_loaderStalledEvent = _loaderStalledCondition;
_loaderStallReason = LoaderStallReason.ActiveHostCount;
LOG.info("## LOADER Stalling on MaxActiveHosts. Waiting on Event...");
// and release lock and wait for the condition to be set ...
try {
while (!_shutdownFlag && !_loaderStalledEvent.await(5000, TimeUnit.MILLISECONDS));
} catch (InterruptedException e) {
}
LOG.info("## LOADER Woke Up from Sleep - Continuing Load ...");
}
finally {
// null out loader stalled event ...
_loaderStalledEvent = null;
// we have to unlock the lock once we meet the criteria of our wait state condition ...
_loaderStalledLock.unlock();
}
}
// schedule an async event in the main thread ...
_server.getEventLoop().setTimer(new Timer(0,false,new Timer.Callback() {
public void timerFired(Timer timer) {
if (!_shutdownFlag) {
// update segment status ...
status.setUrlCount(status.getUrlCount() +originalURLCount );
status.setUrlsComplete(status.getUrlsComplete() + completedURLCount);
// update crawl status if neccessary ...
if (completedURLCount != originalURLCount && status.getCrawlStatus() != CrawlSegmentStatus.CrawlStatus.CRAWLING) {
status.setCrawlStatus(CrawlSegmentStatus.CrawlStatus.CRAWLING);
}
else if (status.getLoadStatus() == CrawlSegmentStatus.LoadStatus.LOAD_SUCCEEDED && status.getUrlCount() == status.getUrlsComplete()) {
status.setCrawlStatus(CrawlSegmentStatus.CrawlStatus.CRAWL_COMPLETE);
status.setIsComplete(true);
}
// set dirty flag for segment
status.setIsDirty(true);
// update total available count ...
_totalAvailableURLCount += originalURLCount - completedURLCount;
// increment pending count ...
incDecPendingQueuedURLCount(originalURLCount - completedURLCount, 0);
// and finally, submit the host for distribution ...
if (availableCount != 0 && !_shutdownFlag) {
LOG.info("## LOADER distributingSegmentHost:"+ host.getHostName());
distributeSegmentHost(host);
}
}
}
}));
return (!_shutdownFlag);
}
};
}
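/** for reference, hostAvailable() above applies four back-pressure gates, in order:
* (1) heap utilization / pending URL count (sleep-poll, with periodic full GCs),
* (2) CrawlList pending disk operations (sleep-poll until drained),
* (3) max active URLs plus LOADER_OVERFLOW_ALLOWED (condition-variable stall),
* (4) max active hosts (condition-variable stall).
* only after all gates pass is the host handed to distributeSegmentHost() via a
* zero-delay timer on the event thread.
**/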
CompletionCallback<CrawlSegmentStatus> createCompletionCallback(final CrawlSegment crawlSegment,final CrawlSegmentStatus status) {
return new CompletionCallback<CrawlSegmentStatus>() {
public void taskComplete(CrawlSegmentStatus foo) {
if (CrawlEnvironment.detailLoggingEnabled)
LOG.info("### SYNC Task Completion Callback for List:" + crawlSegment.getListId() + " Segment:" + crawlSegment.getSegmentId());
if (!_shutdownFlag) {
if (Environment.detailLogEnabled())
LOG.info("### LOADER Load for Segment:"+crawlSegment.getSegmentId() + " SUCCEEDED");
// update status ...
status.setLoadStatus(CrawlSegmentStatus.LoadStatus.LOAD_SUCCEEDED);
// mark the status dirty ...
status.setIsDirty(true);
}
--_activeLoadCount;
if (!_shutdownFlag) {
// now potentially load any deferred segments ...
potentiallyLoadNextSegment();
}
}
public void taskFailed(Exception e) {
if (CrawlEnvironment.detailLoggingEnabled)
LOG.info("### LOADER Load for Segment:" + crawlSegment.getSegmentId() + " FAILED With Error: " + StringUtils.stringifyException(e));
status.setUrlsComplete(0);
status.setLoadStatus(CrawlSegmentStatus.LoadStatus.LOAD_FAILED);
status.setIsDirty(true);
--_activeLoadCount;
if (!_shutdownFlag) {
// now potentially load any deferred segments ...
potentiallyLoadNextSegment();
}
}
};
}
/** internal loadWorkUnit routine **/
private CrawlSegmentStatus loadCrawlSegment(final CrawlSegment crawlSegment) {
_activeLoadCount++;
// mark the segment as crawling ...
crawlSegment.setIsCrawling(true);
final CrawlSegmentStatus status = new CrawlSegmentStatus();
status.setListId(crawlSegment.getListId());
status.setSegmentId(crawlSegment.getSegmentId());
status.setLoadStatus(CrawlSegmentStatus.LoadStatus.LOADING);
status.setCrawlStatus(CrawlSegmentStatus.CrawlStatus.UNKNOWN);
status.setUrlCount(0);
status.setUrlsComplete(0);
status.setIsDirty(true);
_statusMap.put(CrawlLog.makeSegmentLogId(crawlSegment.getListId(), crawlSegment.getSegmentId()),status);
if (Environment.detailLogEnabled())
LOG.info("loading crawl segment:"+crawlSegment.getSegmentId());
if (!getServer().externallyManageCrawlSegments()) {
// remove crawl segment log from crawl log data structure
// (we need to do this to protect the data structure from corruption, since the underlying
// worker thread walks the log and reconciles it against the segment data)
final CrawlSegmentLog segmentLogObj = (getServer().enableCrawlLog()) ? _crawlLog.removeSegmentLog(crawlSegment.getListId(),crawlSegment.getSegmentId()) : null;
if (segmentLogObj == null && getServer().enableCrawlLog()) {
_activeLoadCount--;
throw new RuntimeException("Expected Non-NULL CrawlSegmentLog for Segment:" + crawlSegment.getSegmentId());
}
getServer().getDefaultThreadPool().execute(new ConcurrentTask<CrawlSegmentStatus>(getServer().getEventLoop(),
new Callable<CrawlSegmentStatus>(){
public CrawlSegmentStatus call() throws Exception {
try {
LOG.info("### SYNC:Loading SegmentFPInfo for List:" + crawlSegment.getListId() + " Segment:" + crawlSegment.getSegmentId());
// load work unit fingerprint detail ...
final CrawlSegmentFPMap urlFPMap = SegmentLoader.loadCrawlSegmentFPInfo(crawlSegment.getListId(),crawlSegment.getSegmentId(),CrawlerEngine.this.getServer().getHostName(),CrawlerEngine.this);
if (_shutdownFlag) {
LOG.info("### SYNC:EXITING LOAD OF List:" + crawlSegment.getListId() + " Segment:" + crawlSegment.getSegmentId());
return new CrawlSegmentStatus();
}
if (getServer().enableCrawlLog()) {
LOG.info("### SYNC: Syncing Log to SegmentFPInfo for List:" + crawlSegment.getListId() + " Segment:" + crawlSegment.getSegmentId());
// re-sync log to segment ...
segmentLogObj.syncToLog(urlFPMap,CrawlerEngine.this);
}
LOG.info("### SYNC: Sync for List:" + crawlSegment.getListId() + " Segment:" + crawlSegment.getSegmentId() + " Returned:" + urlFPMap._urlCount + " Total URLS and " + urlFPMap._urlsComplete + " CompleteURLS");
if (!_shutdownFlag) {
// now activate the segment log ...
final Semaphore segActiveSemaphore = new Semaphore(0);
// check for completion here ...
if (urlFPMap._urlCount == urlFPMap._urlsComplete && !_shutdownFlag) {
LOG.info("### SYNC: For List:" + crawlSegment.getListId() + " Segment:" + crawlSegment.getSegmentId() +" indicates Completed Segment.");
_server.getEventLoop().setTimer(new Timer(1,false,new Timer.Callback() {
public void timerFired(Timer timer) {
LOG.info("### SYNC: For List:" + crawlSegment.getListId() + " Segment:" + crawlSegment.getSegmentId() +" setting Status to CompletedCompleted Segment.");
if (!_shutdownFlag) {
// update segment status ...
status.setUrlCount(urlFPMap._urlCount);
status.setUrlsComplete(urlFPMap._urlCount);
// update crawl status
status.setCrawlStatus(CrawlSegmentStatus.CrawlStatus.CRAWL_COMPLETE);
status.setIsComplete(true);
// set dirty flag for segment
status.setIsDirty(true);
}
// and release semaphore ...
segActiveSemaphore.release();
}
}));
}
else {
_server.getEventLoop().setTimer(new Timer(1,false,new Timer.Callback() {
public void timerFired(Timer timer) {
if (!_shutdownFlag) {
if (getServer().enableCrawlLog()) {
//back in primary thread context, so go ahead and SAFELY re-activate the segment log ...
activateSegmentLog(segmentLogObj);
}
}
// and release semaphore ...
segActiveSemaphore.release();
}
}));
}
// wait for segment activation ...
segActiveSemaphore.acquireUninterruptibly();
}
// now if complete return immediately
if (urlFPMap._urlCount != urlFPMap._urlsComplete && !_shutdownFlag) {
LOG.info("### LOADER Loading CrawlSegment Detail for Segment:" + crawlSegment.getSegmentId());
SegmentLoader.loadCrawlSegment(
crawlSegment.getListId(),
crawlSegment.getSegmentId(),
CrawlerEngine.this.getServer().getHostName(),
urlFPMap,
null,
createLoadProgressCallback(status),
new SegmentLoader.CancelOperationCallback() {
@Override
public boolean cancelOperation() {
return _shutdownFlag;
}
}
);
}
}
catch (Exception e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
return status;
}
},
createCompletionCallback(crawlSegment,status)
)
);
}
else {
getServer().loadExternalCrawlSegment(crawlSegment,createLoadProgressCallback(status),createCompletionCallback(crawlSegment, status),status);
}
return status;
}
/** active the specified segment log **/
private void activateSegmentLog(CrawlSegmentLog log) {
if (getServer().enableCrawlLog()) {
if (Environment.detailLogEnabled())
LOG.info("activating segment log for segment:" + log.getSegmentId());
_crawlLog.addSegmentLog(log);
}
}
private CrawlLog getCrawlLog() {
return _crawlLog;
}
private static String extractHostNameFromCrawlSegmentHost(CrawlSegmentHost host) {
String hostNameOut = null;
if (host.getUrlTargets().size() != 0) {
CrawlSegmentURL url = host.getUrlTargets().get(0);
if (url != null) {
String urlString = url.getUrl();
//LOG.info("Extracting HostName from url:" + urlString);
String extractedHostName = URLUtils.fastGetHostFromURL(urlString);
//LOG.info("Extracted HostName:" + extractedHostName);
return extractedHostName;
}
}
else {
LOG.error("Zero URL Target for Host:" + host.getHostName() + "!");
}
return null;
}
/** resolve IP addresses for remaining hosts in work unit **/
private void distributeSegmentHost(final CrawlSegmentHost host) {
if (_shutdownFlag)
return;
if (Environment.detailLogEnabled())
LOG.info("Distributing Host:" + host.getHostName() + "URL Count:" + host.getUrlTargets().size());
// get host name
final String hostName = extractHostNameFromCrawlSegmentHost(host);
if (hostName == null) {
LOG.error("No Valid Extracted HostName found for host: " + host.getHostName());
processHostIPResolutionResult(host,false,null);
return;
}
// cache lookup happened in segment load ... so just check to see if IP is populated ...
if (host.isFieldDirty(CrawlSegmentHost.Field_IPADDRESS)) {
if (Environment.detailLogEnabled())
LOG.info("Immediately Processing " + host.getUrlTargets().size() + " Items for SuperHost:" + host.getHostName());
// if so, immediately process results
processHostIPResolutionResult(host,false,null);
}
// otherwise queue up for result ...
else {
if ((_highWaterMarkHit && _hostQueuedForResolution <= _dnsHighWaterMark) || (!_highWaterMarkHit && _hostQueuedForResolution < _dnsHighWaterMark)) {
// increment pending resolution count ...
_hostQueuedForResolution++;
// and increment urls pending resolution
if (_hostQueuedForResolution == _dnsHighWaterMark) {
if (!_highWaterMarkHit) {
if (CrawlEnvironment.detailLoggingEnabled)
LOG.info("### DNS High Water Mark Hit. PendingResolutions:" + _hostQueuedForResolution);
_highWaterMarkHit = true;
}
}
if (Environment.detailLogEnabled())
LOG.info("Scheduling Resolution for Host:" + hostName);
// schedule resolution ...
NIODNSQueryClient queryClient = new NIODNSQueryClient() {
@Override
public void AddressResolutionFailure(NIODNSResolver source,String hostName, Status status, String errorDesc) {
// LOG.info("Got AddressResolutionFailure for host:" + hostName);
// decrement pending resolution count ...
_hostQueuedForResolution--;
if (_hostQueuedForResolution <= _dnsLowWaterMark && _highWaterMarkHit) {
if (!_shutdownFlag) {
feedDNSQueue();
}
}
logDNSFailure(host.getHostName(), errorDesc);
LOG.error("Host IP Resolution for Host:"+host.getHostName() + " FAILED with Status:" + status.toString() + " ErrorDesc:" + errorDesc);
if (!_shutdownFlag) {
// now react to the result ...
processHostIPResolutionResult(host,true,null);
}
}
@Override
public void AddressResolutionSuccess(NIODNSResolver source,String hostName, String name, InetAddress address, long addressTTL) {
//LOG.info("Got AddressResolutionSuccess for host:" + hostName);
// decrement pending resolution count ...
_hostQueuedForResolution--;
logDNSQuery(hostName, address, addressTTL, null);
if (_hostQueuedForResolution <= _dnsLowWaterMark && _highWaterMarkHit) {
if (!_shutdownFlag) {
feedDNSQueue();
}
}
int hostAddress = 0;
if (address != null && address.getAddress() != null) {
byte[] addr = address.getAddress();
if (addr.length == 4) {
hostAddress = IPAddressUtils.IPV4AddressToInteger(addr);
}
}
if (hostAddress != 0) {
// and update segment host's ip info ...
host.setIpAddress(hostAddress);
host.setTtl(Math.max(addressTTL,System.currentTimeMillis() + CrawlEnvironment.MIN_DNS_CACHE_TIME));
// log it
// LOG.info(host.getHostName() + " " +IPAddressUtils.IntegerToIPAddressString(hostAddress) + " " + result.getTTL());
if (!_shutdownFlag) {
// now react to the result ...
processHostIPResolutionResult(host,false,null);
}
}
else {
//if (Environment.detailLogEnabled())
LOG.error("Host IP Resolution for Host:"+host.getHostName() + " FAILED with Zero IP");
if (!_shutdownFlag) {
// now react to the result ...
processHostIPResolutionResult(host,true,null);
}
}
}
@Override
public void DNSResultsAvailable() {}
@Override
public void done(NIODNSResolver source,Future<NIODNSQueryResult> task) {
}
};
try {
getServer().getDNSServiceResolver().resolve(queryClient, hostName,false, false,DEFAULT_DNS_TIMEOUT);
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
// decrement pending resolution count ...
_hostQueuedForResolution--;
if (_hostQueuedForResolution <= _dnsLowWaterMark && _highWaterMarkHit) {
if (!_shutdownFlag) {
feedDNSQueue();
}
}
LOG.error("Host IP Resolution for Host:"+host.getHostName() + " FAILED with Exception:" + CCStringUtils.stringifyException(e));
// now react to the result ...
if (!_shutdownFlag) {
processHostIPResolutionResult(host,true,null);
}
}
}
//otherwise if we need to defer this item ...
else {
if (Environment.detailLogEnabled())
LOG.info("Deferring DNS Lookup for Host:" + host.getHostName());
_dnsDeferedHosts.add(host);
}
}
}
Collection<CrawlSegmentHost> breakoutHostsBySubDomain(CrawlSegmentHost incomingHost) {
if (CrawlEnvironment.detailLoggingEnabled)
LOG.info("Breaking out hosts by subdomain for incoming host:" + incomingHost.getHostName());
long timeStart = System.currentTimeMillis();
Map<String,CrawlSegmentHost> mapOfHostsByName = new TreeMap<String,CrawlSegmentHost>();
CrawlSegmentHost lastHost = null;
for (CrawlSegmentURL target : incomingHost.getUrlTargets()) {
String url = target.getUrl();
String hostName = URLUtils.fastGetHostFromURL(url);
if (hostName != null && hostName.length() != 0) {
CrawlSegmentHost host = lastHost;
// compare host names (not a host object to a string) so the lastHost fast path actually works
if (host != null && !host.getHostName().equals(hostName)) {
host = mapOfHostsByName.get(hostName);
}
if (host == null) {
host = new CrawlSegmentHost();
host.setHostName(hostName);
byte hostNameAsBytes[] = hostName.getBytes();
host.setHostFP(FPGenerator.std64.fp(hostNameAsBytes,0,hostNameAsBytes.length));
host.setSegmentId(incomingHost.getSegmentId());
mapOfHostsByName.put(hostName, host);
}
lastHost = host;
host.getUrlTargets().add(target);
}
}
long timeEnd = System.currentTimeMillis();
if (CrawlEnvironment.detailLoggingEnabled)
LOG.info("Breakout took:" + (timeEnd - timeStart) + "MS and returned:" + mapOfHostsByName.size() + " Elements");
return mapOfHostsByName.values();
}
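// illustrative example (hypothetical URLs): given an incoming host whose targets are
//   http://a.example.com/1, http://b.example.com/2, http://a.example.com/3
// breakoutHostsBySubDomain() returns two CrawlSegmentHost entries:
//   a.example.com -> { /1, /3 }   and   b.example.com -> { /2 }
// the lastHost check above is a fast path for the common case where consecutive
// targets share the same subdomain.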
void feedDNSQueue() {
if (Environment.detailLogEnabled())
LOG.info("### DNS Feeding DNS Queue. PendingResolutions:" + _hostQueuedForResolution);
_highWaterMarkHit = false;
while(!_highWaterMarkHit && _hostQueuedForResolution < _dnsHighWaterMark) {
if (_dnsDeferedHosts.size() != 0) {
distributeSegmentHost(_dnsDeferedHosts.removeFirst());
/*
CrawlSegmentHost masterHost = ;
for (CrawlSegmentHost host : breakoutHostsBySubDomain(masterHost)) {
;
}
*/
}
else {
break;
}
}
}
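/** sketch of the DNS high/low watermark flow control implemented by
* distributeSegmentHost() and feedDNSQueue() (watermark values illustrative):
*
*   // with _dnsHighWaterMark = 100 and _dnsLowWaterMark = 50:
*   // - hosts are submitted for resolution until 100 are in flight, at which
*   //   point _highWaterMarkHit is set and new hosts queue in _dnsDeferedHosts
*   // - each resolution callback decrements the in-flight count; once it drops
*   //   to <= 50, feedDNSQueue() drains deferred hosts until the high watermark
*   //   is reached again
**/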
/** process host ip resolution results **/
public void processHostIPResolutionResult(CrawlSegmentHost host,boolean failed,CrawlItemStatusCallback callback) {
synchronized (_crawlerStats) {
if (failed) {
_crawlerStats.setFailedDNSRequests(_crawlerStats.getFailedDNSRequests() + 1);
}
else {
_crawlerStats.setSuccessfullDNSRequests(_crawlerStats.getSuccessfullDNSRequests() + 1);
}
}
long timeStart = System.currentTimeMillis();
boolean blackListedHost = false;
// check to see if the host is blocked ...
if (isBlackListedHost(host.getHostName())) {
if (Environment.detailLogEnabled())
LOG.info("Rejecting " + host.getUrlTargets().size() + " URLS for Black Listed Host:" + host.getHostName());
blackListedHost = true;
}
if (isBlackListedIPAddress(host.getIpAddress())) {
if (Environment.detailLogEnabled())
LOG.info("Rejecting " + host.getUrlTargets().size() + " URLS for Black Listed IP:" + IPAddressUtils.IntegerToIPAddressString(host.getIpAddress()));
blackListedHost = true;
}
// walk urls in host ...
CrawlSegmentURL segmentURL = null;
// capture original url count ...
int originalURLCount = host.getUrlTargets().size();
// decrement pending url count ... and artificially inflate queued count (for now)
incDecPendingQueuedURLCount(-originalURLCount,originalURLCount);
// increment processed url count ...
_totalProcessedURLCount += originalURLCount;
// ok, sort targets by position first
Collections.sort(host.getUrlTargets(),new Comparator<CrawlSegmentURL>() {
@Override
public int compare(CrawlSegmentURL o1, CrawlSegmentURL o2) {
return (o1.getOriginalPosition() < o2.getOriginalPosition()) ? -1 : (o1.getOriginalPosition() > o2.getOriginalPosition()) ? 1 : 0;
}
});
// now walk targets ...
for (int i=0;i<host.getUrlTargets().size();++i) {
// get the url at the current index ..
segmentURL = host.getUrlTargets().get(i);
boolean badSessionIDURL = false;
boolean malformedURL = false;
URLFP fp = URLUtils.getURLFPFromURL(segmentURL.getUrl(),false);
if (fp == null) {
malformedURL = true;
}
if (!failed && !blackListedHost) {
/*
try {
if (_sessionIDNormalizer.normalize(segmentURL.getUrl(), "") != segmentURL.getUrl()) {
badSessionIDURL = true;
}
} catch (MalformedURLException e) {
LOG.error("Malformed URL Detected during SessionID Normalize. URL:" + segmentURL.getUrl() + " Exception:" + CCStringUtils.stringifyException(e));
malformedURL = true;
}
*/
}
// if ip address is zero... this indicates a dns failure ... react accordingly ...
if (failed || blackListedHost || badSessionIDURL || malformedURL) {
// remove the item from the list ...
host.getUrlTargets().remove(i);
--i;
if (failed) {
CrawlTarget.failURL(CrawlTarget.allocateCrawlURLFromSegmentURL(host.getSegmentId(),host,segmentURL,false),null, CrawlURL.FailureReason.DNSFailure,"DNS Failed during URL Distribution");
}
else {
if (blackListedHost)
CrawlTarget.failURL(CrawlTarget.allocateCrawlURLFromSegmentURL(host.getSegmentId(),host,segmentURL,false),null, CrawlURL.FailureReason.BlackListedHost,"URL Rejected - Black Listed Host");
else if (badSessionIDURL)
CrawlTarget.failURL(CrawlTarget.allocateCrawlURLFromSegmentURL(host.getSegmentId(),host,segmentURL,false),null, CrawlURL.FailureReason.MalformedURL,"URL Rejected - Bad SessionID URL");
else
CrawlTarget.failURL(CrawlTarget.allocateCrawlURLFromSegmentURL(host.getSegmentId(),host,segmentURL,false),null, CrawlURL.FailureReason.MalformedURL,"URL Rejected - Malforned URL");
}
}
else {
String url = segmentURL.getUrl();
// ... identify protocol ...
CrawlQueue.Protocol protocol = CrawlQueue.identifyProtocol(url);
// we only support http for now ...
if (protocol != CrawlQueue.Protocol.HTTP) {
LOG.error("No protocol available for URL:"+url + " Segment:"+host.getSegmentId());
// remove the item from the list ...
host.getUrlTargets().remove(i);
--i;
// immediately fail this url ...
CrawlTarget.failURL(CrawlTarget.allocateCrawlURLFromSegmentURL(host.getSegmentId(),host,segmentURL,false),null, CrawlURL.FailureReason.UnknownProtocol,"Unknown Protocol encountered during URL Distribution.");
}
}
}
// now if we have some urls to crawl ..
if (host.getUrlTargets().size() != 0) {
// remember original size ..
int originalSize = host.getUrlTargets().size();
// submit it to the http crawl queue
int queuedSize = _httpCrawlQueue.queueHost(host.getSegmentId(),host.getListId(),host,callback);
// check delta ..
if (queuedSize < originalSize) {
if (Environment.detailLogEnabled())
LOG.info(Integer.toString(originalSize - queuedSize) + " Entries DROPPED for Domain:" + host.getHostName());
// decrement queued count again by the number of entries that were dropped on the floor ...
incDecPendingQueuedURLCount(0,-(originalSize - queuedSize));
}
}
host.clear();
long timeEnd = System.currentTimeMillis();
_dnsProcessResultsTime.addSample((double)(timeEnd - timeStart));
}
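/** build the record store key for a crawl segment from its list id and segment id **/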
private static String buildCrawlSegmentKey(int listId,int segmentId) {
CrawlSegment segment = new CrawlSegment();
segment.setListId(listId);
segment.setSegmentId(segmentId);
return segment.getKey();
}
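/** load the next queued crawl segment, but only if no segment scan or segment load is in flight and the active url count is below the max active threshold **/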
private void potentiallyLoadNextSegment() {
if (_segmentScanPending == 0) {
// if active url count is less than max threshold ..
if (_activeLoadCount == 0 && _segmentLoadQueue.size() != 0 && getActiveURLCount() < getMaxActiveThreshold()) {
CrawlSegmentStatus loadTarget = _segmentLoadQueue.remove();
// now if a load target was found ...
if (loadTarget != null) {
// get the crawl segment object ...
CrawlSegment segment = crawlSegmentFromCrawlSegmentStatus(loadTarget);
if (Environment.detailLogEnabled())
LOG.info("potenitallyLoadNextSegment returned segment:" + segment.getSegmentId() + " from list:" + segment.getListId());
loadCrawlSegment(segment);
}
}
}
}
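/** callback invoked once the crawl (and, when the crawl log is enabled, the log flusher) has fully stopped **/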
public interface CrawlStopCallback {
public void crawlStopped();
}
/** stop the crawl process and clear all host queues **/
public void stopCrawl(final CrawlStopCallback callback) {
if (_crawlActive) {
LOG.info("stopCrawl - stopping HttpQueue");
_httpCrawlQueue.stopCrawl();
// stop disk queue thread
CrawlList.stopDiskQueueingThread();
stopStatsCollector();
if (getServer().enableCrawlLog()) {
LOG.info("stopCrawl - stopping LogFlusher");
_crawlLog.stopLogFlusher(new LogFlusherStopActionCallback() {
public void stopComplete() {
_crawlActive = false;
_crawlStopped = true;
LOG.info("stopCrawl - LogFlusher Stop Complete");
// notify caller if necessary ...
if (callback != null) {
LOG.info("stopCrawl - LogFlusher Stop Complete - Notifying Caller");
callback.crawlStopped();
}
}
});
}
else {
_crawlActive = false;
if (callback != null) {
callback.crawlStopped();
}
}
}
else {
if (callback != null) {
callback.crawlStopped();
}
}
}
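/* a minimal usage sketch (the receiver variable "engine" here is hypothetical);
note that the callback may fire asynchronously, only after the crawl log
flusher has drained:

engine.stopCrawl(new CrawlStopCallback() {
public void crawlStopped() {
LOG.info("crawl fully stopped");
}
});
*/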
/** start crawling **/
public void startCrawl() {
if (!_crawlActive) {
if (Environment.detailLogEnabled())
LOG.info("startCrawl");
startStatsCollector();
_httpCrawlQueue.startCrawl(_crawlStopped);
_crawlStopped = false;
if (getServer().enableCrawlLog()) {
_crawlLog.startLogFlusher();
}
_crawlActive = true;
}
}
/** clear persistent and in memory state **/
public void clearState() {
// stop the crawl ...
stopCrawl(null);
// stop stats collection timer ...
stopStatsCollector();
// clear queues ...
_httpCrawlQueue.clear();
// clear internal data structures ...
_statusMap.clear();
}
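/** write a fixed width success log line for a robots.txt GET; the columns are: timestamp, local address, resolved address, http result code, download length, connect time, upload time, download time, optional redirect url, and the original url (logSuccessfulGET below shares the same layout, substituting raw content length for download length) **/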
public final void logSuccessfulRobotsGET(NIOHttpConnection connection, CrawlTarget url) {
StringBuilder sb = new StringBuilder();
sb.append(String.format("%1$20.20s ",CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis())));
sb.append(String.format("%1$16.16s ",(connection.getLocalAddress()!=null) ? connection.getLocalAddress().getAddress() : "UNDEFINED"));
sb.append(String.format("%1$16.16s ",connection.getResolvedAddress()));
sb.append(String.format("%1$4.4s ", url.getResultCode()));
sb.append(String.format("%1$10.10s ",connection.getDownloadLength()));
sb.append(String.format("%1$10.10s ",connection.getConnectTime()));
sb.append(String.format("%1$10.10s ",connection.getUploadTime()));
sb.append(String.format("%1$10.10s ",connection.getDownloadTime()));
if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
sb.append(url.getRedirectURL());
sb.append(" ");
}
sb.append(url.getOriginalURL());
getSuccessLog().info(sb.toString());
}
public final void logSuccessfulGET(NIOHttpConnection connection, CrawlURL url) {
StringBuilder sb = new StringBuilder();
sb.append(String.format("%1$20.20s ",CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis())));
sb.append(String.format("%1$16.16s ",(connection.getLocalAddress()!=null) ? connection.getLocalAddress().getAddress() : "UNDEFINED"));
sb.append(String.format("%1$16.16s ",connection.getResolvedAddress()));
sb.append(String.format("%1$4.4s ", url.getResultCode()));
sb.append(String.format("%1$10.10s ",url.getContentRaw().getCount()));
sb.append(String.format("%1$10.10s ",connection.getConnectTime()));
sb.append(String.format("%1$10.10s ",connection.getUploadTime()));
sb.append(String.format("%1$10.10s ",connection.getDownloadTime()));
if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
sb.append(url.getRedirectURL());
sb.append(" ");
}
sb.append(url.getUrl());
getSuccessLog().info(sb.toString());
}
void fetchStarting(CrawlTarget target,NIOHttpConnection connection) {
_server.fetchStarting(target,connection);
}
/** callback triggered whenever a crawl of an item succeeds or fails .. **/
void crawlComplete(NIOHttpConnection connection,CrawlURL url,CrawlTarget optTargetObj,boolean success) {
if (getServer().enableCrawlLog()) {
// if robots get
if ((url.getFlags() & CrawlURL.Flags.IsRobotsURL) != 0) {
getCrawlLog().getRobotsSegment().completeItem(url);
}
// else regular fetch ...
else {
// ok check to see if we have a parse queue
if (_server.isParseQueueEnabled()) {
if (success && url.isFieldDirty(CrawlURL.Field_CRAWLDIRECTIVEJSON)
&& url.getCrawlDirectiveJSONAsTextBytes().getLength() != 0) {
try {
if (potentiallyAddToParseQueue(url)) {
// mark as in parse queue
url.setFlags(url.getFlags() | CrawlURL.Flags.InParseQueue);
}
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
//update segment log ...
CrawlSegmentLog segmentLog = getCrawlLog().getLogForSegment(url.getListId(),(int)url.getCrawlSegmentId());
if (segmentLog != null) {
segmentLog.completeItem(url);
}
else {
LOG.error("Segement Log for List:" + url.getListId() + " Segment:"+ url.getCrawlSegmentId() + " is NULL (during CrawlComplete) for URL:" + url.getUrl());
}
// IFF target has no valid segment id ... then this is a high priority request ... delegate to outer controller
if (url.getCrawlSegmentId() == -1) {
// notify server (in case it delegates the call)
getServer().crawlComplete(connection, url, optTargetObj,success);
}
else {
CrawlSegmentStatus workUnitStatus = _statusMap.get(CrawlLog.makeSegmentLogId(url.getListId(), (int)url.getCrawlSegmentId()));
// update work unit stats ...
if (workUnitStatus != null) {
workUnitStatus.setUrlsComplete(workUnitStatus.getUrlsComplete() + 1);
if (workUnitStatus.getUrlCount() == workUnitStatus.getUrlsComplete()) {
workUnitStatus.setCrawlStatus(CrawlSegmentStatus.CrawlStatus.CRAWL_COMPLETE);
}
workUnitStatus.setIsDirty(true);
if (!getServer().externallyManageCrawlSegments()) {
getServer().updateCrawlSegmentStatus((int)url.getCrawlSegmentId(),workUnitStatus);
}
}
else {
LOG.error("CrawlSegmentStatus for List:" + url.getListId() + " Segment:"+ url.getCrawlSegmentId() + " is NULL (during CrawlComplete).");
}
}
}
}
else {
// notify server (in case it delegates the call)
getServer().crawlComplete(connection, url, optTargetObj,success);
}
// update stats
if (success) {
_successURLCount++;
synchronized(_crawlerStats) {
_crawlerStats.setUrlsSucceeded(_crawlerStats.getUrlsSucceeded() + 1);
_crawlerStats.setUrlsProcessed(_crawlerStats.getUrlsProcessed() + 1);
if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
switch (url.getOriginalResultCode()) {
case 301: _crawlerStats.setHttp301Count(_crawlerStats.getHttp301Count() + 1);break;
case 302: _crawlerStats.setHttp302Count(_crawlerStats.getHttp302Count() + 1);break;
default: _crawlerStats.setHttp300Count(_crawlerStats.getHttp300Count() + 1);break;
}
if (optTargetObj != null) {
switch (optTargetObj.getRedirectCount()) {
case 1: _crawlerStats.setRedirectResultAfter1Hops(_crawlerStats.getRedirectResultAfter1Hops() + 1);break;
case 2: _crawlerStats.setRedirectResultAfter2Hops(_crawlerStats.getRedirectResultAfter2Hops() + 1);break;
case 3: _crawlerStats.setRedirectResultAfter3Hops(_crawlerStats.getRedirectResultAfter3Hops() + 1);break;
default: _crawlerStats.setRedirectResultAfterGT3Hops(_crawlerStats.getRedirectResultAfterGT3Hops() + 1);break;
}
}
}
// ok now process the http result code ...
if (url.getResultCode() >= 200 && url.getResultCode() < 300) {
_crawlerStats.setHttp200Count(_crawlerStats.getHttp200Count() + 1);
}
else if (url.getResultCode() >= 300 && url.getResultCode() < 400) {
switch (url.getResultCode()) {
case 301: _crawlerStats.setHttp301Count(_crawlerStats.getHttp301Count() + 1);break;
case 302: _crawlerStats.setHttp302Count(_crawlerStats.getHttp302Count() + 1);break;
case 304: _crawlerStats.setHttp304Count(_crawlerStats.getHttp304Count() + 1);break;
default: _crawlerStats.setHttp300Count(_crawlerStats.getHttp300Count() + 1);break;
}
}
else if (url.getResultCode() >= 400 && url.getResultCode() < 500) {
switch (url.getResultCode()) {
case 403: _crawlerStats.setHttp403Count(_crawlerStats.getHttp403Count() + 1);break;
case 404: _crawlerStats.setHttp404Count(_crawlerStats.getHttp404Count() + 1);break;
default: _crawlerStats.setHttp400Count(_crawlerStats.getHttp400Count() + 1);break;
}
}
else if (url.getResultCode() >= 500 && url.getResultCode() < 600) {
_crawlerStats.setHttp500Count(_crawlerStats.getHttp500Count() + 1);
}
else {
_crawlerStats.setHttpOtherCount(_crawlerStats.getHttpOtherCount() + 1);
}
}
logSuccessfulGET(connection,url);
}
else {
_failedURLCount++;
synchronized(_crawlerStats) {
_crawlerStats.setUrlsFailed(_crawlerStats.getUrlsFailed() + 1);
_crawlerStats.setUrlsProcessed(_crawlerStats.getUrlsProcessed() + 1);
switch (url.getLastAttemptFailureReason()) {
case CrawlURL.FailureReason.UnknownProtocol: _crawlerStats.setHttpErrorUnknownProtocol(_crawlerStats.getHttpErrorUnknownProtocol() + 1);break;
case CrawlURL.FailureReason.MalformedURL: _crawlerStats.setHttpErrorMalformedURL(_crawlerStats.getHttpErrorMalformedURL() + 1);break;
case CrawlURL.FailureReason.Timeout: _crawlerStats.setHttpErrorTimeout(_crawlerStats.getHttpErrorTimeout() + 1);break;
case CrawlURL.FailureReason.DNSFailure: _crawlerStats.setHttpErrorDNSFailure(_crawlerStats.getHttpErrorDNSFailure() + 1);break;
case CrawlURL.FailureReason.ResolverFailure: _crawlerStats.setHttpErrorResolverFailure(_crawlerStats.getHttpErrorResolverFailure() + 1);break;
case CrawlURL.FailureReason.IOException: _crawlerStats.setHttpErrorIOException(_crawlerStats.getHttpErrorIOException() + 1);break;
case CrawlURL.FailureReason.RobotsExcluded: _crawlerStats.setHttpErrorRobotsExcluded(_crawlerStats.getHttpErrorRobotsExcluded() + 1);break;
case CrawlURL.FailureReason.NoData: _crawlerStats.setHttpErrorNoData(_crawlerStats.getHttpErrorNoData() + 1);break;
case CrawlURL.FailureReason.RobotsParseError: _crawlerStats.setHttpErrorRobotsParseError(_crawlerStats.getHttpErrorRobotsParseError() + 1);break;
case CrawlURL.FailureReason.RedirectFailed: _crawlerStats.setHttpErrorRedirectFailed(_crawlerStats.getHttpErrorRedirectFailed() + 1);break;
case CrawlURL.FailureReason.RuntimeError: _crawlerStats.setHttpErrorRuntimeError(_crawlerStats.getHttpErrorRuntimeError() + 1);break;
case CrawlURL.FailureReason.ConnectTimeout: _crawlerStats.setHttpErrorConnectTimeout(_crawlerStats.getHttpErrorConnectTimeout() + 1);break;
case CrawlURL.FailureReason.BlackListedHost: _crawlerStats.setHttpErrorBlackListedHost(_crawlerStats.getHttpErrorBlackListedHost() + 1);break;
case CrawlURL.FailureReason.BlackListedURL: _crawlerStats.setHttpErrorBlackListedURL(_crawlerStats.getHttpErrorBlackListedURL() + 1);break;
case CrawlURL.FailureReason.TooManyErrors: _crawlerStats.setHttpErrorTooManyErrors(_crawlerStats.getHttpErrorTooManyErrors() + 1);break;
case CrawlURL.FailureReason.InCache: _crawlerStats.setHttpErrorInCache(_crawlerStats.getHttpErrorInCache() + 1);break;
case CrawlURL.FailureReason.InvalidResponseCode: _crawlerStats.setHttpErrorInvalidResponseCode(_crawlerStats.getHttpErrorInvalidResponseCode() + 1);break;
case CrawlURL.FailureReason.BadRedirectData: _crawlerStats.setHttpErrorBadRedirectData(_crawlerStats.getHttpErrorBadRedirectData() + 1);break;
default: _crawlerStats.setHttpErrorUNKNOWN(_crawlerStats.getHttpErrorUNKNOWN() + 1);
}
}
}
if (getServer().enableCrawlLog()) {
// either way decrement queued count ...
incDecPendingQueuedURLCount(0,-1);
// now potentially load any deferred segments ...
potentiallyLoadNextSegment();
}
}
/** callback used by crawl log to notify engine of a segment that should be marked as completed ... **/
void crawlSegmentComplete(long packedSegmentId) {
CrawlSegmentStatus segmentStatus = _statusMap.get(packedSegmentId);
if (segmentStatus != null) {
// mark the segment as completed ...
segmentStatus.setIsComplete(true);
}
}
/*
void purgeCrawlSegments(CrawlSegmentList segmentList) {
for (long packedSegmentId : segmentList.getSegments()) {
CrawlSegmentStatus status = _statusMap.get(packedSegmentId);
if (status != null && status.getIsComplete() == true) {
CrawlSegmentLog segmentLog = _crawlLog.getLogForSegment(CrawlLog.getListIdFromLogId(packedSegmentId),CrawlLog.getSegmentIdFromLogId(packedSegmentId));
if (segmentLog == null || segmentLog.isSegmentComplete()) {
// remove from map ...
_statusMap.remove(packedSegmentId);
// remove from log ...
_crawlLog.removeSegmentLog(CrawlLog.getListIdFromLogId(packedSegmentId),CrawlLog.getSegmentIdFromLogId(packedSegmentId));
try {
// remove from database
_recordStore.beginTransaction();
_recordStore.deleteRecordByKey(CrawlSegmentKeyPrefix + buildCrawlSegmentKey(status.getListId(),status.getSegmentId()));
_recordStore.commitTransaction();
}
catch (RecordStoreException e) {
LOG.error("purge of crawl segment record with list id:"+ status.getListId() + "segment id:" + status.getSegmentId() + " threw Exception:" + CCStringUtils.stringifyException(e));
}
}
}
}
}
*/
public void dumpQueueDetailsToHTML(JspWriter out)throws IOException {
_httpCrawlQueue.dumpDetailsToHTML(out);
}
public void dumpHostDetailsToHTML(JspWriter out, int hostIP)throws IOException {
_httpCrawlQueue.dumpHostDetailsToHTML(out, hostIP);
}
public void pauseFetch() {
LOG.info("PAUSING CRAWL");
_httpCrawlQueue.pauseCrawl();
}
public void resumeFetch() {
LOG.info("RESUMING CRAWL");
_httpCrawlQueue.resumeCrawl();
}
public void logDNSFailure(String hostName, String errorDescription) {
synchronized(_DNSFailureLog) {
_DNSFailureLog.error(hostName + "," + errorDescription);
}
}
public void logDNSQuery(String hostName, InetAddress address, long ttl,String opCName) {
synchronized(_DNSSuccessLog) {
_DNSSuccessLog.info(hostName + "," + address.toString() + "," + ttl + "," + opCName);
}
}
/** black list the given host name by caching a zero ip address for it in the bad domain cache **/
public void failDomain(String domainName) {
_badDomainCache.cacheIPAddressForHost(domainName, 0,0, null);
}
public boolean isBadDomain(String domainName) {
return _badDomainCache.findNode(domainName) != null;
}
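/** queue a url that did not arrive via a crawl segment: extract the host name, fail immediately if the url is malformed, otherwise resolve the host asynchronously and either hand the url to the http crawl queue or fail it back through the optional callback **/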
public void queueExternalURL(final String url,final long fingerprint,final boolean highPriorityRequest,final CrawlItemStatusCallback callback) {
// validate the url ...
String hostName = URLUtils.fastGetHostFromURL(url);
if (hostName == null) {
if (callback != null) {
callback.crawlComplete(null,CrawlTarget.allocateCrawlURLForFailure(url,fingerprint,CrawlURL.FailureReason.MalformedURL,"URL Rejected - Malformed URL"),null,false);
}
else {
if (Environment.detailLogEnabled())
LOG.error("queueExternalURL for URL:" + url + " Failed with:URL Rejected - Bad SessionID URL");
}
}
else {
// schedule resolution ...
NIODNSQueryClient queryClient = new NIODNSQueryClient() {
@Override
public void AddressResolutionFailure(NIODNSResolver source,String hostName, Status status, String errorDesc) {
// LOG.info("DNS Failed for High Priority Request:" + hostName + " Errror:" + errorDesc);
if (callback != null) {
callback.crawlComplete(null,CrawlTarget.allocateCrawlURLForFailure(url,fingerprint,CrawlURL.FailureReason.DNSFailure,errorDesc),null,false);
}
else {
if (Environment.detailLogEnabled())
LOG.error("queueExternalURL for URL:" + url + " Failed with:DNS Failed for High Priority Request:" + hostName + " Errror:" + errorDesc);
}
}
@Override
public void AddressResolutionSuccess(NIODNSResolver source,String hostName, String name, InetAddress address, long addressTTL) {
int hostAddress = 0;
if (address != null && address.getAddress() != null) {
byte[] addr = address.getAddress();
if (addr.length == 4) {
hostAddress = IPAddressUtils.IPV4AddressToInteger(addr);
}
}
if (hostAddress != 0) {
// LOG.info("DNS Success for High Priority URL:" + url + "IP:" + address.toString());
_httpCrawlQueue.queueExternalURLRequest(url, getServer().getHighPriorityListId(), fingerprint, hostName, hostAddress, addressTTL + 30000000,highPriorityRequest, callback);
}
else {
// LOG.error("DNS Failed for High Priority URL:"+ url + " with Zero IP");
if (callback != null) {
callback.crawlComplete(null,CrawlTarget.allocateCrawlURLForFailure(url,fingerprint,CrawlURL.FailureReason.DNSFailure,"Invalid IP Address"),null,false);
}
else {
if (Environment.detailLogEnabled())
LOG.error("queueExternalURL for URL:" + url + " Failed with:DNS Failed for High Priority URL:"+ url + " with Zero IP");
}
}
}
@Override
public void DNSResultsAvailable() {}
@Override
public void done(NIODNSResolver source,Future<NIODNSQueryResult> task) {
}
};
try {
getServer().getDNSServiceResolver().resolve(queryClient, hostName,false, true,DEFAULT_DNS_TIMEOUT);
} catch (IOException e) {
// LOG.error("Failed to Dispatch DNS Query for High Priority URL:" + url + " Exception:" + CCStringUtils.stringifyException(e));
if (callback != null) {
callback.crawlComplete(null,CrawlTarget.allocateCrawlURLForFailure(url,fingerprint,CrawlURL.FailureReason.ResolverFailure,CCStringUtils.stringifyException(e)),null,false);
}
else {
if (Environment.detailLogEnabled())
LOG.error("queueExternalURL for URL:" + url + " Failed with:Exception:" + CCStringUtils.stringifyException(e));
}
}
}
}
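/** queue an externally constructed segment host whose ip address has already been resolved, bypassing the DNS queue **/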
public void queueExternalCrawlSegmentHost(CrawlSegmentHost host,CrawlItemStatusCallback callback){
processHostIPResolutionResult(host,false,callback);
}
public enum RobotsLogEventType {
HTTP_GET_Complete,
HTTP_GET_Failed,
Parse_Succeeded,
Parse_Failed
}
public static final int RobotsParseFlag_ExcludesAll = 1 << 0;
public static final int RobotsParseFlag_ExplicitMention = 1 << 1;
public static final int RobotsParseFlag_HasCrawlDelay = 1 << 2;
public static final int RobotsParseFlag_ContentDecodeFailed = 1 << 3;
public static final int RobotsParseFlag_ContentWasHTML = 1 << 4; // was 1 << 3, which collided with RobotsParseFlag_ContentDecodeFailed and made the two flags indistinguishable
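/* the parse flags are independent bits and may be OR-ed together when calling
logRobots; a hedged illustration (not taken from an actual call site):

int flags = RobotsParseFlag_ExplicitMention | RobotsParseFlag_HasCrawlDelay;
*/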
/** log robots fetch **/
public synchronized void logRobots(long fetchTime,String domain,int httpResultCode,String robotsData,RobotsLogEventType eventType, int flags) {
StringBuilder sb = new StringBuilder(2048);
sb.append(String.format("%1$24.24s ",robotsLogDateFormat.format(new Date(fetchTime))));
sb.append(String.format("%1$40.40s ",domain));
switch (eventType) {
case HTTP_GET_Complete: {
sb.append(String.format("%1$12.12s ","GET_COMPLETE"));
}
break;
case HTTP_GET_Failed: {
sb.append(String.format("%1$12.12s ","GET_FAILURE"));
}
break;
case Parse_Succeeded: {
sb.append(String.format("%1$12.12s ","PARSE_SUCCESS"));
}
break;
case Parse_Failed: {
sb.append(String.format("%1$12.12s ","PARSE_FAILED"));
}
break;
}
sb.append(String.format("%1$4.4s ",httpResultCode));
if (eventType == RobotsLogEventType.HTTP_GET_Complete) {
if (robotsData != null && robotsData.length() != 0) {
sb.append("\n****CONTENT-START****\n");
sb.append(robotsData,0,Math.min(robotsData.length(),8192));
sb.append("\n****CONTENT-END ****\n");
}
else if ((flags & RobotsParseFlag_ContentDecodeFailed) != 0) {
sb.append(" ContentDecodeFailed");
}
else if ((flags & RobotsParseFlag_ContentWasHTML) != 0) {
sb.append(" ContentWasHTML");
}
}
else if (eventType == RobotsLogEventType.Parse_Succeeded) {
if ((flags & RobotsParseFlag_ExcludesAll) != 0) {
sb.append("ExcludesAll ");
}
if ((flags & RobotsParseFlag_ExplicitMention) != 0) {
sb.append("ExplicitMention ");
}
if ((flags & RobotsParseFlag_HasCrawlDelay) != 0) {
sb.append("HasCrawlDelay ");
}
}
_RobotsLog.error(sb.toString());
}
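/** serialize the set of currently active host ips as a vint count followed by one vint per ip; returns null when the crawl is inactive.

a minimal decoding sketch for the consumer side (assuming FlexBuffer exposes its backing array via get()/getOffset()/getCount()):

DataInputBuffer in = new DataInputBuffer();
in.reset(buffer.get(), buffer.getOffset(), buffer.getCount());
int count = WritableUtils.readVInt(in);
for (int i = 0; i < count; i++) {
int hostIP = WritableUtils.readVInt(in);
}
**/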
FlexBuffer getActiveHostListAsBuffer() throws IOException {
if (_crawlActive && _httpCrawlQueue != null) {
DataOutputBuffer outputBuffer = new DataOutputBuffer();
Set<Integer> ipAddressSet = _httpCrawlQueue.getActiveHostIPs();
WritableUtils.writeVInt(outputBuffer, ipAddressSet.size());
for (int hostIP : ipAddressSet) {
WritableUtils.writeVInt(outputBuffer, hostIP);
}
return new FlexBuffer(outputBuffer.getData(),0,outputBuffer.getLength());
}
return null;
}
@Override
public boolean cancelOperation() {
return _shutdownFlag;
}
}