/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.commoncrawl.service.crawler;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeMap;
import javax.servlet.jsp.JspWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.async.Timer;
import org.commoncrawl.common.Environment;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.protocol.CrawlURL.FailureReason;
import org.commoncrawl.service.statscollector.CrawlerStats;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.URLUtils;
import com.google.common.collect.ImmutableSet;
/**
* A queue that manages a set of Crawlable Hosts
*
* @author rana
*
*/
public final class CrawlQueue {

  /** constants **/
  private static final int SCHEDULER_QUEUE_INITIAL_SIZE = 10000;
  /** interval (ms) at which the scheduler timer scans for expired host waits **/
  private static final int SCHEDULER_SCAN_INTERVAL = 1000;
  /** interval (ms) between idle-host purge scans **/
  private static final int IDLE_SCAN_INTERVAL = 30000;

  /** logging **/
  private static final Log LOG = LogFactory.getLog(CrawlQueue.class);

  /** protocols a crawl queue can be associated with **/
  public enum Protocol {
    UNKNOWN,
    HTTP,
    HTTPS,
    FTP
  }

  /** protocol serviced by this queue **/
  private Protocol _protocol;
  /** fetcher used to download queued crawl targets **/
  private Fetcher _fetcher;
  /** set once startCrawl has been invoked **/
  private boolean _active = false;
  /** periodic scheduler timer (field name fixed: was misspelled _scheulderTimer) **/
  private Timer _schedulerTimer = null;
  /** timestamp of the last idle-host scan, or -1 if none has run yet **/
  private long _lastIdleCheckTime = -1;
  /** lifetime count of idle hosts purged from this queue **/
  private long _purgedHostCount = 0;

  /** active host map - keyed by host ip address **/
  private TreeMap<Integer, CrawlQueueHost> _activeHosts = new TreeMap<Integer, CrawlQueueHost>();
  /** idle host map - keyed by host ip address **/
  private TreeMap<Integer, CrawlQueueHost> _idleHosts = new TreeMap<Integer, CrawlQueueHost>();

  /** scheduled host priority queue - ordered by earliest wait-time expiration **/
  private PriorityQueue<CrawlQueueHost> _schedulerQueue = new PriorityQueue<CrawlQueueHost>(
      SCHEDULER_QUEUE_INITIAL_SIZE,
      new Comparator<CrawlQueueHost>() {

        public int compare(CrawlQueueHost host1, CrawlQueueHost host2) {
          // overflow-safe equivalent of the original manual </> comparison
          return Long.compare(host1.getWaitTime(), host2.getWaitTime());
        }
      });

  public CrawlQueue(Protocol protocol, Fetcher fetcher) {
    _protocol = protocol;
    _fetcher = fetcher;
  }

  /**
   * installs the periodic scheduler timer. Each tick releases hosts whose
   * wait time has expired and - every IDLE_SCAN_INTERVAL ms - runs the
   * idle host purge scan.
   */
  private void setScheduleTimer() {
    _schedulerTimer = new Timer(SCHEDULER_SCAN_INTERVAL, true, new Timer.Callback() {

      public void timerFired(Timer timer) {
        // list of hosts that have cleared the wait state ...
        LinkedList<CrawlQueueHost> readyList = new LinkedList<CrawlQueueHost>();
        CrawlQueueHost item = null;
        // primitive long - the original needlessly boxed this into a Long
        long currentTime = System.currentTimeMillis();

        // queue is ordered by wait time, so we can stop at the first host
        // whose timer has not yet expired ...
        while ((item = _schedulerQueue.peek()) != null) {
          if (currentTime >= item.getWaitTime()) {
            // remove from queue and collect for release ...
            _schedulerQueue.remove();
            readyList.add(item);
          } else {
            // earliest remaining wait time still in the future - done
            break;
          }
        }

        // now walk ready list and clear each host's wait state
        for (CrawlQueueHost readyHost : readyList) {
          readyHost.clearWaitState();
        }

        // periodically sweep for idle hosts ...
        if (_lastIdleCheckTime == -1 || System.currentTimeMillis() - _lastIdleCheckTime >= IDLE_SCAN_INTERVAL) {
          _lastIdleCheckTime = System.currentTimeMillis();
          purgeIdleHosts();
        }
      }
    });
    getEngine().getServer().getEventLoop().setTimer(_schedulerTimer);
  }

  /**
   * two-pass idle sweep: (1) moves active hosts with no active lists that
   * have exceeded the idle flush threshold into the idle map, (2) purges
   * idle hosts that have exceeded the threshold.
   */
  private void purgeIdleHosts() {
    LinkedList<CrawlQueueHost> purgeCandidates = new LinkedList<CrawlQueueHost>();
    long currentTime = System.currentTimeMillis();

    // pass 1: walk active hosts looking for stale, list-less hosts ...
    for (CrawlQueueHost host : _activeHosts.values()) {
      if (currentTime - host.getLastModifiedTime() >= getEngine().getServer().getHostIdleFlushThreshold()) {
        if (host.noActiveLists()) {
          purgeCandidates.add(host);
        }
      }
    }
    // move inactive but not idled hosts into the idle host bucket ...
    for (CrawlQueueHost host : purgeCandidates) {
      idleHost(host);
    }

    // BUGFIX: reset the candidate list before scanning the idle map. The
    // hosts just idled above are rediscovered by the scan below; previously
    // they remained in the list and were purged (and counted) twice.
    purgeCandidates.clear();

    // pass 2: walk idle hosts looking for hosts past the flush threshold ...
    for (CrawlQueueHost host : _idleHosts.values()) {
      if (host.noActiveLists()) {
        if (currentTime - host.getLastModifiedTime() >= getEngine().getServer().getHostIdleFlushThreshold()) {
          purgeCandidates.add(host);
        }
      }
    }

    if (purgeCandidates.size() != 0) {
      if (Environment.detailLogEnabled())
        LOG.info("Purging " + purgeCandidates.size() + " IDLE Hosts");
    }
    for (CrawlQueueHost host : purgeCandidates) {
      if (Environment.detailLogEnabled())
        LOG.info("Purging IDLE Host:" + host.getIPAddressAsString());
      // clear the host ...
      host.purgeReferences();
      // and remove it from the map ...
      _idleHosts.remove(host.getIPAddress());
      // increment stats ...
      ++_purgedHostCount;
    }
    purgeCandidates.clear();
  }

  /** cancels (if armed) and releases the scheduler timer **/
  private void stopScheduleTimer() {
    if (_schedulerTimer != null) {
      getEngine().getServer().getEventLoop().cancelTimer(_schedulerTimer);
    }
    _schedulerTimer = null;
  }

  /** get access to the engine object **/
  CrawlerEngine getEngine() { return CrawlerServer.getEngine(); }

  /** get the protocol associated with this queue **/
  public Protocol getProtocol() { return _protocol; }

  /**
   * identify the protocol associated with the given url.
   * NOTE: https deliberately maps to Protocol.HTTP - the queue treats both
   * schemes identically (see getRobotsURL, which only supports http).
   */
  public static Protocol identifyProtocol(String url) {
    // single lower-casing covers the redundant upper-case checks the
    // original performed in addition to toLowerCase()
    String lowerCaseUrl = url.toLowerCase();
    if (lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://")) {
      return Protocol.HTTP;
    }
    return Protocol.UNKNOWN;
  }

  /**
   * create a robots url given a specified host (based on the queue's protocol)
   *
   * @return the robots.txt URL, or null if the protocol is unsupported or
   *         the url cannot be constructed
   */
  URL getRobotsURL(String host) {
    URL urlOut = null;
    // we only support http for now ...
    if (_protocol == Protocol.HTTP) {
      try {
        urlOut = new URL("http", host, "/robots.txt");
      } catch (MalformedURLException e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }
    }
    return urlOut;
  }

  /** get / set active state **/
  public final boolean isActive() { return _active; }

  /**
   * start (or restart) the crawl: spins up the fetcher, arms the scheduler
   * timer, and - on a fresh start - re-feeds every non-idled active host.
   */
  public void startCrawl(boolean isRestart) {
    // start the fetcher ...
    _fetcher.start();
    // start schedule timer
    setScheduleTimer();
    _active = true;
    if (!isRestart) {
      // reschedule hosts ...
      for (CrawlQueueHost host : _activeHosts.values()) {
        if (!host.isIdled()) {
          host.feedQueue();
        }
      }
    }
  }

  /** stop the crawl - the fetcher retains all urls in its queue **/
  public void stopCrawl() {
    stopScheduleTimer();
    // stop fetcher first ... it will retain all urls in its queue ...
    _fetcher.stop();
    // NOTE(review): _active is intentionally left true here (original
    // behavior) - confirm whether stopCrawl should reset it
  }

  public void pauseCrawl() {
    if (_fetcher != null) {
      _fetcher.pause();
    }
  }

  public void resumeCrawl() {
    if (_fetcher != null) {
      _fetcher.resume();
    }
  }

  /**
   * resolve the CrawlQueueHost for the given server ip, reactivating an
   * idled host or allocating a new one as needed. Updates the engine's
   * active host count when a host transitions to (or is created in) the
   * active map.
   */
  private CrawlQueueHost getCrawlHost(int serverIP) {
    boolean activeHostCountIncreased = false;
    // get a host based on ip address ...
    CrawlQueueHost crawlHost = _activeHosts.get(serverIP);
    // if null, first check idled hosts ...
    if (crawlHost == null) {
      crawlHost = _idleHosts.get(serverIP);
      // if we found an idled host ... mark it as active ...
      if (crawlHost != null) {
        _idleHosts.remove(crawlHost.getIPAddress());
        crawlHost.setIdle(false);
        _activeHosts.put(crawlHost.getIPAddress(), crawlHost);
        activeHostCountIncreased = true;
      }
    }
    // if crawl host is still null ... allocate a new host ...
    if (crawlHost == null) {
      crawlHost = new CrawlHostImpl(this, serverIP);
      _activeHosts.put(serverIP, crawlHost);
      activeHostCountIncreased = true;
    }
    if (activeHostCountIncreased) {
      getEngine().incDecActiveHostCount(1);
    }
    return crawlHost;
  }

  /**
   * queue a single externally-sourced url (outside of segment processing)
   * against the host resolved to the given ip address.
   */
  public void queueExternalURLRequest(String url, int listId, long fingerprint, String hostName, int resolvedIPAddress, long ipAddressTTL, boolean highPriorityRequest, CrawlItemStatusCallback callback) {
    // get crawl host based on ip address
    CrawlQueueHost crawlHost = getCrawlHost(resolvedIPAddress);
    CrawlList crawlList = crawlHost.getCrawlList(listId);
    // lock it ...
    synchronized (crawlList) {
      // update the list's dns cache
      crawlList.cacheDNSEntry(hostName, resolvedIPAddress, ipAddressTTL);
      // update its disposition change time ...
      crawlList.updateLastModifiedTime(System.currentTimeMillis());
      // allocate a new crawl target (segment id -1 marks an external item) ...
      CrawlTarget target = new CrawlTarget(-1, crawlList, url, fingerprint, callback);
      // set target's ip address and ttl based on original host
      target.setServerIP(resolvedIPAddress);
      target.setServerIPTTL(ipAddressTTL);
      // and add to the domain's list ...
      crawlList.addCrawlTarget(target, highPriorityRequest);
    }
  }

  /**
   * add the specified segment host's urls to the queue. URLs that fail
   * fingerprinting or are already present in the local bloom filter are
   * logged as failures and skipped.
   *
   * @return the number of new items actually queued
   */
  //TODO: OPTIMIZE THIS ROUTINE ....
  public int queueHost(int segmentId, int listId, CrawlSegmentHost host, CrawlItemStatusCallback callback) {
    int newItemsQueued = 0;
    // get crawl host based on ip address
    CrawlQueueHost crawlHost = getCrawlHost(host.getIpAddress());
    // update last mod time
    crawlHost.updateLastModifiedTime(System.currentTimeMillis());
    // and extract host name ...
    String hostName = host.getHostName();
    // get crawl list for host name ...
    CrawlList crawlList = crawlHost.getCrawlList(listId);
    // lock it ...
    synchronized (crawlList) {
      // update the list's dns cache
      crawlList.cacheDNSEntry(hostName, host.getIpAddress(), host.getTtl());
      // update its disposition change time ...
      crawlList.updateLastModifiedTime(System.currentTimeMillis());
      // and next ... walk the targets ...
      //NOTE:(AHAD) REMOVED QUEUE SIZE RESTRICTION
      for (CrawlSegmentURL segmentURL : host.getUrlTargets()) {
        // get fp for url
        URLFP urlFingerprint = URLUtils.getURLFPFromURL(segmentURL.getUrl(), false);
        if (urlFingerprint != null) {
          // check to see if already crawled ...
          if (CrawlerServer.getEngine().getLocalBloomFilter().isPresent(urlFingerprint)) {
            CrawlTarget.logFailureDetail(
                getEngine(),
                CrawlTarget.allocateCrawlURLFromSegmentURL(host.getSegmentId(), host, segmentURL, false),
                null,
                FailureReason.UNKNOWN,
                "Already Crawled URL");
          } else {
            // allocate a new crawl target ...
            CrawlTarget target = new CrawlTarget(segmentId, crawlList, host, segmentURL);
            // set target's ip address and ttl based on original host
            target.setServerIP(host.getIpAddress());
            target.setServerIPTTL(host.getTtl());
            // set optional callback
            target.setCompletionCallback(callback);
            // and add to the domain's list ...
            crawlList.addCrawlTarget(target, false);
            // increment items queued
            newItemsQueued++;
          }
        } else {
          CrawlTarget.logFailureDetail(
              getEngine(),
              CrawlTarget.allocateCrawlURLFromSegmentURL(host.getSegmentId(), host, segmentURL, false),
              null,
              FailureReason.MalformedURL,
              "Failed to Convert to URLFP");
        }
      }
    }
    return newItemsQueued;
  }

  /** hand a crawl target to the fetcher **/
  void fetchItem(CrawlTarget target) {
    _fetcher.queueURL(target);
  }

  /**
   * schedule the host on the wait queue.
   * NOTE(review): timeoutTime is unused - ordering comes from the host's
   * own getWaitTime() via the queue comparator; confirm this is intended.
   */
  void setTimer(CrawlQueueHost host, long timeoutTime) {
    _schedulerQueue.add(host);
  }

  /** remove the host from the wait queue **/
  void killTimer(CrawlQueueHost host) {
    _schedulerQueue.remove(host);
  }

  /** clear / reset queue **/
  public void clear() {
    _fetcher.clearQueues();
    for (CrawlQueueHost host : _idleHosts.values()) {
      host.purgeReferences();
    }
    _idleHosts.clear();
    for (CrawlQueueHost host : _activeHosts.values()) {
      host.purgeReferences();
    }
    // inform engine of the change ...
    getEngine().incDecActiveHostCount(-_activeHosts.size());
    _activeHosts.clear();
    _schedulerQueue.clear();
  }

  /** tear down the queue - clears all state and shuts down the fetcher **/
  void shutdown() {
    // clear data structures first ....
    clear();
    _protocol = null;
    _fetcher.shutdown();
    _fetcher = null;
    // cancel the timer if it is still armed instead of just dropping the
    // reference (no-op when stopCrawl already ran)
    stopScheduleTimer();
  }

  /** transition a host from the active map into the idle map **/
  void idleHost(CrawlQueueHost host) {
    host.setIdle(true);
    _activeHosts.remove(host.getIPAddress());
    _idleHosts.put(host.getIPAddress(), host);
    // inform engine of stat change
    getEngine().incDecActiveHostCount(-1);
  }

  /** publish queue counters into the runtime stats collectors **/
  void collectStats(CrawlerStats crawlerStats, RuntimeStatsCollector stats) {
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlQueue_ActiveHostsCount, _activeHosts.size());
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlQueue_ScheduledHostsCount, _schedulerQueue.size());
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlQueue_IdledHostsCount, _idleHosts.size());
    stats.setLongValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlQueue_PurgedHostsCount, _purgedHostCount);
    synchronized (crawlerStats) {
      crawlerStats.setActiveHosts(_activeHosts.size());
      crawlerStats.setScheduledHosts(_schedulerQueue.size());
      crawlerStats.setIdledHosts(_idleHosts.size());
    }
    _fetcher.collectStats(crawlerStats, stats);
  }

  /** helper used to sort hosts by queued url count, highest load first **/
  private static class HostLoadInfo implements Comparable<HostLoadInfo> {

    public CrawlQueueHost host;
    public int loadCount;

    public HostLoadInfo(CrawlQueueHost host) {
      this.host = host;
      this.loadCount = host.getQueuedURLCount();
    }

    public int compareTo(HostLoadInfo other) {
      // descending order (largest load first) - arguments deliberately swapped
      return Integer.compare(other.loadCount, loadCount);
    }
  }

  /** render an html table of active hosts, sorted by queued url count **/
  void dumpDetailsToHTML(JspWriter out) throws IOException {
    if (_activeHosts.size() != 0) {
      HostLoadInfo loadInfoVector[] = new HostLoadInfo[_activeHosts.size()];
      int index = 0;
      int queuedAmount = 0;
      for (CrawlQueueHost host : _activeHosts.values()) {
        loadInfoVector[index] = new HostLoadInfo(host);
        queuedAmount += loadInfoVector[index].loadCount;
        ++index;
      }
      // sort the list by load ...
      Arrays.sort(loadInfoVector);
      out.write("<B>Queued Item Count:" + queuedAmount + "</B></BR>");
      out.write("<table border=1>");
      out.write("<tr><td>Host</td><td>Details</td></tr>");
      for (HostLoadInfo hostInfo : loadInfoVector) {
        CrawlQueueHost host = hostInfo.host;
        out.write("<tr><td><a href=\"showHostDetails.jsp?hostId=" + host.getIPAddress() + "\">" + host.getIPAddressAsString() + "</a></td><td><pre>");
        out.write("loadCount:<B>(" + hostInfo.loadCount + ") </B>");
        out.write("idled:" + host.isIdled());
        out.write(" activeList:" + ((host.getActiveList() != null) ? host.getActiveList().getListName() : "NULL"));
        out.write(" disp:<b>" + ((host.getActiveList() != null) ? host.getActiveList().getDisposition().toString() : "NULL") + "</b>");
        out.write(" lastChange(MS):");
        long lastChangeTimeMS = System.currentTimeMillis() - host.getLastModifiedTime();
        // highlight hosts that have been quiet for over a minute
        boolean colorRed = false;
        if (lastChangeTimeMS > 60000) {
          colorRed = true;
          out.write("<FONT color=RED>");
        }
        out.write(Long.toString(lastChangeTimeMS));
        if (colorRed) {
          out.write("</FONT>");
        }
        out.write("</pre></td></tr>");
      }
      out.write("</table>");
    }
  }

  /** render the detail page for a single active host **/
  public void dumpHostDetailsToHTML(JspWriter out, int hostIP) throws IOException {
    CrawlQueueHost host = _activeHosts.get(hostIP);
    if (host != null) {
      out.write("<h2>Host Details for Host:" + host.getIPAddressAsString() + "</h2><BR>");
      out.write("<pre>");
      host.dumpDetails(out);
      out.write("</pre>");
    }
  }

  /**
   * map a protocol enum value to its url scheme string
   *
   * @return the scheme, or null for UNKNOWN
   */
  public static String protocolToScheme(Protocol protocol) {
    switch (protocol) {
      case FTP: return "ftp";
      case HTTP: return "http";
      case HTTPS: return "https";
      default: return null;
    }
  }

  /** snapshot of the currently active host ip addresses **/
  public Set<Integer> getActiveHostIPs() {
    return ImmutableSet.copyOf(_activeHosts.keySet());
  }
}