/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.commoncrawl.service.crawler;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.Locale;
import java.util.SimpleTimeZone;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.async.Timer;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIODNSResolver;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOSocketSelector;
import org.commoncrawl.io.NIOHttpConnection.State;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.service.statscollector.CrawlerStats;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.MovingAverage;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.SmoothedAverage;
import org.commoncrawl.util.URLUtils;
/**
 * Asynchronous HTTP fetcher. Drives a fixed-size pool of NIOHttpConnection
 * slots, filling empty slots from a pending queue of CrawlTargets and
 * tracking per-connection progress via NIOHttpConnection.Listener callbacks.
 *
 * @author rana
 *
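 * A minimal usage sketch (assumes a running CrawlerServer event loop; the
 * socket count and crawler name below are illustrative, not defaults):
 * <pre>{@code
 *   HttpFetcher fetcher = new HttpFetcher(500, null, "crawler01");
 *   fetcher.queueURLs(targets);   // targets: LinkedList<CrawlTarget>
 *   fetcher.start();              // fills slots, arms the 1s timeout timer
 *   // ... event loop runs the crawl ...
 *   fetcher.stop();               // cancels timer, re-queues in-flight targets
 * }</pre>
 *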
*/
public final class HttpFetcher implements Fetcher , NIOHttpConnection.Listener {
/** constants **/
private static final long TIMEOUT_TIMER_INTERVAL = 1000; // every 1 second ...
private static final int DOWNLOAD_LIMIT = CrawlEnvironment.CONTENT_SIZE_LIMIT;
private static final int MAX_REDIRECTS = 6;
/** logging **/
private static final Log LOG = LogFactory.getLog(HttpFetcher.class);
/** running state variable **/
private boolean _running = false;
/** paused state **/
private boolean _paused = false;
/** max open sockets variable **/
private int _maxSockets;
/** selector reference **/
NIOSocketSelector _selector;
/** resolver reference **/
NIODNSResolver _resolver;
/** timeout timer object **/
Timer _timeoutTimer;
/** rotating snapshot number, incremented on each stats snapshot **/
int _snapshotNumber;
/** fail connections mode **/
boolean _failConnections = false;
/** crawler name **/
String _crawlerName;
/** stats **/
private int connectionCount = 0;
private int finishCount = 0;
private int successCount = 0;
private int failureCount = 0;
private int resolvingCount = 0;
private int connectingCount = 0;
private int sendingCount = 0;
private int receivingCount = 0;
private long cumilativeURLCount = 0;
private long firstSnapShotTime = -1;
private int snapShotURLCount = 0;
private int snapShotConnectionCount = 0;
private long snapShotTime = -1;
private long cumilativeURLSSEC = 0;
private int snapShotCount = 0;
private long snapShotDownloadAmt = 0;
private long cumilativeDownloadPerSec = 0;
private MovingAverage _urlsPerSecMovingAverage;
private MovingAverage _kbPerSecMovingAverage;
private SmoothedAverage _urlsPerSecSmoothed;
private SmoothedAverage _kbPerSecSmoothed;
private MovingAverage _avgDownloadSize;
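/** crawl interface (source address) list - targets are assigned interfaces round-robin **/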
private InetSocketAddress _crawlInterfaces[];
/** CrawlContext **/
private static class CrawlContext {
CrawlTarget _url;
int _index;
public CrawlContext(CrawlTarget url, int index) {
_url = url;
_index = index;
}
public CrawlTarget getURL() {
return _url;
}
public int getIndex() {
return _index;
}
}
/** active connections **/
private NIOHttpConnection _active[] = null;
/** active connection versions **/
private short _activeVersions[] = null;
/** trailing connection versions **/
private short _trailingVersions[] = null;
/** pending URLs **/
private LinkedList<CrawlTarget> _pending = new LinkedList<CrawlTarget>();
/** pause support **/
public boolean isPaused() { return _paused; }
public void pause() { _paused = true; }
public void resume() {
if (_paused) {
_paused = false;
if (_running)
fillQueue(true);
}
}
/** if-modified-since support **/
private SimpleDateFormat http_date_format;
public HttpFetcher(int maxOpenSockets,InetSocketAddress[] crawlInterfaceList,String crawlerName) {
_maxSockets = maxOpenSockets;
_active = new NIOHttpConnection[_maxSockets];
_activeVersions = new short[_maxSockets];
_trailingVersions = new short[_maxSockets];
_selector = CrawlerServer.getServer().getEventLoop().getSelector();
_resolver = CrawlerServer.getServer().getDNSServiceResolver();
_urlsPerSecMovingAverage = new MovingAverage(200);
_kbPerSecMovingAverage = new MovingAverage(200);
_avgDownloadSize = new MovingAverage(200);
_urlsPerSecSmoothed = new SmoothedAverage(.25);
_kbPerSecSmoothed = new SmoothedAverage(.25);
_crawlInterfaces = crawlInterfaceList;
_crawlerName = crawlerName;
http_date_format = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss 'GMT'", Locale.US);
http_date_format.setTimeZone(new SimpleTimeZone(0, "GMT"));
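// the formatter above emits RFC 1123 HTTP-dates (e.g. "Sun, 06 Nov 1994 08:49:37 GMT"),
// the form expected in If-Modified-Since request headers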
// set the default ccbot user agent string
NIOHttpConnection.setDefaultUserAgentString("CCBot/1.0 (+http://www.commoncrawl.org/bot.html)");
}
public void clearQueues() {
if (!_running) {
for (int i=0;i<_active.length;++i) {
_active[i] = null;
}
_pending.clear();
}
else {
throw new IllegalStateException();
}
}
public void shutdown() {
clearQueues();
}
public void queueURLs(LinkedList<CrawlTarget> urlList) {
for (CrawlTarget url : urlList) {
//LOG.debug("Adding URL:"+url.getURL() + " to Fetcher Queue");
_pending.add(url);
}
// fillQueue(false);
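// note: the inline fillQueue call above is disabled - queued targets are picked
// up by start() or by the next timer-driven fillQueue pass (every TIMEOUT_TIMER_INTERVAL ms)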
}
public void queueURL(CrawlTarget target) {
_pending.add(target);
//fillQueue(false);
}
public void start() {
// reset stats ...
finishCount = 0;
successCount = 0;
failureCount = 0;
// flip running bit ...
_running = true;
fillQueue(false);
_timeoutTimer = new Timer(TIMEOUT_TIMER_INTERVAL,true, new Timer.Callback() {
public void timerFired(Timer timer) {
fillQueue(true);
}
});
// register timeout timer ...
CrawlerServer.getServer().getEventLoop().setTimer(_timeoutTimer);
}
public void stop() {
// flip running bit ..
_running = false;
// first step .. cancel timer ...
if (_timeoutTimer != null) {
CrawlerServer.getServer().getEventLoop().cancelTimer(_timeoutTimer);
_timeoutTimer = null;
}
// next cancel all active connections ...
for (int i=0;i<_active.length;++i){
if (_active[i] != null) {
CrawlContext context = (CrawlContext) _active[i].getContext();
_active[i].setContext(null);
// close the connection
_active[i].close();
// null out the slot ...
_active[i] = null;
// and add the connection back to the pending queue ...
// add the item back to pending list ...
_pending.addFirst(context.getURL());
}
}
// clear appropriate summary fields ...
connectionCount = 0;
resolvingCount = 0;
connectingCount = 0;
sendingCount = 0;
receivingCount = 0;
}
private void logGET(CrawlTarget url, int index) {
if (Environment.detailLogEnabled()) {
// if ( (url.getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
StringBuffer sb = new StringBuffer();
sb.append(String.format("%1$20.20s ",CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis())));
sb.append(String.format("%1$4.4s ",url.getResultCode()));
sb.append(String.format("%1$4.4s ",url.getRetryCount()));
sb.append(String.format("%1$4.4s ",url.getRedirectCount()));
sb.append(url.getOriginalURL());
sb.append(" ");
if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
sb.append(url.getRedirectURL());
}
CrawlerServer.getEngine().getGETLog().info(sb.toString());
// }
}
}
/** Figure out which IP address (source interface) to use for the specified crawl target.
 *  A target with an existing interface affinity keeps it; otherwise the target's source
 *  list hands out interfaces round-robin (e.g. with 4 interfaces: 0,1,2,3,0,...).
 *
 * @return index of the crawl interface to use
 */
private int getCrawlInterfaceForCrawlTarget(CrawlTarget target) {
if (target.getCrawlInterface() != -1) {
return target.getCrawlInterface();
}
if (target.getSourceList() != null) {
// save current interface
int nextCrawlInterface = target.getSourceList().getNextCrawlInterface();
// set next interface
target.getSourceList().setNextCrawlInterface((nextCrawlInterface+1) % _crawlInterfaces.length);
// set affinity in target
target.setCrawlInterface(nextCrawlInterface);
return nextCrawlInterface;
}
else {
return 0;
}
}
private boolean fillSlot(int index,CrawlTarget optionalTarget) {
// don't fill the slot while paused ...
if (!isPaused() || optionalTarget != null) {
if (_active[index] != null) {
LOG.error("fill Slot Called on Non-Empty Slot:"+ index + " With URL:" + _active[index].getURL());
}
// if there are pending urls ...
if (optionalTarget != null || _pending.size() != 0) {
// pop a url off of the queue ... or use the optionally passed in target
CrawlTarget crawlTarget = (optionalTarget != null) ? optionalTarget : _pending.removeFirst();
try {
URL fetchURL = new URL(crawlTarget.getActiveURL());
URL originalURL = (crawlTarget.getRedirectCount() == 0) ? fetchURL : new URL(crawlTarget.getOriginalURL());
// open a new connection and assign it to the available slot ...
if (_crawlInterfaces!= null) {
_active[index] = new NIOHttpConnection(fetchURL,_crawlInterfaces[getCrawlInterfaceForCrawlTarget(crawlTarget)],_selector,_resolver,crawlTarget.getCookieStore());
}
else {
_active[index] = new NIOHttpConnection(fetchURL,_selector,_resolver,crawlTarget.getCookieStore());
}
// LOG.info("### FETCHER Alloc HTTPConnect to:" + fetchURL + " Slot:" + index);
//TODO: MAJOR HACK
// disable proxy requests for robots
if ((crawlTarget.getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
if (CrawlerServer.getServer().getProxyAddress() != null) {
// check to see if we should be using a proxy server
_active[index].setProxyServer(CrawlerServer.getServer().getProxyAddress());
}
}
// add in special source header
_active[index].getRequestHeaders().setIfNotSet("x-cc-id",_crawlerName);
// add in cache tests if present
if (crawlTarget.getRedirectCount() == 0) {
if (crawlTarget.getLastModifiedTime() != -1) {
_active[index].getRequestHeaders().setIfNotSet(
"If-Modified-Since",http_date_format.format(new Date(crawlTarget.getLastModifiedTime())));
}
if (crawlTarget.getETag() != null) {
_active[index].getRequestHeaders().setIfNotSet(
"If-None-Match",crawlTarget.getETag());
}
}
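// bump this slot's version; kept mod 10 so it renders as a single digit in the
// connection map emitted by collectStats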
_activeVersions[index] = (short) ((_activeVersions[index] + 1) % 10);
long currentTime = System.currentTimeMillis();
if (crawlTarget.getRedirectCount() != 0) {
String newHost = fetchURL.getHost();
String originalHost = originalURL.getHost();
if (newHost != null && originalHost != null && newHost.equalsIgnoreCase(originalHost)) {
crawlTarget.getSourceList().populateIPAddressForTarget(fetchURL.getHost(),crawlTarget);
}
}
// IFF NOT Redirect
else {
// if the cached ip is still valid based on stored TTL ...
if (crawlTarget.getServerIP() != 0 && crawlTarget.getServerIPTTL() >= currentTime) {
// then set the resolved address data members (thus enabling us to bypass dns lookup)
_active[index].setResolvedAddress(IPAddressUtils.IntegerToInetAddress(crawlTarget.getServerIP()),crawlTarget.getServerIPTTL(),null);
}
else {
if (Environment.detailLogEnabled())
{
if (crawlTarget.getServerIP() == 0)
LOG.info("#### IP Address for Host:" + fetchURL.getHost() + " Not Set. Will require DNS Resolution");
else
LOG.info("#### TTL of Cached IP Expired for Host:" + fetchURL.getHost() + ". Will require DNS Resolution");
}
}
}
_active[index].setListener(this);
_active[index].setContext(new CrawlContext(crawlTarget,index));
_active[index].setDownloadMax(DOWNLOAD_LIMIT);
if (!_failConnections) {
_active[index].open();
//LOG.info("### FETCHER called open on connection to:" + fetchURL + " slot:" + index);
}
else {
throw new IOException("Deliberately Skipped Open and FAILED Connection");
}
snapShotConnectionCount++;
connectionCount++;
if (Environment.detailLogEnabled())
LOG.info("Filled SLOT:"+index + " With URL:" + crawlTarget.getActiveURL());
// inform the target of the status change
crawlTarget.fetchStarting(_active[index]);
// LOG.info("### FETCHER called fetchStarting for:" + fetchURL + " slot:" + index);
// log it ...
logGET(crawlTarget,index);
// and construct the http request ...
}
catch (UnknownHostException e) {
//TODO: CLEAR SLOT BEFORE CALLING fetchFailed!!!!
if (_active[index] != null) {
_active[index].setContext(null);
_active[index].close();
}
_active[index] = null;
if (Environment.detailLogEnabled())
LOG.error("Maformed URL Exception Processing URL:" + crawlTarget.getActiveURL());
crawlTarget.fetchFailed(CrawlURL.FailureReason.MalformedURL,e.toString());
failureCount++;
}
catch (MalformedURLException e) {
//TODO: CLEAR SLOT BEFORE CALLING fetchFailed!!!!
if (_active[index] != null) {
_active[index].setContext(null);
_active[index].close();
}
_active[index] = null;
if (Environment.detailLogEnabled())
LOG.error("Maformed URL Exception Processing URL:" + crawlTarget.getActiveURL());
crawlTarget.fetchFailed(CrawlURL.FailureReason.MalformedURL,e.toString());
failureCount++;
}
catch (IOException e2){
if (Environment.detailLogEnabled())
LOG.error("IOException Processing URL:" + crawlTarget.getActiveURL() + " Details:" + e2.getMessage());
//TODO: WATCH IT!!! - always clear slot FIRST because fetchFailed calls back into fillSlot!!!!
if (_active[index] != null) {
_active[index].setContext(null);
_active[index].close();
}
_active[index] = null;
// LOG.debug("Fetch FAILED URL:"+ context.getURL().getURL() + " Code:"+ failureCode);
// notify url of failure ...
//TODO: Investigate if it is SANE!!! to call back into fillSlot from fetchFailed !!!
crawlTarget.fetchFailed(CrawlURL.FailureReason.IOException,e2.getMessage());
failureCount++;
}
catch (Exception e) {
LOG.error("Runtime Exception Processing URL:" + crawlTarget.getActiveURL() + " Details:" + e.getMessage());
//TODO: WATCH IT!!! - always clear slot FIRST because fetchFailed calls back into fillSlot!!!!
if (_active[index] != null) {
_active[index].setContext(null);
_active[index].close();
}
_active[index] = null;
// LOG.debug("Fetch FAILED URL:"+ context.getURL().getURL() + " Code:"+ failureCode);
// notify url of failure ...
//TODO: Investigate if it is SANE!!! to call back into fillSlot from fetchFailed !!!
crawlTarget.fetchFailed(CrawlURL.FailureReason.RuntimeError,e.getMessage());
failureCount++;
}
}
}
return _active[index] != null;
}
private void fillQueue(boolean checkForTimeout) {
// LOG.debug("fillQueue BEGIN- activeCount:"+connectionCount + " pendingCount:" + _pending.size());
for (int index=0;index<_active.length;++index) {
if (_active[index] != null && checkForTimeout) {
if (_active[index].checkForTimeout()) {
CrawlContext context = (CrawlContext)_active[index].getContext();
NIOHttpConnection theTimedOutConnection = _active[index];
if (context != null) {
if (Environment.detailLogEnabled())
LOG.error("Fetch TimedOut for Original URL:"+context.getURL().getOriginalURL() + " ActiveURL:" + context.getURL().getActiveURL());
switch (theTimedOutConnection.getTimeoutState()) {
case AWAITING_RESOLUTION:
// reduce resolving count if necessary ...
resolvingCount--;
break;
case AWAITING_CONNECT:
connectingCount--;
break;
case SENDING_REQUEST:
sendingCount--;
break;
case RECEIVING_HEADERS:
receivingCount--;
break;
}
//TODO: DO ALL SLOT OPERATIONS BEFORE CALLING fetchFailed since it is calling back into fillQueue!!! BAD!!!
_active[index].setContext(null);
_active[index].close();
_active[index] = null;
connectionCount--;
failureCount++;
if (theTimedOutConnection.getTimeoutState() == NIOHttpConnection.State.AWAITING_CONNECT) {
context.getURL().fetchFailed(CrawlURL.FailureReason.ConnectTimeout, "TimedOut in Fill Queue AWAITING_CONNECT");
}
else if (theTimedOutConnection.getTimeoutState() == NIOHttpConnection.State.AWAITING_RESOLUTION){
context.getURL().fetchFailed(CrawlURL.FailureReason.DNSFailure, "TimedOut in Fill Queue AWAITING_RESOLUTION");
}
else {
context.getURL().fetchFailed(CrawlURL.FailureReason.Timeout, "TimedOut in Fill Queue RECEIVING_DATA");
}
}
else {
LOG.error("Context NULL in fillQueue call");
throw new RuntimeException("Context Should NOT be NULL");
}
}
}
// if the current slot is empty ....
if (_active[index] == null) {
// try to fill the slot ...
if (!fillSlot(index,null)) {
// if failed to fill slot, either break out or continue (if checking for timeouts)
if (!checkForTimeout)
break;
}
}
}
// LOG.debug("fillQueue END- activeCount:"+connectionCount + " pendingCount:" + _pending.size());
}
/** NIOHttpConnection.Listener implementation **/
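// Connection state transitions observed here (each gauge counter is decremented
// on exit from its state and incremented on entry): AWAITING_RESOLUTION,
// AWAITING_CONNECT, SENDING_REQUEST, RECEIVING_HEADERS, then DONE or ERROR.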
// @Override
public void HttpConnectionStateChanged(NIOHttpConnection theConnection,State oldState, State state) {
if (Environment.detailLogEnabled())
LOG.info("URL:"+theConnection.getURL() + " OldState:"+ oldState + " NewState:"+state);
// only process events if we are in a running state ...
if (_running) {
if (oldState == State.AWAITING_RESOLUTION) {
// reduce resolving count if necessary ...
resolvingCount--;
}
else if (oldState == State.AWAITING_CONNECT) {
connectingCount--;
}
else if (oldState == State.SENDING_REQUEST) {
sendingCount--;
}
else if (oldState == State.RECEIVING_HEADERS) {
receivingCount--;
}
if (state == State.DONE || state == State.ERROR) {
// log it ...
if (Environment.detailLogEnabled())
LOG.debug("ConnectionState for URL:" + theConnection.getURL() + " Changed from:" + oldState + " to:" + state);
// get context
CrawlContext context = (CrawlContext)theConnection.getContext();
if (context == null) {
LOG.error("Context is NULL for Connection to URL:"+theConnection.getURL() + " Connection State:" + state);
}
else {
//TODO: RELEASE SLOT UPFRONT !!!
if (Environment.detailLogEnabled())
LOG.info("Releasing SLOT:" + context.getIndex() + " URL:" + _active[context.getIndex()].getURL());
// either way, this connection is now dead ...
_active[context.getIndex()].setContext(null);
_active[context.getIndex()].close();
_active[context.getIndex()] = null;
// decrement active count
connectionCount--;
// increment finish count no matter what ...
finishCount++;
if (state == State.DONE) {
URLFP urlFingerprint = URLUtils.getURLFPFromURL(
(context.getURL().getRedirectCount() == 0) ?
context.getURL().getOriginalURL()
: context.getURL().getRedirectURL(), false);
// update local history bloom filter
CrawlerServer.getEngine().getLocalBloomFilter().add(urlFingerprint);
try {
// increment success count and process results ...
successCount++;
if (snapShotTime != -1) {
// increment snapshot stats ...
snapShotURLCount++;
// increment the cumulative count
cumilativeURLCount++;
}
// handle redirects ...
if (theConnection.isRedirectResponse()) {
// if redirect count == 0, preserve original data...
if (context.getURL().getRedirectCount() == 0) {
context.getURL().cacheOriginalRequestData(theConnection);
}
// increment redirect counter ...
context.getURL().incRedirectCount();
// if either max redirect limit exceeded or location is null ...
if (context.getURL().getRedirectCount() > MAX_REDIRECTS || theConnection.getRedirectLocation() == null) {
String errorDescription = null;
if (context.getURL().getRedirectCount() > MAX_REDIRECTS)
errorDescription = "Max Redirect Count Exceeded";
else
errorDescription = "Location not found in Redirect Headers";
// fail the url ...
context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed, errorDescription);
}
// otherwise, silently re-queue and process the redirect ...
else {
try {
URL originalURL = new URL(context.getURL().getOriginalURL());
String redirectLocation = theConnection.getRedirectLocation().toLowerCase();
URL redirectURL = (redirectLocation.startsWith("http://") || redirectLocation.startsWith("https://"))
? new URL(theConnection.getRedirectLocation()) : new URL(originalURL,theConnection.getRedirectLocation());
String redirectURLStr = redirectURL.toString();
// by default process the url ...
boolean processRedirect = true;
if ((context.getURL().getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
// but if url is different from original url ...
if (!redirectURLStr.equals(context.getURL().getOriginalURL())) {
URLFP redirectFingerprint = URLUtils.getURLFPFromURL(redirectURLStr, false);
if (redirectFingerprint != null) {
// validate the url against the bloom filter to see that we have not visited it before ...
if (CrawlerServer.getEngine().getLocalBloomFilter().isPresent(redirectFingerprint)) {
// yes we have ... fail the url ...
LOG.info("!!!!Rejecting redirect. from:" + originalURL + " to:" + redirectURL +". Already Visited Target URL");
context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed, "Alread Visited Redirect Location:" + theConnection.getRedirectLocation());
processRedirect = false;
}
}
else {
LOG.error("!!!!Rejecting redirect. from:" + originalURL + " to:" + redirectURL +". Redirect Fingerprint returned Null Fingerprint! RedirectString:" + theConnection.getRedirectLocation());
}
}
}
if (processRedirect) {
if (Environment.detailLogEnabled())
LOG.info("Redirecting request:" + originalURL + " to:" + redirectURL);
// set up redirect metadata ...
context.getURL().setFlags(context.getURL().getFlags() | CrawlURL.Flags.IsRedirected);
context.getURL().setRedirectURL(redirectURLStr);
// refill slot ...
fillSlot(context.getIndex(), context.getURL());
}
//}
//else {
// circular redirect fail case
// context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed, "Circular Redirect detected:" + theConnection.getRedirectLocation());
//}
}
catch (MalformedURLException e) {
// invalid url fail case ...
context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed, "Malformed URL:" + theConnection.getRedirectLocation());
}
}
}
else {
// before passing things on ... check to see if this was a successful GET resulting from a redirect ...
if (context.getURL().getRedirectCount() != 0 && context.getURL().getActiveURL() != null) {
URLFP fingerprint = URLUtils.getURLFPFromURL(context.getURL().getActiveURL(),false);
if (fingerprint == null) {
LOG.error("####!!!! getURLFPFromURL Returned NULL FOR URL" + context.getURL().getActiveURL());
}
else {
CrawlerServer.getEngine().getLocalBloomFilter().add(fingerprint);
}
}
_avgDownloadSize.addSample((double)theConnection.getContentBuffer().available());
// process this as a successful get
context.getURL().fetchSucceeded(theConnection,theConnection.getResponseHeaders(),theConnection.getContentBuffer());
}
}
catch (Exception e) {
LOG.error("Exception processing HttpConnectionStateChange-DONE:" + CCStringUtils.stringifyException(e));
context.getURL().fetchFailed(CrawlURL.FailureReason.RuntimeError, "Exception:" + CCStringUtils.stringifyException(e));
}
}
else if (state == State.ERROR) {
// increment failure count ...
failureCount++;
int failureCode = CrawlURL.FailureReason.UNKNOWN;
// generate accurate failure reason ...
switch (theConnection.getErrorType()) {
case RESOLVER_FAILURE: failureCode = CrawlURL.FailureReason.ResolverFailure;break;
case DNS_FAILURE: failureCode = CrawlURL.FailureReason.DNSFailure;break;
case IOEXCEPTION: failureCode = CrawlURL.FailureReason.IOException;break;
case TIMEOUT: failureCode = CrawlURL.FailureReason.Timeout;break;
}
// LOG.debug("Fetch FAILED URL:"+ context.getURL().getURL() + " Code:"+ failureCode);
// notify url of failure ...
context.getURL().fetchFailed(failureCode,(theConnection.getErrorDesc() != null) ? theConnection.getErrorDesc() : "ERROR During Connection State Change");
}
// repopulate slot (if possible)
if (_active[context.getIndex()] == null) {
fillSlot(context.getIndex(),null);
}
}
}
else if (state == State.AWAITING_RESOLUTION) {
resolvingCount++;
}
else if (state == State.AWAITING_CONNECT) {
connectingCount++;
}
else if (state == State.SENDING_REQUEST) {
// get context
CrawlContext context = (CrawlContext)theConnection.getContext();
// if context is valid ... send the crawl target a fetchStarted event ...
if (context != null) {
context.getURL().fetchStarted();
}
else {
LOG.error("SENDING_REQUEST STATE TRIGERRED W/ NULL CONTEXT URL:" + theConnection.getURL());
}
sendingCount++;
}
else if (state == State.RECEIVING_HEADERS) {
receivingCount++;
}
}
}
//@Override
public void HttpContentAvailable(NIOHttpConnection theConnection,NIOBufferList contentBuffer) {
// NOOP
}
public void collectStats(CrawlerStats crawlerStats,RuntimeStatsCollector stats) {
_snapshotNumber++;
long curTime = System.currentTimeMillis();
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_ActiveConnections,connectionCount );
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_FetcherQueueSize,_pending.size());
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_TotalSuccessfulConnects,successCount );
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_TotalFailedConnects,failureCount );
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInResolvingState,resolvingCount );
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInConnectingState,connectingCount );
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInSendingState,sendingCount );
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInRecevingState,receivingCount );
if (snapShotTime != -1) {
stats.setIntValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_TimeDeltaBetweenSnapshots, (int)(curTime - snapShotTime));
}
double urlsPerSecond= 0;
int bytesSnapShot = 0;
double bytesPerSec = 0;
// if last snap shot time is set ...
if (snapShotTime != -1) {
// calculate urls / sec
int millisecondsElapsed = (int)(curTime - snapShotTime);
urlsPerSecond =( (double)snapShotURLCount / ((double)millisecondsElapsed / 1000.00));
_urlsPerSecMovingAverage.addSample(urlsPerSecond);
_urlsPerSecSmoothed.addSample(urlsPerSecond);
cumilativeURLSSEC += urlsPerSecond;
snapShotCount += 1;
bytesSnapShot = (int) (NIOHttpConnection.getCumilativeBytesRead() - this.snapShotDownloadAmt);
snapShotDownloadAmt = NIOHttpConnection.getCumilativeBytesRead();
bytesPerSec =( (double)bytesSnapShot / ((double)millisecondsElapsed / 1000.00));
_kbPerSecMovingAverage.addSample(bytesPerSec / 1000.00);
_kbPerSecSmoothed.addSample(bytesPerSec / 1000.00);
cumilativeDownloadPerSec += bytesPerSec;
}
snapShotTime = curTime;
if (firstSnapShotTime == -1)
firstSnapShotTime = snapShotTime;
snapShotURLCount = 0;
snapShotConnectionCount = 0;
synchronized(crawlerStats) {
crawlerStats.setUrlsPerSecond((float)_urlsPerSecMovingAverage.getAverage());
crawlerStats.setMbytesDownPerSecond((float)(_kbPerSecMovingAverage.getAverage() / 1000.00));
crawlerStats.setBytesDownloaded(crawlerStats.getBytesDownloaded() + bytesSnapShot);
crawlerStats.setAverageDownloadSize((float)_avgDownloadSize.getAverage());
}
stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_SnapshotURLSPerSecond,urlsPerSecond);
stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_MovingAverageURLSPerSecond,_urlsPerSecMovingAverage.getAverage());
stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_SmoothedURLSPerSecond,_urlsPerSecSmoothed.getAverage());
stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_SnapshotKBPerSec,bytesPerSec/1000.00);
stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_MovingAverageKBPerSec,_kbPerSecMovingAverage.getAverage());
stats.setDoubleValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_SmoothedKBPerSec,_kbPerSecSmoothed.getAverage());
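// build an ASCII connection map, one character per slot:
//   '-' empty, '?' open time unknown, 'r'/'R' awaiting DNS resolution (under/over 60s),
//   digit (slot version) for a connection under 60s old, '!' lagging (60-120s),
//   '$' stalled (over 120s); slots whose version changed since the last snapshot
//   are wrapped in red font tags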
int active = 0;
StringBuffer sb = new StringBuffer();
int MAX_LINE_LEN = 20;
sb.append("[");
for (int i=0;i<MAX_LINE_LEN;++i) {
sb.append(i%10);
}
sb.append("]\n[");
long currentTime = System.currentTimeMillis();
// iterate connections ...
int i=0;
for (;i<_active.length;) {
stats.setArrayValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_LaggingConnectionDetailArray, _active.length, i,null);
if (_active[i] != null) {
if (_activeVersions[i] != _trailingVersions[i]) {
sb.append("<FONT color=red>");
}
if (_active[i].getOpenTime() == -1) {
sb.append("?");
}
else if (_active[i].getState() == NIOHttpConnection.State.AWAITING_RESOLUTION) {
if (currentTime - _active[i].getOpenTime() <= 60000)
sb.append("r");
else
sb.append("R");
}
else if (currentTime - _active[i].getOpenTime() <= 60000) {
sb.append(_activeVersions[i]);
}
else if (currentTime - _active[i].getOpenTime() <= 120000) {
sb.append("!");
stats.setArrayValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_LaggingConnectionDetailArray, _active.length, i,"[!]["+(currentTime - _active[i].getOpenTime())+"]" + _active[i].getURL().toString());
}
else {
sb.append("$");
stats.setArrayValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_LaggingConnectionDetailArray, _active.length, i,"[$]["+(currentTime - _active[i].getOpenTime())+"]" + _active[i].getURL().toString());
}
active++;
}
else {
sb.append("-");
}
if (_activeVersions[i] != _trailingVersions[i]) {
sb.append("</FONT>");
_trailingVersions[i] = _activeVersions[i];
}
if (++i%MAX_LINE_LEN == 0) {
sb.append("]\n[");
}
}
for (;i%MAX_LINE_LEN != 0;++i) {
sb.append(" ");
}
sb.append("]");
stats.setStringValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_ConnectionMap,sb.toString());
stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_CumilativeKBytesIN, NIOHttpConnection.getCumilativeBytesRead() / 1000);
stats.setLongValue(CrawlerEngineStats.ID,CrawlerEngineStats.Name.HTTPFetcher_CumilativeKBytesOUT, NIOHttpConnection.getCumilativeBytesWritten() / 1000);
}
}