/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.commoncrawl.service.crawler;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.Locale;
import java.util.SimpleTimeZone;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.async.Timer;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIODNSResolver;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOHttpConnection.State;
import org.commoncrawl.io.NIOSocketSelector;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.service.statscollector.CrawlerStats;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.MovingAverage;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.SmoothedAverage;
import org.commoncrawl.util.URLUtils;

/**
 * @author rana
 */
public final class HttpFetcher implements Fetcher, NIOHttpConnection.Listener {
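
  /*
   * Overview (descriptive, derived from the code below): the fetcher
   * multiplexes up to _maxSockets concurrent NIOHttpConnections across a
   * fixed array of "slots" (_active). Pending CrawlTargets queue up in
   * _pending; fillQueue() moves them into free slots, and a periodic timer
   * sweeps active slots for timeouts. Connection progress arrives
   * asynchronously through the NIOHttpConnection.Listener callbacks
   * implemented further down in this class.
   */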
  /** constants **/
  private static long TIMEOUT_TIMER_INTERVAL = 1000; // every 1 second ...
  private static int DOWNLOAD_LIMIT = CrawlEnvironment.CONTENT_SIZE_LIMIT;
  private static final int MAX_REDIRECTS = 6;

  /** logging **/
  private static final Log LOG = LogFactory.getLog(HttpFetcher.class);

  /** running state variable **/
  private boolean _running = false;
  /** paused state **/
  private boolean _paused = false;
  /** max open sockets variable **/
  private int _maxSockets;
  /** selector reference **/
  NIOSocketSelector _selector;
  /** resolver reference **/
  NIODNSResolver _resolver;
  /** timeout timer object **/
  Timer _timeoutTimer;
  /** rotating snapshot timer **/
  int _snapshotNumber;
  /** fail connections mode **/
  boolean _failConnections = false;
  /** crawler name **/
  String _crawlerName;

  /** stats **/
  private int connectionCount = 0;
  private int finishCount = 0;
  private int successCount = 0;
  private int failureCount = 0;
  private int resolvingCount = 0;
  private int connectingCount = 0;
  private int sendingCount = 0;
  private int receivingCount = 0;
  private long cumilativeURLCount = 0;
  private long firstSnapShotTime = -1;
  private int snapShotURLCount = 0;
  private int snapShotConnectionCount = 0;
  private long snapShotTime = -1;
  private long cumilativeURLSSEC = 0;
  private int snapShotCount = 0;
  private long snapShotDownloadAmt = 0;
  private long cumilativeDownloadPerSec = 0;
  private MovingAverage _urlsPerSecMovingAverage;
  private MovingAverage _kbPerSecMovingAverage;
  private SmoothedAverage _urlsPerSecSmoothed;
  private SmoothedAverage _kbPerSecSmoothed;
  private MovingAverage _avgDownloadSize;
  private InetSocketAddress _crawlInterfaces[];

  /** CrawlContext **/
  private static class CrawlContext {

    CrawlTarget _url;
    int _index;

    public CrawlContext(CrawlTarget url, int index) {
      _url = url;
      _index = index;
    }

    public CrawlTarget getURL() {
      return _url;
    }

    public int getIndex() {
      return _index;
    }
  }

  /** active connections **/
  private NIOHttpConnection _active[] = null;
  /** active connection versions **/
  private short _activeVersions[] = null;
  /** trailing connection versions **/
  private short _trailingVersions[] = null;
  /** pending URLs **/
  private LinkedList<CrawlTarget> _pending = new LinkedList<CrawlTarget>();

  /** pause support **/
  public boolean isPaused() {
    return _paused;
  }

  public void pause() {
    _paused = true;
  }

  public void resume() {
    if (_paused) {
      _paused = false;
      if (_running)
        fillQueue(true);
    }
  }

  /** if-modified-since support **/
  private SimpleDateFormat http_date_format;

  public HttpFetcher(int maxOpenSockets, InetSocketAddress[] crawlInterfaceList, String crawlerName) {
    _maxSockets = maxOpenSockets;
    _active = new NIOHttpConnection[_maxSockets];
    _activeVersions = new short[_maxSockets];
    _trailingVersions = new short[_maxSockets];
    _selector = CrawlerServer.getServer().getEventLoop().getSelector();
    _resolver = CrawlerServer.getServer().getDNSServiceResolver();
    _urlsPerSecMovingAverage = new MovingAverage(200);
    _kbPerSecMovingAverage = new MovingAverage(200);
    _avgDownloadSize = new MovingAverage(200);
    _urlsPerSecSmoothed = new SmoothedAverage(.25);
    _kbPerSecSmoothed = new SmoothedAverage(.25);
    _crawlInterfaces = crawlInterfaceList;
    _crawlerName = crawlerName;

    http_date_format = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss 'GMT'", Locale.US);
    http_date_format.setTimeZone(new SimpleTimeZone(0, "GMT"));

    // set the default ccbot user agent string
    NIOHttpConnection.setDefaultUserAgentString("CCBot/1.0 (+http://www.commoncrawl.org/bot.html)");
  }
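
  /*
   * Typical lifecycle, as a hedged sketch (the real driver lives in the crawl
   * engine; the 500-socket figure below is purely illustrative):
   *
   *   HttpFetcher fetcher = new HttpFetcher(500, crawlInterfaces, "crawler01");
   *   fetcher.queueURLs(targets);  // enqueue CrawlTargets
   *   fetcher.start();             // fill slots, arm the 1-second timeout timer
   *   ...
   *   fetcher.stop();              // close connections, requeue in-flight URLs
   *   fetcher.shutdown();          // clear queues (only legal when stopped)
   */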
  public void clearQueues() {
    if (!_running) {
      for (int i = 0; i < _active.length; ++i) {
        _active[i] = null;
      }
      _pending.clear();
    } else {
      throw new IllegalStateException();
    }
  }

  public void shutdown() {
    clearQueues();
  }

  public void queueURLs(LinkedList<CrawlTarget> urlList) {
    for (CrawlTarget url : urlList) {
      // LOG.debug("Adding URL:" + url.getURL() + " to Fetcher Queue");
      _pending.add(url);
    }
    // fillQueue(false);
  }

  public void queueURL(CrawlTarget target) {
    _pending.add(target);
    // fillQueue(false);
  }

  public void start() {
    // reset stats ...
    finishCount = 0;
    successCount = 0;
    failureCount = 0;

    // flip running bit ...
    _running = true;

    fillQueue(false);

    _timeoutTimer = new Timer(TIMEOUT_TIMER_INTERVAL, true, new Timer.Callback() {
      public void timerFired(Timer timer) {
        fillQueue(true);
      }
    });
    // register timeout timer ...
    CrawlerServer.getServer().getEventLoop().setTimer(_timeoutTimer);
  }

  public void stop() {
    // flip running bit ..
    _running = false;

    // first step .. cancel timer ...
    if (_timeoutTimer != null) {
      CrawlerServer.getServer().getEventLoop().cancelTimer(_timeoutTimer);
      _timeoutTimer = null;
    }

    // next cancel all active connections ...
    for (int i = 0; i < _active.length; ++i) {
      if (_active[i] != null) {
        CrawlContext context = (CrawlContext) _active[i].getContext();
        _active[i].setContext(null);
        // close the connection
        _active[i].close();
        // null out the slot ...
        _active[i] = null;
        // and add the item back to the pending list ...
        _pending.addFirst(context.getURL());
      }
    }

    // clear appropriate summary fields ...
    connectionCount = 0;
    resolvingCount = 0;
    connectingCount = 0;
    sendingCount = 0;
    receivingCount = 0;
  }

  private void logGET(CrawlTarget url, int index) {
    if (Environment.detailLogEnabled()) {
      // if ((url.getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
      StringBuffer sb = new StringBuffer();

      sb.append(String.format("%1$20.20s ", CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis())));
      sb.append(String.format("%1$4.4s ", url.getResultCode()));
      sb.append(String.format("%1$4.4s ", url.getRetryCount()));
      sb.append(String.format("%1$4.4s ", url.getRedirectCount()));
      sb.append(url.getOriginalURL());
      sb.append(" ");
      if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
        sb.append(url.getRedirectURL());
      }
      CrawlerServer.getEngine().getGETLog().info(sb.toString());
      // }
    }
  }

  /**
   * figure out which ip address (source) to use for the specified crawl target
   *
   * @return index number of the interface to use
   */
  private int getCrawlInterfaceForCrawlTarget(CrawlTarget target) {
    if (target.getCrawlInterface() != -1) {
      return target.getCrawlInterface();
    }
    if (target.getSourceList() != null) {
      // save current interface
      int nextCrawlInterface = target.getSourceList().getNextCrawlInterface();
      // set next interface
      target.getSourceList().setNextCrawlInterface((nextCrawlInterface + 1) % _crawlInterfaces.length);
      // set affinity in target
      target.setCrawlInterface(nextCrawlInterface);
      return nextCrawlInterface;
    } else {
      return 0;
    }
  }
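
  /*
   * Illustration of the round-robin above (hedged, assuming two configured
   * crawl interfaces): fresh targets from the same source list get pinned to
   * interfaces 0, 1, 0, 1, ...; once a target has an affinity
   * (getCrawlInterface() != -1) it keeps that interface across retries and
   * redirects. With no source list, interface 0 is always used.
   */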
  private boolean fillSlot(int index, CrawlTarget optionalTarget) {
    // don't fill slot in paused state ...
    if (!isPaused() || optionalTarget != null) {
      if (_active[index] != null) {
        LOG.error("fill Slot Called on Non-Empty Slot:" + index + " With URL:" + _active[index].getURL());
      }
      // if there are pending urls ...
      if (optionalTarget != null || _pending.size() != 0) {
        // pop a url off of the queue ... or use the optionally passed in target
        CrawlTarget crawlTarget = (optionalTarget != null) ? optionalTarget : _pending.removeFirst();

        try {
          URL fetchURL = new URL(crawlTarget.getActiveURL());
          URL originalURL = (crawlTarget.getRedirectCount() == 0) ? fetchURL : new URL(crawlTarget.getOriginalURL());

          // open a new connection and assign it to the available slot ...
          if (_crawlInterfaces != null) {
            _active[index] = new NIOHttpConnection(fetchURL,
                _crawlInterfaces[getCrawlInterfaceForCrawlTarget(crawlTarget)], _selector, _resolver,
                crawlTarget.getCookieStore());
          } else {
            _active[index] = new NIOHttpConnection(fetchURL, _selector, _resolver, crawlTarget.getCookieStore());
          }
          // LOG.info("### FETCHER Alloc HTTPConnect to:" + fetchURL + " Slot:" + index);

          // TODO: MAJOR HACK
          // disable proxy requests for robots
          if ((crawlTarget.getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
            if (CrawlerServer.getServer().getProxyAddress() != null) {
              // check to see if we should be using a proxy server
              _active[index].setProxyServer(CrawlerServer.getServer().getProxyAddress());
            }
          }

          // add in special source header
          _active[index].getRequestHeaders().setIfNotSet("x-cc-id", _crawlerName);

          // add in cache tests if present
          if (crawlTarget.getRedirectCount() == 0) {
            if (crawlTarget.getLastModifiedTime() != -1) {
              _active[index].getRequestHeaders().setIfNotSet("If-Modified-Since",
                  http_date_format.format(new Date(crawlTarget.getLastModifiedTime())));
            }
            if (crawlTarget.getETag() != null) {
              _active[index].getRequestHeaders().setIfNotSet("If-None-Match", crawlTarget.getETag());
            }
          }

          _activeVersions[index] = (short) ((_activeVersions[index] + 1) % 10);

          long currentTime = System.currentTimeMillis();

          if (crawlTarget.getRedirectCount() != 0) {
            String newHost = fetchURL.getHost();
            String originalHost = originalURL.getHost();
            if (newHost != null && originalHost != null && newHost.equalsIgnoreCase(originalHost)) {
              crawlTarget.getSourceList().populateIPAddressForTarget(fetchURL.getHost(), crawlTarget);
            }
          }
          // IFF NOT Redirect
          else {
            // if the cached ip is still valid based on stored TTL ...
            if (crawlTarget.getServerIP() != 0 && crawlTarget.getServerIPTTL() >= System.currentTimeMillis()) {
              // then set the resolved address data members (thus enabling us to bypass dns lookup)
              _active[index].setResolvedAddress(IPAddressUtils.IntegerToInetAddress(crawlTarget.getServerIP()),
                  crawlTarget.getServerIPTTL(), null);
            } else {
              if (Environment.detailLogEnabled()) {
                if (crawlTarget.getServerIP() == 0)
                  LOG.info("#### IP Address for Host:" + fetchURL.getHost() + " Not Set. Will require DNS Resolution");
                else
                  LOG.info("#### TTL of Cached IP Expired for Host:" + fetchURL.getHost()
                      + ". Will require DNS Resolution");
              }
            }
          }

          _active[index].setListener(this);
          _active[index].setContext(new CrawlContext(crawlTarget, index));
          _active[index].setDownloadMax(DOWNLOAD_LIMIT);

          if (!_failConnections) {
            _active[index].open();
            // LOG.info("### FETCHER called open on connection to:" + fetchURL + " slot:" + index);
          } else {
            throw new IOException("Deliberately Skipped Open and FAILED Connection");
          }

          snapShotConnectionCount++;
          connectionCount++;

          if (Environment.detailLogEnabled())
            LOG.info("Filled SLOT:" + index + " With URL:" + crawlTarget.getActiveURL());

          // inform the target of the status change
          crawlTarget.fetchStarting(_active[index]);
          // LOG.info("### FETCHER called fetchStarting for:" + fetchURL + " slot:" + index);

          // log it ...
          logGET(crawlTarget, index);
          // and construct the http request ...
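          // NOTE (failure-path convention, per the TODOs below): every catch
          // clause must clear and close this slot BEFORE invoking fetchFailed,
          // because fetchFailed can synchronously re-enter fillSlot for the
          // same index.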
        } catch (UnknownHostException e) {
          // TODO: CLEAR SLOT BEFORE CALLING fetchFailed!!!!
          if (_active[index] != null) {
            _active[index].setContext(null);
            _active[index].close();
          }
          _active[index] = null;

          if (Environment.detailLogEnabled())
            LOG.error("UnknownHostException Processing URL:" + crawlTarget.getActiveURL());
          crawlTarget.fetchFailed(CrawlURL.FailureReason.MalformedURL, e.toString());
          failureCount++;
        } catch (MalformedURLException e) {
          // TODO: CLEAR SLOT BEFORE CALLING fetchFailed!!!!
          if (_active[index] != null) {
            _active[index].setContext(null);
            _active[index].close();
          }
          _active[index] = null;

          if (Environment.detailLogEnabled())
            LOG.error("Malformed URL Exception Processing URL:" + crawlTarget.getActiveURL());
          crawlTarget.fetchFailed(CrawlURL.FailureReason.MalformedURL, e.toString());
          failureCount++;
        } catch (IOException e2) {
          if (Environment.detailLogEnabled())
            LOG.error("IOException Processing URL:" + crawlTarget.getActiveURL() + " Details:" + e2.getMessage());
          // TODO: WATCH IT!!! - always clear slot FIRST because fetchFailed calls back into fillSlot!!!!
          if (_active[index] != null) {
            _active[index].setContext(null);
            _active[index].close();
          }
          _active[index] = null;

          // LOG.debug("Fetch FAILED URL:" + context.getURL().getURL() + " Code:" + failureCode);
          // notify url of failure ...
          // TODO: Investigate if it is SANE!!! to call back into fillSlot from fetchFailed !!!
          crawlTarget.fetchFailed(CrawlURL.FailureReason.IOException, e2.getMessage());
          failureCount++;
        } catch (Exception e) {
          LOG.error("Runtime Exception Processing URL:" + crawlTarget.getActiveURL() + " Details:" + e.getMessage());
          // TODO: WATCH IT!!! - always clear slot FIRST because fetchFailed calls back into fillSlot!!!!
          if (_active[index] != null) {
            _active[index].setContext(null);
            _active[index].close();
          }
          _active[index] = null;

          // LOG.debug("Fetch FAILED URL:" + context.getURL().getURL() + " Code:" + failureCode);
          // notify url of failure ...
          // TODO: Investigate if it is SANE!!! to call back into fillSlot from fetchFailed !!!
          crawlTarget.fetchFailed(CrawlURL.FailureReason.RuntimeError, e.getMessage());
          failureCount++;
        }
      }
    }
    return _active[index] != null;
  }
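
  /*
   * fillQueue drives the fetcher: it runs once from start() and then every
   * TIMEOUT_TIMER_INTERVAL (1 second) from the timeout timer. When
   * checkForTimeout is set it first sweeps active slots for timed-out
   * connections, failing each with a reason derived from the state it stalled
   * in (DNS resolution, connect, or data transfer), then refills any empty
   * slots from _pending.
   */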
  private void fillQueue(boolean checkForTimeout) {
    // LOG.debug("fillQueue BEGIN- activeCount:" + connectionCount + " pendingCount:" + _pending.size());
    for (int index = 0; index < _active.length; ++index) {
      if (_active[index] != null && checkForTimeout) {
        if (_active[index].checkForTimeout()) {
          CrawlContext context = (CrawlContext) _active[index].getContext();
          NIOHttpConnection theTimedOutConnection = _active[index];

          if (context != null) {
            if (Environment.detailLogEnabled())
              LOG.error("Fetch TimedOut for Original URL:" + context.getURL().getOriginalURL() + " ActiveURL:"
                  + context.getURL().getActiveURL());

            switch (theTimedOutConnection.getTimeoutState()) {
              case AWAITING_RESOLUTION:
                // reduce resolving count if necessary ...
                resolvingCount--;
                break;
              case AWAITING_CONNECT:
                connectingCount--;
                break;
              case SENDING_REQUEST:
                sendingCount--;
                break;
              case RECEIVING_HEADERS:
                receivingCount--;
                break;
            }

            // TODO: DO ALL SLOT OPERATIONS BEFORE CALLING fetchFailed since it is calling back into fillQueue!!! BAD!!!
            _active[index].setContext(null);
            _active[index].close();
            _active[index] = null;

            connectionCount--;
            failureCount++;

            if (theTimedOutConnection.getTimeoutState() == NIOHttpConnection.State.AWAITING_CONNECT) {
              context.getURL().fetchFailed(CrawlURL.FailureReason.ConnectTimeout,
                  "TimedOut in Fill Queue AWAITING_CONNECT");
            } else if (theTimedOutConnection.getTimeoutState() == NIOHttpConnection.State.AWAITING_RESOLUTION) {
              context.getURL().fetchFailed(CrawlURL.FailureReason.DNSFailure,
                  "TimedOut in Fill Queue AWAITING_RESOLUTION");
            } else {
              context.getURL().fetchFailed(CrawlURL.FailureReason.Timeout,
                  "TimedOut in Fill Queue RECEIVING_DATA");
            }
          } else {
            LOG.error("Context NULL in fillQueue call");
            throw new RuntimeException("Context Should NOT be NULL");
          }
        }
      }

      // if the current slot is empty ....
      if (_active[index] == null) {
        // try to fill the slot ...
        if (!fillSlot(index, null)) {
          // if failed to fill slot, either break out or continue (if checking for timeouts)
          if (!checkForTimeout)
            break;
        }
      }
    }
    // LOG.debug("fillQueue END- activeCount:" + connectionCount + " pendingCount:" + _pending.size());
  }

  /** NIOHttpConnection.Listener overloads **/

  // @Override
  public void HttpConnectionStateChanged(NIOHttpConnection theConnection, State oldState, State state) {
    if (Environment.detailLogEnabled())
      LOG.info("URL:" + theConnection.getURL() + " OldState:" + oldState + " NewState:" + state);

    // only process events if we are in a running state ...
    if (_running) {
      if (oldState == State.AWAITING_RESOLUTION) {
        // reduce resolving count if necessary ...
        resolvingCount--;
      } else if (oldState == State.AWAITING_CONNECT) {
        connectingCount--;
      } else if (oldState == State.SENDING_REQUEST) {
        sendingCount--;
      } else if (oldState == State.RECEIVING_HEADERS) {
        receivingCount--;
      }

      if (state == State.DONE || state == State.ERROR) {
        // log it ...
        if (Environment.detailLogEnabled())
          LOG.debug("ConnectionState for URL:" + theConnection.getURL() + " Changed from:" + oldState + " to:" + state);

        // get context
        CrawlContext context = (CrawlContext) theConnection.getContext();

        if (context == null) {
          LOG.error("Context is NULL for Connection to URL:" + theConnection.getURL() + " Connection State:" + state);
        } else {
          // TODO: RELEASE SLOT UPFRONT !!!
          if (Environment.detailLogEnabled())
            LOG.info("Releasing SLOT:" + context.getIndex() + " URL:" + _active[context.getIndex()].getURL());

          // either way, this connection is now dead ...
          _active[context.getIndex()].setContext(null);
          _active[context.getIndex()].close();
          _active[context.getIndex()] = null;

          // decrement active count
          connectionCount--;
          // increment finish count no matter what ...
          finishCount++;

          if (state == State.DONE) {
            URLFP urlFingerprint = URLUtils.getURLFPFromURL(
                (context.getURL().getRedirectCount() == 0) ? context.getURL().getOriginalURL()
                    : context.getURL().getRedirectURL(), false);
            // update local history bloom filter
            CrawlerServer.getEngine().getLocalBloomFilter().add(urlFingerprint);

            try {
              // increment success count and process results ...
              successCount++;

              if (snapShotTime != -1) {
                // increment snapshot stats ...
                snapShotURLCount++;
                // increment cumulative count
                cumilativeURLCount++;
              }

              // handle redirects ...
              if (theConnection.isRedirectResponse()) {
                // if redirect count == 0, preserve original data ...
                if (context.getURL().getRedirectCount() == 0) {
                  context.getURL().cacheOriginalRequestData(theConnection);
                }
                // increment redirect counter ...
                context.getURL().incRedirectCount();
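                // What follows resolves the redirect target: an absolute
                // Location (http:// or https://) is used verbatim, anything
                // else is resolved against the original URL via
                // new URL(base, spec); targets already recorded in the local
                // bloom filter are rejected as previously visited.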
                // if either max redirect limit exceeded or location is null ...
                if (context.getURL().getRedirectCount() > MAX_REDIRECTS
                    || theConnection.getRedirectLocation() == null) {
                  String errorDescription = null;
                  if (context.getURL().getRedirectCount() > MAX_REDIRECTS)
                    errorDescription = "Max Redirect Count Exceeded";
                  else
                    errorDescription = "Location not found in Redirect Headers";
                  // fail the url ...
                  context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed, errorDescription);
                }
                // otherwise, silently re-queue and process the redirect ...
                else {
                  try {
                    URL originalURL = new URL(context.getURL().getOriginalURL());
                    String redirectLocation = theConnection.getRedirectLocation().toLowerCase();
                    URL redirectURL = (redirectLocation.startsWith("http://") || redirectLocation.startsWith("https://"))
                        ? new URL(theConnection.getRedirectLocation())
                        : new URL(originalURL, theConnection.getRedirectLocation());
                    String redirectURLStr = redirectURL.toString();

                    // by default process the url ...
                    boolean processRedirect = true;

                    if ((context.getURL().getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
                      // but if url is different from original url ...
                      if (!redirectURLStr.equals(context.getURL().getOriginalURL())) {
                        URLFP redirectFingerprint = URLUtils.getURLFPFromURL(redirectURLStr, false);
                        if (redirectFingerprint != null) {
                          // validate the url against the bloom filter to see that we have not visited it before ...
                          if (CrawlerServer.getEngine().getLocalBloomFilter().isPresent(redirectFingerprint)) {
                            // yes we have ... fail the url ...
                            LOG.info("!!!!Rejecting redirect. from:" + originalURL + " to:" + redirectURL
                                + ". Already Visited Target URL");
                            context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed,
                                "Already Visited Redirect Location:" + theConnection.getRedirectLocation());
                            processRedirect = false;
                          }
                        } else {
                          LOG.error("!!!!Rejecting redirect. from:" + originalURL + " to:" + redirectURL
                              + ". Redirect Fingerprint returned Null Fingerprint! RedirectString:"
                              + theConnection.getRedirectLocation());
                        }
                      }
                    }

                    if (processRedirect) {
                      if (Environment.detailLogEnabled())
                        LOG.info("Redirecting request:" + originalURL + " to:" + redirectURL);
                      // set up redirect metadata ...
                      context.getURL().setFlags(context.getURL().getFlags() | CrawlURL.Flags.IsRedirected);
                      context.getURL().setRedirectURL(redirectURLStr);
                      // refill slot ...
                      fillSlot(context.getIndex(), context.getURL());
                    }
                    // }
                    // else {
                    //   // circular redirect fail case
                    //   context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed,
                    //       "Circular Redirect detected:" + theConnection.getRedirectLocation());
                    // }
                  } catch (MalformedURLException e) {
                    // invalid url fail case ...
                    context.getURL().fetchFailed(CrawlURL.FailureReason.RedirectFailed,
                        "Malformed URL:" + theConnection.getRedirectLocation());
                  }
                }
              }
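              // Success path: a fetch that arrived here via redirects also
              // records its final (active) URL in the local bloom filter, so
              // later redirects to the same location get rejected by the
              // visited check above.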
              else {
                // ok, before passing things on ... check to see if this was a successful get as a result of a redirect ...
                if (context.getURL().getRedirectCount() != 0 && context.getURL().getActiveURL() != null) {
                  URLFP fingerprint = URLUtils.getURLFPFromURL(context.getURL().getActiveURL(), false);
                  if (fingerprint == null) {
                    LOG.error("####!!!! getURLFPFromURL Returned NULL FOR URL" + context.getURL().getActiveURL());
                  } else {
                    CrawlerServer.getEngine().getLocalBloomFilter().add(fingerprint);
                  }
                }

                _avgDownloadSize.addSample((double) theConnection.getContentBuffer().available());

                // process this as a successful get
                context.getURL().fetchSucceeded(theConnection, theConnection.getResponseHeaders(),
                    theConnection.getContentBuffer());
              }
            } catch (Exception e) {
              LOG.error("Exception processing HttpConnectionStateChange-DONE:" + CCStringUtils.stringifyException(e));
              context.getURL().fetchFailed(CrawlURL.FailureReason.RuntimeError,
                  "Exception:" + CCStringUtils.stringifyException(e));
            }
          } else if (state == State.ERROR) {
            // increment failure count ...
            failureCount++;

            int failureCode = CrawlURL.FailureReason.UNKNOWN;

            // generate accurate failure reason ...
            switch (theConnection.getErrorType()) {
              case RESOLVER_FAILURE:
                failureCode = CrawlURL.FailureReason.ResolverFailure;
                break;
              case DNS_FAILURE:
                failureCode = CrawlURL.FailureReason.DNSFailure;
                break;
              case IOEXCEPTION:
                failureCode = CrawlURL.FailureReason.IOException;
                break;
              case TIMEOUT:
                failureCode = CrawlURL.FailureReason.Timeout;
                break;
            }

            // LOG.debug("Fetch FAILED URL:" + context.getURL().getURL() + " Code:" + failureCode);
            // notify url of failure ...
            context.getURL().fetchFailed(failureCode,
                (theConnection.getErrorDesc() != null) ? theConnection.getErrorDesc()
                    : "ERROR During Connection State Change");
          }

          // repopulate slot (if possible)
          if (_active[context.getIndex()] == null) {
            fillSlot(context.getIndex(), null);
          }
        }
      }
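      // The remaining transitions maintain the per-state gauges reported by
      // collectStats; SENDING_REQUEST additionally notifies the target that
      // its request is actually going out on the wire (fetchStarted).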
      else if (state == State.AWAITING_RESOLUTION) {
        resolvingCount++;
      } else if (state == State.AWAITING_CONNECT) {
        connectingCount++;
      } else if (state == State.SENDING_REQUEST) {
        // get context
        CrawlContext context = (CrawlContext) theConnection.getContext();
        // if context is valid ... send the crawl target a fetchStarted event ...
        if (context != null) {
          context.getURL().fetchStarted();
        } else {
          LOG.error("SENDING_REQUEST STATE TRIGGERED W/ NULL CONTEXT URL:" + theConnection.getURL());
        }
        sendingCount++;
      } else if (state == State.RECEIVING_HEADERS) {
        receivingCount++;
      }
    }
  }

  // @Override
  public void HttpContentAvailable(NIOHttpConnection theConnection, NIOBufferList contentBuffer) {
    // NOOP
  }

  public void collectStats(CrawlerStats crawlerStats, RuntimeStatsCollector stats) {
    _snapshotNumber++;

    long curTime = System.currentTimeMillis();

    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_ActiveConnections, connectionCount);
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_FetcherQueueSize, _pending.size());
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_TotalSuccessfulConnects, successCount);
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_TotalFailedConnects, failureCount);
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInResolvingState, resolvingCount);
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInConnectingState, connectingCount);
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInSendingState, sendingCount);
    stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_ConnectionsInRecevingState, receivingCount);

    if (snapShotTime != -1) {
      stats.setIntValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_TimeDeltaBetweenSnapshots,
          (int) (curTime - snapShotTime));
    }

    double urlsPerSecond = 0;
    int bytesSnapShot = 0;
    double bytesPerSec = 0;

    // if last snapshot time is set ...
    if (snapShotTime != -1) {
      // calculate urls / sec
      int millisecondsElapsed = (int) (curTime - snapShotTime);
      urlsPerSecond = ((double) snapShotURLCount / ((double) millisecondsElapsed / 1000.00));

      _urlsPerSecMovingAverage.addSample(urlsPerSecond);
      _urlsPerSecSmoothed.addSample(urlsPerSecond);

      cumilativeURLSSEC += urlsPerSecond;
      snapShotCount += 1;

      bytesSnapShot = (int) (NIOHttpConnection.getCumilativeBytesRead() - this.snapShotDownloadAmt);
      snapShotDownloadAmt = NIOHttpConnection.getCumilativeBytesRead();

      bytesPerSec = ((double) bytesSnapShot / ((double) millisecondsElapsed / 1000.00));

      _kbPerSecMovingAverage.addSample(bytesPerSec / 1000.00);
      _kbPerSecSmoothed.addSample(bytesPerSec / 1000.00);

      cumilativeDownloadPerSec += bytesPerSec;
    }
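
    // Worked example (hedged, illustrative numbers only): with 5 seconds
    // elapsed since the last snapshot, snapShotURLCount == 250 and 10 MB
    // newly read, urlsPerSecond == 50.0 and bytesPerSec == 2,000,000,
    // i.e. 2000 KB/s fed into the moving and smoothed averages above.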
    snapShotTime = curTime;
    if (firstSnapShotTime == -1)
      firstSnapShotTime = snapShotTime;

    snapShotURLCount = 0;
    snapShotConnectionCount = 0;

    synchronized (crawlerStats) {
      crawlerStats.setUrlsPerSecond((float) _urlsPerSecMovingAverage.getAverage());
      crawlerStats.setMbytesDownPerSecond((float) (_kbPerSecMovingAverage.getAverage() / 1000.00));
      crawlerStats.setBytesDownloaded(crawlerStats.getBytesDownloaded() + bytesSnapShot);
      crawlerStats.setAverageDownloadSize((float) _avgDownloadSize.getAverage());
    }

    stats.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_SnapshotURLSPerSecond, urlsPerSecond);
    stats.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_MovingAverageURLSPerSecond,
        _urlsPerSecMovingAverage.getAverage());
    stats.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_SmoothedURLSPerSecond,
        _urlsPerSecSmoothed.getAverage());
    stats.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_SnapshotKBPerSec, bytesPerSec / 1000.00);
    stats.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_MovingAverageKBPerSec,
        _kbPerSecMovingAverage.getAverage());
    stats.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_SmoothedKBPerSec,
        _kbPerSecSmoothed.getAverage());

    int active = 0;

    StringBuffer sb = new StringBuffer();
    int MAX_LINE_LEN = 20;

    // connection map legend: '-' empty slot, '?' connection not yet opened,
    // 'r'/'R' awaiting dns resolution (under / over 60s), digit = slot version
    // (open <= 60s), '!' open 60-120s, '$' open > 120s; slots whose version
    // changed since the last snapshot are wrapped in red FONT tags.
    sb.append("[");
    for (int i = 0; i < MAX_LINE_LEN; ++i) {
      sb.append(i % 10);
    }
    sb.append("]\n[");

    long currentTime = System.currentTimeMillis();

    // iterate connections ...
    int i = 0;
    for (; i < _active.length;) {
      stats.setArrayValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_LaggingConnectionDetailArray,
          _active.length, i, null);

      if (_active[i] != null) {
        if (_activeVersions[i] != _trailingVersions[i]) {
          sb.append("<FONT color=red>");
        }

        if (_active[i].getOpenTime() == -1) {
          sb.append("?");
        } else if (_active[i].getState() == NIOHttpConnection.State.AWAITING_RESOLUTION) {
          if (currentTime - _active[i].getOpenTime() <= 60000)
            sb.append("r");
          else
            sb.append("R");
        } else if (currentTime - _active[i].getOpenTime() <= 60000) {
          sb.append(_activeVersions[i]);
        } else if (currentTime - _active[i].getOpenTime() <= 120000) {
          sb.append("!");
          stats.setArrayValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_LaggingConnectionDetailArray,
              _active.length, i,
              "[!][" + (currentTime - _active[i].getOpenTime()) + "]" + _active[i].getURL().toString());
        } else {
          sb.append("$");
          stats.setArrayValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_LaggingConnectionDetailArray,
              _active.length, i,
              "[$][" + (currentTime - _active[i].getOpenTime()) + "]" + _active[i].getURL().toString());
        }
        active++;
      } else {
        sb.append("-");
      }

      if (_activeVersions[i] != _trailingVersions[i]) {
        sb.append("</FONT>");
        _trailingVersions[i] = _activeVersions[i];
      }

      if (++i % MAX_LINE_LEN == 0) {
        sb.append("]\n[");
      }
    }
    for (; i % MAX_LINE_LEN != 0; ++i) {
      sb.append(" ");
    }
    sb.append("]");

    stats.setStringValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_ConnectionMap, sb.toString());

    stats.setLongValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_CumilativeKBytesIN,
        NIOHttpConnection.getCumilativeBytesRead() / 1000);
    stats.setLongValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.HTTPFetcher_CumilativeKBytesOUT,
        NIOHttpConnection.getCumilativeBytesWritten() / 1000);
  }
}