/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.commoncrawl.service.crawler; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.net.MalformedURLException; import java.net.URL; import java.util.LinkedList; import java.util.concurrent.Callable; import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.atomic.AtomicLong; import java.util.zip.CRC32; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.record.Buffer; import org.commoncrawl.util.GZIPUtils; import org.commoncrawl.async.ConcurrentTask; import org.commoncrawl.async.EventLoop; import org.commoncrawl.common.Environment; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.io.NIOHttpConnection; import org.commoncrawl.io.NIOHttpHeaders; import org.commoncrawl.protocol.CrawlURL; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.service.crawler.PersistentCrawlTarget; import org.commoncrawl.service.crawler.RobotRulesParser.RobotRuleSet; import org.commoncrawl.service.crawler.filters.FilterResults; import org.commoncrawl.service.statscollector.CrawlerStats; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.FileUtils; import org.commoncrawl.util.IPAddressUtils; import org.commoncrawl.util.IntrusiveList; import org.commoncrawl.util.TextBytes; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; import org.commoncrawl.util.GZIPUtils.UnzipResult; import org.commoncrawl.util.IntrusiveList.IntrusiveListElement; import org.junit.Test; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; /** * CrawlList - a collection of CrawlTargets (disk backed) * * @author rana * */ public final class CrawlList extends IntrusiveList.IntrusiveListElement<CrawlList> { /** offline (disk) storage support **/ private static int DISK_FLUSH_THRESHOLD = 50; private static int DISK_LOAD_THRESHOLD = 10; private static int IDEAL_TARGET_COUNT = 25; private static final int MAX_ROBOTS_EXCLUSION_IN_LOOP = 3; private static final int MAX_FAILED_TARGETS_IN_LOOP = 50; private static final int IOEXCEPTION_TIMEOUT_BOOST = 60000; private static final int PAUSE_STATE_RETRY_DELAY = 10 * 60000; // 10 minutes static final AtomicLong seq = new AtomicLong(); public static class DiskQueueEntry implements Comparable<DiskQueueEntry> { final long seqNum; final CrawlList entry; final boolean isLoadRequest; public DiskQueueEntry(CrawlList item,boolean isLoadRequest) { this.seqNum = seq.getAndIncrement(); this.entry = item; this.isLoadRequest = isLoadRequest; } public CrawlList getListItem() { return entry; } 
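    // Ordering note (see compareTo below): the null-list shutdown sentinel sorts ahead of
    // everything else, disk *load* requests sort ahead of flush requests, and entries of the
    // same kind are serviced FIFO via the monotonically increasing sequence number.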
    @Override
    public int compareTo(DiskQueueEntry o) {
      if (entry == null && o.entry != null)
        return -1;
      else if (o.entry == null && entry != null)
        return 1;
      else {
        if (isLoadRequest && !o.isLoadRequest)
          return -1;
        else if (!isLoadRequest && o.isLoadRequest)
          return 1;
        else {
          return ((Long)seqNum).compareTo(o.seqNum);
        }
      }
    }
  }

  private static PriorityBlockingQueue<DiskQueueEntry> _diskOperationQueue = new PriorityBlockingQueue<DiskQueueEntry>();
  private static Thread _diskOperationThread = null;
  private static boolean _diskOpThreadShuttingDown = false;
  private static long _diskHeaderActiveVersionTimestamp = System.currentTimeMillis();

  /** logging **/
  private static final Log LOG = LogFactory.getLog(CrawlList.class);

  /** server responsible for servicing this domain **/
  private CrawlListHost _host;

  /** host name **/
  private String _listName;

  /** host metadata */
  private int _baseListId;

  /** unique list id **/
  private long _uniqueListId;

  /** next crawl interface used to service this list **/
  private int _nextCrawlInterface = 0;

  /** cumulative list of crawl targets associated with this queue ...*/
  private IntrusiveList<CrawlTarget> _pending = new IntrusiveList<CrawlTarget>();

  /** list of crawl targets directly scheduled for disk queue */
  private IntrusiveList<CrawlTarget> _queued = new IntrusiveList<CrawlTarget>();

  /** offline item count - the set of crawl targets that are stored offline on disk **/
  private int _offlineTargetCount = 0;

  /** disk request pending **/
  private boolean _diskRequestPending = false;

  /** currently scheduled item **/
  private CrawlTarget _scheduled = null;

  /** fetch start time **/
  private long _fetchStartTime = -1;

  /** fetch end time **/
  private long _fetchEndTime = -1;

  /** last successful fetch time (total time in milliseconds)**/
  private int _lastRequestDownloadTime = -1;

  /** last successful request redirect count **/
  private int _lastRequestRedirectCount = 0;

  /** active connection **/
  private NIOHttpConnection _activeConnection;

  /** SubDomain Stats and Robots State Information Structure **/
  private static class DomainInfo extends IntrusiveListElement<DomainInfo> {
    public String _domainName;
    public boolean _domainFailed = false;
    public boolean _domainBlackListed = false;
    public long _robotsCRC = -1;
    public long _lastTouched;
    /** host retry counter **/
    public byte _domainRetryCounter = 0;
    /** total 400 errors **/
    public int _HTTP400Count = 0;
    /** total 500 errors **/
    public int _HTTP500Count = 0;
    /** total 200 status code count**/
    public int _HTTP200Count = 0;
    /** sequential failure count **/
    public short _SequentialHTTPFailuresCount = 0;
    public boolean _robotsReturned400;
    public boolean _robotsReturned403;
  }

  /** domain info map **/
  private IntrusiveList<DomainInfo> _domainInfo = new IntrusiveList<DomainInfo>();

  /** active domain info **/
  private DomainInfo _activeDomainInfo;

  /** the active robots rule set to apply robots policy for this host **/
  private RobotRuleSet _ruleSet = null;

  /** robots returned 400 **/
  private boolean _robotsReturned400;
  private boolean _robotsReturned403;

  /** the crc value for the active rule set - computed at pre-parse time**/
  private long _robotsCRC = 0;

  /** robots file retrieved */
  private boolean _robotsRetrieved;

  /** robots host name **/
  private String _robotsHostName;

  /** last fetched robots host Name**/
  private String _lastFetchedRobotsHostName;

  /** last fetched robots data **/
  private String _lastFetchedRobotsData;

  /** crc calculator **/
  private static CRC32 _crc32 = new CRC32();

  /** last request was io exception **/
  private boolean _lastRequestWasIOException = false;

  private static final int MAX_ITEM_RETRY = 2;
  private static final int MAX_HOST_RETRY = 7;
  private static final int DEFAULT_ITEM_RETRY_WAIT_TIME = 20000;
  private static final int DEFAULT_HOST_RETRY_WAIT_TIME = 20000;
  private static final int MIN_CRAWL_DELAY = 1;
  private static final int MAX_CRAWL_DELAY = 3500;
  private static int STATS_CHECK_CODE_SAMPLE_THRESHOLD = 50; // don't do anything until we have retrieved at least 50 urls
  private static float BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD = .80f; // if 80% of urls are bad, fail the domain
  // private static int SEQUENTIAL_FAILURES_ON_403_ROBOTS_TRIGGER = 500;
  private static int SEQUENTIAL_FAILURES_NO_200_TRIGGER = 10; // if we get 10 sequential failures we bail
  // private static int SEQUENTIAL_FAILURES_SOME_200_TRIGGER = 1000;
  private static final int MAX_DNS_CACHE_ITEMS = 100;
  private static int MAX_DOMAIN_CACHE_ENTIRES = 1000;

  /** the reference to the singleton server object **/
  private static CrawlerServer _server = null;

  public enum Disposition {
    ItemAvailable,
    WaitingOnCompletion,
    WaitingOnTime,
    QueueEmpty
  }

  /** domain's disposition (state) **/
  private Disposition _disposition;

  private static byte WWWRULE_Remove = 1 << 0;
  private static byte WWWRULE_Add = 1 << 1;

  /** www rewrite rule patterns **/
  static class WWWReWriteItem extends IntrusiveList.IntrusiveListElement<WWWReWriteItem> {

    public WWWReWriteItem(String domainName, byte ruleType) {
      _wwwRuleDomain = domainName;
      _wwwRuleType = ruleType;
      _lastUpdateTime = System.currentTimeMillis();
    }

    public String _wwwRuleDomain = null;
    public long _lastUpdateTime = -1;
    public byte _wwwRuleType = 0;
  };

  private static final int MAX_REWRITE_ITEMS = 5;

  IntrusiveList<WWWReWriteItem> _rewriteItemList = null;

  static class DNSCacheItem extends IntrusiveList.IntrusiveListElement<DNSCacheItem> {

    public DNSCacheItem(String hostName, int ipAddress, long ttl) {
      _hostName = hostName;
      _ipAddress = ipAddress;
      _ttl = ttl;
      _lastAccessTime = System.currentTimeMillis();
    }

    public String _hostName;
    public int _ipAddress;
    public long _ttl;
    public long _lastAccessTime = -1;
  }

  IntrusiveList<DNSCacheItem> _dnsCacheItem = new IntrusiveList<DNSCacheItem>();

  public void cacheDNSEntry(String hostName, int ipAddress, long ttl) {
    DNSCacheItem oldestItem = null;
    DNSCacheItem found = null;

    for (DNSCacheItem item : _dnsCacheItem) {
      if (item._hostName.equals(hostName)) {
        item._ipAddress = ipAddress;
        item._ttl = ttl;
        item._lastAccessTime = System.currentTimeMillis();
        found = item;
      }
      oldestItem = (oldestItem == null) ? item : (oldestItem._lastAccessTime > item._lastAccessTime) ? item : oldestItem;
    }

    if (found == null) {
      if (_dnsCacheItem.size() == MAX_DNS_CACHE_ITEMS) {
        //LOG.info("###DNS Cache Full for Host:" + getListName() + " Flushing Host:" + oldestItem._hostName);
        _dnsCacheItem.removeElement(oldestItem);
      }
      _dnsCacheItem.addHead(new DNSCacheItem(hostName, ipAddress, ttl));
    }
    else {
      _dnsCacheItem.removeElement(found);
      _dnsCacheItem.addHead(found);
    }
  }

  private void addWWWReWriteItem(String originalItem, byte itemType) {
    WWWReWriteItem oldestItem = null;
    WWWReWriteItem found = null;

    if (_rewriteItemList == null) {
      _rewriteItemList = new IntrusiveList<WWWReWriteItem>();
    }

    for (WWWReWriteItem item : _rewriteItemList) {
      if (item._wwwRuleDomain.equals(originalItem)) {
        item._lastUpdateTime = System.currentTimeMillis();
        found = item;
      }
      oldestItem = (oldestItem == null) ? item : (oldestItem._lastUpdateTime > item._lastUpdateTime) ?
item : oldestItem; } if (found == null) { if (_rewriteItemList.size() == MAX_REWRITE_ITEMS) { _rewriteItemList.removeElement(oldestItem); } _rewriteItemList.addHead(new WWWReWriteItem(originalItem,itemType)); } if (found != null && found != _rewriteItemList.getHead()) { _rewriteItemList.removeElement(found); _rewriteItemList.addHead(found); } } public CrawlList(CrawlListHost crawlHost,int baseListId) { _host = crawlHost; _baseListId = baseListId; _uniqueListId = (((long)crawlHost.getIPAddress()) << 32) | _baseListId; _listName = "List:" + _baseListId + " For:" + IPAddressUtils.IntegerToIPAddressString(crawlHost.getIPAddress()); _robotsRetrieved = false; _disposition = Disposition.QueueEmpty; } /** host access **/ public CrawlListHost getHost() { return _host; } /** server access **/ static CrawlerServer getServerSingleton() { return _server; } static void setServerSingleton(CrawlerServer server) { _server = server; } public int getListId() { return _baseListId; } public long getUniqueListId() { return _uniqueListId; } /** host name **/ public String getListName() { return _listName; } /** disposition **/ Disposition getDisposition() { return _disposition; } void updateLastModifiedTime(long time) { _host.updateLastModifiedTime(time); } /** get next crawl interface used to service this list * */ public int getNextCrawlInterface() { return _nextCrawlInterface; } /** set next crawl interface to use for this list * */ public void setNextCrawlInterface(int crawlInterface) { _nextCrawlInterface = crawlInterface; } /** get the pending urls count **/ synchronized int getPendingURLCount() { return _pending.size(); } /** get the offline url count */ synchronized int getOfflineURLCount() { return _offlineTargetCount; } boolean isScheduled() { return _scheduled != null; } int getActiveURLCount() { return (_scheduled != null) ? 1 : 0; } void updateLastFetchStartTime(long newTime) { _fetchStartTime = newTime; getHost().updateLastFetchStartTime(newTime); } long getLastFetchStartTime() { return _fetchStartTime; } /** get fetch time in milliseconds for last request **/ int getLastRequestFetchTime() { if (_fetchStartTime != -1 && _fetchEndTime != -1){ return (int) Math.max(0,_fetchEndTime - _fetchStartTime); } return 0; } int getLastSuccessfulDownloadTime() { return _lastRequestDownloadTime; } public synchronized void stopCrawl() { // add anything scheduled to pending ... 
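    // (note: only the in-flight target is re-queued here and the disposition is recomputed from
    //  the in-memory pending list; the offline, on-disk target count is not consulted)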
if (_scheduled != null) { _pending.addTail(_scheduled); _scheduled = null; } if (_pending.size() != 0 || !_robotsRetrieved) { _disposition = Disposition.ItemAvailable; } else { _disposition = Disposition.QueueEmpty; } } /** add a new crawl target to the host queue **/ public synchronized void addCrawlTarget(CrawlTarget target,boolean toFrontOfQueue) { // LOG.info("DOMAIN:" + this.getHostName() + " ADDING TGT:" + target.getOriginalURL()); Disposition oldDisposition = _disposition; if (toFrontOfQueue) { target.setFlags(target.getFlags() | CrawlURL.Flags.IsHighPriorityURL); _pending.addHead(target); } else { // if offline != 0 add to pending if (_offlineTargetCount == 0) { _pending.addTail(target); } // otherwise add to queued else { //LOG.info("### QUEUED Adding to Queued List for CrawlList:" + getListName()); _queued.addTail(target); } } if (_disposition == Disposition.QueueEmpty) { _disposition = Disposition.ItemAvailable; } if (oldDisposition != _disposition && getHost().getActiveList() == this) { getHost().listDispositionChanged(this, oldDisposition, _disposition); } if (_pending.size() >= DISK_FLUSH_THRESHOLD || _queued.size() != 0) { if (!_diskRequestPending) { _diskRequestPending = true; _diskOperationQueue.add(new DiskQueueEntry(this,false)); } } } private boolean activeDomainRequiresRobotsFetch(String activeDomainName) { if (_robotsRetrieved && _robotsHostName != null && _robotsHostName.equalsIgnoreCase(activeDomainName)) { // no, the active robots file matches the active domain name. no need to fetch anything ... return false; } else { DomainInfo domainInfo = getDomainInfoFromDomain(activeDomainName); // get the cached crc for the active domain if it exists ... long cachedRobotsCRC = (domainInfo == null) ? -1 : domainInfo._robotsCRC; // if cached crc found ... if (cachedRobotsCRC != -1) { // if cached robots file matches the actvie robots file's crc ... if (_robotsRetrieved && cachedRobotsCRC == _robotsCRC) { //LOG.info("### Skipping Robots Fetch. Cached CRC == robotsCRC"); // no need to refetch return false; } // otherwise, check the host's cache ... else { // special case for the empty rule set if (cachedRobotsCRC == 0) { _robotsCRC = cachedRobotsCRC; _robotsHostName = activeDomainName; _robotsReturned400 = domainInfo._robotsReturned400; _robotsReturned403 = domainInfo._robotsReturned403; _ruleSet = RobotRulesParser.getEmptyRules(); _robotsRetrieved = true; if (Environment.detailLogEnabled()) LOG.info("### Skipping Robots Fetch. Cached CRC is Zero, indicating empty rule set."); return false; } else { // check the rule set cache in the host (by crc) RobotRuleSet ruleSet = _host.getCachedRobotsEntry(cachedRobotsCRC); // if cached object found .... if (ruleSet != null) { _robotsCRC = cachedRobotsCRC; _robotsHostName = activeDomainName; _robotsReturned400 = domainInfo._robotsReturned400; _robotsReturned403 = domainInfo._robotsReturned403; _ruleSet = ruleSet; _robotsRetrieved = true; if (Environment.detailLogEnabled()) LOG.info("### Skipping Robots Fetch. 
Cached CRC is Non-Zero and cached rule-set found via host."); return false; } } } } } return true; } public void setActiveDomainName(String hostName) { _activeDomainInfo = getDomainInfoFromDomain(hostName); } public DomainInfo getActiveDomain() { return _activeDomainInfo; } private DomainInfo getDomainInfoFromDomain(String domainName) { DomainInfo oldestItem = null; DomainInfo found = null; for (DomainInfo item : _domainInfo) { if (item._domainName.equals(domainName)) { if (getServerSingleton() != null) { if (item._lastTouched < getServerSingleton().getFilterUpdateTime()) { if (CrawlerServer.getEngine() != null) { item._domainBlackListed = CrawlerServer.getEngine().isBlackListedHost(domainName); } else { item._domainBlackListed = false; } } } item._lastTouched = System.currentTimeMillis(); found = item; } oldestItem = (oldestItem == null) ? item : (oldestItem._lastTouched > item._lastTouched) ? item : oldestItem; } if (found == null) { if (_domainInfo.size() == MAX_DOMAIN_CACHE_ENTIRES) { _domainInfo.removeElement(oldestItem); } found = new DomainInfo(); found._domainName = domainName; found._lastTouched = System.currentTimeMillis(); if (getServerSingleton() != null) { if (CrawlerServer.getEngine() != null) { found._domainBlackListed = CrawlerServer.getEngine().isBlackListedHost(domainName); } else { found._domainBlackListed = false; } } _domainInfo.addHead(found); } return found; } private long checkDomainCacheForRobotsCRC(String hostName){ for (DomainInfo aliasInfo : _domainInfo) { if (aliasInfo._domainName.equalsIgnoreCase(hostName)) { aliasInfo._lastTouched = System.currentTimeMillis(); _domainInfo.removeElement(aliasInfo); _domainInfo.addHead(aliasInfo); if (aliasInfo._robotsCRC != -1) { if (Environment.detailLogEnabled()) LOG.info("### Found Robots Match in Cache for host:" + hostName); } return aliasInfo._robotsCRC; } } return -1; } private void updateRobotsCRCForDomain(long crc,String domainName,boolean robotsReturned400,boolean robotsReturned403) { getDomainInfoFromDomain(domainName)._robotsCRC = crc; getDomainInfoFromDomain(domainName)._robotsReturned400 = robotsReturned400; getDomainInfoFromDomain(domainName)._robotsReturned403 = robotsReturned403; } private void resetRobotsState() { // flip robots status ... _robotsRetrieved = false; _robotsReturned400 = false; _robotsReturned403 = false; _robotsHostName = null; _robotsCRC = 0; _ruleSet = RobotRulesParser.getEmptyRules(); } private CrawlTarget buildRobotsRequest(String hostName) { CrawlTarget targetOut = null; // reset the robots state ... resetRobotsState(); // log the situation // LOG.info("####Robots-fetching robots for host:" + hostName); // and set up some initial robots state _robotsHostName = hostName; //build a robots.txt url URL robotsURL = null; try { robotsURL = new URL(getHost().getScheme(),_robotsHostName,"/robots.txt"); } catch (MalformedURLException e) { } if (robotsURL == null) { if (Environment.detailLogEnabled()) LOG.error("####Robots Unable to fetch Robots for host:"+ _robotsHostName); // cheat _robotsRetrieved = true; // and update the robot info in the alias map updateRobotsCRCForDomain(_robotsCRC, _robotsHostName,_robotsReturned400,_robotsReturned403); } else { // ok , the robots url is good targetOut = new CrawlTarget(0,this); // set the url ... 
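    // (the synthetic robots target is distinguished from ordinary targets primarily by the
    //  IsRobotsURL flag set just below; both fetchSucceeded and fetchFailed branch on that flag)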
targetOut.setOriginalURL(robotsURL.toString()); // and mark the target as a robots get targetOut.setFlags(CrawlURL.Flags.IsRobotsURL); CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats(); synchronized (crawlerStats) { crawlerStats.setActvieRobotsRequests(crawlerStats.getActvieRobotsRequests() + 1); } } return targetOut; } public boolean populateIPAddressForTarget(String hostName,CrawlTarget target) { for (DNSCacheItem item : _dnsCacheItem) { if (item._hostName.equalsIgnoreCase(hostName)) { if (item._ttl >= System.currentTimeMillis()) { //LOG.info("###Using Cached IP Address for target:" + target.getActiveURL() + " Cached IP:" + item._ipAddress + " TTL:" + item._ttl); target.setServerIP(item._ipAddress); target.setServerIPTTL(item._ttl); return true; } return false; } } return false; } private void applyRewriteRulesToTarget(String hostName,CrawlTarget target) { if(_rewriteItemList != null) { for (WWWReWriteItem item : _rewriteItemList) { if (item._wwwRuleDomain.equalsIgnoreCase(hostName)) { if ((item._wwwRuleType & WWWRULE_Add) != 0) { target.setOriginalURL(target.getOriginalURL().replaceFirst(hostName, "www." + hostName)); } else { target.setOriginalURL(target.getOriginalURL().replaceFirst(hostName, hostName.substring(4))); } break; } } } } static CrawlURLMetadata rewriteTestMetadata = new CrawlURLMetadata(); static FilterResults rewriteFilterResults = new FilterResults(); /** get next crawl candidate */ public synchronized CrawlTarget getNextTarget() { if (_scheduled != null) { throw new RuntimeException("Scheduled Not Null and getNextTarget called!"); } int maxRobotsExclusionInLoop = (CrawlerServer.getServer() != null) ? CrawlerServer.getServer().getMaxRobotsExlusionsInLoopOverride() : -1; if (maxRobotsExclusionInLoop == -1) { maxRobotsExclusionInLoop = MAX_ROBOTS_EXCLUSION_IN_LOOP; } // target out is currently null CrawlTarget targetOut = null; Disposition oldDisposition = _disposition; int robotsExcludedCount = 0; int failedTargetsCount = 0; String domainName = ""; while (targetOut == null && getNextPending(false) != null && getDisposition() == CrawlList.Disposition.ItemAvailable) { // pop the next target off of the queue ... CrawlTarget potentialTarget = getNextPending(true); // mark request start time potentialTarget.setRequestStartTime(System.currentTimeMillis()); // get the host name (fast) domainName = URLUtils.fastGetHostFromURL(potentialTarget.getActiveURL()); // if not valid ... fail explicitly if (domainName == null || domainName.length() == 0) { // explicitly fail this url ... CrawlTarget.failURL(potentialTarget.createFailureCrawlURLObject(CrawlURL.FailureReason.MalformedURL, null),potentialTarget, CrawlURL.FailureReason.MalformedURL,null); } else { /* // potentially rewrite domain name if (getServer().getDNSRewriteFilter() != null) { synchronized (rewriteTestMetadata) { if (getServer().getDNSRewriteFilter().filterItem(domainName, "", rewriteTestMetadata, rewriteFilterResults) == FilterResult.Filter_Modified) { LOG.info("### FILTER Rewrote DomainName:" + domainName + " To:" + rewriteFilterResults.getRewrittenDomainName()); domainName = rewriteFilterResults.getRewrittenDomainName(); } } } */ // set the active host name setActiveDomainName(domainName); // check to see if the domain has been marked as failed or the host has been marked as failed ... if (!getActiveDomain()._domainFailed && !getActiveDomain()._domainBlackListed && !_host.isFailedServer()) { // if the the active target does not match the current robots file ... 
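      // Robots gating: when the target's domain has no matching robots rules loaded, the target
      // is pushed back onto the head of the pending list and a synthetic robots.txt CrawlTarget
      // is returned in its place; the real target is re-examined on a later getNextTarget() pass,
      // once the robots fetch/parse has completed (or failed).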
if (activeDomainRequiresRobotsFetch(domainName)) { // add the target back to the head of the queue... _pending.addHead(potentialTarget); // and build a robots request ... targetOut = buildRobotsRequest(domainName); } // otherwise ... go ahead try to fetch the next url in the queue else { // now if disposition is still item available ... if (potentialTarget != null && _disposition == Disposition.ItemAvailable) { targetOut = potentialTarget; URL theTargetURL = null; try { theTargetURL = new URL(targetOut.getOriginalURL()); } catch (MalformedURLException e) { theTargetURL = null; LOG.error("Error parsing URL:"+targetOut.getOriginalURL() + " for Host:"+ domainName); } if (theTargetURL == null) { // explicitly fail this url ... CrawlTarget.failURL(targetOut.createFailureCrawlURLObject(CrawlURL.FailureReason.MalformedURL, null), targetOut,CrawlURL.FailureReason.MalformedURL,null); // and set target out to null!! targetOut = null; } else { boolean robotsExcluded = !_ruleSet.isAllowed(theTargetURL); boolean serverExcluded = false; if (!robotsExcluded) { serverExcluded = CrawlerServer.getServer().isURLInBlockList(theTargetURL); } // validate against the robots file ... if (robotsExcluded || serverExcluded) { //track number of robots exclusion in this loop ++robotsExcludedCount; // inform host _host.incrementCounter(CrawlListHost.CounterId.RobotsExcludedCount, 1); // explicitly fail this url ... if (robotsExcluded) { if (Environment.detailLogEnabled()) LOG.info("### ROBOTS Excluded URL:" + theTargetURL + " via Robots File"); CrawlTarget.failURL(targetOut.createFailureCrawlURLObject(CrawlURL.FailureReason.RobotsExcluded, null),targetOut, CrawlURL.FailureReason.RobotsExcluded,null); } else { if (Environment.detailLogEnabled()) LOG.info("### ROBOTS Excluded URL:" + theTargetURL + " via Blacklist"); CrawlTarget.failURL(targetOut.createFailureCrawlURLObject(CrawlURL.FailureReason.BlackListedURL, null),targetOut, CrawlURL.FailureReason.BlackListedURL,null); } // and set target out to null targetOut = null; // if robots processed in loop exceeds maximum if (robotsExcludedCount >= maxRobotsExclusionInLoop) { if (_pending.size() != 0 || _offlineTargetCount != 0) { // wait on time ... _disposition = Disposition.WaitingOnTime; } else { _disposition = Disposition.QueueEmpty; } // and break out break; } } // } } } } // otherwise ... if the domain has failed ... else { if (potentialTarget != null) { int failureReason = CrawlURL.FailureReason.TooManyErrors; String failureDesc = "Host Failed due to too many errors"; if (getActiveDomain()._domainBlackListed) { failureReason = CrawlURL.FailureReason.BlackListedURL; failureDesc = "Host Black Listed"; } else if (getHost().isBlackListedHost()) { failureReason = CrawlURL.FailureReason.BlackListedHost; failureDesc = "Host Black Host"; } // fail the url and move on ... //TODO: DISABLING THIS BECAUSE FAILING FOR ABOVE REASONS IS NOT REALLY A PERSISTENT FAILURE ATTRIBUTABLE TO THE URL // CrawlTarget.failURL(potentialTarget.createFailureCrawlURLObject(failureReason, failureDesc),potentialTarget, failureReason,null); } // set targetOut = null; // increment failed item count failedTargetsCount++; // now, if failed count exceeds max failures in loop if (failedTargetsCount >= MAX_FAILED_TARGETS_IN_LOOP) { if (_pending.size() != 0 || _offlineTargetCount != 0) { // wait on time ... _disposition = Disposition.WaitingOnTime; } else { _disposition = Disposition.QueueEmpty; } // and break out break; } } } } // ok, if we have a target ... fetch it ... 
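    // (at this point the candidate has already cleared the robots rules, the server block list,
    //  and the domain/host failure checks; the remaining gate below is the host's paused state)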
if (targetOut != null) { // ok before we can fetch this guy, we need to check to see if the associated host is in a paused state ... if (_host.isPaused()) { LOG.info("***getNextItem for Host:" + domainName + " is Paused!!"); // null target out, which will set us in a waiting on time state again targetOut = null; } // now again, if target out is not null if (targetOut != null) { if (Environment.detailLogEnabled()) LOG.info("getNextItem for Host:" + domainName + " Returned URL:" + targetOut.getOriginalURL() + " object:" + targetOut.toString()); // set scheduled item pointer ... _scheduled = targetOut; // set the active host name setActiveDomainName(domainName); // get ip address info (if available) populateIPAddressForTarget(domainName,_scheduled); // change disposition ... _disposition = Disposition.WaitingOnCompletion; //set initial fetch start time ... updateLastFetchStartTime(System.currentTimeMillis()); } } // if target out is null, and pending size is zero but there are offline targets ... if (targetOut == null) { if (_pending.size() != 0 || _offlineTargetCount != 0) { // then set disposition to waiting on time ... _disposition = CrawlList.Disposition.WaitingOnTime; } else { _disposition = CrawlList.Disposition.QueueEmpty; } } // check to see if we need to load more items from disk potentiallyQueueDiskLoad(); if (targetOut != null) { // finally rewrite target url if necessary applyRewriteRulesToTarget(domainName,targetOut); } if (targetOut != null) { // LOG.info("### getNextIem for Host:" + domainName + " Returned URL:" + targetOut.getActiveURL()); } return targetOut; } void fetchStarting(CrawlTarget target,NIOHttpConnection connection) { _activeConnection = connection; } /** fetch started callback - called from CrawlTarget **/ void fetchStarted(CrawlTarget target) { // record fetch start time ... 
updateLastFetchStartTime(System.currentTimeMillis()); // and notify host as well _host.updateLastFetchStartTime(getLastFetchStartTime()); if (_scheduled == target) { if (Environment.detailLogEnabled()) LOG.info("Fetch Started URL:" + target.getOriginalURL()); } else { if (_scheduled == null) { LOG.error("fetchStarted - scheduled target is null and fetch started target is:" + target.getOriginalURL().toString() + " list:" + target.getSourceList().getListName() ); } else { LOG.error ( "fetchStarted - scheduled target is: " + _scheduled.getOriginalURL().toString() +" list:" +_scheduled.getSourceList().getListName() + " and fetch started target is:" + target.getOriginalURL().toString() + " list:" + target.getSourceList().getListName() ); } } } /** if in memory queue is exhausted or below threshold and there offline targets, queue up a load from disk for this domain **/ private void potentiallyQueueDiskLoad() { if (_pending.size() <= DISK_LOAD_THRESHOLD && (!_diskRequestPending || _pending.size() ==0) && _offlineTargetCount != 0) { _diskRequestPending = true; _diskOperationQueue.add(new DiskQueueEntry(this,true)); } } static class RobotRuleResult { public RobotRuleSet ruleSet; public long crcValue; }; /** fetch succeeded **/ void fetchSucceeded(final CrawlTarget target,int downloadTime,final NIOHttpHeaders httpHeaders,final Buffer contentBuffer) { _lastRequestWasIOException = false; _lastRequestDownloadTime = downloadTime; _lastRequestRedirectCount = target.getRedirectCount(); _fetchEndTime = System.currentTimeMillis(); _activeConnection = null; getHost().incrementCounter(CrawlListHost.CounterId.SuccessfullGetCount,1); // reset host's io error count _host.resetCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount); if (getActiveDomain() != null) getActiveDomain()._domainRetryCounter = 0; Disposition oldDisposition = _disposition; final String originalHost = URLUtils.fastGetHostFromURL(target.getOriginalURL()); final String activeHost = URLUtils.fastGetHostFromURL(target.getActiveURL()); if (originalHost != null && activeHost != null) { // update our server ip information from information contained within crawl target ... cacheDNSEntry(activeHost,target.getServerIP(),target.getServerIPTTL()); // if the target was redirected ... cache the original ip address and ttl as well ... if (target.isRedirected()) { if (target.getOriginalRequestData() != null) { cacheDNSEntry(originalHost,target.getOriginalRequestData()._serverIP,target.getOriginalRequestData()._serverIPTTL); } } } final int resultCode = NIOHttpConnection.getHttpResponseCode(httpHeaders); if (resultCode == 200){ getHost().incrementCounter(CrawlListHost.CounterId.Http200Count,1); if (getActiveDomain() != null) { getActiveDomain()._HTTP200Count++; getActiveDomain()._SequentialHTTPFailuresCount = 0; } // validate www rewrite rule if not set and target was redirected ... if (target.isRedirected()) { /* this is broken for the new list design if (!originalHost.equalsIgnoreCase(activeHost)) { // if redirect strips the www then ... if ((originalHost.startsWith("www.") || originalHost.startsWith("WWW.")) && activeHost.equalsIgnoreCase(originalHost.substring(4))) { addWWWReWriteItem(originalHost,WWWRULE_Remove); } // else if redirect adds the www then ... 
else if ((activeHost.startsWith("www.") || activeHost.startsWith("WWW.")) && originalHost.equalsIgnoreCase(activeHost.substring(4))) { addWWWReWriteItem(originalHost,WWWRULE_Add); } } */ } } else if (resultCode >= 400 && resultCode < 500) { if (resultCode == 403) { // inform host for stats tracking purposes _host.incrementCounter(CrawlListHost.CounterId.Http403Count,1); } if (getActiveDomain() != null) getActiveDomain()._SequentialHTTPFailuresCount++; } else if (resultCode >=500 && resultCode < 600) { if (getActiveDomain() != null) { getActiveDomain()._SequentialHTTPFailuresCount++; } } if (_scheduled != target) { if (_scheduled == null) LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchSucceed Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!"); else LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchSucceed Target is:" + target.getOriginalURL() + " " + target.toString() + " ActiveTarget is:" + _scheduled.getOriginalURL() + " " + _scheduled.toString()); } else { // clear active ... _scheduled = null; // if this is the robots target ... if ( (target.getFlags() & CrawlURL.Flags.IsRobotsURL) == 1) { final CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats(); // process the robots data if any ... // check for null queue (in case of unit test); if (resultCode == 200) { _robotsRetrieved = true; synchronized (crawlerStats) { crawlerStats.setRobotsRequestsSucceeded(crawlerStats.getRobotsRequestsSucceeded() + 1); crawlerStats.setRobotsRequestsQueuedForParse(crawlerStats.getRobotsRequestsQueuedForParse() + 1); } LOG.info("### Scheduling Robots Parse for:"+target.getActiveURL()); // transition to a waiting on completion disposition ... _disposition = Disposition.WaitingOnCompletion; if (getServerSingleton() != null) { // schedule a robots parser parse attempt ... 
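            // Note: the parse itself runs off the network thread on the shared "robots" thread pool.
            // The callable gunzips the body when Content-Encoding indicates gzip, discards content
            // that is actually HTML, computes a CRC32 over the robots bytes for the per-host
            // rule-set cache, and hands the rest to RobotRulesParser; the completion callback then
            // installs the parsed rule set, caches it on the host by CRC, and moves this list out
            // of the WaitingOnCompletion disposition.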
getServerSingleton().registerThreadPool("robots", 5).execute(new ConcurrentTask<RobotRuleResult>(getServerSingleton().getEventLoop(), new Callable<RobotRuleResult>() { public RobotRuleResult call() throws Exception { try { TextBytes contentData = new TextBytes(contentBuffer.get()); String contentEncoding = httpHeaders.findValue("Content-Encoding"); if (contentEncoding != null && contentEncoding.equalsIgnoreCase("gzip")) { if (Environment.detailLogEnabled()) LOG.info("GZIP Encoding Detected for Robots File For:"+activeHost); UnzipResult result = GZIPUtils.unzipBestEffort(contentData.getBytes(),CrawlEnvironment.CONTENT_SIZE_LIMIT); if (result == null) { contentData = null; if (Environment.detailLogEnabled()) LOG.info("GZIP Decoder returned NULL for Robots File For:"+activeHost); } else { contentData.set(result.data.get(),result.data.getOffset(),result.data.getCount()); } } try { if (contentData != null) { String robotsTxt = contentData.toString().trim().toLowerCase(); if (robotsTxt.startsWith("<html") || robotsTxt.startsWith("<!doctype html")) { contentData = null; CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,CrawlerEngine.RobotsParseFlag_ContentWasHTML); } else { CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, robotsTxt,CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,0); synchronized (this) { _lastFetchedRobotsData = robotsTxt; _lastFetchedRobotsHostName = _robotsHostName; } } } else { CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, CrawlerEngine.RobotsParseFlag_ContentDecodeFailed); } } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, CrawlerEngine.RobotsParseFlag_ContentDecodeFailed); } if (Environment.detailLogEnabled()) LOG.info("Parsing Robots File for Host:"+activeHost); RobotRuleResult result = new RobotRuleResult(); if (contentData != null) { synchronized (_crc32) { _crc32.reset(); _crc32.update(contentData.getBytes(),contentData.getOffset(),contentData.getLength()); result.crcValue = _crc32.getValue(); } RobotRulesParser parser = new RobotRulesParser(getServerSingleton().getConfig()); result.ruleSet = parser.parseRules(contentData.getBytes(),contentData.getOffset(),contentData.getLength()); } else { result.ruleSet = RobotRulesParser.getEmptyRules(); result.crcValue = 0; } return result; } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); throw e; } } }, new ConcurrentTask.CompletionCallback<RobotRuleResult>() { public void taskComplete(RobotRuleResult loadResult) { synchronized (crawlerStats) { crawlerStats.setRobotsRequestsQueuedForParse(crawlerStats.getRobotsRequestsQueuedForParse() - 1); } if (loadResult != null) { boolean disallowsAll = !_ruleSet.isAllowed("/"); boolean robotsHadCrawlDelay = _ruleSet.getCrawlDelay() != -1; boolean explicitMention = _ruleSet.explicitMention; int logFlags = 0; if (disallowsAll) logFlags |= CrawlerEngine.RobotsParseFlag_ExcludesAll; if (explicitMention) logFlags |= CrawlerEngine.RobotsParseFlag_ExplicitMention; if (robotsHadCrawlDelay) logFlags |= CrawlerEngine.RobotsParseFlag_HasCrawlDelay; synchronized (crawlerStats) { 
crawlerStats.setRobotsRequestsSuccessfullParse(crawlerStats.getRobotsRequestsSuccessfullParse() + 1); if (disallowsAll) { crawlerStats.setRobotsFileExcludesAllContent(crawlerStats.getRobotsFileExcludesAllContent() + 1); if (explicitMention) crawlerStats.setRobotsFileExplicitlyExcludesAll(crawlerStats.getRobotsFileExplicitlyExcludesAll() + 1); } if (explicitMention) { crawlerStats.setRobotsFileHasExplicitMention(crawlerStats.getRobotsFileHasExplicitMention() + 1); } if (robotsHadCrawlDelay) { crawlerStats.setRobotsFileHadCrawlDelay(crawlerStats.getRobotsFileHadCrawlDelay() + 1); } } CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, 0 , null,CrawlerEngine.RobotsLogEventType.Parse_Succeeded,logFlags); _ruleSet = loadResult.ruleSet; _robotsCRC = loadResult.crcValue; _host.cacheRobotsFile(_ruleSet, _robotsCRC); } else { CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, 0 , null,CrawlerEngine.RobotsLogEventType.Parse_Failed,0); synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailedParse(crawlerStats.getRobotsRequestsFailedParse() + 1); } // LOG.error("####Robots parsing for host:" + activeHost + " failed."); _ruleSet = RobotRulesParser.getEmptyRules(); _robotsCRC = 0; } //if (Environment.detailLogEnabled()) LOG.info("####Robots RETRIEVED for Host:"+activeHost + " CrawlDelay IS:" + getCrawlDelay(false)); if (originalHost != null && activeHost != null) { updateRobotsCRCForDomain(_robotsCRC, originalHost,_robotsReturned400,_robotsReturned403); if (activeHost.compareToIgnoreCase(originalHost) != 0) { updateRobotsCRCForDomain(_robotsCRC, activeHost,_robotsReturned400,_robotsReturned403); } } Disposition oldDisposition = _disposition; if (getNextPending(false) != null) { _disposition = Disposition.ItemAvailable; } else { _disposition = Disposition.WaitingOnTime; } if (oldDisposition != _disposition) { // notify queue getHost().listDispositionChanged(CrawlList.this, oldDisposition, _disposition); } } public void taskFailed(Exception e) { if (Environment.detailLogEnabled()) LOG.error("####Robots parsing for host:" + _robotsHostName +" failed with exception" + e); _ruleSet = RobotRulesParser.getEmptyRules(); Disposition oldDisposition = _disposition; if (getNextPending(false) != null) { _disposition = Disposition.ItemAvailable; } else { _disposition = Disposition.WaitingOnTime; } if (oldDisposition != _disposition) { // notify queue getHost().listDispositionChanged(CrawlList.this, oldDisposition, _disposition); } } })); } // explitly return here ( inorder to wait for the async completion event) return; } //otherwise ... else { synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1); } CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed,0); _robotsCRC = 0; if (Environment.detailLogEnabled()) LOG.info("####Robots GET for Host:" + activeHost + "FAILED With Result Code:" + resultCode); //TODO: MAKE THIS MORE ROBUST ... // clear robots flag ... _robotsRetrieved = true; // see if result code was a 403 if (resultCode >= 400 && resultCode <= 499) { _robotsReturned400 = true; if (resultCode == 403) _robotsReturned403 = true; } // for now, assume no robots rules for any error conditions ... 
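        // A failed robots GET is still cached: the 4xx markers and a CRC of 0 (the "empty rules"
        // sentinel) are recorded below against both the original and redirected host names, so
        // subsequent targets for those hosts skip the robots fetch entirely
        // (see activeDomainRequiresRobotsFetch).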
_ruleSet = RobotRulesParser.getEmptyRules(); if (originalHost != null && activeHost != null) { updateRobotsCRCForDomain(_robotsCRC, originalHost,_robotsReturned400,_robotsReturned403); if (activeHost.compareToIgnoreCase(originalHost) != 0) { updateRobotsCRCForDomain(_robotsCRC, activeHost,_robotsReturned400,_robotsReturned403); } } } } if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) { // update active host stats and check for failure ... checkActiveHostStatsForFailure(); } // if there are no more items in the queue if (getNextPending(false) == null) { // if offline count is zero then mark this domain's queue as empty if (_offlineTargetCount == 0) { _disposition = Disposition.QueueEmpty; } // otherwise put us in a wait state and potentially queue up a disk load else { _disposition = Disposition.WaitingOnTime; // potentially queue up a disk load potentiallyQueueDiskLoad(); } } else { // if we are ready to fetch the next item ... if (calculateNextWaitTime() < System.currentTimeMillis()) { _disposition = Disposition.ItemAvailable; } else { // transition to a new wait state ... _disposition = Disposition.WaitingOnTime; } } if (oldDisposition != _disposition) { // either way ... notify queue getHost().listDispositionChanged(this, oldDisposition, _disposition); } } } /** get total request count **/ private final int getTotalFailureCount() { if (getActiveDomain() != null) return getActiveDomain()._HTTP400Count + getActiveDomain()._HTTP500Count; return 0; } /** check to see if we should fail the host based on collected stats **/ private boolean checkActiveHostStatsForFailure() { boolean failHost = false; if (getActiveDomain() != null) { String errorReason = null; /* if (getActiveDomain()._SequentialHTTPFailuresCount >= SEQUENTIAL_FAILURES_ON_403_ROBOTS_TRIGGER && _robotsReturned403) { errorReason ="Too Many Sequential Errors AFTER Robots Returned 403. HTTP200 Count:" + getActiveDomain()._HTTP200Count; failHost =true; } */ if ((getActiveDomain()._HTTP200Count == 0 && getActiveDomain()._SequentialHTTPFailuresCount >= SEQUENTIAL_FAILURES_NO_200_TRIGGER)) { errorReason ="Too Many Sequential Errors. RobotsReturned400:" + _robotsReturned400 + " 400 Count:" + getActiveDomain()._HTTP400Count + " 500 Count:" + getActiveDomain()._HTTP500Count + " 200 Count:" + getActiveDomain()._HTTP200Count; failHost =true; } else { int totalFailureCount = getTotalFailureCount(); int totalRequestCount = totalFailureCount + getActiveDomain()._HTTP200Count; if (totalRequestCount != 0 && totalRequestCount >= STATS_CHECK_CODE_SAMPLE_THRESHOLD) { float badToGoodPercent = (float)totalFailureCount / (float)totalRequestCount; if (badToGoodPercent >= BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD) { failHost = true; errorReason ="Bad To Good URL Pct:" + badToGoodPercent +" exceeded Threshold:" + BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD + " RobotsReturned400:" + _robotsReturned400 + " 400 Count:" + getActiveDomain()._HTTP400Count + " 500 Count:" + getActiveDomain()._HTTP500Count + " 200 Count:" + getActiveDomain()._HTTP200Count; } } } if (failHost) { failActiveDomain(CrawlURL.FailureReason.TooManyErrors, errorReason); LOG.error("#### HOST FAILURE - List:" + getListName() + "Host: " + getActiveDomain()._domainName +" Reason:" + errorReason); } } return failHost; } private static final int FAIL_STRATEGY_RETRY_ITEM = 0; // increment the failure count on the item and retry private static final int FAIL_STRATEGY_RETRY_HOST = 1; // increment the failure count on the host and retry ... 
private static final int FAIL_STRATEGY_FAIL_ITEM = 2; // immediately fail the item ... // private static final int FAIL_STRATEGY_FAIL_HOST = 3; // immediately fail the host ... private static final int failureCodeStrategyTable[] = { FAIL_STRATEGY_RETRY_ITEM, // UNKNOWN - result: Inc Fail Count on Item, potentially reschedule FAIL_STRATEGY_FAIL_ITEM,// UknownProtocol - result: Immediately Fail Item FAIL_STRATEGY_FAIL_ITEM,// MalformedURL - result: Immediately Fail Item FAIL_STRATEGY_RETRY_ITEM,// Timeout - result: Inc Fail Count on Host, potentially reschedule FAIL_STRATEGY_FAIL_ITEM,// DNSFailure -result: reschedule, set waitstate for Host FAIL_STRATEGY_RETRY_HOST,// ResolverFailure -result: Inc Fail Count on Host, potentially reschedule FAIL_STRATEGY_RETRY_ITEM,// IOException -result: Inc Fail Count on Item, potentially reschedule FAIL_STRATEGY_FAIL_ITEM, // RobotsExcluded FAIL_STRATEGY_FAIL_ITEM,// NoData = 9; FAIL_STRATEGY_RETRY_ITEM,// RobotsParseError = 10; FAIL_STRATEGY_FAIL_ITEM,// RedirectFailed = 11; FAIL_STRATEGY_RETRY_ITEM,// RuntimeError = 12; FAIL_STRATEGY_RETRY_HOST,// ConnectTimeout = 13; FAIL_STRATEGY_FAIL_ITEM,//BlackListedHost = 14; FAIL_STRATEGY_FAIL_ITEM,//BlackListedURL = 15; FAIL_STRATEGY_FAIL_ITEM,//TooManyErrors = 16; FAIL_STRATEGY_FAIL_ITEM,//InCache = 17; FAIL_STRATEGY_FAIL_ITEM// InvalidResponseCode = 18; }; private void failURL(CrawlTarget target,int failureReason,String errorDescription) { // explicitly fail the item ... CrawlTarget.failURL(target.createFailureCrawlURLObject(failureReason,errorDescription),target,failureReason,errorDescription); } private synchronized void failActiveDomain(int failureReason,String errorDescription) { if (getActiveDomain() != null) { LOG.error("### Failing Active Domain:" + getActiveDomain()._domainName + " in List:" + getListName() + " ReasonCode:" + failureReason + " Description:" + errorDescription); // _disposition = Disposition.QueueEmpty; getActiveDomain()._domainFailed = true; /* // fail scheduled url ... if (_scheduled != null) { _scheduled = null; } // just remove all pending urls from list for now ... _pending.removeAll(); // reset offline count... _offlineTargetCount = 0; // reset disk operation pending indiciator ... _diskRequestPending = false; */ getHost().incrementCounter(CrawlListHost.CounterId.FailedDomainCount,1); CrawlerServer.getEngine().failDomain(getActiveDomain()._domainName); } } synchronized void fetchFailed(CrawlTarget target, int failureReason,String description) { _activeConnection = null; _lastRequestRedirectCount = target.getRedirectCount(); _fetchEndTime = System.currentTimeMillis(); getHost().incrementCounter(CrawlListHost.CounterId.FailedGetCount,1); if (getActiveDomain() != null) { getActiveDomain()._SequentialHTTPFailuresCount++; } _lastRequestWasIOException = false; //check to see if the error is an io exception or a timeout if (failureReason == CrawlURL.FailureReason.IOException || failureReason == CrawlURL.FailureReason.Timeout) { // increment host failure counter ... _host.incrementCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount,1); _lastRequestWasIOException = true; } // the rest is similar to a host retry strategy ... 
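    // Failure handling below maps the failure reason through failureCodeStrategyTable
    // (indexed as failureReason - 1, so FailureReason codes are assumed to start at 1 and run
    // contiguously through InvalidResponseCode); retry strategies re-queue the target unless the
    // host has failed, the per-item retry budget is exhausted, or the target is a
    // high-priority URL.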
Disposition oldDisposition = _disposition; if (_scheduled != target) { if (_scheduled == null) LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchFailed Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!"); else LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchFailed Target is:" + target.getOriginalURL() + " ActiveTarget is:" + _scheduled.getOriginalURL()); } else { // reset active and scheduled ... _scheduled = null; // if we failed on the robots get ... if ((target.getFlags() & CrawlURL.Flags.IsRobotsURL) == 1) { CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats(); synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1); } //TODO: FIGURE THIS OUT LATER ... FOR NOW .. ON A FAILURE OF ROBOTS.TXT GET, WE ASSUME NO ROBOTS.TXT FILE ... //LOG.warn("Robots Fetch for host:"+getHostName() + " Failed with Reason:" + failureReason +" Desc:" + description); //LOG.warn("Assuming NO-ROBOTS FILE"); CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, 0 , null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed,0); target.logFailure(CrawlerServer.getEngine(),failureReason,description); _robotsRetrieved = true; _ruleSet = RobotRulesParser.getEmptyRules(); // and clear scheduled ... _scheduled = null; updateLastFetchStartTime(-1); // and transition to wait state .... _disposition = Disposition.WaitingOnTime; } // otherwise pass on to underlying crawl target handler ... else { // default failure strategy ... int failureStrategy = FAIL_STRATEGY_RETRY_ITEM; // if failure code is within known failure codes ... if (failureReason >= CrawlURL.FailureReason.UNKNOWN && failureReason <= CrawlURL.FailureReason.InvalidResponseCode) { // use table to map strategy ... failureStrategy = failureCodeStrategyTable[failureReason-1]; } switch (failureStrategy) { case FAIL_STRATEGY_RETRY_HOST: case FAIL_STRATEGY_RETRY_ITEM: { // increment retry counter ... getActiveDomain()._domainRetryCounter ++; // increment retry counter on target ... target.incrementRetryCounter(); // IFF server failed ... // OR retry count on item exceeded ... // OR this item is a high priority dispatch item ... // THEN immediately fail this item // ELSE queue up this item for subsequent retry if (_host.isFailedServer() || target.getRetryCount() >= MAX_ITEM_RETRY || ((target.getFlags() & CrawlURL.Flags.IsHighPriorityURL) != 0) ) { failURL(target,failureReason,description); } else { // and add it back to the pending list ... _pending.addTail(target); } }break; case FAIL_STRATEGY_FAIL_ITEM: { failURL(target,failureReason,description); }break; /* case FAIL_STRATEGY_FAIL_HOST: { // just put the entire host in a fail state ... failDomain(failureReason,description); }break; */ } switch (failureStrategy) { case FAIL_STRATEGY_RETRY_ITEM: case FAIL_STRATEGY_FAIL_ITEM: case FAIL_STRATEGY_RETRY_HOST: { // check to see if there are items in the pending queue .... if (_pending.size() == 0 && _offlineTargetCount == 0) { // if not... transition to Queue Empty _disposition = Disposition.QueueEmpty; } else { long waitTime = calculateNextWaitTime(); // if we can fetch the next item ... if (waitTime <= System.currentTimeMillis()) { if (_pending.size() != 0) // shift to an available disposition ... _disposition = Disposition.ItemAvailable; else // shift to waiting on time disposition (to wait for disk queue load). _disposition = Disposition.WaitingOnTime; } else { // wait on time ... 
              _disposition = Disposition.WaitingOnTime;
            }
          }
        }
        break;
      }

      if (description == null)
        description = "";

      if (Environment.detailLogEnabled())
        LOG.error("Fetch Failed for URL:" + target.getOriginalURL() + " Reason:" + failureReason
            + " Description:" + description + " Strategy:" + failureStrategy + " OldDisp:" + oldDisposition
            + " NewDisp:" + _disposition);
    }

    if (_disposition == Disposition.WaitingOnCompletion) {
      LOG.error("### BUG Fetch Failed for URL:" + target.getOriginalURL() + " failed to transition List to proper disposition!");
    }

    if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) {
      // update active host stats and check for failure ...
      checkActiveHostStatsForFailure();
    }

    // notify queue if disposition changed ...
    if (_disposition != oldDisposition) {
      getHost().listDispositionChanged(this, oldDisposition, _disposition);
    }
  }
}

/** clear a pre-existing wait state **/
synchronized void clearWaitState() {

  Disposition oldDisposition = _disposition;

  // if robots retrieval is pending ...
  if (_robotsRetrieved == false) {
    // LOG.debug("clearWaitState called on Host:"+getHostName()+ " after initial robots fetch");
    // explicitly transition to available (to retry robots fetch... )
    _disposition = Disposition.ItemAvailable;
  }
  // otherwise if pending queue size is zero or host has failed ...
  else if ((_pending.size() == 0 && _offlineTargetCount == 0 && _queued.size() == 0)) {
    // LOG.debug("clearWaitState called on Host:"+getHostName()+ " and queue is empty. transitioning to QueueEmpty");
    // transition to queue empty disposition
    _disposition = Disposition.QueueEmpty;
  }
  else {
    // if active request size < max simultaneous requests ...
    if (_scheduled == null) {
      // if there are items to be read from the in memory list ...
      if (_pending.size() != 0) {
        // LOG.error("clearWaitState called Host:"+getHostName()+ " and getNextPendingReturned object. transitioning to ItemAvailable");
        // immediately transition to an available state ...
        _disposition = Disposition.ItemAvailable;
      }
      else {
        if (!_diskRequestPending) {
          _diskRequestPending = true;
          _diskOperationQueue.add(new DiskQueueEntry(this, true));
        }
        _disposition = Disposition.WaitingOnTime;
      }
    }
    // otherwise... we are waiting on completion now ...
    else {
      LOG.warn("clearWaitState called on already scheduled list:" + getListName());
      _disposition = Disposition.WaitingOnCompletion;
    }
  }

  getHost().listDispositionChanged(this, oldDisposition, _disposition);
}

/** calculateRetryWaitTime */
public long calculateNextWaitTime() {

  // ok check to see if the related host is paused ...
  if (_host.isPaused()) {
    LOG.info("*** host is paused. pausing crawl for: " + PAUSE_STATE_RETRY_DELAY + " milliseconds");
    // ok suspend for pause delay
    return System.currentTimeMillis() + PAUSE_STATE_RETRY_DELAY;
  }

  if (_fetchStartTime == -1) {
    return System.currentTimeMillis();
  }
  else {
    // first calculate crawl delay based on robots delay value * number of hops to service last request
    //int crawlDelay = (getCrawlDelay(true) * (_lastRequestRedirectCount+1));
    int crawlDelay = getCrawlDelay(true);

    // if the crawl delay is the default host crawl delay
    if (crawlDelay == _host.getCrawlDelay()) {
      // see if fetch time is available
      int lastDocFetchTime = getLastRequestFetchTime();

      if (lastDocFetchTime != 0) {
        // calculate alternate crawl delay based on fetch time ...
int alternateCrawlDelay = lastDocFetchTime * 4; if (alternateCrawlDelay > crawlDelay) { crawlDelay = alternateCrawlDelay; if (Environment.detailLogEnabled()) LOG.info("### CRAWLDELAY Using Alternate Crawl Delay of:" + alternateCrawlDelay + " for URL:" + getNextPending(false)); } } } /* if (_lastRequestDownloadTime != -1) { // next see if host took more than crawl delay millseconds to respond if (_lastRequestDownloadTime >= getCrawlDelay()) { // add request time to crawl delay crawlDelay += _lastRequestDownloadTime; } // add one second for every 2 seconds of request time else { crawlDelay += 1000 * (_lastRequestDownloadTime / 2000); } } */ // ok ... adjust crawl delay by the number of hops it took to get the result if (_fetchStartTime != -1) { return _fetchStartTime + crawlDelay; } return System.currentTimeMillis() + crawlDelay; } } /** getNextPending */ private synchronized CrawlTarget getNextPending(boolean removeItem) { CrawlTarget targetOut = null; if (_pending.size() != 0) { targetOut = _pending.getHead(); if (removeItem && targetOut != null) { _pending.removeElement(targetOut); } } return targetOut; } /** indicates if robots file need to be retrieved for the specified host */ public boolean robotsRetrieved() { return _robotsRetrieved; } /** */ private final int getCrawlDelay(boolean checkForOverride) { if (checkForOverride) { CrawlTarget potentialTarget = getNextPending(false); if (potentialTarget != null) { try { URL targetURL = new URL(potentialTarget.getActiveURL()); // validate against the server for crawl delay //LOG.info("Checking Crawl Delay for url:" + targetURL.toString()); int overridenCrawlDelay = CrawlerServer.getServer().checkForCrawlRateOverride(targetURL); if (overridenCrawlDelay != -1) { if (Environment.detailLogEnabled()) LOG.info("### CRAWLDELAY - Overriding Crawl Delay for URL:" + targetURL + " Delay is:" + overridenCrawlDelay ); return overridenCrawlDelay; } } catch (MalformedURLException e) { } } } int crawlDelayOut = 0; if (_ruleSet == null || _ruleSet.getCrawlDelay() == -1) { crawlDelayOut += getHost().getCrawlDelay(); } else { crawlDelayOut += (int)Math.min(_ruleSet.getCrawlDelay(),MAX_CRAWL_DELAY); crawlDelayOut = Math.max(MIN_CRAWL_DELAY, crawlDelayOut); } if (_lastRequestWasIOException) { crawlDelayOut += IOEXCEPTION_TIMEOUT_BOOST; } return crawlDelayOut; } /** clear the host's state **/ public synchronized void clear() { _pending.removeAll(); _scheduled = null; _diskRequestPending = false; _offlineTargetCount = 0; } public synchronized void dumpDetailsToHTML(StringBuffer sb){ // synchronized (_pending) { sb.append("ListName:" + getListName() + "\n"); sb.append("RobotsRetrieved:" + _robotsRetrieved + "\n"); sb.append("Disposition:" + _disposition + "\n"); sb.append("Scheduled:" + ((_scheduled != null)?_scheduled.getOriginalURL() : "null") + "\n"); sb.append("PendingCount:" + _pending.size() + "\n"); sb.append("QueuedCount:" + _queued.size() + "\n"); sb.append("OfflineCount:" + _offlineTargetCount +"\n"); sb.append("ActiveConnection:" + _activeConnection +"\n"); sb.append("LastFetchedRobotsHost:" + _lastFetchedRobotsHostName +"\n"); if (_pending.size() != 0) { sb.append("next 100 scheduled urls:\n"); int itemCount =0; CrawlTarget target = _pending.getHead(); while (target != null) { sb.append("["+(itemCount++)+"]:<a href='" + target.getOriginalURL() +"'>" + target.getOriginalURL() + "</a>\n"); target = target.getNext(); } } sb.append("\n\nLastFetchedRobotsData:\n\n"); if (_lastFetchedRobotsData != null) { sb.append(_lastFetchedRobotsData); sb.append("\n"); } 
    // }
  }

  /**************
   * Disk Operation Support
   */

  public static int getPendingDiskOperationCount() {
    return _diskOperationQueue.size();
  }

  public static void stopDiskQueueingThread() {
    if (_diskOperationThread != null) {
      _diskOpThreadShuttingDown = true;
      LOG.info("shutting down Disk Queue Thread - sending null item to queue");
      _diskOperationQueue.add(new DiskQueueEntry(null,false));
      try {
        LOG.info("Waiting for Disk Queue Thread to Die");
        _diskOperationThread.join();
        LOG.info("Done Waiting for Disk Queue Thread");
      } catch (InterruptedException e) {
        e.printStackTrace();
      }
      _diskOpThreadShuttingDown = false;
      _diskOperationThread = null;
    }
  }

  public static void startDiskQueueingThread(final EventLoop serverEventLoop,final File baseStoragePath) {
    // figure out
    // and finally start the blocking writer thread ...
    _diskOperationThread = new Thread(new Runnable() {

      public void run() {
        for (;;) {
          try {
            DiskQueueEntry entry = _diskOperationQueue.take();

            // if buffer item is null... this is considered an eof condition ... break out ...
            if (entry.getListItem() == null) {
              LOG.info("### DiskThread:Received Null Item ... Shutting down CrawlDomain Disk Queue Thread");
              // no matter what ... break out ...
              break;
            }
            // otherwise .. figure out what to do with the domain ...
            else {
              if (_diskOpThreadShuttingDown == false) {
                final CrawlList domain = entry.getListItem();
                try {
                  if (Environment.detailLogEnabled())
                    LOG.info("### DiskThread: Got List:" + domain.getListName());

                  // build a hierarchical path for the given domain id ...
                  File logFilePath = null;
                  String listName = null;
                  synchronized(domain) {
                    logFilePath = FileUtils.buildHierarchicalPathForId(baseStoragePath,domain.getUniqueListId());
                    listName = domain.getListName();
                  }
                  // get the immediate parent directory ...
                  File parentDirectory = logFilePath.getParentFile();
                  // and recursively create the directory chain (if necessary).
                  parentDirectory.mkdirs();

                  IntrusiveList<CrawlTarget> flushList = null;
                  int desiredLoadAmount = 0;

                  boolean truncateFile = false;
                  synchronized(domain) {
                    if (domain._offlineTargetCount == 0) {
                      truncateFile = true;
                    }
                  }

                  if (truncateFile && logFilePath.exists()) {
                    if (Environment.detailLogEnabled())
                      LOG.info("### DiskThread: Truncating Existing Log File for List:" + listName);
                    LogFileHeader header = new LogFileHeader();
                    RandomAccessFile file = new RandomAccessFile(logFilePath,"rw");
                    try {
                      writeLogFileHeader(file,header);
                    } finally {
                      file.close();
                    }
                  }

                  // now lock access to the domain's pending queue
                  synchronized(domain) {
                    // if a disk request was pending ...
                    if (domain._diskRequestPending) {
                      // reset disk request pending flag here to prevent race condition ...
                      domain._diskRequestPending = false;

                      // figure out what action to take with respect to the domain ...
                      // if list count exceeds flush threshold
                      if (domain._pending.size() >= DISK_FLUSH_THRESHOLD || domain._queued.size() != 0) {

                        if (domain._queued.size() == 0) {
                          LinkedList<CrawlTarget> candidates = new LinkedList<CrawlTarget>();

                          for (CrawlTarget candidate : domain._pending) {
                            if ((candidate.getFlags() & CrawlURL.Flags.IsHighPriorityURL) == 0) {
                              // add candidates in proper order ...
                              candidates.add(candidate);
                            }
                          }

                          // if there are low priority candidates we can flush ...
                          if (candidates.size() != 0) {
                            // create a new flush list ...
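                            // Note: candidates were collected in pending (head to tail) order; iterating the
                            // reversed list below and re-adding via addHead trims targets from the tail of
                            // _pending while keeping flushList in the original order.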
                            flushList = new IntrusiveList<CrawlTarget>();

                            // reverse candidate list and start removing items from pending
                            for (CrawlTarget candidate : Lists.reverse(candidates)) {
                              domain._pending.removeElement(candidate);
                              flushList.addHead(candidate);
                              // if we are back to ideal target count bail ...
                              if (domain._pending.size() <= IDEAL_TARGET_COUNT)
                                break;
                            }

                            if (Environment.detailLogEnabled())
                              LOG.info("### DiskThread: List:" + domain.getListName() + " Created FetchList FROM PENDING of Size:" + flushList.size());
                            // increment offline target count ...
                            domain._offlineTargetCount += flushList.size();
                          }
                        }
                        else {
                          flushList = domain._queued.detach(domain._queued.getHead());
                          if (Environment.detailLogEnabled())
                            LOG.info("### DiskThread: List:" + domain.getListName() + " Created FetchList FROM QUEUED of Size:" + flushList.size());
                          // increment offline target count ...
                          domain._offlineTargetCount += flushList.size();
                        }
                        /*
                        // walk one past IDEAL target item count...
                        int i=0;
                        CrawlTarget target = domain._pending.getHead();
                        while (i<IDEAL_TARGET_COUNT) {
                          target = target.getNext();
                          ++i;
                        }
                        // and extract a sub-list starting at the target ...
                        flushList = domain._pending.detach(target);
                        */
                        //and immediately update offline target count in domain ...
                        //domain._offlineTargetCount += flushList.size();
                      }
                      // otherwise ...
                      else {
                        // check queued size ...
                        if (domain._queued.size() != 0) {
                          // if nothing is offline and pending size <= DISK_LOAD_THRESHOLD
                          if (domain._offlineTargetCount == 0 && domain._pending.size() <= DISK_LOAD_THRESHOLD) {
                            if (Environment.detailLogEnabled())
                              LOG.info("### DiskThread: Moving Items from Queued List to Pending List for CrawlList:" + domain.getListName());
                            // move over items from queued to pending
                            while (domain._queued.getHead() != null) {
                              domain._pending.addTail(domain._queued.removeHead());
                              if (domain._pending.size() == (DISK_FLUSH_THRESHOLD - 1))
                                break;
                            }
                          }
                          // now if the queued list has reached the ideal target count ... flush it to disk
                          if (domain._queued.size() >= IDEAL_TARGET_COUNT) {
                            if (Environment.detailLogEnabled())
                              LOG.info("### DiskThread: Queued Size Exceeds Flush Threshold. Flushing to Disk for CrawlList:" + domain.getListName());
                            // extract a sub-list starting at head of queued list
                            flushList = domain._queued.detach(domain._queued.getHead());
                            //and immediately update offline target count in domain ...
                            domain._offlineTargetCount += flushList.size();
                          }
                        }

                        // check to see if a load is desired ...
                        if (domain._pending.size() <= DISK_LOAD_THRESHOLD) {
                          // calculate load amount ...
                          desiredLoadAmount = IDEAL_TARGET_COUNT - domain._pending.size();
                        }
                      }
                    }
                    else {
                      if (Environment.detailLogEnabled())
                        LOG.info("### DiskThread: Skipping List:" + domain.getListName());
                    }
                  }

                  // now figure out what to do ...
                  if (flushList != null) {
                    if (Environment.detailLogEnabled())
                      LOG.info("### DiskThread: Flushing " + flushList.size() + " Items To Disk for Domain:" + domain.getListName());
                    // flush crawl targets to disk ...
                    appendTargetsToLogFile(logFilePath,flushList);
                    // clear list ...
                    flushList.removeAll();
                  }

                  // ... if load is desired ...
                  if (desiredLoadAmount != 0) {
                    IntrusiveList<CrawlTarget> loadList = new IntrusiveList<CrawlTarget>();

                    int loadCount = readTargetsFromLogFile(domain,logFilePath,desiredLoadAmount,loadList);

                    // if (Environment.detailLogEnabled()) LOG.info("### DiskThread:Disk Queue Loaded: " + loadCount + "Items To Disk for Domain:" + domain.getListName());

                    if (loadCount != 0) {
                      // time to lock domain again ...
                      synchronized(domain) {
                        // and reduce offline count ...
                        domain._offlineTargetCount -= loadList.size();
                        // load new items into domain's list ...
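                        // Note: _offlineTargetCount tracks targets still stored in the on-disk log;
                        // it was decremented above, and the freshly loaded targets rejoin the
                        // in-memory pending list below.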
                        domain._pending.attach(loadList);
                      }
                    }
                  }
                }
                catch (IOException e) {
                  LOG.error("### DiskThread:" + CCStringUtils.stringifyException(e));
                }
              }
            }
          }
          catch (InterruptedException e) {
          }
          catch (Exception e) {
            LOG.fatal("### DiskThread: Encountered Unhandled Exception:" + CCStringUtils.stringifyException(e));
          }
        }
        LOG.info("### DiskThread: Exiting CrawlDomain Disk Queue Thread");
      }
    });

    // launch the writer thread ...
    _diskOperationThread.start();
  }

  /** fixed size header at the start of each per-list log file
   *  (layout: magic int, version int, active version timestamp long, read cursor long, write cursor long, item count int)
   **/
  private static class LogFileHeader {

    public static final int LogFileHeaderBytes = 0xCC00CC00;
    public static final int LogFileVersion = 1;

    public LogFileHeader() {
      _readPos = 0;
      _writePos = 0;
      _itemCount = 0;
    }

    public long _readPos;
    public long _writePos;
    public int _itemCount;

    public void writeHeader(DataOutput stream) throws IOException {
      stream.writeInt(LogFileHeaderBytes);
      stream.writeInt(LogFileVersion);
      stream.writeLong(_diskHeaderActiveVersionTimestamp);
      stream.writeLong(_readPos);
      stream.writeLong(_writePos);
      stream.writeInt(_itemCount);
    }

    public void readHeader(DataInput stream) throws IOException {
      int headerBytes = stream.readInt();
      int version = stream.readInt();
      long timestamp = stream.readLong();
      // reject the file if either the magic number or the version doesn't match
      if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
        throw new IOException("Invalid CrawlLog File Header Detected!");
      }
      _readPos = stream.readLong();
      _writePos = stream.readLong();
      _itemCount = stream.readInt();
      // if timestamps don't match ...
      if (timestamp != _diskHeaderActiveVersionTimestamp) {
        // then reset cursors .. everything in the file is invalid ...
        _writePos = 0;
        _readPos = 0;
        _itemCount = 0;
      }
    }
  }

  private static final class CustomByteArrayOutputStream extends ByteArrayOutputStream {
    public CustomByteArrayOutputStream(int initialSize) {
      super(initialSize);
    }
    public byte[] getBuffer() {
      return buf;
    }
  }

  private static void appendTargetsToLogFile(File logFileName,IntrusiveList<CrawlTarget> list) throws IOException {

    LogFileHeader header = new LogFileHeader();

    boolean preExistingHeader = logFileName.exists();

    RandomAccessFile file = new RandomAccessFile(logFileName,"rw");

    try {
      long headerOffset = 0;

      if (preExistingHeader) {
        headerOffset = readLogFileHeader(file, header);

        if (header._writePos == 0) {
          file.seek(headerOffset);
        }
        else {
          // seek to the appropriate write position
          file.seek(header._writePos);
        }
      }
      else {
        headerOffset = writeLogFileHeader(file,header);
      }

      CustomByteArrayOutputStream bufferOutputStream = new CustomByteArrayOutputStream(1 << 17);
      DataOutputStream dataOutputStream = new DataOutputStream(bufferOutputStream);
      CRC32 crc = new CRC32();

      for (CrawlTarget target : list) {
        PersistentCrawlTarget persistentTarget = target.createPersistentTarget();

        bufferOutputStream.reset();
        // write to intermediate stream ...
        persistentTarget.write(dataOutputStream);
        // and crc the data ...
        crc.reset();
        crc.update(bufferOutputStream.getBuffer(),0,bufferOutputStream.size());
        // write out length first
        file.writeInt(bufferOutputStream.size());
        // crc next
        long computedValue = crc.getValue();
        // TODO: waste of space - write 32 bit values as long because of problems with java sign promotion rules during read...
        file.writeLong(computedValue);
        // and then the data
        file.write(bufferOutputStream.getBuffer(),0,bufferOutputStream.size());
      }

      // now update header ...
      header._itemCount += list.size();
      header._writePos = file.getFilePointer();

      // now write out header anew ...
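      // the header lives at offset zero, so this call seeks back to the start of the file and
      // persists the updated write cursor and item count for subsequent readers and appenders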
      writeLogFileHeader(file,header);
    }
    finally {
      if (file != null) {
        file.close();
      }
    }
  }

  private static int readTargetsFromLogFile(CrawlList domain,File logFileName,int desiredReadAmount,IntrusiveList<CrawlTarget> targetsOut) throws IOException {

    int itemsRead = 0;

    if (logFileName.exists()) {

      RandomAccessFile file = new RandomAccessFile(logFileName,"rw");

      LogFileHeader header = new LogFileHeader();

      try {
        long headerOffset = readLogFileHeader(file, header);

        // seek to the appropriate read position
        if (header._readPos != 0)
          file.seek(header._readPos);

        int itemsToRead = Math.min(desiredReadAmount, header._itemCount);

        PersistentCrawlTarget persistentTarget = new PersistentCrawlTarget();
        CRC32 crc = new CRC32();
        CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 16);

        for (int i=0;i<itemsToRead;++i) {
          // read length ...
          int urlDataLen = file.readInt();
          long urlDataCRC = file.readLong();

          buffer.reset();
          if (urlDataLen > buffer.getBuffer().length) {
            buffer = new CustomByteArrayOutputStream( ((urlDataLen / 65536) + 1) * 65536 );
          }

          // use readFully - a plain read could return fewer bytes than requested
          file.readFully(buffer.getBuffer(), 0, urlDataLen);

          crc.reset();
          crc.update(buffer.getBuffer(), 0, urlDataLen);

          long computedValue = crc.getValue();

          // validate crc values ...
          if (computedValue != urlDataCRC) {
            throw new IOException("Crawl Target Log File Corrupt");
          }
          else {
            // populate a persistentTarget from the (in memory) data stream
            DataInputStream bufferReader = new DataInputStream(new ByteArrayInputStream(buffer.getBuffer(),0,urlDataLen));

            persistentTarget.clear();
            persistentTarget.readFields(bufferReader);

            // populate a new crawl target structure ...
            CrawlTarget newTarget = new CrawlTarget(domain,persistentTarget);

            targetsOut.addTail(newTarget);
          }
        }

        itemsRead = itemsToRead;

        // now update header ...
        header._itemCount -= itemsRead;
        // now if item count is non zero ...
        if (header._itemCount != 0) {
          // set read cursor to next record location
          header._readPos = file.getFilePointer();
        }
        // otherwise ...
        else {
          // reset both cursors ...
          header._readPos = 0;
          header._writePos = 0;
        }

        // now write out header anew ...
        writeLogFileHeader(file,header);
      }
      catch (IOException e) {
        LOG.fatal("Encountered Exception Reading From Offline Queue for LogFile:" + logFileName + ". Truncating Queue! \n" + CCStringUtils.stringifyException(e));
        header._itemCount = 0;
        header._readPos = 0;
        header._writePos = 0;
        writeLogFileHeader(file,header);
      }
      finally {
        if (file != null) {
          file.close();
        }
      }
    }
    return itemsRead;
  }

  private static long writeLogFileHeader(RandomAccessFile file, LogFileHeader header) throws IOException {
    // set the position at zero ..
    file.seek(0);
    // and write header to disk ...
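    // returning file.getFilePointer() below gives callers the offset of the first byte past the
    // fixed-size header (the start of the record area)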
    header.writeHeader(file);
    // took sync out because it was becoming a severe bottleneck
    // file.getFD().sync();
    return file.getFilePointer();
  }

  private static long readLogFileHeader(RandomAccessFile file,LogFileHeader header) throws IOException {
    file.seek(0);
    header.readHeader(file);
    return file.getFilePointer();
  }

  public static class CrawlDomainTester {

    public static void main(String[] args) {
      try {
        testGetNextItemCode();
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }

    @Test
    public static void testGetNextItemCode() throws Exception {
      /*
      CrawlListHost host = new CrawlListHost(null,1);
      CrawlList list = host.getCrawlList(1);

      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.redirecttest.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.redirecttest.com/foobar"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://foo.blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://failed.domain/bar"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://failed.domain/zzz"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://####/foo/zzz"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"garbage"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,""),false);

      list._offlineTargetCount = 100;

      CrawlTarget target = null;

      NIOHttpHeaders headers = new NIOHttpHeaders();
      headers.add(null, "HTTP1.1 200 OK");

      while (list.getDisposition() != CrawlList.Disposition.QueueEmpty) {
        if (list.getDisposition() == CrawlList.Disposition.ItemAvailable) {
          target = list.getNextTarget();
          if (target == null)
            System.out.println("Target:NULL");
          else
            System.out.println("Target:" + target.getOriginalURL());
        }
        else if (list.getDisposition() == CrawlList.Disposition.WaitingOnCompletion) {
          if (target != null) {
            if (target.getActiveURL().startsWith("http://www.redirecttest.com")) {
              target.setRedirectURL(target.getActiveURL().replaceFirst("http://www.redirecttest.com", "http://redirecttest.com"));
              target.setFlags(target.getFlags() | CrawlURL.Flags.IsRedirected);
            }
            list.fetchStarted(target);
            if (target.getActiveURL().startsWith("http://failed.domain")) {
              list.getActiveDomain()._domainFailed = true;
            }
            list.fetchSucceeded(target,0, headers, null);
            target = null;
          }
          else {
            list._disposition = Disposition.WaitingOnTime;
          }
        }
        else if (list.getDisposition() == CrawlList.Disposition.WaitingOnTime) {
          System.out.println("clearing WaitState");
          list.clearWaitState();
        }
      }
      */
    }

    //@Test
    public void testDiskQueue() throws Exception {
      String hostName = "poodleskirtcentral.com";
      long domainFP = URLFingerprint.generate64BitURLFPrint(hostName);
      File logFilePath = FileUtils.buildHierarchicalPathForId(new File("/foo"),domainFP);
      System.out.println(logFilePath.getAbsolutePath());
    }

    /*
    //@Test
    public void testDiskWriter() throws Exception {
      // initialize ...
Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("hadoop-default.xml"); conf.addResource("hadoop-site.xml"); conf.addResource("commoncrawl-default.xml"); conf.addResource("commoncrawl-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("file:///"); CrawlEnvironment.setCrawlSegmentDataDirectory("./tests/crawlSegmentSamples/"); EventLoop eventLoop = new EventLoop(); eventLoop.start(); DNSCache cache = new DNSCache() { public DNSResult resolveName(CrawlSegmentHost host) { return null; } }; CrawlList.DISK_FLUSH_THRESHOLD = 5; CrawlList.DISK_LOAD_THRESHOLD = 2; CrawlList.IDEAL_TARGET_COUNT = 3; File basePath = new File("./data/diskQueueTest"); basePath.mkdir(); CrawlList.startDiskQueueingThread(eventLoop,basePath); CrawlSegmentDetail detailCC06 = SegmentLoader.loadCrawlSegment(1,1, "cc06",null, cache,null,null); CrawlListHost host = new CrawlListHost(null,0); int domainCount = 0; CrawlList firstDomain = null; for (CrawlSegmentHost segmentHost: detailCC06.getHosts()) { CrawlList domain = new CrawlList(host,segmentHost.getListId()); if (domainCount==0) firstDomain = domain; if (domainCount ==0) System.out.println("Domain:" + domain.getListName() + " FP:" + domain.getListId()); for (CrawlSegmentURL segmentURL : segmentHost.getUrlTargets()) { if (domainCount ==0) System.out.println("\tAdding Target::" + segmentURL.getUrl()); CrawlTarget target = new CrawlTarget(1,domain,segmentHost,segmentURL); domain.addCrawlTarget(target, false); } if (++domainCount == 10) break; } while (true) { synchronized (firstDomain) { if (firstDomain._pending.size() < CrawlList.DISK_FLUSH_THRESHOLD) break; Thread.sleep(5000); } } while (true) { if (firstDomain.getDisposition() == CrawlList.Disposition.ItemAvailable) { CrawlTarget nextTarget = null; while ((nextTarget = firstDomain.getNextTarget()) != null) { System.out.println("Domain: "+ firstDomain.getListName() + " Got Target:" + nextTarget.getOriginalURL()); firstDomain.fetchStarted(nextTarget); firstDomain.fetchSucceeded(nextTarget, 0,null, null); if (firstDomain.getDisposition() == CrawlList.Disposition.WaitingOnTime) { firstDomain.clearWaitState(); } } } else { System.out.println("Domain Queue Empty ... Waiting"); Thread.sleep(5000); } } //CrawlDomain._diskOperationThread.join(); } */ } @Override public String toString() { return "List Id:" + _baseListId + " Name:" + _listName; } }