/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.commoncrawl.service.crawler; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.net.MalformedURLException; import java.net.URL; import java.util.LinkedList; import java.util.concurrent.Callable; import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.atomic.AtomicLong; import java.util.zip.CRC32; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.record.Buffer; import org.commoncrawl.util.GZIPUtils; import org.commoncrawl.async.ConcurrentTask; import org.commoncrawl.async.EventLoop; import org.commoncrawl.common.Environment; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.io.NIOHttpConnection; import org.commoncrawl.io.NIOHttpHeaders; import org.commoncrawl.protocol.CrawlURL; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.service.crawler.PersistentCrawlTarget; import org.commoncrawl.service.crawler.RobotRulesParser.RobotRuleSet; import org.commoncrawl.service.crawler.filters.FilterResults; import org.commoncrawl.service.statscollector.CrawlerStats; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.FileUtils; import org.commoncrawl.util.IPAddressUtils; import org.commoncrawl.util.IntrusiveList; import org.commoncrawl.util.TextBytes; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; import org.commoncrawl.util.GZIPUtils.UnzipResult; import org.commoncrawl.util.IntrusiveList.IntrusiveListElement; import org.junit.Test; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; /** * CrawlList - a collection of CrawlTargets (disk backed) * * @author rana * */ public final class CrawlList extends IntrusiveList.IntrusiveListElement<CrawlList> { /** offline (disk) storage support **/ private static int DISK_FLUSH_THRESHOLD = 50; private static int DISK_LOAD_THRESHOLD = 10; private static int IDEAL_TARGET_COUNT = 25; private static final int MAX_ROBOTS_EXCLUSION_IN_LOOP = 3; private static final int MAX_FAILED_TARGETS_IN_LOOP = 50; private static final int IOEXCEPTION_TIMEOUT_BOOST = 60000; private static final int PAUSE_STATE_RETRY_DELAY = 10 * 60000; // 10 minutes static final AtomicLong seq = new AtomicLong(); public static class DiskQueueEntry implements Comparable<DiskQueueEntry> { final long seqNum; final CrawlList entry; final boolean isLoadRequest; public DiskQueueEntry(CrawlList item,boolean isLoadRequest) { this.seqNum = seq.getAndIncrement(); this.entry = item; this.isLoadRequest = isLoadRequest; } public CrawlList getListItem() { return entry; } 
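    // Ordering note (see compareTo below): the null-list shutdown sentinel sorts ahead of
    // everything else, disk *load* requests sort ahead of flush requests, and entries of the
    // same kind are serviced FIFO via the monotonically increasing sequence number.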
    @Override
    public int compareTo(DiskQueueEntry o) {
      if (entry == null && o.entry != null)
        return -1;
      else if (o.entry == null && entry != null)
        return 1;
      else {
        if (isLoadRequest && !o.isLoadRequest)
          return -1;
        else if (!isLoadRequest && o.isLoadRequest)
          return 1;
        else {
          return ((Long)seqNum).compareTo(o.seqNum);
        }
      }
    }
  }

  private static PriorityBlockingQueue<DiskQueueEntry> _diskOperationQueue = new PriorityBlockingQueue<DiskQueueEntry>();
  private static Thread _diskOperationThread = null;
  private static boolean _diskOpThreadShuttingDown = false;
  private static long _diskHeaderActiveVersionTimestamp = System.currentTimeMillis();

  /** logging **/
  private static final Log LOG = LogFactory.getLog(CrawlList.class);

  /** server responsible for servicing this domain **/
  private CrawlListHost _host;

  /** host name **/
  private String _listName;

  /** host metadata */
  private int _baseListId;

  /** unique list id **/
  private long _uniqueListId;

  /** next crawl interface used to service this list **/
  private int _nextCrawlInterface = 0;

  /** cumulative list of crawl targets associated with this queue ...*/
  private IntrusiveList<CrawlTarget> _pending = new IntrusiveList<CrawlTarget>();

  /** list of crawl targets directly scheduled for disk queue */
  private IntrusiveList<CrawlTarget> _queued = new IntrusiveList<CrawlTarget>();

  /** offline item count - the set of crawl targets that are stored offline on disk **/
  private int _offlineTargetCount = 0;

  /** disk request pending **/
  private boolean _diskRequestPending = false;

  /** currently scheduled item **/
  private CrawlTarget _scheduled = null;

  /** fetch start time **/
  private long _fetchStartTime = -1;

  /** fetch end time **/
  private long _fetchEndTime = -1;

  /** last successful fetch time (total time in milliseconds)**/
  private int _lastRequestDownloadTime = -1;

  /** last successful request redirect count **/
  private int _lastRequestRedirectCount = 0;

  /** active connection **/
  private NIOHttpConnection _activeConnection;

  /** SubDomain Stats and Robots State Information Structure **/
  private static class DomainInfo extends IntrusiveListElement<DomainInfo> {
    public String _domainName;
    public boolean _domainFailed = false;
    public boolean _domainBlackListed = false;
    public long _robotsCRC = -1;
    public long _lastTouched;
    /** host retry counter **/
    public byte _domainRetryCounter = 0;
    /** total 400 errors **/
    public int _HTTP400Count = 0;
    /** total 500 errors **/
    public int _HTTP500Count = 0;
    /** total 200 status code count**/
    public int _HTTP200Count = 0;
    /** sequential failure count **/
    public short _SequentialHTTPFailuresCount = 0;
    public boolean _robotsReturned400;
    public boolean _robotsReturned403;
  }

  /** domain info map **/
  private IntrusiveList<DomainInfo> _domainInfo = new IntrusiveList<DomainInfo>();

  /** active domain info **/
  private DomainInfo _activeDomainInfo;

  /** the active robots rule set to apply robots policy for this host **/
  private RobotRuleSet _ruleSet = null;

  /** robots returned 400 **/
  private boolean _robotsReturned400;
  private boolean _robotsReturned403;

  /** the crc value for the active rule set - computed at pre-parse time**/
  private long _robotsCRC = 0;

  /** robots file retrieved */
  private boolean _robotsRetrieved;

  /** robots host name **/
  private String _robotsHostName;

  /** last fetched robots host Name**/
  private String _lastFetchedRobotsHostName;

  /** last fetched robots data **/
  private String _lastFetchedRobotsData;

  /** crc calculator **/
  private static CRC32 _crc32 = new CRC32();

  /** last request was io exception **/
  private boolean _lastRequestWasIOException = false;

  private static final int MAX_ITEM_RETRY = 2;
  private static final int MAX_HOST_RETRY = 7;
  private static final int DEFAULT_ITEM_RETRY_WAIT_TIME = 20000;
  private static final int DEFAULT_HOST_RETRY_WAIT_TIME = 20000;
  private static final int MIN_CRAWL_DELAY = 1;
  private static final int MAX_CRAWL_DELAY = 3500;
  private static int STATS_CHECK_CODE_SAMPLE_THRESHOLD = 50; // don't do anything until we have retrieved at least 50 urls
  private static float BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD = .80f; // if 80% of urls are bad, fail the domain
  // private static int SEQUENTIAL_FAILURES_ON_403_ROBOTS_TRIGGER = 500;
  private static int SEQUENTIAL_FAILURES_NO_200_TRIGGER = 10; // if we get 10 sequential failures we bail
  // private static int SEQUENTIAL_FAILURES_SOME_200_TRIGGER = 1000;
  private static final int MAX_DNS_CACHE_ITEMS = 100;
  private static int MAX_DOMAIN_CACHE_ENTIRES = 1000;

  /** the reference to the singleton server object **/
  private static CrawlerServer _server = null;

  public enum Disposition {
    ItemAvailable,
    WaitingOnCompletion,
    WaitingOnTime,
    QueueEmpty
  }

  /** domain's disposition (state) **/
  private Disposition _disposition;

  private static byte WWWRULE_Remove = 1 << 0;
  private static byte WWWRULE_Add = 1 << 1;

  /** www rewrite rule patterns **/
  static class WWWReWriteItem extends IntrusiveList.IntrusiveListElement<WWWReWriteItem> {

    public WWWReWriteItem(String domainName, byte ruleType) {
      _wwwRuleDomain = domainName;
      _wwwRuleType = ruleType;
      _lastUpdateTime = System.currentTimeMillis();
    }

    public String _wwwRuleDomain = null;
    public long _lastUpdateTime = -1;
    public byte _wwwRuleType = 0;
  };

  private static final int MAX_REWRITE_ITEMS = 5;

  IntrusiveList<WWWReWriteItem> _rewriteItemList = null;

  static class DNSCacheItem extends IntrusiveList.IntrusiveListElement<DNSCacheItem> {

    public DNSCacheItem(String hostName, int ipAddress, long ttl) {
      _hostName = hostName;
      _ipAddress = ipAddress;
      _ttl = ttl;
      _lastAccessTime = System.currentTimeMillis();
    }

    public String _hostName;
    public int _ipAddress;
    public long _ttl;
    public long _lastAccessTime = -1;
  }

  IntrusiveList<DNSCacheItem> _dnsCacheItem = new IntrusiveList<DNSCacheItem>();

  public void cacheDNSEntry(String hostName, int ipAddress, long ttl) {
    DNSCacheItem oldestItem = null;
    DNSCacheItem found = null;

    for (DNSCacheItem item : _dnsCacheItem) {
      if (item._hostName.equals(hostName)) {
        item._ipAddress = ipAddress;
        item._ttl = ttl;
        item._lastAccessTime = System.currentTimeMillis();
        found = item;
      }
      oldestItem = (oldestItem == null) ? item : (oldestItem._lastAccessTime > item._lastAccessTime) ? item : oldestItem;
    }

    if (found == null) {
      if (_dnsCacheItem.size() == MAX_DNS_CACHE_ITEMS) {
        //LOG.info("###DNS Cache Full for Host:" + getListName() + " Flushing Host:" + oldestItem._hostName);
        _dnsCacheItem.removeElement(oldestItem);
      }
      _dnsCacheItem.addHead(new DNSCacheItem(hostName, ipAddress, ttl));
    }
    else {
      _dnsCacheItem.removeElement(found);
      _dnsCacheItem.addHead(found);
    }
  }

  private void addWWWReWriteItem(String originalItem, byte itemType) {
    WWWReWriteItem oldestItem = null;
    WWWReWriteItem found = null;

    if (_rewriteItemList == null) {
      _rewriteItemList = new IntrusiveList<WWWReWriteItem>();
    }

    for (WWWReWriteItem item : _rewriteItemList) {
      if (item._wwwRuleDomain.equals(originalItem)) {
        item._lastUpdateTime = System.currentTimeMillis();
        found = item;
      }
      oldestItem = (oldestItem == null) ? item : (oldestItem._lastUpdateTime > item._lastUpdateTime) ?
item : oldestItem; } if (found == null) { if (_rewriteItemList.size() == MAX_REWRITE_ITEMS) { _rewriteItemList.removeElement(oldestItem); } _rewriteItemList.addHead(new WWWReWriteItem(originalItem,itemType)); } if (found != null && found != _rewriteItemList.getHead()) { _rewriteItemList.removeElement(found); _rewriteItemList.addHead(found); } } public CrawlList(CrawlListHost crawlHost,int baseListId) { _host = crawlHost; _baseListId = baseListId; _uniqueListId = (((long)crawlHost.getIPAddress()) << 32) | _baseListId; _listName = "List:" + _baseListId + " For:" + IPAddressUtils.IntegerToIPAddressString(crawlHost.getIPAddress()); _robotsRetrieved = false; _disposition = Disposition.QueueEmpty; } /** host access **/ public CrawlListHost getHost() { return _host; } /** server access **/ static CrawlerServer getServerSingleton() { return _server; } static void setServerSingleton(CrawlerServer server) { _server = server; } public int getListId() { return _baseListId; } public long getUniqueListId() { return _uniqueListId; } /** host name **/ public String getListName() { return _listName; } /** disposition **/ Disposition getDisposition() { return _disposition; } void updateLastModifiedTime(long time) { _host.updateLastModifiedTime(time); } /** get next crawl interface used to service this list * */ public int getNextCrawlInterface() { return _nextCrawlInterface; } /** set next crawl interface to use for this list * */ public void setNextCrawlInterface(int crawlInterface) { _nextCrawlInterface = crawlInterface; } /** get the pending urls count **/ synchronized int getPendingURLCount() { return _pending.size(); } /** get the offline url count */ synchronized int getOfflineURLCount() { return _offlineTargetCount; } boolean isScheduled() { return _scheduled != null; } int getActiveURLCount() { return (_scheduled != null) ? 1 : 0; } void updateLastFetchStartTime(long newTime) { _fetchStartTime = newTime; getHost().updateLastFetchStartTime(newTime); } long getLastFetchStartTime() { return _fetchStartTime; } /** get fetch time in milliseconds for last request **/ int getLastRequestFetchTime() { if (_fetchStartTime != -1 && _fetchEndTime != -1){ return (int) Math.max(0,_fetchEndTime - _fetchStartTime); } return 0; } int getLastSuccessfulDownloadTime() { return _lastRequestDownloadTime; } public synchronized void stopCrawl() { // add anything scheduled to pending ... 
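    // (note: only the in-flight target is re-queued here and the disposition is recomputed from
    //  the in-memory pending list; the offline, on-disk target count is not consulted)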
if (_scheduled != null) { _pending.addTail(_scheduled); _scheduled = null; } if (_pending.size() != 0 || !_robotsRetrieved) { _disposition = Disposition.ItemAvailable; } else { _disposition = Disposition.QueueEmpty; } } /** add a new crawl target to the host queue **/ public synchronized void addCrawlTarget(CrawlTarget target,boolean toFrontOfQueue) { // LOG.info("DOMAIN:" + this.getHostName() + " ADDING TGT:" + target.getOriginalURL()); Disposition oldDisposition = _disposition; if (toFrontOfQueue) { target.setFlags(target.getFlags() | CrawlURL.Flags.IsHighPriorityURL); _pending.addHead(target); } else { // if offline != 0 add to pending if (_offlineTargetCount == 0) { _pending.addTail(target); } // otherwise add to queued else { //LOG.info("### QUEUED Adding to Queued List for CrawlList:" + getListName()); _queued.addTail(target); } } if (_disposition == Disposition.QueueEmpty) { _disposition = Disposition.ItemAvailable; } if (oldDisposition != _disposition && getHost().getActiveList() == this) { getHost().listDispositionChanged(this, oldDisposition, _disposition); } if (_pending.size() >= DISK_FLUSH_THRESHOLD || _queued.size() != 0) { if (!_diskRequestPending) { _diskRequestPending = true; _diskOperationQueue.add(new DiskQueueEntry(this,false)); } } } private boolean activeDomainRequiresRobotsFetch(String activeDomainName) { if (_robotsRetrieved && _robotsHostName != null && _robotsHostName.equalsIgnoreCase(activeDomainName)) { // no, the active robots file matches the active domain name. no need to fetch anything ... return false; } else { DomainInfo domainInfo = getDomainInfoFromDomain(activeDomainName); // get the cached crc for the active domain if it exists ... long cachedRobotsCRC = (domainInfo == null) ? -1 : domainInfo._robotsCRC; // if cached crc found ... if (cachedRobotsCRC != -1) { // if cached robots file matches the actvie robots file's crc ... if (_robotsRetrieved && cachedRobotsCRC == _robotsCRC) { //LOG.info("### Skipping Robots Fetch. Cached CRC == robotsCRC"); // no need to refetch return false; } // otherwise, check the host's cache ... else { // special case for the empty rule set if (cachedRobotsCRC == 0) { _robotsCRC = cachedRobotsCRC; _robotsHostName = activeDomainName; _robotsReturned400 = domainInfo._robotsReturned400; _robotsReturned403 = domainInfo._robotsReturned403; _ruleSet = RobotRulesParser.getEmptyRules(); _robotsRetrieved = true; if (Environment.detailLogEnabled()) LOG.info("### Skipping Robots Fetch. Cached CRC is Zero, indicating empty rule set."); return false; } else { // check the rule set cache in the host (by crc) RobotRuleSet ruleSet = _host.getCachedRobotsEntry(cachedRobotsCRC); // if cached object found .... if (ruleSet != null) { _robotsCRC = cachedRobotsCRC; _robotsHostName = activeDomainName; _robotsReturned400 = domainInfo._robotsReturned400; _robotsReturned403 = domainInfo._robotsReturned403; _ruleSet = ruleSet; _robotsRetrieved = true; if (Environment.detailLogEnabled()) LOG.info("### Skipping Robots Fetch. 
Cached CRC is Non-Zero and cached rule-set found via host."); return false; } } } } } return true; } public void setActiveDomainName(String hostName) { _activeDomainInfo = getDomainInfoFromDomain(hostName); } public DomainInfo getActiveDomain() { return _activeDomainInfo; } private DomainInfo getDomainInfoFromDomain(String domainName) { DomainInfo oldestItem = null; DomainInfo found = null; for (DomainInfo item : _domainInfo) { if (item._domainName.equals(domainName)) { if (getServerSingleton() != null) { if (item._lastTouched < getServerSingleton().getFilterUpdateTime()) { if (CrawlerServer.getEngine() != null) { item._domainBlackListed = CrawlerServer.getEngine().isBlackListedHost(domainName); } else { item._domainBlackListed = false; } } } item._lastTouched = System.currentTimeMillis(); found = item; } oldestItem = (oldestItem == null) ? item : (oldestItem._lastTouched > item._lastTouched) ? item : oldestItem; } if (found == null) { if (_domainInfo.size() == MAX_DOMAIN_CACHE_ENTIRES) { _domainInfo.removeElement(oldestItem); } found = new DomainInfo(); found._domainName = domainName; found._lastTouched = System.currentTimeMillis(); if (getServerSingleton() != null) { if (CrawlerServer.getEngine() != null) { found._domainBlackListed = CrawlerServer.getEngine().isBlackListedHost(domainName); } else { found._domainBlackListed = false; } } _domainInfo.addHead(found); } return found; } private long checkDomainCacheForRobotsCRC(String hostName){ for (DomainInfo aliasInfo : _domainInfo) { if (aliasInfo._domainName.equalsIgnoreCase(hostName)) { aliasInfo._lastTouched = System.currentTimeMillis(); _domainInfo.removeElement(aliasInfo); _domainInfo.addHead(aliasInfo); if (aliasInfo._robotsCRC != -1) { if (Environment.detailLogEnabled()) LOG.info("### Found Robots Match in Cache for host:" + hostName); } return aliasInfo._robotsCRC; } } return -1; } private void updateRobotsCRCForDomain(long crc,String domainName,boolean robotsReturned400,boolean robotsReturned403) { getDomainInfoFromDomain(domainName)._robotsCRC = crc; getDomainInfoFromDomain(domainName)._robotsReturned400 = robotsReturned400; getDomainInfoFromDomain(domainName)._robotsReturned403 = robotsReturned403; } private void resetRobotsState() { // flip robots status ... _robotsRetrieved = false; _robotsReturned400 = false; _robotsReturned403 = false; _robotsHostName = null; _robotsCRC = 0; _ruleSet = RobotRulesParser.getEmptyRules(); } private CrawlTarget buildRobotsRequest(String hostName) { CrawlTarget targetOut = null; // reset the robots state ... resetRobotsState(); // log the situation // LOG.info("####Robots-fetching robots for host:" + hostName); // and set up some initial robots state _robotsHostName = hostName; //build a robots.txt url URL robotsURL = null; try { robotsURL = new URL(getHost().getScheme(),_robotsHostName,"/robots.txt"); } catch (MalformedURLException e) { } if (robotsURL == null) { if (Environment.detailLogEnabled()) LOG.error("####Robots Unable to fetch Robots for host:"+ _robotsHostName); // cheat _robotsRetrieved = true; // and update the robot info in the alias map updateRobotsCRCForDomain(_robotsCRC, _robotsHostName,_robotsReturned400,_robotsReturned403); } else { // ok , the robots url is good targetOut = new CrawlTarget(0,this); // set the url ... 
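    // (the synthetic robots target is distinguished from ordinary targets primarily by the
    //  IsRobotsURL flag set just below; both fetchSucceeded and fetchFailed branch on that flag)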
targetOut.setOriginalURL(robotsURL.toString()); // and mark the target as a robots get targetOut.setFlags(CrawlURL.Flags.IsRobotsURL); CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats(); synchronized (crawlerStats) { crawlerStats.setActvieRobotsRequests(crawlerStats.getActvieRobotsRequests() + 1); } } return targetOut; } public boolean populateIPAddressForTarget(String hostName,CrawlTarget target) { for (DNSCacheItem item : _dnsCacheItem) { if (item._hostName.equalsIgnoreCase(hostName)) { if (item._ttl >= System.currentTimeMillis()) { //LOG.info("###Using Cached IP Address for target:" + target.getActiveURL() + " Cached IP:" + item._ipAddress + " TTL:" + item._ttl); target.setServerIP(item._ipAddress); target.setServerIPTTL(item._ttl); return true; } return false; } } return false; } private void applyRewriteRulesToTarget(String hostName,CrawlTarget target) { if(_rewriteItemList != null) { for (WWWReWriteItem item : _rewriteItemList) { if (item._wwwRuleDomain.equalsIgnoreCase(hostName)) { if ((item._wwwRuleType & WWWRULE_Add) != 0) { target.setOriginalURL(target.getOriginalURL().replaceFirst(hostName, "www." + hostName)); } else { target.setOriginalURL(target.getOriginalURL().replaceFirst(hostName, hostName.substring(4))); } break; } } } } static CrawlURLMetadata rewriteTestMetadata = new CrawlURLMetadata(); static FilterResults rewriteFilterResults = new FilterResults(); /** get next crawl candidate */ public synchronized CrawlTarget getNextTarget() { if (_scheduled != null) { throw new RuntimeException("Scheduled Not Null and getNextTarget called!"); } int maxRobotsExclusionInLoop = (CrawlerServer.getServer() != null) ? CrawlerServer.getServer().getMaxRobotsExlusionsInLoopOverride() : -1; if (maxRobotsExclusionInLoop == -1) { maxRobotsExclusionInLoop = MAX_ROBOTS_EXCLUSION_IN_LOOP; } // target out is currently null CrawlTarget targetOut = null; Disposition oldDisposition = _disposition; int robotsExcludedCount = 0; int failedTargetsCount = 0; String domainName = ""; while (targetOut == null && getNextPending(false) != null && getDisposition() == CrawlList.Disposition.ItemAvailable) { // pop the next target off of the queue ... CrawlTarget potentialTarget = getNextPending(true); // mark request start time potentialTarget.setRequestStartTime(System.currentTimeMillis()); // get the host name (fast) domainName = URLUtils.fastGetHostFromURL(potentialTarget.getActiveURL()); // if not valid ... fail explicitly if (domainName == null || domainName.length() == 0) { // explicitly fail this url ... CrawlTarget.failURL(potentialTarget.createFailureCrawlURLObject(CrawlURL.FailureReason.MalformedURL, null),potentialTarget, CrawlURL.FailureReason.MalformedURL,null); } else { /* // potentially rewrite domain name if (getServer().getDNSRewriteFilter() != null) { synchronized (rewriteTestMetadata) { if (getServer().getDNSRewriteFilter().filterItem(domainName, "", rewriteTestMetadata, rewriteFilterResults) == FilterResult.Filter_Modified) { LOG.info("### FILTER Rewrote DomainName:" + domainName + " To:" + rewriteFilterResults.getRewrittenDomainName()); domainName = rewriteFilterResults.getRewrittenDomainName(); } } } */ // set the active host name setActiveDomainName(domainName); // check to see if the domain has been marked as failed or the host has been marked as failed ... if (!getActiveDomain()._domainFailed && !getActiveDomain()._domainBlackListed && !_host.isFailedServer()) { // if the the active target does not match the current robots file ... 
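      // Robots gating: when the target's domain has no matching robots rules loaded, the target
      // is pushed back onto the head of the pending list and a synthetic robots.txt CrawlTarget
      // is returned in its place; the real target is re-examined on a later getNextTarget() pass,
      // once the robots fetch/parse has completed (or failed).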
if (activeDomainRequiresRobotsFetch(domainName)) { // add the target back to the head of the queue... _pending.addHead(potentialTarget); // and build a robots request ... targetOut = buildRobotsRequest(domainName); } // otherwise ... go ahead try to fetch the next url in the queue else { // now if disposition is still item available ... if (potentialTarget != null && _disposition == Disposition.ItemAvailable) { targetOut = potentialTarget; URL theTargetURL = null; try { theTargetURL = new URL(targetOut.getOriginalURL()); } catch (MalformedURLException e) { theTargetURL = null; LOG.error("Error parsing URL:"+targetOut.getOriginalURL() + " for Host:"+ domainName); } if (theTargetURL == null) { // explicitly fail this url ... CrawlTarget.failURL(targetOut.createFailureCrawlURLObject(CrawlURL.FailureReason.MalformedURL, null), targetOut,CrawlURL.FailureReason.MalformedURL,null); // and set target out to null!! targetOut = null; } else { boolean robotsExcluded = !_ruleSet.isAllowed(theTargetURL); boolean serverExcluded = false; if (!robotsExcluded) { serverExcluded = CrawlerServer.getServer().isURLInBlockList(theTargetURL); } // validate against the robots file ... if (robotsExcluded || serverExcluded) { //track number of robots exclusion in this loop ++robotsExcludedCount; // inform host _host.incrementCounter(CrawlListHost.CounterId.RobotsExcludedCount, 1); // explicitly fail this url ... if (robotsExcluded) { if (Environment.detailLogEnabled()) LOG.info("### ROBOTS Excluded URL:" + theTargetURL + " via Robots File"); CrawlTarget.failURL(targetOut.createFailureCrawlURLObject(CrawlURL.FailureReason.RobotsExcluded, null),targetOut, CrawlURL.FailureReason.RobotsExcluded,null); } else { if (Environment.detailLogEnabled()) LOG.info("### ROBOTS Excluded URL:" + theTargetURL + " via Blacklist"); CrawlTarget.failURL(targetOut.createFailureCrawlURLObject(CrawlURL.FailureReason.BlackListedURL, null),targetOut, CrawlURL.FailureReason.BlackListedURL,null); } // and set target out to null targetOut = null; // if robots processed in loop exceeds maximum if (robotsExcludedCount >= maxRobotsExclusionInLoop) { if (_pending.size() != 0 || _offlineTargetCount != 0) { // wait on time ... _disposition = Disposition.WaitingOnTime; } else { _disposition = Disposition.QueueEmpty; } // and break out break; } } // } } } } // otherwise ... if the domain has failed ... else { if (potentialTarget != null) { int failureReason = CrawlURL.FailureReason.TooManyErrors; String failureDesc = "Host Failed due to too many errors"; if (getActiveDomain()._domainBlackListed) { failureReason = CrawlURL.FailureReason.BlackListedURL; failureDesc = "Host Black Listed"; } else if (getHost().isBlackListedHost()) { failureReason = CrawlURL.FailureReason.BlackListedHost; failureDesc = "Host Black Host"; } // fail the url and move on ... //TODO: DISABLING THIS BECAUSE FAILING FOR ABOVE REASONS IS NOT REALLY A PERSISTENT FAILURE ATTRIBUTABLE TO THE URL // CrawlTarget.failURL(potentialTarget.createFailureCrawlURLObject(failureReason, failureDesc),potentialTarget, failureReason,null); } // set targetOut = null; // increment failed item count failedTargetsCount++; // now, if failed count exceeds max failures in loop if (failedTargetsCount >= MAX_FAILED_TARGETS_IN_LOOP) { if (_pending.size() != 0 || _offlineTargetCount != 0) { // wait on time ... _disposition = Disposition.WaitingOnTime; } else { _disposition = Disposition.QueueEmpty; } // and break out break; } } } } // ok, if we have a target ... fetch it ... 
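    // (at this point the candidate has already cleared the robots rules, the server block list,
    //  and the domain/host failure checks; the remaining gate below is the host's paused state)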
if (targetOut != null) { // ok before we can fetch this guy, we need to check to see if the associated host is in a paused state ... if (_host.isPaused()) { LOG.info("***getNextItem for Host:" + domainName + " is Paused!!"); // null target out, which will set us in a waiting on time state again targetOut = null; } // now again, if target out is not null if (targetOut != null) { if (Environment.detailLogEnabled()) LOG.info("getNextItem for Host:" + domainName + " Returned URL:" + targetOut.getOriginalURL() + " object:" + targetOut.toString()); // set scheduled item pointer ... _scheduled = targetOut; // set the active host name setActiveDomainName(domainName); // get ip address info (if available) populateIPAddressForTarget(domainName,_scheduled); // change disposition ... _disposition = Disposition.WaitingOnCompletion; //set initial fetch start time ... updateLastFetchStartTime(System.currentTimeMillis()); } } // if target out is null, and pending size is zero but there are offline targets ... if (targetOut == null) { if (_pending.size() != 0 || _offlineTargetCount != 0) { // then set disposition to waiting on time ... _disposition = CrawlList.Disposition.WaitingOnTime; } else { _disposition = CrawlList.Disposition.QueueEmpty; } } // check to see if we need to load more items from disk potentiallyQueueDiskLoad(); if (targetOut != null) { // finally rewrite target url if necessary applyRewriteRulesToTarget(domainName,targetOut); } if (targetOut != null) { // LOG.info("### getNextIem for Host:" + domainName + " Returned URL:" + targetOut.getActiveURL()); } return targetOut; } void fetchStarting(CrawlTarget target,NIOHttpConnection connection) { _activeConnection = connection; } /** fetch started callback - called from CrawlTarget **/ void fetchStarted(CrawlTarget target) { // record fetch start time ... 
updateLastFetchStartTime(System.currentTimeMillis()); // and notify host as well _host.updateLastFetchStartTime(getLastFetchStartTime()); if (_scheduled == target) { if (Environment.detailLogEnabled()) LOG.info("Fetch Started URL:" + target.getOriginalURL()); } else { if (_scheduled == null) { LOG.error("fetchStarted - scheduled target is null and fetch started target is:" + target.getOriginalURL().toString() + " list:" + target.getSourceList().getListName() ); } else { LOG.error ( "fetchStarted - scheduled target is: " + _scheduled.getOriginalURL().toString() +" list:" +_scheduled.getSourceList().getListName() + " and fetch started target is:" + target.getOriginalURL().toString() + " list:" + target.getSourceList().getListName() ); } } } /** if in memory queue is exhausted or below threshold and there offline targets, queue up a load from disk for this domain **/ private void potentiallyQueueDiskLoad() { if (_pending.size() <= DISK_LOAD_THRESHOLD && (!_diskRequestPending || _pending.size() ==0) && _offlineTargetCount != 0) { _diskRequestPending = true; _diskOperationQueue.add(new DiskQueueEntry(this,true)); } } static class RobotRuleResult { public RobotRuleSet ruleSet; public long crcValue; }; /** fetch succeeded **/ void fetchSucceeded(final CrawlTarget target,int downloadTime,final NIOHttpHeaders httpHeaders,final Buffer contentBuffer) { _lastRequestWasIOException = false; _lastRequestDownloadTime = downloadTime; _lastRequestRedirectCount = target.getRedirectCount(); _fetchEndTime = System.currentTimeMillis(); _activeConnection = null; getHost().incrementCounter(CrawlListHost.CounterId.SuccessfullGetCount,1); // reset host's io error count _host.resetCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount); if (getActiveDomain() != null) getActiveDomain()._domainRetryCounter = 0; Disposition oldDisposition = _disposition; final String originalHost = URLUtils.fastGetHostFromURL(target.getOriginalURL()); final String activeHost = URLUtils.fastGetHostFromURL(target.getActiveURL()); if (originalHost != null && activeHost != null) { // update our server ip information from information contained within crawl target ... cacheDNSEntry(activeHost,target.getServerIP(),target.getServerIPTTL()); // if the target was redirected ... cache the original ip address and ttl as well ... if (target.isRedirected()) { if (target.getOriginalRequestData() != null) { cacheDNSEntry(originalHost,target.getOriginalRequestData()._serverIP,target.getOriginalRequestData()._serverIPTTL); } } } final int resultCode = NIOHttpConnection.getHttpResponseCode(httpHeaders); if (resultCode == 200){ getHost().incrementCounter(CrawlListHost.CounterId.Http200Count,1); if (getActiveDomain() != null) { getActiveDomain()._HTTP200Count++; getActiveDomain()._SequentialHTTPFailuresCount = 0; } // validate www rewrite rule if not set and target was redirected ... if (target.isRedirected()) { /* this is broken for the new list design if (!originalHost.equalsIgnoreCase(activeHost)) { // if redirect strips the www then ... if ((originalHost.startsWith("www.") || originalHost.startsWith("WWW.")) && activeHost.equalsIgnoreCase(originalHost.substring(4))) { addWWWReWriteItem(originalHost,WWWRULE_Remove); } // else if redirect adds the www then ... 
else if ((activeHost.startsWith("www.") || activeHost.startsWith("WWW.")) && originalHost.equalsIgnoreCase(activeHost.substring(4))) { addWWWReWriteItem(originalHost,WWWRULE_Add); } } */ } } else if (resultCode >= 400 && resultCode < 500) { if (resultCode == 403) { // inform host for stats tracking purposes _host.incrementCounter(CrawlListHost.CounterId.Http403Count,1); } if (getActiveDomain() != null) getActiveDomain()._SequentialHTTPFailuresCount++; } else if (resultCode >=500 && resultCode < 600) { if (getActiveDomain() != null) { getActiveDomain()._SequentialHTTPFailuresCount++; } } if (_scheduled != target) { if (_scheduled == null) LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchSucceed Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!"); else LOG.error("List:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchSucceed Target is:" + target.getOriginalURL() + " " + target.toString() + " ActiveTarget is:" + _scheduled.getOriginalURL() + " " + _scheduled.toString()); } else { // clear active ... _scheduled = null; // if this is the robots target ... if ( (target.getFlags() & CrawlURL.Flags.IsRobotsURL) == 1) { final CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats(); // process the robots data if any ... // check for null queue (in case of unit test); if (resultCode == 200) { _robotsRetrieved = true; synchronized (crawlerStats) { crawlerStats.setRobotsRequestsSucceeded(crawlerStats.getRobotsRequestsSucceeded() + 1); crawlerStats.setRobotsRequestsQueuedForParse(crawlerStats.getRobotsRequestsQueuedForParse() + 1); } LOG.info("### Scheduling Robots Parse for:"+target.getActiveURL()); // transition to a waiting on completion disposition ... _disposition = Disposition.WaitingOnCompletion; if (getServerSingleton() != null) { // schedule a robots parser parse attempt ... 
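            // Note: the parse itself runs off the network thread on the shared "robots" thread pool.
            // The callable gunzips the body when Content-Encoding indicates gzip, discards content
            // that is actually HTML, computes a CRC32 over the robots bytes for the per-host
            // rule-set cache, and hands the rest to RobotRulesParser; the completion callback then
            // installs the parsed rule set, caches it on the host by CRC, and moves this list out
            // of the WaitingOnCompletion disposition.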
getServerSingleton().registerThreadPool("robots", 5).execute(new ConcurrentTask<RobotRuleResult>(getServerSingleton().getEventLoop(), new Callable<RobotRuleResult>() { public RobotRuleResult call() throws Exception { try { TextBytes contentData = new TextBytes(contentBuffer.get()); String contentEncoding = httpHeaders.findValue("Content-Encoding"); if (contentEncoding != null && contentEncoding.equalsIgnoreCase("gzip")) { if (Environment.detailLogEnabled()) LOG.info("GZIP Encoding Detected for Robots File For:"+activeHost); UnzipResult result = GZIPUtils.unzipBestEffort(contentData.getBytes(),CrawlEnvironment.CONTENT_SIZE_LIMIT); if (result == null) { contentData = null; if (Environment.detailLogEnabled()) LOG.info("GZIP Decoder returned NULL for Robots File For:"+activeHost); } else { contentData.set(result.data.get(),result.data.getOffset(),result.data.getCount()); } } try { if (contentData != null) { String robotsTxt = contentData.toString().trim().toLowerCase(); if (robotsTxt.startsWith("<html") || robotsTxt.startsWith("<!doctype html")) { contentData = null; CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,CrawlerEngine.RobotsParseFlag_ContentWasHTML); } else { CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, robotsTxt,CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,0); synchronized (this) { _lastFetchedRobotsData = robotsTxt; _lastFetchedRobotsHostName = _robotsHostName; } } } else { CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, CrawlerEngine.RobotsParseFlag_ContentDecodeFailed); } } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete, CrawlerEngine.RobotsParseFlag_ContentDecodeFailed); } if (Environment.detailLogEnabled()) LOG.info("Parsing Robots File for Host:"+activeHost); RobotRuleResult result = new RobotRuleResult(); if (contentData != null) { synchronized (_crc32) { _crc32.reset(); _crc32.update(contentData.getBytes(),contentData.getOffset(),contentData.getLength()); result.crcValue = _crc32.getValue(); } RobotRulesParser parser = new RobotRulesParser(getServerSingleton().getConfig()); result.ruleSet = parser.parseRules(contentData.getBytes(),contentData.getOffset(),contentData.getLength()); } else { result.ruleSet = RobotRulesParser.getEmptyRules(); result.crcValue = 0; } return result; } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); throw e; } } }, new ConcurrentTask.CompletionCallback<RobotRuleResult>() { public void taskComplete(RobotRuleResult loadResult) { synchronized (crawlerStats) { crawlerStats.setRobotsRequestsQueuedForParse(crawlerStats.getRobotsRequestsQueuedForParse() - 1); } if (loadResult != null) { boolean disallowsAll = !_ruleSet.isAllowed("/"); boolean robotsHadCrawlDelay = _ruleSet.getCrawlDelay() != -1; boolean explicitMention = _ruleSet.explicitMention; int logFlags = 0; if (disallowsAll) logFlags |= CrawlerEngine.RobotsParseFlag_ExcludesAll; if (explicitMention) logFlags |= CrawlerEngine.RobotsParseFlag_ExplicitMention; if (robotsHadCrawlDelay) logFlags |= CrawlerEngine.RobotsParseFlag_HasCrawlDelay; synchronized (crawlerStats) { 
crawlerStats.setRobotsRequestsSuccessfullParse(crawlerStats.getRobotsRequestsSuccessfullParse() + 1); if (disallowsAll) { crawlerStats.setRobotsFileExcludesAllContent(crawlerStats.getRobotsFileExcludesAllContent() + 1); if (explicitMention) crawlerStats.setRobotsFileExplicitlyExcludesAll(crawlerStats.getRobotsFileExplicitlyExcludesAll() + 1); } if (explicitMention) { crawlerStats.setRobotsFileHasExplicitMention(crawlerStats.getRobotsFileHasExplicitMention() + 1); } if (robotsHadCrawlDelay) { crawlerStats.setRobotsFileHadCrawlDelay(crawlerStats.getRobotsFileHadCrawlDelay() + 1); } } CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, 0 , null,CrawlerEngine.RobotsLogEventType.Parse_Succeeded,logFlags); _ruleSet = loadResult.ruleSet; _robotsCRC = loadResult.crcValue; _host.cacheRobotsFile(_ruleSet, _robotsCRC); } else { CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, 0 , null,CrawlerEngine.RobotsLogEventType.Parse_Failed,0); synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailedParse(crawlerStats.getRobotsRequestsFailedParse() + 1); } // LOG.error("####Robots parsing for host:" + activeHost + " failed."); _ruleSet = RobotRulesParser.getEmptyRules(); _robotsCRC = 0; } //if (Environment.detailLogEnabled()) LOG.info("####Robots RETRIEVED for Host:"+activeHost + " CrawlDelay IS:" + getCrawlDelay(false)); if (originalHost != null && activeHost != null) { updateRobotsCRCForDomain(_robotsCRC, originalHost,_robotsReturned400,_robotsReturned403); if (activeHost.compareToIgnoreCase(originalHost) != 0) { updateRobotsCRCForDomain(_robotsCRC, activeHost,_robotsReturned400,_robotsReturned403); } } Disposition oldDisposition = _disposition; if (getNextPending(false) != null) { _disposition = Disposition.ItemAvailable; } else { _disposition = Disposition.WaitingOnTime; } if (oldDisposition != _disposition) { // notify queue getHost().listDispositionChanged(CrawlList.this, oldDisposition, _disposition); } } public void taskFailed(Exception e) { if (Environment.detailLogEnabled()) LOG.error("####Robots parsing for host:" + _robotsHostName +" failed with exception" + e); _ruleSet = RobotRulesParser.getEmptyRules(); Disposition oldDisposition = _disposition; if (getNextPending(false) != null) { _disposition = Disposition.ItemAvailable; } else { _disposition = Disposition.WaitingOnTime; } if (oldDisposition != _disposition) { // notify queue getHost().listDispositionChanged(CrawlList.this, oldDisposition, _disposition); } } })); } // explitly return here ( inorder to wait for the async completion event) return; } //otherwise ... else { synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1); } CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, resultCode, null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed,0); _robotsCRC = 0; if (Environment.detailLogEnabled()) LOG.info("####Robots GET for Host:" + activeHost + "FAILED With Result Code:" + resultCode); //TODO: MAKE THIS MORE ROBUST ... // clear robots flag ... _robotsRetrieved = true; // see if result code was a 403 if (resultCode >= 400 && resultCode <= 499) { _robotsReturned400 = true; if (resultCode == 403) _robotsReturned403 = true; } // for now, assume no robots rules for any error conditions ... 
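        // A failed robots GET is still cached: the 4xx markers and a CRC of 0 (the "empty rules"
        // sentinel) are recorded below against both the original and redirected host names, so
        // subsequent targets for those hosts skip the robots fetch entirely
        // (see activeDomainRequiresRobotsFetch).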
_ruleSet = RobotRulesParser.getEmptyRules(); if (originalHost != null && activeHost != null) { updateRobotsCRCForDomain(_robotsCRC, originalHost,_robotsReturned400,_robotsReturned403); if (activeHost.compareToIgnoreCase(originalHost) != 0) { updateRobotsCRCForDomain(_robotsCRC, activeHost,_robotsReturned400,_robotsReturned403); } } } } if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) { // update active host stats and check for failure ... checkActiveHostStatsForFailure(); } // if there are no more items in the queue if (getNextPending(false) == null) { // if offline count is zero then mark this domain's queue as empty if (_offlineTargetCount == 0) { _disposition = Disposition.QueueEmpty; } // otherwise put us in a wait state and potentially queue up a disk load else { _disposition = Disposition.WaitingOnTime; // potentially queue up a disk load potentiallyQueueDiskLoad(); } } else { // if we are ready to fetch the next item ... if (calculateNextWaitTime() < System.currentTimeMillis()) { _disposition = Disposition.ItemAvailable; } else { // transition to a new wait state ... _disposition = Disposition.WaitingOnTime; } } if (oldDisposition != _disposition) { // either way ... notify queue getHost().listDispositionChanged(this, oldDisposition, _disposition); } } } /** get total request count **/ private final int getTotalFailureCount() { if (getActiveDomain() != null) return getActiveDomain()._HTTP400Count + getActiveDomain()._HTTP500Count; return 0; } /** check to see if we should fail the host based on collected stats **/ private boolean checkActiveHostStatsForFailure() { boolean failHost = false; if (getActiveDomain() != null) { String errorReason = null; /* if (getActiveDomain()._SequentialHTTPFailuresCount >= SEQUENTIAL_FAILURES_ON_403_ROBOTS_TRIGGER && _robotsReturned403) { errorReason ="Too Many Sequential Errors AFTER Robots Returned 403. HTTP200 Count:" + getActiveDomain()._HTTP200Count; failHost =true; } */ if ((getActiveDomain()._HTTP200Count == 0 && getActiveDomain()._SequentialHTTPFailuresCount >= SEQUENTIAL_FAILURES_NO_200_TRIGGER)) { errorReason ="Too Many Sequential Errors. RobotsReturned400:" + _robotsReturned400 + " 400 Count:" + getActiveDomain()._HTTP400Count + " 500 Count:" + getActiveDomain()._HTTP500Count + " 200 Count:" + getActiveDomain()._HTTP200Count; failHost =true; } else { int totalFailureCount = getTotalFailureCount(); int totalRequestCount = totalFailureCount + getActiveDomain()._HTTP200Count; if (totalRequestCount != 0 && totalRequestCount >= STATS_CHECK_CODE_SAMPLE_THRESHOLD) { float badToGoodPercent = (float)totalFailureCount / (float)totalRequestCount; if (badToGoodPercent >= BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD) { failHost = true; errorReason ="Bad To Good URL Pct:" + badToGoodPercent +" exceeded Threshold:" + BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD + " RobotsReturned400:" + _robotsReturned400 + " 400 Count:" + getActiveDomain()._HTTP400Count + " 500 Count:" + getActiveDomain()._HTTP500Count + " 200 Count:" + getActiveDomain()._HTTP200Count; } } } if (failHost) { failActiveDomain(CrawlURL.FailureReason.TooManyErrors, errorReason); LOG.error("#### HOST FAILURE - List:" + getListName() + "Host: " + getActiveDomain()._domainName +" Reason:" + errorReason); } } return failHost; } private static final int FAIL_STRATEGY_RETRY_ITEM = 0; // increment the failure count on the item and retry private static final int FAIL_STRATEGY_RETRY_HOST = 1; // increment the failure count on the host and retry ... 
private static final int FAIL_STRATEGY_FAIL_ITEM = 2; // immediately fail the item ... // private static final int FAIL_STRATEGY_FAIL_HOST = 3; // immediately fail the host ... private static final int failureCodeStrategyTable[] = { FAIL_STRATEGY_RETRY_ITEM, // UNKNOWN - result: Inc Fail Count on Item, potentially reschedule FAIL_STRATEGY_FAIL_ITEM,// UknownProtocol - result: Immediately Fail Item FAIL_STRATEGY_FAIL_ITEM,// MalformedURL - result: Immediately Fail Item FAIL_STRATEGY_RETRY_ITEM,// Timeout - result: Inc Fail Count on Host, potentially reschedule FAIL_STRATEGY_FAIL_ITEM,// DNSFailure -result: reschedule, set waitstate for Host FAIL_STRATEGY_RETRY_HOST,// ResolverFailure -result: Inc Fail Count on Host, potentially reschedule FAIL_STRATEGY_RETRY_ITEM,// IOException -result: Inc Fail Count on Item, potentially reschedule FAIL_STRATEGY_FAIL_ITEM, // RobotsExcluded FAIL_STRATEGY_FAIL_ITEM,// NoData = 9; FAIL_STRATEGY_RETRY_ITEM,// RobotsParseError = 10; FAIL_STRATEGY_FAIL_ITEM,// RedirectFailed = 11; FAIL_STRATEGY_RETRY_ITEM,// RuntimeError = 12; FAIL_STRATEGY_RETRY_HOST,// ConnectTimeout = 13; FAIL_STRATEGY_FAIL_ITEM,//BlackListedHost = 14; FAIL_STRATEGY_FAIL_ITEM,//BlackListedURL = 15; FAIL_STRATEGY_FAIL_ITEM,//TooManyErrors = 16; FAIL_STRATEGY_FAIL_ITEM,//InCache = 17; FAIL_STRATEGY_FAIL_ITEM// InvalidResponseCode = 18; }; private void failURL(CrawlTarget target,int failureReason,String errorDescription) { // explicitly fail the item ... CrawlTarget.failURL(target.createFailureCrawlURLObject(failureReason,errorDescription),target,failureReason,errorDescription); } private synchronized void failActiveDomain(int failureReason,String errorDescription) { if (getActiveDomain() != null) { LOG.error("### Failing Active Domain:" + getActiveDomain()._domainName + " in List:" + getListName() + " ReasonCode:" + failureReason + " Description:" + errorDescription); // _disposition = Disposition.QueueEmpty; getActiveDomain()._domainFailed = true; /* // fail scheduled url ... if (_scheduled != null) { _scheduled = null; } // just remove all pending urls from list for now ... _pending.removeAll(); // reset offline count... _offlineTargetCount = 0; // reset disk operation pending indiciator ... _diskRequestPending = false; */ getHost().incrementCounter(CrawlListHost.CounterId.FailedDomainCount,1); CrawlerServer.getEngine().failDomain(getActiveDomain()._domainName); } } synchronized void fetchFailed(CrawlTarget target, int failureReason,String description) { _activeConnection = null; _lastRequestRedirectCount = target.getRedirectCount(); _fetchEndTime = System.currentTimeMillis(); getHost().incrementCounter(CrawlListHost.CounterId.FailedGetCount,1); if (getActiveDomain() != null) { getActiveDomain()._SequentialHTTPFailuresCount++; } _lastRequestWasIOException = false; //check to see if the error is an io exception or a timeout if (failureReason == CrawlURL.FailureReason.IOException || failureReason == CrawlURL.FailureReason.Timeout) { // increment host failure counter ... _host.incrementCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount,1); _lastRequestWasIOException = true; } // the rest is similar to a host retry strategy ... 
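    // Failure handling below maps the failure reason through failureCodeStrategyTable
    // (indexed as failureReason - 1, so FailureReason codes are assumed to start at 1 and run
    // contiguously through InvalidResponseCode); retry strategies re-queue the target unless the
    // host has failed, the per-item retry budget is exhausted, or the target is a
    // high-priority URL.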
Disposition oldDisposition = _disposition; if (_scheduled != target) { if (_scheduled == null) LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchFailed Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!"); else LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName() + " fetchFailed Target is:" + target.getOriginalURL() + " ActiveTarget is:" + _scheduled.getOriginalURL()); } else { // reset active and scheduled ... _scheduled = null; // if we failed on the robots get ... if ((target.getFlags() & CrawlURL.Flags.IsRobotsURL) == 1) { CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats(); synchronized (crawlerStats) { crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1); } //TODO: FIGURE THIS OUT LATER ... FOR NOW .. ON A FAILURE OF ROBOTS.TXT GET, WE ASSUME NO ROBOTS.TXT FILE ... //LOG.warn("Robots Fetch for host:"+getHostName() + " Failed with Reason:" + failureReason +" Desc:" + description); //LOG.warn("Assuming NO-ROBOTS FILE"); CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),_robotsHostName, 0 , null,CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed,0); target.logFailure(CrawlerServer.getEngine(),failureReason,description); _robotsRetrieved = true; _ruleSet = RobotRulesParser.getEmptyRules(); // and clear scheduled ... _scheduled = null; updateLastFetchStartTime(-1); // and transition to wait state .... _disposition = Disposition.WaitingOnTime; } // otherwise pass on to underlying crawl target handler ... else { // default failure strategy ... int failureStrategy = FAIL_STRATEGY_RETRY_ITEM; // if failure code is within known failure codes ... if (failureReason >= CrawlURL.FailureReason.UNKNOWN && failureReason <= CrawlURL.FailureReason.InvalidResponseCode) { // use table to map strategy ... failureStrategy = failureCodeStrategyTable[failureReason-1]; } switch (failureStrategy) { case FAIL_STRATEGY_RETRY_HOST: case FAIL_STRATEGY_RETRY_ITEM: { // increment retry counter ... getActiveDomain()._domainRetryCounter ++; // increment retry counter on target ... target.incrementRetryCounter(); // IFF server failed ... // OR retry count on item exceeded ... // OR this item is a high priority dispatch item ... // THEN immediately fail this item // ELSE queue up this item for subsequent retry if (_host.isFailedServer() || target.getRetryCount() >= MAX_ITEM_RETRY || ((target.getFlags() & CrawlURL.Flags.IsHighPriorityURL) != 0) ) { failURL(target,failureReason,description); } else { // and add it back to the pending list ... _pending.addTail(target); } }break; case FAIL_STRATEGY_FAIL_ITEM: { failURL(target,failureReason,description); }break; /* case FAIL_STRATEGY_FAIL_HOST: { // just put the entire host in a fail state ... failDomain(failureReason,description); }break; */ } switch (failureStrategy) { case FAIL_STRATEGY_RETRY_ITEM: case FAIL_STRATEGY_FAIL_ITEM: case FAIL_STRATEGY_RETRY_HOST: { // check to see if there are items in the pending queue .... if (_pending.size() == 0 && _offlineTargetCount == 0) { // if not... transition to Queue Empty _disposition = Disposition.QueueEmpty; } else { long waitTime = calculateNextWaitTime(); // if we can fetch the next item ... if (waitTime <= System.currentTimeMillis()) { if (_pending.size() != 0) // shift to an available disposition ... _disposition = Disposition.ItemAvailable; else // shift to waiting on time disposition (to wait for disk queue load). _disposition = Disposition.WaitingOnTime; } else { // wait on time ... 
              _disposition = Disposition.WaitingOnTime;
            }
          }
        }
        break;
      }

      if (description == null)
        description = "";

      if (Environment.detailLogEnabled())
        LOG.error("Fetch Failed for URL:" + target.getOriginalURL() + " Reason:" + failureReason
            + " Description:" + description + " Strategy:" + failureStrategy + " OldDisp:" + oldDisposition
            + " NewDisp:" + _disposition);
    }

    if (_disposition == Disposition.WaitingOnCompletion) {
      LOG.error("### BUG Fetch Failed for URL:" + target.getOriginalURL() + " failed to transition List to proper disposition!");
    }

    if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) {
      // update active host stats and check for failure ...
      checkActiveHostStatsForFailure();
    }

    // notify queue if disposition changed ...
    if (_disposition != oldDisposition) {
      getHost().listDispositionChanged(this, oldDisposition, _disposition);
    }
  }
}

/** clear a pre-existing wait state **/
synchronized void clearWaitState() {

  Disposition oldDisposition = _disposition;

  // if robots retrieval is pending ...
  if (_robotsRetrieved == false) {
    // LOG.debug("clearWaitState called on Host:"+getHostName()+ " after initial robots fetch");
    // explicitly transition to available (to retry robots fetch... )
    _disposition = Disposition.ItemAvailable;
  }
  // otherwise if pending queue size is zero or host has failed ...
  else if ((_pending.size() == 0 && _offlineTargetCount == 0 && _queued.size() == 0)) {
    // LOG.debug("clearWaitState called on Host:"+getHostName()+ " and queue is empty. transitioning to QueueEmpty");
    // transition to queue empty disposition
    _disposition = Disposition.QueueEmpty;
  }
  else {
    // if active request size < max simultaneous requests ...
    if (_scheduled == null) {
      // if there are items to be read from the in memory list ...
      if (_pending.size() != 0) {
        // LOG.error("clearWaitState called Host:"+getHostName()+ " and getNextPendingReturned object. transitioning to ItemAvailable");
        // immediately transition to an available state ...
        _disposition = Disposition.ItemAvailable;
      }
      else {
        if (!_diskRequestPending) {
          _diskRequestPending = true;
          _diskOperationQueue.add(new DiskQueueEntry(this, true));
        }
        _disposition = Disposition.WaitingOnTime;
      }
    }
    // otherwise... we are waiting on completion now ...
    else {
      LOG.warn("clearWaitState called on already scheduled list:" + getListName());
      _disposition = Disposition.WaitingOnCompletion;
    }
  }

  getHost().listDispositionChanged(this, oldDisposition, _disposition);
}

/** calculateRetryWaitTime */
public long calculateNextWaitTime() {

  // ok check to see if the related host is paused ...
  if (_host.isPaused()) {
    LOG.info("*** host is paused. pausing crawl for: " + PAUSE_STATE_RETRY_DELAY + " milliseconds");
    // ok suspend for pause delay
    return System.currentTimeMillis() + PAUSE_STATE_RETRY_DELAY;
  }

  if (_fetchStartTime == -1) {
    return System.currentTimeMillis();
  }
  else {
    // first calculate crawl delay based on robots delay value * number of hops to service last request
    //int crawlDelay = (getCrawlDelay(true) * (_lastRequestRedirectCount+1));
    int crawlDelay = getCrawlDelay(true);

    // if the crawl delay is the default host crawl delay
    if (crawlDelay == _host.getCrawlDelay()) {
      // see if fetch time is available
      int lastDocFetchTime = getLastRequestFetchTime();

      if (lastDocFetchTime != 0) {
        // calculate alternate crawl delay based on fetch time ...
int alternateCrawlDelay = lastDocFetchTime * 4; if (alternateCrawlDelay > crawlDelay) { crawlDelay = alternateCrawlDelay; if (Environment.detailLogEnabled()) LOG.info("### CRAWLDELAY Using Alternate Crawl Delay of:" + alternateCrawlDelay + " for URL:" + getNextPending(false)); } } } /* if (_lastRequestDownloadTime != -1) { // next see if host took more than crawl delay millseconds to respond if (_lastRequestDownloadTime >= getCrawlDelay()) { // add request time to crawl delay crawlDelay += _lastRequestDownloadTime; } // add one second for every 2 seconds of request time else { crawlDelay += 1000 * (_lastRequestDownloadTime / 2000); } } */ // ok ... adjust crawl delay by the number of hops it took to get the result if (_fetchStartTime != -1) { return _fetchStartTime + crawlDelay; } return System.currentTimeMillis() + crawlDelay; } } /** getNextPending */ private synchronized CrawlTarget getNextPending(boolean removeItem) { CrawlTarget targetOut = null; if (_pending.size() != 0) { targetOut = _pending.getHead(); if (removeItem && targetOut != null) { _pending.removeElement(targetOut); } } return targetOut; } /** indicates if robots file need to be retrieved for the specified host */ public boolean robotsRetrieved() { return _robotsRetrieved; } /** */ private final int getCrawlDelay(boolean checkForOverride) { if (checkForOverride) { CrawlTarget potentialTarget = getNextPending(false); if (potentialTarget != null) { try { URL targetURL = new URL(potentialTarget.getActiveURL()); // validate against the server for crawl delay //LOG.info("Checking Crawl Delay for url:" + targetURL.toString()); int overridenCrawlDelay = CrawlerServer.getServer().checkForCrawlRateOverride(targetURL); if (overridenCrawlDelay != -1) { if (Environment.detailLogEnabled()) LOG.info("### CRAWLDELAY - Overriding Crawl Delay for URL:" + targetURL + " Delay is:" + overridenCrawlDelay ); return overridenCrawlDelay; } } catch (MalformedURLException e) { } } } int crawlDelayOut = 0; if (_ruleSet == null || _ruleSet.getCrawlDelay() == -1) { crawlDelayOut += getHost().getCrawlDelay(); } else { crawlDelayOut += (int)Math.min(_ruleSet.getCrawlDelay(),MAX_CRAWL_DELAY); crawlDelayOut = Math.max(MIN_CRAWL_DELAY, crawlDelayOut); } if (_lastRequestWasIOException) { crawlDelayOut += IOEXCEPTION_TIMEOUT_BOOST; } return crawlDelayOut; } /** clear the host's state **/ public synchronized void clear() { _pending.removeAll(); _scheduled = null; _diskRequestPending = false; _offlineTargetCount = 0; } public synchronized void dumpDetailsToHTML(StringBuffer sb){ // synchronized (_pending) { sb.append("ListName:" + getListName() + "\n"); sb.append("RobotsRetrieved:" + _robotsRetrieved + "\n"); sb.append("Disposition:" + _disposition + "\n"); sb.append("Scheduled:" + ((_scheduled != null)?_scheduled.getOriginalURL() : "null") + "\n"); sb.append("PendingCount:" + _pending.size() + "\n"); sb.append("QueuedCount:" + _queued.size() + "\n"); sb.append("OfflineCount:" + _offlineTargetCount +"\n"); sb.append("ActiveConnection:" + _activeConnection +"\n"); sb.append("LastFetchedRobotsHost:" + _lastFetchedRobotsHostName +"\n"); if (_pending.size() != 0) { sb.append("next 100 scheduled urls:\n"); int itemCount =0; CrawlTarget target = _pending.getHead(); while (target != null) { sb.append("["+(itemCount++)+"]:<a href='" + target.getOriginalURL() +"'>" + target.getOriginalURL() + "</a>\n"); target = target.getNext(); } } sb.append("\n\nLastFetchedRobotsData:\n\n"); if (_lastFetchedRobotsData != null) { sb.append(_lastFetchedRobotsData); sb.append("\n"); } 
    // }
  }

  /**************
   * Disk Operation Support
   */

  public static int getPendingDiskOperationCount() {
    return _diskOperationQueue.size();
  }

  public static void stopDiskQueueingThread() {
    if (_diskOperationThread != null) {
      _diskOpThreadShuttingDown = true;
      LOG.info("shutting down Disk Queue Thread - sending null item to queue");
      _diskOperationQueue.add(new DiskQueueEntry(null,false));
      try {
        LOG.info("Waiting for Disk Queue Thread to Die");
        _diskOperationThread.join();
        LOG.info("Done Waiting for Disk Queue Thread");
      } catch (InterruptedException e) {
        e.printStackTrace();
      }
      _diskOpThreadShuttingDown = false;
      _diskOperationThread = null;
    }
  }

  public static void startDiskQueueingThread(final EventLoop serverEventLoop,final File baseStoragePath) {
    // figure out
    // and finally start the blocking writer thread ...
    _diskOperationThread = new Thread(new Runnable() {

      public void run() {
        for (;;) {
          try {
            DiskQueueEntry entry = _diskOperationQueue.take();

            // if buffer item is null... this is considered an eof condition ... break out ...
            if (entry.getListItem() == null) {
              LOG.info("### DiskThread:Received Null Item ... Shutting down CrawlDomain Disk Queue Thread");
              // no matter what ... break out ...
              break;
            }
            // otherwise .. figure out what to do with the domain ...
            else {
              if (_diskOpThreadShuttingDown == false) {
                final CrawlList domain = entry.getListItem();
                try {
                  if (Environment.detailLogEnabled())
                    LOG.info("### DiskThread: Got List:" + domain.getListName());

                  // build a hierarchical path for the given domain id ...
                  File logFilePath = null;
                  String listName = null;
                  synchronized(domain) {
                    logFilePath = FileUtils.buildHierarchicalPathForId(baseStoragePath,domain.getUniqueListId());
                    listName = domain.getListName();
                  }
                  // get the immediate parent directory ...
                  File parentDirectory = logFilePath.getParentFile();
                  // and recursively create the directory chain (if necessary).
                  parentDirectory.mkdirs();

                  IntrusiveList<CrawlTarget> flushList = null;
                  int desiredLoadAmount = 0;

                  boolean truncateFile = false;
                  synchronized(domain) {
                    if (domain._offlineTargetCount == 0) {
                      truncateFile = true;
                    }
                  }

                  if (truncateFile && logFilePath.exists()) {
                    if (Environment.detailLogEnabled())
                      LOG.info("### DiskThread: Truncating Existing Log File for List:" + listName);
                    LogFileHeader header = new LogFileHeader();
                    RandomAccessFile file = new RandomAccessFile(logFilePath,"rw");
                    try {
                      writeLogFileHeader(file,header);
                    } finally {
                      file.close();
                    }
                  }

                  // now lock access to the domain's pending queue
                  synchronized(domain) {
                    // if a disk request was pending ...
                    if (domain._diskRequestPending) {
                      // reset disk request pending flag here to prevent race condition ...
                      domain._diskRequestPending = false;

                      // figure out what action to take with respect to the domain ...
                      // if list count exceeds flush threshold
                      if (domain._pending.size() >= DISK_FLUSH_THRESHOLD || domain._queued.size() != 0) {

                        if (domain._queued.size() == 0) {
                          LinkedList<CrawlTarget> candidates = new LinkedList<CrawlTarget>();

                          for (CrawlTarget candidate : domain._pending) {
                            if ((candidate.getFlags() & CrawlURL.Flags.IsHighPriorityURL) == 0) {
                              // add candidates in proper order ...
                              candidates.add(candidate);
                            }
                          }

                          // if there are low priority candidates we can flush ...
                          if (candidates.size() != 0) {
                            // create a new flush list ...
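                            // Note: candidates were collected in pending (head to tail) order; iterating the
                            // reversed list below and re-adding via addHead trims targets from the tail of
                            // _pending while keeping flushList in the original order.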
                            flushList = new IntrusiveList<CrawlTarget>();

                            // reverse candidate list and start removing items from pending
                            for (CrawlTarget candidate : Lists.reverse(candidates)) {
                              domain._pending.removeElement(candidate);
                              flushList.addHead(candidate);
                              // if we are back to ideal target count bail ...
                              if (domain._pending.size() <= IDEAL_TARGET_COUNT)
                                break;
                            }

                            if (Environment.detailLogEnabled())
                              LOG.info("### DiskThread: List:" + domain.getListName() + " Created FetchList FROM PENDING of Size:" + flushList.size());
                            // increment offline target count ...
                            domain._offlineTargetCount += flushList.size();
                          }
                        }
                        else {
                          flushList = domain._queued.detach(domain._queued.getHead());
                          if (Environment.detailLogEnabled())
                            LOG.info("### DiskThread: List:" + domain.getListName() + " Created FetchList FROM QUEUED of Size:" + flushList.size());
                          // increment offline target count ...
                          domain._offlineTargetCount += flushList.size();
                        }
                        /*
                        // walk one past IDEAL target item count...
                        int i=0;
                        CrawlTarget target = domain._pending.getHead();
                        while (i<IDEAL_TARGET_COUNT) {
                          target = target.getNext();
                          ++i;
                        }
                        // and extract a sub-list starting at the target ...
                        flushList = domain._pending.detach(target);
                        */
                        //and immediately update offline target count in domain ...
                        //domain._offlineTargetCount += flushList.size();
                      }
                      // otherwise ...
                      else {
                        // check queued size ...
                        if (domain._queued.size() != 0) {
                          // if nothing is offline and pending size <= DISK_LOAD_THRESHOLD
                          if (domain._offlineTargetCount == 0 && domain._pending.size() <= DISK_LOAD_THRESHOLD) {
                            if (Environment.detailLogEnabled())
                              LOG.info("### DiskThread: Moving Items from Queued List to Pending List for CrawlList:" + domain.getListName());
                            // move over items from queued to pending
                            while (domain._queued.getHead() != null) {
                              domain._pending.addTail(domain._queued.removeHead());
                              if (domain._pending.size() == (DISK_FLUSH_THRESHOLD - 1))
                                break;
                            }
                          }
                          // now if the queued list has reached the ideal target count ... flush it to disk
                          if (domain._queued.size() >= IDEAL_TARGET_COUNT) {
                            if (Environment.detailLogEnabled())
                              LOG.info("### DiskThread: Queued Size Exceeds Flush Threshold. Flushing to Disk for CrawlList:" + domain.getListName());
                            // extract a sub-list starting at head of queued list
                            flushList = domain._queued.detach(domain._queued.getHead());
                            //and immediately update offline target count in domain ...
                            domain._offlineTargetCount += flushList.size();
                          }
                        }

                        // check to see if a load is desired ...
                        if (domain._pending.size() <= DISK_LOAD_THRESHOLD) {
                          // calculate load amount ...
                          desiredLoadAmount = IDEAL_TARGET_COUNT - domain._pending.size();
                        }
                      }
                    }
                    else {
                      if (Environment.detailLogEnabled())
                        LOG.info("### DiskThread: Skipping List:" + domain.getListName());
                    }
                  }

                  // now figure out what to do ...
                  if (flushList != null) {
                    if (Environment.detailLogEnabled())
                      LOG.info("### DiskThread: Flushing " + flushList.size() + " Items To Disk for Domain:" + domain.getListName());
                    // flush crawl targets to disk ...
                    appendTargetsToLogFile(logFilePath,flushList);
                    // clear list ...
                    flushList.removeAll();
                  }

                  // ... if load is desired ...
                  if (desiredLoadAmount != 0) {
                    IntrusiveList<CrawlTarget> loadList = new IntrusiveList<CrawlTarget>();

                    int loadCount = readTargetsFromLogFile(domain,logFilePath,desiredLoadAmount,loadList);

                    // if (Environment.detailLogEnabled()) LOG.info("### DiskThread:Disk Queue Loaded: " + loadCount + "Items To Disk for Domain:" + domain.getListName());

                    if (loadCount != 0) {
                      // time to lock domain again ...
                      synchronized(domain) {
                        // and reduce offline count ...
                        domain._offlineTargetCount -= loadList.size();
                        // load new items into domain's list ...
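                        // Note: _offlineTargetCount tracks targets still stored in the on-disk log;
                        // it was decremented above, and the freshly loaded targets rejoin the
                        // in-memory pending list below.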
                        domain._pending.attach(loadList);
                      }
                    }
                  }
                }
                catch (IOException e) {
                  LOG.error("### DiskThread:" + CCStringUtils.stringifyException(e));
                }
              }
            }
          }
          catch (InterruptedException e) {
          }
          catch (Exception e) {
            LOG.fatal("### DiskThread: Encountered Unhandled Exception:" + CCStringUtils.stringifyException(e));
          }
        }
        LOG.info("### DiskThread: Exiting CrawlDomain Disk Queue Thread");
      }
    });

    // launch the writer thread ...
    _diskOperationThread.start();
  }

  /** fixed size header at the start of each per-list log file
   *  (layout: magic int, version int, active version timestamp long, read cursor long, write cursor long, item count int)
   **/
  private static class LogFileHeader {

    public static final int LogFileHeaderBytes = 0xCC00CC00;
    public static final int LogFileVersion = 1;

    public LogFileHeader() {
      _readPos = 0;
      _writePos = 0;
      _itemCount = 0;
    }

    public long _readPos;
    public long _writePos;
    public int _itemCount;

    public void writeHeader(DataOutput stream) throws IOException {
      stream.writeInt(LogFileHeaderBytes);
      stream.writeInt(LogFileVersion);
      stream.writeLong(_diskHeaderActiveVersionTimestamp);
      stream.writeLong(_readPos);
      stream.writeLong(_writePos);
      stream.writeInt(_itemCount);
    }

    public void readHeader(DataInput stream) throws IOException {
      int headerBytes = stream.readInt();
      int version = stream.readInt();
      long timestamp = stream.readLong();
      // reject the file if either the magic number or the version doesn't match
      if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
        throw new IOException("Invalid CrawlLog File Header Detected!");
      }
      _readPos = stream.readLong();
      _writePos = stream.readLong();
      _itemCount = stream.readInt();
      // if timestamps don't match ...
      if (timestamp != _diskHeaderActiveVersionTimestamp) {
        // then reset cursors .. everything in the file is invalid ...
        _writePos = 0;
        _readPos = 0;
        _itemCount = 0;
      }
    }
  }

  private static final class CustomByteArrayOutputStream extends ByteArrayOutputStream {
    public CustomByteArrayOutputStream(int initialSize) {
      super(initialSize);
    }
    public byte[] getBuffer() {
      return buf;
    }
  }

  private static void appendTargetsToLogFile(File logFileName,IntrusiveList<CrawlTarget> list) throws IOException {

    LogFileHeader header = new LogFileHeader();

    boolean preExistingHeader = logFileName.exists();

    RandomAccessFile file = new RandomAccessFile(logFileName,"rw");

    try {
      long headerOffset = 0;

      if (preExistingHeader) {
        headerOffset = readLogFileHeader(file, header);

        if (header._writePos == 0) {
          file.seek(headerOffset);
        }
        else {
          // seek to the appropriate write position
          file.seek(header._writePos);
        }
      }
      else {
        headerOffset = writeLogFileHeader(file,header);
      }

      CustomByteArrayOutputStream bufferOutputStream = new CustomByteArrayOutputStream(1 << 17);
      DataOutputStream dataOutputStream = new DataOutputStream(bufferOutputStream);
      CRC32 crc = new CRC32();

      for (CrawlTarget target : list) {
        PersistentCrawlTarget persistentTarget = target.createPersistentTarget();

        bufferOutputStream.reset();
        // write to intermediate stream ...
        persistentTarget.write(dataOutputStream);
        // and crc the data ...
        crc.reset();
        crc.update(bufferOutputStream.getBuffer(),0,bufferOutputStream.size());
        // write out length first
        file.writeInt(bufferOutputStream.size());
        // crc next
        long computedValue = crc.getValue();
        // TODO: waste of space - write 32 bit values as long because of problems with java sign promotion rules during read...
        file.writeLong(computedValue);
        // and then the data
        file.write(bufferOutputStream.getBuffer(),0,bufferOutputStream.size());
      }

      // now update header ...
      header._itemCount += list.size();
      header._writePos = file.getFilePointer();

      // now write out header anew ...
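      // the header lives at offset zero, so this call seeks back to the start of the file and
      // persists the updated write cursor and item count for subsequent readers and appenders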
      writeLogFileHeader(file,header);
    }
    finally {
      if (file != null) {
        file.close();
      }
    }
  }

  private static int readTargetsFromLogFile(CrawlList domain,File logFileName,int desiredReadAmount,IntrusiveList<CrawlTarget> targetsOut) throws IOException {

    int itemsRead = 0;

    if (logFileName.exists()) {

      RandomAccessFile file = new RandomAccessFile(logFileName,"rw");

      LogFileHeader header = new LogFileHeader();

      try {
        long headerOffset = readLogFileHeader(file, header);

        // seek to the appropriate read position
        if (header._readPos != 0)
          file.seek(header._readPos);

        int itemsToRead = Math.min(desiredReadAmount, header._itemCount);

        PersistentCrawlTarget persistentTarget = new PersistentCrawlTarget();
        CRC32 crc = new CRC32();
        CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 16);

        for (int i=0;i<itemsToRead;++i) {
          // read length ...
          int urlDataLen = file.readInt();
          long urlDataCRC = file.readLong();

          buffer.reset();
          if (urlDataLen > buffer.getBuffer().length) {
            buffer = new CustomByteArrayOutputStream( ((urlDataLen / 65536) + 1) * 65536 );
          }

          // use readFully - a plain read could return fewer bytes than requested
          file.readFully(buffer.getBuffer(), 0, urlDataLen);

          crc.reset();
          crc.update(buffer.getBuffer(), 0, urlDataLen);

          long computedValue = crc.getValue();

          // validate crc values ...
          if (computedValue != urlDataCRC) {
            throw new IOException("Crawl Target Log File Corrupt");
          }
          else {
            // populate a persistentTarget from the (in memory) data stream
            DataInputStream bufferReader = new DataInputStream(new ByteArrayInputStream(buffer.getBuffer(),0,urlDataLen));

            persistentTarget.clear();
            persistentTarget.readFields(bufferReader);

            // populate a new crawl target structure ...
            CrawlTarget newTarget = new CrawlTarget(domain,persistentTarget);

            targetsOut.addTail(newTarget);
          }
        }

        itemsRead = itemsToRead;

        // now update header ...
        header._itemCount -= itemsRead;
        // now if item count is non zero ...
        if (header._itemCount != 0) {
          // set read cursor to next record location
          header._readPos = file.getFilePointer();
        }
        // otherwise ...
        else {
          // reset both cursors ...
          header._readPos = 0;
          header._writePos = 0;
        }

        // now write out header anew ...
        writeLogFileHeader(file,header);
      }
      catch (IOException e) {
        LOG.fatal("Encountered Exception Reading From Offline Queue for LogFile:" + logFileName + ". Truncating Queue! \n" + CCStringUtils.stringifyException(e));
        header._itemCount = 0;
        header._readPos = 0;
        header._writePos = 0;
        writeLogFileHeader(file,header);
      }
      finally {
        if (file != null) {
          file.close();
        }
      }
    }
    return itemsRead;
  }

  private static long writeLogFileHeader(RandomAccessFile file, LogFileHeader header) throws IOException {
    // set the position at zero ..
    file.seek(0);
    // and write header to disk ...
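    // returning file.getFilePointer() below gives callers the offset of the first byte past the
    // fixed-size header (the start of the record area)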
    header.writeHeader(file);
    // took sync out because it was becoming a severe bottleneck
    // file.getFD().sync();
    return file.getFilePointer();
  }

  private static long readLogFileHeader(RandomAccessFile file,LogFileHeader header) throws IOException {
    file.seek(0);
    header.readHeader(file);
    return file.getFilePointer();
  }

  public static class CrawlDomainTester {

    public static void main(String[] args) {
      try {
        testGetNextItemCode();
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }

    @Test
    public static void testGetNextItemCode() throws Exception {
      /*
      CrawlListHost host = new CrawlListHost(null,1);
      CrawlList list = host.getCrawlList(1);

      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.redirecttest.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.redirecttest.com/foobar"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://www.blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://foo.blogger.com"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://failed.domain/bar"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://failed.domain/zzz"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"http://####/foo/zzz"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,"garbage"),false);
      list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list,""),false);

      list._offlineTargetCount = 100;

      CrawlTarget target = null;

      NIOHttpHeaders headers = new NIOHttpHeaders();
      headers.add(null, "HTTP1.1 200 OK");

      while (list.getDisposition() != CrawlList.Disposition.QueueEmpty) {
        if (list.getDisposition() == CrawlList.Disposition.ItemAvailable) {
          target = list.getNextTarget();
          if (target == null)
            System.out.println("Target:NULL");
          else
            System.out.println("Target:" + target.getOriginalURL());
        }
        else if (list.getDisposition() == CrawlList.Disposition.WaitingOnCompletion) {
          if (target != null) {
            if (target.getActiveURL().startsWith("http://www.redirecttest.com")) {
              target.setRedirectURL(target.getActiveURL().replaceFirst("http://www.redirecttest.com", "http://redirecttest.com"));
              target.setFlags(target.getFlags() | CrawlURL.Flags.IsRedirected);
            }
            list.fetchStarted(target);
            if (target.getActiveURL().startsWith("http://failed.domain")) {
              list.getActiveDomain()._domainFailed = true;
            }
            list.fetchSucceeded(target,0, headers, null);
            target = null;
          }
          else {
            list._disposition = Disposition.WaitingOnTime;
          }
        }
        else if (list.getDisposition() == CrawlList.Disposition.WaitingOnTime) {
          System.out.println("clearing WaitState");
          list.clearWaitState();
        }
      }
      */
    }

    //@Test
    public void testDiskQueue() throws Exception {
      String hostName = "poodleskirtcentral.com";
      long domainFP = URLFingerprint.generate64BitURLFPrint(hostName);
      File logFilePath = FileUtils.buildHierarchicalPathForId(new File("/foo"),domainFP);
      System.out.println(logFilePath.getAbsolutePath());
    }

    /*
    //@Test
    public void testDiskWriter() throws Exception {
      // initialize ...
Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("hadoop-default.xml"); conf.addResource("hadoop-site.xml"); conf.addResource("commoncrawl-default.xml"); conf.addResource("commoncrawl-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("file:///"); CrawlEnvironment.setCrawlSegmentDataDirectory("./tests/crawlSegmentSamples/"); EventLoop eventLoop = new EventLoop(); eventLoop.start(); DNSCache cache = new DNSCache() { public DNSResult resolveName(CrawlSegmentHost host) { return null; } }; CrawlList.DISK_FLUSH_THRESHOLD = 5; CrawlList.DISK_LOAD_THRESHOLD = 2; CrawlList.IDEAL_TARGET_COUNT = 3; File basePath = new File("./data/diskQueueTest"); basePath.mkdir(); CrawlList.startDiskQueueingThread(eventLoop,basePath); CrawlSegmentDetail detailCC06 = SegmentLoader.loadCrawlSegment(1,1, "cc06",null, cache,null,null); CrawlListHost host = new CrawlListHost(null,0); int domainCount = 0; CrawlList firstDomain = null; for (CrawlSegmentHost segmentHost: detailCC06.getHosts()) { CrawlList domain = new CrawlList(host,segmentHost.getListId()); if (domainCount==0) firstDomain = domain; if (domainCount ==0) System.out.println("Domain:" + domain.getListName() + " FP:" + domain.getListId()); for (CrawlSegmentURL segmentURL : segmentHost.getUrlTargets()) { if (domainCount ==0) System.out.println("\tAdding Target::" + segmentURL.getUrl()); CrawlTarget target = new CrawlTarget(1,domain,segmentHost,segmentURL); domain.addCrawlTarget(target, false); } if (++domainCount == 10) break; } while (true) { synchronized (firstDomain) { if (firstDomain._pending.size() < CrawlList.DISK_FLUSH_THRESHOLD) break; Thread.sleep(5000); } } while (true) { if (firstDomain.getDisposition() == CrawlList.Disposition.ItemAvailable) { CrawlTarget nextTarget = null; while ((nextTarget = firstDomain.getNextTarget()) != null) { System.out.println("Domain: "+ firstDomain.getListName() + " Got Target:" + nextTarget.getOriginalURL()); firstDomain.fetchStarted(nextTarget); firstDomain.fetchSucceeded(nextTarget, 0,null, null); if (firstDomain.getDisposition() == CrawlList.Disposition.WaitingOnTime) { firstDomain.clearWaitState(); } } } else { System.out.println("Domain Queue Empty ... Waiting"); Thread.sleep(5000); } } //CrawlDomain._diskOperationThread.join(); } */ } @Override public String toString() { return "List Id:" + _baseListId + " Name:" + _listName; } }