package guang.crawler.siteManager.daemon;

import guang.crawler.commons.WebURL;
import guang.crawler.siteManager.SiteConfig;
import guang.crawler.siteManager.SiteManager;
import guang.crawler.siteManager.jobQueue.MapQueue;
import guang.crawler.siteManager.jobQueue.MapQueueIterator;

import java.util.Date;
import java.util.TimerTask;

/**
 * Periodically cleans stale (timed-out) URLs out of the working list.
 *
 * @author yang
 *
 */
public class QueueCleannerDaemon extends TimerTask {

    /**
     * Creates a daemon wired to the site manager's task lists and the site
     * configuration's timeout and retry settings.
     */
    public static QueueCleannerDaemon newDaemon() {
        SiteManager siteManager = SiteManager.me();
        SiteConfig siteConfig = SiteConfig.me();
        return new QueueCleannerDaemon(siteManager.getWorkingTaskList(),
                                       siteManager.getToDoTaskList(),
                                       siteManager.getFailedTaskList(),
                                       siteConfig.getJobTimeout(),
                                       siteConfig.getJobTryTime());
    }

    private final MapQueue<WebURL> workingList;
    private final MapQueue<WebURL> todoList;
    private final MapQueue<WebURL> failedList;

    /**
     * Timeout in milliseconds; defaults to 5 minutes.
     */
    private long timeout = 300000;

    /**
     * Maximum number of retries.
     */
    private int tryTime = 3;

    /**
     * Creates a cleanup task.
     */
    private QueueCleannerDaemon(final MapQueue<WebURL> workingList,
                                final MapQueue<WebURL> todoList,
                                final MapQueue<WebURL> failedList,
                                final long timeout, final int tryTime) {
        this.workingList = workingList;
        this.todoList = todoList;
        this.failedList = failedList;
        if (timeout > 0) {
            this.timeout = timeout;
        }
        if (tryTime > 0) {
            this.tryTime = tryTime;
        }
    }

    @Override
    public void run() {
        if (SiteConfig.me().isBackupTime()) {
            // A backup is currently in progress, so skip this cleanup round.
            return;
        }
        long current = new Date().getTime();
        MapQueueIterator<WebURL> it = this.workingList.iterator();
        try {
            while (it.hasNext()) {
                WebURL webURL = it.next();
                if ((current - webURL.getStartTime()) > this.timeout) {
                    // The URL has timed out.
                    if (webURL.getTryTime() > this.tryTime) {
                        // The retry limit has been exceeded; move it to the failed list.
                        this.failedList.put(webURL);
                        it.remove();
                    } else {
                        // Retries remain; put it back on the to-do list so it can be crawled again.
                        this.todoList.put(webURL);
                        it.remove();
                    }
                }
            }
        } finally {
            it.close();
        }
    }
}