package com.brucezee.jspider.scheduler; import com.brucezee.jspider.Request; import com.brucezee.jspider.Task; import com.brucezee.jspider.scheduler.handler.RepeatHandler; import com.brucezee.jspider.paging.PagingRequestFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.List; import java.util.concurrent.locks.ReentrantLock; /** * 具有去重功能的请求任务调度器 * Created by brucezee on 2017/1/7. */ public abstract class NoRepeatScheduler implements Scheduler, RepeatHandler { private Logger logger = LoggerFactory.getLogger(NoRepeatScheduler.class); private ReentrantLock lock = new ReentrantLock(); //锁 private RepeatHandler repeatHandler; //请求去重处理器 private PagingRequestFactory pagingRequestFactory; //分页请求任务生成器 /** * 构造方法,默认没有实现去重 */ public NoRepeatScheduler() { this(null, null); } /** * 构造方法 * @param repeatHandler 请求去重处理器 */ public NoRepeatScheduler(RepeatHandler repeatHandler) { this(repeatHandler, null); } /** * 构造方法 * @param repeatHandler 请求去重处理器 * @param pagingRequestFactory 分页请求任务生成器 */ public NoRepeatScheduler(RepeatHandler repeatHandler, PagingRequestFactory pagingRequestFactory) { this.repeatHandler = repeatHandler; this.pagingRequestFactory = pagingRequestFactory; } @Override public boolean push(Task task, Request request) { if (shouldReserved(task, request)) { pushWhenNoRepeat(task, request); logger.debug("Push to queue {}", request.key()); return true; } if (!isDuplicate(task, request)) { pushWhenNoRepeat(task, request); addRepeatCheck(task, request); logger.debug("Push to queue {}", request.key()); return true; } return false; } /** * 控制任务添加 * @param task 任务 * @param request 请求 * @return 是否需要强制添加 */ protected boolean shouldReserved(Task task, Request request) { return false; } /** * 去重后添加到任务队列 * @param task 任务 * @param request 请求 */ protected abstract void pushWhenNoRepeat(Task task, Request request); @Override public boolean isDuplicate(Task task, Request request) { if (repeatHandler != null) { return repeatHandler.isDuplicate(task, request); } return false; } @Override public void addRepeatCheck(Task task, Request request) { if (repeatHandler != null) { repeatHandler.addRepeatCheck(task, request); } } @Override public void resetAllRepeatCheck(Task task) { if (repeatHandler != null) { repeatHandler.resetAllRepeatCheck(task); } } @Override public void resetRequestRepeatCheck(Task task, Request request) { if (repeatHandler != null) { repeatHandler.resetRequestRepeatCheck(task, request); } } @Override public Request poll(Task task) { //获取请求任务 Request request = doPoll(task); if (request == null) { //如果任务为空,根据需要重新添加任务 if (handleEmptyPoll(task)) { //再次获取 request = doPoll(task); } } return request; } /** * 获取请求任务 * @param task 爬虫任务 * @return 请求任务 */ protected abstract Request doPoll(Task task); /** * 如果获取任务返回为空,是否处理(比如添加新的任务等) * @param task 爬虫任务 * @return 处理返回true,不处理返回false。 */ protected boolean handleEmptyPoll(Task task) { if (pagingRequestFactory != null) { try { lock.lock(); List<Request> requests = pagingRequestFactory.getRequests(task); if (requests != null && !requests.isEmpty()) { boolean success = false; for (Request request : requests) { success = push(task, request) || success; } return success; } } finally { lock.unlock(); } } return false; } public RepeatHandler getRepeatHandler() { return repeatHandler; } public void setRepeatHandler(RepeatHandler repeatHandler) { this.repeatHandler = repeatHandler; } public PagingRequestFactory getPagingRequestFactory() { return pagingRequestFactory; } public void setPagingRequestFactory(PagingRequestFactory pagingRequestFactory) { this.pagingRequestFactory = pagingRequestFactory; } }