package com.brucezee.jspider; import com.brucezee.jspider.common.Config; import com.brucezee.jspider.common.ThreadPool; import com.brucezee.jspider.common.utils.SpiderIOUtils; import com.brucezee.jspider.common.utils.SpiderStrUtils; import com.brucezee.jspider.common.utils.SpiderTimeUtils; import com.brucezee.jspider.downloader.CookieStorePool; import com.brucezee.jspider.downloader.Downloader; import com.brucezee.jspider.downloader.httpclient.DefaultHttpClientPool; import com.brucezee.jspider.downloader.httpclient.HttpClientDownloader; import com.brucezee.jspider.downloader.httpclient.HttpClientFactory; import com.brucezee.jspider.downloader.httpclient.HttpClientPool; import com.brucezee.jspider.downloader.proxy.HttpProxyPool; import com.brucezee.jspider.pipeline.ConsolePipeline; import com.brucezee.jspider.pipeline.Pipeline; import com.brucezee.jspider.processor.PageProcessor; import com.brucezee.jspider.scheduler.QueueScheduler; import com.brucezee.jspider.scheduler.Scheduler; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; /** * 爬虫任务 * Created by brucezee on 2017/1/6. */ public class Spider implements Runnable, Task { private static final Logger logger = LoggerFactory.getLogger(Spider.class); @Override public void run() { init(); working(); cleanup(); } @Override public String getUUID() { return spiderConfig.getUUID(); } @Override public void start() { //异步执行 runAsync(); } @Override public void stop() { if (stat.compareAndSet(Status.Running.getValue(), Status.Stopping.getValue())) { logger.info("Trying to stop Spider " + getUUID() + "."); } else { logger.info("Failed stopping Spider " + getUUID() + "!"); } } protected PageProcessor pageProcessor; //页面解析器 protected Downloader downloader; //请求任务下载器 protected Scheduler scheduler; //请求任务调度器 protected Pipeline pipeline; //结果集处理器 protected SiteConfig siteConfig; //网络请求配置 protected SpiderConfig spiderConfig; //爬虫任务配置 protected HttpClientPool httpClientPool; //HttpClient池 protected HttpProxyPool httpProxyPool; //Http代理池 protected CookieStorePool cookieStorePool; //CookieStore池 protected String uuid; //爬虫任务id protected int threadCount; //爬虫任务线程池大小 protected ThreadPool threadPool; //线程池 private ReentrantLock lock = new ReentrantLock(); //锁 private Condition condition = lock.newCondition(); //锁对应条件 protected AtomicInteger stat = new AtomicInteger(Status.Init.getValue()); //爬虫状态 private List<SpiderListener> spiderListeners; //爬虫任务监听器列表 private List<Request> startRequests; //初始化请求任务 private Date startTime; //爬虫任务开始时间 public static Spider create(String uuid, int threadCount, PageProcessor pageProcessor) { return create(SpiderConfig.create(uuid, threadCount), SiteConfig.create(), pageProcessor); } public static Spider create(PageProcessor pageProcessor) { return create(SpiderConfig.create(SpiderStrUtils.getSpiderUUID(), Config.DEFAULT_THREAD_COUNT), SiteConfig.create(), pageProcessor); } public static Spider create(SpiderConfig spiderConfig, SiteConfig siteConfig, PageProcessor pageProcessor) { return new Spider(spiderConfig, siteConfig, pageProcessor); } public static Spider create() { return create(null); } public Spider(SpiderConfig spiderConfig, SiteConfig siteConfig, PageProcessor pageProcessor) { this.spiderConfig = spiderConfig; this.siteConfig = siteConfig; this.pageProcessor = pageProcessor; } public Spider setStartRequests(List<Request> startRequests) { checkIfRunning(); this.startRequests = startRequests; return this; } public Spider setScheduler(Scheduler scheduler) { checkIfRunning(); Scheduler oldScheduler = this.scheduler; this.scheduler = scheduler; if (oldScheduler != null) { Request request; while ((request = oldScheduler.poll(this)) != null) { this.scheduler.push(this, request); } } return this; } public Spider setPipeline(Pipeline pipeline) { checkIfRunning(); this.pipeline = pipeline; return this; } public Spider setSiteConfig(SiteConfig siteConfig) { checkIfRunning(); this.siteConfig = siteConfig; return this; } public Spider setSpiderConfig(SpiderConfig spiderConfig) { checkIfRunning(); this.spiderConfig = spiderConfig; return this; } public Spider setPageProcessor(PageProcessor pageProcessor) { checkIfRunning(); this.pageProcessor = pageProcessor; return this; } public Spider setDownloader(Downloader downloader) { checkIfRunning(); this.downloader = downloader; return this; } public Spider setHttpProxyPool(HttpProxyPool httpProxyPool) { checkIfRunning(); this.httpProxyPool = httpProxyPool; return this; } public Spider setCookieStorePool(CookieStorePool cookieStorePool) { checkIfRunning(); this.cookieStorePool = cookieStorePool; return this; } public Spider setHttpClientPool(HttpClientPool httpClientPool) { checkIfRunning(); this.httpClientPool = httpClientPool; return this; } public Spider addStartRequests(String... urls) { checkIfRunning(); if (startRequests == null) { startRequests = new ArrayList<Request>(urls.length); } for (String url : urls) { startRequests.add(new Request(url)); } return this; } public Spider addStartRequests(Request... requests) { checkIfRunning(); if (startRequests == null) { startRequests = new ArrayList<Request>(requests.length); } for (Request request : requests) { startRequests.add(request); } return this; } public Spider addTargetRequests(String... urls) { checkScheduler(); for (String url : urls) { scheduler.push(this, new Request(url)); } return this; } public Spider addTargetRequests(Request... requests) { checkScheduler(); for (Request request : requests) { scheduler.push(this, request); } return this; } public Spider setSpiderListeners(List<SpiderListener> spiderListeners) { this.spiderListeners = spiderListeners; return this; } public Spider addSpiderListeners(SpiderListener ... spiderListeners) { if (this.spiderListeners == null) { this.spiderListeners = new LinkedList<SpiderListener>(); } for (SpiderListener spiderListener : spiderListeners) { this.spiderListeners.add(spiderListener); } return this; } public Spider setUUID(String uuid) { this.uuid = uuid; return this; } public Spider setThreadCount(int threadCount) { this.threadCount = threadCount; return this; } private void checkScheduler() { if (scheduler == null) { throw new IllegalStateException("Scheduler is required."); } } public Date getStartTime() { return startTime; } public Scheduler getScheduler() { return scheduler; } public int getThreadAlive() { return threadPool == null ? 0 : threadPool.getThreadAlive(); } public Status getStatus() { return Status.parse(stat.get()); } public void runAsync() { Thread thread = new Thread(this); thread.setDaemon(false); thread.setName(getUUID()); thread.start(); } protected void checkIfRunning() { if (stat.get() == Status.Running.getValue()) { throw new IllegalStateException("Spider is already running!"); } } protected void init() { if (stat.get() != Status.Init.getValue() && stat.get() != Status.Stopped.getValue()) { throw new IllegalStateException("Spider " + getUUID() + " is busy now!"); } if (pageProcessor == null) { throw new IllegalArgumentException("PageProcessor is required!"); } if (threadCount > 0) { spiderConfig.setThreadCount(threadCount); } if (StringUtils.isNotBlank(uuid)) { spiderConfig.setUUID(uuid); } stat.set(Status.Running.getValue()); if (downloader == null) { if (httpClientPool == null) { httpClientPool = new DefaultHttpClientPool(new HttpClientFactory()); } downloader = new HttpClientDownloader(httpClientPool, httpProxyPool, cookieStorePool); } else if (downloader instanceof HttpClientDownloader) { HttpClientDownloader httpClientDownloader = (HttpClientDownloader) downloader; if (httpClientDownloader.getHttpProxyPool() != null) { httpProxyPool = httpClientDownloader.getHttpProxyPool(); } else if (httpProxyPool != null) { httpClientDownloader.setHttpProxyPool(httpProxyPool); } if (httpClientDownloader.getCookieStorePool() != null) { cookieStorePool = httpClientDownloader.getCookieStorePool(); } else if (cookieStorePool != null) { httpClientDownloader.setCookieStorePool(cookieStorePool); } if (httpClientDownloader.getHttpClientPool() != null) { httpClientPool = httpClientDownloader.getHttpClientPool(); } } if (pipeline == null) { pipeline = new ConsolePipeline(); } if (scheduler == null) { scheduler = new QueueScheduler(); } if (threadPool == null || threadPool.isShutdown()) { threadPool = new ThreadPool(spiderConfig.getThreadCount()); } if (startRequests != null && !startRequests.isEmpty()) { for (Request request : startRequests) { scheduler.push(this, request); } startRequests.clear(); } startTime = new Date(); logger.info("Spider {} started at {}.", getUUID(), SpiderTimeUtils.formatTime(startTime, Config.DATE_TIME_FORMAT)); } protected void working() { while (!Thread.currentThread().isInterrupted() && stat.get() == Status.Running.getValue()) { Request request = scheduler.poll(this); if (request != null) { threadPool.execute(new SpiderExecutor(request)); } else { if (isCompletedToExit()) { break; } waitNewRequest(); } } } private class SpiderExecutor implements Runnable { private Request request; public SpiderExecutor(Request request) { this.request = request; } @Override public void run() { Page page = null; try { page = downloader.download(siteConfig, request); if (page == null || !page.isSuccess()) { //download failed onError(request, page); return; } Result result = pageProcessor.process(request, page); if (result != null) { //no result pipeline.persist(request, result); } onSuccess(request, page, result); } catch (Exception e) { logger.error("Spider {} execute task exception {}", getUUID(), e); onError(request, page); } finally { try { if (page != null) { extractAndAddRequests(page); SpiderIOUtils.closeQuietly(page);//input stream page } if (httpProxyPool != null) { httpProxyPool.returnProxy(request, page != null ? page.getStatusCode() : 0); } } finally { signalNewRequest(); } } } } private void waitNewRequest() { try { lock.lock(); if (isCompletedToExit()) { return; } logger.warn("Spider {} is waiting for new request.", getUUID()); condition.await(spiderConfig.getEmptySleepMillis(), TimeUnit.MILLISECONDS); } catch (InterruptedException e) { logger.warn("Spider {} waiting for new request interrupted, error {}", getUUID(), e); } finally { lock.unlock(); } } private void signalNewRequest() { try { lock.lock(); condition.signalAll(); } finally { lock.unlock(); } } private boolean isCompletedToExit() { return spiderConfig.isExitWhenComplete() && getThreadAlive() <= 0; } protected void cleanup() { stat.set(Status.Stopping.getValue()); if (spiderConfig.isDestroyWhenExit()) { closeAll(); } stat.set(Status.Stopped.getValue()); Date now = new Date(); logger.info("Spider {} shutdown at {}.", getUUID(), SpiderTimeUtils.formatTime(now, Config.DATE_TIME_FORMAT)); } private void closeAll() { closeDelay(); SpiderIOUtils.closeQuietly(downloader); SpiderIOUtils.closeQuietly(scheduler); SpiderIOUtils.closeQuietly(pipeline); SpiderIOUtils.closeQuietly(pageProcessor); SpiderIOUtils.closeQuietly(threadPool); } private void closeDelay() { try { long closeDelayMillis = spiderConfig.getCloseDelayMillis(); while (getThreadAlive() > 0 && closeDelayMillis > 0) { logger.info("Spider {} active work left {}, waiting for shutdown.", getUUID(), getThreadAlive()); Thread.sleep(1000); closeDelayMillis -= 1000; } } catch (InterruptedException e) { } } protected void onError(Request request, Page page) { if (spiderListeners != null && !spiderListeners.isEmpty()) { for (SpiderListener spiderListener : spiderListeners) { try { spiderListener.onError(request, page); } catch (Exception e) { logger.error("Spider {} listener on error process exception {}", getUUID(), e); } } } } protected void onSuccess(Request request, Page page, Result result) { if (spiderListeners != null && !spiderListeners.isEmpty()) { for (SpiderListener spiderListener : spiderListeners) { try { spiderListener.onSuccess(request, page, result); } catch (Exception e) { logger.error("Spider {} listener on success process exception {}", getUUID(), e); } } } } protected void extractAndAddRequests(Page page) { List<Request> targetRequests = page.getTargetRequests(); if (CollectionUtils.isNotEmpty(targetRequests)) { for (Request request : targetRequests) { addTargetRequests(request); } } } public enum Status { Init(0), Running(1), Stopping(2), Stopped(3); Status(int value) { this.value = value; } private int value; int getValue() { return value; } public static Status parse(int value) { for (Status status : Status.values()) { if (status.getValue() == value) { return status; } } //default value return Init; } } }