package com.geccocrawler.gecco.spider;

import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ThreadLocalRandom;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.downloader.AfterDownload;
import com.geccocrawler.gecco.downloader.BeforeDownload;
import com.geccocrawler.gecco.downloader.Downloader;
import com.geccocrawler.gecco.downloader.DownloadException;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.response.HttpResponse;
import com.geccocrawler.gecco.scheduler.Scheduler;
import com.geccocrawler.gecco.scheduler.UniqueSpiderScheduler;
import com.geccocrawler.gecco.spider.render.Render;

/**
 * A crawler engine can contain several spiders; each spider runs as its own
 * thread. A spider pulls pending requests from a {@link Scheduler}, downloads
 * the page, renders it into the matching SpiderBean and pushes the bean
 * through the configured pipelines.
 *
 * @author huchengyi
 */
public class Spider implements Runnable {

	private static final Log log = LogFactory.getLog(Spider.class);

	// Latch the worker thread parks on while paused; (re)created by pause().
	private CountDownLatch pauseCountDown;

	// volatile: written by controller threads, read by the worker loop.
	private volatile boolean stop;

	private volatile boolean pause;

	private GeccoEngine engine;

	// Spider-local scheduler holding sub-requests (e.g. redirect targets).
	private Scheduler spiderScheduler;

	/**
	 * The SpiderBean class matched for the request currently being processed.
	 */
	public Class<? extends SpiderBean> currSpiderBeanClass;

	/**
	 * Creates a spider bound to the given engine, with a fresh de-duplicating
	 * spider-local scheduler and in the running (not paused, not stopped) state.
	 *
	 * @param engine the owning crawler engine
	 */
	public Spider(GeccoEngine engine) {
		this.engine = engine;
		this.spiderScheduler = new UniqueSpiderScheduler();
		this.pause = false;
		this.stop = false;
	}

	/**
	 * Main crawl loop: fetch a request, download, render, run pipelines;
	 * repeats until stopped or both schedulers are drained.
	 */
	public void run() {
		// Expose this spider via a thread-local so collaborators running on
		// this thread can reach it later.
		SpiderThreadLocal.set(this);
		while(true) {
			// Stop requested: tell the engine this worker is done and exit.
			if(stop) {
				engine.notifyComplete();
				break;
			}
			// Paused: park until restart() opens the latch. The request in
			// flight (if any) has already completed at this point.
			if(pause) {
				try {
					this.pauseCountDown.await();
				} catch (InterruptedException e) {
					// Restore the interrupt status instead of swallowing it.
					Thread.currentThread().interrupt();
					log.error("can't pause : ", e);
				}
			}
			// Next URL: prefer the spider-local (sub-request) scheduler, then
			// fall back to the engine's start-request scheduler.
			boolean start = false;
			HttpRequest request = spiderScheduler.out();
			if(request == null) {
				request = engine.getScheduler().out();
				if(request == null) {
					// Both schedulers drained: tell the engine and exit.
					engine.notifyComplete();
					break;
				}
				start = true;
			}
			if(log.isDebugEnabled()) {
				log.debug("match url : " + request.getUrl());
			}
			// Match the request against the registered SpiderBeans.
			currSpiderBeanClass = engine.getSpiderBeanFactory().matchSpider(request);
			HttpResponse response = null;
			try {
				if(currSpiderBeanClass == null) {
					// No bean matches. Still follow 301/302 redirects so the
					// redirect target gets a chance to match on a later pass.
					response = defaultDownload(request);
					if(response.getStatus() == 302 || response.getStatus() == 301) {
						spiderScheduler.into(request.subRequest(response.getContent()));
					} else {
						log.error("can't match url : " + request.getUrl());
					}
				} else {
					// Context bundles downloader, before/after hooks, render and pipelines.
					SpiderBeanContext context = getSpiderBeanContext();
					response = download(context, request);
					if(response.getStatus() == 200) {
						Render render = context.getRender();
						SpiderBean spiderBean = render.inject(currSpiderBeanClass, request, response);
						pipelines(spiderBean, context);
					} else if(response.getStatus() == 302 || response.getStatus() == 301) {
						spiderScheduler.into(request.subRequest(response.getContent()));
					}
				}
			} catch(Exception ex) {
				// Debug mode: full stack trace. Otherwise a single terse
				// summary line (previously this logged twice in debug mode
				// and dropped the space before the message).
				if(engine.isDebug()) {
					log.error(request.getUrl() + " ERROR : ", ex);
				} else {
					log.error(request.getUrl() + " ERROR : " + ex.getClass().getName() + " " + ex.getMessage());
				}
			} finally {
				// Always release the response's underlying resources.
				if(response != null) {
					response.close();
				}
			}
			// Politeness delay between downloads.
			interval();
			// A start request is re-queued at the tail when looping is enabled.
			if(start && engine.isLoop()) {
				engine.getScheduler().into(request);
			}
		}
	}

	/**
	 * Pauses crawling. The request currently being processed completes; the
	 * loop then parks until {@link #restart()} is called.
	 */
	public void pause() {
		// Latch is created before the flag is raised; the volatile write to
		// pause publishes the latch to the worker thread.
		this.pauseCountDown = new CountDownLatch(1);
		this.pause = true;
	}

	/**
	 * Resumes crawling after a {@link #pause()}. Safe to call when the spider
	 * was never paused (no-op instead of the previous NullPointerException).
	 */
	public void restart() {
		// Clear the flag first so the worker does not loop back into await().
		this.pause = false;
		CountDownLatch latch = this.pauseCountDown;
		if(latch != null) {
			latch.countDown();
		}
	}

	/**
	 * Requests the crawl loop to stop; takes effect at the top of the next
	 * iteration.
	 */
	public void stop() {
		this.stop = true;
	}

	/**
	 * Feeds the rendered bean to every configured pipeline, in order.
	 * A null bean (render failure) is silently skipped.
	 *
	 * @param spiderBean the rendered bean, possibly null
	 * @param context    the bean's context holding its pipelines
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	private void pipelines(SpiderBean spiderBean, SpiderBeanContext context) {
		if(spiderBean == null) {
			return;
		}
		List<Pipeline> pipelines = context.getPipelines();
		if(pipelines != null) {
			for(Pipeline pipeline : pipelines) {
				pipeline.process(spiderBean);
			}
		}
	}

	/**
	 * Sleeps for a randomized politeness interval; no-op when the engine's
	 * configured interval is not positive.
	 */
	private void interval() {
		int interval = engine.getInterval();
		if(interval > 0) {
			try {
				Thread.sleep(randomInterval(interval));
			} catch (InterruptedException e) {
				// Re-assert the interrupt instead of swallowing it, so a
				// stop/shutdown signal is not lost.
				Thread.currentThread().interrupt();
			}
		}
	}

	/**
	 * Download without a SpiderBean context, using the engine's default
	 * downloader (used for unmatched URLs, e.g. to follow redirects).
	 *
	 * @param request the request to download
	 * @return the HTTP response
	 * @throws DownloadException if the download fails
	 */
	private HttpResponse defaultDownload(HttpRequest request) throws DownloadException {
		return download(null, request);
	}

	/**
	 * Downloads a request, applying the context's downloader, timeout and
	 * before/after hooks when a context is given; otherwise falls back to the
	 * engine's default downloader with a 1000 ms timeout.
	 *
	 * @param context the matched bean's context, or null for the default path
	 * @param request the request to download
	 * @return the HTTP response
	 * @throws DownloadException if the download fails
	 */
	private HttpResponse download(SpiderBeanContext context, HttpRequest request) throws DownloadException {
		Downloader currDownloader = null;
		BeforeDownload before = null;
		AfterDownload after = null;
		int timeout = 1000;
		if(context != null) {
			currDownloader = context.getDownloader();
			before = context.getBeforeDownload();
			after = context.getAfterDownload();
			timeout = context.getTimeout();
		} else {
			currDownloader = engine.getSpiderBeanFactory().getDownloaderFactory().defaultDownloader();
		}
		if(before != null) {
			before.process(request);
		}
		HttpResponse response = currDownloader.download(request, timeout);
		if(after != null) {
			after.process(request, response);
		}
		return response;
	}

	/**
	 * Randomizes the interval within +/- 1 second of the configured value,
	 * clamped to a minimum of 1 ms.
	 *
	 * @param interval configured interval in milliseconds
	 * @return a value uniformly drawn from [max(1, interval-1000), interval+1000]
	 */
	private int randomInterval(int interval) {
		int min = Math.max(1, interval - 1000);
		int max = interval + 1000;
		// Same inclusive range as the old Math.rint(Math.random()*...) form,
		// but contention-free and without double rounding.
		return ThreadLocalRandom.current().nextInt(min, max + 1);
	}

	public GeccoEngine getEngine() {
		return engine;
	}

	public Scheduler getSpiderScheduler() {
		return spiderScheduler;
	}

	public void setSpiderScheduler(Scheduler spiderScheduler) {
		this.spiderScheduler = spiderScheduler;
	}

	/**
	 * Looks up the context for the SpiderBean class matched for the request
	 * currently being processed.
	 */
	public SpiderBeanContext getSpiderBeanContext() {
		return engine.getSpiderBeanFactory().getContext(currSpiderBeanClass);
	}
}