package us.codecraft.webmagic;

import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.downloader.Destroyable;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.ThreadUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * <pre>
 * Entry point of a webmagic crawler.
 *
 * Examples:
 * Define a minimal spider:
 *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
 *
 * Store results to files with FilePipeline:
 *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
 *            .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
 *
 * Cache URLs with FileCacheQueueScheduler so a restarted spider resumes where it stopped:
 *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
 *            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
 * </pre>
 *
 * @author code4crafter@gmail.com <br>
 *         Date: 13-4-21 Time: 6:53 AM
 */
public class Spider implements Runnable, Task {

    /** Component that fetches pages; defaults to {@link HttpClientDownloader} if unset. */
    protected Downloader downloader;

    /** Post-processing pipelines; defaults to a single {@link ConsolePipeline} if empty. */
    protected List<Pipeline> pipelines = new ArrayList<Pipeline>();

    /** Extraction rules supplied by the user. */
    protected PageProcessor pageProcessor;

    /** Seed URLs pushed to the scheduler when the spider starts. */
    protected List<String> startUrls;

    protected Site site;

    /** Optional unique task id; falls back to the site domain (see {@link #getUUID()}). */
    protected String uuid;

    /** Holds the URLs waiting to be crawled; in-memory queue by default. */
    protected Scheduler scheduler = new QueueScheduler();

    protected Logger logger = Logger.getLogger(getClass());

    /** Non-null only when {@link #thread(int)} configured more than one thread. */
    protected ExecutorService executorService;

    protected int threadNum = 1;

    /** Lifecycle state, advanced atomically: INIT -> RUNNING -> STOPPED. */
    protected AtomicInteger stat = new AtomicInteger(STAT_INIT);

    protected final static int STAT_INIT = 0;

    protected final static int STAT_RUNNING = 1;

    protected final static int STAT_STOPPED = 2;

    private boolean isTest = false;

    /**
     * Create a spider with the given extraction rules.
     *
     * @param pageProcessor user-defined extraction rules
     */
    public Spider(PageProcessor pageProcessor) {
        this.pageProcessor = pageProcessor;
        this.site = pageProcessor.getSite();
        this.startUrls = pageProcessor.getSite().getStartUrls();
    }

    /**
     * Static factory: create a spider with the given extraction rules.
     *
     * @param pageProcessor user-defined extraction rules
     * @return the new spider
     */
    public static Spider create(PageProcessor pageProcessor) {
        return new Spider(pageProcessor);
    }

    /**
     * Replace the start URLs, overriding those configured on the Site.
     *
     * @param startUrls seed URLs
     * @return this
     */
    public Spider startUrls(List<String> startUrls) {
        checkIfNotRunning();
        this.startUrls = startUrls;
        return this;
    }

    /**
     * Set a unique id for this task. By default the site domain is used as the
     * uuid; when several tasks share one domain, give each task a distinct id.
     *
     * @param uuid unique id
     * @return this
     */
    public Spider setUUID(String uuid) {
        this.uuid = uuid;
        return this;
    }

    /**
     * Set the scheduler. The scheduler stores pending URLs and may also handle
     * de-duplication, synchronization and persistence. Defaults to an
     * in-memory blocking queue.
     *
     * @param scheduler the scheduler
     * @return this
     */
    public Spider scheduler(Scheduler scheduler) {
        checkIfNotRunning();
        this.scheduler = scheduler;
        return this;
    }

    /**
     * Add a pipeline. Pipelines post-process extraction results, e.g. saving
     * them to a file or database. Defaults to console output when none is set.
     *
     * @param pipeline the pipeline to add
     * @return this
     */
    public Spider pipeline(Pipeline pipeline) {
        checkIfNotRunning();
        this.pipelines.add(pipeline);
        return this;
    }

    /**
     * Set the downloader used to fetch pages.
     *
     * @param downloader the downloader
     * @return this
     */
    public Spider downloader(Downloader downloader) {
        checkIfNotRunning();
        this.downloader = downloader;
        return this;
    }

    /**
     * Fill in default components (downloader, console pipeline) and propagate
     * the configured thread count to the downloader.
     */
    protected void checkComponent() {
        if (downloader == null) {
            this.downloader = new HttpClientDownloader();
        }
        if (pipelines.isEmpty()) {
            pipelines.add(new ConsolePipeline());
        }
        downloader.setThread(threadNum);
    }

    @Override
    public void run() {
        // Only one transition INIT -> RUNNING is allowed; a second run() fails fast.
        if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
            throw new IllegalStateException("Spider is already running!");
        }
        checkComponent();
        if (startUrls != null) {
            for (String startUrl : startUrls) {
                scheduler.push(new Request(startUrl), this);
            }
        }
        Request request = scheduler.poll(this);
        if (executorService == null) {
            // single thread: drain the scheduler inline
            while (request != null) {
                processRequest(request);
                request = scheduler.poll(this);
            }
        } else {
            // multi thread: dispatch to the executor, tracking in-flight tasks
            final AtomicInteger threadAlive = new AtomicInteger(0);
            while (true) {
                if (request == null) {
                    // no request available but workers may still produce new
                    // URLs — wait a little before polling again
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        // FIX: restore the interrupt status and stop waiting
                        // instead of swallowing the exception and spinning
                        Thread.currentThread().interrupt();
                        break;
                    }
                } else {
                    final Request requestFinal = request;
                    threadAlive.incrementAndGet();
                    executorService.execute(new Runnable() {
                        @Override
                        public void run() {
                            // FIX: decrement in finally so a failing request
                            // cannot leave the counter elevated forever (which
                            // would prevent the loop from ever terminating)
                            try {
                                processRequest(requestFinal);
                            } finally {
                                threadAlive.decrementAndGet();
                            }
                        }
                    });
                }
                request = scheduler.poll(this);
                // FIX: the original re-polled unconditionally when no worker
                // was alive, overwriting — and losing — a just-polled request.
                // Only confirm emptiness when the first poll returned null.
                if (request == null && threadAlive.get() == 0) {
                    request = scheduler.poll(this);
                    if (request == null) {
                        break;
                    }
                }
            }
            executorService.shutdown();
        }
        stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
        // release resources held by components
        destroy();
    }

    /** Destroy every component that implements {@link Destroyable}. */
    protected void destroy() {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
    }

    private void destroyEach(Object object) {
        if (object instanceof Destroyable) {
            ((Destroyable) object).destroy();
        }
    }

    /**
     * Run the spider against specific URLs, for testing extraction rules.
     *
     * @param urls the URLs to crawl
     */
    public void test(String... urls) {
        checkComponent();
        isTest = true;
        if (urls.length > 0) {
            for (String url : urls) {
                if (validate(url)) {
                    Request request = new Request(url);
                    // mark the request so downstream components know it came from test()
                    request.putExtra(Constant.IS_FROM_TEST_REQUEST, isTest);
                    processRequest(request);
                }
            }
        }
    }

    /**
     * Hook for subclasses to reject URLs before they are crawled in
     * {@link #test(String...)}; accepts everything by default.
     */
    protected boolean validate(String url) {
        return true;
    }

    /**
     * Download, process and pipeline a single request, then honor the site's
     * politeness delay.
     */
    protected void processRequest(Request request) {
        Page page = downloader.download(request, this);
        if (page == null) {
            // download failed; still sleep to respect the crawl interval
            sleep(site.getSleepTime());
            return;
        }
        pageProcessor.process(page);
        addRequest(page);
        if (!page.getResultItems().isSkip()) {
            for (Pipeline pipeline : pipelines) {
                pipeline.process(page.getResultItems(), this);
            }
        }
        sleep(site.getSleepTime());
    }

    /** Sleep for the given milliseconds, preserving the interrupt status. */
    protected void sleep(int time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            // FIX: re-interrupt instead of printStackTrace(), so callers can
            // still observe the interruption
            Thread.currentThread().interrupt();
        }
    }

    /** Push every target request discovered on the page onto the scheduler. */
    protected void addRequest(Page page) {
        if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
            for (Request request : page.getTargetRequests()) {
                scheduler.push(request, this);
            }
        }
    }

    /**
     * Guard for configuration setters: fail if the spider has already started.
     *
     * @throws IllegalStateException if the spider is not in INIT state
     */
    protected void checkIfNotRunning() {
        // equivalent to the original compareAndSet(STAT_INIT, STAT_INIT) probe,
        // but expressed as the plain state read it actually is
        if (stat.get() != STAT_INIT) {
            throw new IllegalStateException("Spider is already running!");
        }
    }

    /** Start the spider on a new non-daemon thread and return immediately. */
    public void runAsync() {
        Thread thread = new Thread(this);
        thread.setDaemon(false);
        thread.start();
    }

    /**
     * Configure the number of download threads.
     *
     * @param threadNum thread count, must be positive
     * @return this
     * @throws IllegalArgumentException if threadNum is not positive
     */
    public Spider thread(int threadNum) {
        checkIfNotRunning();
        // FIX: validate before mutating state — the original assigned
        // this.threadNum first and could leave an invalid value behind
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than one!");
        }
        this.threadNum = threadNum;
        if (threadNum == 1) {
            return this;
        }
        synchronized (this) {
            this.executorService = ThreadUtils.newFixedThreadPool(threadNum);
        }
        return this;
    }

    /** Remove all configured pipelines. */
    public Spider clearPipeline() {
        pipelines = new ArrayList<Pipeline>();
        return this;
    }

    @Override
    public String getUUID() {
        if (uuid != null) {
            return uuid;
        }
        if (site != null) {
            return site.getDomain();
        }
        return null;
    }

    @Override
    public Site getSite() {
        return site;
    }

    @Override
    public void cron(String expr) {
        // TODO Auto-generated method stub
    }
}