package com.github.xjtushilei.core; import com.github.xjtushilei.core.downloader.Downloader; import com.github.xjtushilei.core.downloader.PreDefine.HttpClientPoolDownloader; import com.github.xjtushilei.core.pageprocesser.PageProcessor; import com.github.xjtushilei.core.pageprocesser.PreDefine.TextPageProcessor; import com.github.xjtushilei.core.saver.PreDefine.ConsoleSaver; import com.github.xjtushilei.core.saver.Saver; import com.github.xjtushilei.core.scheduler.PreDefine.QueueScheduler; import com.github.xjtushilei.core.scheduler.Scheduler; import com.github.xjtushilei.model.Page; import com.github.xjtushilei.model.RegexRule; import com.github.xjtushilei.model.UrlSeed; import com.github.xjtushilei.utils.TimeSleep; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Iterator; import java.util.List; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; /** * Created by shilei on 2017/4/10. */ public class Spider { private Logger logger = LoggerFactory.getLogger(getClass()); private Scheduler scheduler; private Downloader downloader; private PageProcessor pageProcessor; private Saver saver; //新种子的过滤器,只有通过正则的,才会加入到待爬取种子队列 private RegexRule regexRule; private int threadNum = 5;//线程池大小。默认5个爬虫在进行。 private ThreadPoolExecutor pool; /** * 最多几个爬虫在进行。 * 默认5个。 * * @param threadNum * @return 自己 */ public Spider thread(int threadNum) { this.threadNum = threadNum; if (threadNum <= 0) { this.threadNum = 5; } pool = new ThreadPoolExecutor(threadNum, threadNum, 1500L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>()); return this; } public static Spider build() { return new Spider(); } public Spider() { setDefaultComponents(); regexRule = new RegexRule(); } public Spider setScheduler(Scheduler scheduler) { this.scheduler = scheduler; return this; } public Spider setDownloader(Downloader d) { this.downloader = d; return this; } public Spider setProcessor(PageProcessor p) { this.pageProcessor = p; return this; } public Spider setSaver(Saver s) { this.saver = s; return this; } /** * 添加初始化种子,可以多个 * * @param url * @return Spider */ public Spider addUrlSeed(String url) { scheduler.push(new UrlSeed(url)); return this; } /** * 添加新种子需要满足的正则信息(正则规则有两种,正正则和反正则) * <p> * URL符合正则规则需要满足下面条件: * 1.至少能匹配一条正正则 * 2.不能和任何反正则匹配 * 举例: * 正正则示例:+a.*c是一条正正则,正则的内容为a.*c,起始加号表示正正则 * 反正则示例:-a.*c时一条反正则,正则的内容为a.*c,起始减号表示反正则 * 如果一个规则的起始字符不为加号且不为减号,则该正则为正正则,正则的内容为自身 * 例如a.*c是一条正正则,正则的内容为a.*c * * @param regex 正则 * @return Spider */ public Spider addRegexRule(String regex) { regexRule.addRule(regex); return this; } private Spider setDefaultComponents() { thread(threadNum); if (scheduler == null) { scheduler = new QueueScheduler(); } if (downloader == null) { downloader = new HttpClientPoolDownloader(); } if (pageProcessor == null) { pageProcessor = new TextPageProcessor(); } if (saver == null) { saver = new ConsoleSaver(); } return this; } public void run() { logger.info("爬虫启动!"); UrlSeed urlSeed = null; while (true) { logger.info("当前线程池" + "已完成:" + pool.getCompletedTaskCount() + " 运行中:" + pool.getActiveCount() + " 最大运行:" + pool.getPoolSize() + " 等待队列:" + pool.getQueue().size()); if (pool.getQueue().size() > pool.getCorePoolSize()) { //如果等待队列大于了100.就暂停接收新的url。不然会影响优先级队列的使用。 TimeSleep.sleep(1000); continue; } urlSeed = scheduler.poll(); if (urlSeed == null && pool.getActiveCount() == 0) { pool.shutdown(); try { pool.awaitTermination(10, TimeUnit.SECONDS); } catch (InterruptedException e) { logger.error("关闭线程池失败!", e); } logger.info("爬虫结束!"); break; } else if (urlSeed == null) { //没有取到种子就等待! TimeSleep.sleep(1000); } else { logger.info("正在处理:" + urlSeed.getUrl() + " 优先级(默认:5):" + urlSeed.getPriority()); pool.execute(new SpiderWork(urlSeed.clone())); } } } class SpiderWork implements Runnable { private UrlSeed urlSeed; SpiderWork(UrlSeed urlSeed) { this.urlSeed = urlSeed; } public void run() { logger.debug("线程:[" + Thread.currentThread().getName() + "]正在处理:" + urlSeed.getUrl()); logger.debug("当前线程池" + "已完成:" + pool.getCompletedTaskCount() + " 运行中:" + pool.getActiveCount() + " 最大运行:" + pool.getPoolSize() + " 等待队列:" + pool.getQueue().size()); //整个流程为: // (download下载) -> (pageProcessor解析处理) -> (save存储) Page nowPage = downloader.download(urlSeed); pageProcessor.process(nowPage); //正则处理 List<UrlSeed> urlSeedList = nowPage.links(); for (Iterator<UrlSeed> it = urlSeedList.iterator(); it.hasNext(); ) { UrlSeed seed = it.next(); if (!regexRule.regex(seed.getUrl())) { // System.out.println(seed.getUrl()); it.remove(); } } nowPage.setNewUrlSeed(urlSeedList); pageProcessor.processNewUrlSeeds(nowPage); nowPage.getNewUrlSeed().forEach(seed -> scheduler.push(seed)); saver.save(nowPage); } } }