package guang.crawler.crawlWorker;

import guang.crawler.centerConfig.CenterConfig;
import guang.crawler.centerConfig.workers.WorkerInfo;
import guang.crawler.commons.WebURL;
import guang.crawler.connector.WebDataTableConnector;
import guang.crawler.crawlWorker.daemon.SiteManagerConnectorManager;
import guang.crawler.crawlWorker.pageProcessor.ConfigLoadException;
import guang.crawler.crawlWorker.pageProcessor.ExtractDataToSavePlugin;
import guang.crawler.crawlWorker.pageProcessor.ExtractLinksToFollowPlugin;
import guang.crawler.crawlWorker.pageProcessor.PageProcessor;
import guang.crawler.crawlWorker.pageProcessor.SaveExtractedDataPlugin;
import guang.crawler.crawlWorker.pageProcessor.UploadExtractedLinksPlugin;

import java.io.IOException;

/**
 * A crawler worker that starts and runs a crawler. The crawler is meant to
 * stay alive indefinitely; it never exits on its own.
 *
 * @author sun
 */
public class CrawlerWorker implements Runnable {
	/**
	 * The singleton instance of the crawler worker.
	 */
	private static CrawlerWorker crawlerWorker;

	/**
	 * Get the singleton instance of the crawler worker.
	 *
	 * @return the singleton CrawlerWorker instance
	 */
	public static CrawlerWorker me() {
		if (CrawlerWorker.crawlerWorker == null) {
			CrawlerWorker.crawlerWorker = new CrawlerWorker();
		}
		return CrawlerWorker.crawlerWorker;
	}

	/**
	 * Manager of the connections to online site managers; it keeps those
	 * connections up to date and supplies valid URLs to crawl.
	 */
	private SiteManagerConnectorManager siteManagerConnectManager;

	/**
	 * The local configuration of this crawler worker.
	 */
	private WorkerConfig workerConfig;

	/**
	 * The central configuration service.
	 */
	private CenterConfig controller;

	/**
	 * The page processor that handles each downloaded page.
	 */
	private PageProcessor pageProcessor;

	/**
	 * Connector to the web-page data table stored in HBase.
	 */
	private WebDataTableConnector webDataTableConnector;

	private CrawlerWorker() {
	}

	/**
	 * Initialize the crawler worker.
	 *
	 * @return this worker, to allow call chaining
	 * @throws IOException
	 * @throws InterruptedException
	 * @throws ConfigLoadException
	 */
	public CrawlerWorker init() throws IOException, InterruptedException,
	        ConfigLoadException {
		// Load the local configuration.
		this.workerConfig = WorkerConfig.me().init();
		// Register this crawler worker with the central configuration service.
		this.controller = CenterConfig.me()
		                              .init(this.workerConfig.getZookeeperQuorum());
		WorkerInfo workerInfo = this.controller.getWorkersInfo()
		                                       .getOnlineWorkers()
		                                       .registWorker();
		this.workerConfig.setCrawlerController(this.controller);
		this.workerConfig.setWorkerInfo(workerInfo);
		// Create and initialize the site-manager connection manager.
		this.siteManagerConnectManager = SiteManagerConnectorManager.me().init();
		// Create the HBase connector and open the connection.
		this.webDataTableConnector = new WebDataTableConnector(
		        this.workerConfig.getZookeeperQuorum());
		try {
			this.webDataTableConnector.open();
		} catch (IOException e) {
			System.out.println("Cannot open the HBase connection.");
		}
		// Register the page-processor plugins; they are invoked in order
		// after each page has been downloaded.
		this.pageProcessor = new PageProcessor();
		this.pageProcessor.addPlugin(new ExtractDataToSavePlugin());
		this.pageProcessor.addPlugin(new SaveExtractedDataPlugin(
		        this.webDataTableConnector));
		this.pageProcessor.addPlugin(new ExtractLinksToFollowPlugin());
		this.pageProcessor.addPlugin(new UploadExtractedLinksPlugin());
		return this;
	}

	/**
	 * Main work loop of the crawler worker: repeatedly fetch a URL and
	 * process the corresponding page.
	 */
	@Override
	public void run() {
		// Start looking for online site managers.
		this.siteManagerConnectManager.start();
		WebURL url = null;
		// Loop until interrupted.
		while (true) {
			// Fetch the next available URL.
			try {
				url = this.siteManagerConnectManager.getURL();
			} catch (InterruptedException e) {
				break;
			}
			// Process the URL once one has been obtained.
			if (url != null) {
				this.pageProcessor.processUrl(url);
			}
		} // Reaching this point means the loop failed or was interrupted.
		// Exit and release the associated resources.
		try {
			this.siteManagerConnectManager.exit();
		} catch (IOException e) {
			e.printStackTrace();
		}
		this.pageProcessor.shutdown();
	}

	/**
	 * Start the crawler worker in a new thread. Processing is currently
	 * single-threaded; rewriting it to be multi-threaded is worth
	 * considering.
	 */
	public void start() {
		new Thread(this).start();
	}
}
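
/*
 * A minimal bootstrap sketch showing how the me()/init()/start() API above is
 * meant to be used. The CrawlerWorkerLauncher class itself is hypothetical
 * (not part of the original source); it assumes it sits in the same package
 * as CrawlerWorker.
 */
class CrawlerWorkerLauncher {
	public static void main(String[] args) {
		try {
			// Obtain the singleton, initialize it, then start the work
			// loop in a background thread.
			CrawlerWorker.me().init().start();
		} catch (IOException | InterruptedException | ConfigLoadException e) {
			// init() declares these exceptions; a real launcher would
			// handle each case rather than just logging it.
			e.printStackTrace();
		}
	}
}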