package guang.crawler.siteManager; import guang.crawler.centerConfig.CenterConfig; import guang.crawler.centerConfig.siteManagers.SiteManagerInfo; import guang.crawler.commons.WebURL; import guang.crawler.jsonServer.AcceptJsonServer; import guang.crawler.jsonServer.JsonServer; import guang.crawler.jsonServer.ServerStartException; import guang.crawler.siteManager.daemon.QueueCleannerDaemon; import guang.crawler.siteManager.daemon.SiteBackupDaemon; import guang.crawler.siteManager.daemon.SiteManagerWatcherDaemon; import guang.crawler.siteManager.docid.DocidServer; import guang.crawler.siteManager.docid.MD5UrlDocidServer; import guang.crawler.siteManager.jobQueue.JEQueue; import guang.crawler.siteManager.jobQueue.JEQueueElementTransfer; import guang.crawler.siteManager.jobQueue.MapQueue; import guang.crawler.siteManager.jobQueue.WebURLTransfer; import guang.crawler.siteManager.urlFilter.BitMapFilter; import guang.crawler.siteManager.urlFilter.ObjectFilter; import guang.crawler.siteManager.util.IOHelper; import guang.crawler.util.NetworkHelper; import java.io.File; import java.io.IOException; import java.net.UnknownHostException; import java.util.Timer; import org.apache.zookeeper.KeeperException; /** * 当前类是站点管理器,用来启动和关闭站点管理器 * * @author sun * */ public class SiteManager { public static SiteManager me() { if (SiteManager.siteManager == null) { SiteManager.siteManager = new SiteManager(); } return SiteManager.siteManager; } /** * 当前采集点正在运行 */ private boolean running = false; /** * 下一步将要爬取的URL列表 */ private MapQueue<WebURL> toDoTaskList; /** * 正在爬取的URL列表 */ private MapQueue<WebURL> workingTaskList; /** * 爬取失败的URL列表 */ private MapQueue<WebURL> failedTaskList; /** * 定时对列表进行清理的后台线程 */ private QueueCleannerDaemon cleannerDaemon; /** * 定时对站点管理器进行备份的后台线程 */ private SiteBackupDaemon backuperDaemon; /** * 当前站点的本地配置信息 */ private SiteConfig siteConfig; /** * 用来产生文档ID的服务 */ private DocidServer docidServer; /** * 站点管理器启动的json服务器.爬虫工作者通过与该服务器通信完成各项业务. */ private JsonServer jsonServer; /** * 当前类的单例对象 */ private static SiteManager siteManager; /** * 用来过滤重复网页的过滤器 */ private ObjectFilter urlsFilter; /** * 站点管理器维护的定时器,所有的定时服务都应当绑定到该定时器中,不应当再单独创建定时器了. */ private Timer siteManagerTimer; /** * 用来监控中央配置器中当前站点管理器的节点的后台线程. */ private Thread siteManagerWatcherDaemon; /** * 中央配置器 */ private CenterConfig centerConfig; private SiteManager() { } /** * 获取定时备份后台线程 * * @return */ public SiteBackupDaemon getBackuperDaemon() { return this.backuperDaemon; } /** * 获取产生文档ID的服务 * * @return */ public DocidServer getDocidServer() { return this.docidServer; } /** * 获取失败的URL列表 * * @return */ public MapQueue<WebURL> getFailedTaskList() { return this.failedTaskList; } /** * 获取将要爬取的URL列表 * * @return */ public MapQueue<WebURL> getToDoTaskList() { return this.toDoTaskList; } /** * 获取URL过滤器 * * @return */ public ObjectFilter getUrlsFilter() { return this.urlsFilter; } /** * 获取正在处理的URL列表 * * @return */ public MapQueue<WebURL> getWorkingTaskList() { return this.workingTaskList; } /** * 初始化工作,读取配置文件,初始化中央管理器 * * @return * @throws SiteManagerException */ public SiteManager init() throws SiteManagerException { try { this.siteConfig = SiteConfig.me() .init(); this.centerConfig = CenterConfig.me() .init(this.siteConfig.getZookeeperQuorum()); return this; } catch (Exception e) { throw new SiteManagerException("Site manager inited failed.", e); } } /** * 初始化工作队列,创建相应的目录和数据结构 * * @throws Exception */ private void initJobQueue() throws Exception { // 每个不同的siteManager都有其自身的工作目录 File envHome = new File(this.siteConfig.getWorkDir() + "/" + SiteConfig.me() .getSiteManagerInfo() .getSiteToHandle() + "/je-queues"); if (envHome.exists()) { IOHelper.deleteFolderContents(envHome); } if (!envHome.exists()) { if (!envHome.mkdirs()) { throw new Exception("Couldn't create this folder: " + envHome.getAbsolutePath()); } } JEQueueElementTransfer<WebURL> transfer = new WebURLTransfer(); this.toDoTaskList = new JEQueue<WebURL>(envHome, "todo", false, transfer); this.workingTaskList = new JEQueue<WebURL>(envHome, "working", false, transfer); this.failedTaskList = new JEQueue<WebURL>(envHome, "failed", false, transfer); } /** * 初始化JSON 服务器,JSON服务器将被启动 * * @throws InterruptedException * @throws SiteManagerException */ private void initJSONServer() throws InterruptedException, SiteManagerException { String configFileName = this.siteConfig.getCrawlerHome() + "/conf/site-manager/commandlet.xml"; File configFile = new File(configFileName); String schemaFileName = this.siteConfig.getCrawlerHome() + "/etc/xsd/components.xsd"; File schemaFile = new File(schemaFileName); try { this.jsonServer = new AcceptJsonServer(0, 10, this.siteConfig.getJsonserverThreadNum(), configFile, schemaFile); try { this.siteConfig.getSiteManagerInfo() .setManagerAddress(NetworkHelper.getIPAddress() + ":" + this.jsonServer.getPort(), true); } catch (UnknownHostException e) { throw new SiteManagerException( "can not regist the json server", e); } catch (IOException e) { throw new SiteManagerException( "can not regist the json server", e); } catch (KeeperException e) { throw new SiteManagerException( "can not regist the json server", e); } } catch (ServerStartException e) { System.out.println("[Failed] server created failed!"); e.printStackTrace(); } } /** * 当前站点管理器是否被关闭了. * * @return */ public boolean isShutdown() { if (this.jsonServer.isShutdown()) { return true; } return false; } /** * 装载工作队列. * <p> * 首先检查是否有备份的数据,如果有备份的数据,那么加载备份数据.然后将种子站点加入进去. * * @throws IOException * @throws InterruptedException */ private void loadWorkQueue() throws IOException, InterruptedException { // 首先加载备份的数据 boolean backed = this.backuperDaemon.loadBackupData(); if (backed) { // 将failed list和 working list中的数据重新加载到todo list中 this.backuperDaemon.rescheduleTaskList(this.workingTaskList); this.backuperDaemon.rescheduleTaskList(this.failedTaskList); } // 将种子站点添加到todo List中 String seedsString = this.siteConfig.getSiteToHandle() .getWebGatherNodeInfo() .getWgnEntryUrl() .trim(); String seeds[] = seedsString.split(","); for (String seed : seeds) { seed = seed.trim(); if (seed.equals("")) { continue; } WebURL url = WebURL.newWebURL() .setURL(seed) .setDepth((short) 1) .setSiteManagerId(this.siteConfig.getSiteManagerInfo() .getSiteManagerId()) .setSiteId(this.siteConfig.getSiteToHandle() .getSiteId()); url.setDocid(this.docidServer.next(url)); this.toDoTaskList.put(url); } } /** * 启动站点管理器,主要是注册一个站点管理器角色,然后监听zookeeper的消息,作出相应的操作。 */ public void start() { SiteManagerInfo managerInfo = null; try { managerInfo = this.centerConfig.getSiteManagersConfigInfo() .getOnlineSiteManagers() .registSiteManager(); } catch (InterruptedException e) { e.printStackTrace(); System.out.println("Error to start site manager: regist site manager failed."); return; } catch (IOException e) { e.printStackTrace(); System.out.println("Error to start site manager: regist site manager failed."); return; } catch (KeeperException e) { e.printStackTrace(); System.out.println("Error to start site manager: regist site manager failed."); return; } this.siteConfig.setSiteManagerInfo(managerInfo); this.siteManagerWatcherDaemon = new Thread( new SiteManagerWatcherDaemon(), "site-manager-watcher daemon"); this.siteManagerWatcherDaemon.start(); } /** * 初始化一些后台线程,维护系统的运行 * * @throws IOException */ private void startDaemon() { this.siteManagerTimer = new Timer(true); this.siteManagerTimer.schedule(this.cleannerDaemon, this.siteConfig.getJobTimeout(), this.siteConfig.getQueueCleanerPeriod()); this.siteManagerTimer.schedule(this.backuperDaemon, this.siteConfig.getBackupPeriod(), this.siteConfig.getBackupPeriod()); this.jsonServer.start(); } /** * 开始爬取信息 * * @throws Exception */ public synchronized void startGathering() throws Exception { if (!this.running) { // 1. 初始化工作队列 this.initJobQueue(); this.urlsFilter = BitMapFilter.newFilter(); // 2. 初始化相关后台线程 this.initJSONServer(); this.docidServer = new MD5UrlDocidServer(); this.cleannerDaemon = QueueCleannerDaemon.newDaemon(); this.backuperDaemon = SiteBackupDaemon.newDaemon() .init(); // 3. 加载备份数据 this.loadWorkQueue(); // 4. 启动这些后台线程 System.out.println("[INFO] Starting site manager ...."); this.startDaemon(); System.out.println("[SUCC] Starting JSON Server success."); this.running = true; } } /** * 关闭所有的后台线程 */ private void stopDaemon() { this.siteManagerTimer.cancel(); this.jsonServer.shutdown(); this.jsonServer.waitForStop(); } /** * 关闭站点管理器 关闭后台线程,然后强制进行一次备份 */ public synchronized void stopGathering() { if (this.running) { // 1. 关闭所有线程 this.stopDaemon(); // 2. 强制进行一次备份 this.backuperDaemon.forceBackup(); // 3. 关闭相关数据结构 this.toDoTaskList.close(); this.workingTaskList.close(); this.failedTaskList.close(); this.running = false; } } }