package focusedCrawler.crawler.async;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import focusedCrawler.config.ConfigService;
import focusedCrawler.crawler.async.HttpDownloader.Callback;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.TargetStorage;
import focusedCrawler.util.DataNotFoundException;
import focusedCrawler.util.MetricsManager;
import focusedCrawler.util.storage.Storage;
import focusedCrawler.util.storage.StorageConfig;
import focusedCrawler.util.storage.StorageException;
import focusedCrawler.util.storage.StorageFactoryException;
import focusedCrawler.util.storage.distribution.StorageCreator;

public class AsyncCrawler {

    private static final Logger logger = LoggerFactory.getLogger(AsyncCrawler.class);

    private final Storage targetStorage;
    private final Storage linkStorage;
    private final HttpDownloader downloader;
    private final Map<LinkRelevance.Type, HttpDownloader.Callback> handlers = new HashMap<>();

    private volatile boolean shouldStop = false;
    private final Object running = new Object();
    private boolean isShutdown = false;

    public AsyncCrawler(Storage targetStorage, Storage linkStorage, AsyncCrawlerConfig crawlerConfig,
                        String dataPath, MetricsManager metricsManager) {
        this.targetStorage = targetStorage;
        this.linkStorage = linkStorage;
        this.downloader = new HttpDownloader(crawlerConfig.getDownloaderConfig(), dataPath, metricsManager);

        // Register a download handler for each type of link that can be selected from the frontier
        this.handlers.put(LinkRelevance.Type.FORWARD, new FetchedResultHandler(targetStorage));
        this.handlers.put(LinkRelevance.Type.SITEMAP, new SitemapXmlHandler(linkStorage));
        this.handlers.put(LinkRelevance.Type.ROBOTS,
                new RobotsTxtHandler(linkStorage, crawlerConfig.getDownloaderConfig().getUserAgentName()));

        // Make sure resources are released if the JVM is terminated
        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {
                shutdown();
            }
        });
    }

    public void run() {
        synchronized (running) {
            // Main loop: select the next link from the frontier and dispatch it for download
            while (!this.shouldStop) {
                try {
                    LinkRelevance link = (LinkRelevance) linkStorage.select(null);
                    if (link != null) {
                        Callback handler = handlers.get(link.getType());
                        if (handler == null) {
                            logger.error("No registered handler for link type: " + link.getType());
                            continue;
                        }
                        downloader.dipatchDownload(link, handler);
                    }
                } catch (DataNotFoundException e) {
                    // There are no more links available in the frontier right now
                    if (downloader.hasPendingDownloads() || !e.ranOutOfLinks()) {
                        // If there are still pending downloads, new links may be found
                        // in these pages, so we should wait some time until more links
                        // are available and try again
                        try {
                            logger.info("Waiting for links from pages being downloaded...");
                            Thread.sleep(1000);
                        } catch (InterruptedException ie) {
                        }
                        continue;
                    }
                    // There are no more pending downloads and no more links available
                    // in the frontier, so stop the crawler
                    logger.info("LinkStorage ran out of links, stopping crawler.");
                    this.shouldStop = true;
                    break;
                } catch (StorageException e) {
                    logger.error("Problem when selecting link from LinkStorage.", e);
                } catch (Exception e) {
                    logger.error("An unexpected error happened.", e);
                }
            }
        }
    }

    public void shutdown() {
        shouldStop = true;
        synchronized (running) {
            if (isShutdown) {
                return;
            }
            logger.info("Starting crawler shutdown...");
            // Wait for pending downloads to finish before closing the storages
            downloader.await();
            downloader.close();
            if (linkStorage instanceof LinkStorage) {
                ((LinkStorage) linkStorage).close();
            }
            if (targetStorage instanceof TargetStorage) {
                ((TargetStorage) targetStorage).close();
            }
            isShutdown = true;
            logger.info("Shutdown finished.");
        }
    }

    public static void run(ConfigService config, String dataPath) throws IOException, NumberFormatException {
        logger.info("Starting CrawlerManager...");
        try {
            StorageConfig linkStorageServerConfig = config.getLinkStorageConfig().getStorageServerConfig();
            Storage linkStorage = new StorageCreator(linkStorageServerConfig).produce();

            StorageConfig targetServerConfig = config.getTargetStorageConfig().getStorageServerConfig();
            Storage targetStorage = new StorageCreator(targetServerConfig).produce();

            AsyncCrawlerConfig crawlerConfig = config.getCrawlerConfig();

            AsyncCrawler crawler = new AsyncCrawler(targetStorage, linkStorage, crawlerConfig, dataPath,
                    new MetricsManager());
            crawler.run();
        } catch (StorageFactoryException ex) {
            logger.error("An error occurred while starting CrawlerManager.", ex);
        }
    }

}
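// Usage sketch (not part of the original class): the crawler is normally started
// through the static entry point AsyncCrawler.run(ConfigService, String) above.
// Assuming ConfigService can be constructed from a configuration file path (that
// constructor signature is an assumption here, not confirmed by this file), a
// minimal launcher could look like:
//
//     public static void main(String[] args) throws IOException {
//         ConfigService config = new ConfigService(args[0]); // hypothetical: path to the crawler configuration
//         String dataPath = args.length > 1 ? args[1] : "data"; // hypothetical default data directory
//         AsyncCrawler.run(config, dataPath);
//     }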