package com.fpcms.common.webcrawler;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class ImageCrawlerMain {

    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root";
        int numberOfCrawlers = 2;
        CrawlConfig config = newCrawlConfig(crawlStorageFolder);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        CrawlController controller =
                new CrawlController(config, pageFetcher, newRobotstxtServer(pageFetcher));

        /*
         * For each crawl, you need to add some seed URLs. These are the first
         * URLs to be fetched; the crawler then follows the links found in
         * those pages.
         */
        // controller.addSeed("http://www.aaafaipiao.com");
        controller.addSeed("http://www.22mm.cc");
        controller.addSeed("http://www.22mm.cc/mm/qingliang/ggiejhb_ljgijd.html");
        ImageCrawler.configure(
                new String[] {"http://www.22mm.cc", "http://qlimg1.meimei22.com"},
                crawlStorageFolder);

        /*
         * Start the crawl. This is a blocking operation, meaning that your code
         * will reach the line after this only when crawling is finished.
         */
        controller.start(ImageCrawler.class, numberOfCrawlers);
    }

    /**
     * Builds a crawl configuration that stores crawler state under the given
     * folder, limits the crawl depth to 30 and downloads binary content,
     * which is required for fetching images.
     */
    private static CrawlConfig newCrawlConfig(String crawlStorageFolder) {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);
        config.setMaxDepthOfCrawling(30);
        config.setIncludeBinaryContentInCrawling(true);
        return config;
    }

    /**
     * Builds a robots.txt server with robots.txt handling disabled; the
     * user-agent name set here is only used for robots.txt lookups.
     */
    private static RobotstxtServer newRobotstxtServer(PageFetcher pageFetcher) {
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        robotstxtConfig.setEnabled(false);
        robotstxtConfig.setUserAgentName(
                "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
        return new RobotstxtServer(robotstxtConfig, pageFetcher);
    }
}
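
/*
 * A minimal sketch of the kind of crawler that ImageCrawler above is expected
 * to be, modeled on the image-crawler example shipped with crawler4j 3.x
 * (where shouldVisit takes a single WebURL argument; 4.x adds a referring
 * Page parameter). The real com.fpcms.common.webcrawler.ImageCrawler lives
 * elsewhere in this package; the class name, fields and storage logic below
 * are illustrative assumptions, not its actual implementation. Fully
 * qualified names are used so this sketch does not touch the imports above.
 */
class ImageCrawlerSketch extends edu.uci.ics.crawler4j.crawler.WebCrawler {

    // URLs ending in a common image extension (assumed filter).
    private static final java.util.regex.Pattern IMG_PATTERN =
            java.util.regex.Pattern.compile(".*\\.(bmp|gif|jpe?g|png)$");

    private static String[] crawlDomains;
    private static java.io.File storageFolder;

    /** Remember the allowed domains and make sure the image folder exists. */
    public static void configure(String[] domains, String storageFolderName) {
        crawlDomains = domains;
        storageFolder = new java.io.File(storageFolderName, "images");
        if (!storageFolder.exists()) {
            storageFolder.mkdirs();
        }
    }

    /** Follow only links that stay inside the configured domains. */
    @Override
    public boolean shouldVisit(edu.uci.ics.crawler4j.url.WebURL url) {
        String href = url.getURL().toLowerCase();
        for (String domain : crawlDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }

    /** Save pages whose URL looks like an image and whose content is binary. */
    @Override
    public void visit(edu.uci.ics.crawler4j.crawler.Page page) {
        String url = page.getWebURL().getURL();
        boolean isBinary =
                page.getParseData() instanceof edu.uci.ics.crawler4j.parser.BinaryParseData;
        if (!IMG_PATTERN.matcher(url.toLowerCase()).matches() || !isBinary) {
            return;
        }
        // Store the raw bytes under a random name with the original extension.
        String name = java.util.UUID.randomUUID() + url.substring(url.lastIndexOf('.'));
        try {
            java.nio.file.Files.write(
                    new java.io.File(storageFolder, name).toPath(),
                    page.getContentData());
        } catch (java.io.IOException e) {
            logger.error("Could not store image from " + url, e);
        }
    }
}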