/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.examples.multiple;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class MultipleCrawlerController {

  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      System.out.println("Needed parameter: ");
      System.out.println("\t rootFolder (it will contain intermediate crawl data)");
      return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their
     * intermediate data.
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");

    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);

    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);

    /*
     * We will use the same RobotstxtServer for both of the crawlers.
     */
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);

    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

    String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);

    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");

    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

    /*
     * The first crawler will have 5 concurrent threads and the second
     * crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);

    controller1.waitUntilFinish();
    System.out.println("Crawler 1 is finished.");

    controller2.waitUntilFinish();
    System.out.println("Crawler 2 is finished.");
  }
}