/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.examples.imagecrawler;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
/*
* IMPORTANT: Make sure that you update crawler4j.properties file and set
* crawler.include_images to true
*/
public class ImageCrawlController {
public static void main(String[] args) throws Exception {
if (args.length < 3) {
System.out.println("Needed parameters: ");
System.out.println("\t rootFolder (it will contain intermediate crawl data)");
System.out.println("\t numberOfCralwers (number of concurrent threads)");
System.out.println("\t storageFolder (a folder for storing downloaded images)");
return;
}
String rootFolder = args[0];
int numberOfCrawlers = Integer.parseInt(args[1]);
String storageFolder = args[2];
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(rootFolder);
/*
* Since images are binary content, we need to set this parameter to
* true to make sure they are included in the crawl.
*/
config.setIncludeBinaryContentInCrawling(true);
String[] crawlDomains = new String[] { "http://uci.edu/" };
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
for (String domain : crawlDomains) {
controller.addSeed(domain);
}
ImageCrawler.configure(crawlDomains, storageFolder);
controller.start(ImageCrawler.class, numberOfCrawlers);
}
}