/**
 * Copyright 2008 - 2009 Pro-Netics S.P.A.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.pronetics.madstore.crawler.impl;

import it.pronetics.madstore.crawler.CrawlerConfiguration;
import it.pronetics.madstore.crawler.MadStoreCrawler;
import it.pronetics.madstore.crawler.downloader.Downloader;
import it.pronetics.madstore.crawler.model.Link;
import it.pronetics.madstore.crawler.parser.Parser;
import it.pronetics.madstore.crawler.publisher.AtomPublisher;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Default {@link it.pronetics.madstore.crawler.MadStoreCrawler} implementation.<br>
 * The actual crawling process execution is defined by a {@link CrawlerTask} implementation,
 * created through the configured {@link CrawlerTaskFactory}.
 * <br>
 * Each site is crawled concurrently by a different {@link CrawlerTask} instance: the whole
 * crawling cycle ends once all configured sites are crawled.
 *
 * @author Sergio Bossa
 * @author Salvatore Incandela
 */
public class MadStoreCrawlerImpl implements MadStoreCrawler {

    private static final Logger LOG = LoggerFactory.getLogger(MadStoreCrawlerImpl.class);

    private ExecutorService crawlerExecutor = Executors.newCachedThreadPool();
    private List<CrawlerConfiguration> crawlerConfigurations;
    private CrawlerTaskFactory crawlerTaskFactory;
    private Parser parser;
    private Downloader downloader;
    private AtomPublisher publisher;

    public void setCrawlerConfigurations(List<CrawlerConfiguration> crawlerConfigurations) {
        // Defensive copy: later changes to the caller's list must not affect this crawler.
        this.crawlerConfigurations = new LinkedList<CrawlerConfiguration>(crawlerConfigurations);
    }

    public List<CrawlerConfiguration> getCrawlerConfigurations() {
        return Collections.unmodifiableList(crawlerConfigurations);
    }

    /**
     * Starts the crawling process, composed of a concurrent crawling task for each site to crawl.
     * <br>
     * This method call is blocking: it returns once all sites are crawled.
     */
    public void start() {
        try {
            LOG.info("Start crawling process.");
            Collection<Callable<Object>> tasks = new ArrayList<Callable<Object>>(crawlerConfigurations.size());
            for (final CrawlerConfiguration configuration : this.crawlerConfigurations) {
                final String server = configuration.getServer();
                final String startLink = configuration.getStartLink();
                tasks.add(new Callable<Object>() {
                    public Object call() throws Exception {
                        CrawlerTask task = crawlerTaskFactory.makeCrawlerTask(
                                downloader,
                                parser,
                                publisher,
                                configuration.getPipeline(),
                                configuration.getMaxConcurrentDownloads(),
                                configuration.getMaxVisitedLinks());
                        task.execute(new Link(server + "/" + startLink));
                        return null;
                    }
                });
            }
            // invokeAll blocks until every per-site crawling task has completed.
            crawlerExecutor.invokeAll(tasks);
            LOG.info("Finished crawling process.");
        } catch (InterruptedException ex) {
            // Restore the interrupt status so callers can still detect the interruption.
            Thread.currentThread().interrupt();
            LOG.error(ex.getMessage(), ex);
        }
    }

    public void setCrawlerTaskFactory(CrawlerTaskFactory crawlerTaskFactory) {
        this.crawlerTaskFactory = crawlerTaskFactory;
    }

    public void setDownloader(Downloader downloader) {
        this.downloader = downloader;
    }

    public void setParser(Parser parser) {
        this.parser = parser;
    }

    public void setPublisher(AtomPublisher publisher) {
        this.publisher = publisher;
    }
}
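
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original class): a minimal,
// hypothetical wiring of MadStoreCrawlerImpl, assuming concrete Downloader,
// Parser, AtomPublisher and CrawlerTaskFactory instances (downloader, parser,
// publisher, taskFactory) and a configurations list are already available;
// in the real project this wiring is typically done by the DI container.
// ---------------------------------------------------------------------------
/*
MadStoreCrawlerImpl crawler = new MadStoreCrawlerImpl();
crawler.setDownloader(downloader);           // assumed Downloader implementation
crawler.setParser(parser);                   // assumed Parser implementation
crawler.setPublisher(publisher);             // assumed AtomPublisher implementation
crawler.setCrawlerTaskFactory(taskFactory);  // assumed CrawlerTaskFactory implementation
crawler.setCrawlerConfigurations(configurations);
crawler.start();                             // blocks until every configured site is crawled
*/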