/**
* Copyright 2008 - 2009 Pro-Netics S.P.A.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package it.pronetics.madstore.crawler.impl.local;
import com.googlecode.actorom.Actor;
import com.googlecode.actorom.Address;
import com.googlecode.actorom.Topology;
import com.googlecode.actorom.local.LocalTopology;
import com.googlecode.actorom.support.ThreadingPolicies;
import it.pronetics.madstore.crawler.Pipeline;
import it.pronetics.madstore.crawler.downloader.Downloader;
import it.pronetics.madstore.crawler.impl.CrawlerTask;
import it.pronetics.madstore.crawler.model.Link;
import it.pronetics.madstore.crawler.parser.Parser;
import it.pronetics.madstore.crawler.publisher.AtomPublisher;
import java.util.concurrent.CountDownLatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* {@link it.pronetics.madstore.crawler.impl.CrawlerTask} implementation based on the actors concurrency
* model. <br>
* It splits the crawling process into the following parallel actors:
* <ul>
* <li>{@link CrawlerActor}</li>
* <li>{@link DownloaderActor}</li>
* <li>{@link ParserActor}</li>
* <li>{@link ProcessorActor}</li>
* </ul>
* @author Sergio Bossa
*/
public class LocalCrawlerTask implements CrawlerTask {

    private static final Logger LOG = LoggerFactory.getLogger(LocalCrawlerTask.class);
    //
    private final Downloader downloader;
    private final Parser parser;
    private final AtomPublisher publisher;
    private final Pipeline pipeline;
    private final int maxConcurrentDownloads;
    private final int maxVisitedLinks;

    /**
     * Creates a crawler task; the collaborators are stored and later handed to the
     * actors spawned by {@link #execute(Link)}.
     *
     * @param downloader passed to the {@link DownloaderActor}.
     * @param parser passed to the {@link ParserActor}.
     * @param publisher passed to the {@link ProcessorActor}.
     * @param pipeline passed to the {@link ProcessorActor}.
     * @param maxConcurrentDownloads passed to the {@link DownloaderActor}.
     * @param maxVisitedLinks passed to the {@link CrawlerActor}.
     */
    public LocalCrawlerTask(Downloader downloader, Parser parser, AtomPublisher publisher, Pipeline pipeline, int maxConcurrentDownloads, int maxVisitedLinks) {
        this.downloader = downloader;
        this.parser = parser;
        this.publisher = publisher;
        this.pipeline = pipeline;
        this.maxConcurrentDownloads = maxConcurrentDownloads;
        this.maxVisitedLinks = maxVisitedLinks;
    }

    /**
     * Runs a crawling process starting from the given link and blocks the calling
     * thread until the finish latch handed to the crawler actor is released.
     * <br>
     * A fresh actors topology is created for every invocation and is always shut
     * down before returning. Failures are logged rather than propagated; if the
     * calling thread is interrupted while waiting, its interrupt status is restored
     * so that callers can still observe the interruption.
     *
     * @param startLink the link the crawling process starts from.
     */
    public void execute(Link startLink) {
        // NOTE(review): the argument is presumably the number of OS threads backing the topology - confirm.
        Topology actorsTopology = new LocalTopology(ThreadingPolicies.newOSThreadingPolicy(4));
        try {
            LOG.info("Crawling process started from {}", startLink);

            // Spawn order matters: each actor is constructed with the address of
            // actors spawned before it.
            Address crawlerAddress = actorsTopology.spawnActor(CrawlerActor.class.toString(), new CrawlerActor(maxVisitedLinks));
            Address processorAddress = actorsTopology.spawnActor(ProcessorActor.class.toString(), new ProcessorActor(publisher, pipeline, crawlerAddress));
            Address parserAddress = actorsTopology.spawnActor(ParserActor.class.toString(), new ParserActor(parser, processorAddress));
            Address downloaderAddress = actorsTopology.spawnActor(DownloaderActor.class.toString(), new DownloaderActor(maxConcurrentDownloads, downloader, crawlerAddress, parserAddress));

            Actor crawlerActor = actorsTopology.getActor(crawlerAddress);
            Actor downloaderActor = actorsTopology.getActor(downloaderAddress);
            Actor parserActor = actorsTopology.getActor(parserAddress);
            Actor processorActor = actorsTopology.getActor(processorAddress);

            // NOTE(review): linking presumably ties the other actors' lifecycle/failures
            // to the crawler actor - confirm against the actorom documentation.
            crawlerActor.link(downloaderActor);
            crawlerActor.link(parserActor);
            crawlerActor.link(processorActor);

            // The latch travels with the start message; this thread waits on it.
            CountDownLatch finishLatch = new CountDownLatch(1);
            crawlerActor.send(new StartCrawlingMessage(startLink, downloaderAddress, finishLatch));
            finishLatch.await();
            LOG.info("Crawling process stopped!");
        } catch (InterruptedException ex) {
            // Do not swallow the interruption: restore the flag for the caller.
            Thread.currentThread().interrupt();
            LOG.error("Crawling process interrupted!", ex);
        } catch (Exception ex) {
            LOG.error(ex.getMessage(), ex);
        } finally {
            actorsTopology.shutdown();
        }
    }
}