/**
 * Copyright 2008 - 2009 Pro-Netics S.P.A.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.pronetics.madstore.crawler.impl.grid;

import it.pronetics.madstore.crawler.Pipeline;
import it.pronetics.madstore.crawler.downloader.Downloader;
import it.pronetics.madstore.crawler.impl.CrawlerTask;
import it.pronetics.madstore.crawler.impl.grid.support.MadStoreGrid;
import it.pronetics.madstore.crawler.model.Link;
import it.pronetics.madstore.crawler.model.Page;
import it.pronetics.madstore.crawler.parser.Parser;
import it.pronetics.madstore.crawler.publisher.AtomPublisher;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;

import org.gridgain.grid.GridException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Grid-based implementation of {@link it.pronetics.madstore.crawler.impl.CrawlerTask}.
 *
 * @author Sergio Bossa
 * @author Christian Mongillo
 */
public class GridCrawlerTask implements CrawlerTask {

    private static final transient Logger LOG = LoggerFactory.getLogger(GridCrawlerTask.class);

    private final MadStoreGrid madStoreGrid;
    private final Downloader downloader;
    private final Parser parser;
    private final AtomPublisher publisher;
    private final Pipeline pipeline;
    private final int maxConcurrentDownload;
    // Pages already downloaded and parsed, keyed by their link URL.
    private final HashMap<String, Page> visitedLinks = new HashMap<String, Page>();
    // Frontier of links still waiting to be downloaded and parsed, keyed by their URL.
    private final HashMap<String, Link> toParseLinks = new HashMap<String, Link>();
    private int maxVisitedLinks;
    private int visitedLinksCounter = 1;

    public GridCrawlerTask(MadStoreGrid madStoreGrid, Downloader downloader, Parser parser, AtomPublisher publisher, Pipeline pipeline, int maxConcurrentDownload, int maxVisitedLinks) {
        this.madStoreGrid = madStoreGrid;
        this.downloader = downloader;
        this.parser = parser;
        this.publisher = publisher;
        this.pipeline = pipeline;
        this.maxConcurrentDownload = maxConcurrentDownload;
        this.maxVisitedLinks = maxVisitedLinks;
    }

    /**
     * Execute the crawling process on the target site, starting from the given link.
     * <br>
     * The process is composed of the following sequential steps:
     * <ol>
     * <li>Crawling and harvesting of page links, executed by distributed grid nodes.</li>
     * <li>Pipeline-based processing of linked pages, executed by distributed grid nodes.</li>
     * <li>Storing of extracted Atom feeds, executed locally.</li>
     * </ol>
     */
    public void execute(Link startLink) {
        try {
            toParseLinks.put(startLink.getLink(), startLink);
            Collection<Page> extractedPages = doParsing();
            Collection<Page> processedPages = doProcessing(extractedPages);
            doPublishing(processedPages);
        } catch (Exception ex) {
            LOG.error(ex.getMessage(), ex);
        }
    }

    private Collection<Page> doParsing() throws GridException {
        do {
            Collection<Link> linksToVisit = getLinksToVisit();
            LOG.info("Downloading and parsing links ...");
            if (linksToVisit.size() > 0) {
                LOG.info("Downloading and parsing {} links.", linksToVisit.size());
                ParserTask task = new ParserTask(parser, downloader);
                Collection<ParserTaskResult> results = madStoreGrid.<Collection<Link>, Collection<ParserTaskResult>>executeInGrid(task, linksToVisit);
                for (ParserTaskResult result : results) {
                    if (result != null) {
                        pushVisitedPage(result.getPage());
                        pushExtractedLinks(result.getExtractedLinks());
                    }
                }
                if (LOG.isDebugEnabled()) {
                    LOG.debug("To parse links : {}", toParseLinks.keySet());
                    LOG.debug("Visited links : {}", visitedLinks.keySet());
                }
            }
        } while (!toParseLinks.isEmpty());
        return Collections.unmodifiableCollection(visitedLinks.values());
    }

    private Collection<Link> getLinksToVisit() {
        ArrayList<Link> linksToVisit = new ArrayList<Link>();
        int limit = 1;
        // Drain up to maxConcurrentDownload not-yet-visited links from the frontier.
        while (!toParseLinks.isEmpty() && limit <= maxConcurrentDownload) {
            Collection<String> linksToParse = toParseLinks.keySet();
            String link = (String) linksToParse.toArray()[0];
            if (!visitedLinks.containsKey(link)) {
                linksToVisit.add(toParseLinks.get(link));
                limit++;
            }
            toParseLinks.remove(link);
        }
        LOG.debug("Link to visit : {}", linksToVisit);
        return linksToVisit;
    }

    private void pushVisitedPage(Page page) {
        visitedLinks.put(page.getLink().getLink(), page);
    }

    private void pushExtractedLinks(Collection<Link> links) {
        // Add newly extracted links to the frontier, skipping known ones and honoring the maxVisitedLinks cap.
        for (Link link : links) {
            if ((visitedLinksCounter < maxVisitedLinks) && (!toParseLinks.containsKey(link.getLink()) && !visitedLinks.containsKey(link.getLink()))) {
                toParseLinks.put(link.getLink(), link);
                ++visitedLinksCounter;
            }
        }
    }

    private Collection<Page> doProcessing(Collection<Page> extractedPages) {
        LOG.info("Start page processing ...");
        ProcessorTask task = new ProcessorTask(pipeline);
        try {
            Collection<ProcessorTaskResult> results = madStoreGrid.<Collection<Page>, Collection<ProcessorTaskResult>>executeInGrid(task, extractedPages);
            Collection<Page> processedPages = new LinkedList<Page>();
            for (ProcessorTaskResult result : results) {
                if (result != null) {
                    processedPages.add(result.getPage());
                }
            }
            return processedPages;
        } catch (Exception ex) {
            LOG.error(ex.getMessage(), ex);
            return new ArrayList<Page>(0);
        }
    }

    private void doPublishing(Collection<Page> processedPages) {
        LOG.info("Start page publishing ...");
        for (Page page : processedPages) {
            publisher.publish(page);
        }
    }
}
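/*
 * Illustrative usage sketch (not part of the original source): how a GridCrawlerTask
 * might be wired up and started. The collaborator instances are left as placeholders
 * because their concrete implementations are configured elsewhere in MadStore; the
 * numeric arguments are arbitrary example values for maxConcurrentDownload and
 * maxVisitedLinks.
 *
 *   MadStoreGrid grid = ...;        // grid facade used to execute tasks on remote nodes
 *   Downloader downloader = ...;    // downloads pages
 *   Parser parser = ...;            // parses pages and extracts links
 *   AtomPublisher publisher = ...;  // stores the extracted Atom feeds
 *   Pipeline pipeline = ...;        // pipeline of page processors
 *
 *   CrawlerTask task = new GridCrawlerTask(grid, downloader, parser, publisher, pipeline, 10, 100);
 *   Link startLink = ...;           // link to the target site's start page
 *   task.execute(startLink);
 */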