/** * Copyright 2008 - 2009 Pro-Netics S.P.A. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.pronetics.madstore.crawler.impl.grid; import it.pronetics.madstore.crawler.downloader.Downloader; import it.pronetics.madstore.crawler.model.Link; import it.pronetics.madstore.crawler.model.Page; import it.pronetics.madstore.crawler.parser.Parser; import it.pronetics.madstore.crawler.parser.filter.impl.ServerFilter; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.List; import org.gridgain.grid.GridException; import org.gridgain.grid.GridJob; import org.gridgain.grid.GridJobAdapter; import org.gridgain.grid.GridJobResult; import org.gridgain.grid.GridTaskSplitAdapter; import org.gridgain.grid.resources.GridSpringResource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Grid task which splits the downloads and parsing of links into several distributed jobs, * each one downloading and parsing a single link. * * @author Christian Mongillo * @author Sergio Bossa */ public class ParserTask extends GridTaskSplitAdapter<Collection<Link>, Collection<ParserTaskResult>> { private static final transient Logger LOG = LoggerFactory.getLogger(ParserTask.class); @GridSpringResource(resourceName = "parser") private transient Parser parser; @GridSpringResource(resourceName = "downloader") private transient Downloader downloader; public ParserTask(Parser parser, Downloader downloader) { this.parser = parser; this.downloader = downloader; } @Override protected Collection<? extends GridJob> split(int gridSize, Collection<Link> links) throws GridException { Collection<GridJob> jobs = new ArrayList<GridJob>(links.size()); for (final Link link : links) { jobs.add(new GridJobAdapter() { public Serializable execute() throws GridException { try { Page page = downloader.download(link); if (page != null && !page.isEmpty()) { Collection<Link> newLinks = parser.parse(page, new ServerFilter(page.getLink())); ParserTaskResult result = new ParserTaskResult(page, newLinks); return result; } else { return null; } } catch (Exception ex) { LOG.error(ex.getMessage(), ex); return null; } } }); } return jobs; } public Collection<ParserTaskResult> reduce(List<GridJobResult> jobResults) throws GridException { Collection<ParserTaskResult> parserResults = new ArrayList<ParserTaskResult>(jobResults.size()); for(GridJobResult jobResult : jobResults) { parserResults.add(jobResult.<ParserTaskResult>getData()); } return parserResults; } }