/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.storm.crawler.bolt; import static com.digitalpebble.storm.crawler.Constants.StatusStreamName; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.slf4j.LoggerFactory; import backtype.storm.task.OutputCollector; import backtype.storm.task.TopologyContext; import backtype.storm.topology.OutputFieldsDeclarer; import backtype.storm.topology.base.BaseRichBolt; import backtype.storm.tuple.Fields; import backtype.storm.tuple.Tuple; import backtype.storm.tuple.Values; import com.digitalpebble.storm.crawler.Constants; import com.digitalpebble.storm.crawler.Metadata; import com.digitalpebble.storm.crawler.filtering.URLFilters; import com.digitalpebble.storm.crawler.parse.Outlink; import com.digitalpebble.storm.crawler.parse.ParseFilter; import com.digitalpebble.storm.crawler.parse.ParseFilters; import com.digitalpebble.storm.crawler.persistence.Status; import com.digitalpebble.storm.crawler.protocol.HttpHeaders; import com.digitalpebble.storm.crawler.util.ConfUtils; import com.digitalpebble.storm.crawler.util.MetadataTransfer; import com.digitalpebble.storm.crawler.util.URLUtil; import crawlercommons.sitemaps.AbstractSiteMap; import crawlercommons.sitemaps.SiteMap; import crawlercommons.sitemaps.SiteMapIndex; import crawlercommons.sitemaps.SiteMapURL; import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency; import crawlercommons.sitemaps.UnknownFormatException; /** * Extracts URLs from sitemap files. The parsing is triggered by the presence of * 'isSitemap=true' in the metadata. Any tuple which does not have this * key/value in the metadata is simply passed on to the default stream, whereas * any URLs extracted from the sitemaps is sent to the 'status' field. */ @SuppressWarnings("serial") public class SiteMapParserBolt extends BaseRichBolt { public static final String isSitemapKey = "isSitemap"; private static final org.slf4j.Logger LOG = LoggerFactory .getLogger(SiteMapParserBolt.class); private OutputCollector collector; private boolean strictMode = false; private MetadataTransfer metadataTransfer; private URLFilters urlFilters; private ParseFilter parseFilters; @Override public void execute(Tuple tuple) { Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // TODO check that we have the right number of fields ? String isSitemap = metadata.getFirstValue(isSitemapKey); if (!Boolean.valueOf(isSitemap)) { // just pass it on this.collector.emit(tuple.getValues()); this.collector.ack(tuple); return; } // it does have the right key/value byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); String ct = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); List<Outlink> outlinks = Collections.emptyList(); try { outlinks = parseSiteMap(url, content, ct, metadata); } catch (Exception e) { // exception while parsing the sitemap String errorMessage = "Exception while parsing " + url + ": " + e; LOG.error(errorMessage); // send to status stream in case another component wants to update // its status metadata.setValue(Constants.STATUS_ERROR_SOURCE, "sitemap parsing"); metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); collector.emit(Constants.StatusStreamName, new Values(url, metadata, Status.ERROR)); this.collector.ack(tuple); return; } // apply the parse filters if any to the current document try { parseFilters.filter(url, content, null, metadata, outlinks); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; LOG.error(errorMessage); metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering"); metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR)); collector.ack(tuple); return; } // send to status stream for (Outlink ol : outlinks) { Values v = new Values(ol.getTargetURL(), ol.getMetadata(), Status.DISCOVERED); collector.emit(Constants.StatusStreamName, v); } // marking the main URL as successfully fetched // regardless of whether we got a parse exception or not collector.emit(Constants.StatusStreamName, new Values(url, metadata, Status.FETCHED)); this.collector.ack(tuple); } private List<Outlink> parseSiteMap(String url, byte[] content, String contentType, Metadata parentMetadata) throws UnknownFormatException, IOException { crawlercommons.sitemaps.SiteMapParser parser = new crawlercommons.sitemaps.SiteMapParser( strictMode); URL sURL = new URL(url); AbstractSiteMap siteMap = null; // TODO guess CT when next version of cc is released // if (StringUtils.isBlank(contentType)) { // siteMap = parser.parseSiteMap(content, sURL); // } else { siteMap = parser.parseSiteMap(contentType, content, sURL); } List<Outlink> links = new ArrayList<Outlink>(); if (siteMap.isIndex()) { SiteMapIndex smi = ((SiteMapIndex) siteMap); Collection<AbstractSiteMap> subsitemaps = smi.getSitemaps(); // keep the subsitemaps as outlinks // they will be fetched and parsed in the following steps Iterator<AbstractSiteMap> iter = subsitemaps.iterator(); while (iter.hasNext()) { String target = iter.next().getUrl().toExternalForm(); // build an absolute URL try { target = URLUtil.resolveURL(sURL, target).toExternalForm(); } catch (MalformedURLException e) { LOG.debug("MalformedURLException on {}", target); continue; } // apply filtering to outlinks if (urlFilters != null) { target = urlFilters.filter(sURL, parentMetadata, target); } if (StringUtils.isBlank(target)) continue; // configure which metadata gets inherited from parent Metadata metadata = metadataTransfer.getMetaForOutlink(target, url, parentMetadata); metadata.setValue(isSitemapKey, "true"); Outlink ol = new Outlink(target); ol.setMetadata(metadata); links.add(ol); LOG.debug("{} : [sitemap] {}", url, target); } } // sitemap files else { SiteMap sm = ((SiteMap) siteMap); // TODO see what we can do with the LastModified info Collection<SiteMapURL> sitemapURLs = sm.getSiteMapUrls(); Iterator<SiteMapURL> iter = sitemapURLs.iterator(); while (iter.hasNext()) { SiteMapURL smurl = iter.next(); double priority = smurl.getPriority(); // TODO handle priority in metadata ChangeFrequency freq = smurl.getChangeFrequency(); // TODO convert the frequency into a numerical value and handle // it in metadata String target = smurl.getUrl().toExternalForm(); // build an absolute URL try { target = URLUtil.resolveURL(sURL, target).toExternalForm(); } catch (MalformedURLException e) { LOG.debug("MalformedURLException on {}", target); continue; } // apply filtering to outlinks if (urlFilters != null) { target = urlFilters.filter(sURL, parentMetadata, target); } if (StringUtils.isBlank(target)) continue; // configure which metadata gets inherited from parent Metadata metadata = metadataTransfer.getMetaForOutlink(target, url, parentMetadata); metadata.setValue(isSitemapKey, "false"); Outlink ol = new Outlink(target); ol.setMetadata(metadata); links.add(ol); LOG.debug("{} : [sitemap] {}", url, target); } } return links; } @Override @SuppressWarnings({ "rawtypes", "unchecked" }) public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { this.collector = collector; this.metadataTransfer = MetadataTransfer.getInstance(stormConf); urlFilters = URLFilters.emptyURLFilters; String urlconfigfile = ConfUtils.getString(stormConf, "urlfilters.config.file", "urlfilters.json"); if (urlconfigfile != null) { try { urlFilters = new URLFilters(stormConf, urlconfigfile); } catch (IOException e) { LOG.error("Exception caught while loading the URLFilters"); throw new RuntimeException( "Exception caught while loading the URLFilters", e); } } String parseconfigfile = ConfUtils.getString(stormConf, "parsefilters.config.file", "parsefilters.json"); parseFilters = ParseFilters.emptyParseFilter; if (parseconfigfile != null) { try { parseFilters = new ParseFilters(stormConf, parseconfigfile); } catch (IOException e) { LOG.error("Exception caught while loading the ParseFilters"); throw new RuntimeException( "Exception caught while loading the ParseFilters", e); } } } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("url", "content", "metadata")); declarer.declareStream(Constants.StatusStreamName, new Fields("url", "metadata", "status")); } }