/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.bolt; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URL; import java.net.UnknownHostException; import java.text.SimpleDateFormat; import java.util.Locale; import java.util.Map; import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; import org.apache.storm.Config; import org.apache.storm.metric.api.IMetric; import org.apache.storm.metric.api.MeanReducer; import org.apache.storm.metric.api.MultiCountMetric; import org.apache.storm.metric.api.MultiReducedMetric; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.topology.OutputFieldsDeclarer; import org.apache.storm.tuple.Fields; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; import org.apache.storm.utils.Utils; import org.slf4j.LoggerFactory; import com.digitalpebble.stormcrawler.Constants; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.persistence.Status; import com.digitalpebble.stormcrawler.protocol.HttpHeaders; import com.digitalpebble.stormcrawler.protocol.Protocol; import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; import com.digitalpebble.stormcrawler.protocol.ProtocolResponse; import com.digitalpebble.stormcrawler.protocol.RobotRules; import com.digitalpebble.stormcrawler.util.ConfUtils; import com.digitalpebble.stormcrawler.util.PerSecondReducer; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import crawlercommons.robots.BaseRobotRules; import crawlercommons.domains.PaidLevelDomain; /** * A single-threaded fetcher with no internal queue. Use of this fetcher * requires that the user implement an external queue that enforces crawl-delay * politeness constraints. */ @SuppressWarnings("serial") public class SimpleFetcherBolt extends StatusEmitterBolt { private static final org.slf4j.Logger LOG = LoggerFactory .getLogger(SimpleFetcherBolt.class); public static final String QUEUE_MODE_HOST = "byHost"; public static final String QUEUE_MODE_DOMAIN = "byDomain"; public static final String QUEUE_MODE_IP = "byIP"; private Config conf; private MultiCountMetric eventCounter; private MultiReducedMetric averagedMetrics; private MultiReducedMetric perSecMetrics; private ProtocolFactory protocolFactory; private int taskID = -1; boolean sitemapsAutoDiscovery = false; // TODO configure the max time private Cache<String, Long> throttler = CacheBuilder.newBuilder() .expireAfterAccess(30, TimeUnit.SECONDS).build(); private String queueMode; /** default crawl delay in msec, can be overridden by robots directives **/ private long crawlDelay = 1000; /** max value accepted from robots.txt **/ private long maxCrawlDelay = 30000; private void checkConfiguration() { // ensure that a value has been set for the agent name and that that // agent name is the first value in the agents we advertise for robot // rules parsing String agentName = (String) getConf().get("http.agent.name"); if (agentName == null || agentName.trim().length() == 0) { String message = "Fetcher: No agents listed in 'http.agent.name'" + " property."; LOG.error(message); throw new IllegalArgumentException(message); } } private Config getConf() { return this.conf; } @SuppressWarnings({ "rawtypes", "unchecked" }) @Override public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { super.prepare(stormConf, context, collector); this.conf = new Config(); this.conf.putAll(stormConf); checkConfiguration(); this.taskID = context.getThisTaskId(); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); long start = System.currentTimeMillis(); LOG.info("[Fetcher #{}] : starting at {}", taskID, sdf.format(start)); // Register a "MultiCountMetric" to count different events in this bolt // Storm will emit the counts every n seconds to a special bolt via a // system stream // The data can be accessed by registering a "MetricConsumer" in the // topology int metricsTimeBucketSecs = ConfUtils.getInt(conf, "fetcher.metrics.time.bucket.secs", 10); this.eventCounter = context.registerMetric("fetcher_counter", new MultiCountMetric(), metricsTimeBucketSecs); this.averagedMetrics = context.registerMetric("fetcher_average", new MultiReducedMetric(new MeanReducer()), metricsTimeBucketSecs); this.perSecMetrics = context.registerMetric("fetcher_average_persec", new MultiReducedMetric(new PerSecondReducer()), metricsTimeBucketSecs); context.registerMetric("throttler_size", new IMetric() { @Override public Object getValueAndReset() { return throttler.size(); } }, metricsTimeBucketSecs); protocolFactory = new ProtocolFactory(conf); sitemapsAutoDiscovery = ConfUtils.getBoolean(stormConf, "sitemap.discovery", false); queueMode = ConfUtils.getString(conf, "fetcher.queue.mode", QUEUE_MODE_HOST); // check that the mode is known if (!queueMode.equals(QUEUE_MODE_IP) && !queueMode.equals(QUEUE_MODE_DOMAIN) && !queueMode.equals(QUEUE_MODE_HOST)) { LOG.error("Unknown partition mode : {} - forcing to byHost", queueMode); queueMode = QUEUE_MODE_HOST; } LOG.info("Using queue mode : {}", queueMode); this.crawlDelay = (long) (ConfUtils.getFloat(conf, "fetcher.server.delay", 1.0f) * 1000); this.maxCrawlDelay = (long) ConfUtils.getInt(conf, "fetcher.max.crawl.delay", 30) * 1000; } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { super.declareOutputFields(declarer); declarer.declare(new Fields("url", "content", "metadata")); } @Override public void cleanup() { protocolFactory.cleanup(); } @Override public void execute(Tuple input) { String urlString = input.getStringByField("url"); if (StringUtils.isBlank(urlString)) { LOG.info("[Fetcher #{}] Missing value for field url in tuple {}", taskID, input); // ignore silently collector.ack(input); return; } Metadata metadata = null; if (input.contains("metadata")) metadata = (Metadata) input.getValueByField("metadata"); if (metadata == null) metadata = Metadata.empty; URL url; try { url = new URL(urlString); } catch (MalformedURLException e) { LOG.error("{} is a malformed URL", urlString); // Report to status stream and ack if (metadata == Metadata.empty) { metadata = new Metadata(); } metadata.setValue(Constants.STATUS_ERROR_CAUSE, "malformed URL"); collector.emit( com.digitalpebble.stormcrawler.Constants.StatusStreamName, input, new Values(urlString, metadata, Status.ERROR)); collector.ack(input); return; } String key = getPolitenessKey(url); long delay = 0; try { Protocol protocol = protocolFactory.getProtocol(url); BaseRobotRules rules = protocol.getRobotRules(urlString); boolean fromCache = false; if (rules instanceof RobotRules && ((RobotRules) rules).getContentLengthFetched().length == 0) { fromCache = true; eventCounter.scope("robots.fromCache").incrBy(1); } else { eventCounter.scope("robots.fetched").incrBy(1); } // autodiscovery of sitemaps // the sitemaps will be sent down the topology // as many times as there is a URL for a given host // the status updater will certainly cache things // but we could also have a simple cache mechanism here // as well. // if the robot come from the cache there is no point // in sending the sitemap URLs again if (!fromCache && sitemapsAutoDiscovery) { for (String sitemapURL : rules.getSitemaps()) { emitOutlink(input, url, sitemapURL, metadata, SiteMapParserBolt.isSitemapKey, "true"); } } if (!rules.isAllowed(urlString)) { LOG.info("Denied by robots.txt: {}", urlString); metadata.setValue(Constants.STATUS_ERROR_CAUSE, "robots.txt"); // Report to status stream and ack collector .emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, input, new Values(urlString, metadata, Status.ERROR)); collector.ack(input); return; } // check when we are allowed to process it long timeWaiting = 0; Long timeAllowed = throttler.getIfPresent(key); if (timeAllowed != null) { long now = System.currentTimeMillis(); long timeToWait = timeAllowed - now; if (timeToWait > 0) { timeWaiting = timeToWait; try { Thread.sleep(timeToWait); } catch (InterruptedException e) { LOG.error("[Fetcher #{}] caught InterruptedException caught while waiting"); Thread.currentThread().interrupt(); } } } delay = this.crawlDelay; // get the delay from robots // value is negative when not set long robotsDelay = rules.getCrawlDelay(); if (robotsDelay > 0) { // cap the value to a maximum // as some sites specify ridiculous values if (robotsDelay > maxCrawlDelay) { LOG.debug("Delay from robots capped at {} for {}", robotsDelay, url); delay = maxCrawlDelay; } else { delay = robotsDelay; } } LOG.debug("[Fetcher #{}] : Fetching {}", taskID, urlString); long start = System.currentTimeMillis(); ProtocolResponse response = protocol.getProtocolOutput(urlString, metadata); long timeFetching = System.currentTimeMillis() - start; final int byteLength = response.getContent().length; averagedMetrics.scope("wait_time").update(timeWaiting); averagedMetrics.scope("fetch_time").update(timeFetching); averagedMetrics.scope("bytes_fetched").update(byteLength); eventCounter.scope("fetched").incrBy(1); eventCounter.scope("bytes_fetched").incrBy(byteLength); perSecMetrics.scope("bytes_fetched_perSec").update(byteLength); perSecMetrics.scope("fetched_perSec").update(1); LOG.info( "[Fetcher #{}] Fetched {} with status {} in {} after waiting {}", taskID, urlString, response.getStatusCode(), timeFetching, timeWaiting); response.getMetadata().putAll(metadata); response.getMetadata().setValue("fetch.statusCode", Integer.toString(response.getStatusCode())); response.getMetadata().setValue("fetch.loadingTime", Long.toString(timeFetching)); // determine the status based on the status code final Status status = Status.fromHTTPCode(response.getStatusCode()); // used when sending to status stream final Values values4status = new Values(urlString, response.getMetadata(), status); // if the status is OK emit on default stream if (status.equals(Status.FETCHED)) { if (response.getStatusCode() == 304) { // mark this URL as fetched so that it gets // rescheduled // but do not try to parse or index collector .emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, input, values4status); } else { collector.emit(Utils.DEFAULT_STREAM_ID, input, new Values(urlString, response.getContent(), response.getMetadata())); } } else if (status.equals(Status.REDIRECTION)) { // find the URL it redirects to String redirection = response.getMetadata().getFirstValue( HttpHeaders.LOCATION); // stores the URL it redirects to // used for debugging mainly - do not resolve the target // URL if (StringUtils.isNotBlank(redirection)) { response.getMetadata().setValue("_redirTo", redirection); } if (allowRedirs() && StringUtils.isNotBlank(redirection)) { emitOutlink(input, url, redirection, response.getMetadata()); } // Mark URL as redirected collector .emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, input, values4status); } else { // Error collector .emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, input, values4status); } } catch (Exception exece) { String message = exece.getMessage(); if (message == null) message = ""; // common exceptions for which we log only a short message if (exece.getCause() instanceof java.util.concurrent.TimeoutException || message.contains(" timed out")) { LOG.error("Socket timeout fetching {}", urlString); message = "Socket timeout fetching"; } else if (exece.getCause() instanceof java.net.UnknownHostException || exece instanceof java.net.UnknownHostException) { LOG.error("Unknown host {}", urlString); message = "Unknown host"; } else { LOG.error("Exception while fetching {}", urlString, exece); message = exece.getClass().getName(); } eventCounter.scope("exception").incrBy(1); // could be an empty, immutable Metadata if (metadata.size() == 0) { metadata = new Metadata(); } // add the reason of the failure in the metadata metadata.setValue("fetch.exception", message); collector.emit( com.digitalpebble.stormcrawler.Constants.StatusStreamName, input, new Values(urlString, metadata, Status.FETCH_ERROR)); } // update the throttler throttler.put(key, System.currentTimeMillis() + delay); collector.ack(input); } private String getPolitenessKey(URL u) { String key; if (QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) { try { final InetAddress addr = InetAddress.getByName(u.getHost()); key = addr.getHostAddress(); } catch (final UnknownHostException e) { // unable to resolve it, so don't fall back to host name LOG.warn("Unable to resolve: {}, skipping.", u.getHost()); return null; } } else if (QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) { key = PaidLevelDomain.getPLD(u.getHost()); if (key == null) { LOG.warn("Unknown domain for url: {}, using hostname as key", u.toExternalForm()); key = u.getHost(); } } else { key = u.getHost(); if (key == null) { LOG.warn("Unknown host for url: {}, using URL string as key", u.toExternalForm()); key = u.toExternalForm(); } } return key.toLowerCase(Locale.ROOT); } }