/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.bolt; import java.io.File; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URL; import java.net.UnknownHostException; import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Deque; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.lang.StringUtils; import org.apache.storm.Config; import org.apache.storm.metric.api.IMetric; import org.apache.storm.metric.api.MeanReducer; import org.apache.storm.metric.api.MultiCountMetric; import org.apache.storm.metric.api.MultiReducedMetric; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.topology.OutputFieldsDeclarer; import org.apache.storm.tuple.Fields; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; import org.slf4j.LoggerFactory; import com.digitalpebble.stormcrawler.Constants; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.persistence.Status; import com.digitalpebble.stormcrawler.protocol.HttpHeaders; import com.digitalpebble.stormcrawler.protocol.Protocol; import com.digitalpebble.stormcrawler.protocol.ProtocolFactory; import com.digitalpebble.stormcrawler.protocol.ProtocolResponse; import com.digitalpebble.stormcrawler.protocol.RobotRules; import com.digitalpebble.stormcrawler.util.ConfUtils; import com.digitalpebble.stormcrawler.util.PerSecondReducer; import crawlercommons.domains.PaidLevelDomain; import crawlercommons.robots.BaseRobotRules; /** * A multithreaded, queue-based fetcher adapted from Apache Nutch. Enforces the * politeness and handles the fetching threads itself. */ @SuppressWarnings("serial") public class FetcherBolt extends StatusEmitterBolt { private static final org.slf4j.Logger LOG = LoggerFactory .getLogger(FetcherBolt.class); private final AtomicInteger activeThreads = new AtomicInteger(0); private final AtomicInteger spinWaiting = new AtomicInteger(0); private FetchItemQueues fetchQueues; private MultiCountMetric eventCounter; private MultiReducedMetric averagedMetrics; private ProtocolFactory protocolFactory; private int taskID = -1; boolean sitemapsAutoDiscovery = false; private MultiReducedMetric perSecMetrics; private File debugfiletrigger; /** blocks the processing of new URLs if this value is reached **/ private int maxNumberURLsInQueues = -1; /** * This class described the item to be fetched. */ private static class FetchItem { String queueID; String url; URL u; Tuple t; public FetchItem(String url, URL u, Tuple t, String queueID) { this.url = url; this.u = u; this.queueID = queueID; this.t = t; } /** * Create an item. Queue id will be created based on * <code>queueMode</code> argument, either as a protocol + hostname * pair, protocol + IP address pair or protocol+domain pair. */ public static FetchItem create(URL u, Tuple t, String queueMode) { String queueID; String url = u.toExternalForm(); String key = null; // reuse any key that might have been given // be it the hostname, domain or IP if (t.contains("key")) { key = t.getStringByField("key"); } if (StringUtils.isNotBlank(key)) { queueID = key.toLowerCase(Locale.ROOT); return new FetchItem(url, u, t, queueID); } if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) { try { final InetAddress addr = InetAddress.getByName(u.getHost()); key = addr.getHostAddress(); } catch (final UnknownHostException e) { LOG.warn( "Unable to resolve IP for {}, using hostname as key.", u.getHost()); key = u.getHost(); } } else if (FetchItemQueues.QUEUE_MODE_DOMAIN .equalsIgnoreCase(queueMode)) { key = PaidLevelDomain.getPLD(u.getHost()); if (key == null) { LOG.warn( "Unknown domain for url: {}, using hostname as key", url); key = u.getHost(); } } else { key = u.getHost(); } if (key == null) { LOG.warn("Unknown host for url: {}, using URL string as key", url); key = u.toExternalForm(); } queueID = key.toLowerCase(Locale.ROOT); return new FetchItem(url, u, t, queueID); } } /** * This class handles FetchItems which come from the same host ID (be it a * proto/hostname or proto/IP pair). It also keeps track of requests in * progress and elapsed time between requests. */ private static class FetchItemQueue { Deque<FetchItem> queue = new LinkedBlockingDeque<>(); AtomicInteger inProgress = new AtomicInteger(); AtomicLong nextFetchTime = new AtomicLong(); long crawlDelay; final long minCrawlDelay; final int maxThreads; public FetchItemQueue(int maxThreads, long crawlDelay, long minCrawlDelay) { this.maxThreads = maxThreads; this.crawlDelay = crawlDelay; this.minCrawlDelay = minCrawlDelay; // ready to start setEndTime(System.currentTimeMillis() - crawlDelay); } public int getQueueSize() { return queue.size(); } public int getInProgressSize() { return inProgress.get(); } public void finishFetchItem(FetchItem it, boolean asap) { if (it != null) { inProgress.decrementAndGet(); setEndTime(System.currentTimeMillis(), asap); } } public void addFetchItem(FetchItem it) { queue.add(it); } public FetchItem getFetchItem() { if (inProgress.get() >= maxThreads) return null; long now = System.currentTimeMillis(); if (nextFetchTime.get() > now) return null; FetchItem it = null; if (queue.isEmpty()) return null; try { it = queue.removeFirst(); inProgress.incrementAndGet(); } catch (Exception e) { LOG.error( "Cannot remove FetchItem from queue or cannot add it to inProgress queue", e); } return it; } private void setEndTime(long endTime) { setEndTime(endTime, false); } private void setEndTime(long endTime, boolean asap) { if (!asap) nextFetchTime.set(endTime + (maxThreads > 1 ? minCrawlDelay : crawlDelay)); else nextFetchTime.set(endTime); } } /** * Convenience class - a collection of queues that keeps track of the total * number of items, and provides items eligible for fetching from any queue. */ private static class FetchItemQueues { Map<String, FetchItemQueue> queues = Collections .synchronizedMap(new LinkedHashMap<String, FetchItemQueue>()); AtomicInteger inQueues = new AtomicInteger(0); final int defaultMaxThread; final long crawlDelay; final long minCrawlDelay; final Config conf; public static final String QUEUE_MODE_HOST = "byHost"; public static final String QUEUE_MODE_DOMAIN = "byDomain"; public static final String QUEUE_MODE_IP = "byIP"; String queueMode; public FetchItemQueues(Config conf) { this.conf = conf; this.defaultMaxThread = ConfUtils.getInt(conf, "fetcher.threads.per.queue", 1); queueMode = ConfUtils.getString(conf, "fetcher.queue.mode", QUEUE_MODE_HOST); // check that the mode is known if (!queueMode.equals(QUEUE_MODE_IP) && !queueMode.equals(QUEUE_MODE_DOMAIN) && !queueMode.equals(QUEUE_MODE_HOST)) { LOG.error("Unknown partition mode : {} - forcing to byHost", queueMode); queueMode = QUEUE_MODE_HOST; } LOG.info("Using queue mode : {}", queueMode); this.crawlDelay = (long) (ConfUtils.getFloat(conf, "fetcher.server.delay", 1.0f) * 1000); this.minCrawlDelay = (long) (ConfUtils.getFloat(conf, "fetcher.server.min.delay", 0.0f) * 1000); } public synchronized void addFetchItem(URL u, Tuple input) { FetchItem it = FetchItem.create(u, input, queueMode); FetchItemQueue fiq = getFetchItemQueue(it.queueID); fiq.addFetchItem(it); inQueues.incrementAndGet(); } public synchronized void finishFetchItem(FetchItem it, boolean asap) { FetchItemQueue fiq = queues.get(it.queueID); if (fiq == null) { LOG.warn("Attempting to finish item from unknown queue: {}", it.queueID); return; } fiq.finishFetchItem(it, asap); } public synchronized FetchItemQueue getFetchItemQueue(String id) { FetchItemQueue fiq = queues.get(id); if (fiq == null) { // custom maxThread value? final int customThreadVal = ConfUtils.getInt(conf, "fetcher.maxThreads." + id, defaultMaxThread); // initialize queue fiq = new FetchItemQueue(customThreadVal, crawlDelay, minCrawlDelay); queues.put(id, fiq); } return fiq; } public synchronized FetchItem getFetchItem() { if (queues.isEmpty()) { return null; } FetchItemQueue start = null; do { Iterator<Entry<String, FetchItemQueue>> i = queues.entrySet() .iterator(); if (!i.hasNext()) { return null; } Map.Entry<String, FetchItemQueue> nextEntry = i.next(); if (nextEntry == null) { return null; } FetchItemQueue fiq = nextEntry.getValue(); // We remove the entry and put it at the end of the map i.remove(); // reap empty queues if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) { continue; } // Put the entry at the end no matter the result queues.put(nextEntry.getKey(), nextEntry.getValue()); // In case of we are looping if (start == null) { start = fiq; } else if (fiq == start) { return null; } FetchItem fit = fiq.getFetchItem(); if (fit != null) { inQueues.decrementAndGet(); return fit; } } while (!queues.isEmpty()); return null; } } /** * This class picks items from queues and fetches the pages. */ private class FetcherThread extends Thread { // longest delay accepted from robots.txt private final long maxCrawlDelay; public FetcherThread(Config conf, int num) { this.setDaemon(true); // don't hang JVM on exit this.setName("FetcherThread #" + num); // use an informative name this.maxCrawlDelay = ConfUtils.getInt(conf, "fetcher.max.crawl.delay", 30) * 1000; } @Override public void run() { while (true) { FetchItem fit = fetchQueues.getFetchItem(); if (fit == null) { LOG.debug("{} spin-waiting ...", getName()); // spin-wait. spinWaiting.incrementAndGet(); try { Thread.sleep(100); } catch (InterruptedException e) { LOG.error("{} caught interrupted exception", getName()); Thread.currentThread().interrupt(); } spinWaiting.decrementAndGet(); continue; } activeThreads.incrementAndGet(); // count threads LOG.debug( "[Fetcher #{}] {} => activeThreads={}, spinWaiting={}, queueID={}", taskID, getName(), activeThreads, spinWaiting, fit.queueID); LOG.debug("[Fetcher #{}] {} : Fetching {}", taskID, getName(), fit.url); Metadata metadata = null; if (fit.t.contains("metadata")) { metadata = (Metadata) fit.t.getValueByField("metadata"); } if (metadata == null) { metadata = Metadata.empty; } boolean asap = false; try { URL URL = new URL(fit.url); Protocol protocol = protocolFactory.getProtocol(URL); if (protocol == null) throw new RuntimeException( "No protocol implementation found for " + fit.url); BaseRobotRules rules = protocol.getRobotRules(fit.url); boolean fromCache = false; if (rules instanceof RobotRules && ((RobotRules) rules).getContentLengthFetched().length == 0) { fromCache = true; eventCounter.scope("robots.fromCache").incrBy(1); } else { eventCounter.scope("robots.fetched").incrBy(1); } // autodiscovery of sitemaps // the sitemaps will be sent down the topology // as many times as there is a URL for a given host // the status updater will certainly cache things // but we could also have a simple cache mechanism here // as well // if the robot come from the cache there is no point // in sending the sitemap URLs again if (!fromCache && sitemapsAutoDiscovery) { for (String sitemapURL : rules.getSitemaps()) { emitOutlink(fit.t, URL, sitemapURL, metadata, SiteMapParserBolt.isSitemapKey, "true"); } } if (!rules.isAllowed(fit.u.toString())) { LOG.info("Denied by robots.txt: {}", fit.url); // pass the info about denied by robots metadata.setValue(Constants.STATUS_ERROR_CAUSE, "robots.txt"); collector .emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, fit.t, new Values(fit.url, metadata, Status.ERROR)); // no need to wait next time as we won't request from // that site asap = true; continue; } if (rules.getCrawlDelay() > 0) { if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) { LOG.info( "Crawl-Delay for {} too long ({}), skipping", fit.url, rules.getCrawlDelay()); // pass the info about crawl delay metadata.setValue(Constants.STATUS_ERROR_CAUSE, "crawl_delay"); collector .emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, fit.t, new Values(fit.url, metadata, Status.ERROR)); // no need to wait next time as we won't request // from that site asap = true; continue; } else { FetchItemQueue fiq = fetchQueues .getFetchItemQueue(fit.queueID); fiq.crawlDelay = rules.getCrawlDelay(); LOG.info( "Crawl delay for queue: {} is set to {} as per robots.txt. url: {}", fit.queueID, fiq.crawlDelay, fit.url); } } long start = System.currentTimeMillis(); ProtocolResponse response = protocol.getProtocolOutput( fit.url, metadata); long timeFetching = System.currentTimeMillis() - start; final int byteLength = response.getContent().length; averagedMetrics.scope("fetch_time").update(timeFetching); averagedMetrics.scope("bytes_fetched").update(byteLength); perSecMetrics.scope("bytes_fetched_perSec").update( byteLength); perSecMetrics.scope("fetched_perSec").update(1); eventCounter.scope("fetched").incrBy(1); eventCounter.scope("bytes_fetched").incrBy(byteLength); LOG.info( "[Fetcher #{}] Fetched {} with status {} in msec {}", taskID, fit.url, response.getStatusCode(), timeFetching); // passes the input metadata if any to the response one response.getMetadata().putAll(metadata); response.getMetadata().setValue("fetch.statusCode", Integer.toString(response.getStatusCode())); response.getMetadata().setValue("fetch.loadingTime", Long.toString(timeFetching)); // determine the status based on the status code final Status status = Status.fromHTTPCode(response .getStatusCode()); final Values tupleToSend = new Values(fit.url, response.getMetadata(), status); // if the status is OK emit on default stream if (status.equals(Status.FETCHED)) { if (response.getStatusCode() == 304) { // mark this URL as fetched so that it gets // rescheduled // but do not try to parse or index collector.emit(Constants.StatusStreamName, fit.t, tupleToSend); } else { // send content for parsing collector.emit(fit.t, new Values(fit.url, response.getContent(), response.getMetadata())); } } else if (status.equals(Status.REDIRECTION)) { // find the URL it redirects to String redirection = response.getMetadata() .getFirstValue(HttpHeaders.LOCATION); // stores the URL it redirects to // used for debugging mainly - do not resolve the target // URL if (StringUtils.isNotBlank(redirection)) { response.getMetadata().setValue("_redirTo", redirection); } // mark this URL as redirected collector.emit(Constants.StatusStreamName, fit.t, tupleToSend); if (allowRedirs() && StringUtils.isNotBlank(redirection)) { emitOutlink(fit.t, URL, redirection, response.getMetadata()); } } // error else { collector.emit(Constants.StatusStreamName, fit.t, tupleToSend); } } catch (Exception exece) { String message = exece.getMessage(); if (message == null) message = ""; // common exceptions for which we log only a short message if (exece.getCause() instanceof java.util.concurrent.TimeoutException || message.contains(" timed out")) { LOG.error("Socket timeout fetching {}", fit.url); message = "Socket timeout fetching"; } else if (exece.getCause() instanceof java.net.UnknownHostException || exece instanceof java.net.UnknownHostException) { LOG.error("Unknown host {}", fit.url); message = "Unknown host"; } else { LOG.error("Exception while fetching {}", fit.url, exece); message = exece.getClass().getName(); } if (metadata.size() == 0) { metadata = new Metadata(); } // add the reason of the failure in the metadata metadata.setValue("fetch.exception", message); // send to status stream collector.emit(Constants.StatusStreamName, fit.t, new Values(fit.url, metadata, Status.FETCH_ERROR)); eventCounter.scope("exception").incrBy(1); } finally { fetchQueues.finishFetchItem(fit, asap); activeThreads.decrementAndGet(); // count threads // ack it whatever happens collector.ack(fit.t); } } } } private void checkConfiguration(Config stormConf) { // ensure that a value has been set for the agent name and that that // agent name is the first value in the agents we advertise for robot // rules parsing String agentName = (String) stormConf.get("http.agent.name"); if (agentName == null || agentName.trim().length() == 0) { String message = "Fetcher: No agents listed in 'http.agent.name'" + " property."; LOG.error(message); throw new IllegalArgumentException(message); } } @SuppressWarnings({ "rawtypes", "unchecked" }) @Override public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { super.prepare(stormConf, context, collector); Config conf = new Config(); conf.putAll(stormConf); checkConfiguration(conf); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); long start = System.currentTimeMillis(); LOG.info("[Fetcher #{}] : starting at {}", taskID, sdf.format(start)); int metricsTimeBucketSecs = ConfUtils.getInt(conf, "fetcher.metrics.time.bucket.secs", 10); // Register a "MultiCountMetric" to count different events in this bolt // Storm will emit the counts every n seconds to a special bolt via a // system stream // The data can be accessed by registering a "MetricConsumer" in the // topology this.eventCounter = context.registerMetric("fetcher_counter", new MultiCountMetric(), metricsTimeBucketSecs); // create gauges context.registerMetric("activethreads", new IMetric() { @Override public Object getValueAndReset() { return activeThreads.get(); } }, metricsTimeBucketSecs); context.registerMetric("in_queues", new IMetric() { @Override public Object getValueAndReset() { return fetchQueues.inQueues.get(); } }, metricsTimeBucketSecs); context.registerMetric("num_queues", new IMetric() { @Override public Object getValueAndReset() { return fetchQueues.queues.size(); } }, metricsTimeBucketSecs); this.averagedMetrics = context.registerMetric("fetcher_average_perdoc", new MultiReducedMetric(new MeanReducer()), metricsTimeBucketSecs); this.perSecMetrics = context.registerMetric("fetcher_average_persec", new MultiReducedMetric(new PerSecondReducer()), metricsTimeBucketSecs); protocolFactory = new ProtocolFactory(conf); this.fetchQueues = new FetchItemQueues(conf); this.taskID = context.getThisTaskId(); int threadCount = ConfUtils.getInt(conf, "fetcher.threads.number", 10); for (int i = 0; i < threadCount; i++) { // spawn threads new FetcherThread(conf, i).start(); } sitemapsAutoDiscovery = ConfUtils.getBoolean(stormConf, "sitemap.discovery", false); maxNumberURLsInQueues = ConfUtils.getInt(conf, "fetcher.max.urls.in.queues", -1); /** * If set to a valid path e.g. /tmp/fetcher-dump-{port} on a worker * node, the content of the queues will be dumped to the logs for * debugging. The port number needs to match the one used by the * FetcherBolt instance. **/ String debugfiletriggerpattern = ConfUtils.getString(conf, "fetcherbolt.queue.debug.filepath"); if (StringUtils.isNotBlank(debugfiletriggerpattern)) { debugfiletrigger = new File( debugfiletriggerpattern.replaceAll("\\{port\\}", Integer.toString(context.getThisWorkerPort()))); } } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { super.declareOutputFields(declarer); declarer.declare(new Fields("url", "content", "metadata")); } @Override public void cleanup() { protocolFactory.cleanup(); } @Override public void execute(Tuple input) { boolean toomanyurlsinqueues = false; do { if (this.maxNumberURLsInQueues != -1 && (this.activeThreads.get() + this.fetchQueues.inQueues .get()) >= maxNumberURLsInQueues) { toomanyurlsinqueues = true; try { Thread.sleep(500); } catch (InterruptedException e) { LOG.error("Interrupted exception caught in execute method"); Thread.currentThread().interrupt(); } } LOG.info("[Fetcher #{}] Threads : {}\tqueues : {}\tin_queues : {}", taskID, this.activeThreads.get(), this.fetchQueues.queues.size(), this.fetchQueues.inQueues.get()); } while (toomanyurlsinqueues); // detect whether there is a file indicating that we should // dump the content of the queues to the log if (debugfiletrigger != null && debugfiletrigger.exists()) { LOG.info("Found trigger file {}", debugfiletrigger); logQueuesContent(); debugfiletrigger.delete(); } String urlString = input.getStringByField("url"); URL url; if (StringUtils.isBlank(urlString)) { LOG.info("[Fetcher #{}] Missing value for field url in tuple {}", taskID, input); // ignore silently collector.ack(input); return; } try { url = new URL(urlString); } catch (MalformedURLException e) { LOG.error("{} is a malformed URL", urlString); Metadata metadata = (Metadata) input.getValueByField("metadata"); if (metadata == null) { metadata = new Metadata(); } // Report to status stream and ack metadata.setValue(Constants.STATUS_ERROR_CAUSE, "malformed URL"); collector.emit( com.digitalpebble.stormcrawler.Constants.StatusStreamName, input, new Values(urlString, metadata, Status.ERROR)); collector.ack(input); return; } fetchQueues.addFetchItem(url, input); } private void logQueuesContent() { StringBuilder sb = new StringBuilder(); synchronized (fetchQueues.queues) { sb.append("\nNum queues : ").append(fetchQueues.queues.size()); Iterator<Entry<String, FetchItemQueue>> iterator = fetchQueues.queues .entrySet().iterator(); while (iterator.hasNext()) { Entry<String, FetchItemQueue> entry = iterator.next(); sb.append("\nQueue ID : ").append(entry.getKey()); FetchItemQueue fiq = entry.getValue(); sb.append("\t size : ").append(fiq.getQueueSize()); sb.append("\t in progress : ").append(fiq.getInProgressSize()); Iterator<FetchItem> urlsIter = fiq.queue.iterator(); while (urlsIter.hasNext()) { sb.append("\n\t").append(urlsIter.next().url); } } LOG.info("Dumping queue content {}", sb.toString()); } } }