/**
 * Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License. You may obtain a copy of the
 * License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.protocol;

import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang.StringUtils;
import org.apache.storm.Config;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.cache.Cache;
import com.google.common.primitives.Ints;

import crawlercommons.robots.BaseRobotRules;

/**
 * This class is used for parsing the robots.txt files of URLs belonging to the HTTP protocol. It
 * extends the generic {@link RobotRulesParser} class and contains an HTTP-specific implementation
 * for obtaining the robots.txt file.
 */
public class HttpRobotRulesParser extends RobotRulesParser {

    protected boolean allowForbidden = false;

    HttpRobotRulesParser() {
    }

    public HttpRobotRulesParser(Config conf) {
        setConf(conf);
    }

    @Override
    public void setConf(Config conf) {
        super.setConf(conf);
        allowForbidden = ConfUtils.getBoolean(conf, "http.robots.403.allow", true);
    }

    /**
     * Composes a unique key to store and access robot rules in the cache for a given URL.
     */
    protected static String getCacheKey(URL url) {
        String protocol = url.getProtocol().toLowerCase(Locale.ROOT);
        String host = url.getHost().toLowerCase(Locale.ROOT);
        int port = url.getPort();
        if (port == -1) {
            port = url.getDefaultPort();
        }
        /*
         * Robot rules apply only to the host, protocol, and port where the robots.txt is hosted
         * (cf. NUTCH-1752). Consequently, the cache key combines all three.
         */
        return protocol + ":" + host + ":" + port;
    }
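
    /*
     * Illustrative examples of the cache keys produced by getCacheKey(); the URLs are
     * hypothetical. When no port is given, URL#getDefaultPort() supplies the protocol's
     * standard port, so URLs with and without an explicit default port share one entry:
     *
     *   http://example.com/some/page   -> "http:example.com:80"
     *   https://Example.COM:8443/page  -> "https:example.com:8443"
     */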
    /**
     * Get the rules from robots.txt which apply to the given {@code url}. Robot rules are cached
     * for a unique combination of host, protocol, and port. If no rules are found in the cache,
     * an HTTP request is sent to fetch {@code protocol://host:port/robots.txt}. The robots.txt
     * file is then parsed and the rules are cached to avoid re-fetching and re-parsing it.
     *
     * @param http
     *            The {@link Protocol} object
     * @param url
     *            URL robots.txt applies to
     *
     * @return {@link BaseRobotRules} holding the rules from robots.txt
     */
    @Override
    public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {

        String cacheKey = getCacheKey(url);

        // check the error cache first
        BaseRobotRules robotRules = ERRORCACHE.getIfPresent(cacheKey);
        if (robotRules != null) {
            return robotRules;
        }

        // now try the proper cache
        robotRules = CACHE.getIfPresent(cacheKey);
        if (robotRules != null) {
            return robotRules;
        }

        boolean cacheRule = true;
        URL redir = null;

        LOG.debug("Cache miss {} for {}", cacheKey, url);

        // keep track of the number of bytes fetched per request
        List<Integer> bytesFetched = new LinkedList<>();

        try {
            ProtocolResponse response = http.getProtocolOutput(
                    new URL(url, "/robots.txt").toString(), Metadata.empty);
            int code = response.getStatusCode();

            bytesFetched.add(response.getContent() != null ? response.getContent().length : 0);

            // try one level of redirection
            if (code == 301 || code == 302 || code == 307 || code == 308) {
                String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
                if (StringUtils.isNotBlank(redirection)) {
                    if (!redirection.startsWith("http")) {
                        // RFC says it should be absolute, but apparently it isn't always
                        redir = new URL(url, redirection);
                    } else {
                        redir = new URL(redirection);
                    }
                    response = http.getProtocolOutput(redir.toString(), Metadata.empty);
                    code = response.getStatusCode();
                    bytesFetched.add(
                            response.getContent() != null ? response.getContent().length : 0);
                }
            }

            if (code == 200) {
                // found rules: parse them
                String ct = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
                robotRules = parseRules(url.toString(), response.getContent(), ct, agentNames);
            } else if (code == 403 && !allowForbidden) {
                // use forbid all
                robotRules = FORBID_ALL_RULES;
            } else if (code >= 500) {
                // server error: fall back to the default rules but don't cache them as a success
                cacheRule = false;
                robotRules = EMPTY_RULES;
            } else {
                // use default rules
                robotRules = EMPTY_RULES;
            }
        } catch (Throwable t) {
            LOG.info("Couldn't get robots.txt for {} : {}", url, t.toString());
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }

        RobotRules cached = new RobotRules(robotRules);

        Cache<String, BaseRobotRules> cacheToUse = CACHE;
        String cacheName = "success";
        if (!cacheRule) {
            cacheToUse = ERRORCACHE;
            cacheName = "error";
        }
        LOG.debug("Caching robots for {} under key {} in cache {}", url, cacheKey, cacheName);
        cacheToUse.put(cacheKey, cached);

        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
            // cache also for the redirected host
            String keyredir = getCacheKey(redir);
            LOG.debug("Caching robots for {} under key {} in cache {}", redir, keyredir,
                    cacheName);
            cacheToUse.put(keyredir, cached);
        }

        RobotRules live = new RobotRules(robotRules);
        live.setContentLengthFetched(Ints.toArray(bytesFetched));
        return live;
    }
}
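
/*
 * Usage sketch (illustrative): given a Storm Config "conf" and an initialised HTTP
 * Protocol implementation "protocol" (both assumed here, not provided by this class),
 * the parser could be driven as follows:
 *
 *   HttpRobotRulesParser parser = new HttpRobotRulesParser(conf);
 *   BaseRobotRules rules = parser.getRobotRulesSet(protocol,
 *           new URL("http://example.com/some/page"));
 *   if (rules.isAllowed("http://example.com/some/page")) {
 *       // fetch the page, honouring rules.getCrawlDelay() if it is set
 *   }
 *
 * isAllowed(String) and getCrawlDelay() are part of BaseRobotRules in crawler-commons.
 */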