/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.protocol.http.api; // JDK imports import java.io.IOException; import java.net.InetAddress; import java.net.URL; import java.net.UnknownHostException; import java.util.HashMap; import java.util.LinkedList; // Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; // Nutch imports import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.ProtocolOutput; import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.protocol.RobotRules; import org.apache.nutch.util.GZIPUtils; import org.apache.nutch.util.DeflateUtils; import org.apache.nutch.util.LogUtil; // Hadoop imports import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; /** * @author Jérôme Charron */ public abstract class HttpBase implements Protocol { public static final int BUFFER_SIZE = 8 * 1024; private static final byte[] EMPTY_CONTENT = new byte[0]; private RobotRulesParser robots = null; /** The proxy hostname. */ protected String proxyHost = null; /** The proxy port. */ protected int proxyPort = 8080; /** Indicates if a proxy is used */ protected boolean useProxy = false; /** The network timeout in millisecond */ protected int timeout = 10000; /** The length limit for downloaded content, in bytes. */ protected int maxContent = 64 * 1024; /** The number of times a thread will delay when trying to fetch a page. */ protected int maxDelays = 3; /** * The maximum number of threads that should be allowed * to access a host at one time. */ protected int maxThreadsPerHost = 1; /** * The number of seconds the fetcher will delay between * successive requests to the same server. */ protected long serverDelay = 1000; /** The Nutch 'User-Agent' request header */ protected String userAgent = getAgentString( "NutchCVS", null, "Nutch", "http://lucene.apache.org/nutch/bot.html", "nutch-agent@lucene.apache.org"); /** * Maps from host to a Long naming the time it should be unblocked. * The Long is zero while the host is in use, then set to now+wait when * a request finishes. This way only one thread at a time accesses a * host. */ private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap(); /** * Maps a host to the number of threads accessing that host. */ private static HashMap THREADS_PER_HOST_COUNT = new HashMap(); /** * Queue of blocked hosts. This contains all of the non-zero entries * from BLOCKED_ADDR_TO_TIME, ordered by increasing time. */ private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList(); /** The default logger */ private final static Log LOGGER = LogFactory.getLog(HttpBase.class); /** The specified logger */ private Log logger = LOGGER; /** The nutch configuration */ private Configuration conf = null; /** Do we block by IP addresses or by hostnames? */ private boolean byIP = true; /** Do we use HTTP/1.1? */ protected boolean useHttp11 = false; /** Skip page if Crawl-Delay longer than this value. */ protected long maxCrawlDelay = -1L; /** Plugin should handle host blocking internally. */ protected boolean checkBlocking = true; /** Plugin should handle robot rules checking internally. */ protected boolean checkRobots = true; /** Creates a new instance of HttpBase */ public HttpBase() { this(null); } /** Creates a new instance of HttpBase */ public HttpBase(Log logger) { if (logger != null) { this.logger = logger; } robots = new RobotRulesParser(); } // Inherited Javadoc public void setConf(Configuration conf) { this.conf = conf; this.proxyHost = conf.get("http.proxy.host"); this.proxyPort = conf.getInt("http.proxy.port", 8080); this.useProxy = (proxyHost != null && proxyHost.length() > 0); this.timeout = conf.getInt("http.timeout", 10000); this.maxContent = conf.getInt("http.content.limit", 64 * 1024); this.maxDelays = conf.getInt("http.max.delays", 3); this.maxThreadsPerHost = conf.getInt("fetcher.threads.per.host", 1); this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000); this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) * 1000); // backward-compatible default setting this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true); this.useHttp11 = conf.getBoolean("http.useHttp11", false); this.robots.setConf(conf); this.checkBlocking = conf.getBoolean(Protocol.CHECK_BLOCKING, true); this.checkRobots = conf.getBoolean(Protocol.CHECK_ROBOTS, true); logConf(); } // Inherited Javadoc public Configuration getConf() { return this.conf; } public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { String urlString = url.toString(); try { URL u = new URL(urlString); if (checkRobots) { try { if (!robots.isAllowed(this, u)) { return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url)); } } catch (Throwable e) { // XXX Maybe bogus: assume this is allowed. if (logger.isTraceEnabled()) { logger.trace("Exception checking robot rules for " + url + ": " + e); } } } long crawlDelay = robots.getCrawlDelay(this, u); long delay = crawlDelay > 0 ? crawlDelay : serverDelay; if (checkBlocking && maxCrawlDelay >= 0 && delay > maxCrawlDelay) { // skip this page, otherwise the thread would block for too long. LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max=" + (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000)); return new ProtocolOutput(null, ProtocolStatus.STATUS_WOULDBLOCK); } String host = null; if (checkBlocking) { try { host = blockAddr(u, delay); } catch (BlockedException be) { return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED); } } Response response; try { response = getResponse(u, datum, false); // make a request } finally { if (checkBlocking) unblockAddr(host, delay); } int code = response.getCode(); byte[] content = response.getContent(); Content c = new Content(u.toString(), u.toString(), (content == null ? EMPTY_CONTENT : content), response.getHeader("Content-Type"), response.getHeaders(), this.conf); if (code == 200) { // got a good response return new ProtocolOutput(c); // return it } else if (code == 410) { // page is gone return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url)); } else if (code >= 300 && code < 400) { // handle redirect String location = response.getHeader("Location"); // some broken servers, such as MS IIS, use lowercase header name... if (location == null) location = response.getHeader("location"); if (location == null) location = ""; u = new URL(u, location); int protocolStatusCode; switch (code) { case 300: // multiple choices, preferred value in Location protocolStatusCode = ProtocolStatus.MOVED; break; case 301: // moved permanently case 305: // use proxy (Location is URL of proxy) protocolStatusCode = ProtocolStatus.MOVED; break; case 302: // found (temporarily moved) case 303: // see other (redirect after POST) case 307: // temporary redirect protocolStatusCode = ProtocolStatus.TEMP_MOVED; break; case 304: // not modified protocolStatusCode = ProtocolStatus.NOTMODIFIED; break; default: protocolStatusCode = ProtocolStatus.MOVED; } // handle this in the higher layer. return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u)); } else if (code == 400) { // bad request, mark as GONE if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); } return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); } else if (code == 401) { // requires authorization, but no valid auth provided. if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); } return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString)); } else if (code == 404) { return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u)); } else if (code == 410) { // permanently GONE return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); } else { return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u)); } } catch (Throwable e) { e.printStackTrace(LogUtil.getErrorStream(logger)); return new ProtocolOutput(null, new ProtocolStatus(e)); } } /* -------------------------- * * </implementation:Protocol> * * -------------------------- */ public String getProxyHost() { return proxyHost; } public int getProxyPort() { return proxyPort; } public boolean useProxy() { return useProxy; } public int getTimeout() { return timeout; } public int getMaxContent() { return maxContent; } public int getMaxDelays() { return maxDelays; } public int getMaxThreadsPerHost() { return maxThreadsPerHost; } public long getServerDelay() { return serverDelay; } public String getUserAgent() { return userAgent; } public boolean getUseHttp11() { return useHttp11; } private String blockAddr(URL url, long crawlDelay) throws ProtocolException { String host; if (byIP) { try { InetAddress addr = InetAddress.getByName(url.getHost()); host = addr.getHostAddress(); } catch (UnknownHostException e) { // unable to resolve it, so don't fall back to host name throw new HttpException(e); } } else { host = url.getHost(); if (host == null) throw new HttpException("Unknown host for url: " + url); host = host.toLowerCase(); } int delays = 0; while (true) { cleanExpiredServerBlocks(); // free held addresses Long time; synchronized (BLOCKED_ADDR_TO_TIME) { time = (Long) BLOCKED_ADDR_TO_TIME.get(host); if (time == null) { // address is free // get # of threads already accessing this addr Integer counter = (Integer)THREADS_PER_HOST_COUNT.get(host); int count = (counter == null) ? 0 : counter.intValue(); count++; // increment & store THREADS_PER_HOST_COUNT.put(host, new Integer(count)); if (count >= maxThreadsPerHost) { BLOCKED_ADDR_TO_TIME.put(host, new Long(0)); // block it } return host; } } if (delays == maxDelays) throw new BlockedException("Exceeded http.max.delays: retry later."); long done = time.longValue(); long now = System.currentTimeMillis(); long sleep = 0; if (done == 0) { // address is still in use sleep = crawlDelay; // wait at least delay } else if (now < done) { // address is on hold sleep = done - now; // wait until its free } try { Thread.sleep(sleep); } catch (InterruptedException e) {} delays++; } } private void unblockAddr(String host, long crawlDelay) { synchronized (BLOCKED_ADDR_TO_TIME) { int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(host)).intValue(); if (addrCount == 1) { THREADS_PER_HOST_COUNT.remove(host); BLOCKED_ADDR_QUEUE.addFirst(host); BLOCKED_ADDR_TO_TIME.put (host, new Long(System.currentTimeMillis() + crawlDelay)); } else { THREADS_PER_HOST_COUNT.put(host, new Integer(addrCount - 1)); } } } private static void cleanExpiredServerBlocks() { synchronized (BLOCKED_ADDR_TO_TIME) { for (int i = BLOCKED_ADDR_QUEUE.size() - 1; i >= 0; i--) { String host = (String) BLOCKED_ADDR_QUEUE.get(i); long time = ((Long) BLOCKED_ADDR_TO_TIME.get(host)).longValue(); if (time <= System.currentTimeMillis()) { BLOCKED_ADDR_TO_TIME.remove(host); BLOCKED_ADDR_QUEUE.remove(i); } } } } private static String getAgentString(String agentName, String agentVersion, String agentDesc, String agentURL, String agentEmail) { if ( (agentName == null) || (agentName.trim().length() == 0) ) { // TODO : NUTCH-258 if (LOGGER.isFatalEnabled()) { LOGGER.fatal("No User-Agent string set (http.agent.name)!"); } } StringBuffer buf= new StringBuffer(); buf.append(agentName); if (agentVersion != null) { buf.append("/"); buf.append(agentVersion); } if ( ((agentDesc != null) && (agentDesc.length() != 0)) || ((agentEmail != null) && (agentEmail.length() != 0)) || ((agentURL != null) && (agentURL.length() != 0)) ) { buf.append(" ("); if ((agentDesc != null) && (agentDesc.length() != 0)) { buf.append(agentDesc); if ( (agentURL != null) || (agentEmail != null) ) buf.append("; "); } if ((agentURL != null) && (agentURL.length() != 0)) { buf.append(agentURL); if (agentEmail != null) buf.append("; "); } if ((agentEmail != null) && (agentEmail.length() != 0)) buf.append(agentEmail); buf.append(")"); } return buf.toString(); } protected void logConf() { if (logger.isInfoEnabled()) { logger.info("http.proxy.host = " + proxyHost); logger.info("http.proxy.port = " + proxyPort); logger.info("http.timeout = " + timeout); logger.info("http.content.limit = " + maxContent); logger.info("http.agent = " + userAgent); logger.info(Protocol.CHECK_BLOCKING + " = " + checkBlocking); logger.info(Protocol.CHECK_ROBOTS + " = " + checkRobots); if (checkBlocking) { logger.info("fetcher.server.delay = " + serverDelay); logger.info("http.max.delays = " + maxDelays); } } } public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException { if (LOGGER.isTraceEnabled()) { LOGGER.trace("uncompressing...."); } byte[] content; if (getMaxContent() >= 0) { content = GZIPUtils.unzipBestEffort(compressed, getMaxContent()); } else { content = GZIPUtils.unzipBestEffort(compressed); } if (content == null) throw new IOException("unzipBestEffort returned null"); if (LOGGER.isTraceEnabled()) { LOGGER.trace("fetched " + compressed.length + " bytes of compressed content (expanded to " + content.length + " bytes) from " + url); } return content; } public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException { if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); } byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent()); if (content == null) throw new IOException("inflateBestEffort returned null"); if (LOGGER.isTraceEnabled()) { LOGGER.trace("fetched " + compressed.length + " bytes of compressed content (expanded to " + content.length + " bytes) from " + url); } return content; } protected static void main(HttpBase http, String[] args) throws Exception { boolean verbose = false; String url = null; String usage = "Usage: Http [-verbose] [-timeout N] url"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-timeout")) { // found -timeout option http.timeout = Integer.parseInt(args[++i]) * 1000; } else if (args[i].equals("-verbose")) { // found -verbose option verbose = true; } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); } else // root is required parameter url = args[i]; } // if (verbose) { // LOGGER.setLevel(Level.FINE); // } ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum()); Content content = out.getContent(); System.out.println("Status: " + out.getStatus()); if (content != null) { System.out.println("Content Type: " + content.getContentType()); System.out.println("Content Length: " + content.getMetadata().get(Response.CONTENT_LENGTH)); System.out.println("Content:"); String text = new String(content.getContent()); System.out.println(text); } } protected abstract Response getResponse(URL url, CrawlDatum datum, boolean followRedirects) throws ProtocolException, IOException; public RobotRules getRobotRules(Text url, CrawlDatum datum) { return robots.getRobotRulesSet(this, url); } }