/**
 * Licensed to DigitalPebble Ltd under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership. DigitalPebble licenses this file
 * to You under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package com.digitalpebble.storm.crawler.protocol.httpclient;

import java.io.IOException;
import java.util.Locale;

import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HeaderIterator;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.slf4j.LoggerFactory;

import backtype.storm.Config;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.protocol.AbstractHttpProtocol;
import com.digitalpebble.storm.crawler.protocol.HttpRobotRulesParser;
import com.digitalpebble.storm.crawler.protocol.ProtocolResponse;
import com.digitalpebble.storm.crawler.protocol.RobotRulesParser;
import com.digitalpebble.storm.crawler.util.ConfUtils;

import crawlercommons.robots.BaseRobotRules;

/** Uses Apache httpclient to handle http and https */
public class HttpProtocol extends AbstractHttpProtocol implements
        ResponseHandler<ProtocolResponse> {

    private static final org.slf4j.Logger LOG = LoggerFactory
            .getLogger(HttpProtocol.class);

    private final static PoolingHttpClientConnectionManager CONNECTION_MANAGER = new PoolingHttpClientConnectionManager();

    static {
        // Increase max total connections to 200
        CONNECTION_MANAGER.setMaxTotal(200);
        // Increase default max connections per route to 20
        CONNECTION_MANAGER.setDefaultMaxPerRoute(20);
    }

    private com.digitalpebble.storm.crawler.protocol.HttpRobotRulesParser robots;

    /**
     * TODO record response time in the meta data, see property
     * http.store.responsetime.
     */
    private boolean responseTime = true;

    // TODO find a way of limiting the content fetched
    private int maxContent;

    private boolean skipRobots = false;

    private HttpClientBuilder builder;

    private RequestConfig requestConfig;

    @Override
    public void configure(final Config conf) {
        this.maxContent = ConfUtils.getInt(conf, "http.content.limit",
                64 * 1024);

        String userAgent = getAgentString(
                ConfUtils.getString(conf, "http.agent.name"),
                ConfUtils.getString(conf, "http.agent.version"),
                ConfUtils.getString(conf, "http.agent.description"),
                ConfUtils.getString(conf, "http.agent.url"),
                ConfUtils.getString(conf, "http.agent.email"));

        this.responseTime = ConfUtils.getBoolean(conf,
                "http.store.responsetime", true);

        this.skipRobots = ConfUtils.getBoolean(conf, "http.skip.robots",
                false);

        robots = new HttpRobotRulesParser(conf);

        builder = HttpClients.custom().setUserAgent(userAgent)
                .setConnectionManager(CONNECTION_MANAGER)
                .setConnectionManagerShared(true).disableRedirectHandling();

        String proxyHost = ConfUtils.getString(conf, "http.proxy.host", null);
        int proxyPort = ConfUtils.getInt(conf, "http.proxy.port", 8080);

        boolean useProxy = (proxyHost != null && proxyHost.length() > 0);

        // use a proxy?
        if (useProxy) {
            HttpHost proxy = new HttpHost(proxyHost, proxyPort);
            DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(
                    proxy);
            builder.setRoutePlanner(routePlanner);
        }

        int timeout = ConfUtils.getInt(conf, "http.timeout", 10000);
        requestConfig = RequestConfig.custom().setSocketTimeout(timeout)
                .setConnectTimeout(timeout).build();
    }

    @Override
    public ProtocolResponse getProtocolOutput(String url, Metadata md)
            throws Exception {

        LOG.debug("HTTP connection manager stats {}",
                CONNECTION_MANAGER.getTotalStats());

        HttpGet httpget = new HttpGet(url);
        httpget.setConfig(requestConfig);

        if (md != null) {
            String ifModifiedSince = md.getFirstValue("cachedLastModified");
            if (StringUtils.isNotBlank(ifModifiedSince)) {
                httpget.addHeader("If-Modified-Since", ifModifiedSince);
            }

            String ifNoneMatch = md.getFirstValue("cachedEtag");
            if (StringUtils.isNotBlank(ifNoneMatch)) {
                httpget.addHeader("If-None-Match", ifNoneMatch);
            }
        }

        // no need to release the connection explicitly as this is handled
        // automatically. The client itself must be closed though.
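        // Building a client per request is cheap here: the expensive pooled
        // CONNECTION_MANAGER was marked as shared via
        // setConnectionManagerShared(true) in configure(), so it is not shut
        // down when the try-with-resources block closes the client.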
        try (CloseableHttpClient httpclient = builder.build()) {
            return httpclient.execute(httpget, this);
        }
    }

    @Override
    public ProtocolResponse handleResponse(HttpResponse response)
            throws ClientProtocolException, IOException {

        int status = response.getStatusLine().getStatusCode();

        Metadata metadata = new Metadata();
        HeaderIterator iter = response.headerIterator();
        while (iter.hasNext()) {
            Header header = iter.nextHeader();
            metadata.addValue(header.getName().toLowerCase(Locale.ROOT),
                    header.getValue());
        }

        // TODO find a way of limiting by maxContent
        byte[] bytes = EntityUtils.toByteArray(response.getEntity());

        return new ProtocolResponse(bytes, status, metadata);
    }

    @Override
    public BaseRobotRules getRobotRules(String url) {
        if (this.skipRobots)
            return RobotRulesParser.EMPTY_RULES;
        return robots.getRobotRulesSet(this, url);
    }

    public static void main(String[] args) throws Exception {
        HttpProtocol protocol = new HttpProtocol();
        String url = args[0];

        Config conf = ConfUtils.loadConf(args[1]);
        protocol.configure(conf);

        if (!protocol.skipRobots) {
            BaseRobotRules rules = protocol.getRobotRules(url);
            System.out.println("is allowed : " + rules.isAllowed(url));
        }

        Metadata md = new Metadata();
        ProtocolResponse response = protocol.getProtocolOutput(url, md);

        System.out.println(url);
        System.out.println(response.getMetadata());
        System.out.println(response.getStatusCode());
        System.out.println(response.getContent().length);
    }
}
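/*
 * Usage sketch for the main() harness above, which fetches a single URL and
 * prints the robots verdict, headers, status code, and content length. The
 * jar and config file names below are placeholders for illustration, not
 * defined by this class:
 *
 *   java -cp storm-crawler.jar \
 *       com.digitalpebble.storm.crawler.protocol.httpclient.HttpProtocol \
 *       "http://www.example.com/" crawler-conf.yaml
 */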