/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.protocol.httpclient; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.List; import java.util.Locale; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.mutable.MutableBoolean; import org.apache.http.Header; import org.apache.http.HeaderIterator; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.StatusLine; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.ResponseHandler; import org.apache.http.client.config.AuthSchemes; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.DefaultProxyRoutePlanner; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.util.Args; import org.apache.http.util.ByteArrayBuffer; import org.apache.storm.Config; import org.slf4j.LoggerFactory; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.persistence.Status; import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol; import com.digitalpebble.stormcrawler.protocol.ProtocolResponse; import com.digitalpebble.stormcrawler.util.ConfUtils; import com.digitalpebble.stormcrawler.util.CookieConverter; import org.apache.http.cookie.Cookie; /** * Uses Apache httpclient to handle http and https **/ public class HttpProtocol extends AbstractHttpProtocol implements ResponseHandler<ProtocolResponse> { private static final org.slf4j.Logger LOG = LoggerFactory .getLogger(HttpProtocol.class); private static final PoolingHttpClientConnectionManager CONNECTION_MANAGER = new PoolingHttpClientConnectionManager(); private int maxContent; private HttpClientBuilder builder; private RequestConfig requestConfig; public static final String RESPONSE_COOKIES_HEADER = "set-cookie"; @Override public void configure(final Config conf) { super.configure(conf); // allow up to 200 connections or same as the number of threads used for // fetching int maxFetchThreads = ConfUtils.getInt(conf, "fetcher.threads.number", 200); CONNECTION_MANAGER.setMaxTotal(maxFetchThreads); CONNECTION_MANAGER.setDefaultMaxPerRoute(20); this.maxContent = ConfUtils.getInt(conf, "http.content.limit", -1); String userAgent = getAgentString( ConfUtils.getString(conf, "http.agent.name"), ConfUtils.getString(conf, "http.agent.version"), ConfUtils.getString(conf, "http.agent.description"), ConfUtils.getString(conf, "http.agent.url"), ConfUtils.getString(conf, "http.agent.email")); builder = HttpClients.custom().setUserAgent(userAgent) .setConnectionManager(CONNECTION_MANAGER) .setConnectionManagerShared(true).disableRedirectHandling() .disableAutomaticRetries(); int timeout = ConfUtils.getInt(conf, "http.timeout", 10000); RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() .setSocketTimeout(timeout).setConnectTimeout(timeout) .setConnectionRequestTimeout(timeout) .setCookieSpec(CookieSpecs.STANDARD); String proxyHost = ConfUtils.getString(conf, "http.proxy.host", null); int proxyPort = ConfUtils.getInt(conf, "http.proxy.port", 8080); boolean useProxy = proxyHost != null && proxyHost.length() > 0; // use a proxy? if (useProxy) { String proxyUser = ConfUtils.getString(conf, "http.proxy.user", null); String proxyPass = ConfUtils.getString(conf, "http.proxy.pass", null); if (StringUtils.isNotBlank(proxyUser) && StringUtils.isNotBlank(proxyPass)) { List<String> authSchemes = new ArrayList<>(); // Can make configurable and add more in future authSchemes.add(AuthSchemes.BASIC); requestConfigBuilder.setProxyPreferredAuthSchemes(authSchemes); BasicCredentialsProvider basicAuthCreds = new BasicCredentialsProvider(); basicAuthCreds.setCredentials(new AuthScope(proxyHost, proxyPort), new UsernamePasswordCredentials(proxyUser, proxyPass)); builder.setDefaultCredentialsProvider(basicAuthCreds); } HttpHost proxy = new HttpHost(proxyHost, proxyPort); DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner( proxy); builder.setRoutePlanner(routePlanner); } requestConfig = requestConfigBuilder.build(); } @Override public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Exception { LOG.debug("HTTP connection manager stats {}", CONNECTION_MANAGER.getTotalStats()); HttpGet httpget = new HttpGet(url); httpget.setConfig(requestConfig); if (md != null) { String lastModified = md.getFirstValue("last-modified"); if (StringUtils.isNotBlank(lastModified)) { httpget.addHeader("If-Modified-Since", lastModified); } String ifNoneMatch = md.getFirstValue("etag"); if (StringUtils.isNotBlank(ifNoneMatch)) { httpget.addHeader("If-None-Match", ifNoneMatch); } if (useCookies) { addCookiesToRequest(httpget, md); } } // no need to release the connection explicitly as this is handled // automatically. The client itself must be closed though. try (CloseableHttpClient httpclient = builder.build()) { return httpclient.execute(httpget, this); } } private void addCookiesToRequest(HttpGet httpget, Metadata md) { String[] cookieStrings = md.getValues(RESPONSE_COOKIES_HEADER); if (cookieStrings != null && cookieStrings.length > 0) { List<Cookie> cookies; try { cookies = CookieConverter.getCookies(cookieStrings, httpget .getURI().toURL()); for (Cookie c : cookies) { httpget.addHeader("Cookie", c.getName() + "=" + c.getValue()); } } catch (MalformedURLException e) { // Bad url , nothing to do } } } @Override public ProtocolResponse handleResponse(HttpResponse response) throws IOException { StatusLine statusLine = response.getStatusLine(); int status = statusLine.getStatusCode(); StringBuilder verbatim = new StringBuilder(); if (storeHTTPHeaders) { verbatim.append(statusLine.toString()).append("\r\n"); } Metadata metadata = new Metadata(); HeaderIterator iter = response.headerIterator(); while (iter.hasNext()) { Header header = iter.nextHeader(); if (storeHTTPHeaders) { verbatim.append(header.toString()).append("\r\n"); } metadata.addValue(header.getName().toLowerCase(Locale.ROOT), header.getValue()); } MutableBoolean trimmed = new MutableBoolean(); byte[] bytes = new byte[] {}; if (!Status.REDIRECTION.equals(Status.fromHTTPCode(status))) { bytes = HttpProtocol.toByteArray(response.getEntity(), maxContent, trimmed); if (trimmed.booleanValue()) { metadata.setValue("http.trimmed", "true"); LOG.warn("HTTP content trimmed to {}", bytes.length); } } if (storeHTTPHeaders) { verbatim.append("\r\n"); metadata.setValue("_response.headers_", verbatim.toString()); } return new ProtocolResponse(bytes, status, metadata); } private static final byte[] toByteArray(final HttpEntity entity, int maxContent, MutableBoolean trimmed) throws IOException { if (entity == null) return new byte[] {}; final InputStream instream = entity.getContent(); if (instream == null) { return null; } Args.check(entity.getContentLength() <= Integer.MAX_VALUE, "HTTP entity too large to be buffered in memory"); int reportedLength = (int) entity.getContentLength(); // set minimal size for buffer if (reportedLength < 0) { reportedLength = 4096; } // avoid init of too large a buffer when we will trim anyway if (maxContent != -1 && reportedLength > maxContent) { reportedLength = maxContent; } final ByteArrayBuffer buffer = new ByteArrayBuffer(reportedLength); final byte[] tmp = new byte[4096]; int lengthRead; while ((lengthRead = instream.read(tmp)) != -1) { // check whether we need to trim if (maxContent != -1 && buffer.length() + lengthRead > maxContent) { buffer.append(tmp, 0, buffer.capacity() - buffer.length()); trimmed.setValue(true); break; } buffer.append(tmp, 0, lengthRead); } return buffer.toByteArray(); } public static void main(String args[]) throws Exception { HttpProtocol.main(new HttpProtocol(), args); } }