/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package focusedCrawler.crawler.crawlercommons.fetcher.http;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLHandshakeException;
import javax.net.ssl.TrustManager;
import javax.net.ssl.TrustManagerFactory;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HeaderElementIterator;
import org.apache.http.HttpClientConnection;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpException;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.HttpInetConnection;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.NoHttpResponseException;
import org.apache.http.ProtocolException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.RedirectException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.ConnectionKeepAliveStrategy;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicHeaderElementIterator;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.protocol.HttpCoreContext;
import org.apache.http.protocol.HttpRequestExecutor;
import org.apache.tika.metadata.Metadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchReason;
import focusedCrawler.crawler.crawlercommons.fetcher.BadProtocolFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.BaseFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.EncodingUtils;
import focusedCrawler.crawler.crawlercommons.fetcher.EncodingUtils.ExpandedResult;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.crawler.crawlercommons.fetcher.IOFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.Payload;
import focusedCrawler.crawler.crawlercommons.fetcher.RedirectFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.RedirectFetchException.RedirectExceptionReason;
import focusedCrawler.crawler.crawlercommons.fetcher.UrlFetchException;
@SuppressWarnings("serial")
public class SimpleHttpFetcher extends BaseHttpFetcher {
private static Logger LOGGER = LoggerFactory.getLogger(SimpleHttpFetcher.class);

// We tried 10 seconds for all of these, but got a number of connection/read
// timeouts for
// sites that would have eventually worked, so bumping it up to 30 seconds.
private static final int DEFAULT_SOCKET_TIMEOUT = 30 * 1000;

// As of HttpComponents v.4.2.1, this will also include timeout needed to
// get Connection from Pool.
// From initial comment of the deprecated 'CONNECTION_POOL_TIMEOUT' static
// element:
// "This normally doesn't ever hit this timeout, since we manage the number
// of
// fetcher threads to be <= the maxThreads value used to configure a
// HttpFetcher. However the limit of connections/host can cause a timeout,
// when redirects cause multiple threads to hit the same domain.
// We therefore jack this right up."
private static final int DEFAULT_CONNECTION_TIMEOUT = 100 * 1000;

private static final int DEFAULT_MAX_THREADS = 1;

// Size of the chunks used when reading the response body in doRequest().
private static final int BUFFER_SIZE = 8 * 1024;

private static final int DEFAULT_MAX_RETRY_COUNT = 10;

// Initial capacity of the buffer that accumulates the response body.
private static final int DEFAULT_BYTEARRAY_SIZE = 32 * 1024;

// Use the same values as Firefox (except that we don't accept deflate,
// which we're not sure is implemented correctly - see the notes in
// EncodingUtils/EncodingUtilsTest for more details).
private static final String DEFAULT_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
private static final String DEFAULT_ACCEPT_CHARSET = "utf-8,ISO-8859-1;q=0.7,*;q=0.7";
private static final String DEFAULT_ACCEPT_ENCODING = "x-gzip, gzip";

// Keys used to access data in the Http execution context.
private static final String PERM_REDIRECT_CONTEXT_KEY = "perm-redirect";
private static final String REDIRECT_COUNT_CONTEXT_KEY = "redirect-count";
private static final String HOST_ADDRESS = "host-address";

// To be polite, set it small; if we use it, we will use less than a second
// delay between subsequent fetches
private static final int DEFAULT_KEEP_ALIVE_DURATION = 5000;

private IdleConnectionMonitorThread monitor;

// Per-thread cookie store: avoids contention on a single synchronized
// cookie store when many fetcher threads are active (see doRequest()).
private ThreadLocal<CookieStore> localCookieStore = new ThreadLocal<CookieStore>() {
    protected CookieStore initialValue() {
        CookieStore cookieStore = new LocalCookieStore();
        return cookieStore;
    }
};

// SSLContext algorithm names, presumably tried in order when building the
// https socket factory — TODO confirm against createSSLConnectionSocketFactory().
private static final String SSL_CONTEXT_NAMES[] = { "TLS", "Default", "SSL", };

// Mime-types treated as text by isTextMimeType(); truncated content of
// these types is kept rather than aborted.
private static final String TEXT_MIME_TYPES[] = { "text/html", "application/x-asp", "application/xhtml+xml", "application/vnd.wap.xhtml+xml", };

private HttpVersion _httpVersion;
private int _socketTimeout;
private int _connectionTimeout;
// Timeout for obtaining a connection from the pool; not set by the
// constructors, so it stays 0 unless setConnectionRequestTimeout() is called.
private int _connectionRequestTimeout;
private int _maxRetryCount;
// Optional proxy; consulted by init() when non-null.
private HttpHost proxy;

// Created lazily by init(); transient so serialization skips them.
transient private CloseableHttpClient _httpClient;
transient private PoolingHttpClientConnectionManager _connectionManager;
/**
 * Retry handler that retries up to a fixed limit: always retries when the
 * server dropped the connection ({@link NoHttpResponseException}), never
 * retries SSL handshake failures, and otherwise retries only idempotent
 * (non-entity-enclosing) requests.
 */
private static class MyRequestRetryHandler implements HttpRequestRetryHandler {

    /** Maximum number of attempts before giving up (immutable once built). */
    private final int _maxRetryCount;

    public MyRequestRetryHandler(int maxRetryCount) {
        _maxRetryCount = maxRetryCount;
    }

    @Override
    public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace("Decide about retry #" + executionCount + " for exception " + exception.getMessage());
        }

        if (executionCount >= _maxRetryCount) {
            // Do not retry if over max retry count
            return false;
        } else if (exception instanceof NoHttpResponseException) {
            // Retry if the server dropped connection on us
            return true;
        } else if (exception instanceof SSLHandshakeException) {
            // Do not retry on SSL handshake exception
            return false;
        }

        // Requests with an enclosed entity (e.g. POST/PUT) are treated as
        // non-idempotent and are not retried.
        HttpRequest request = (HttpRequest) context.getAttribute(HttpCoreContext.HTTP_REQUEST);
        boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
        // Retry if the request is considered idempotent
        return idempotent;
    }
}
/**
 * RedirectException that carries the redirect target URI and the policy
 * reason the redirect was disallowed, so the caller can convert it into a
 * {@link RedirectFetchException} with full context.
 */
private static class MyRedirectException extends RedirectException {

    private final URI _uri;
    private final RedirectExceptionReason _reason;

    public MyRedirectException(String message, URI uri, RedirectExceptionReason reason) {
        super(message);
        _uri = uri;
        _reason = reason;
    }

    /** @return the URI the server redirected to */
    public URI getUri() {
        return _uri;
    }

    /** @return why the redirect was disallowed by the active RedirectMode */
    public RedirectExceptionReason getReason() {
        return _reason;
    }
}
/**
 * Redirect strategy that records the last permanent redirect and the number
 * of redirects followed in the request context, and enforces the configured
 * RedirectMode by throwing {@link MyRedirectException} for disallowed
 * redirect status codes.
 */
private static class MyRedirectStrategy extends DefaultRedirectStrategy {

    /** Which redirect status codes may be followed (immutable once built). */
    private final RedirectMode _redirectMode;

    public MyRedirectStrategy(RedirectMode redirectMode) {
        super();
        _redirectMode = redirectMode;
    }

    @Override
    public URI getLocationURI(final HttpRequest request, final HttpResponse response, final HttpContext context) throws ProtocolException {
        URI result = super.getLocationURI(request, response, context);

        // HACK - some sites return a redirect with an explicit port number
        // that's the same as the default port (e.g. 80 for http), and then
        // when you use this to make the next request, the presence of the
        // port in the domain triggers another redirect, so you fail with a
        // circular redirect error. Avoid that by converting the port number
        // to -1 in that case.
        //
        // Detailed scenario:
        // http://www.test.com/MyPage ->
        // http://www.test.com:80/MyRedirectedPage ->
        // http://www.test.com/MyRedirectedPage
        // We can save bandwidth:
        if (result.getScheme().equalsIgnoreCase("http") && (result.getPort() == 80)) {
            try {
                result = new URI(result.getScheme(), result.getUserInfo(), result.getHost(), -1, result.getPath(), result.getQuery(), result.getFragment());
            } catch (URISyntaxException e) {
                LOGGER.warn("Unexpected exception removing port from URI", e);
            }
        }

        // Keep track of the number of redirects (autoboxing instead of the
        // deprecated new Integer(...) constructor).
        Integer count = (Integer) context.getAttribute(REDIRECT_COUNT_CONTEXT_KEY);
        if (count == null) {
            count = 0;
        }
        context.setAttribute(REDIRECT_COUNT_CONTEXT_KEY, count + 1);

        // Record the last permanent redirect
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY) {
            context.setAttribute(PERM_REDIRECT_CONTEXT_KEY, result);
        }

        // Decide whether this particular redirect is disallowed by the mode.
        RedirectExceptionReason reason = null;
        if (_redirectMode == RedirectMode.FOLLOW_NONE) {
            switch (statusCode) {
                case HttpStatus.SC_MOVED_TEMPORARILY:
                case HttpStatus.SC_TEMPORARY_REDIRECT:
                    reason = RedirectExceptionReason.TEMP_REDIRECT_DISALLOWED;
                    break;
                case HttpStatus.SC_MOVED_PERMANENTLY:
                    reason = RedirectExceptionReason.PERM_REDIRECT_DISALLOWED;
                    break;
                case HttpStatus.SC_SEE_OTHER:
                    reason = RedirectExceptionReason.SEE_OTHER_DISALLOWED;
                    break;
                default:
            }
        } else if (_redirectMode == RedirectMode.FOLLOW_TEMP) {
            // Temporary redirects are allowed; only permanent and see-other
            // redirects are rejected.
            switch (statusCode) {
                case HttpStatus.SC_MOVED_PERMANENTLY:
                    reason = RedirectExceptionReason.PERM_REDIRECT_DISALLOWED;
                    break;
                case HttpStatus.SC_SEE_OTHER:
                    reason = RedirectExceptionReason.SEE_OTHER_DISALLOWED;
                    break;
                default:
            }
        }

        if (reason != null) {
            throw new MyRedirectException("RedirectMode disallowed redirect: " + _redirectMode, result, reason);
        }

        return result;
    }
}
/**
 * HttpExecutor to record host address in context.
 *
 * Before delegating to the standard executor, saves the remote IP address
 * of the (already opened) connection under the HOST_ADDRESS context key,
 * so doRequest() can report which host actually served the response.
 */
static class MyHttpRequestExecutor extends HttpRequestExecutor {
    @Override
    public HttpResponse execute(HttpRequest request, HttpClientConnection conn, HttpContext context)
            throws IOException, HttpException {
        // The connection manager hands us an HttpInetConnection here, which
        // exposes the remote InetAddress of the open socket.
        HttpInetConnection connection = (HttpInetConnection) conn;
        context.setAttribute(HOST_ADDRESS, connection.getRemoteAddress().getHostAddress());
        return super.execute(request, conn, context);
    }
}
/**
 * Trust manager that accepts any client/server certificate chain (the
 * checkClientTrusted/checkServerTrusted callbacks are deliberate no-ops),
 * while delegating getAcceptedIssuers() to the platform's standard trust
 * manager. This disables certificate validation, which is intentional for
 * crawling sites with self-signed or otherwise invalid certificates.
 */
private static class DummyX509TrustManager implements X509TrustManager {

    /** Standard trust manager used only to answer getAcceptedIssuers(). */
    private final X509TrustManager standardTrustManager;

    /**
     * Constructor for DummyX509TrustManager.
     *
     * @param keystore keystore used to initialize the default trust manager
     *        factory; null selects the JVM's default trust store
     * @throws NoSuchAlgorithmException if no X509 trust manager is available
     * @throws KeyStoreException if the factory cannot be initialized
     */
    public DummyX509TrustManager(KeyStore keystore) throws NoSuchAlgorithmException, KeyStoreException {
        super();
        String algo = TrustManagerFactory.getDefaultAlgorithm();
        TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
        factory.init(keystore);
        TrustManager[] trustmanagers = factory.getTrustManagers();
        if (trustmanagers.length == 0) {
            throw new NoSuchAlgorithmException(algo + " trust manager not supported");
        }
        this.standardTrustManager = (X509TrustManager) trustmanagers[0];
    }

    /**
     * Always trusts the client. Not part of the X509TrustManager interface;
     * kept public for backwards compatibility.
     */
    @SuppressWarnings("unused")
    public boolean isClientTrusted(X509Certificate[] certificates) {
        return true;
    }

    /**
     * Always trusts the server. Not part of the X509TrustManager interface;
     * kept public for backwards compatibility.
     */
    @SuppressWarnings("unused")
    public boolean isServerTrusted(X509Certificate[] certificates) {
        return true;
    }

    /** @see javax.net.ssl.X509TrustManager#getAcceptedIssuers() */
    @Override
    public X509Certificate[] getAcceptedIssuers() {
        return this.standardTrustManager.getAcceptedIssuers();
    }

    /** No-op: accepts any client certificate chain. */
    @Override
    public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
        // do nothing
    }

    /** No-op: accepts any server certificate chain. */
    @Override
    public void checkServerTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
        // do nothing
    }
}
public static class MyConnectionKeepAliveStrategy implements ConnectionKeepAliveStrategy {
public long getKeepAliveDuration(HttpResponse response, HttpContext context) {
if (response == null) {
throw new IllegalArgumentException("HTTP response may not be null");
}
HeaderElementIterator it = new BasicHeaderElementIterator(response.headerIterator(HTTP.CONN_KEEP_ALIVE));
while (it.hasNext()) {
HeaderElement he = it.nextElement();
String param = he.getName();
String value = he.getValue();
if (value != null && param.equalsIgnoreCase("timeout")) {
try {
return Long.parseLong(value) * 1000;
} catch (NumberFormatException ignore) {
}
}
}
return DEFAULT_KEEP_ALIVE_DURATION;
}
}
/**
 * Daemon thread that, every 30 seconds, closes expired connections and
 * connections that have been idle for more than 30 seconds in the given
 * connection manager. Stop it by interrupting the thread.
 */
public static class IdleConnectionMonitorThread extends Thread {

    private final HttpClientConnectionManager connMgr;

    public IdleConnectionMonitorThread(HttpClientConnectionManager connMgr) {
        super();
        this.connMgr = connMgr;
        // Daemon so this thread never prevents JVM shutdown.
        this.setDaemon(true);
    }

    @Override
    public void run() {
        while (!interrupted()) {
            // Close expired connections
            connMgr.closeExpiredConnections();
            // Optionally, close connections
            // that have been idle longer than 30 sec
            connMgr.closeIdleConnections(30, TimeUnit.SECONDS);
            try {
                Thread.sleep(30000);
            } catch (InterruptedException e) {
                // Restore the interrupt status; interrupted() above will
                // then observe it and terminate the loop.
                Thread.currentThread().interrupt();
            }
        }
    }
}
/**
 * Creates a fetcher with the default number of threads (1).
 *
 * @param userAgent user agent string sent with every request
 */
public SimpleHttpFetcher(UserAgent userAgent) {
    this(DEFAULT_MAX_THREADS, userAgent);
}
/**
 * Creates a fetcher with the given thread/connection count and user agent.
 * Socket timeout, connection timeout and retry count start at the class
 * defaults; the HttpClient itself is created lazily on first use.
 *
 * @param maxThreads maximum number of fetcher threads / pooled connections
 * @param userAgent user agent string sent with every request
 */
public SimpleHttpFetcher(int maxThreads, UserAgent userAgent) {
    super(maxThreads, userAgent);

    _httpVersion = HttpVersion.HTTP_1_1;
    _socketTimeout = DEFAULT_SOCKET_TIMEOUT;
    _connectionTimeout = DEFAULT_CONNECTION_TIMEOUT;
    _maxRetryCount = DEFAULT_MAX_RETRY_COUNT;
    // NOTE(review): _connectionRequestTimeout is not initialized here, so it
    // stays 0 unless setConnectionRequestTimeout() is called — confirm that
    // a zero connection-request timeout is the intended default.

    // Just to be explicit, we rely on lazy initialization of this so that
    // we don't have to worry about serializing it.
    _httpClient = null;
}
/** @return the HTTP protocol version used for requests */
public HttpVersion getHttpVersion() {
    return _httpVersion;
}

/** Sets the HTTP protocol version; only allowed before the client is built. */
public void setHttpVersion(HttpVersion httpVersion) {
    failIfInitialized("HTTP version");
    _httpVersion = httpVersion;
}

/** @return socket (read) timeout in milliseconds */
public int getSocketTimeout() {
    return _socketTimeout;
}

/** Sets the socket timeout (ms); only allowed before the client is built. */
public void setSocketTimeout(int socketTimeoutInMs) {
    failIfInitialized("socket timeout");
    _socketTimeout = socketTimeoutInMs;
}

/** @return connection timeout in milliseconds */
public int getConnectionTimeout() {
    return _connectionTimeout;
}

/** Sets the connect timeout (ms); only allowed before the client is built. */
public void setConnectionTimeout(int connectionTimeoutInMs) {
    failIfInitialized("connection timeout");
    _connectionTimeout = connectionTimeoutInMs;
}

/** @return timeout (ms) for obtaining a connection from the pool */
public int getConnectionRequestTimeout() {
    return _connectionRequestTimeout;
}

/** Sets the pool checkout timeout (ms); only allowed before the client is built. */
public void setConnectionRequestTimeout(int _connectionRequestTimeoutInMs) {
    failIfInitialized("connection request timeout");
    _connectionRequestTimeout = _connectionRequestTimeoutInMs;
}

/** @return maximum number of request retries */
public int getMaxRetryCount() {
    return _maxRetryCount;
}

/**
 * Sets the maximum retry count. Unlike the other setters this may be
 * called at any time; the value is only consulted when the client is built.
 */
public void setMaxRetryCount(int maxRetryCount) {
    _maxRetryCount = maxRetryCount;
}

// Guard shared by the setters above: configuration is frozen once the
// underlying HttpClient has been created. The exception message matches the
// historical per-setter messages exactly.
private void failIfInitialized(String setting) {
    if (_httpClient != null) {
        throw new IllegalStateException("Can't change " + setting + " after HttpClient has been initialized");
    }
}
/**
 * Fetches the given URL with an HTTP GET, after validating that it is a
 * well-formed http/https URL.
 *
 * @throws UrlFetchException if the URL is malformed
 * @throws BadProtocolFetchException if the protocol is not http/https
 */
@Override
public FetchedResult get(String url, Payload payload) throws BaseFetchException {
    String protocol;
    try {
        protocol = new URL(url).getProtocol();
    } catch (MalformedURLException e) {
        throw new UrlFetchException(url, e.getMessage());
    }

    if (!"http".equals(protocol) && !"https".equals(protocol)) {
        throw new BadProtocolFetchException(url);
    }

    return request(new HttpGet(), url, payload);
}
/**
 * Initializes the HttpClient if needed, performs the request, and logs
 * fetch failures at debug level before rethrowing them.
 *
 * @param request HTTP request object to execute
 * @param url target URL
 * @param payload user-supplied payload passed through to the result
 * @return the fetched result
 * @throws BaseFetchException if the fetch fails or is aborted
 */
private FetchedResult request(HttpRequestBase request, String url, Payload payload) throws BaseFetchException {
    init();

    try {
        return doRequest(request, url, payload);
    } catch (AbortedFetchException e) {
        // Don't bother reporting that we bailed because the mime-type
        // wasn't one that we wanted.
        if (e.getAbortReason() != AbortedFetchReason.INVALID_MIMETYPE) {
            LOGGER.debug("Exception fetching {} {}", url, e.getMessage());
        }
        throw e;
    } catch (BaseFetchException e) {
        LOGGER.debug("Exception fetching {} {}", url, e.getMessage());
        throw e;
    }
}
/**
 * Convenience fetch of a URL via HTTP GET with an empty payload.
 *
 * @param url URL to fetch
 * @return the fetched result
 * @throws BaseFetchException if the fetch fails or is aborted
 */
public FetchedResult fetch(String url) throws BaseFetchException {
    return fetch(new HttpGet(), url, new Payload());
}
/**
 * Public fetch entry point: initializes the HttpClient if needed, performs
 * the request, and traces failures before rethrowing. Unlike get(), this
 * does not validate the URL's protocol first.
 *
 * @param request HTTP request object to execute
 * @param url target URL
 * @param payload user-supplied payload passed through to the result
 * @return the fetched result
 * @throws BaseFetchException if the fetch fails or is aborted
 */
public FetchedResult fetch(HttpRequestBase request, String url, Payload payload) throws BaseFetchException {
    init();

    try {
        return doRequest(request, url, payload);
    } catch (BaseFetchException e) {
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace("Exception fetching {} {}", url, e.getMessage());
        }
        throw e;
    }
}
/**
 * Executes the request and builds a {@link FetchedResult}.
 *
 * Runs the request through the shared HttpClient with a per-thread cookie
 * store, collects headers/status/redirect info from the context, enforces
 * the valid mime-type policy, then reads the body up to the configured max
 * content size while enforcing the minimum response rate, and finally
 * decompresses gzip-encoded content.
 *
 * @param request HTTP request object to execute (aborted on failure paths)
 * @param url target URL
 * @param payload user-supplied payload passed through to the result
 * @return the fetched result (content possibly truncated for text types)
 * @throws BaseFetchException on I/O errors, invalid URLs, disallowed
 *         redirects, aborted fetches (mime-type, size, slow response), etc.
 */
private FetchedResult doRequest(HttpRequestBase request, String url, Payload payload) throws BaseFetchException {
    LOGGER.trace("Fetching " + url);

    HttpResponse response;
    long readStartTime;
    Metadata headerMap = new Metadata();
    String redirectedUrl = null;
    String newBaseUrl = null;
    int numRedirects = 0;
    // needAbort tracks whether the request must be aborted in the finally
    // block below (true until we know the connection can be reused).
    boolean needAbort = true;
    String contentType = "";
    String mimeType = "";
    String hostAddress = null;
    int statusCode = HttpStatus.SC_INTERNAL_SERVER_ERROR;
    String reasonPhrase = null;

    // Create a local instance of cookie store, and bind to local context
    // Without this we get killed w/lots of threads, due to sync() on single
    // cookie store.
    HttpContext localContext = new BasicHttpContext();
    CookieStore cookieStore = localCookieStore.get();
    localContext.setAttribute(HttpClientContext.COOKIE_STORE, cookieStore);

    StringBuilder fetchTrace = null;
    if (LOGGER.isTraceEnabled()) {
        fetchTrace = new StringBuilder("Fetched url: " + url);
    }

    try {
        request.setURI(new URI(url));

        readStartTime = System.currentTimeMillis();
        response = _httpClient.execute(request, localContext);

        Header[] headers = response.getAllHeaders();
        for (Header header : headers) {
            headerMap.add(header.getName(), header.getValue());
        }

        statusCode = response.getStatusLine().getStatusCode();
        reasonPhrase = response.getStatusLine().getReasonPhrase();

        if (LOGGER.isTraceEnabled()) {
            fetchTrace.append("; status code: " + statusCode);
            if (headerMap.get(HttpHeaders.CONTENT_LENGTH) != null) {
                fetchTrace.append("; Content-Length: " + headerMap.get(HttpHeaders.CONTENT_LENGTH));
            }

            if (headerMap.get(HttpHeaders.LOCATION) != null) {
                fetchTrace.append("; Location: " + headerMap.get(HttpHeaders.LOCATION));
            }
        }

        // Deliberately NOT failing on non-2xx status codes here; the status
        // code is returned to the caller in the FetchedResult instead.
        // if ((statusCode < 200) || (statusCode >= 300)) {
        // // We can't just check against SC_OK, as some wackos return 201,
        // // 202, etc
        // throw new HttpFetchException(url, "Error fetching " + url + " due to \"" + reasonPhrase + "\"", statusCode, headerMap);
        // }

        redirectedUrl = extractRedirectedUrl(url, localContext);

        URI permRedirectUri = (URI) localContext.getAttribute(PERM_REDIRECT_CONTEXT_KEY);
        if (permRedirectUri != null) {
            newBaseUrl = permRedirectUri.toURL().toExternalForm();
        }

        Integer redirects = (Integer) localContext.getAttribute(REDIRECT_COUNT_CONTEXT_KEY);
        if (redirects != null) {
            numRedirects = redirects.intValue();
        }

        // Set by MyHttpRequestExecutor; its absence means the executor was
        // not installed/ran, which is a setup error.
        hostAddress = (String) (localContext.getAttribute(HOST_ADDRESS));
        if (hostAddress == null) {
            throw new UrlFetchException(url, "Host address not saved in context");
        }

        Header cth = response.getFirstHeader(HttpHeaders.CONTENT_TYPE);
        if (cth != null) {
            contentType = cth.getValue();
        }

        // Check if we should abort due to mime-type filtering. Note that
        // this will fail if the server
        // doesn't report a mime-type, but that's how we want it as this
        // configuration is typically
        // used when only a subset of parsers are installed/enabled, so we
        // don't want the auto-detect
        // code in Tika to get triggered & try to process an unsupported
        // type. If you want unknown
        // mime-types from the server to be processed, set "" as one of the
        // valid mime-types in
        // FetcherPolicy.
        mimeType = getMimeTypeFromContentType(contentType);
        Set<String> mimeTypes = getValidMimeTypes();
        if ((mimeTypes != null) && (mimeTypes.size() > 0)) {
            if (!mimeTypes.contains(mimeType)) {
                throw new AbortedFetchException(url, "Invalid mime-type: " + mimeType, AbortedFetchReason.INVALID_MIMETYPE);
            }
        }

        needAbort = false;
    } catch (ClientProtocolException e) {
        // Oleg guarantees that no abort is needed in the case of an
        // IOException
        // (which is is a subclass of)
        needAbort = false;

        // If the root case was a "too many redirects" error, we want to map
        // this to a specific
        // exception that contains the final redirect.
        if (e.getCause() instanceof MyRedirectException) {
            MyRedirectException mre = (MyRedirectException) e.getCause();
            String redirectUrl = url;

            try {
                redirectUrl = mre.getUri().toURL().toExternalForm();
            } catch (MalformedURLException e2) {
                LOGGER.warn("Invalid URI saved during redirect handling: " + mre.getUri());
            }

            throw new RedirectFetchException(url, redirectUrl, mre.getReason());
        } else if (e.getCause() instanceof RedirectException) {
            LOGGER.error(e.getMessage());
            throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS);
        } else {
            throw new IOFetchException(url, e);
        }
    } catch (IOException e) {
        // Oleg guarantees that no abort is needed in the case of an
        // IOException
        needAbort = false;
        throw new IOFetchException(url, e);
    } catch (URISyntaxException e) {
        throw new UrlFetchException(url, e.getMessage());
    } catch (IllegalStateException e) {
        throw new UrlFetchException(url, e.getMessage());
    } catch (BaseFetchException e) {
        throw e;
    } catch (Exception e) {
        // Map anything else to a generic IOFetchException
        // TODO KKr - create generic fetch exception
        throw new IOFetchException(url, new IOException(e));
    } finally {
        safeAbort(needAbort, request);
    }

    // Figure out how much data we want to try to fetch.
    int maxContentSize = getMaxContentSize(mimeType);
    int targetLength = maxContentSize;
    boolean truncated = false;
    String contentLengthStr = headerMap.get(HttpHeaders.CONTENT_LENGTH);
    if (contentLengthStr != null) {
        try {
            int contentLength = Integer.parseInt(contentLengthStr);
            if (contentLength > targetLength) {
                truncated = true;
            } else {
                targetLength = contentLength;
            }
        } catch (NumberFormatException e) {
            // Ignore (and log) invalid content length values.
            LOGGER.warn("Invalid content length in header: " + contentLengthStr);
        }
    }

    // Now finally read in response body, up to targetLength bytes.
    // Note that entity might be null, for zero length responses.
    byte[] content = new byte[0];
    long readRate = 0;
    HttpEntity entity = response.getEntity();
    needAbort = true;

    if (entity != null) {
        InputStream in = null;

        try {
            in = entity.getContent();
            byte[] buffer = new byte[BUFFER_SIZE];
            int bytesRead = 0;
            int totalRead = 0;
            ByteArrayOutputStream out = new ByteArrayOutputStream(DEFAULT_BYTEARRAY_SIZE);

            int readRequests = 0;
            int minResponseRate = getMinResponseRate();
            // TODO KKr - we need to monitor the rate while reading a
            // single block. Look at HttpClient
            // metrics support for how to do this. Once we fix this, fix
            // the test to read a smaller (< 20K)
            // chuck of data.
            while ((totalRead < targetLength) && ((bytesRead = in.read(buffer, 0, Math.min(buffer.length, targetLength - totalRead))) != -1)) {
                readRequests += 1;
                totalRead += bytesRead;
                out.write(buffer, 0, bytesRead);

                // Assume read time is at least one millisecond, to avoid
                // DBZ exception.
                long totalReadTime = Math.max(1, System.currentTimeMillis() - readStartTime);
                readRate = (totalRead * 1000L) / totalReadTime;

                // Don't bail on the first read cycle, as we can get a
                // hiccup starting out.
                // Also don't bail if we've read everything we need.
                if ((readRequests > 1) && (totalRead < targetLength) && (readRate < minResponseRate)) {
                    throw new AbortedFetchException(url, "Slow response rate of " + readRate + " bytes/sec", AbortedFetchReason.SLOW_RESPONSE_RATE);
                }

                // Check to see if we got interrupted, but don't clear the
                // interrupted flag.
                if (Thread.currentThread().isInterrupted()) {
                    throw new AbortedFetchException(url, AbortedFetchReason.INTERRUPTED);
                }
            }

            content = out.toByteArray();
            // Abort the connection (instead of reusing it) if we didn't
            // drain the response body.
            needAbort = truncated || (in.available() > 0);
        } catch (IOException e) {
            // We don't need to abort if there's an IOException
            throw new IOFetchException(url, e);
        } finally {
            safeAbort(needAbort, request);
            safeClose(in);
        }
    }

    // Toss truncated image content.
    if ((truncated) && (!isTextMimeType(mimeType))) {
        throw new AbortedFetchException(url, "Truncated image", AbortedFetchReason.CONTENT_SIZE);
    }

    // Now see if we need to uncompress the content.
    String contentEncoding = headerMap.get(HttpHeaders.CONTENT_ENCODING);
    if (contentEncoding != null) {
        if (LOGGER.isTraceEnabled()) {
            fetchTrace.append("; Content-Encoding: " + contentEncoding);
        }

        // TODO KKr We might want to just decompress a truncated gzip
        // containing text (since we have a max content size to save us
        // from any gzip corruption). We might want to break the following
        // out into a separate method, by the way (if not refactor this
        // entire monolithic method).
        //
        try {
            if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
                if (truncated) {
                    throw new AbortedFetchException(url, "Truncated compressed data", AbortedFetchReason.CONTENT_SIZE);
                } else {
                    ExpandedResult expandedResult = EncodingUtils.processGzipEncoded(content, maxContentSize);
                    truncated = expandedResult.isTruncated();
                    if ((truncated) && (!isTextMimeType(mimeType))) {
                        throw new AbortedFetchException(url, "Truncated decompressed image", AbortedFetchReason.CONTENT_SIZE);
                    } else {
                        content = expandedResult.getExpanded();
                        if (LOGGER.isTraceEnabled()) {
                            fetchTrace.append("; unzipped to " + content.length + " bytes");
                        }
                    }
                    // } else if ("deflate".equals(contentEncoding)) {
                    // content =
                    // EncodingUtils.processDeflateEncoded(content);
                    // if (LOGGER.isTraceEnabled()) {
                    // fetchTrace.append("; inflated to " + content.length +
                    // " bytes");
                    // }
                }
            }
        } catch (IOException e) {
            throw new IOFetchException(url, e);
        }
    }

    // Finally dump out the trace msg we've been building.
    if (LOGGER.isTraceEnabled()) {
        LOGGER.trace(fetchTrace.toString());
    }

    // TODO KKr - Save truncated flag in FetchedResult/FetchedDatum.
    return new FetchedResult(url, redirectedUrl, System.currentTimeMillis(), headerMap, content, contentType, (int) readRate, payload, newBaseUrl, numRedirects, hostAddress, statusCode,
            reasonPhrase);
}
/**
 * Returns true if the given mime-type is one of the HTML-like text
 * mime-types in TEXT_MIME_TYPES (truncated content of these types is kept).
 */
private boolean isTextMimeType(String mimeType) {
    for (int i = 0; i < TEXT_MIME_TYPES.length; i++) {
        if (TEXT_MIME_TYPES[i].equals(mimeType)) {
            return true;
        }
    }
    return false;
}
/**
 * Reconstructs the URL of the final request recorded in the HttpClient
 * context. This gets triggered by HttpClient when the redirect count was
 * exceeded, so the recorded host/request reflect the last hop.
 *
 * @param url original request URL, returned if reconstruction fails
 * @param localContext per-request HttpClient execution context
 * @return the final (redirected) URL, or {@code url} on failure
 */
private String extractRedirectedUrl(String url, HttpContext localContext) {
    HttpHost host = (HttpHost) localContext.getAttribute(HttpCoreContext.HTTP_TARGET_HOST);
    HttpUriRequest finalRequest = (HttpUriRequest) localContext.getAttribute(HttpCoreContext.HTTP_REQUEST);

    try {
        URL hostUrl = new URI(host.toURI()).toURL();
        return new URL(hostUrl, finalRequest.getURI().toString()).toExternalForm();
    } catch (MalformedURLException | URISyntaxException e) {
        // Include the cause so bad redirect targets can be diagnosed.
        LOGGER.warn("Invalid host/uri specified in final fetch: " + host + finalRequest.getURI(), e);
        return url;
    }
}
/** Best-effort close of a possibly-null resource; any error is swallowed. */
private static void safeClose(Closeable o) {
    if (o == null) {
        return;
    }
    try {
        o.close();
    } catch (Exception ignored) {
        // Deliberately ignored: close failures are not actionable here.
    }
}
/**
 * Aborts the request (so its connection is not reused) when needAbort is
 * set; any failure during the abort itself is swallowed.
 */
private static void safeAbort(boolean needAbort, HttpRequestBase request) {
    if (!needAbort || request == null) {
        return;
    }
    try {
        request.abort();
    } catch (Throwable ignored) {
        // Deliberately ignored: abort is best-effort cleanup.
    }
}
private void init() {
if (_httpClient == null) {
synchronized (SimpleHttpFetcher.class) {
if (_httpClient != null)
return;
final HttpClientBuilder httpClientBuilder = HttpClientBuilder.create();
final RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
// Set the socket and connection timeout to be something
// reasonable.
requestConfigBuilder.setSocketTimeout(_socketTimeout);
requestConfigBuilder.setConnectTimeout(_connectionTimeout);
requestConfigBuilder.setConnectionRequestTimeout(_connectionRequestTimeout);
if(proxy != null){
LOGGER.info("Configuring fetcher to use proxy: "+proxy.toURI());
httpClientBuilder.setProxy(proxy);
}
/*
* CoreConnectionPNames.TCP_NODELAY='http.tcp.nodelay':
* determines whether Nagle's algorithm is to be used. Nagle's
* algorithm tries to conserve bandwidth by minimizing the
* number of segments that are sent. When applications wish to
* decrease network latency and increase performance, they can
* disable Nagle's algorithm (that is enable TCP_NODELAY. Data
* will be sent earlier, at the cost of an increase in bandwidth
* consumption. This parameter expects a value of type
* java.lang.Boolean. If this parameter is not set, TCP_NODELAY
* will be enabled (no delay).
*/
// FIXME Could not find this parameter in http-client version 4.5
// HttpConnectionParams.setTcpNoDelay(params, true);
// HttpProtocolParams.setVersion(params, _httpVersion);
httpClientBuilder.setUserAgent(_userAgentString);
// HttpProtocolParams.setContentCharset(params, "UTF-8");
// HttpProtocolParams.setHttpElementCharset(params, "UTF-8");
/*
* CoreProtocolPNames.USE_EXPECT_CONTINUE=
* 'http.protocol.expect-continue': activates the Expect:
* 100-Continue handshake for the entity enclosing methods. The
* purpose of the Expect: 100-Continue handshake is to allow the
* client that is sending a request message with a request body
* to determine if the origin server is willing to accept the
* request (based on the request headers) before the client
* sends the request body. The use of the Expect: 100-continue
* handshake can result in a noticeable performance improvement
* for entity enclosing requests (such as POST and PUT) that
* require the target server's authentication. The Expect:
* 100-continue handshake should be used with caution, as it may
* cause problems with HTTP servers and proxies that do not
* support HTTP/1.1 protocol. This parameter expects a value of
* type java.lang.Boolean. If this parameter is not set,
* HttpClient will not attempt to use the handshake.
*/
requestConfigBuilder.setExpectContinueEnabled(true);
/*
* CoreProtocolPNames.WAIT_FOR_CONTINUE=
* 'http.protocol.wait-for-continue': defines the maximum period
* of time in milliseconds the client should spend waiting for a
* 100-continue response. This parameter expects a value of type
* java.lang.Integer. If this parameter is not set HttpClient
* will wait 3 seconds for a confirmation before resuming the
* transmission of the request body.
*/
// FIXME Could not find this parameter in http-client version 4.5
// params.setIntParameter(CoreProtocolPNames.WAIT_FOR_CONTINUE, 5000);
// FIXME Could not find this parameter in http-client version 4.5
// CookieSpecParamBean cookieParams = new CookieSpecParamBean(params);
// cookieParams.setSingleHeader(false);
// Create and initialize connection socket factory registry
RegistryBuilder<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create();
registry.register("http", PlainConnectionSocketFactory.getSocketFactory());
SSLConnectionSocketFactory sf = createSSLConnectionSocketFactory();
if (sf != null) {
registry.register("https", sf);
} else {
LOGGER.warn("No valid SSLContext found for https");
}
_connectionManager = new PoolingHttpClientConnectionManager(registry.build());
_connectionManager.setMaxTotal(_maxThreads);
_connectionManager.setDefaultMaxPerRoute(getMaxConnectionsPerHost());
/*
* CoreConnectionPNames.STALE_CONNECTION_CHECK=
* 'http.connection.stalecheck': determines whether stale
* connection check is to be used. Disabling stale connection
* check may result in a noticeable performance improvement (the
* check can cause up to 30 millisecond overhead per request) at
* the risk of getting an I/O error when executing a request
* over a connection that has been closed at the server side.
* This parameter expects a value of type java.lang.Boolean. For
* performance critical operations the check should be disabled.
* If this parameter is not set, the stale connection check will
* be performed before each request execution.
*
* We don't want I/O exceptions in case the server doesn't
* support the Keep-Alive option; our client by default always
* tries to use keep-alive.
*/
// Even with stale checking enabled, a connection can "go stale"
// between the check and the next request. So we still need to
// handle the case of a closed socket (from the server side),
// and disabling this check improves performance.
// Stale connections will be checked in a separate monitor thread
_connectionManager.setValidateAfterInactivity(-1);
httpClientBuilder.setConnectionManager(_connectionManager);
httpClientBuilder.setRetryHandler(new MyRequestRetryHandler(_maxRetryCount));
httpClientBuilder.setRedirectStrategy(new MyRedirectStrategy(getRedirectMode()));
httpClientBuilder.setRequestExecutor(new MyHttpRequestExecutor());
// FUTURE KKr - support authentication
// FIXME Could not find this parameter in http-client version 4.5
// HttpClientParams.setAuthenticating(params, false);
requestConfigBuilder.setCookieSpec(CookieSpecs.DEFAULT);
if (getMaxRedirects() == 0) {
requestConfigBuilder.setRedirectsEnabled(false);
} else {
requestConfigBuilder.setRedirectsEnabled(true);
requestConfigBuilder.setMaxRedirects(getMaxRedirects());
}
// Set up default headers. This helps us get back from servers
// what we want.
HashSet<Header> defaultHeaders = new HashSet<Header>();
defaultHeaders.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, getAcceptLanguage()));
defaultHeaders.add(new BasicHeader(HttpHeaders.ACCEPT_CHARSET, DEFAULT_ACCEPT_CHARSET));
defaultHeaders.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, DEFAULT_ACCEPT_ENCODING));
defaultHeaders.add(new BasicHeader(HttpHeaders.ACCEPT, DEFAULT_ACCEPT));
httpClientBuilder.setDefaultHeaders(defaultHeaders);
httpClientBuilder.setKeepAliveStrategy(new MyConnectionKeepAliveStrategy());
monitor = new IdleConnectionMonitorThread(_connectionManager);
monitor.start();
httpClientBuilder.setDefaultRequestConfig(requestConfigBuilder.build());
_httpClient = httpClientBuilder.build();
}
}
}
/**
 * Builds an HTTPS socket factory that accepts any certificate and any
 * hostname — appropriate for a crawler that must fetch from arbitrary,
 * often misconfigured, sites.
 * <p>
 * Tries each SSL context name in {@code SSL_CONTEXT_NAMES} in order and
 * uses the first one that can be instantiated and initialized.
 *
 * @return a permissive {@link SSLConnectionSocketFactory}, or {@code null}
 *         if no usable {@link SSLContext} could be created
 */
private SSLConnectionSocketFactory createSSLConnectionSocketFactory() {
    for (String contextName : SSL_CONTEXT_NAMES) {
        try {
            SSLContext sslContext = SSLContext.getInstance(contextName);
            // Trust-all manager: certificate validation is deliberately
            // disabled for crawling.
            sslContext.init(null, new TrustManager[] { new DummyX509TrustManager(null) }, null);
            // Skip hostname verification too — cert/host mismatches are
            // common on crawled sites.
            return new SSLConnectionSocketFactory(sslContext, NoopHostnameVerifier.INSTANCE);
        } catch (NoSuchAlgorithmException e) {
            LOGGER.debug("SSLContext algorithm not available: " + contextName);
        } catch (Exception e) {
            LOGGER.debug("SSLContext can't be initialized: " + contextName, e);
        }
    }
    return null;
}
/**
 * Configures an HTTP proxy through which all fetches will be routed.
 *
 * @param scheme proxy scheme (e.g. "http")
 * @param host   proxy host name or IP address
 * @param port   proxy port
 */
public void setProxy(String scheme, String host, int port) {
    HttpHost proxyHost = new HttpHost(host, port, scheme);
    this.proxy = proxyHost;
}
/**
 * Requests that any in-progress fetch be aborted.
 * <p>
 * NOTE(review): currently a no-op — aborting an active request is not
 * implemented, so callers cannot actually interrupt a running fetch.
 */
@Override
public void abort() {
    // TODO Actually try to abort
}
/**
 * Best-effort cleanup when this fetcher is garbage collected: stops the
 * idle-connection monitor thread, shuts down the pooled connection
 * manager, and closes the underlying HTTP client.
 * <p>
 * Each step is null-guarded: if initialization never ran (or failed partway),
 * {@code monitor} and {@code _connectionManager} can still be null, and an
 * unchecked exception thrown from a finalizer is silently swallowed by the
 * JVM — an NPE here would abort the remaining cleanup without any trace.
 */
@Override
protected void finalize() {
    if (monitor != null) {
        monitor.interrupt();
    }
    if (_connectionManager != null) {
        _connectionManager.shutdown();
    }
    // closeQuietly tolerates null and swallows IOExceptions on close.
    IOUtils.closeQuietly(_httpClient);
    _httpClient = null;
}
/**
 * Sets the value sent in the {@code User-Agent} request header for all
 * subsequent fetches.
 *
 * @param userAgentString the user agent string to report to servers
 */
public void setUserAgentString(String userAgentString) {
    _userAgentString = userAgentString;
}
}