package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils; import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; /** * The http downloader based on HttpClient. * * @author code4crafter@gmail.com <br> * @since 0.1.0 */ @ThreadSafe public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); private ProxyProvider proxyProvider; private boolean responseHeader = true; public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } public void setProxyProvider(ProxyProvider proxyProvider) { this.proxyProvider = proxyProvider; } private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } } return httpClient; } @Override public Page download(Request request, Task task) { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } logger.debug("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, task.getSite().getCharset(), httpResponse, task); onSuccess(request); logger.debug("downloading page success {}", page); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request); return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxyProvider != null && proxy != null) { proxyProvider.returnProxy(proxy, page, task); } } } @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = getResponseContent(charset, httpResponse); Page page = new Page(); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } return page; } private String getResponseContent(String charset, HttpResponse httpResponse) throws IOException { if (charset == null) { byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String htmlCharset = getHtmlCharset(httpResponse, contentBytes); if (htmlCharset != null) { return new String(contentBytes, htmlCharset); } else { logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); return new String(contentBytes); } } else { return IOUtils.toString(httpResponse.getEntity().getContent(), charset); } } private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); } }