package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.Header; import org.apache.http.HeaderElement; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.methods.HttpGet; import org.apache.log4j.Logger; import us.codecraft.webmagic.Constant; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; import java.util.HashSet; import java.util.Set; /** * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。<br> * * @author code4crafter@gmail.com <br> * Date: 13-4-21 Time: 下午12:15 */ public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); private int poolSize = 1; /** * 直接下载页面的简便方法 * * @param url * @return */ public Html download(String url) { Page page = download(new Request(url), null); return (Html) page.getHtml(); } @Override public Page download(Request request, Task task) { Site site = null; if (task != null) { site = task.getSite(); } int retryTimes = 0; Set<Integer> acceptStatCode; String charset = null; if (site != null) { retryTimes = site.getRetryTimes(); acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); } else { acceptStatCode = new HashSet<Integer>(); acceptStatCode.add(200); } logger.info("downloading page " + request.getUrl()); HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site); try { HttpGet httpGet = new HttpGet(request.getUrl()); HttpResponse httpResponse = null; int tried = 0; boolean retry; do { try { httpResponse = httpClient.execute(httpGet); retry = false; } catch (IOException e) { tried++; if (tried > retryTimes) { logger.warn("download page " + request.getUrl() + " error", e); return null; } logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); retry = true; } } while (retry); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { // charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = UrlUtils.getCharset(value); } // handleGzip(httpResponse); return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); } } catch (Exception e) { logger.warn("download page " + request.getUrl() + " error", e); } return null; } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); // set http response value page.putHttpResponse(Constant.STATUS_CODE, httpResponse.getStatusLine().getStatusCode() + ""); Header[] headers = httpResponse.getAllHeaders(); for (Header header : headers) { page.putHttpResponse(header.getName(), header.getValue()); } return page; } @Override public void setThread(int thread) { poolSize = thread; } private void handleGzip(HttpResponse httpResponse) { Header ceheader = httpResponse.getEntity().getContentEncoding(); if (ceheader != null) { HeaderElement[] codecs = ceheader.getElements(); for (HeaderElement codec : codecs) { if (codec.getName().equalsIgnoreCase("gzip")) { httpResponse.setEntity(new GzipDecompressingEntity(httpResponse.getEntity())); } } } } }