package com.bmk.crawler; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.zip.GZIPInputStream; import javax.net.ssl.SSLHandshakeException; import org.apache.commons.lang.StringUtils; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpEntityEnclosingRequest; import org.apache.http.HttpHost; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.HttpVersion; import org.apache.http.NoHttpResponseException; import org.apache.http.ParseException; import org.apache.http.StatusLine; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpRequestRetryHandler; import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.params.ConnManagerParams; import org.apache.http.conn.params.ConnRoutePNames; import org.apache.http.conn.routing.HttpRoute; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.entity.ContentType; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.message.BasicHeader; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.params.HttpParams; import org.apache.http.protocol.ExecutionContext; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; /** * @Intro descrption here * @author Lee * @Date 2013-8-8 */ public class HttpConnnectionManager { /** * * 连接池里的最大连接数 */ public static final int MAX_TOTAL_CONNECTIONS = 100; /** * * 每个路由的默认最大连接数 */ public static final int MAX_ROUTE_CONNECTIONS = 50; /** * * 连接超时时间 */ public static final int CONNECT_TIMEOUT = 50000; /** * * 套接字超时时间 */ public static final int SOCKET_TIMEOUT = 50000; /** * * 连接池中 连接请求执行被阻塞的超时时间 */ public static final long CONN_MANAGER_TIMEOUT = 60000; /** * * http连接相关参数 */ private static HttpParams parentParams; /** * * http线程池管理器 */ private static PoolingClientConnectionManager cm; /** * * http客户端 */ private static DefaultHttpClient httpClient; /** * * 默认目标主机 */ private static final HttpHost DEFAULT_TARGETHOST = new HttpHost( "http://category.dangdang.com/all/?category_path=01.00.00.00.00.00", 80); /** * * 初始化http连接池,设置参数、http头等等信息 */ static { SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register( new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); schemeRegistry.register( new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); cm = new PoolingClientConnectionManager(schemeRegistry); cm.setMaxTotal(MAX_TOTAL_CONNECTIONS); cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS); cm.setMaxPerRoute(new HttpRoute(DEFAULT_TARGETHOST), 20); // 设置对目标主机的最大连接数 parentParams = new BasicHttpParams(); parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1); parentParams .setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST); // 设置默认targetHost parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT); parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT); parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT); parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, true); parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS, true); // 设置头信息,模拟浏览器 Collection collection = new ArrayList(); collection .add(new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)")); collection .add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")); collection.add(new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5")); collection.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7")); collection.add(new BasicHeader("Accept-Encoding", "gzip, deflate")); parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, collection); // 请求重试处理 HttpRequestRetryHandler httpRequestRetryHandler = new HttpRequestRetryHandler() { public boolean retryRequest(IOException exception, int executionCount, HttpContext context) { if (executionCount >= 5) { // 如果超过最大重试次数,那么就不要继续了 return false; } if (exception instanceof NoHttpResponseException) { // 如果服务器丢掉了连接,那么就重试 return true; } if (exception instanceof SSLHandshakeException) { // 不要重试SSL握手异常 return false; } HttpRequest request = (HttpRequest) context .getAttribute(ExecutionContext.HTTP_REQUEST); boolean idempotent = !(request instanceof HttpEntityEnclosingRequest); if (idempotent) { // 如果请求被认为是幂等的,那么就重试 return true; } return false; } }; httpClient = new DefaultHttpClient(cm, parentParams); httpClient.setHttpRequestRetryHandler(httpRequestRetryHandler); } /** * * 抓取页面代码 * * @param url * 目标页面的url * * @return 页面代码 */ public static String getHtml(String url) { HttpHost proxyHost = new HttpHost("211.142.236.137", 8080);// 代理 String html = getHtml(url, proxyHost); int count = 0; while (StringUtils.isEmpty(html)) { proxyHost = new HttpHost("211.142.236.137", 80);// 更换代理 html = getHtml(url, proxyHost); count++; if (count > 3) { System.out.println("抓取失败"); break; } } //System.out.println(html.length()); return html; } /** * * 抓取url所指的页面代码 * * @param url * 目标页面的url * * @return 页面代码 */ private static String getHtml(String url, HttpHost proxyHost) { String html = ""; HttpGet httpGet = new HttpGet(url); httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);// 设置代理 HttpResponse httpResponse; HttpEntity httpEntity; try { httpResponse = httpClient.execute(httpGet); StatusLine statusLine = httpResponse.getStatusLine(); int statusCode = statusLine.getStatusCode(); // System.out.println(statusCode); if (200 != statusCode) { return html; } httpEntity = httpResponse.getEntity(); if (httpEntity != null) { html = readHtmlContentFromEntity(httpEntity); } } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (httpGet != null) { httpGet.releaseConnection(); } } return html; } /** * * 从response返回的实体中读取页面代码 * * @param httpEntity * Http实体 * * @return 页面代码 * * @throws ParseException * * @throws IOException */ private static String readHtmlContentFromEntity(HttpEntity httpEntity) throws ParseException, IOException { String html = ""; Header header = httpEntity.getContentEncoding(); if (httpEntity.getContentLength() < 2147483647L) { // EntityUtils无法处理ContentLength超过2147483647L的Entity if (header != null && "gzip".equals(header.getValue())) { html = EntityUtils.toString(new GzipDecompressingEntity( httpEntity)); } else { html = EntityUtils.toString(httpEntity); } } else { InputStream in = httpEntity.getContent(); if (header != null && "gzip".equals(header.getValue())) { html = unZip(in, ContentType.getOrDefault(httpEntity) .getCharset().toString()); } else { html = readInStreamToString(in, ContentType.getOrDefault(httpEntity).getCharset() .toString()); } if (in != null) { in.close(); } } return html; } /** * * 测试代理是否可用(其实和getHtml(String url, HttpHost proxyHost)的代码差不多,为了从功能上区别,暂时这样) * * @param httpHost * 封装了代理的ip地址和端口 * * @param url * 用来测试的页面 * * @return true 可用 false 不可用 */ public boolean isProxyUsable(HttpHost proxyHost, String url) { HttpGet httpGet = new HttpGet(url); httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost); try { HttpResponse httpResponse = httpClient.execute(httpGet); StatusLine statusLine = httpResponse.getStatusLine(); int statusCode = statusLine.getStatusCode(); System.out.println(statusCode); if (200 != statusCode) { return false; } HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity != null) { String html = readHtmlContentFromEntity(httpEntity); if (StringUtils.isEmpty(html)) { return false; } } else { return false; } } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } return true; } /** * * 解压服务器返回的gzip流 * * @param in * 抓取返回的InputStream流 * * @param charSet * 页面内容编码 * * @return 页面内容的String格式 * * @throws IOException */ private static String unZip(InputStream in, String charSet) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); GZIPInputStream gis = null; try { gis = new GZIPInputStream(in); byte[] _byte = new byte[1024]; int len = 0; while ((len = gis.read(_byte)) != -1) { baos.write(_byte, 0, len); } String unzipString = new String(baos.toByteArray(), charSet); return unzipString; } finally { if (gis != null) { gis.close(); } if (baos != null) { baos.close(); } } } /** * * 读取InputStream流 * * @param in * InputStream流 * * @return 从流中读取的String * * @throws IOException */ private static String readInStreamToString(InputStream in, String charSet) throws IOException { StringBuilder str = new StringBuilder(); String line; BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(in, charSet)); while ((line = bufferedReader.readLine()) != null) { str.append(line); str.append("\n"); } if (bufferedReader != null) { bufferedReader.close(); } return str.toString(); } }