package com.bmk.crawler;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import javax.net.ssl.SSLHandshakeException;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NoHttpResponseException;
import org.apache.http.ParseException;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
/**
 * Crawler HTTP helper built around one shared, pooled {@link DefaultHttpClient}.
 *
 * <p>The static initializer configures the connection pool, timeouts,
 * browser-like default headers and a retry handler. {@link #getHtml(String)}
 * downloads a page through a hard-coded proxy, and
 * {@link #isProxyUsable(HttpHost, String)} checks whether a given proxy can
 * fetch a test page.
 *
 * @author Lee
 */
public class HttpConnnectionManager {

    /** Maximum number of connections held by the pool. */
    public static final int MAX_TOTAL_CONNECTIONS = 100;

    /** Default maximum number of connections per route. */
    public static final int MAX_ROUTE_CONNECTIONS = 50;

    /** Connect timeout, in milliseconds. */
    public static final int CONNECT_TIMEOUT = 50000;

    /** Socket (read) timeout, in milliseconds. */
    public static final int SOCKET_TIMEOUT = 50000;

    /** How long a request may block waiting for a pooled connection, in milliseconds. */
    public static final long CONN_MANAGER_TIMEOUT = 60000;

    /** Connection limit applied to the direct route to {@link #DEFAULT_TARGETHOST}. */
    private static final int DEFAULT_HOST_MAX_CONNECTIONS = 20;

    /** A request is not retried once it has been executed this many times. */
    private static final int MAX_EXECUTION_COUNT = 5;

    /** Address of the proxy used by {@link #getHtml(String)}. */
    private static final String PROXY_ADDRESS = "211.142.236.137";

    /** Proxy port tried first. */
    private static final int PRIMARY_PROXY_PORT = 8080;

    /** Proxy port used for the follow-up attempts. */
    private static final int FALLBACK_PROXY_PORT = 80;

    /** Number of follow-up attempts made after the first fetch returned nothing. */
    private static final int MAX_FALLBACK_ATTEMPTS = 4;

    /** Charset used when the response does not declare one (same default EntityUtils applies). */
    private static final String FALLBACK_CHARSET = "ISO-8859-1";

    /** Shared HTTP parameters. */
    private static final HttpParams parentParams;

    /** Pooling connection manager behind {@link #httpClient}. */
    private static final PoolingClientConnectionManager cm;

    /** The shared HTTP client. */
    private static final DefaultHttpClient httpClient;

    /**
     * Default target host. HttpHost takes a bare host name; the previous value
     * was a complete URL (scheme, path and query), which is not a valid host.
     */
    private static final HttpHost DEFAULT_TARGETHOST = new HttpHost(
            "category.dangdang.com", 80);

    /*
     * Initializes the connection pool, the shared parameters, the default
     * headers and the retry handler.
     */
    static {
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(
                new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        schemeRegistry.register(
                new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        cm = new PoolingClientConnectionManager(schemeRegistry);
        cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);
        cm.setMaxPerRoute(new HttpRoute(DEFAULT_TARGETHOST),
                DEFAULT_HOST_MAX_CONNECTIONS);

        parentParams = new BasicHttpParams();
        parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION,
                HttpVersion.HTTP_1_1);
        parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);
        parentParams.setParameter(ClientPNames.COOKIE_POLICY,
                CookiePolicy.BROWSER_COMPATIBILITY);
        parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT,
                CONN_MANAGER_TIMEOUT);
        parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,
                CONNECT_TIMEOUT);
        parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT,
                SOCKET_TIMEOUT);
        parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, true);
        parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS, true);
        parentParams.setParameter(ClientPNames.DEFAULT_HEADERS,
                createDefaultHeaders());

        httpClient = new DefaultHttpClient(cm, parentParams);
        httpClient.setHttpRequestRetryHandler(createRetryHandler());
    }

    /**
     * Builds the default request headers, which imitate a browser.
     *
     * @return headers sent with every request
     */
    private static Collection<Header> createDefaultHeaders() {
        Collection<Header> headers = new ArrayList<Header>();
        headers.add(new BasicHeader("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
        headers.add(new BasicHeader("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
        headers.add(new BasicHeader("Accept-Language",
                "zh-cn,zh,en-US,en;q=0.5"));
        headers.add(new BasicHeader("Accept-Charset",
                "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
        // Only gzip is advertised because it is the only content coding this
        // class decodes; a "deflate" response would be returned as garbage.
        headers.add(new BasicHeader("Accept-Encoding", "gzip"));
        return headers;
    }

    /**
     * Builds the retry policy: give up after {@link #MAX_EXECUTION_COUNT}
     * executions, retry when the server dropped the connection, never retry a
     * failed SSL handshake, and otherwise retry only requests that carry no
     * entity (treated as idempotent).
     *
     * @return the retry handler installed on the shared client
     */
    private static HttpRequestRetryHandler createRetryHandler() {
        return new HttpRequestRetryHandler() {
            @Override
            public boolean retryRequest(IOException exception,
                    int executionCount, HttpContext context) {
                if (executionCount >= MAX_EXECUTION_COUNT) {
                    return false;
                }
                if (exception instanceof NoHttpResponseException) {
                    return true;
                }
                if (exception instanceof SSLHandshakeException) {
                    return false;
                }
                HttpRequest request = (HttpRequest) context
                        .getAttribute(ExecutionContext.HTTP_REQUEST);
                return !(request instanceof HttpEntityEnclosingRequest);
            }
        };
    }

    /**
     * Downloads the page at {@code url} through the built-in proxy.
     *
     * <p>The first attempt uses the primary proxy port. While the result is
     * empty, up to {@link #MAX_FALLBACK_ATTEMPTS} further attempts are made on
     * the fallback port. A failure message is printed only when every attempt
     * came back empty.
     *
     * @param url address of the page to download
     * @return the page source, or an empty string when every attempt failed
     */
    public static String getHtml(String url) {
        String html = getHtml(url,
                new HttpHost(PROXY_ADDRESS, PRIMARY_PROXY_PORT));
        for (int attempt = 0; attempt < MAX_FALLBACK_ATTEMPTS
                && StringUtils.isEmpty(html); attempt++) {
            html = getHtml(url,
                    new HttpHost(PROXY_ADDRESS, FALLBACK_PROXY_PORT));
        }
        // Check the result itself, so a successful last attempt is not
        // reported as a failure.
        if (StringUtils.isEmpty(html)) {
            System.out.println("抓取失败");
        }
        return html;
    }

    /**
     * Downloads the page at {@code url} through the given proxy.
     *
     * @param url       address of the page to download
     * @param proxyHost proxy to route the request through
     * @return the page source; an empty string when the status is not 200,
     *         the response has no entity, or an I/O error occurred
     */
    private static String getHtml(String url, HttpHost proxyHost) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,
                proxyHost);
        try {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            if (200 != httpResponse.getStatusLine().getStatusCode()) {
                return "";
            }
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity == null) {
                return "";
            }
            return readHtmlContentFromEntity(httpEntity);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always hand the connection back to the pool, including on the
            // early returns above where the entity was not consumed.
            httpGet.releaseConnection();
        }
        return "";
    }

    /**
     * Reads the body of a response entity as text, gunzipping it when the
     * entity declares a gzip content encoding.
     *
     * @param httpEntity response entity to read
     * @return the decoded body, or an empty string when the entity has no
     *         content stream
     * @throws ParseException if the Content-Type header cannot be parsed
     * @throws IOException    if reading or decompressing the body fails
     */
    private static String readHtmlContentFromEntity(HttpEntity httpEntity)
            throws ParseException, IOException {
        boolean gzipped = isGzipEncoded(httpEntity);

        // EntityUtils cannot handle entities whose declared length exceeds
        // Integer.MAX_VALUE, so those are streamed manually below.
        if (httpEntity.getContentLength() < Integer.MAX_VALUE) {
            HttpEntity readable = gzipped
                    ? new GzipDecompressingEntity(httpEntity)
                    : httpEntity;
            String html = EntityUtils.toString(readable);
            return html == null ? "" : html;
        }

        String charsetName = resolveCharsetName(httpEntity);
        InputStream in = httpEntity.getContent();
        if (in == null) {
            return "";
        }
        try {
            return gzipped
                    ? unZip(in, charsetName)
                    : readInStreamToString(in, charsetName);
        } finally {
            in.close();
        }
    }

    /**
     * Tells whether the entity declares a gzip content encoding. The value is
     * compared case-insensitively and "x-gzip" is accepted as an alias.
     *
     * @param httpEntity response entity to inspect
     * @return {@code true} when the body is gzip-compressed
     */
    private static boolean isGzipEncoded(HttpEntity httpEntity) {
        Header encoding = httpEntity.getContentEncoding();
        if (encoding == null || encoding.getValue() == null) {
            return false;
        }
        String value = encoding.getValue().trim();
        return "gzip".equalsIgnoreCase(value)
                || "x-gzip".equalsIgnoreCase(value);
    }

    /**
     * Determines the charset name to decode the entity with, falling back to
     * {@link #FALLBACK_CHARSET} when the Content-Type carries no charset.
     *
     * @param httpEntity response entity to inspect
     * @return a charset name, never {@code null}
     * @throws ParseException if the Content-Type header cannot be parsed
     */
    private static String resolveCharsetName(HttpEntity httpEntity)
            throws ParseException {
        ContentType contentType = ContentType.getOrDefault(httpEntity);
        if (contentType.getCharset() == null) {
            return FALLBACK_CHARSET;
        }
        return contentType.getCharset().name();
    }

    /**
     * Tests whether a proxy is usable by fetching {@code url} through it.
     * The response status code is printed to standard output.
     *
     * @param proxyHost proxy address and port to test
     * @param url       page used for the test
     * @return {@code true} when the page came back with status 200 and a
     *         non-empty body; {@code false} otherwise, including on I/O errors
     */
    public boolean isProxyUsable(HttpHost proxyHost, String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,
                proxyHost);
        try {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            int statusCode = httpResponse.getStatusLine().getStatusCode();
            System.out.println(statusCode);
            if (200 != statusCode) {
                return false;
            }
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity == null) {
                return false;
            }
            return !StringUtils.isEmpty(readHtmlContentFromEntity(httpEntity));
        } catch (ClientProtocolException e) {
            e.printStackTrace();
            return false;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            // Without this the pooled connection leaked whenever the body was
            // not fully consumed (for example on a non-200 response).
            httpGet.releaseConnection();
        }
    }

    /**
     * Decompresses a gzip stream and decodes the result as text.
     *
     * @param in      gzip-compressed stream returned by the server
     * @param charSet name of the charset the page is encoded in
     * @return the decompressed page content
     * @throws IOException if decompression fails or the charset is unsupported
     */
    private static String unZip(InputStream in, String charSet)
            throws IOException {
        GZIPInputStream gzipIn = new GZIPInputStream(in);
        try {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[1024];
            int read;
            while ((read = gzipIn.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }
            return buffer.toString(charSet);
        } finally {
            gzipIn.close();
        }
    }

    /**
     * Reads a stream line by line into a string, terminating every line with
     * {@code "\n"}.
     *
     * @param in      stream to read
     * @param charSet name of the charset to decode the stream with
     * @return the text read from the stream
     * @throws IOException if reading fails or the charset is unsupported
     */
    private static String readInStreamToString(InputStream in, String charSet)
            throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, charSet));
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\n");
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }
}