package guang.crawler.crawlWorker.fetcher;
import guang.crawler.commons.WebURL;
import guang.crawler.crawlWorker.WorkerConfig;
import guang.crawler.crawlWorker.url.URLCanonicalizer;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HttpContext;
import org.apache.log4j.Logger;
/**
 * Page fetcher: downloads the page that a given URL points to.
 * <p>
 * One instance owns a pooled connection manager, an HTTP client configured
 * from {@link WorkerConfig}, and a background thread that monitors idle
 * connections. Call {@link #shutDown()} when the fetcher is no longer needed.
 *
 * @author yang
 */
public class PageFetcher {
    /**
     * Entity wrapper that transparently decompresses GZip-encoded content.
     *
     * @author sun
     *
     */
    private static class GzipDecompressingEntity extends HttpEntityWrapper {
        public GzipDecompressingEntity(final HttpEntity entity) {
            super(entity);
        }

        /**
         * Returns the wrapped entity's content stream wrapped in a
         * {@link GZIPInputStream}.
         */
        @Override
        public InputStream getContent() throws IOException,
                IllegalStateException {
            // the wrapped entity's getContent() decides about repeatability
            InputStream wrappedin = this.wrappedEntity.getContent();
            return new GZIPInputStream(wrappedin);
        }

        /**
         * Always reports -1: the length of the decompressed content is not
         * known up front.
         */
        @Override
        public long getContentLength() {
            return -1;
        }
    }

    protected static final Logger logger = Logger.getLogger(PageFetcher.class);
    /**
     * Connection manager; controls the pooled connections (total and per
     * route limits are taken from {@link WorkerConfig}).
     */
    protected PoolingClientConnectionManager connectionManager;
    /**
     * HTTP client used to execute all requests.
     */
    protected DefaultHttpClient httpClient;
    /**
     * Lock object. It is not used inside this class; it is kept because it is
     * part of the protected interface.
     */
    protected final Object mutex = new Object();
    /**
     * Thread used to clean up idle connections.
     */
    protected IdleConnectionMonitorThread connectionMonitorThread = null;

    /**
     * Builds the connection manager and the HTTP client from
     * {@link WorkerConfig#me()}, installs a response interceptor that
     * unwraps gzip-encoded bodies, and starts the idle connection monitor
     * thread.
     */
    public PageFetcher() {
        WorkerConfig config = WorkerConfig.me();
        // Create the connection manager. HTTPS is only registered when the
        // configuration asks for it.
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80,
                PlainSocketFactory.getSocketFactory()));
        if (config.isIncludeHttpsPages()) {
            schemeRegistry.register(new Scheme("https", 443,
                    SSLSocketFactory.getSocketFactory()));
        }
        this.connectionManager = new PoolingClientConnectionManager(
                schemeRegistry);
        this.connectionManager.setMaxTotal(config.getMaxTotalConnections());
        this.connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());

        // Create the HTTP client.
        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
        paramsBean.setVersion(HttpVersion.HTTP_1_1);
        paramsBean.setContentCharset("UTF-8");
        paramsBean.setUseExpectContinue(false);
        params.setParameter(ClientPNames.COOKIE_POLICY,
                CookiePolicy.BROWSER_COMPATIBILITY);
        params.setParameter(CoreProtocolPNames.USER_AGENT,
                config.getUserAgentString());
        params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT,
                config.getSocketTimeout());
        params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,
                config.getConnectionTimeout());
        params.setBooleanParameter(ClientPNames.HANDLE_REDIRECTS,
                config.isFollowRedirects());
        this.httpClient = new DefaultHttpClient(this.connectionManager, params);

        // Optional proxy, with optional username/password credentials.
        if (config.getProxyHost() != null) {
            if (config.getProxyUsername() != null) {
                this.httpClient.getCredentialsProvider()
                        .setCredentials(
                                new AuthScope(config.getProxyHost(),
                                        config.getProxyPort()),
                                new UsernamePasswordCredentials(
                                        config.getProxyUsername(),
                                        config.getProxyPassword()));
            }
            HttpHost proxy = new HttpHost(config.getProxyHost(),
                    config.getProxyPort());
            this.httpClient.getParams()
                    .setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        }

        // Replace gzip-encoded response bodies with a decompressing wrapper.
        this.httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
            @Override
            public void process(final HttpResponse response,
                    final HttpContext context) throws HttpException,
                    IOException {
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    // Responses without a body carry no entity; there is
                    // nothing to decompress.
                    return;
                }
                Header contentEncoding = entity.getContentEncoding();
                if (contentEncoding == null) {
                    return;
                }
                for (HeaderElement codec : contentEncoding.getElements()) {
                    if (codec.getName().equalsIgnoreCase("gzip")) {
                        response.setEntity(new GzipDecompressingEntity(entity));
                        return;
                    }
                }
            }
        });

        // The field is always null at this point, so the monitor thread is
        // created unconditionally.
        this.connectionMonitorThread = new IdleConnectionMonitorThread(
                this.connectionManager);
        this.connectionMonitorThread.start();
    }

    /**
     * Downloads the page for the given URL.
     * <p>
     * The returned result always carries a status code:
     * <ul>
     * <li>the HTTP status code for any non-200 response (for 301/302 the
     * canonicalized {@code Location} target is also stored as the moved-to
     * URL);</li>
     * <li>{@link HttpStatus#SC_OK} for a 200 response with an entity whose
     * size does not exceed the configured maximum download size;</li>
     * <li>{@code CustomFetchStatus.PageTooBig} when the size exceeds that
     * maximum (the request is aborted);</li>
     * <li>{@code CustomFetchStatus.FatalTransportError} on an
     * {@link IOException};</li>
     * <li>{@code CustomFetchStatus.UnknownError} in every other case,
     * including a 200 response without an entity.</li>
     * </ul>
     *
     * @param webUrl
     *            the URL to fetch
     * @return the fetch result; never {@code null}
     */
    public PageFetchResult fetchData(final WebURL webUrl) {
        PageFetchResult fetchResult = new PageFetchResult();
        String toFetchURL = webUrl.getURL();
        HttpGet get = null;
        try {
            get = new HttpGet(toFetchURL);
            get.addHeader("Accept-Encoding", "gzip");
            HttpResponse response = this.httpClient.execute(get);
            // Store what the request produced.
            fetchResult.setEntity(response.getEntity());
            fetchResult.setResponseHeaders(response.getAllHeaders());
            int statusCode = response.getStatusLine()
                    .getStatusCode();

            if (statusCode != HttpStatus.SC_OK) {
                // On a redirect, record where the page moved to.
                if ((statusCode == HttpStatus.SC_MOVED_PERMANENTLY)
                        || (statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) {
                    Header header = response.getFirstHeader("Location");
                    if (header != null) {
                        String movedToUrl = URLCanonicalizer.getCanonicalURL(
                                header.getValue(), toFetchURL);
                        fetchResult.setMovedToUrl(movedToUrl);
                    }
                    fetchResult.setStatusCode(statusCode);
                    return fetchResult;
                }
                // 404 is expected often enough not to be logged.
                if (statusCode != HttpStatus.SC_NOT_FOUND) {
                    PageFetcher.logger.info("Failed: "
                            + response.getStatusLine()
                                    .toString() + ", while fetching "
                            + toFetchURL);
                }
                fetchResult.setStatusCode(statusCode);
                return fetchResult;
            }

            fetchResult.setFetchedUrl(toFetchURL);
            // NOTE(review): intended to detect that the response came from a
            // different (redirected) location and to record the URL actually
            // fetched; whether getURI() reflects a followed redirect depends
            // on HttpClient - confirm.
            String uri = get.getURI()
                    .toString();
            if (!uri.equals(toFetchURL)
                    && !URLCanonicalizer.getCanonicalURL(uri)
                            .equals(toFetchURL)) {
                fetchResult.setFetchedUrl(uri);
            }

            // If the page has content, check that it is not too large.
            if (fetchResult.getEntity() != null) {
                long size = PageFetcher.resolveContentLength(
                        fetchResult.getEntity()
                                .getContentLength(), response, toFetchURL);
                if (size > WorkerConfig.me()
                        .getMaxDownloadSize()) {
                    fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
                    get.abort();
                    return fetchResult;
                }
                fetchResult.setStatusCode(HttpStatus.SC_OK);
                return fetchResult;
            }
            // 200 without an entity: give up on the request and fall through
            // to the UnknownError result below.
            get.abort();
        } catch (IOException e) {
            PageFetcher.logger.error("Fatal transport error: " + e.getMessage()
                    + " while fetching " + toFetchURL + " (link found in doc #"
                    + webUrl.getParentDocid() + ")");
            fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
            return fetchResult;
        } catch (IllegalStateException ignored) {
            // Deliberately not treated as an error: this occurs when https or
            // other schemes are not registered.
            PageFetcher.logger.debug("Ignoring IllegalStateException while fetching "
                    + toFetchURL + ": " + ignored.getMessage());
        } catch (Exception e) {
            if (e.getMessage() == null) {
                PageFetcher.logger.error("Error while fetching "
                        + webUrl.getURL());
            } else {
                PageFetcher.logger.error(e.getMessage() + " while fetching "
                        + webUrl.getURL());
            }
        } finally {
            try {
                // Without an entity nobody downstream can release the
                // connection, so abort the request here.
                if ((fetchResult.getEntity() == null) && (get != null)) {
                    get.abort();
                }
            } catch (Exception e) {
                PageFetcher.logger.error("Failed to abort request for "
                        + toFetchURL, e);
            }
        }
        fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
        return fetchResult;
    }

    /**
     * Determines the size of a response body.
     * <p>
     * Uses the length reported by the entity when it is known; otherwise
     * falls back to the response's {@code Content-Length} header. The header
     * is parsed as a {@code long} so that values beyond the {@code int} range
     * are handled, and an unparsable header is treated as "unknown" instead
     * of failing the whole fetch.
     *
     * @param entityLength
     *            the length reported by the entity; negative when unknown
     * @param response
     *            the response whose headers are consulted as a fallback
     * @param url
     *            the URL being fetched; used only for logging
     * @return the size in bytes, or -1 when it cannot be determined
     */
    private static long resolveContentLength(final long entityLength,
            final HttpResponse response, final String url) {
        if (entityLength >= 0) {
            return entityLength;
        }
        // Header name lookup is case-insensitive, so a single lookup covers
        // every spelling of the header name.
        Header length = response.getLastHeader("Content-Length");
        if ((length == null) || (length.getValue() == null)) {
            return -1;
        }
        try {
            return Long.parseLong(length.getValue()
                    .trim());
        } catch (NumberFormatException e) {
            PageFetcher.logger.warn("Ignoring malformed Content-Length '"
                    + length.getValue() + "' while fetching " + url);
            return -1;
        }
    }

    /**
     * @return the HTTP client used by this fetcher
     */
    public HttpClient getHttpClient() {
        return this.httpClient;
    }

    /**
     * Shuts down the connection manager and then the idle connection monitor
     * thread.
     */
    public synchronized void shutDown() {
        if (this.connectionMonitorThread != null) {
            this.connectionManager.shutdown();
            this.connectionMonitorThread.shutdown();
        }
    }
}