package guang.crawler.crawlWorker.fetcher; import guang.crawler.commons.Page; import java.io.EOFException; import java.io.IOException; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; /** * 页面下载的结果 * * @author sun * */ public class PageFetchResult { protected static final Logger logger = Logger.getLogger(PageFetchResult.class); /** * 请求服务器之后响应的返回码 */ protected int statusCode; /** * 得到的结果数据 */ protected HttpEntity entity = null; /** * 请求得到的响应的头信息 */ protected Header[] responseHeaders = null; /** * 如果页面没有重定向,那么设置该域为当前爬取的页面 */ protected String fetchedUrl = null; /** * 如果页面被重定向了,那么就设置该域 */ protected String movedToUrl = null; /** * 如果当前entity不为空,那么就直接放弃 */ public void discardContentIfNotConsumed() { try { if (this.entity != null) { EntityUtils.consume(this.entity); } } catch (EOFException e) { // We can ignore this exception. It can happen on compressed streams // which are not repeatable } catch (IOException e) { // We can ignore this exception. It can happen if the stream is // closed. } catch (Exception e) { e.printStackTrace(); } } public HttpEntity getEntity() { return this.entity; } public String getFetchedUrl() { return this.fetchedUrl; } public String getMovedToUrl() { return this.movedToUrl; } public Header[] getResponseHeaders() { return this.responseHeaders; } public int getStatusCode() { return this.statusCode; } public void setEntity(final HttpEntity entity) { this.entity = entity; } public void setFetchedUrl(final String fetchedUrl) { this.fetchedUrl = fetchedUrl; } public void setMovedToUrl(final String movedToUrl) { this.movedToUrl = movedToUrl; } public void setResponseHeaders(final Header[] responseHeaders) { this.responseHeaders = responseHeaders; } public void setStatusCode(final int statusCode) { this.statusCode = statusCode; } /** * 将当前对象转化为一个Page对象 * * @param page * @return */ public boolean transformToPage(final Page page) { try { page.load(this.entity); page.setFetchResponseHeaders(this.responseHeaders); return true; } catch (Exception e) { PageFetchResult.logger.info("Exception while fetching content for: " + page.getWebURL() .getURL() + " [" + e.getMessage() + "]"); } return false; } }