package guang.crawler.crawlWorker.pageProcessor;
import guang.crawler.commons.Page;
import guang.crawler.commons.WebURL;
import guang.crawler.crawlWorker.fetcher.PageFetchResult;
import guang.crawler.crawlWorker.fetcher.PageFetcher;
import guang.crawler.crawlWorker.parser.Parser;
import java.util.LinkedList;
import org.apache.http.HttpStatus;
/**
 * Drives the handling of a single URL: download the page, parse it, then hand
 * the parsed page to the registered {@link DownloadPlugin}s.
 *
 * @author sun
 */
public class PageProcessor {
    /**
     * Page parser.
     */
    private final Parser parser;
    /**
     * Page downloader.
     */
    private final PageFetcher pageFetcher;
    /**
     * Download plugins, applied in registration order to every successfully
     * parsed page.
     */
    private final LinkedList<DownloadPlugin> downloadPlugins;

    /**
     * Creates a processor with a fresh parser and fetcher and no plugins.
     */
    public PageProcessor() {
        this.parser = new Parser();
        this.pageFetcher = new PageFetcher();
        this.downloadPlugins = new LinkedList<DownloadPlugin>();
    }

    /**
     * Registers a plugin. Plugins run in the order they were added.
     *
     * @param plugin the plugin to append to the processing chain
     */
    public void addPlugin(final DownloadPlugin plugin) {
        this.downloadPlugins.add(plugin);
    }

    /**
     * Downloads the page behind the given URL.
     *
     * @param curURL the URL to fetch
     * @return the fetch result when the server answered with HTTP 200,
     *         otherwise {@code null}
     */
    private PageFetchResult download(final WebURL curURL) {
        PageFetchResult fetchResult = this.pageFetcher.fetchData(curURL);
        if (fetchResult == null) {
            return null;
        }
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            return fetchResult;
        }
        // The result is dropped here and never reaches parse(), so its content
        // must be discarded now; otherwise nothing would ever release it.
        fetchResult.discardContentIfNotConsumed();
        return null;
    }

    /**
     * Parses a downloaded page. The fetch result's content is always discarded
     * before returning, whether or not parsing succeeded.
     *
     * @param fetchResult the download result; may be {@code null}
     * @param curURL the URL the result was fetched from
     * @return the parsed page, or {@code null} if there was nothing to parse,
     *         the parser rejected the page, or an exception occurred
     */
    private Page parse(final PageFetchResult fetchResult, final WebURL curURL) {
        if (fetchResult == null) {
            // Nothing was downloaded; report failure explicitly instead of
            // relying on a caught NullPointerException.
            return null;
        }
        try {
            Page page = new Page(curURL);
            fetchResult.transformToPage(page);
            if (this.parser.parse(page, curURL.getURL())) {
                return page;
            }
            return null;
        } catch (Exception e) {
            // Best effort: a broken page must not stop the crawl, but the
            // failure is reported so it can be diagnosed.
            System.err.println("Failed to parse " + curURL + ": " + e);
            return null;
        } finally {
            fetchResult.discardContentIfNotConsumed();
        }
    }

    /**
     * Processes one URL: download, parse, then run the plugins. The plugin
     * chain stops at the first plugin whose {@code work} call returns
     * {@code false}.
     *
     * @param url the URL to process
     */
    public void processUrl(final WebURL url) {
        System.out.println("Processing: " + url);
        // Download first.
        PageFetchResult fetchResult = this.download(url);
        // Then parse; skipped entirely when the download failed.
        Page page = (fetchResult == null) ? null : this.parse(fetchResult, url);
        if (page == null) {
            System.out.println("Couldn't fetch the content of the page.");
            return;
        }
        // Finally let the plugins handle the parsed page.
        for (DownloadPlugin plugin : this.downloadPlugins) {
            if (!plugin.work(page)) {
                break;
            }
        }
    }

    /**
     * Shuts down the page processor by shutting down its fetcher.
     */
    public void shutdown() {
        if (this.pageFetcher != null) {
            this.pageFetcher.shutDown();
        }
    }
}