package guang.crawler.crawlWorker.pageProcessor;
import guang.crawler.commons.Page;
import guang.crawler.commons.WebURL;
import guang.crawler.crawlWorker.WorkerConfig;
import guang.crawler.extension.urlExtractor.URLsExtractor;
import guang.crawler.localConfig.ComponentLoader;
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Plugin that extracts URLs from a {@link Page}.
 * <p>
 * Note carried over from the original author: how exactly the extraction
 * should be carried out still deserves closer study.
 *
 * @author sun
 *
 */
public class ExtractLinksToFollowPlugin implements DownloadPlugin {
    /**
     * Regular expression describing the URLs that must be filtered out of the
     * links to follow: anything whose string ends in one of the listed file
     * extensions. Matching is case-insensitive so that upper-case extensions
     * (e.g. ".JPG") are filtered the same way as lower-case ones. Compiled
     * once and shared, since a {@link Pattern} is immutable.
     */
    private static final Pattern FILTER = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g"
                    + "|png|tiff?|mid|mp2|mp3|mp4"
                    + "|wav|avi|mov|mpeg|ram|m4v|pdf"
                    + "|rm|smil|wmv|swf|wma|zip|rar|gz|ico))$",
            Pattern.CASE_INSENSITIVE);

    /**
     * Loader for {@link URLsExtractor} components, populated from the
     * url-extractors.xml configuration file.
     */
    private final ComponentLoader<URLsExtractor> extractorLoader;

    /**
     * Creates the plugin and loads the URL extractor configuration.
     * <p>
     * The configuration file is read from
     * {@code <crawlerHome>/conf/crawler-worker/url-extractors.xml} and the
     * schema file from {@code <crawlerHome>/etc/xsd/components.xsd}, where
     * {@code <crawlerHome>} is the value of
     * {@code WorkerConfig.me().getCrawlerHome()}.
     *
     * @throws ConfigLoadException
     *             if loading the extractor configuration fails for any
     *             reason; the original exception is kept as the cause
     */
    public ExtractLinksToFollowPlugin() throws ConfigLoadException {
        String crawlerHome = WorkerConfig.me()
                .getCrawlerHome();
        File configFile = new File(crawlerHome
                + "/conf/crawler-worker/url-extractors.xml");
        File schemaFile = new File(crawlerHome + "/etc/xsd/components.xsd");
        this.extractorLoader = new ComponentLoader<URLsExtractor>(configFile,
                schemaFile);
        try {
            this.extractorLoader.load();
        } catch (Exception e) {
            throw new ConfigLoadException(
                    "load url-extractors.xml file failed!", e);
        }
    }

    /**
     * Runs the extractor selected for the page's URL (if any) and then
     * removes from the page's links-to-follow every link whose URL matches
     * the extension filter.
     *
     * @param page
     *            the page to process; may be {@code null}
     * @return {@code false} if {@code page} is {@code null}, {@code true}
     *         otherwise
     */
    @Override
    public boolean work(final Page page) {
        if (page == null) {
            return false;
        }
        // Select the extractor by the page's own URL. A page without a
        // WebURL cannot be used for the lookup, so extraction is skipped
        // instead of failing with a NullPointerException.
        WebURL pageURL = page.getWebURL();
        if (pageURL != null) {
            URLsExtractor extractor = this.extractorLoader.getComponent(pageURL.getURL());
            if (extractor != null) {
                // Let the extractor process the page.
                extractor.extractURLs(page);
            }
        }
        // Drop the links that point at unwanted resources.
        List<WebURL> resultURLs = page.getLinksToFollow();
        if (resultURLs != null) {
            Iterator<WebURL> it = resultURLs.iterator();
            while (it.hasNext()) {
                WebURL candidate = it.next();
                String url = (candidate == null) ? null : candidate.getURL();
                // Entries without a URL string are left untouched: only
                // links that positively match the filter are removed
                // (Pattern.matcher(null) would throw).
                if (url != null && FILTER.matcher(url)
                        .matches()) {
                    it.remove();
                }
            }
        }
        return true;
    }
}