package guang.crawler.crawlWorker.pageProcessor; import guang.crawler.commons.Page; import guang.crawler.crawlWorker.WorkerConfig; import guang.crawler.extension.filedExtractor.FieldsExtractor; import guang.crawler.localConfig.ComponentLoader; import java.io.File; /** * 从页面中获取需要保存的信息 * * @author sun * */ public class ExtractDataToSavePlugin implements DownloadPlugin { /** * FieldsExtractor的加载器,用来从指定配置文件中加载FieldsExtractor */ private ComponentLoader<FieldsExtractor> fieldsExtractorLoader; public ExtractDataToSavePlugin() throws ConfigLoadException { String configFileName = WorkerConfig.me() .getCrawlerHome() + "/conf/crawler-worker/filed-extractors.xml"; File configFile = new File(configFileName); String schemaFileName = WorkerConfig.me() .getCrawlerHome() + "/etc/xsd/components.xsd"; File schemaFile = new File(schemaFileName); this.fieldsExtractorLoader = new ComponentLoader<FieldsExtractor>( configFile, schemaFile); try { this.fieldsExtractorLoader.load(); } catch (Exception e) { throw new ConfigLoadException( "load fileds-extractors.xml file failed!", e); } } @Override public boolean work(final Page page) { if (page != null) { // 获取URLExtractor FieldsExtractor extractor = this.fieldsExtractorLoader.getComponent(page.getWebURL() .getURL()); if (extractor != null) { // 利用URLExtractor抽取URL列表 extractor.extractFields(page); } return true; } return false; } }