package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Constant;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extension point built on top of {@link PageProcessor}: runs one or more
 * {@link PageModelExtractor}s over every page it is given.
 *
 * <p>For each registered page model the processor adds links matching the
 * model's help/target URL patterns as new target requests and stores the
 * extracted object in the page under the model class's canonical name.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-1 <br>
 * Time: 8:46 PM <br>
 */
class ModelPageProcessor implements PageProcessor {

    /** Extractors in registration order, one per page model class. */
    private final List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();

    /** Site configuration handed back unchanged by {@link #getSite()}. */
    private final Site site;

    /**
     * Target and help URL patterns of every registered model.
     * Only written to (never read) inside this class.
     */
    private final Set<Pattern> targetUrlPatterns = new HashSet<Pattern>();

    /**
     * Creates a processor for {@code site} and registers every given page model class.
     *
     * @param site   site configuration returned by {@link #getSite()}
     * @param clazzs page model classes to register, in order
     * @return the configured processor
     */
    public static ModelPageProcessor create(Site site, Class<?>... clazzs) {
        ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
        for (Class<?> clazz : clazzs) {
            modelPageProcessor.addPageModel(clazz);
        }
        return modelPageProcessor;
    }

    /**
     * Registers one more page model class.
     *
     * @param clazz page model class handed to {@link PageModelExtractor#create}
     * @return this processor, to allow chaining
     */
    public ModelPageProcessor addPageModel(Class<?> clazz) {
        PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
        targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns());
        targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns());
        pageModelExtractorList.add(pageModelExtractor);
        return this;
    }

    private ModelPageProcessor(Site site) {
        this.site = site;
    }

    /**
     * Runs every registered extractor against {@code page}.
     *
     * <p>Unless the request carries {@link Constant#IS_FROM_TEST_REQUEST} set to
     * {@code true}, links matching each model's help and target patterns are added
     * as target requests. Every non-empty extraction result is passed to
     * {@link #postProcessPageModel} and stored under the model class's canonical
     * name. The page is marked as skipped only when no extractor produced a result.
     *
     * @param page the page to process
     */
    @Override
    public void process(Page page) {
        // The flag belongs to the request, not to an extractor: read it once.
        // Boolean.TRUE.equals also tolerates a missing or non-Boolean extra.
        Object testFlag = page.getRequest().getExtra(Constant.IS_FROM_TEST_REQUEST);
        boolean followLinks = !Boolean.TRUE.equals(testFlag);

        boolean extractedAny = false;
        for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
            if (followLinks) {
                extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
                extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
            }
            Object process = pageModelExtractor.process(page);
            if (isEmptyResult(process)) {
                // One model yielding nothing must not discard what other models extracted.
                continue;
            }
            postProcessPageModel(pageModelExtractor.getClazz(), process);
            page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
            extractedAny = true;
        }
        if (!extractedAny) {
            page.getResultItems().setSkip(true);
        }
    }

    /** Tells whether an extraction result is {@code null} or an empty list. */
    private static boolean isEmptyResult(Object result) {
        return result == null || (result instanceof List && ((List<?>) result).isEmpty());
    }

    /**
     * Adds a target request for every link that matches one of {@code urlPatterns}.
     *
     * @param page              page whose links are inspected and that receives the requests
     * @param urlRegionSelector selector limiting where links are taken from;
     *                          {@code null} means all links of the page
     * @param urlPatterns       patterns a link must match (via {@code find()}) to be followed
     */
    private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
        List<String> links;
        if (urlRegionSelector == null) {
            links = page.getHtml().links().all();
        } else {
            links = urlRegionSelector.selectList(page.getHtml().toString());
        }
        if (links == null) {
            return;
        }
        for (String link : links) {
            if (link == null) {
                // Pattern.matcher(null) would throw a NullPointerException.
                continue;
            }
            for (Pattern targetUrlPattern : urlPatterns) {
                Matcher matcher = targetUrlPattern.matcher(link);
                if (matcher.find()) {
                    page.addTargetRequest(new Request(matchedUrl(matcher)));
                }
            }
        }
    }

    /**
     * Returns the URL captured by a successful match: group 1 when the pattern
     * defines it and it took part in the match, otherwise the whole match.
     */
    private static String matchedUrl(Matcher matcher) {
        if (matcher.groupCount() >= 1) {
            String captured = matcher.group(1);
            if (captured != null) {
                return captured;
            }
        }
        return matcher.group();
    }

    /**
     * Hook invoked with each non-empty extraction result before it is stored
     * in the page. The default implementation does nothing.
     *
     * @param clazz  page model class the result was extracted for
     * @param object the extracted result
     */
    protected void postProcessPageModel(Class<?> clazz, Object object) {
    }

    @Override
    public Site getSite() {
        return site;
    }
}