package us.codecraft.webmagic.model;
import java.lang.annotation.Annotation;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Model-based Spider: the wrapped entry-point class for crawling with page models.<br>
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-3 <br>
 * Time: 9:51 AM <br>
 */
public class OOSpider extends Spider {

    /** Set only by the constructors that take or create a {@link ModelPageProcessor}; otherwise null. */
    private ModelPageProcessor modelPageProcessor;

    /** Set only by {@link #OOSpider(Site, PageModelPipeline, Class[])}; otherwise null. */
    private ModelPipeline modelPipeline;

    /**
     * Creates a spider driven by the given model page processor.
     * No {@link ModelPipeline} is installed by this constructor.
     *
     * @param modelPageProcessor processor handed to the parent Spider and kept for {@link #addPageModel}
     */
    protected OOSpider(ModelPageProcessor modelPageProcessor) {
        super(modelPageProcessor);
        this.modelPageProcessor = modelPageProcessor;
    }

    /**
     * Creates a spider around an arbitrary page processor. Neither the model page
     * processor nor the model pipeline is initialised by this constructor.
     *
     * @param pageProcessor processor handed to the parent Spider
     */
    public OOSpider(PageProcessor pageProcessor) {
        super(pageProcessor);
    }

    /**
     * Creates a spider.<br>
     * @param site site passed to {@code ModelPageProcessor.create}
     * @param pageModelPipeline pipeline registered for every page model; skipped when null
     * @param pageModels page model classes to process
     */
    public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
        this(ModelPageProcessor.create(site, pageModels));
        this.modelPipeline = new ModelPipeline();
        super.pipeline(modelPipeline);
        if (pageModelPipeline != null) {
            for (Class pageModel : pageModels) {
                this.modelPipeline.put(pageModel, pageModelPipeline);
            }
        }
    }

    /**
     * Static factory equivalent to {@code new OOSpider(site, null, pageModels)}.
     *
     * @param site site configuration
     * @param pageModels page model classes to process
     * @return the new spider
     */
    public static OOSpider create(Site site, Class... pageModels) {
        return new OOSpider(site, null, pageModels);
    }

    /**
     * Static factory equivalent to {@code new OOSpider(site, pageModelPipeline, pageModels)}.
     *
     * @param site site configuration
     * @param pageModelPipeline pipeline registered for every page model; skipped when null
     * @param pageModels page model classes to process
     * @return the new spider
     */
    public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
        return new OOSpider(site, pageModelPipeline, pageModels);
    }

    /**
     * Registers additional page models together with the pipeline that receives them.
     * Both the model page processor and the model pipeline are dereferenced here, so this
     * only works on instances built through
     * {@link #OOSpider(Site, PageModelPipeline, Class[])} or the {@code create} factories.
     *
     * @param pageModelPipeline pipeline to register for each page model
     * @param pageModels page model classes to add
     * @return this spider, for chaining
     */
    public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) {
        for (Class pageModel : pageModels) {
            modelPageProcessor.addPageModel(pageModel);
            modelPipeline.put(pageModel, pageModelPipeline);
        }
        return this;
    }

    /**
     * Checks the url against the {@link TargetUrl} patterns of every page model that has a
     * pipeline registered. The url is accepted as soon as one pattern of any model is found
     * in it. When no model pipeline exists, or no page model is registered in it, the
     * decision is delegated to the parent implementation.
     *
     * @param url url to validate
     * @return true when the url matches at least one registered page model
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    protected boolean validate(String url) {
        // modelPipeline is null for spiders built without page models.
        if (this.modelPipeline == null) {
            return super.validate(url);
        }
        Map<Class, PageModelPipeline> map = this.modelPipeline.getPageModelPipelines();
        if (map == null || map.isEmpty()) {
            return super.validate(url);
        }
        // Check every registered model, not just whichever one the map iterates first.
        List<Pattern> triedPatterns = new ArrayList<Pattern>();
        for (Class clazz : map.keySet()) {
            for (Pattern pattern : targetUrlPatterns(clazz)) {
                if (pattern.matcher(url).find()) {
                    return true;
                }
                triedPatterns.add(pattern);
            }
        }
        if (!triedPatterns.isEmpty()) {
            logger.warn(String.format("the url(%s) don't match the targeUrl regex(%s)", url, triedPatterns));
        }
        return false;
    }

    /**
     * Builds the url patterns for one page model: a match-all pattern when the class has no
     * {@link TargetUrl} annotation, otherwise one pattern per annotation value with
     * {@code .} taken literally and {@code *} expanded to a run of non-quote, non-# characters.
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private static List<Pattern> targetUrlPatterns(Class clazz) {
        List<Pattern> patterns = new ArrayList<Pattern>();
        Annotation annotation = clazz.getAnnotation(TargetUrl.class);
        if (annotation == null) {
            patterns.add(Pattern.compile(".*"));
            return patterns;
        }
        for (String s : ((TargetUrl) annotation).value()) {
            patterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
        }
        return patterns;
    }
}