package com.geccocrawler.gecco.spider;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.reflections.Reflections;
import org.reflections.scanners.TypeAnnotationsScanner;
import org.reflections.util.ConfigurationBuilder;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.downloader.DownloaderAOPFactory;
import com.geccocrawler.gecco.downloader.DownloaderFactory;
import com.geccocrawler.gecco.downloader.MonitorDownloaderFactory;
import com.geccocrawler.gecco.dynamic.GeccoClassLoader;
import com.geccocrawler.gecco.dynamic.GeccoJavaReflectionAdapter;
import com.geccocrawler.gecco.pipeline.DefaultPipelineFactory;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.pipeline.PipelineFactory;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.render.MonitorRenderFactory;
import com.geccocrawler.gecco.spider.render.RenderFactory;
import com.geccocrawler.gecco.spider.render.RenderType;
import com.geccocrawler.gecco.utils.ReflectUtils;
import com.geccocrawler.gecco.utils.UrlMatcher;
/**
* SpiderBean是爬虫渲染的JavaBean的统一接口类,所有Bean均继承该接口。SpiderBeanFactroy会根据请求的url地址,
* 匹配相应的SpiderBean,同时生成该SpiderBean的上下文SpiderBeanContext. SpiderBeanContext包括需要改SpiderBean的渲染类
* (目前支持HTML、JSON两种Bean的渲染方式)、下载前处理类、下载后处理类以及渲染完成后对SpiderBean的后续处理Pipeline。
*
* @author huchengyi
*
*/
public class SpiderBeanFactory {
private static final Log LOG = LogFactory.getLog(SpiderBeanFactory.class);
/**
* 匹配的SpriderBean matchUrl:SpiderBean
*/
private Map<String, Class<? extends SpiderBean>> spiderBeans;
/**
* 匹配的SpiderBean上下文 SpiderBeanClassName:SpiderBeanClass
*/
private Map<String, SpiderBeanContext> spiderBeanContexts;
private DownloaderFactory downloaderFactory;
private DownloaderAOPFactory downloaderAOPFactory;
private RenderFactory renderFactory;
private PipelineFactory pipelineFactory;
protected Reflections reflections;
public SpiderBeanFactory(String classPath) {
this(classPath, null);
}
public SpiderBeanFactory(String classPath, PipelineFactory pipelineFactory) {
if (StringUtils.isNotEmpty(classPath)) {
reflections = new Reflections(
ConfigurationBuilder.build("com.geccocrawler.gecco", classPath, GeccoClassLoader.get())
.setMetadataAdapter(new GeccoJavaReflectionAdapter()));
// reflections = new Reflections("com.geccocrawler.gecco", classPath);
} else {
reflections = new Reflections(ConfigurationBuilder.build("com.geccocrawler.gecco", GeccoClassLoader.get())
.setMetadataAdapter(new GeccoJavaReflectionAdapter()));
// reflections = new Reflections("com.geccocrawler.gecco");
}
dynamic();
this.downloaderFactory = new MonitorDownloaderFactory(reflections);
this.downloaderAOPFactory = new DownloaderAOPFactory(reflections);
this.renderFactory = new MonitorRenderFactory(reflections);
if (pipelineFactory != null) {
this.pipelineFactory = pipelineFactory;
} else {
this.pipelineFactory = new DefaultPipelineFactory(reflections);
}
this.spiderBeans = new ConcurrentHashMap<String, Class<? extends SpiderBean>>();
this.spiderBeanContexts = new ConcurrentHashMap<String, SpiderBeanContext>();
loadSpiderBean(reflections);
}
/**
* 动态增加的spiderBean
*/
private void dynamic() {
GeccoClassLoader gcl = GeccoClassLoader.get();
for (String className : gcl.getClasses().keySet()) {
reflections.getStore().get(TypeAnnotationsScanner.class.getSimpleName()).put(Gecco.class.getName(),
className);
}
}
private void loadSpiderBean(Reflections reflections) {
Set<Class<?>> spiderBeanClasses = reflections.getTypesAnnotatedWith(Gecco.class);
for (Class<?> spiderBeanClass : spiderBeanClasses) {
addSpiderBean(spiderBeanClass);
}
}
@SuppressWarnings({ "unchecked" })
public void addSpiderBean(Class<?> spiderBeanClass) {
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
for(String matchUrl : gecco.matchUrl()) {
//String matchUrl = gecco.matchUrl();
try {
// SpiderBean spider = (SpiderBean)spiderBeanClass.newInstance();
// 判断是不是SpiderBeanClass????
if (spiderBeans.containsKey(matchUrl)) {
LOG.warn("there are multil '" + matchUrl + "' ,first htmlBean will be Override。");
}
spiderBeans.put(matchUrl, (Class<? extends SpiderBean>) spiderBeanClass);
SpiderBeanContext context = initContext(spiderBeanClass);
spiderBeanContexts.put(spiderBeanClass.getName(), context);
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
public void removeSpiderBean(Class<?> spiderBeanClass) {
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
for(String matchUrl : gecco.matchUrl()) {
//String matchUrl = gecco.matchUrl();
try {
spiderBeans.remove(matchUrl);
spiderBeanContexts.remove(spiderBeanClass.getName());
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
public Class<? extends SpiderBean> matchSpider(HttpRequest request) {
String url = request.getUrl();
Class<? extends SpiderBean> commonSpider = null;// 通用爬虫
for (Map.Entry<String, Class<? extends SpiderBean>> entrys : spiderBeans.entrySet()) {
Class<? extends SpiderBean> spider = entrys.getValue();
String urlPattern = entrys.getKey();
Map<String, String> params = UrlMatcher.match(url, urlPattern);
if (params != null) {
request.setParameters(params);
return spider;
} else {
if (urlPattern.equals("*")) {
commonSpider = spider;
}
}
}
if (commonSpider != null) {// 如果包含通用爬虫,返回通用爬虫
return commonSpider;
}
return null;
}
public SpiderBeanContext getContext(Class<? extends SpiderBean> spider) {
return spiderBeanContexts.get(spider.getName());
}
private SpiderBeanContext initContext(Class<?> spiderBeanClass) {
SpiderBeanContext context = new SpiderBeanContext();
// 关联的after、before、downloader
downloadContext(context, spiderBeanClass);
// 关联的render
renderContext(context, spiderBeanClass);
// 关联的pipelines
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
String[] pipelineNames = gecco.pipelines();
pipelineContext(context, pipelineNames);
return context;
}
private void downloadContext(SpiderBeanContext context, Class<?> spiderBeanClass) {
String geccoName = spiderBeanClass.getName();
context.setBeforeDownload(downloaderAOPFactory.getBefore(geccoName));
context.setAfterDownload(downloaderAOPFactory.getAfter(geccoName));
Gecco gecco = spiderBeanClass.getAnnotation(Gecco.class);
String downloader = gecco.downloader();
context.setDownloader(downloaderFactory.getDownloader(downloader));
context.setTimeout(gecco.timeout());
}
private void renderContext(SpiderBeanContext context, Class<?> spiderBeanClass) {
RenderType renderType = RenderType.HTML;
if (ReflectUtils.haveSuperType(spiderBeanClass, JsonBean.class)) {
renderType = RenderType.JSON;
}
context.setRender(renderFactory.getRender(renderType));
}
@SuppressWarnings({ "rawtypes" })
private void pipelineContext(SpiderBeanContext context, String[] pipelineNames) {
if (pipelineNames != null && pipelineNames.length > 0) {
List<Pipeline> pipelines = new ArrayList<Pipeline>();
for (String pipelineName : pipelineNames) {
if (StringUtils.isEmpty(pipelineName)) {
continue;
}
Pipeline pipeline = pipelineFactory.getPipeline(pipelineName);
if (pipeline != null) {
pipelines.add(pipeline);
}
}
context.setPipelines(pipelines);
}
}
public DownloaderAOPFactory getDownloaderAOPFactory() {
return downloaderAOPFactory;
}
public RenderFactory getRenderFactory() {
return renderFactory;
}
public PipelineFactory getPipelineFactory() {
return pipelineFactory;
}
public DownloaderFactory getDownloaderFactory() {
return downloaderFactory;
}
public Reflections getReflections() {
return reflections;
}
}