package guang.crawler.siteManager.commandlet;
import guang.crawler.commons.WebURL;
import guang.crawler.jsonServer.Commandlet;
import guang.crawler.jsonServer.DataPacket;
import guang.crawler.siteManager.SiteConfig;
import guang.crawler.siteManager.SiteManager;
import guang.crawler.siteManager.docid.DocidServer;
import guang.crawler.siteManager.urlFilter.ObjectFilter;
import java.util.LinkedList;
import java.util.regex.Pattern;
import com.alibaba.fastjson.JSON;
/**
* 爬虫工作者爬取了网页之后获得了一些URL列表,需要传输到站点管理器中,当前Commandlet将处理该请求.
*
* @author sun
*
*/
public class URLsPutter implements Commandlet {
/**
* URL过滤器
*/
private final ObjectFilter urlFilter;
/**
* 该站点管理器管理的采集点配置的允许的URL的正则表达式
*/
private Pattern[] allowPatterns;
/**
* 该站点管理器管理的采集点配置的拒绝的URL的正则表达式
*/
private Pattern[] denyPatterns;
/**
* URL的限制深度
*/
private byte limitDepth;
/**
* 创建URLsPutter.
*/
public URLsPutter() {
// 获取URL过滤器
this.urlFilter = SiteManager.me()
.getUrlsFilter();
// 获取限制的深度
Byte wgnlimitDepth = SiteConfig.me()
.getSiteToHandle()
.getWebGatherNodeInfo()
.getWgnDepthLimit();
if (wgnlimitDepth == null) {
this.limitDepth = -1;
} else {
this.limitDepth = wgnlimitDepth;
}
// 获取允许的URL正则表达式格式
String allowRule = SiteConfig.me()
.getSiteToHandle()
.getWebGatherNodeInfo()
.getWgnAllowRule();
if ((allowRule != null) && (allowRule.trim()
.length() != 0)) {
String[] allowRules = allowRule.split(",");
this.allowPatterns = new Pattern[allowRules.length];
for (int i = 0; i < allowRules.length; i++) {
this.allowPatterns[i] = Pattern.compile(allowRules[i]);
}
}
// 获取禁止的URL正则表达式格式
String denyRule = SiteConfig.me()
.getSiteToHandle()
.getWebGatherNodeInfo()
.getWgnDenyRule();
if ((denyRule != null) && (denyRule.trim()
.length() != 0)) {
String[] denyRules = denyRule.split(",");
this.denyPatterns = new Pattern[denyRules.length];
for (int i = 0; i < denyRules.length; i++) {
this.denyPatterns[i] = Pattern.compile(denyRules[i]);
}
}
}
@Override
public DataPacket doCommand(final DataPacket request) {
LinkedList<WebURL> filteredResult = null;
SiteManager siteManager = null;
siteManager = SiteManager.me();
WebURL parent = this.getParentURL(request);
int count = this.getURLListCount(request);
if (count > 0) {
// 过滤URL列表
filteredResult = this.filterURLs(request, parent, count);
// 添加URL列表
this.setAndAddURLs(filteredResult, siteManager);
}
if (parent != null) {
siteManager.getWorkingTaskList()
.delete(parent);
System.out.println("[DELETEED] " + parent.getURL());
}
return null;
}
/**
* 根据种种过滤条件过滤掉相关的URL
*
* @param request
* @param parent
* @param count
* @return
*/
private LinkedList<WebURL> filterURLs(final DataPacket request,
final WebURL parent, final int count) {
LinkedList<WebURL> filteredResult;
filteredResult = new LinkedList<WebURL>();
for (int i = 0; i < count; i++) {
String webUrlJson = request.getData()
.get("URL" + i);
WebURL url = JSON.parseObject(webUrlJson, WebURL.class);
// 检查该URL是否符合站点的过滤条件
// 1. 检查该URL的深度是否合法
boolean success = this.ifAllowByDepth(parent, url);
if (!success) {
continue;
}
// 2. 检查该URL是否被允许
boolean allow = this.ifAllowByRules(url);
if (!allow) {
continue;
}
// 3. 检查该URL是否被拒绝
boolean deny = this.ifDenyByRules(url);
if (deny) {
continue;
}
// 4. 检查该URL是否已经重复
boolean contains = this.urlFilter.containsAndSet(url.getURL());
if (!contains) {
filteredResult.add(url);
}
}
return filteredResult;
}
/**
* 获取当前URL的父URL
*
* @param request
* @return
*/
private WebURL getParentURL(final DataPacket request) {
String parentJSON = request.getData()
.get("PARENT");
WebURL parent = null;
if (parentJSON != null) {
parent = JSON.parseObject(parentJSON, WebURL.class);
}
return parent;
}
/**
* 获取请求添加的URL的数量
*
* @param request
* @return
*/
private int getURLListCount(final DataPacket request) {
String countStr = request.getData()
.get("COUNT");
int count = 0;
if (countStr != null) {
try {
count = Integer.parseInt(countStr);
} catch (NumberFormatException e) {
count = 0;
}
}
return count;
}
/**
* 检查URL的深度是否合法
*
* @param limitDepth
* @param parent
* @param current
* @return
*/
private boolean ifAllowByDepth(final WebURL parent, final WebURL current) {
// 获取当前深度
short depth = 0;
if (parent != null) {
if (current.isShouldDepthIncrease()) {
depth = (short) (parent.getDepth() + 1);
} else {
depth = parent.getDepth();
}
current.setParentDocid(parent.getDocid());
}
current.setDepth(depth);
// 检查深度是否合法
if ((this.limitDepth < 0) || (depth <= this.limitDepth)) {
return true;
}
return false;
}
/**
* 检查是否被允许
*
* @param allowPatterns
* @param current
* @return
*/
private boolean ifAllowByRules(final WebURL current) {
if (this.allowPatterns == null) {
return true;
}
for (Pattern pattern : this.allowPatterns) {
if (pattern.matcher(current.getURL())
.matches()) {
return true;
}
}
return false;
}
/**
* 检查是否被拒绝
*
* @param denyPatterns
* @param current
* @return
*/
private boolean ifDenyByRules(final WebURL current) {
if (this.denyPatterns == null) {
return false;
}
if (this.denyPatterns != null) {
for (Pattern pattern : this.denyPatterns) {
if (pattern.matcher(current.getURL())
.matches()) {
return true;
}
}
}
return false;
}
/**
* 为原始的URL列表添加相关属性,然后加入todo队列中。
*
* @param filteredResult
* @param siteManager
*/
private void setAndAddURLs(final LinkedList<WebURL> filteredResult,
final SiteManager siteManager) {
if (filteredResult.size() > 0) {
DocidServer docidServer = siteManager.getDocidServer();
String siteManagerId = SiteConfig.me()
.getSiteManagerInfo()
.getSiteManagerId();
String siteId = SiteConfig.me()
.getSiteToHandle()
.getSiteId();
for (WebURL url : filteredResult) {
url.setSiteManagerId(siteManagerId);
url.setSiteId(siteId);
url.setDocid(docidServer.next(url));
siteManager.getToDoTaskList()
.put(url);
System.out.println("[ADD] " + url.getURL());
}
}
}
}