package com.geccocrawler.gecco.scheduler;
import java.util.Comparator;
import java.util.NavigableSet;
import java.util.concurrent.ConcurrentSkipListSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.geccocrawler.gecco.request.HttpRequest;
/**
* 保证队列内容唯一,剔除重复抓取
*
* @author huchengyi
*
*/
public class UniqueSpiderScheduler implements Scheduler {
private static Log log = LogFactory.getLog(UniqueSpiderScheduler.class);
private NavigableSet<SortHttpRequest> set;
public UniqueSpiderScheduler() {
set = new ConcurrentSkipListSet<SortHttpRequest>(new Comparator<SortHttpRequest>() {
@Override
public int compare(SortHttpRequest o1, SortHttpRequest o2) {
if(o1.getHttpRequest().hashCode() == o2.getHttpRequest().hashCode()) {
if(o1.getHttpRequest().equals(o2.getHttpRequest())) {
return 0;
}
}
return (o1.getPriority() - o2.getPriority()) > 0 ? 1 : -1 ;
}
});
}
@Override
public HttpRequest out() {
SortHttpRequest sortHttpRequest = set.pollFirst();
if(sortHttpRequest == null) {
return null;
}
long priority = sortHttpRequest.getPriority();
HttpRequest request = sortHttpRequest.getHttpRequest();
if(request != null && log.isDebugEnabled()) {
log.debug("OUT("+priority+"):"+request.getUrl()+"(Referer:"+request.getHeaders().get("Referer")+")");
}
return request;
}
@Override
public void into(HttpRequest request) {
long priority = System.nanoTime();
boolean success = set.add(new SortHttpRequest(priority, request));
if(success && log.isDebugEnabled()) {
log.debug("INTO("+priority+"):"+request.getUrl()+"(Referer:"+request.getHeaders().get("Referer")+")");
}
if(!success && log.isDebugEnabled()) {
log.error("not unique request : " + request.getUrl());
}
}
private class SortHttpRequest {
private long priority;
private HttpRequest httpRequest;
public SortHttpRequest(long priority, HttpRequest httpRequest) {
super();
this.priority = priority;
this.httpRequest = httpRequest;
}
public long getPriority() {
return priority;
}
public HttpRequest getHttpRequest() {
return httpRequest;
}
}
}