package focusedCrawler.link;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.concurrent.atomic.AtomicInteger;
import focusedCrawler.link.frontier.LinkRelevance;
public class DownloadScheduler {
private static class DomainNode {
final String domainName;
final Deque<LinkRelevance> links;
volatile long lastAccessTime;
public DomainNode(String domainName, long lastAccessTime) {
this.domainName = domainName;
this.links = new LinkedList<>();
this.lastAccessTime = lastAccessTime;
}
}
private final PriorityQueue<DomainNode> domainsQueue;
private final PriorityQueue<DomainNode> emptyDomainsQueue;
private final Map<String, DomainNode> domains;
private final long minimumAccessTime;
private final int maxLinksInScheduler;
private AtomicInteger numberOfLinks = new AtomicInteger(0);
public DownloadScheduler(int minimumAccessTimeInterval, int maxLinksInScheduler) {
this.minimumAccessTime = minimumAccessTimeInterval;
this.maxLinksInScheduler = maxLinksInScheduler;
this.domains = new HashMap<>();
this.emptyDomainsQueue = createDomainPriorityQueue();
this.domainsQueue = createDomainPriorityQueue();
}
private PriorityQueue<DomainNode> createDomainPriorityQueue() {
int initialCapacity = 10;
return new PriorityQueue<DomainNode>(initialCapacity, new Comparator<DomainNode>() {
@Override
public int compare(DomainNode o1, DomainNode o2) {
return Long.compare(o1.lastAccessTime, o2.lastAccessTime);
}
});
}
public boolean addLink(LinkRelevance link) {
removeExpiredNodes();
if(numberOfLinks() >= maxLinksInScheduler) {
return false; // ignore link
}
numberOfLinks.incrementAndGet();
String domainName = link.getTopLevelDomainName();
synchronized(this) {
DomainNode domainNode = domains.get(domainName);
if(domainNode == null) {
domainNode = new DomainNode(domainName, 0l);
domains.put(domainName, domainNode);
}
if(domainNode.links.isEmpty()) {
emptyDomainsQueue.remove(domainNode);
domainsQueue.add(domainNode);
}
domainNode.links.addLast(link);
}
return true;
}
private synchronized void removeExpiredNodes() {
while(true) {
DomainNode node = emptyDomainsQueue.peek();
if(node == null) {
break;
}
long expirationTime = node.lastAccessTime + minimumAccessTime;
if(System.currentTimeMillis() > expirationTime) {
emptyDomainsQueue.poll();
domains.remove(node.domainName);
} else {
break;
}
}
}
public LinkRelevance nextLink() {
LinkRelevance linkRelevance;
synchronized (this) {
DomainNode domainNode = domainsQueue.peek();
if (domainNode == null) {
// no domains available to be crawled
return null;
}
long now = System.currentTimeMillis();
long timeSinceLastAccess = now - domainNode.lastAccessTime;
if (timeSinceLastAccess < minimumAccessTime) {
// the domain with longest access time cannot be crawled right now
return null;
}
domainsQueue.poll();
linkRelevance = domainNode.links.removeFirst();
domainNode.lastAccessTime = System.currentTimeMillis();
if (domainNode.links.isEmpty()) {
emptyDomainsQueue.add(domainNode);
} else {
domainsQueue.add(domainNode);
}
}
numberOfLinks.decrementAndGet();
return linkRelevance;
}
public int numberOfNonExpiredDomains() {
removeExpiredNodes();
return domains.size();
}
public int numberOfAvailableDomains() {
int available = 0;
for(DomainNode node : domainsQueue) {
if(isAvailable(node)){
available++;
}
}
return available;
}
public int numberOfEmptyDomains() {
return emptyDomainsQueue.size();
}
public int numberOfLinks() {
return numberOfLinks.get();
}
public boolean hasPendingLinks() {
return numberOfLinks() > 0;
}
public boolean hasLinksAvailable() {
// pick domain with longest access time
DomainNode domainNode = domainsQueue.peek();
if(domainNode == null) {
return false;
}
return isAvailable(domainNode);
}
private boolean isAvailable(DomainNode domainNode) {
long now = System.currentTimeMillis();
long timeSinceLastAccess = now - domainNode.lastAccessTime;
if(timeSinceLastAccess < minimumAccessTime) {
return false;
}
return true;
}
public synchronized void clear() {
Iterator<Entry<String, DomainNode>> it = domains.entrySet().iterator();
while(it.hasNext()) {
DomainNode node = it.next().getValue();
numberOfLinks.addAndGet(-node.links.size()); // adds negative value
node.links.clear();
}
while(true) {
DomainNode node = domainsQueue.poll();
if(node == null) {
break;
}
emptyDomainsQueue.add(node);
}
}
public boolean canDownloadNow(LinkRelevance link) {
DomainNode domain = domains.get(link.getTopLevelDomainName());
if(domain == null) {
return true;
} else {
return isAvailable(domain);
}
}
}