package focusedCrawler.link.frontier.selector; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import com.google.common.collect.MinMaxPriorityQueue; import focusedCrawler.link.frontier.LinkRelevance; /** * Implements a link selection strategy that tries to select the top-k links from all different * top-level domains. */ public class MaximizeWebsitesLinkSelector implements LinkSelector { private static final int MAX_LINKS_PER_DOMAIN = 5; private Map<String, MinMaxPriorityQueue<LinkRelevance>> topkLinksPerDomain; private int numberOfLinks; @Override public void startSelection(int numberOfLinks) { this.numberOfLinks = numberOfLinks; this.topkLinksPerDomain = new HashMap<>(); } @Override public void evaluateLink(LinkRelevance link) { if (link.getRelevance() > 0) { String domainName = link.getTopLevelDomainName(); MinMaxPriorityQueue<LinkRelevance> domainQueue = topkLinksPerDomain.get(domainName); if (domainQueue == null) { domainQueue = newPriorityQueue(MAX_LINKS_PER_DOMAIN); topkLinksPerDomain.put(domainName, domainQueue); } domainQueue.add(link); } } @Override public List<LinkRelevance> getSelectedLinks() { List<LinkRelevance> links = new ArrayList<>(); while (links.size() < numberOfLinks && !topkLinksPerDomain.isEmpty()) { // adds the URL with max score of each domain MinMaxPriorityQueue<LinkRelevance> topk = newPriorityQueue(numberOfLinks); Iterator<Entry<String, MinMaxPriorityQueue<LinkRelevance>>> it = topkLinksPerDomain.entrySet().iterator(); while (it.hasNext()) { MinMaxPriorityQueue<LinkRelevance> domain = it.next().getValue(); topk.add(domain.poll()); if (domain.isEmpty()) { it.remove(); } } for(LinkRelevance link : topk) { links.add(link); } } this.topkLinksPerDomain = null; // clean-up reference return links; } private MinMaxPriorityQueue<LinkRelevance> newPriorityQueue(int maxSize) { return MinMaxPriorityQueue .orderedBy(LinkRelevance.DESC_ORDER_COMPARATOR) .maximumSize(maxSize) .create(); } }