/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.link.frontier;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Paths;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Timer;
import com.codahale.metrics.Timer.Context;
import focusedCrawler.link.DownloadScheduler;
import focusedCrawler.link.frontier.selector.LinkSelector;
import focusedCrawler.util.DataNotFoundException;
import focusedCrawler.util.LinkFilter;
import focusedCrawler.util.LogFile;
import focusedCrawler.util.MetricsManager;
import focusedCrawler.util.persistence.Tuple;
import focusedCrawler.util.persistence.TupleIterator;
/**
 * Manages the crawler frontier.
 *
 * <p>It sits between the persistent {@link Frontier} and the in-memory
 * {@link DownloadScheduler}: links are inserted into the frontier (after being
 * checked by {@link #isRelevant(LinkRelevance)}), periodically loaded into the
 * scheduler through the configured {@link LinkSelector}, and handed out one at
 * a time by {@link #nextURL()}.
 *
 * @author Luciano Barbosa
 * @version 1.0
 */
public class FrontierManager {

    private static final Logger logger = LoggerFactory.getLogger(FrontierManager.class);

    /** Relevance value given to the robots.txt link queued for each newly seen host. */
    private static final int ROBOTS_LINK_RELEVANCE = 299;

    private final Frontier frontier;
    private final int linksToLoad;
    private final LinkFilter linkFilter;
    private final LinkSelector linkSelector;
    private final HostManager hostsManager;
    private final boolean downloadRobots;
    private final DownloadScheduler scheduler;
    private final LogFile schedulerLog;
    private final MetricsManager metricsManager;

    private final Timer frontierLoadTimer;
    private final Timer insertTimer;
    private final Timer selectTimer;

    // Statistics of the most recent successful load; exposed through gauges.
    private boolean linksRejectedDuringLastLoad;
    private int availableLinksDuringLoad;
    private int rejectedLinksDuringLoad;
    private int uncrawledLinksDuringLoad;
    private int unavailableLinksDuringLoad;

    /**
     * Creates the manager, registers its metrics and performs a first load of
     * links from the frontier into the scheduler.
     *
     * @param frontier                  persistent frontier backing this manager
     * @param dataPath                  base directory; {@code data_hosts} and
     *                                  {@code data_monitor/scheduledlinks.csv} are
     *                                  resolved below it
     * @param downloadRobots            whether a robots.txt link is queued the first
     *                                  time a host is seen
     * @param linksToLoad               number of links requested from the selector on
     *                                  each load
     * @param schedulerMaxLinks         passed to the {@link DownloadScheduler}
     * @param schdulerMinAccessInterval passed to the {@link DownloadScheduler}
     * @param linkSelector              strategy that picks which links get scheduled
     * @param linkFilter                filter applied to URLs before insertion
     * @param metricsManager            registry for the gauges and timers
     */
    public FrontierManager(Frontier frontier, String dataPath, boolean downloadRobots,
                           int linksToLoad, int schedulerMaxLinks, int schdulerMinAccessInterval,
                           LinkSelector linkSelector, LinkFilter linkFilter,
                           MetricsManager metricsManager) {
        this.frontier = frontier;
        this.hostsManager = new HostManager(Paths.get(dataPath, "data_hosts"));
        this.downloadRobots = downloadRobots;
        this.linksToLoad = linksToLoad;
        this.linkSelector = linkSelector;
        this.linkFilter = linkFilter;
        this.scheduler = new DownloadScheduler(schdulerMinAccessInterval, schedulerMaxLinks);
        this.schedulerLog = new LogFile(Paths.get(dataPath, "data_monitor", "scheduledlinks.csv"));
        this.metricsManager = metricsManager;
        this.setupMetrics();
        this.frontierLoadTimer = metricsManager.getTimer("frontier_manager.load.time");
        this.insertTimer = metricsManager.getTimer("frontier_manager.insert.time");
        this.selectTimer = metricsManager.getTimer("frontier_manager.select.time");
        this.loadQueue(linksToLoad);
    }

    /** Registers the gauges that expose scheduler state and last-load statistics. */
    private void setupMetrics() {
        Gauge<Integer> numberOfLinksGauge = () -> scheduler.numberOfLinks();
        metricsManager.register("frontier_manager.scheduler.number_of_links", numberOfLinksGauge);

        Gauge<Integer> nonExpiredDomainsGauge = () -> scheduler.numberOfNonExpiredDomains();
        metricsManager.register("frontier_manager.scheduler.non_expired_domains", nonExpiredDomainsGauge);

        Gauge<Integer> emptyDomainsGauge = () -> scheduler.numberOfEmptyDomains();
        metricsManager.register("frontier_manager.scheduler.empty_domains", emptyDomainsGauge);

        Gauge<Integer> availableLinksGauge = () -> availableLinksDuringLoad;
        metricsManager.register("frontier_manager.last_load.available", availableLinksGauge);

        Gauge<Integer> unavailableLinksGauge = () -> unavailableLinksDuringLoad;
        metricsManager.register("frontier_manager.last_load.unavailable", unavailableLinksGauge);

        Gauge<Integer> rejectedLinksGauge = () -> rejectedLinksDuringLoad;
        metricsManager.register("frontier_manager.last_load.rejected", rejectedLinksGauge);

        // NOTE(review): this name uses "last_frontier_load" while its siblings use
        // "last_load". Kept as-is because external consumers may depend on it.
        Gauge<Integer> uncrawledLinksGauge = () -> uncrawledLinksDuringLoad;
        metricsManager.register("frontier_manager.last_frontier_load.uncrawled", uncrawledLinksGauge);
    }

    /**
     * Returns the underlying frontier. Equivalent to {@link #getFrontier()}.
     *
     * @return the frontier given at construction time
     */
    public Frontier getFrontierPersistent() {
        return getFrontier();
    }

    /** Removes every link currently held by the scheduler. */
    public void clearFrontier() {
        logger.info("Cleaning frontier... current queue size: {}", scheduler.numberOfLinks());
        scheduler.clear();
        logger.info("# Queue size:{}", scheduler.numberOfLinks());
    }

    /**
     * Refills the scheduler from the frontier.
     *
     * <p>The scheduler is cleared and the frontier committed, then every frontier
     * entry with a relevance greater than zero is offered to the link selector if
     * the scheduler reports it can be downloaded now. The links chosen by the
     * selector are finally added to the scheduler and the per-load statistics are
     * updated.
     *
     * @param numberOfLinks number of links requested from the link selector
     */
    private void loadQueue(int numberOfLinks) {
        logger.info("Loading more links from frontier into the scheduler...");
        scheduler.clear();
        frontier.commit();
        Context timerContext = frontierLoadTimer.time();
        try (TupleIterator<LinkRelevance> it = frontier.iterator()) {

            int rejectedLinks = 0;
            int uncrawledLinks = 0;
            int availableLinks = 0;
            int unavailableLinks = 0;

            linkSelector.startSelection(numberOfLinks);
            while (it.hasNext()) {
                Tuple<LinkRelevance> tuple = it.next();
                LinkRelevance link = tuple.getValue();

                // Links already downloaded or not relevant
                if (link.getRelevance() <= 0) {
                    continue;
                }
                uncrawledLinks++;

                // check whether link can be downloaded now according to politeness constraints
                if (scheduler.canDownloadNow(link)) {
                    // consider link to be downloaded
                    linkSelector.evaluateLink(link);
                    availableLinks++;
                } else {
                    unavailableLinks++;
                    rejectedLinks++;
                }
            }

            int linksAdded = 0;
            for (LinkRelevance link : linkSelector.getSelectedLinks()) {
                if (scheduler.addLink(link)) {
                    linksAdded++;
                } else {
                    rejectedLinks++;
                }
            }

            this.availableLinksDuringLoad = availableLinks;
            this.unavailableLinksDuringLoad = unavailableLinks;
            this.uncrawledLinksDuringLoad = uncrawledLinks;
            this.rejectedLinksDuringLoad = rejectedLinks;
            this.linksRejectedDuringLastLoad = rejectedLinks > 0;

            logger.info("Loaded {} links.", linksAdded);
        } catch (Exception e) {
            // A failed load says nothing about whether the frontier is empty, and the
            // flag would otherwise keep the value of an older load. Treat the failure
            // like a load with rejected links so that nextURL() raises the "not
            // available right now" DataNotFoundException instead of the "run out of
            // links" one.
            this.linksRejectedDuringLastLoad = true;
            logger.error("Failed to read items from the frontier.", e);
        } finally {
            timerContext.stop();
        }
    }

    /**
     * Tells whether a link should be inserted into the frontier.
     *
     * @param elem the candidate link
     * @return {@code true} only if the relevance is greater than zero, the frontier
     *         does not already contain the link, and the link filter accepts its URL
     * @throws FrontierPersistentException if the frontier lookup fails
     */
    public boolean isRelevant(LinkRelevance elem) throws FrontierPersistentException {
        if (elem.getRelevance() <= 0) {
            return false;
        }
        if (frontier.exist(elem) != null) {
            // already present in the frontier
            return false;
        }
        return linkFilter.accept(elem.getURL().toString());
    }

    /**
     * Inserts every link of the array, in order, using {@link #insert(LinkRelevance)}.
     *
     * @param linkRelevance links to insert; a {@code null} array is treated as empty
     * @throws FrontierPersistentException if any insertion fails
     */
    public void insert(LinkRelevance[] linkRelevance) throws FrontierPersistentException {
        if (linkRelevance == null) {
            return;
        }
        for (LinkRelevance elem : linkRelevance) {
            this.insert(elem);
        }
    }

    /**
     * Inserts a link into the frontier if {@link #isRelevant(LinkRelevance)} accepts it.
     *
     * <p>When robots download is enabled and the link's host has not been seen
     * before, the host is recorded and a {@code /robots.txt} link for it is queued
     * first. A failure to queue the robots link is logged and does not prevent the
     * insertion of the link itself.
     *
     * @param linkRelevance the link to insert
     * @return {@code false} if the link was not considered relevant, otherwise the
     *         result of {@code frontier.insert}
     * @throws FrontierPersistentException if the frontier fails
     */
    public boolean insert(LinkRelevance linkRelevance) throws FrontierPersistentException {
        Context timerContext = insertTimer.time();
        try {
            if (!isRelevant(linkRelevance)) {
                return false;
            }
            if (downloadRobots) {
                URL url = linkRelevance.getURL();
                String hostName = url.getHost();
                if (!hostsManager.isKnown(hostName)) {
                    hostsManager.insert(hostName);
                    try {
                        URL robotUrl = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
                        LinkRelevance robotsLink = new LinkRelevance(robotUrl, ROBOTS_LINK_RELEVANCE,
                                LinkRelevance.Type.ROBOTS);
                        frontier.insert(robotsLink);
                    } catch (Exception e) {
                        // best effort: the page link below is still inserted
                        logger.warn("Failed to insert robots.txt for host: {}", hostName, e);
                    }
                }
            }
            return frontier.insert(linkRelevance);
        } finally {
            timerContext.stop();
        }
    }

    /**
     * Returns the next link to be downloaded.
     *
     * <p>If the scheduler has no link available, the queue is reloaded from the
     * frontier first. The returned link is passed to {@code frontier.delete} and
     * recorded in the scheduler log.
     *
     * @return the next link, never {@code null}
     * @throws DataNotFoundException created with {@code false} when links exist but
     *         none can be selected right now (pending links, or links rejected or
     *         not read during the last load), and with {@code true} when the
     *         frontier ran out of links
     * @throws FrontierPersistentException if the frontier fails
     */
    public LinkRelevance nextURL() throws FrontierPersistentException, DataNotFoundException {
        Context timerContext = selectTimer.time();
        try {
            if (!scheduler.hasLinksAvailable()) {
                loadQueue(linksToLoad);
            }
            LinkRelevance link = scheduler.nextLink();
            if (link == null) {
                if (scheduler.hasPendingLinks() || linksRejectedDuringLastLoad) {
                    throw new DataNotFoundException(false, "No links available for selection right now.");
                } else {
                    throw new DataNotFoundException(true, "Frontier run out of links.");
                }
            }
            frontier.delete(link);

            schedulerLog.printf("%d\t%.5f\t%s\n", System.currentTimeMillis(),
                                link.getRelevance(), link.getURL().toString());
            return link;
        } finally {
            timerContext.stop();
        }
    }

    /**
     * Commits and closes the frontier, then closes the host manager and the
     * scheduler log.
     *
     * <p>Each step runs even if a previous one threw, so a failure while committing
     * or closing the frontier does not leak the remaining resources. If several
     * steps throw, the exception of the last failing step is the one propagated.
     */
    public void close() {
        try {
            frontier.commit();
        } finally {
            try {
                frontier.close();
            } finally {
                try {
                    hostsManager.close();
                } finally {
                    schedulerLog.close();
                }
            }
        }
    }

    /**
     * Returns the underlying frontier.
     *
     * @return the frontier given at construction time
     */
    public Frontier getFrontier() {
        return frontier;
    }

    /**
     * Inserts the given seed URLs into the frontier with
     * {@code LinkRelevance.DEFAULT_RELEVANCE}.
     *
     * @param seeds seed URLs; {@code null} or an empty array is a no-op
     * @throws IllegalArgumentException if a seed is not a valid URL
     * @throws RuntimeException if the frontier fails while inserting a seed
     */
    public void addSeeds(String[] seeds) {
        if (seeds == null || seeds.length == 0) {
            return;
        }
        int count = 0;
        for (String seed : seeds) {
            logger.info("Adding seed URL: {}", seed);
            URL seedUrl;
            try {
                seedUrl = new URL(seed);
            } catch (MalformedURLException e) {
                throw new IllegalArgumentException("Invalid seed URL provided: " + seed, e);
            }
            LinkRelevance link = new LinkRelevance(seedUrl, LinkRelevance.DEFAULT_RELEVANCE);
            try {
                if (insert(link)) {
                    count++;
                }
            } catch (FrontierPersistentException e) {
                throw new RuntimeException("Failed to insert seed URL: " + seed, e);
            }
        }
        logger.info("Number of seeds added: {}", count);
    }
}