package focusedCrawler.link.frontier;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import focusedCrawler.link.LinkStorageConfig;
import focusedCrawler.link.frontier.selector.LinkSelector;
import focusedCrawler.link.frontier.selector.MaximizeWebsitesLinkSelector;
import focusedCrawler.link.frontier.selector.MultiLevelLinkSelector;
import focusedCrawler.link.frontier.selector.NonRandomLinkSelector;
import focusedCrawler.link.frontier.selector.RandomLinkSelector;
import focusedCrawler.link.frontier.selector.TopkLinkSelector;
import focusedCrawler.util.LinkFilter;
import focusedCrawler.util.MetricsManager;
import focusedCrawler.util.ParameterFile;
public class FrontierManagerFactory {
private static final Logger logger = LoggerFactory.getLogger(FrontierManagerFactory.class);
public static FrontierManager create(LinkStorageConfig config,
String configPath,
String dataPath,
String seedFile,
MetricsManager metricsManager) {
String[] seedUrls = ParameterFile.getSeeds(seedFile);
String directory = Paths.get(dataPath, config.getLinkDirectory()).toString();
Frontier frontier = null;
if (config.isUseScope()) {
Map<String, Integer> scope = extractDomains(seedUrls);
frontier = new Frontier(directory, config.getMaxCacheUrlsSize(), scope);
} else {
frontier = new Frontier(directory, config.getMaxCacheUrlsSize());
}
LinkFilter linkFilter = new LinkFilter(configPath);
LinkSelector linkSelector = createLinkSelector(config);
logger.info("LINK_SELECTOR: "+linkSelector.getClass().getName());
FrontierManager frontierManager = new FrontierManager(
frontier,
dataPath,
config.getDownloadSitemapXml(),
config.getSchedulerMaxLinks(),
config.getSchedulerMaxLinks(),
config.getSchedulerHostMinAccessInterval(),
linkSelector,
linkFilter,
metricsManager);
frontierManager.addSeeds(seedUrls);
return frontierManager;
}
private static LinkSelector createLinkSelector(LinkStorageConfig config) {
String linkSelector = config.getLinkSelector();
if (linkSelector == null || linkSelector.isEmpty()) {
throw new IllegalArgumentException("Link selector not configured: " + linkSelector);
}
if (linkSelector.equals("TopkLinkSelector")) {
return new TopkLinkSelector();
} else if (linkSelector.equals("RandomLinkSelector")) {
return new RandomLinkSelector();
} else if (linkSelector.equals("NonRandomLinkSelector")) {
return new NonRandomLinkSelector();
} else if (linkSelector.equals("MultiLevelLinkSelector")) {
return new MultiLevelLinkSelector();
} else if (linkSelector.equals("MaximizeWebsitesLinkSelector")) {
return new MaximizeWebsitesLinkSelector();
} else {
throw new IllegalArgumentException("Unknown link selector configured: " + linkSelector);
}
}
private static HashMap<String, Integer> extractDomains(String[] urls) {
HashMap<String, Integer> scope = new HashMap<String, Integer>();
for (int i = 0; i < urls.length; i++) {
try {
URL url = new URL(urls[i]);
String host = url.getHost();
scope.put(host, new Integer(1));
} catch (MalformedURLException e) {
logger.warn("Invalid URL in seeds file. Ignoring URL: " + urls[i]);
}
}
logger.info("Using scope of following domains:");
for (String host: scope.keySet()) {
logger.info(host);
}
return scope;
}
}