/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.link;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import focusedCrawler.crawler.async.RobotsTxtHandler;
import focusedCrawler.crawler.async.SitemapXmlHandler;
import focusedCrawler.link.backlink.BacklinkSurfer;
import focusedCrawler.link.classifier.LinkClassifier;
import focusedCrawler.link.classifier.LinkClassifierFactory;
import focusedCrawler.link.classifier.LinkClassifierHub;
import focusedCrawler.link.classifier.builder.LinkClassifierBuilder;
import focusedCrawler.link.classifier.builder.LinkNeighborhoodWrapper;
import focusedCrawler.link.frontier.FrontierManager;
import focusedCrawler.link.frontier.FrontierManagerFactory;
import focusedCrawler.link.frontier.FrontierPersistentException;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.DataNotFoundException;
import focusedCrawler.util.MetricsManager;
import focusedCrawler.util.storage.Storage;
import focusedCrawler.util.storage.StorageDefault;
import focusedCrawler.util.storage.StorageException;
import focusedCrawler.util.storage.distribution.StorageBinder;
import focusedCrawler.util.string.StopList;
import focusedCrawler.util.string.StopListFile;
/**
 *
 * <p>Description: This class receives links to be inserted
 * into the frontier, sends links to the crawler, and starts the link
 * storage server.</p>
 *
 * <p>Copyright: Copyright (c) 2004</p>
 *
 * @author Luciano Barbosa
 * @version 1.0
 */
public class LinkStorage extends StorageDefault {

    public static final Logger logger = LoggerFactory.getLogger(LinkStorage.class);

    // Relevance score assigned to sitemap URLs when they are inserted into the
    // frontier (was an unnamed magic number used in two places).
    private static final double SITEMAP_RELEVANCE = 299;

    private final boolean getBacklinks;
    private final boolean getOutlinks;
    // Run online learning every 'learnLimit' pages (value comes from
    // config.getLearningLimit(); a non-positive value disables it).
    private final int learnLimit;

    private final FrontierManager frontierManager;
    private final BipartiteGraphManager graphManager;
    private final OnlineLearning onlineLearning;

    // Ensures at most one online-learning run executes at a time; insert(Page)
    // may be called concurrently by multiple downloader threads.
    private AtomicBoolean onlineLearningIsRunning = new AtomicBoolean(false);

    private AtomicInteger numberOfPages = new AtomicInteger(0);
    private AtomicInteger numberOfBacklink = new AtomicInteger(0);

    public LinkStorage(LinkStorageConfig config,
                       BipartiteGraphManager manager,
                       FrontierManager frontierManager)
                       throws IOException {
        this(config, manager, frontierManager, null);
    }

    public LinkStorage(LinkStorageConfig config,
                       BipartiteGraphManager manager,
                       FrontierManager frontierManager,
                       OnlineLearning onlineLearning) throws IOException {
        this.frontierManager = frontierManager;
        this.graphManager = manager;
        this.getBacklinks = config.getBacklinks();
        this.getOutlinks = config.getOutlinks();
        this.onlineLearning = onlineLearning;
        this.learnLimit = config.getLearningLimit();
    }

    /**
     * Shuts down the graph repository and the frontier manager, releasing
     * their underlying resources.
     */
    public void close() {
        logger.info("Shutting down GraphManager...");
        graphManager.getRepository().close();
        logger.info("Shutting down FrontierManager...");
        this.frontierManager.close();
        logger.info("done.");
    }

    /**
     * Dispatches an incoming object to the appropriate typed insert method.
     *
     * @param obj a {@link Page}, {@link RobotsTxtHandler.RobotsData} or
     *            {@link SitemapXmlHandler.SitemapData}; other types are ignored
     * @return the result of {@code insert(Page)} for pages, otherwise {@code null}
     * @throws StorageException if the page insertion fails
     */
    public Object insert(Object obj) throws StorageException {
        if (obj instanceof Page) {
            return insert((Page) obj);
        }
        else if (obj instanceof RobotsTxtHandler.RobotsData) {
            insert((RobotsTxtHandler.RobotsData) obj);
        }
        else if (obj instanceof SitemapXmlHandler.SitemapData) {
            insert((SitemapXmlHandler.SitemapData) obj);
        }
        return null;
    }

    /**
     * Inserts the sitemap URLs advertised by a robots.txt file into the
     * frontier. Failures for individual sitemaps are logged and skipped.
     */
    public void insert(RobotsTxtHandler.RobotsData robotsData) {
        for (String sitemap : robotsData.sitemapUrls) {
            try {
                frontierManager.insert(new LinkRelevance(sitemap, SITEMAP_RELEVANCE, LinkRelevance.Type.SITEMAP));
            } catch (MalformedURLException | FrontierPersistentException e) {
                // FIX: include the cause; it was previously dropped from the log.
                logger.error("Failed to insert sitemap from robot: {}", sitemap, e);
            }
        }
    }

    /**
     * Inserts the page links and child sitemaps discovered in a sitemap XML
     * file into the frontier. Failures for individual URLs are logged and
     * skipped so one bad URL does not abort the batch.
     */
    public void insert(SitemapXmlHandler.SitemapData sitemapData) {
        for (String link : sitemapData.links) {
            try {
                frontierManager.insert(new LinkRelevance(link, 1.0d, LinkRelevance.Type.FORWARD));
            } catch (MalformedURLException | FrontierPersistentException e) {
                logger.error("Failed to insert link into the frontier: {}", link, e);
            }
        }
        logger.info("Added {} URLs from sitemap.", sitemapData.links.size());

        for (String sitemap : sitemapData.sitemaps) {
            try {
                frontierManager.insert(new LinkRelevance(new URL(sitemap), SITEMAP_RELEVANCE, LinkRelevance.Type.SITEMAP));
            } catch (MalformedURLException | FrontierPersistentException e) {
                logger.error("Failed to insert sitemap into the frontier: {}", sitemap, e);
            }
        }
        logger.info("Added {} child sitemaps.", sitemapData.sitemaps.size());
    }

    /**
     * Inserts the links of a crawled page into the frontier and, every
     * {@code learnLimit} pages, triggers one online-learning pass.
     *
     * @param page the crawled page whose links should be processed
     * @return always {@code null}
     * @throws StorageException if any step of the insertion fails
     */
    public Object insert(Page page) throws StorageException {
        int pageCount = this.numberOfPages.incrementAndGet();
        try {
            if (getBacklinks && page.isAuth()) {
                logger.info(">>>>>GETTING BACKLINKS: {}", page.getURL());
                graphManager.insertBacklinks(page);
                numberOfBacklink.incrementAndGet();
                logger.info("TOTAL BACKLINKS: {}", numberOfBacklink.get());
            }
            // Guard against learnLimit == 0, which would previously have thrown
            // an ArithmeticException on the modulo below.
            if (onlineLearning != null && learnLimit > 0 && pageCount % learnLimit == 0) {
                if (onlineLearningIsRunning.compareAndSet(false, true)) {
                    try {
                        logger.info("RUNNING ONLINE LEARNING...");
                        onlineLearning.execute();
                        frontierManager.clearFrontier();
                    } finally {
                        // BUG FIX: the flag was previously not reset when
                        // execute()/clearFrontier() threw, which permanently
                        // disabled online learning for the rest of the crawl.
                        onlineLearningIsRunning.set(false);
                    }
                }
            }
            if (getBacklinks) {
                // In bipartite (backlink) mode, only hub pages contribute outlinks.
                if (page.isHub()) {
                    graphManager.insertOutlinks(page);
                }
            } else {
                if (getOutlinks) {
                    graphManager.insertOutlinks(page);
                }
            }
        } catch (Exception ex) {
            // FIX: this is an error that is rethrown; it was logged at INFO level.
            logger.error("Failed to insert page into LinkStorage.", ex);
            throw new StorageException(ex.getMessage(), ex);
        }
        return null;
    }

    /**
     * Selects the next URL to be crawled from the frontier.
     *
     * @throws DataNotFoundException if no URL is currently available
     * @throws StorageException if the frontier cannot be read
     */
    public synchronized Object select(Object obj) throws StorageException, DataNotFoundException {
        try {
            return frontierManager.nextURL();
        } catch (FrontierPersistentException e) {
            throw new StorageException(e.getMessage(), e);
        }
    }

    /**
     * Creates a LinkStorage instance and binds it as a storage server.
     * Startup failures are logged; they are not propagated to the caller.
     */
    public static void runServer(String configPath, String seedFilePath,
                                 String dataOutputPath, String modelPath,
                                 LinkStorageConfig config)
                                 throws FrontierPersistentException {
        try {
            MetricsManager metricsManager = new MetricsManager();

            Storage linkStorage = createLinkStorage(configPath, seedFilePath,
                                                    dataOutputPath, modelPath,
                                                    config, metricsManager);

            StorageBinder binder = new StorageBinder(config.getStorageServerConfig());
            binder.bind(linkStorage);
        } catch (Exception e) {
            logger.error("Problem while starting LinkStorage.", e);
        }
    }

    /**
     * Builds a fully wired LinkStorage: stoplist, link classifier, frontier,
     * bipartite graph manager, and (optionally) online learning.
     *
     * @throws IOException if the stoplist file cannot be read
     * @throws FrontierPersistentException if the frontier cannot be opened
     */
    public static Storage createLinkStorage(String configPath, String seedFile,
                                            String dataPath, String modelPath,
                                            LinkStorageConfig config,
                                            MetricsManager metricsManager)
                                            throws FrontierPersistentException,
                                                   IOException {
        // Use a crawl-specific stoplist when one is present, else the default.
        Path stoplistPath = Paths.get(configPath, "/stoplist.txt");
        StopList stoplist;
        if (Files.exists(stoplistPath)) {
            stoplist = new StopListFile(stoplistPath.toFile().getCanonicalPath());
        } else {
            stoplist = StopListFile.DEFAULT;
        }

        LinkClassifierFactory.setDefaultStoplist(stoplist);
        LinkClassifier linkClassifier = LinkClassifierFactory.create(modelPath, config.getTypeOfClassifier());

        FrontierManager frontierManager = FrontierManagerFactory.create(config, configPath, dataPath, seedFile, metricsManager);

        BipartiteGraphRepository graphRep = new BipartiteGraphRepository(dataPath);

        BipartiteGraphManager manager = createBipartiteGraphManager(config, linkClassifier, frontierManager, graphRep);

        LinkStorage linkStorage;
        if (config.isUseOnlineLearning()) {
            LinkNeighborhoodWrapper wrapper = new LinkNeighborhoodWrapper(stoplist);
            LinkClassifierBuilder cb = new LinkClassifierBuilder(dataPath, graphRep, stoplist, wrapper, frontierManager.getFrontier());
            OnlineLearning onlineLearning = new OnlineLearning(frontierManager.getFrontier(), manager, cb, config.getOnlineMethod(), dataPath);
            logger.info("ONLINE LEARNING: {}", config.getOnlineMethod());
            linkStorage = new LinkStorage(config, manager, frontierManager, onlineLearning);
        } else {
            linkStorage = new LinkStorage(config, manager, frontierManager);
        }
        return linkStorage;
    }

    /**
     * Creates the graph manager; a backlink surfer and hub classifier are only
     * wired in when backlink crawling is enabled in the configuration.
     */
    private static BipartiteGraphManager createBipartiteGraphManager(LinkStorageConfig config,
            LinkClassifier linkClassifier, FrontierManager frontierManager,
            BipartiteGraphRepository graphRepository) {
        if (config.getBacklinks()) {
            return new BipartiteGraphManager(frontierManager, graphRepository, linkClassifier,
                    config.getMaxPagesPerDomain(), new BacklinkSurfer(config.getBackSurferConfig()),
                    new LinkClassifierHub());
        } else {
            return new BipartiteGraphManager(frontierManager, graphRepository, linkClassifier,
                    config.getMaxPagesPerDomain(), null, null);
        }
    }
}