package focusedCrawler.crawler.async;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.BaseRobotsParser;
import crawlercommons.robots.SimpleRobotRulesParser;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.util.CommunicationException;
import focusedCrawler.util.storage.Storage;
import focusedCrawler.util.storage.StorageException;
/**
 * {@link HttpDownloader.Callback} that handles robots.txt fetch results.
 * On completion (or failure) it parses the robots rules and forwards any
 * sitemap URLs found to the link storage as a {@link RobotsData} object.
 */
public class RobotsTxtHandler implements HttpDownloader.Callback {

    @SuppressWarnings("serial")
    public static class RobotsData implements Serializable {

        /** Sitemap URLs extracted from the robots.txt file; never null. */
        public List<String> sitemapUrls = new ArrayList<>();
        // NOTE(review): 'content' is never assigned anywhere in this class;
        // kept for serialization/interface compatibility with consumers.
        public String content;

        public RobotsData(List<String> sitemapsUrls) {
            this.sitemapUrls = sitemapsUrls;
        }
    }

    private static final Logger logger = LoggerFactory.getLogger(RobotsTxtHandler.class);

    private final BaseRobotsParser parser = new SimpleRobotRulesParser();
    private final Storage linkStorage;
    private final String userAgentName;

    /**
     * @param linkStorage   storage that receives the extracted {@link RobotsData}
     * @param userAgentName user-agent token used to match robots.txt rule groups
     */
    public RobotsTxtHandler(Storage linkStorage, String userAgentName) {
        this.linkStorage = linkStorage;
        this.userAgentName = userAgentName;
    }

    @Override
    public void completed(LinkRelevance link, FetchedResult response) {
        int statusCode = response.getStatusCode();
        if (statusCode >= 200 && statusCode < 300) {
            // HTTP 2xx: got a robots.txt body to parse
            logger.info("Successfully downloaded URL=[{}] HTTP-Response-Code={}",
                        response.getBaseUrl(), statusCode);
            processRobot(link, response, false);
        } else {
            // Non-2xx: treat as a failed fetch so the parser applies its
            // default (allow-all / deny-all) policy for failures
            logger.info("Server returned bad code for URL=[{}] HTTP-Response-Code={}",
                        response.getBaseUrl(), statusCode);
            processRobot(link, response, true);
        }
    }

    @Override
    public void failed(LinkRelevance link, Exception e) {
        if (e instanceof AbortedFetchException) {
            AbortedFetchException afe = (AbortedFetchException) e;
            logger.info("Download aborted: \n>URL: {}\n>Reason: {}",
                        link.getURL().toString(), afe.getAbortReason());
        } else {
            // Pass the exception as the last argument so SLF4J records the stack
            // trace. (Previously e.getMessage() was passed without a placeholder
            // and was silently dropped.)
            logger.info("Failed to download URL: {}", link.getURL().toString(), e);
        }
        processRobot(link, null, true);
    }

    /**
     * Parses the robots.txt response (or synthesizes failure rules) and stores
     * the resulting sitemap URLs into the link storage.
     *
     * @param link        the link whose robots.txt was fetched
     * @param response    the fetch result; may be null when the fetch failed
     * @param fetchFailed true when the download failed or returned a bad status
     */
    private void processRobot(LinkRelevance link, FetchedResult response, boolean fetchFailed) {
        BaseRobotRules robotRules;
        if (fetchFailed || response == null) {
            robotRules = parser.failedFetch(HttpStatus.SC_GONE);
        } else {
            String contentType = response.getContentType();
            boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
            if ((response.getNumRedirects() > 0) && !isPlainText) {
                // Redirected to a non-plain-text resource: not a real robots.txt
                robotRules = parser.failedFetch(HttpStatus.SC_GONE);
            } else {
                robotRules = parser.parseContent(
                    response.getFetchedUrl(),
                    response.getContent(),
                    response.getContentType(),
                    userAgentName
                );
            }
        }

        try {
            RobotsData robotsData = new RobotsData(robotRules.getSitemaps());
            linkStorage.insert(robotsData);
        } catch (StorageException | CommunicationException e) {
            logger.error("Failed to insert robot.txt data into link storage.", e);
        }
    }
}