package focusedCrawler.crawler.async;
import java.io.IOException;
import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.util.CommunicationException;
import focusedCrawler.util.storage.Storage;
import focusedCrawler.util.storage.StorageException;
/**
 * Download callback for sitemap documents.
 *
 * <p>When a download completes with a 2xx status code, the response body is
 * parsed as a sitemap. The URLs found in it are collected into a
 * {@link SitemapData} object, which is then inserted into the link storage
 * supplied at construction time. Failed or non-2xx downloads are only logged.
 */
public class SitemapXmlHandler implements HttpDownloader.Callback {

    /**
     * Serializable container for the URLs extracted from one sitemap document.
     * Exactly one of the two lists is filled per document, depending on
     * whether the document was a sitemap index or a regular sitemap.
     */
    @SuppressWarnings("serial")
    public static class SitemapData implements Serializable {
        /** URLs of the sitemaps listed by a sitemap index. */
        public List<String> sitemaps = new ArrayList<>();
        /** URLs of the pages listed by a regular (non-index) sitemap. */
        public List<String> links = new ArrayList<>();
    }

    private static final Logger logger = LoggerFactory.getLogger(SitemapXmlHandler.class);

    private final Storage linkStorage;
    // NOTE(review): the boolean argument is presumably the parser's "strict" flag;
    // confirm against the crawler-commons SiteMapParser documentation.
    private final SiteMapParser parser = new SiteMapParser(false);

    /**
     * Creates a handler that sends extracted sitemap data to the given storage.
     *
     * @param linkStorage storage that receives a {@link SitemapData} object for
     *                    every sitemap that is parsed successfully
     */
    public SitemapXmlHandler(Storage linkStorage) {
        this.linkStorage = linkStorage;
    }

    /**
     * Handles a finished download. Responses with a 2xx status code are parsed
     * and stored; any other status code is logged and the response is discarded.
     *
     * @param link     the link that was requested
     * @param response the result of the fetch
     */
    @Override
    public void completed(LinkRelevance link, FetchedResult response) {
        int statusCode = response.getStatusCode();
        boolean success = statusCode >= 200 && statusCode < 300;
        if (!success) {
            logger.info("Server returned bad code for URL=[{}] HTTP-Response-Code={}",
                    response.getBaseUrl(), statusCode);
            return;
        }
        logger.info("Successfully downloaded URL=[{}] HTTP-Response-Code={}",
                response.getBaseUrl(), statusCode);
        processData(link, response);
    }

    /**
     * Handles a failed download by logging the URL and the reason of the failure.
     *
     * @param link the link that was requested
     * @param e    the exception that caused the download to fail
     */
    @Override
    public void failed(LinkRelevance link, Exception e) {
        if (e instanceof AbortedFetchException) {
            AbortedFetchException afe = (AbortedFetchException) e;
            logger.info("Download aborted: \n>URL: {}\n>Reason: {}",
                    link.getURL().toString(), afe.getAbortReason());
        } else {
            // The reason needs its own placeholder: an argument without a
            // matching "{}" is silently dropped by SLF4J.
            logger.info("Failed to download URL: {} Reason: {}",
                    link.getURL().toString(), e.getMessage());
        }
    }

    /**
     * Parses the response body as a sitemap, collects the URLs it contains and
     * inserts them into the link storage. Parse failures and storage failures
     * are logged and not propagated to the caller.
     *
     * @param link     the link that was requested (used for log messages)
     * @param response the successfully fetched response holding the sitemap content
     */
    private void processData(LinkRelevance link, FetchedResult response) {
        AbstractSiteMap sm;
        try {
            // new URL(...) may throw MalformedURLException, which is an IOException
            // and is therefore handled by the same catch clause as parser errors.
            sm = parser.parseSiteMap(response.getContent(), new URL(response.getFetchedUrl()));
        } catch (UnknownFormatException | IOException e) {
            logger.warn("Failed to parse sitemap: " + link.getURL().toString(), e);
            return;
        }

        SitemapData sitemapData = new SitemapData();
        if (sm.isIndex()) {
            // A sitemap index only points to other sitemaps.
            Collection<AbstractSiteMap> children = ((SiteMapIndex) sm).getSitemaps();
            for (AbstractSiteMap child : children) {
                sitemapData.sitemaps.add(child.getUrl().toString());
            }
        } else {
            Collection<SiteMapURL> entries = ((SiteMap) sm).getSiteMapUrls();
            for (SiteMapURL entry : entries) {
                sitemapData.links.add(entry.getUrl().toString());
            }
        }

        try {
            linkStorage.insert(sitemapData);
        } catch (StorageException | CommunicationException e) {
            logger.error("Failed to insert sitemaps data into link storage.", e);
        }
    }
}