package org.archive.crawler.reporting;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.restlet.XmlMarshaller;
import org.archive.modules.writer.WARCWriterProcessor;
import org.archive.util.ArchiveUtils;

/**
 * Report that gathers summary figures about a crawl from a
 * {@link StatisticsTracker} and marshals them as a single XML document.
 * The report is named <code>crawl-report.xml</code> (see
 * {@link #getFilename()}).
 */
public class XmlCrawlSummaryReport extends Report {

    private static final Logger logger =
        Logger.getLogger(XmlCrawlSummaryReport.class.getName());

    /** Externally supplied value emitted verbatim as "scheduledDate". */
    private String scheduledDate;

    /**
     * Sets the value written under the "scheduledDate" key.
     *
     * @param scheduledDate value to report; stored as given, not parsed
     */
    public void setScheduledDate(String scheduledDate) {
        this.scheduledDate = scheduledDate;
    }

    /**
     * @return the value previously passed to
     *         {@link #setScheduledDate(String)}, or null if never set
     */
    public String getScheduledDate() {
        return this.scheduledDate;
    }

    /**
     * Collects the summary values into an insertion-ordered map and hands
     * that map to {@link XmlMarshaller#marshalDocument} for output.
     *
     * <p>An {@link IOException} raised while marshalling is logged at
     * SEVERE level and not rethrown, so this method stays best-effort.
     *
     * @param writer destination for the XML document
     * @param stats  tracker the figures are read from; its
     *               <code>tallySeeds()</code> is invoked as part of this call
     */
    @Override
    public void write(PrintWriter writer, StatisticsTracker stats) {
        // LinkedHashMap: entries are handed to the marshaller in the order
        // they are inserted below.
        Map<String,Object> info = new LinkedHashMap<String,Object>();

        CrawlStatSnapshot snapshot = stats.getLastSnapshot();

        // Job identification and overall outcome.
        info.put("crawlName",
                ((WARCWriterProcessor) stats.appCtx.getBean("warcWriter"))
                        .getPrefix());
        info.put("crawlJobShortName",
                stats.getCrawlController().getMetadata().getJobName());
        info.put("scheduledDate", this.scheduledDate);
        info.put("crawlStatus",
                stats.getCrawlController().getCrawlExitStatus().desc);
        info.put("duration",
                ArchiveUtils.formatMillisecondsToConventional(
                        stats.getCrawlElapsedTime()));

        // tallySeeds() is called before the seed counters are read.
        stats.tallySeeds();
        info.put("seedsCrawled", stats.seedsCrawled);
        info.put("seedsUncrawled", stats.seedsTotal - stats.seedsCrawled);

        // NOTE(review): the "- 1" presumably discounts a non-web pseudo-host
        // entry in the server cache -- confirm against the cache contents.
        info.put("hostsVisited", stats.serverCache.hostKeys().size() - 1);

        // URI counts taken from the last snapshot.
        info.put("urisProcessed", snapshot.finishedUriCount);
        info.put("uriSuccesses", snapshot.downloadedUriCount);
        info.put("uriFailures", snapshot.downloadFailures);
        info.put("uriDisregards", snapshot.downloadDisregards);

        // URI counts by category. "novelCount" is passed through as-is
        // (possibly null); the other categories fall back to 0 when absent.
        info.put("novelUris", stats.crawledBytes.get("novelCount"));

        long duplicateCount = stats.crawledBytes.containsKey("dupByHashCount")
                ? stats.crawledBytes.get("dupByHashCount").longValue()
                : 0L;
        info.put("duplicateByHashUris", duplicateCount);

        long notModifiedCount =
                stats.crawledBytes.containsKey("notModifiedCount")
                        ? stats.crawledBytes.get("notModifiedCount").longValue()
                        : 0L;
        info.put("notModifiedUris", notModifiedCount);

        // Byte totals by category, same fallback convention as above.
        info.put("totalCrawledBytes", snapshot.bytesProcessed);
        info.put("novelCrawledBytes", stats.crawledBytes.get("novel"));

        long duplicateByHashCrawledBytes =
                stats.crawledBytes.containsKey("dupByHash")
                        ? stats.crawledBytes.get("dupByHash").longValue()
                        : 0L;
        info.put("duplicateByHashCrawledBytes", duplicateByHashCrawledBytes);

        long notModifiedCrawledBytes =
                stats.crawledBytes.containsKey("notModified")
                        ? stats.crawledBytes.get("notModified").longValue()
                        : 0L;
        info.put("notModifiedCrawledBytes", notModifiedCrawledBytes);

        // Throughput figures.
        info.put("urisPerSec",
                ArchiveUtils.doubleToString(snapshot.docsPerSecond, 2));
        info.put("kbPerSec", snapshot.totalKiBPerSec);

        try {
            // The class's canonical name is passed as the document name.
            XmlMarshaller.marshalDocument(writer,
                    XmlCrawlSummaryReport.class.getCanonicalName(), info);
        } catch (IOException e) {
            // write() cannot throw checked exceptions; record the failure
            // in the application log instead of dumping it to stderr.
            logger.log(Level.SEVERE,
                    "failed writing XML crawl summary " + getFilename(), e);
        }
    }

    /**
     * @return the fixed report filename, <code>crawl-report.xml</code>
     */
    @Override
    public String getFilename() {
        return "crawl-report.xml";
    }
}