package org.archive.crawler.reporting;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.restlet.XmlMarshaller;
import org.archive.modules.writer.WARCWriterProcessor;
import org.archive.util.ArchiveUtils;
/**
 * Report that marshals a one-document XML summary of crawl statistics —
 * seed outcomes, URI tallies, byte tallies, and throughput — using
 * {@link XmlMarshaller}, with the tally keys emitted in insertion order.
 */
public class XmlCrawlSummaryReport extends Report {

    private static final Logger logger =
            Logger.getLogger(XmlCrawlSummaryReport.class.getName());

    /** Date the crawl was scheduled; echoed verbatim as {@code scheduledDate}. */
    private String scheduledDate;

    public void setScheduledDate(String scheduledDate) {
        this.scheduledDate = scheduledDate;
    }

    public String getScheduledDate() {
        return this.scheduledDate;
    }

    /**
     * Null-safe tally lookup: the value mapped to {@code key} as a
     * {@code long}, or {@code 0L} when the key is absent. Replaces the
     * repeated {@code containsKey(...) ? get(...).longValue() : 0L}
     * idiom, which performed two map lookups per key.
     *
     * @param tallies tally map to read (never modified)
     * @param key tally name
     * @return mapped value as a primitive long, or 0 if absent
     */
    private static long tallyOrZero(Map<String, ? extends Number> tallies, String key) {
        Number value = tallies.get(key);
        return value == null ? 0L : value.longValue();
    }

    /**
     * Writes the crawl-summary XML document to {@code writer}.
     *
     * @param writer destination for the marshalled XML
     * @param stats statistics source; its last snapshot supplies the
     *     URI/byte counters and rates
     */
    @Override
    public void write(PrintWriter writer, StatisticsTracker stats) {
        Map<String, Object> info = new LinkedHashMap<String, Object>();
        CrawlStatSnapshot snapshot = stats.getLastSnapshot();

        // NOTE(review): assumes a WARCWriterProcessor bean named "warcWriter"
        // always exists in the app context; a missing or differently-typed
        // bean fails with NPE/ClassCastException — confirm against crawl config.
        info.put("crawlName",
                ((WARCWriterProcessor) stats.appCtx.getBean("warcWriter")).getPrefix());
        info.put("crawlJobShortName",
                stats.getCrawlController().getMetadata().getJobName());
        info.put("scheduledDate", this.scheduledDate);
        info.put("crawlStatus",
                stats.getCrawlController().getCrawlExitStatus().desc);
        info.put("duration",
                ArchiveUtils.formatMillisecondsToConventional(stats.getCrawlElapsedTime()));

        // Refresh seed tallies before reading seedsCrawled/seedsTotal.
        stats.tallySeeds();
        info.put("seedsCrawled", stats.seedsCrawled);
        info.put("seedsUncrawled", stats.seedsTotal - stats.seedsCrawled);

        // presumably the -1 excludes a synthetic (e.g. "dns:") host entry
        // from the visited-host count — TODO confirm
        info.put("hostsVisited", stats.serverCache.hostKeys().size() - 1);

        info.put("urisProcessed", snapshot.finishedUriCount);
        info.put("uriSuccesses", snapshot.downloadedUriCount);
        info.put("uriFailures", snapshot.downloadFailures);
        info.put("uriDisregards", snapshot.downloadDisregards);

        // URI tallies; an absent key is now consistently reported as 0
        // (the original emitted null for a missing "novelCount").
        info.put("novelUris", tallyOrZero(stats.crawledBytes, "novelCount"));
        info.put("duplicateByHashUris", tallyOrZero(stats.crawledBytes, "dupByHashCount"));
        info.put("notModifiedUris", tallyOrZero(stats.crawledBytes, "notModifiedCount"));

        // Byte tallies, same null-safe treatment.
        info.put("totalCrawledBytes", snapshot.bytesProcessed);
        info.put("novelCrawledBytes", tallyOrZero(stats.crawledBytes, "novel"));
        info.put("duplicateByHashCrawledBytes", tallyOrZero(stats.crawledBytes, "dupByHash"));
        info.put("notModifiedCrawledBytes", tallyOrZero(stats.crawledBytes, "notModified"));

        info.put("urisPerSec",
                ArchiveUtils.doubleToString(snapshot.docsPerSecond, 2));
        info.put("kbPerSec", snapshot.totalKiBPerSec);

        try {
            XmlMarshaller.marshalDocument(writer,
                    XmlCrawlSummaryReport.class.getCanonicalName(), info);
        } catch (IOException e) {
            // Route the failure through the crawler's logging configuration
            // instead of printStackTrace() on stderr; cause is preserved.
            logger.log(Level.SEVERE, "failed to marshal crawl summary XML", e);
        }
    }

    /** @return fixed filename under which the reporting framework saves this report */
    @Override
    public String getFilename() {
        return "crawl-report.xml";
    }
}