/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.fetcher; import java.io.PrintWriter; import java.io.Serializable; import java.util.LinkedHashMap; import java.util.Map; import org.archive.crawler.util.CrawledBytesHistotable; import org.archive.modules.CrawlURI; import org.archive.util.ArchiveUtils; import org.archive.util.ReportUtils; import org.archive.util.Reporter; /** * Collector of statistics for a 'subset' of a crawl, * such as a server (host:port), host, or frontier group * (eg queue). * * @author gojomo */ public class FetchStats extends CrawledBytesHistotable implements Serializable, FetchStatusCodes, Reporter { private static final long serialVersionUID = 2l; public enum Stage {SCHEDULED, RELOCATED, RETRIED, SUCCEEDED, DISREGARDED, FAILED}; public static final String TOTAL_SCHEDULED = "totalScheduled"; // anything initially scheduled // (totalScheduled - (fetchSuccesses + fetchFailures) public static final String FETCH_SUCCESSES = "fetchSuccesses"; // anything disposed-success // (HTTP 2XX response codes, other non-errors) public static final String FETCH_FAILURES = "fetchFailures"; // anything disposed-failure public static final String FETCH_DISREGARDS = "fetchDisregards";// anything disposed-disregard public static final String FETCH_RESPONSES = "fetchResponses"; // all positive responses (incl. 3XX, 4XX, 5XX) public static final String ROBOTS_DENIALS = "robotsDenials"; // all robots-precluded failures public static final String SUCCESS_BYTES = "successBytes"; // total size of all success responses public static final String TOTAL_BYTES = "totalBytes"; // total size of all responses public static final String FETCH_NONRESPONSES = "fetchNonResponses"; // processing attempts resulting in no response // (both failures and temp deferrals) public interface HasFetchStats { public FetchStats getSubstats(); } public interface CollectsFetchStats { public void tally(CrawlURI curi, Stage stage); } protected long lastSuccessTime; public synchronized void tally(CrawlURI curi, Stage stage) { switch(stage) { case SCHEDULED: tally(TOTAL_SCHEDULED, 1); break; case RETRIED: if(curi.getFetchStatus()<=0) { tally(FETCH_NONRESPONSES, 1); } break; case SUCCEEDED: tally(FETCH_SUCCESSES, 1); tally(FETCH_RESPONSES, 1); tally(TOTAL_BYTES, curi.getContentSize()); tally(SUCCESS_BYTES, curi.getContentSize()); lastSuccessTime = curi.getFetchCompletedTime(); break; case DISREGARDED: tally(FETCH_DISREGARDS, 1); if(curi.getFetchStatus()==S_ROBOTS_PRECLUDED) { tally(ROBOTS_DENIALS, 1); } break; case FAILED: if(curi.getFetchStatus()<=0) { tally(FETCH_NONRESPONSES, 1); } else { tally(FETCH_RESPONSES, 1); tally(TOTAL_BYTES, curi.getContentSize()); } tally(FETCH_FAILURES, 1); break; default: break; } if (curi.getFetchStatus() > 0) { this.accumulate(curi); } } public long getFetchSuccesses() { return get(FETCH_SUCCESSES); } public long getFetchResponses() { return get(FETCH_RESPONSES); } public long getSuccessBytes() { return get(SUCCESS_BYTES); } public long getTotalBytes() { return get(TOTAL_BYTES); } public long getFetchNonResponses() { return get(FETCH_NONRESPONSES); } public long getTotalScheduled() { return get(TOTAL_SCHEDULED); } public long getFetchDisregards() { return get(FETCH_DISREGARDS); } public long getRobotsDenials() { return get(ROBOTS_DENIALS); } public long getRemaining() { return get(TOTAL_SCHEDULED) - (get(FETCH_SUCCESSES) + get(FETCH_FAILURES)+ get(FETCH_DISREGARDS)); } public long getRecordedFinishes() { return get(FETCH_SUCCESSES) + get(FETCH_FAILURES); } public long getNovelBytes() { return get(NOVEL); } public long getNovelUrls() { return get(NOVELCOUNT); } public long getNotModifiedBytes() { return get(NOTMODIFIED); } public long getNotModifiedUrls() { return get(NOTMODIFIEDCOUNT); } public long getDupByHashBytes() { return get(DUPLICATE); } public long getDupByHashUrls() { return get(DUPLICATECOUNT); } public long getOtherDupBytes() { return get(OTHERDUPLICATE); } public long getOtherDupUrls() { return get(OTHERDUPLICATECOUNT); } /* (non-Javadoc) * @see org.archive.util.Reporter#reportTo(java.io.PrintWriter) */ @Override // Reporter public void reportTo(PrintWriter writer) { writer.println(shortReportLegend()); shortReportLineTo(writer); } @Override public String shortReportLegend() { return "totalScheduled fetchSuccesses fetchFailures fetchDisregards " + "fetchResponses robotsDenials successBytes totalBytes " + "fetchNonResponses lastSuccessTime"; } public String shortReportLine() { return ReportUtils.shortReportLine(this); } @Override public void shortReportLineTo(PrintWriter writer) { writer.print(get(TOTAL_SCHEDULED)); writer.print(" "); writer.print(get(FETCH_SUCCESSES)); writer.print(" "); writer.print(get(FETCH_FAILURES)); writer.print(" "); writer.print(get(FETCH_DISREGARDS)); writer.print(" "); writer.print(get(FETCH_RESPONSES)); writer.print(" "); writer.print(get(ROBOTS_DENIALS)); writer.print(" "); writer.print(get(SUCCESS_BYTES)); writer.print(" "); writer.print(get(TOTAL_BYTES)); writer.print(" "); writer.print(get(FETCH_NONRESPONSES)); writer.print(" "); writer.print(ArchiveUtils.getLog17Date(lastSuccessTime)); } @Override public Map<String, Object> shortReportMap() { Map<String,Object> map = new LinkedHashMap<String, Object>(this); map.put("lastSuccessTime",lastSuccessTime); return map; } public long getLastSuccessTime() { return lastSuccessTime; } }