/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.util;
import java.util.Map;
import org.archive.io.warc.WARCWriter;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
import org.archive.modules.revisit.ServerNotModifiedRevisit;
import org.archive.util.ArchiveUtils;
import org.archive.util.Histotable;
/**
 * A {@link Histotable} keyed by category name that accumulates crawled byte
 * and URL counts, split by whether each fetched URI was novel or some kind
 * of revisit, plus totals taken from per-URI WARC writing statistics.
 */
public class CrawledBytesHistotable extends Histotable<String>
        implements CoreAttributeConstants {
    private static final long serialVersionUID = 7923431123239026213L;

    // Keys for content-size totals, one per revisit category.
    public static final String NOTMODIFIED = "notModified";
    public static final String DUPLICATE = "dupByHash";
    public static final String OTHERDUPLICATE = "otherDup";
    public static final String NOVEL = "novel";

    // Keys for URI counts, parallel to the size keys above.
    public static final String NOTMODIFIEDCOUNT = "notModifiedCount";
    public static final String DUPLICATECOUNT = "dupByHashCount";
    public static final String OTHERDUPLICATECOUNT = "otherDupCount";
    public static final String NOVELCOUNT = "novelCount";

    // Total size of WARC response and resource record payloads (includes
    // HTTP headers, does not include WARC record headers).
    public static final String WARC_NOVEL_CONTENT_BYTES = "warcNovelContentBytes";
    // Number of WARC response and resource records written.
    public static final String WARC_NOVEL_URLS = "warcNovelUrls";

    /** Creates an empty table. */
    public CrawledBytesHistotable() {
        super();
    }

    /**
     * Tallies one URI into this table.
     *
     * <p>The URI's content size and a count of one are tallied under the
     * category chosen from its revisit profile: server-not-modified,
     * identical-payload-digest, any other non-null profile, or (when the
     * profile is null) novel. If the URI's data map carries an
     * {@code A_WARC_STATS} entry, the "response" and "resource" record
     * stats are also added to the WARC totals.
     *
     * @param curi the URI whose size and WARC stats should be tallied
     */
    public void accumulate(CrawlURI curi) {
        if (curi.getRevisitProfile() instanceof ServerNotModifiedRevisit) {
            tally(NOTMODIFIED, curi.getContentSize());
            tally(NOTMODIFIEDCOUNT, 1);
        } else if (curi.getRevisitProfile() instanceof IdenticalPayloadDigestRevisit) {
            tally(DUPLICATE, curi.getContentSize());
            tally(DUPLICATECOUNT, 1);
        } else if (curi.getRevisitProfile() != null) {
            tally(OTHERDUPLICATE, curi.getContentSize());
            tally(OTHERDUPLICATECOUNT, 1);
        } else {
            tally(NOVEL, curi.getContentSize());
            tally(NOVELCOUNT, 1);
        }

        // The data map is untyped, so this cast cannot be checked; the
        // suppression is scoped to this one declaration.
        @SuppressWarnings("unchecked")
        Map<String, Map<String, Long>> warcStats =
                (Map<String, Map<String, Long>>) curi.getData().get(A_WARC_STATS);
        if (warcStats != null) {
            tally(WARC_NOVEL_CONTENT_BYTES,
                    WARCWriter.getStat(warcStats, "response", "contentBytes")
                            + WARCWriter.getStat(warcStats, "resource", "contentBytes"));
            tally(WARC_NOVEL_URLS,
                    WARCWriter.getStat(warcStats, "response", "numRecords")
                            + WARCWriter.getStat(warcStats, "resource", "numRecords"));
        }
    }

    /**
     * Builds a one-line, human-readable summary of crawled bytes: the grand
     * total followed, in parentheses, by the novel amount and each revisit
     * category, every category labelled with its byte-total key.
     *
     * @return the formatted summary string
     */
    public String summary() {
        StringBuilder sb = new StringBuilder();
        sb.append(ArchiveUtils.formatBytesForDisplay(getTotalBytes()));
        sb.append(" crawled (");
        sb.append(ArchiveUtils.formatBytesForDisplay(get(NOVEL)));
        sb.append(" novel");
        // NOTE(review): whether these null checks can ever fail depends on
        // how Histotable.get treats absent keys -- confirm against Histotable.
        if (get(DUPLICATE) != null) {
            sb.append(", ");
            sb.append(ArchiveUtils.formatBytesForDisplay(get(DUPLICATE)));
            sb.append(" ");
            sb.append(DUPLICATE);
        }
        if (get(NOTMODIFIED) != null) {
            sb.append(", ");
            sb.append(ArchiveUtils.formatBytesForDisplay(get(NOTMODIFIED)));
            sb.append(" ");
            sb.append(NOTMODIFIED);
        }
        if (get(OTHERDUPLICATE) != null) {
            sb.append(", ");
            sb.append(ArchiveUtils.formatBytesForDisplay(get(OTHERDUPLICATE)));
            sb.append(" ");
            // Label a byte total with the byte key, matching the branches
            // above (the count key was used here previously by mistake).
            sb.append(OTHERDUPLICATE);
        }
        sb.append(")");
        return sb.toString();
    }

    /**
     * Returns the sum of the byte totals across all four categories.
     *
     * <p>Relies on {@code get} returning a non-null value for every category
     * key, since each result is unboxed -- presumably Histotable.get yields
     * zero for absent keys; verify.
     *
     * @return novel + duplicate + not-modified + other-duplicate bytes
     */
    public long getTotalBytes() {
        return get(NOVEL) + get(DUPLICATE) + get(NOTMODIFIED) + get(OTHERDUPLICATE);
    }

    /**
     * Returns the sum of the URI counts across all four categories.
     *
     * <p>Carries the same non-null assumption about {@code get} as
     * {@link #getTotalBytes()}.
     *
     * @return novel + duplicate + not-modified + other-duplicate URI counts
     */
    public long getTotalUrls() {
        return get(NOVELCOUNT) + get(DUPLICATECOUNT) + get(NOTMODIFIEDCOUNT) + get(OTHERDUPLICATECOUNT);
    }
}