package uk.bl.monitrix.analytics;
import java.util.Map;
import java.util.Map.Entry;
import uk.bl.monitrix.model.KnownHost;
/**
 * Static helpers that derive simple per-host crawl metrics from the distribution
 * tables exposed by a {@link KnownHost}.
 *
 * <p>All methods return plain fractions (for example {@code 0.25} for 25%); no
 * result is multiplied by 100. Every method returns {@code 0} when the
 * distribution table is missing or empty.
 */
public class HostAnalytics {

    // String constant - 'text' prefix for checking MIME types
    private static final String TEXT = "text";

    // Prefix shared by all HTTP 3xx (redirect) status codes
    private static final String REDIRECT_PREFIX = "3";

    // Heritrix fetch status code that marks a URL precluded by robots.txt rules
    private static final String ROBOTS_TXT_BLOCKED = "-9998";

    /**
     * Extracts the share of redirect status codes observed at the specified host,
     * using the fetch-status distribution table of the specified host. Redirects are
     * considered everything with an HTTP status code of 3xx.
     *
     * <p>The method name is misspelled ("Percentag"); it is kept as-is so that
     * existing callers continue to compile.
     *
     * @param host the host
     * @return the fraction of crawled URLs that received an HTTP 3xx response, or
     *         {@code 0} if the status table is empty or the host reports no crawled URLs
     */
    public static double computePercentagOfRedirects(KnownHost host) {
        Map<String, Integer> statusDistribution = host.getFetchStatusDistribution();
        if (statusDistribution == null || statusDistribution.isEmpty()) {
            return 0;
        }

        double redirects = 0;
        for (Entry<String, Integer> entry : statusDistribution.entrySet()) {
            String status = entry.getKey();
            Integer count = entry.getValue();
            // Skip incomplete entries instead of failing on a null key or count
            if (status != null && count != null && status.startsWith(REDIRECT_PREFIX)) {
                redirects += count.intValue();
            }
        }

        return fractionOfCrawledUrls(redirects, host);
    }

    /**
     * Extracts the share of HTTP requests that were precluded by robots.txt rules, using the
     * fetch-status distribution table of the specified host.
     * According to http://crawler.archive.org/articles/user_manual/analysis.html, a block caused
     * by robots.txt is indicated by a Heritrix fetch status code of -9998.
     *
     * @param host the host
     * @return the fraction of crawled URLs blocked by robots.txt, or {@code 0} if the
     *         status table is empty, contains no such entry, or the host reports no
     *         crawled URLs
     */
    public static double computePercentageOfRobotsTxtBlocks(KnownHost host) {
        Map<String, Integer> statusDistribution = host.getFetchStatusDistribution();
        if (statusDistribution == null || statusDistribution.isEmpty()) {
            return 0;
        }

        Integer blocked = statusDistribution.get(ROBOTS_TXT_BLOCKED);
        if (blocked == null) {
            return 0;
        }

        return fractionOfCrawledUrls(blocked.intValue(), host);
    }

    /**
     * Computes the share of entries in the host's fetch-status distribution table
     * whose key starts with {@code "text"}.
     *
     * <p>Each matching key is counted once, regardless of the occurrence count stored
     * as its value, and the result is divided by the number of entries in the table.
     *
     * <p>NOTE(review): the {@code "text"} prefix is a MIME-type check, yet the table read
     * here is the fetch-status distribution, whose keys are status codes elsewhere in
     * this class. This presumably should read a content-type distribution, and possibly
     * weight by occurrence count - confirm against {@code KnownHost} before relying on
     * this value. The computation is intentionally left unchanged.
     *
     * @param host the host
     * @return the fraction of table entries whose key starts with {@code "text"}, or
     *         {@code 0} if the table is empty
     */
    public static double computeTextToNonTextRatio(KnownHost host) {
        Map<String, Integer> statusDistribution = host.getFetchStatusDistribution();
        if (statusDistribution == null || statusDistribution.isEmpty()) {
            return 0;
        }

        double text = 0;
        for (String key : statusDistribution.keySet()) {
            if (key != null && key.startsWith(TEXT)) {
                text++;
            }
        }

        return text / statusDistribution.size();
    }

    /**
     * Divides a count by the number of URLs crawled at the host, returning {@code 0}
     * instead of NaN/Infinity when the host reports no crawled URLs.
     */
    private static double fractionOfCrawledUrls(double count, KnownHost host) {
        double crawledUrls = host.getCrawledURLs();
        if (crawledUrls <= 0) {
            return 0;
        }
        return count / crawledUrls;
    }
}