package uk.bl.monitrix.model; import java.util.List; import java.util.Map; /** * The Known Host domain object interface. Encapsulates the information collected about a specific host * during the crawl. * @author Rainer Simon <rainer.simon@ait.ac.at> */ public abstract class KnownHost { /** * Host name. * @return the host name */ public abstract String getHostname(); /** * The top level domain (e.g. "com" or "uk") * @return the top level domain */ public abstract String getTopLevelDomain(); /** * The domain name (a.k.a. private suffix), excluding any subdomains (e.g. bbc.co.uk). * * @return the domain name */ public abstract String getDomain(); /** * The subdomain part of this host. * @return the subdomains */ public abstract String getSubdomain(); /** * UNIX timestamp of the first recorded access to this host. * @return */ public abstract long getFirstAccess(); /** * UNIX timestamp of the last recorded access to this host. * @return the last access to the host */ public abstract long getLastAccess(); /** * The list of crawlers that have been crawling this host. * @return the list of crawler IDs */ public abstract List<String> getCrawlerIDs(); /** * The number of URLs crawled at this host. * @return */ public abstract long getCrawledURLs(); /** * The number of URLs that were successfully fetched (and not only attempted) * @return the no. of successfully fetched URLs */ public abstract long getSuccessfullyFetchedURLs(); /** * The average fetch duration observed at this host (in milliseconds). * @return the average fetch duration */ public abstract double getAverageFetchDuration(); /** * The average number of retries over all (eventually) successful fetches. * @return the average retry rate */ public abstract double getAverageRetryRate(); /** * The distribution of Heritrix fetch status codes for the URLs crawled at * this host. The return value is a map that has the encountered fetch status * codes (200, 404, -1, ...) as keys, and the number of URLs that have ended * with that status as values. * @return the fetch status distribution */ public abstract Map<String, Integer> getFetchStatusDistribution(); /** * The distribution of MIME content types for the URLs crawled at this * host. The return value is a map that has the mime type names as * keys, and the number of URLs that have returned that MIME type * as values. * @return the content type distribution */ public abstract Map<String, Integer> getContentTypeDistribution(); /** * Returns the virus stats that have been recorded for this host. The * return value is a map that has the name of the viruses as keys, * and the number of URLs infected with that virus as values. * @return the virus stats */ public abstract Map<String, Integer> getVirusStats(); /** * Returns the percentage of HTTP request that were precluded by Robots.txt rules. * @return the percentage of blocks caused by robots.txt */ public abstract double getRobotsBlockPercentage(); /** * Returns the percentage of HTTP requests that received an HTTP 3xx response. * @return the percentage of 3xx responses */ public abstract double getRedirectPercentage(); /** * Returns the ratio of text vs. non-text MIME types. * @return the text vs. non-text content type ratio */ public abstract double getTextToNoneTextRatio(); /** * Helper method to split a host name into tokens. Host names * will be split at the following characters: '.', '-', '_' * * Note: keeping this in a separate method, although it's a * one-liner. Possibly we want to do more elaborate things in the future. * * @param hostname the host name * @return the tokens */ public static String[] tokenizeName(String hostname) { return hostname.split("-|_|\\."); } @Override public String toString() { return getHostname() + " (last access: " + getLastAccess() + ", first access: " + getFirstAccess() + ", average delay: " + getAverageFetchDuration() + ")"; } }