package uk.bl.monitrix.model; import java.util.Date; /** * Represents a single line in a Heritrix log. * Cf. http://crawler.archive.org/articles/user_manual/analysis.html * @author Rainer Simon <rainer.simon@ait.ac.at> */ public abstract class CrawlLogEntry { public static final String ANNOTATION_CAPPED_CRAWL = "Q:serverMaxSuccessKb"; public static final String ANNOTATION_WARC_REVISIT = "warcRevisit"; public static final String ANNOTATION_WARC_REVISIT_DIGEST = "warcRevisit:digest"; public static final String ANNOTATION_WARC_REVISIT_NOT_MODIFIED = "warcRevisit:notModified"; /** * The ID of the log this entry is from. * @return the log file path */ public abstract String getLogId(); /** * Column #1 - ISO timestamp. * @return the crawl time */ public abstract Date getLogTimestamp(); /** * Column #2 - the HTTP status or Heritrix error code ('fetch status code'). * @return the HTTP status or error code */ public abstract int getHTTPCode(); /** * Column #3 - the download file size in bytes. * @return the filesize */ public abstract long getDownloadSize(); /** * Column #4 - the URL of the crawled document. * @return the URL */ public abstract String getURL(); /** * The hostname, extracted from the URL. * @return the hostname */ public abstract String getHost(); /** * The domain, extracted from the host name. * @return the domain */ public abstract String getDomain(); /** * The subdomain, extracted from the host name. * @return the subdomain */ public abstract String getSubdomain(); /** * Column #5 - 'breadcrumb codes' showing the trail of downloads that lead to the * current URL ('discovery path'). The discovery path of a seed is empty. * Cf. http://crawler.archive.org/articles/user_manual/glossary.html#discoverypath * * @return */ public abstract String getBreadcrumbCodes(); /** * <code>true</code> if the URL is a seed (i.e. its discovery path is an empty string). * @return */ public boolean isSeed() { return getBreadcrumbCodes().isEmpty(); } /** * Column #6 - the URL that immediately referenced this URL ('referrer'). The * referrer will be empty if the URL is a seed. * @return the referrer URL */ public abstract String getReferrer(); /** * Column #7 - content type. * @return the content type */ public abstract String getContentType(); /** * Column #8 - worker thread ID. * @return the worker thread ID */ public abstract String getWorkerThread(); /** * The fetch timestamp, derived from column #8. * @return the fetch timestamp */ public abstract Date getFetchTimestamp(); /** * The fetch timestamp, derived from column #8; * @return */ public abstract int getFetchDuration(); /** * Column #10 - file SHA1 hash. * @return the file hash */ public abstract String getSHA1Hash(); /** * Column #12 - annotations (comma-separated) * @return the annotations */ public abstract String getAnnotations(); /** * Returns the number of HTTP retries (based on info in column #12) * @return the number of retries */ public abstract int getRetries(); /** * Returns the 'compressability' of the URL * @return the compressability */ public abstract double getCompressability(); /** * Determines whether this is a revisit record. * @return */ public boolean isRevisitRecord() { for( String anno : getAnnotations().split(",") ) { if( anno.startsWith(ANNOTATION_WARC_REVISIT) ) return true; } return false; } /** * Determines is this is a capped-crawl record. * @return */ public boolean isCappedCrawlRecord() { for( String anno : getAnnotations().split(",") ) { if( anno.startsWith(ANNOTATION_CAPPED_CRAWL)) return true; } return false; } }