package uk.bl.monitrix.heritrix;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.zip.Deflater;
import play.Logger;
import com.google.common.net.InternetDomainName;
import uk.bl.monitrix.model.Alert;
import uk.bl.monitrix.model.Alert.AlertType;
import uk.bl.monitrix.model.CrawlLogEntry;
import uk.bl.monitrix.model.VirusRecord;
/**
* An in-memory implementation of {@link CrawlLogEntry}, for use with {@link SimpleLogfileReader}.
* @author Rainer Simon <rainer.simon@ait.ac.at>
*/
public class LogFileEntry extends CrawlLogEntry {
// TODO make configurable via config file
private static final int TOO_MANY_PATH_SEGMENTS_THRESHOLD = 16;
private static final String MSG_MALFORMED_URL = "Malformed URL: ";
private static final String MSG_TOO_MANY_PATH_SEGMENTS = "Too many path segments in URL: ";
private static DateFormat ISO_FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
private static DateFormat RFC2550_FORMAT = new SimpleDateFormat("yyyyMMddHHmmssSSS");
private String logPath;
private String line;
private List<String> fields = new ArrayList<String>();
private String bufferedHost = null;
private String bufferedDomain = null;
private String bufferedSubdomain = null;
private Double bufferedCompressability = null;
private List<Alert> alerts = new ArrayList<Alert>();
private boolean parseFailed;
public LogFileEntry(String logPath, String line) {
init(logPath, line);
}
/**
* Package-private constructor and init method that can be used to re-use an instance of this object
* thus reducing GC activity.
*/
public LogFileEntry() {
}
void init(String logPath, String line) {
this.logPath = logPath;
this.line = line;
String[] split = line.split(" ");
if( split.length < 11 ) {
this.parseFailed = true;
Logger.error("Got a split of length: "+split.length);
} else {
this.parseFailed = false;
}
// Column 1 - 11
int ctr = 0;
fields.clear();
while (fields.size() < 11 && ctr < split.length) {
if (!split[ctr].isEmpty())
fields.add(split[ctr].trim());
ctr++;
}
// Column 12 (annotations) - note that annotations may contain white spaces, so we need to re-join
StringBuilder sb = new StringBuilder();
for (int i=ctr; i<split.length; i++) {
sb.append(split[i] + " ");
}
fields.add(sb.toString().trim());
alerts.clear();
for (Alert alert : validate())
alerts.add(alert);
// Reset buffers
bufferedHost = null;
bufferedSubdomain = null;
bufferedCompressability = null;
}
public boolean getParseFailed() {
return this.parseFailed;
}
private List<Alert> validate() {
List<Alert> alerts = new ArrayList<Alert>();
double compressability = getCompressability();
// TODO find the right threshold + make configurable
if (compressability < 0.1)
alerts.add(new DefaultAlert(this.getLogTimestamp().getTime(), this.getHost(), AlertType.COMPRESSABILITY, getURL()));
// If the line indicates the crawl has hit a cap, then raise and alert:
if( this.isCappedCrawlRecord() )
alerts.add(new DefaultAlert(this.getLogTimestamp().getTime(), this.getHost(), AlertType.HOST_CAPPED, getURL()));
String[] pathSegments = getURL().split("/");
if ((pathSegments.length - 1) > TOO_MANY_PATH_SEGMENTS_THRESHOLD)
alerts.add(new DefaultAlert(this.getLogTimestamp().getTime(), this.getHost(), AlertType.TOO_MANY_PATH_SEGMENTS, MSG_TOO_MANY_PATH_SEGMENTS + this.getURL()));
return alerts;
}
private void parseHost() {
HostParseResult result = LogFileEntry.extractDomainNames(this);
this.bufferedHost = result.host;
this.bufferedDomain = result.domain;
this.bufferedSubdomain = result.subdomain;
if (result.alert != null)
alerts.add(result.alert);
}
public List<Alert> getAlerts() {
return alerts;
}
@Override
public String getLogId() {
return logPath;
}
@Override
public Date getLogTimestamp() {
try {
return ISO_FORMAT.parse(fields.get(0).replaceAll("Z$", "+0000"));
} catch (ParseException e) {
// Should never happen!
throw new RuntimeException(e);
}
}
@Override
public int getHTTPCode() {
return Integer.parseInt(fields.get(1));
}
@Override
public long getDownloadSize() {
if (fields.get(2).equals("-"))
return 0;
return Long.parseLong(fields.get(2));
}
@Override
public String getURL() {
return fields.get(3);
}
@Override
public String getHost() {
if (bufferedHost == null)
parseHost();
return bufferedHost;
}
@Override
public String getDomain() {
if (bufferedDomain == null)
parseHost();
return bufferedDomain;
}
@Override
public String getSubdomain() {
if (bufferedSubdomain == null)
parseHost();
return bufferedSubdomain;
}
@Override
public String getBreadcrumbCodes() {
return fields.get(4);
}
@Override
public String getReferrer() {
return fields.get(5);
}
@Override
public String getContentType() {
return fields.get(6);
}
@Override
public String getWorkerThread() {
return fields.get(7);
}
@Override
public Date getFetchTimestamp() {
try {
String timestamp = fields.get(8);
if ("-".equals(timestamp)) return null;
if (timestamp.indexOf('+') > -1)
timestamp = timestamp.substring(0, timestamp.indexOf('+'));
//Logger.info("fetch timestamp: " + timestamp);
return RFC2550_FORMAT.parse(timestamp);
} catch (ParseException e) {
Logger.error("Bad date in line: "+this.line);
// Should never happen!
throw new RuntimeException(e);
}
}
@Override
public int getFetchDuration() {
String duration = fields.get(8);
if (duration.indexOf('+') > -1) {
duration = duration.substring(duration.indexOf('+') + 1);
return Integer.parseInt(duration);
}
return 0;
}
@Override
public String getSHA1Hash() {
return fields.get(9);
}
@Override
public String getAnnotations() {
return fields.get(11);
}
@Override
public int getRetries() {
for (String a : fields.get(11).split(",")) {
if (a.endsWith("t")) {
String retries = a.substring(0, a.length() - 1);
try {
return Integer.parseInt(retries);
} catch (Throwable t) {
// Do nothing
}
}
}
return 0;
}
// TODO Maybe switch to Snappy: http://xerial.org/snappy-java/
Deflater compresser = new Deflater(Deflater.BEST_SPEED);
@Override
public double getCompressability() {
if (bufferedCompressability == null) {
try {
String url = getURL();
if( url == null ) {
Logger.error("Got URL == null from line: '"+line+"'");
return 1.0;
}
try {
// Get the input as bytes:
byte[] input = url.getBytes("UTF-8");
// Compress the bytes and get compressed length:
byte[] output = new byte[input.length+100];
compresser.setInput(input);
compresser.finish();
int compressedDataLength = compresser.deflate(output);
//compresser.end();
compresser.reset();
// The smaller this ratio is, the more 'compressible' the string,
// i.e. the more repetitive the URL
bufferedCompressability = ((double) compressedDataLength) / ((double) input.length);
} catch (IOException e) {
Logger.error("Could not analyse URL for compressability: " + url);
}
} catch ( Exception e ) {
Logger.error("Caught exception '"+e+"' when reading this URL from crawl log line: '"+ line + "'");
}
}
return bufferedCompressability;
}
/**
* Attempt to make sure the compressor gets cleaned up properly on GC:
*/
protected void finalize() {
compresser.end();
}
@Override
public String toString() {
return line;
}
/**
* Helper method to extract the domain name from a URL.
* Cf. http://stackoverflow.com/questions/4819775/implementing-public-suffix-extraction-using-java
* @param url the URL
* @return the domain name
*/
private static HostParseResult extractDomainNames(LogFileEntry entry) {
// Not the nicest solution - but neither java.net.URL nor com.google.common.net.InternetDomainName
// can handle Heritrix' custom 'dns:' protocol prefix.
String url = entry.getURL();
if (url.startsWith("dns:"))
url = "http://" + url.substring(4);
String host = null;
String domain = null;
String subdomain = null;
try {
host = new URL(url).getHost();
domain = InternetDomainName.from(host).topPrivateDomain().name();
if (!domain.equals(host))
subdomain = host.substring(0, host.lastIndexOf(domain) - 1);
return new HostParseResult(host, domain, subdomain, null);
} catch (MalformedURLException e) {
// Logger.warn(e.getMessage());
return new HostParseResult(url, url, subdomain, new DefaultAlert(entry.getLogTimestamp().getTime(), url, AlertType.MALFORMED_CRAWL_URL, MSG_MALFORMED_URL + url));
} catch (IllegalArgumentException e) {
// Will be thrown by InternetDomainName.from in case the host name looks weird
// Logger.warn(e.getMessage());
// Special handling for the most common error cause - subdomains ending with '-'
String[] tokens = host.split("\\.");
int offendingToken = -1;
for (int i=0; i<tokens.length; i++) {
if (tokens[i].endsWith("-"))
offendingToken = i;
}
if (offendingToken > -1) {
StringBuilder subdomainBuilder = new StringBuilder();
for (int i=0; i<offendingToken + 1; i++)
subdomainBuilder.append("." + tokens[i]);
subdomain = subdomainBuilder.toString().substring(1);
StringBuilder hostBuilder = new StringBuilder();
for (int i=offendingToken + 1; i<tokens.length; i++)
hostBuilder.append("." + tokens[i]);
domain = hostBuilder.toString().substring(1);
}
return new HostParseResult(host, domain, subdomain, new DefaultAlert(entry.getLogTimestamp().getTime(), host, AlertType.MALFORMED_CRAWL_URL, MSG_MALFORMED_URL + url));
} catch (IllegalStateException e) {
// Will be thrown by InternetDomainName.from in case the host name looks weird
Logger.warn(e.getMessage());
return new HostParseResult(host, domain, subdomain, new DefaultAlert(entry.getLogTimestamp().getTime(), url, AlertType.MALFORMED_CRAWL_URL, MSG_MALFORMED_URL + url));
} catch (Throwable e) {
Logger.warn("Offending host: " + host);
Logger.warn("Extracted subdomain: " + subdomain);
throw new RuntimeException(e);
}
}
/**
* Simple helper class to wrap the result of host-from-URL parsing.
*/
private static class HostParseResult {
private String host;
private String domain;
private String subdomain;
private Alert alert;
HostParseResult(String host, String domain, String subdomain, Alert alert) {
this.host = host;
this.domain = domain;
this.subdomain = subdomain;
this.alert = alert;
}
}
/**
* An in-memory implementation of {@link Alert}.
*/
public static class DefaultAlert implements Alert {
private long timestamp;
private String offendingHost;
private AlertType type;
private String description;
public DefaultAlert(long timestamp, String offendingHost, AlertType type, String description) {
this.timestamp = timestamp;
this.offendingHost = offendingHost;
this.type = type;
this.description = description;
}
@Override
public long getTimestamp() {
return timestamp;
}
@Override
public String getOffendingHost() {
return offendingHost;
}
@Override
public AlertType getAlertType() {
return type;
}
@Override
public String getAlertDescription() {
return description;
}
}
/**
* An in-memory representation of a {@link VirusRecord}.
*/
public static class DefaultVirusRecord implements VirusRecord {
private String name;
private Map<String, Integer> occurences;
public DefaultVirusRecord(String name, Map<String, Integer> occurences ) {
this.name = name;
this.occurences = occurences;
}
@Override
public String getName() {
return this.name;
}
@Override
public Map<String, Integer> getOccurences() {
return this.occurences;
}
}
}