package uk.bl.monitrix.database.mongodb.ingest;
import java.util.ArrayList;
import java.util.Map;
import play.Logger;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import uk.bl.monitrix.analytics.HostAnalytics;
import uk.bl.monitrix.database.mongodb.model.MongoAlert;
import uk.bl.monitrix.database.mongodb.model.MongoKnownHost;
import uk.bl.monitrix.database.mongodb.model.MongoKnownHostList;
import uk.bl.monitrix.model.Alert.AlertType;
/**
* An extended version of {@link MongoKnownHostList} that adds insert/update capability.
*
* TODO this whole class really needs some cleanup!
*
* @author Rainer Simon <rainer.simon@ait.ac.at>
*/
class MongoKnownHostImporter extends MongoKnownHostList {

    private static final String ALERT_MSG_TOO_MANY_SUBDOMAINS = "The host %s has a suspiciously high number of subdomains (%s)";

    private static final String ALERT_MSG_TXT_TO_NONTEXT_RATIO = "The host %s serves a suspiciously high ratio of text vs. non-text resources";

    /** A host with more subdomains than this triggers a TOO_MANY_SUBDOMAINS alert. */
    private static final int SUBDOMAIN_ALERT_LIMIT = 100;

    /** A host with a text-to-non-text ratio above this triggers a TXT_TO_NONTEXT_RATIO alert. */
    private static final double TEXT_TO_NONTEXT_RATIO_ALERT_LIMIT = 0.9;

    /** Alert log that receives the host-level alerts generated during {@link #commit()}. */
    private final MongoAlertLogImporter alertLog;

    /**
     * Creates a new importer.
     * @param db the database handle, passed on to the superclass
     * @param alertLog the alert log that host-level alerts are inserted into
     */
    public MongoKnownHostImporter(DB db, MongoAlertLogImporter alertLog) {
        super(db);
        this.alertLog = alertLog;
    }

    /**
     * Adds a new host to the Known Hosts list. Note that this method ONLY writes to
     * the in-memory cache! In order to write to the database, execute the .commit() method
     * after your additions are done.
     * <p>
     * The top-level domain is derived from the host name as the text following the last
     * '.'; if the host name contains no '.', the whole host name is used.
     * @param hostname the host name (also used as the cache key)
     * @param domain the domain to record for the host
     * @param subdomain the subdomain to record for the host
     * @param accessTime the access time, recorded as both first and last access
     * @return the newly created host record
     */
    public MongoKnownHost addToList(String hostname, String domain, String subdomain, long accessTime) {
        MongoKnownHost knownHost = new MongoKnownHost(new BasicDBObject());
        knownHost.setHostname(hostname);
        knownHost.setTopLevelDomain(hostname.substring(hostname.lastIndexOf('.') + 1));
        knownHost.setDomain(domain);
        knownHost.setSubdomain(subdomain);
        knownHost.setFirstAccess(accessTime);
        knownHost.setLastAccess(accessTime);
        cache.put(hostname, knownHost);
        return knownHost;
    }

    /**
     * Updates the last access time for the specified host. Note that this method ONLY
     * writes to the in-memory cache! In order to write to the database, execute the .commit()
     * method after your additions are done.
     * If the host is unknown, a warning is logged and nothing is changed.
     * @param hostname the hostname
     * @param lastAccess the new last access time
     */
    public void setLastAccess(String hostname, long lastAccess) {
        // In this case we know it's a safe cast
        MongoKnownHost host = (MongoKnownHost) getKnownHost(hostname);
        if (host != null) {
            host.setLastAccess(lastAccess);
        } else {
            Logger.warn("Attempt to write last access info to unknown host: " + hostname);
        }
    }

    /**
     * Registers a crawler ID with the specified host.
     * If the host is unknown, a warning is logged and nothing is changed.
     * @param hostname the hostname
     * @param crawlerId the crawler ID to add
     */
    public void addCrawlerID(String hostname, String crawlerId) {
        // In this case we know it's a safe cast
        MongoKnownHost host = (MongoKnownHost) getKnownHost(hostname);
        if (host != null) {
            host.addCrawlerID(crawlerId);
        } else {
            Logger.warn("Attempt to write crawlerID info to unknown host: " + hostname);
        }
    }

    /**
     * Increments, by one, the counter for the given fetch status code in the host's
     * fetch status distribution. The status code is stored under its decimal string form.
     * If the host is unknown, a warning is logged and nothing is changed.
     * @param hostname the hostname
     * @param fetchStatus the fetch status code
     */
    public void incrementFetchStatusCounter(String hostname, int fetchStatus) {
        // In this case we know it's a safe cast
        MongoKnownHost host = (MongoKnownHost) getKnownHost(hostname);
        if (host != null) {
            Map<String, Integer> fetchStatusMap = host.getFetchStatusDistribution();
            incrementCounter(fetchStatusMap, Integer.toString(fetchStatus));
            host.setFetchStatusDistribution(fetchStatusMap);
        } else {
            Logger.warn("Attempt to write fetch status info to unknown host: " + hostname);
        }
    }

    /**
     * Increments the host's crawled-URL counter by one.
     * If the host is unknown, a warning is logged and nothing is changed.
     * @param hostname the hostname
     */
    public void incrementCrawledURLCounter(String hostname) {
        // In this case we know it's a safe cast
        MongoKnownHost host = (MongoKnownHost) getKnownHost(hostname);
        if (host != null) {
            host.setCrawledURLs(host.getCrawledURLs() + 1);
        } else {
            Logger.warn("Attempt to increment crawled URL counter for unknown host: " + hostname);
        }
    }

    /**
     * Increments, by one, the counter for the given content type in the host's content
     * type distribution. Every '.' in the content type is replaced by '@' before it is
     * used as a key.
     * If the host is unknown, a warning is logged and nothing is changed.
     * @param hostname the hostname
     * @param contentType the content type
     */
    public void incrementContentTypeCounter(String hostname, String contentType) {
        // According to MongoDB rules: "fields stored in the db can't have . in them"
        contentType = contentType.replace('.', '@');

        // In this case we know it's a safe cast
        MongoKnownHost host = (MongoKnownHost) getKnownHost(hostname);
        if (host != null) {
            Map<String, Integer> contentTypeMap = host.getContentTypeDistribution();
            incrementCounter(contentTypeMap, contentType);
            host.setContentTypeDistribution(contentTypeMap);
        } else {
            Logger.warn("Attempt to write content type info to unknown host: " + hostname);
        }
    }

    /**
     * Increments, by one, the counter for the given virus name in the host's virus stats.
     * If the host is unknown, a warning is logged and nothing is changed.
     * @param hostname the hostname
     * @param virusName the virus name, used as the key as-is
     */
    public void incrementVirusStats(String hostname, String virusName) {
        // NOTE(review): unlike content types, virus names are not escaped for '.' here.
        // If a virus name can contain '.', this presumably hits the same MongoDB field-name
        // restriction - confirm against the data and the reader side before changing.

        // In this case we know it's a safe cast
        MongoKnownHost host = (MongoKnownHost) getKnownHost(hostname);
        if (host != null) {
            Map<String, Integer> virusMap = host.getVirusStats();
            incrementCounter(virusMap, virusName);
            host.setVirusStats(virusMap);
        } else {
            Logger.warn("Attempt to write virus stats info to unknown host: " + hostname);
        }
    }

    /**
     * Folds one more fetch into the host's running averages for fetch duration and retry
     * rate, and increments the host's count of successfully fetched URLs. Fetches with a
     * duration of zero or less are ignored entirely.
     * If the host is unknown, a warning is logged and nothing is changed.
     * @param hostname the hostname
     * @param fetchDuration the duration of the fetch; must be greater than 0 to be counted
     * @param retries the number of retries for the fetch
     */
    public void updateAverageResponseTimeAndRetryRate(String hostname, int fetchDuration, int retries) {
        if (fetchDuration <= 0) {
            return;
        }

        // In this case we know it's a safe cast
        MongoKnownHost host = (MongoKnownHost) getKnownHost(hostname);
        if (host != null) {
            long successCount = host.getSuccessfullyFetchedURLs();

            // Incremental mean: re-weight the previous average by the previous sample count
            double currentAvgResponseTime = host.getAverageFetchDuration();
            double newAvgResponseTime = (currentAvgResponseTime * successCount + fetchDuration) / (successCount + 1);

            double currentAvgRetryRate = host.getAverageRetryRate();
            double newAvgRetryRate = (currentAvgRetryRate * successCount + retries) / (successCount + 1);

            host.setSuccessfullyFetchedURLs(successCount + 1);
            host.setAverageFetchDuration(newAvgResponseTime);
            host.setAverageRetryRate(newAvgRetryRate);
        } else {
            Logger.warn("Attempt to update average response time for unknown host: " + hostname);
        }
    }

    /**
     * Writes the contents of the cache to the database.
     * <p>
     * For every cached host, the robots.txt block percentage, redirect percentage and
     * text-to-non-text ratio are recomputed before the host is saved. Afterwards,
     * host-level alerts are computed for the cached hosts and the cache is cleared.
     */
    public void commit() {
        Logger.info("Updating known hosts list (" + cache.size() + " hosts)");

        // Iterate over a copy of the cached hosts rather than the live collection
        for (MongoKnownHost knownHost : new ArrayList<MongoKnownHost>(cache.values())) {
            // Looks a little recursive...
            knownHost.setRobotsBlockPercentage(HostAnalytics.computePercentageOfRobotsTxtBlocks(knownHost));
            knownHost.setRedirectPercentage(HostAnalytics.computePercentagOfRedirects(knownHost));
            knownHost.setTextToNoneTextRatio(HostAnalytics.computeTextToNonTextRatio(knownHost));
            collection.save(knownHost.getBackingDBO());
        }

        // Compute host-level alerts
        // Note: we only need to consider hosts that were added in this batch - i.e. those in the cache!
        // FIXME: Moving to the host-wise model means we've lost the alerts.
        Logger.info("Computing host-level alerts");
        for (MongoKnownHost host : cache.values()) {
            // Subdomain limit
            // The count is currently hard-coded to 1, so this alert can never fire (see FIXME above)
            int subdomains = 1; //host.getSubdomain().size();
            if (subdomains > SUBDOMAIN_ALERT_LIMIT) {
                insertAlert(host, AlertType.TOO_MANY_SUBDOMAINS,
                        String.format(ALERT_MSG_TOO_MANY_SUBDOMAINS, host.getHostname(), Integer.toString(subdomains)));
            }

            // Text-to-Nontext content type ratio limit
            if (host.getTextToNoneTextRatio() > TEXT_TO_NONTEXT_RATIO_ALERT_LIMIT) {
                insertAlert(host, AlertType.TXT_TO_NONTEXT_RATIO,
                        String.format(ALERT_MSG_TXT_TO_NONTEXT_RATIO, host.getHostname()));
            }
        }

        cache.clear();
    }

    /**
     * Increments the counter stored under the given key by one, starting at 1 if the key
     * has no counter yet.
     */
    private static void incrementCounter(Map<String, Integer> counters, String key) {
        Integer current = counters.get(key);
        if (current == null) {
            counters.put(key, 1);
        } else {
            counters.put(key, current.intValue() + 1);
        }
    }

    /**
     * Builds an alert for the given host, timestamped with the host's last access time,
     * and inserts it into the alert log.
     */
    private void insertAlert(MongoKnownHost host, AlertType type, String description) {
        MongoAlert alert = new MongoAlert(new BasicDBObject());
        alert.setTimestamp(host.getLastAccess());
        alert.setOffendingHost(host.getHostname());
        alert.setAlertType(type);
        alert.setAlertDescription(description);
        alertLog.insert(alert);
    }

}