package uk.bl.monitrix.database.mongodb.ingest;
import play.Logger;
import uk.bl.monitrix.analytics.LogAnalytics;
import uk.bl.monitrix.database.mongodb.MongoProperties;
import uk.bl.monitrix.database.mongodb.model.MongoCrawlStats;
import uk.bl.monitrix.database.mongodb.model.MongoCrawlStatsUnit;
import uk.bl.monitrix.model.CrawlLogEntry;
import uk.bl.monitrix.model.KnownHost;
import com.mongodb.DB;
import com.mongodb.BasicDBObject;
/**
* An extended version of {@link MongoCrawlStats} that adds ingest capability.
 * The ingest is 'smart' in the sense that it also performs various aggregation computations,
* including those involving the known hosts list.
* @author Rainer Simon <rainer.simon@ait.ac.at>
*/
class MongoCrawlStatsImporter extends MongoCrawlStats {

	/** Known-hosts aggregation sink, shared with the batch importer. */
	private final MongoKnownHostImporter knownHosts;

	/** Virus occurrence log sink. */
	private final MongoVirusLogImporter virusLog;

	public MongoCrawlStatsImporter(DB db, MongoKnownHostImporter knownHosts, MongoVirusLogImporter virusLog) {
		super(db);
		this.knownHosts = knownHosts;
		this.virusLog = virusLog;
	}

	/**
	 * Updates the crawl stats with a single log entry. Note that this method ONLY writes to
	 * the in-memory cache to avoid excessive DB transactions! To write to the DB, execute the
	 * .commit() method after your updates are done.
	 * @param entry the log entry
	 * @param crawl_id the ID of the crawl this entry belongs to
	 */
	public void update(CrawlLogEntry entry, String crawl_id) {
		// Step 1 - compute the pre-aggregation timeslot this entry falls into
		long timeslot = toTimeslot(entry.getLogTimestamp().getTime());

		// Step 2 - update (or initialize) the stats unit for this timeslot
		MongoCrawlStatsUnit currentUnit = (MongoCrawlStatsUnit) getStatsForTimestamp(timeslot, crawl_id);
		if (currentUnit == null) {
			// Step 3a - no data for this timeslot yet - start a fresh unit
			currentUnit = new MongoCrawlStatsUnit(new BasicDBObject());
			currentUnit.setTimestamp(timeslot);
			currentUnit.setNumberOfURLsCrawled(1);
			currentUnit.setDownloadVolume(entry.getDownloadSize());
			currentUnit.setNumberOfNewHostsCrawled(0);
		} else {
			// Step 3b - update existing data for this timeslot
			currentUnit.setNumberOfURLsCrawled(currentUnit.getNumberOfURLsCrawled() + 1);
			currentUnit.setDownloadVolume(currentUnit.getDownloadVolume() + entry.getDownloadSize());
		}

		// Step 3 - update hosts info
		String hostname = entry.getHost();
		if (knownHosts.isKnown(hostname)) {
			KnownHost host = knownHosts.getKnownHost(hostname);

			// The host was last seen in an earlier timeslot - move its 'completed'
			// marker from that timeslot to the current one
			long lastRecordedAccess = host.getLastAccess();
			if (lastRecordedAccess < timeslot) {
				long previousTimeslot = toTimeslot(lastRecordedAccess);
				MongoCrawlStatsUnit unitToModify = (MongoCrawlStatsUnit) getStatsForTimestamp(previousTimeslot, crawl_id);
				if (unitToModify == null) {
					// Guard: no stats unit recorded for the host's previous timeslot - skip
					// the decrement rather than crash with an NPE
					Logger.warn("No stats unit found for timeslot " + previousTimeslot + " (host: " + hostname + ")");
				} else {
					unitToModify.setCompletedHosts(unitToModify.countCompletedHosts() - 1);
					// Re-cache the modified unit - if it was fetched from the DB (not the
					// cache), the decrement would otherwise be lost at commit time
					cache.put(previousTimeslot, unitToModify);
				}
				currentUnit.setCompletedHosts(currentUnit.countCompletedHosts() + 1);
			}

			// Update last access time
			knownHosts.setLastAccess(hostname, entry.getLogTimestamp().getTime());
		} else {
			// First sighting of this host - register it and count it as new + completed
			long timestamp = entry.getLogTimestamp().getTime();
			knownHosts.addToList(hostname, entry.getDomain(), entry.getSubdomain(), timestamp);
			currentUnit.setNumberOfNewHostsCrawled(currentUnit.getNumberOfNewHostsCrawled() + 1);
			currentUnit.setCompletedHosts(currentUnit.countCompletedHosts() + 1);
		}

		// Note: it's a little confusing that these aggregation steps are in this class
		// TODO move into the main MongoBatchImporter
		knownHosts.incrementFetchStatusCounter(hostname, entry.getHTTPCode());
		knownHosts.incrementCrawledURLCounter(hostname);
		knownHosts.updateAverageResponseTimeAndRetryRate(hostname, entry.getFetchDuration(), entry.getRetries());

		// Warning: there seems to be a bug in Heritrix which sometimes leaves a 'content type template' (?)
		// in the log line: content type = '$ctype'. This causes MongoDB to crash, because it can't use
		// strings starting with '$' as JSON keys. Therefore, we'll cut off the '$' and log a warning.
		// The null/empty guard protects against malformed log lines with a missing content type.
		String contentType = entry.getContentType();
		if (contentType != null && !contentType.isEmpty() && contentType.charAt(0) == '$') {
			Logger.warn("Invalid content type found in log: " + contentType);
			contentType = contentType.substring(1);
		}
		knownHosts.incrementContentTypeCounter(hostname, contentType);

		String virusName = LogAnalytics.extractVirusName(entry);
		if (virusName != null) {
			// MongoDB says: fields stored in the db can't have . in them.
			knownHosts.incrementVirusStats(hostname, virusName.replace('.', '@'));
			virusLog.recordOccurence(virusName, hostname);
		}

		// Step 5 - save
		// TODO optimize caching - insert LRU elements into DB when reasonable
		cache.put(timeslot, currentUnit);
	}

	/**
	 * Truncates a timestamp down to the start of its pre-aggregation timeslot.
	 * @param timestamp the timestamp in milliseconds
	 * @return the start (in milliseconds) of the timeslot containing the timestamp
	 */
	private long toTimeslot(long timestamp) {
		return (timestamp / MongoProperties.PRE_AGGREGATION_RESOLUTION_MILLIS) * MongoProperties.PRE_AGGREGATION_RESOLUTION_MILLIS;
	}

	/**
	 * Writes the contents of the cache to the database, clears the cache, and
	 * commits the known-hosts aggregates accumulated during .update() calls.
	 */
	public void commit() {
		// This means we're making individual commits to the DB
		// TODO see if we can optimize
		for (MongoCrawlStatsUnit dbo : cache.values()) {
			save(dbo);
		}
		cache.clear();
		knownHosts.commit();
	}

	/**
	 * Saves the wrapped DBObject to the collection.
	 * @param unit the stats unit whose backing DBObject is persisted
	 */
	public void save(MongoCrawlStatsUnit unit) {
		collection.save(unit.getBackingDBO());
	}

}