package uk.bl.monitrix.database.cassandra.ingest;
import play.Logger;
import com.datastax.driver.core.BoundStatement;
import com.datastax.driver.core.PreparedStatement;
import com.datastax.driver.core.Session;
import uk.bl.monitrix.analytics.LogAnalytics;
import uk.bl.monitrix.database.cassandra.CassandraProperties;
import uk.bl.monitrix.database.cassandra.model.CassandraCrawlStats;
import uk.bl.monitrix.database.cassandra.model.CassandraCrawlStatsUnit;
import uk.bl.monitrix.model.CrawlLogEntry;
import uk.bl.monitrix.model.IngestSchedule;
import uk.bl.monitrix.model.KnownHost;
/**
* An extended version of {@link CassandraCrawlStats} that adds ingest capability.
* The ingest is 'smart' in the sense as it also performs various aggregation computations,
* including those involving the known hosts list.
* @author Rainer Simon <rainer.simon@ait.ac.at>
*/
class CassandraCrawlStatsImporter extends CassandraCrawlStats {
private PreparedStatement statement = null;
private CassandraKnownHostImporter knownHosts;
private CassandraVirusLogImporter virusLog;
public CassandraCrawlStatsImporter(Session db, IngestSchedule schedule, CassandraKnownHostImporter knownHosts, CassandraVirusLogImporter virusLog) {
super(db, schedule);
this.knownHosts = knownHosts;
this.virusLog = virusLog;
prepareNewCrawlStatsLine();
}
private void prepareNewCrawlStatsLine() {
this.statement = session.prepare(
"INSERT INTO " + CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_CRAWL_STATS + " (" +
CassandraProperties.FIELD_CRAWL_STATS_CRAWL_ID + ", " +
CassandraProperties.FIELD_CRAWL_STATS_TIMESTAMP + ", " +
CassandraProperties.FIELD_CRAWL_STATS_DOWNLOAD_VOLUME + ", " +
CassandraProperties.FIELD_CRAWL_STATS_NUMBER_OF_URLS_CRAWLED + ", " +
CassandraProperties.FIELD_CRAWL_STATS_NEW_HOSTS_CRAWLED + ", " +
CassandraProperties.FIELD_CRAWL_STATS_COMPLETED_HOSTS + ") " +
"VALUES (?, ?, ?, ?, ?, ?);");
}
private void insertNewCrawlStatsLine(String crawl_id, long timeslot, CrawlLogEntry entry) {
BoundStatement boundStatement = new BoundStatement(statement);
session.execute(boundStatement.bind(
crawl_id,
timeslot,
entry.getDownloadSize(),
1l, 0l, 0l));
}
/**
* Updates the crawl stats with a single log entry. Note that this method ONLY writes to
* the in-memory cache to avoid excessive DB transactions! To write to the DB, execute the
* .commit() method after your updates are done.
* @param entry the log entry
*/
public void update(CrawlLogEntry entry, String crawl_id) {
// Step 1 - compute the timeslot
long timeslot = toTimeslot(entry.getLogTimestamp().getTime());
// Step 2 - update data for this timeslot
CassandraCrawlStatsUnit currentUnit = (CassandraCrawlStatsUnit) getStatsForTimestamp(timeslot, crawl_id);
if (currentUnit == null) {
insertNewCrawlStatsLine(crawl_id, timeslot, entry);
// This also ensures we got the unit in the cache now
currentUnit = (CassandraCrawlStatsUnit) getStatsForTimestamp(timeslot, crawl_id);
} else {
currentUnit.setDownloadVolume(currentUnit.getDownloadVolume() + entry.getDownloadSize());
currentUnit.setNumberOfURLsCrawled(currentUnit.getNumberOfURLsCrawled() + 1);
}
// Step 3 - update hosts info
String hostname = entry.getHost();
if (knownHosts.isKnown(hostname)) {
KnownHost host = knownHosts.getKnownHost(hostname);
// Update host completion time
long lastRecordedAccess = host.getLastAccess();
if (lastRecordedAccess < timeslot) {
CassandraCrawlStatsUnit unitToModify = (CassandraCrawlStatsUnit) getStatsForTimestamp(toTimeslot(lastRecordedAccess), crawl_id);
// create crawl stats table for the current time slot if it does not exist
if(unitToModify == null) {
prepareNewCrawlStatsLine();
insertNewCrawlStatsLine(crawl_id, timeslot, entry);
} else {
unitToModify.setCompletedHosts(unitToModify.countCompletedHosts() - 1);
}
currentUnit.setCompletedHosts(currentUnit.countCompletedHosts() + 1);
}
// Update last access time
knownHosts.setLastAccess(hostname, entry.getLogTimestamp().getTime());
} else {
long timestamp = entry.getLogTimestamp().getTime();
knownHosts.addToList(hostname, entry.getDomain(), entry.getSubdomain(), timestamp);
currentUnit.setNumberOfNewHostsCrawled(currentUnit.getNumberOfNewHostsCrawled() + 1);
currentUnit.setCompletedHosts(currentUnit.countCompletedHosts() + 1);
}
// Note: it's a little confusing that these aggregation steps are in this class
// TODO move into the main CassandraBatchImporter
knownHosts.incrementFetchStatusCounter(hostname, entry.getHTTPCode());
knownHosts.incrementCrawledURLCounter(hostname);
knownHosts.updateAverageResponseTimeAndRetryRate(hostname, entry.getFetchDuration(), entry.getRetries());
// Warning: there seems to be a bug in Heritrix which sometimes leaves a 'content type template' (?)
// in the log line: content type = '$ctype'. This causes CassandraDB to crash, because it can't use
// strings starting with '$' as JSON keys. Therefore, we'll cut off the '$' and log a warning.
String contentType = entry.getContentType();
if (contentType.charAt(0) == '$') {
Logger.warn("Invalid content type found in log: " + contentType);
contentType = contentType.substring(1);
}
knownHosts.incrementContentTypeCounter(hostname, contentType);
String virusName = LogAnalytics.extractVirusName(entry);
if (virusName != null) {
knownHosts.incrementVirusStats(hostname, virusName);
virusLog.recordOccurence(virusName, hostname);
}
// Step 5 - save
// TODO optimize caching - insert LRU elements into DB when reasonable
cache.put(timeslot, currentUnit);
}
private long toTimeslot(long timestamp) {
return (timestamp / CassandraProperties.PRE_AGGREGATION_RESOLUTION_MILLIS) * CassandraProperties.PRE_AGGREGATION_RESOLUTION_MILLIS;
}
public void commit() {
for (CassandraCrawlStatsUnit csu : cache.values()) {
csu.save(session);
}
knownHosts.commit();
}
}