package uk.bl.monitrix.database.mongodb.ingest;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import play.Logger;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.Mongo;

import uk.bl.monitrix.database.DBIngestConnector;
import uk.bl.monitrix.database.mongodb.MongoProperties;
import uk.bl.monitrix.database.mongodb.model.MongoAlert;
import uk.bl.monitrix.database.mongodb.model.MongoCrawlLogEntry;
import uk.bl.monitrix.database.mongodb.model.MongoIngestSchedule;
import uk.bl.monitrix.heritrix.LogFileEntry;
import uk.bl.monitrix.model.Alert;
import uk.bl.monitrix.model.IngestSchedule;

/**
 * An importer class that ingests a batch of crawl log entries, performing all
 * necessary data aggregation computations.
 *
 * IMPORTANT: the ingest process is not thread safe!
 *
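 * A minimal usage sketch (hypothetical identifiers: the log ID must already be
 * registered in the ingest schedule, and {@code parseCrawlLog()} stands in for
 * whatever produces the parsed entries, e.g. a Heritrix crawl.log reader):
 *
 * <pre>{@code
 * DBIngestConnector connector = new MongoDBIngestConnector();
 * Iterator<LogFileEntry> entries = parseCrawlLog(); // hypothetical entry source
 * connector.insert("my-log-id", entries);
 * }</pre>
 *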
 * @author Rainer Simon <rainer.simon@ait.ac.at>
 */
public class MongoDBIngestConnector implements DBIngestConnector {

    // MongoDB host
    private Mongo mongo;

    // Monitrix database
    private DB db;

    // Ingest schedule
    private MongoIngestSchedule ingestSchedule;

    // Crawl log
    private MongoCrawlLogImporter crawlLogImporter;

    // Alert log
    private MongoAlertLogImporter alertLogImporter;

    // Known host list
    private MongoKnownHostImporter knownHostImporter;

    // Crawl stats
    private MongoCrawlStatsImporter crawlStatsImporter;

    public MongoDBIngestConnector() throws IOException {
        init(MongoProperties.DB_HOST, MongoProperties.DB_NAME, MongoProperties.DB_PORT);
    }

    public MongoDBIngestConnector(String hostName, String dbName, int dbPort) throws IOException {
        init(hostName, dbName, dbPort);
    }

    private void init(String hostName, String dbName, int dbPort) throws IOException {
        // Note: Mongo is the legacy driver entry point; newer versions of the Java driver use MongoClient
        this.mongo = new Mongo(hostName, dbPort);
        this.db = mongo.getDB(dbName);
        this.ingestSchedule = new MongoIngestSchedule(db);
        this.crawlLogImporter = new MongoCrawlLogImporter(db);
        this.alertLogImporter = new MongoAlertLogImporter(db);
        this.knownHostImporter = new MongoKnownHostImporter(db, this.alertLogImporter);
        this.crawlStatsImporter = new MongoCrawlStatsImporter(db, knownHostImporter, new MongoVirusLogImporter(db));
    }

    @Override
    public IngestSchedule getIngestSchedule() {
        return ingestSchedule;
    }

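    /**
     * Ingests the given log entries in bulk chunks of
     * {@link MongoProperties#BULK_INSERT_CHUNK_SIZE}. For each chunk, entries that
     * failed to parse are skipped, pre-aggregated crawl stats and host info are
     * updated per entry, and the log entries plus any generated alerts are then
     * written to the database in a single batch.
     */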
    @Override
    public void insert(String logId, Iterator<LogFileEntry> iterator) {
        long start = System.currentTimeMillis();
        String crawlerId = ingestSchedule.getLog(logId).getCrawlerId();

        while (iterator.hasNext()) {
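            // Each pass through this loop assembles one bulk chunk in memory, then flushes it to MongoDB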
            long bulkStart = System.currentTimeMillis();

            List<MongoCrawlLogEntry> logEntryBatch = new ArrayList<MongoCrawlLogEntry>();
            List<MongoAlert> alertBatch = new ArrayList<MongoAlert>();

            int counter = 0; // Counts every line handled in this chunk, including skipped ones (slightly faster than list.size())
            long timeOfFirstLogEntryInBatch = Long.MAX_VALUE; // Tracked per chunk, but not currently read after being set
            while (iterator.hasNext() && (counter < MongoProperties.BULK_INSERT_CHUNK_SIZE)) {
                LogFileEntry next = iterator.next();
                counter++;

                // Skip lines that could not be parsed
                if (next.getParseFailed()) {
                    Logger.error("Skipping log line " + counter + " of this chunk due to a parse failure");
                    continue;
                }

                long timestamp = next.getLogTimestamp().getTime();
                if (timestamp < timeOfFirstLogEntryInBatch)
                    timeOfFirstLogEntryInBatch = timestamp;

                // Assemble the MongoDB entity
                MongoCrawlLogEntry dbo = new MongoCrawlLogEntry(new BasicDBObject());
                dbo.setLogId(logId);
                dbo.setTimestamp(timestamp);
                dbo.setURL(next.getURL());
                dbo.setHost(next.getHost());
                dbo.setSubdomain(next.getSubdomain());
                dbo.setCrawlerID(next.getWorkerThread());
                dbo.setHTTPCode(next.getHTTPCode());
                dbo.setAnnotations(next.getAnnotations());
                dbo.setLogLine(next.toString());
                dbo.setRetries(next.getRetries());
                dbo.setCompressability(next.getCompressability());
                logEntryBatch.add(dbo);

                // Update pre-aggregated stats
                crawlStatsImporter.update(next, crawlerId);

                // Host info
                knownHostImporter.addCrawlerID(next.getHost(), crawlerId);

                // Log-entry-level alerts
                for (Alert a : next.getAlerts()) {
                    MongoAlert alert = new MongoAlert(new BasicDBObject());
                    alert.setTimestamp(timestamp);
                    alert.setOffendingHost(a.getOffendingHost());
                    alert.setAlertType(a.getAlertType());
                    alert.setAlertDescription(a.getAlertDescription());
                    alertBatch.add(alert);
                }
            }
Logger.info("Processed " + counter + " log entries (" + (System.currentTimeMillis() - bulkStart) + " ms) - writing to DB");
bulkStart = System.currentTimeMillis();
crawlLogImporter.insert(logEntryBatch);
logEntryBatch.clear();
alertLogImporter.insert(alertBatch);
alertBatch.clear();
crawlStatsImporter.commit();
ingestSchedule.incrementIngestedLogLines(logId, counter);
Logger.info("Done (" + (System.currentTimeMillis() - bulkStart) + " ms)");
}
Logger.debug("Done - took " + (System.currentTimeMillis() - start) + " ms");
}
}