package uk.bl.monitrix.database.cassandra.ingest;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import play.Logger;

import uk.bl.monitrix.database.DBConnector;
import uk.bl.monitrix.database.DBIngestConnector;
import uk.bl.monitrix.database.cassandra.CassandraDBConnector;
import uk.bl.monitrix.database.cassandra.CassandraProperties;
import uk.bl.monitrix.database.cassandra.model.CassandraIngestSchedule;
import uk.bl.monitrix.heritrix.LogFileEntry;
import uk.bl.monitrix.heritrix.LogFileEntry.DefaultAlert;
import uk.bl.monitrix.model.Alert;
import uk.bl.monitrix.model.IngestSchedule;

/**
 * An importer class that ingests crawl log entries in batches, performing all necessary
 * data aggregation computations.
 *
 * IMPORTANT: the ingest process is not thread safe!
 *
 * @author Rainer Simon <rainer.simon@ait.ac.at>
 */
public class CassandraDBIngestConnector implements DBIngestConnector {

    // DB connection
    private CassandraDBConnector db;

    // Ingest schedule
    private CassandraIngestSchedule ingestSchedule;

    // Crawl log
    private CassandraCrawlLogImporter crawlLogImporter;

    // Alert log
    private CassandraAlertLogImporter alertLogImporter;

    // Known host list
    private CassandraKnownHostImporter knownHostImporter;

    // Crawl stats
    private CassandraCrawlStatsImporter crawlStatsImporter;

    public CassandraDBIngestConnector(DBConnector db) throws IOException {
        this.db = (CassandraDBConnector) db;
        this.init();
    }

    private void init() throws IOException {
        this.ingestSchedule = new CassandraIngestSchedule(db.getSession());
        this.crawlLogImporter = new CassandraCrawlLogImporter(db.getSession());
        this.alertLogImporter = new CassandraAlertLogImporter(db.getSession(), crawlLogImporter);
        this.knownHostImporter = new CassandraKnownHostImporter(db.getSession(), this.ingestSchedule, this.alertLogImporter);
        this.crawlStatsImporter = new CassandraCrawlStatsImporter(db.getSession(), ingestSchedule, knownHostImporter,
                new CassandraVirusLogImporter(db.getSession()));
    }

    @Override
    public IngestSchedule getIngestSchedule() {
        return ingestSchedule;
    }

    @Override
    public void insert(String logId, Iterator<LogFileEntry> iterator) {
        long start = System.currentTimeMillis();
        String crawlerId = ingestSchedule.getLog(logId).getCrawlerId();

        while (iterator.hasNext()) {
            long bulkStart = System.currentTimeMillis();

            List<DefaultAlert> alertBatch = new ArrayList<DefaultAlert>();

            int counter = 0; // Should be slightly faster than using list.size() to count
            int revisits = 0;
            long timeOfFirstLogEntryInBatch = Long.MAX_VALUE;
            long timeOfLastLogEntryInBatch = 0;

            Logger.info("Processing next batch of log entries");
            while (iterator.hasNext() && (counter < CassandraProperties.BULK_INSERT_CHUNK_SIZE)) {
                LogFileEntry next = iterator.next();
                counter++;

                if (next.isRevisitRecord())
                    revisits++;

                // Skip lines that could not be parsed
                if (next.getParseFailed()) {
                    Logger.error("Skipping log line " + counter + " of this batch due to a parse failure");
                    continue;
                }

                // Track the time range covered by this batch
                long timestamp = next.getLogTimestamp().getTime();
                if (timestamp < timeOfFirstLogEntryInBatch)
                    timeOfFirstLogEntryInBatch = timestamp;
                if (timestamp > timeOfLastLogEntryInBatch)
                    timeOfLastLogEntryInBatch = timestamp;

                crawlLogImporter.insert(next);
                crawlStatsImporter.update(next, crawlerId);
                knownHostImporter.addCrawlerID(next.getHost(), crawlerId);

                // FIXME Check for long runs and raise alerts?
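                // A hedged sketch only, addressing the FIXME above - not part of the
                // original class. One way to detect a "long run" would be to count
                // consecutive entries for the same host and queue an alert with the
                // batch below. Both the helper and the threshold are hypothetical:
                //
                //     if (consecutiveEntriesForHost(next.getHost()) > LONG_RUN_THRESHOLD)
                //         alertBatch.add(buildLongRunAlert(next)); // hypothetical factory for a DefaultAlert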
                // Log-entry-level alerts
                for (Alert a : next.getAlerts()) {
                    alertBatch.add((DefaultAlert) a);
                }
            }

            Logger.info("Updating crawl info");
            ingestSchedule.incrementIngestedLogLines(logId, counter, revisits);
            crawlLogImporter.updateCrawlInfo(crawlerId, timeOfFirstLogEntryInBatch, timeOfLastLogEntryInBatch);

            Logger.info("Inserting alerts");
            alertLogImporter.insert(alertBatch);
            alertBatch.clear();

            Logger.info("Committing crawl stats");
            crawlStatsImporter.commit();

            Logger.info("Batch ingest complete (" + (System.currentTimeMillis() - bulkStart) + " ms)");
        }

        Logger.debug("Done - took " + (System.currentTimeMillis() - start) + " ms");
    }

}
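/*
 * A minimal usage sketch, not part of the original file. It assumes the log
 * identified by "my-log-id" has already been registered with the IngestSchedule
 * (insert() looks up its crawler ID first), and it leaves open how the
 * CassandraDBConnector and the LogFileEntry iterator are obtained, since that
 * depends on the surrounding monitrix setup:
 *
 *     DBConnector db = ...;                       // an initialized CassandraDBConnector
 *     DBIngestConnector ingest = new CassandraDBIngestConnector(db);
 *     Iterator<LogFileEntry> entries = ...;       // e.g. parsed from a Heritrix crawl log
 *     ingest.insert("my-log-id", entries);        // not thread safe - single caller only
 */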