package uk.bl.monitrix.database.cassandra.ingest;

import java.util.Date;

import com.datastax.driver.core.BoundStatement;
import com.datastax.driver.core.PreparedStatement;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;

import uk.bl.monitrix.database.cassandra.CassandraProperties;
import uk.bl.monitrix.database.cassandra.model.CassandraCrawlLog;
import uk.bl.monitrix.heritrix.LogFileEntry;

/**
 * An extended version of {@link CassandraCrawlLog} that adds insert capability.
 * @author Rainer Simon <rainer.simon@ait.ac.at>
 */
class CassandraCrawlLogImporter extends CassandraCrawlLog {

	private PreparedStatement crawlLogStatement = null;

	public CassandraCrawlLogImporter(Session db) {
		super(db);

		// Prepared once at construction; each insert() binds fresh values
		this.crawlLogStatement = session.prepare(
			"INSERT INTO crawl_uris.crawl_log (" +
			"log_id, timestamp, long_timestamp, coarse_timestamp, status_code, downloaded_bytes, uri, host, " +
			"discovery_path, referer, content_type, worker_thread, fetch_ts, hash, annotations, ip_address, " +
			"compressability, line) " +
			"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);");
	}

	public void updateCrawlInfo(String crawlId, long timeOfFirstLogEntryInBatch, long timeOfLastLogEntryInBatch) {
		ResultSet results = session.execute(
			"SELECT * FROM " + TABLE_INGEST_SCHEDULE +
			" WHERE " + CassandraProperties.FIELD_INGEST_CRAWL_ID + "='" + crawlId + "';");

		Row r = results.one();
		if (r == null)
			return; // No schedule row for this crawl ID - nothing to update

		// Widen the [startTs, endTs] interval so it covers this batch
		long startTs = r.getLong(CassandraProperties.FIELD_INGEST_START_TS);
		long endTs = r.getLong(CassandraProperties.FIELD_INGEST_END_TS);

		if (startTs == 0 || timeOfFirstLogEntryInBatch < startTs)
			startTs = timeOfFirstLogEntryInBatch;

		if (timeOfLastLogEntryInBatch > endTs)
			endTs = timeOfLastLogEntryInBatch;

		session.execute(
			"UPDATE " + TABLE_INGEST_SCHEDULE +
			" SET " + CassandraProperties.FIELD_INGEST_START_TS + "=" + startTs + ", " +
			CassandraProperties.FIELD_INGEST_END_TS + "=" + endTs +
			" WHERE " + CassandraProperties.FIELD_INGEST_CRAWL_ID + "='" + crawlId + "';");
	}

	public void insert(LogFileEntry l) {
		// Check timestamp - should be the discovery/queue timestamp;
		// fall back to the log timestamp when no fetch timestamp is present
		Date log_ts = l.getLogTimestamp();
		Date fetch_ts = l.getFetchTimestamp();
		if (fetch_ts == null)
			fetch_ts = log_ts;

		Date coarse_ts = getCoarseTimestamp(log_ts);

		// NOTE: the annotations value is bound for both the 'annotations' and
		// 'ip_address' columns - LogFileEntry exposes no separate IP accessor here
		BoundStatement boundStatement = new BoundStatement(crawlLogStatement);
		session.execute(boundStatement.bind(
			l.getLogId(),
			log_ts.toString(),
			log_ts.getTime(),
			coarse_ts.getTime(),
			l.getHTTPCode(),
			l.getDownloadSize(),
			l.getURL(),
			l.getHost(),
			l.getBreadcrumbCodes(),
			l.getReferrer(),
			l.getContentType(),
			l.getWorkerThread(),
			fetch_ts.getTime(),
			l.getSHA1Hash(),
			l.getAnnotations(),
			l.getAnnotations(),
			l.getCompressability(),
			l.toString()));

		// Update the URL compressability histogram: the ratio is mapped to an
		// integer bucket with three decimal places of precision
		int compressabilityBucket = (int) Math.round(l.getCompressability() * 1000);

		Row r = session.execute(
			"SELECT * FROM " + TABLE_COMPRESSABILITY_HISTOGRAM +
			" WHERE " + CassandraProperties.FIELD_COMPRESSABILITY_BUCKET + "=" + compressabilityBucket + ";").one();

		// First entry in this bucket yields no row - start counting at zero.
		// (This read-then-write is not atomic; a Cassandra counter column would
		// avoid lost updates under concurrent ingest.)
		long count = (r == null) ? 0 : r.getLong(CassandraProperties.FIELD_COMPRESSABILITY_COUNT);

		session.execute(
			"UPDATE " + TABLE_COMPRESSABILITY_HISTOGRAM +
			" SET " + CassandraProperties.FIELD_COMPRESSABILITY_COUNT + "=" + (count + 1) +
			" WHERE " + CassandraProperties.FIELD_COMPRESSABILITY_BUCKET + "=" + compressabilityBucket + ";");
	}

}
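
/*
 * Usage sketch (illustrative only): how an ingest loop might drive this
 * importer. The Cluster/Session wiring below uses the standard DataStax
 * driver API, but the contact point, crawl ID, timestamps and the
 * 'parsedLogEntries' iterable are hypothetical - the real caller lives
 * elsewhere in the monitrix ingest pipeline.
 *
 *   Cluster cluster = Cluster.builder().addContactPoint("127.0.0.1").build();
 *   Session session = cluster.connect();
 *
 *   CassandraCrawlLogImporter importer = new CassandraCrawlLogImporter(session);
 *   for (LogFileEntry entry : parsedLogEntries)  // hypothetical iterable
 *       importer.insert(entry);
 *
 *   // Widen the crawl's time interval to cover the batch just ingested
 *   importer.updateCrawlInfo("crawl-01", firstEntryTs, lastEntryTs);
 *
 *   cluster.close();
 */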