package uk.bl.monitrix.database.cassandra; import java.io.IOException; import java.util.HashMap; import play.Logger; import com.datastax.driver.core.BoundStatement; import com.datastax.driver.core.Cluster; import com.datastax.driver.core.Host; import com.datastax.driver.core.Metadata; import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.ResultSet; import com.datastax.driver.core.Row; import com.datastax.driver.core.Session; import uk.bl.monitrix.database.DBConnector; import uk.bl.monitrix.database.ExtensionTable; import uk.bl.monitrix.database.cassandra.model.CassandraAlertLog; import uk.bl.monitrix.database.cassandra.model.CassandraCrawlLog; import uk.bl.monitrix.database.cassandra.model.CassandraCrawlStats; import uk.bl.monitrix.database.cassandra.model.CassandraIngestSchedule; import uk.bl.monitrix.database.cassandra.model.CassandraKnownHostList; import uk.bl.monitrix.database.cassandra.model.CassandraVirusLog; import uk.bl.monitrix.model.AlertLog; import uk.bl.monitrix.model.CrawlLog; import uk.bl.monitrix.model.CrawlStats; import uk.bl.monitrix.model.IngestSchedule; import uk.bl.monitrix.model.KnownHostList; import uk.bl.monitrix.model.VirusLog; /** * A Cassandra-backed implementation of {@link DBConnector}. * @author Rainer Simon <rainer.simon@ait.ac.at> */ public class CassandraDBConnector implements DBConnector { // The Cassandra Cluster private Cluster cluster; // The Cassandra Session private Session session; // The resolution of the time-wise logging public static final long HOUR_AS_MILLIS = 1000*60*60; // Ingest schedule private volatile IngestSchedule ingestSchedule; // Crawl log private CrawlLog crawlLog; // Crawl stats private CrawlStats crawlStats; // Known hosts list private KnownHostList knownHosts; // Alert log private AlertLog alertLog; // Virus log private VirusLog virusLog; // Extension tables private HashMap<String, ExtensionTable> extensionTables = new HashMap<String, ExtensionTable>(); public CassandraDBConnector() throws IOException { init(CassandraProperties.HOST, CassandraProperties.KEYSPACE, CassandraProperties.DB_PORT); } public CassandraDBConnector(String hostName, String keyspace, int dbPort) throws IOException { init(hostName, keyspace, dbPort); } private void init(String hostName, String keyspace, int dbPort) throws IOException { Logger.info("Initializing database connection"); cluster = Cluster.builder() .addContactPoint(hostName).build(); Metadata metadata = cluster.getMetadata(); Logger.info("Connected to Cassandra cluster: " + metadata.getClusterName()); for ( Host host : metadata.getAllHosts() ) { Logger.info("Datacenter: " + host.getDatacenter() + "; Host: " + host.getAddress() + "; Rack: " + host.getRack()); } session = cluster.connect(); // Add schema if needed if (!schemaExists()) createSchema(); this.ingestSchedule = new CassandraIngestSchedule(session); this.crawlLog = new CassandraCrawlLog(session); this.crawlStats = new CassandraCrawlStats(session, ingestSchedule); this.knownHosts = new CassandraKnownHostList(session); this.alertLog = new CassandraAlertLog(session, crawlLog); this.virusLog = new CassandraVirusLog(session); } public boolean isAvailable() { boolean anyHostUp = false; for (Host host : this.getSession().getCluster().getMetadata().getAllHosts()) { if(host.isUp()) anyHostUp = true; if(anyHostUp) continue; } return anyHostUp; } private boolean schemaExists() { Logger.info("Checking if schema exists..."); ResultSet rows = session.execute("select * from system.schema_keyspaces;"); for (Row r : rows ) { if (r.getString("keyspace_name").equals(CassandraProperties.KEYSPACE)) return true; } Logger.info("No schema defined"); return false; } private void createSchema() { Logger.info("Creating schema..."); session.execute("CREATE KEYSPACE " + CassandraProperties.KEYSPACE + " WITH replication " + "= {'class':'SimpleStrategy', 'replication_factor':1};"); // This is a fairly denormalised model, with URL-based lookup for frontier management // and de-duplication, and time-wise lookups session.execute( "CREATE TABLE crawl_uris.crawl_log(" + "log_id varchar , " + "timestamp varchar, " + "long_timestamp bigint, " + "coarse_timestamp bigint, " + "status_code int, " + "downloaded_bytes bigint, " + "uri varchar, " + "host varchar, " + "domain varchar, " + "subdomain varchar, " + "discovery_path varchar, " + "referer varchar, " + "content_type varchar, " + "worker_thread varchar, " + "fetch_ts bigint, " + "hash varchar, " + "annotations varchar, " + "ip_address varchar, " + "compressability double, " + "line varchar, " + "PRIMARY KEY (hash, timestamp) ); "); buildCompressabilityHistogram(); session.execute( "CREATE TABLE crawl_uris.ingest_schedule(" + "crawl_id varchar PRIMARY KEY, " + "log_path varchar, " + "start_ts bigint, " + "end_ts bigint, " + "ingested_lines bigint, " + "revisit_records bigint, "+ "is_monitored boolean) " + "WITH COMPACT STORAGE;"); session.execute( "CREATE TABLE crawl_uris.known_hosts(" + "host varchar PRIMARY KEY, " + "tld varchar, " + "domain varchar, " + "subdomain varchar, " + "first_access bigint, " + "last_access bigint, " + "crawlers varchar, " + "crawled_urls bigint, " + "successfully_fetched_urls bigint, " + "avg_fetch_duration double, " + "avg_retry_rate double, " + "fetch_status_codes varchar, " + "content_types varchar, " + "virus_stats varchar, " + "redirect_percentage double, " + "robots_block_percentage double, " + "text_to_nontext_ratio double);"); session.execute( "CREATE TABLE crawl_uris.known_tlds(" + "tld varchar PRIMARY KEY," + "count bigint);"); session.execute( "CREATE TABLE crawl_uris.crawl_stats(" + "crawl_id varchar, " + "stat_ts bigint, " + "downloaded_bytes bigint, " + "uris_crawled bigint, " + "new_hosts bigint, " + "completed_hosts bigint, " + "PRIMARY KEY (crawl_id, stat_ts) );"); session.execute( "CREATE TABLE crawl_uris.alert_log(" + "timestamp bigint, " + "timestamp_hour bigint, " + "offending_host varchar, " + "alert_type varchar, " + "alert_description varchar, " + "PRIMARY KEY (offending_host, timestamp_hour, timestamp) );"); session.execute( "CREATE TABLE crawl_uris.virus_log(" + "virus_name varchar PRIMARY KEY, " + "occurences varchar )" + "WITH COMPACT STORAGE;"); // Crawl log indexes session.execute("CREATE INDEX log_id on crawl_uris.crawl_log(log_id);"); session.execute("CREATE INDEX long_log_ts on crawl_uris.crawl_log(long_timestamp);"); session.execute("CREATE INDEX coarse_ts on crawl_uris.crawl_log(coarse_timestamp);"); session.execute("CREATE INDEX uri on crawl_uris.crawl_log(uri);"); session.execute("CREATE INDEX host on crawl_uris.crawl_log(host);"); session.execute("CREATE INDEX annotations on crawl_uris.crawl_log(annotations);"); session.execute("CREATE INDEX compressability on crawl_uris.crawl_log(compressability);"); // Ingest schedule indexes session.execute("CREATE INDEX log_path on crawl_uris.ingest_schedule(log_path);"); // Known host table indexes session.execute("CREATE INDEX tld on crawl_uris.known_hosts(tld);"); session.execute("CREATE INDEX avg_fetch_duration on crawl_uris.known_hosts(avg_fetch_duration);"); session.execute("CREATE INDEX avg_retry_rate on crawl_uris.known_hosts(avg_retry_rate);"); session.execute("CREATE INDEX robots_block_percentage on crawl_uris.known_hosts(robots_block_percentage);"); session.execute("CREATE INDEX redirect_percentage on crawl_uris.known_hosts(redirect_percentage);"); // Alert log indexes session.execute("CREATE INDEX alert_type on crawl_uris.alert_log(alert_type);"); } private void buildCompressabilityHistogram() { // Note: for some reason, doubles don't properly work as primary keys // So we use an integer as bucket, where key = compressability * 1000 session.execute( "CREATE TABLE crawl_uris.compressability_histogram(" + "bucket int PRIMARY KEY," + "url_count bigint) " + "WITH COMPACT STORAGE;"); PreparedStatement insertHistogramBucket = session.prepare( "INSERT INTO crawl_uris.compressability_histogram (" + "bucket, url_count) VALUES (?, ?);"); int bucket = 0; Logger.info("Initializing URL compressability histogram"); while (bucket < 2000) { BoundStatement boundStatement = new BoundStatement(insertHistogramBucket); session.execute(boundStatement.bind(bucket, 0l)); bucket += 1; } Logger.info("Done."); } public void dropSchema() { if (schemaExists() ) session.execute("DROP KEYSPACE " + CassandraProperties.KEYSPACE); } public Session getSession() { return this.session; } @Override public IngestSchedule getIngestSchedule() { return ingestSchedule; } @Override public CrawlLog getCrawlLog() { return crawlLog; } @Override public CrawlStats getCrawlStats() { return crawlStats; } @Override public AlertLog getAlertLog() { return alertLog; } @Override public VirusLog getVirusLog() { return virusLog; } @Override public KnownHostList getKnownHostList() { return knownHosts; } @Override public void close() { this.session.shutdown(); this.cluster.shutdown(); } @Override @SuppressWarnings("unchecked") public <T extends ExtensionTable> T getExtensionTable(String name, Class<T> type) { ExtensionTable ext = extensionTables.get(name); if (ext == null) { try { ext = null;//type.getConstructor(DB.class).newInstance(db); extensionTables.put(name, ext); } catch (Exception e) { throw new RuntimeException(e); } } return (T) ext; } }