package uk.bl.monitrix.database.cassandra.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;

import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.exceptions.NoHostAvailableException;

import play.Logger;

import uk.bl.monitrix.database.cassandra.CassandraDBConnector;
import uk.bl.monitrix.database.cassandra.CassandraProperties;
import uk.bl.monitrix.model.CrawlLog;
import uk.bl.monitrix.model.CrawlLogEntry;
import uk.bl.monitrix.model.SearchResult;
import uk.bl.monitrix.model.SearchResultItem;
/**
* A CassandraDB-backed implementation of {@link CrawlLog}.
* @author Rainer Simon <rainer.simon@ait.ac.at>
*
*/
public class CassandraCrawlLog extends CrawlLog {
/**
 * Simple (bucket, count) pair used by the compressability histogram.
 * Fields keep their Scala-style names (_1, _2) because callers access
 * them directly.
 */
public class Tuple {
    public int _1;
    public int _2;

    Tuple(int first, int second) {
        _1 = first;
        _2 = second;
    }
}
/** Active Cassandra session used for all queries. */
protected Session session;

/** Fully-qualified (keyspace.table) name of the crawl log table. */
protected final String TABLE_CRAWL_LOG = CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_CRAWL_LOG;

/** Fully-qualified name of the pre-aggregated compressability histogram table. */
protected final String TABLE_COMPRESSABILITY_HISTOGRAM = CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_COMPRESSABILITY_HISTOGRAM;

/** Fully-qualified name of the ingest schedule table (one row per crawl log). */
protected final String TABLE_INGEST_SCHEDULE = CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_INGEST_SCHEDULE;

/**
 * Creates a crawl log backed by the given Cassandra session.
 *
 * @param session an open Cassandra session; must not be null
 * @throws NullPointerException if session is null (fail fast instead of a
 *         deferred NPE on the first query)
 */
public CassandraCrawlLog(Session session) {
    this.session = Objects.requireNonNull(session, "session");
}
/**
 * Truncates a timestamp down to the start of its hour bucket — crawl log
 * rows are keyed by this hour-granularity "coarse" timestamp.
 */
protected Date getCoarseTimestamp(Date timestamp) {
    long millis = timestamp.getTime();
    return new Date(millis - (millis % CassandraDBConnector.HOUR_AS_MILLIS));
}
/**
 * Returns the earliest ingest start timestamp across all scheduled crawls,
 * or -1 if no crawl data exists or Cassandra is unreachable.
 */
@Override
public long getCrawlStartTime() {
    long crawlStartTime = Long.MAX_VALUE;
    try {
        Iterator<Row> rows = session.execute("SELECT * FROM " + TABLE_INGEST_SCHEDULE + ";").iterator();
        while (rows.hasNext()) {
            Row r = rows.next();
            long start_ts = r.getLong(CassandraProperties.FIELD_INGEST_START_TS);
            if (start_ts < crawlStartTime)
                crawlStartTime = start_ts;
        }
        Logger.info("Crawl start time: " + crawlStartTime);
        // No rows at all (MAX_VALUE untouched) or an unset zero timestamp
        // both mean "no crawl yet" — the old code leaked Long.MAX_VALUE.
        if (crawlStartTime == 0 || crawlStartTime == Long.MAX_VALUE)
            return -1;
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
        return -1; // was 0, which callers could mistake for a valid epoch timestamp
    }
    return crawlStartTime;
}
/**
 * Returns the latest ingest end timestamp across all scheduled crawls,
 * or -1 if there has been no crawl activity or Cassandra is unreachable.
 */
@Override
public long getTimeOfLastCrawlActivity() {
    long lastCrawlActivity = 0;
    try {
        Iterator<Row> rows = session.execute("SELECT * FROM " + TABLE_INGEST_SCHEDULE + ";").iterator();
        while (rows.hasNext()) {
            Row r = rows.next();
            long end_ts = r.getLong(CassandraProperties.FIELD_INGEST_END_TS);
            if (end_ts > lastCrawlActivity)
                lastCrawlActivity = end_ts;
        }
        Logger.info("Last crawl activity: " + lastCrawlActivity);
        if (lastCrawlActivity == 0)
            return -1;
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
        return -1; // was 0: keep the -1 "no data" sentinel consistent with the try path
    }
    return lastCrawlActivity;
}
/**
 * Returns up to {@code n} entries from the hour bucket containing the most
 * recent crawl activity. The old code ignored {@code n} and always used
 * LIMIT 100.
 *
 * @param n maximum number of entries to return
 * @return most recent entries, or an empty list if there is no activity or
 *         Cassandra is unreachable
 */
@Override
public List<CrawlLogEntry> getMostRecentEntries(int n) {
    List<CrawlLogEntry> recent = new ArrayList<CrawlLogEntry>();
    try {
        long startTime = getTimeOfLastCrawlActivity();
        if (startTime < 0)
            return recent; // no crawl activity recorded yet — nothing to query
        // Round the time down to the hour bucket the last activity falls into.
        Date coarse_ts = this.getCoarseTimestamp(new Date(startTime));
        Iterator<Row> cursor = session.execute(
                "SELECT * FROM " + TABLE_CRAWL_LOG +
                " WHERE " + CassandraProperties.FIELD_CRAWL_LOG_COARSE_TIMESTAMP + "=" + coarse_ts.getTime() +
                " LIMIT " + n + ";").iterator();
        while (cursor.hasNext())
            recent.add(new CassandraCrawlLogEntry(cursor.next()));
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
    }
    return recent;
}
/**
 * Sums the ingested-line counters of every log in the ingest schedule —
 * i.e. the grand total of crawl log entries across all crawls.
 */
@Override
public long countEntries() {
    long grandTotal = 0;
    try {
        for (Row row : session.execute("SELECT * FROM " + TABLE_INGEST_SCHEDULE + ";").all()) {
            grandTotal += row.getLong(CassandraProperties.FIELD_INGEST_INGESTED_LINES);
        }
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
    }
    return grandTotal;
}
/**
 * Sums the revisit-record counters of every log in the ingest schedule.
 */
@Override
public long countRevisits() {
    long revisitTotal = 0;
    try {
        for (Row row : session.execute("SELECT * FROM " + TABLE_INGEST_SCHEDULE + ";").all()) {
            revisitTotal += row.getLong(CassandraProperties.FIELD_INGEST_REVISIT_RECORDS);
        }
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
    }
    return revisitTotal;
}
/**
 * Lists the crawl id of every log known to the ingest schedule.
 */
@Override
public List<String> listLogIds() {
    List<String> ids = new ArrayList<String>();
    try {
        for (Row row : session.execute("SELECT * FROM " + TABLE_INGEST_SCHEDULE + ";").all()) {
            ids.add(row.getString(CassandraProperties.FIELD_INGEST_CRAWL_ID));
        }
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
    }
    return ids;
}
/**
 * Lists the crawler file path of every log in the ingest schedule.
 * Only the path column is selected to keep the query light.
 */
private List<String> listLogPaths() {
    List<String> paths = new ArrayList<String>();
    try {
        String cql = "SELECT " + CassandraProperties.FIELD_INGEST_CRAWLER_PATH +
                " FROM " + TABLE_INGEST_SCHEDULE + ";";
        for (Row row : session.execute(cql).all()) {
            paths.add(row.getString(CassandraProperties.FIELD_INGEST_CRAWLER_PATH));
        }
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
    }
    return paths;
}
/**
 * Returns the ingested-line count recorded for one crawl log, or 0 if the
 * log id is unknown or Cassandra is unreachable.
 *
 * NOTE(review): logId is concatenated into the CQL string — fine for
 * internally-generated ids, but should move to a prepared statement if the
 * id can ever come from user input.
 */
@Override
public long countEntriesForLog(String logId) {
    long count = 0L;
    try {
        ResultSet results =
                session.execute("SELECT * FROM " + TABLE_INGEST_SCHEDULE + " WHERE " + CassandraProperties.FIELD_INGEST_CRAWL_ID + "='" + logId + "';");
        Row row = results.one();
        // one() returns null when no row matches — the old code NPE'd here.
        if (row != null)
            count = row.getLong(CassandraProperties.FIELD_INGEST_INGESTED_LINES);
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
    }
    return count;
}
/**
 * Fetches all log entries recorded for an exact URL.
 *
 * @return matching entries; an empty list (never null) when Cassandra is
 *         unreachable, so callers can iterate without a null check — the
 *         old code returned null in that case
 */
@Override
public List<CrawlLogEntry> getEntriesForURL(String url) {
    List<CrawlLogEntry> entries = new ArrayList<CrawlLogEntry>();
    try {
        ResultSet results = session.execute("SELECT * FROM " + TABLE_CRAWL_LOG +
                " WHERE " + CassandraProperties.FIELD_CRAWL_LOG_URL + " = '" + url + "';");
        entries = toLogEntries(results);
    } catch (NoHostAvailableException ex) {
        Logger.warn("No hosts available ...");
    }
    return entries;
}
/**
 * Materializes a result set into a list of crawl log entries.
 * Note: drains the whole result set into memory.
 */
private List<CrawlLogEntry> toLogEntries(ResultSet results) {
    List<Row> rows = results.all();
    List<CrawlLogEntry> entries = new ArrayList<CrawlLogEntry>(rows.size());
    for (Iterator<Row> it = rows.iterator(); it.hasNext(); ) {
        entries.add(new CassandraCrawlLogEntry(it.next()));
    }
    return entries;
}
// TODO eliminate code duplication
/**
 * Searches the crawl log for an exact URL, returning one page of results.
 * The old implementation drained the result set with the entry-building
 * code commented out, so it always returned zero items regardless of
 * limit/offset — this restores the intended offset/limit slicing
 * (CQL has no OFFSET, so we over-fetch and skip client-side).
 *
 * @param query  the exact URL to look up
 * @param limit  maximum number of items in this page
 * @param offset number of leading matches to skip
 */
@Override
public SearchResult searchByURL(String query, int limit, int offset) {
    long startTime = System.currentTimeMillis();
    int off_limit = offset + limit;
    ResultSet results =
            session.execute("SELECT * FROM " + TABLE_CRAWL_LOG + " WHERE " + CassandraProperties.FIELD_CRAWL_LOG_URL + " = '" + query + "' LIMIT " + off_limit + ";");
    List<CrawlLogEntry> entries = new ArrayList<CrawlLogEntry>();
    int i = 0;
    Iterator<Row> rows = results.iterator();
    while (rows.hasNext() && i < off_limit) {
        Row r = rows.next();
        if (i >= offset)
            entries.add(new CassandraCrawlLogEntry(r));
        i++;
    }
    // Total match count for the pagination header (uses the same column
    // constant as the page query instead of the hard-coded "uri").
    ResultSet totalResults = session.execute("SELECT COUNT(*) FROM " + TABLE_CRAWL_LOG +
            " WHERE " + CassandraProperties.FIELD_CRAWL_LOG_URL + " = '" + query + "';");
    long total = totalResults.one().getLong("count");
    List<SearchResultItem> urls = new ArrayList<SearchResultItem>();
    for (CrawlLogEntry entry : entries) {
        urls.add(new SearchResultItem(entry.getURL(), entry.toString()));
    }
    return new SearchResult(query, total, urls, limit, offset, System.currentTimeMillis() - startTime);
}
/**
 * Searches entries by annotation.
 *
 * TODO not yet ported from the MongoDB backend — currently always returns
 * an empty result set (total 0, no items) for any query.
 */
@Override
public SearchResult searchByAnnotation(String annotation, int limit, int offset) {
    long startTime = System.currentTimeMillis();
    long total = 0; // placeholder until the annotation query is implemented in CQL
    List<SearchResultItem> urls = new ArrayList<SearchResultItem>();
    long took = System.currentTimeMillis() - startTime;
    return new SearchResult(annotation, total, urls, limit, offset, took);
}
/**
 * Searches the crawl log for entries whose compressability ratio lies
 * strictly between {@code from} and {@code to}, concatenated across all
 * known logs.
 *
 * NOTE(review): the offset is applied per log rather than across the
 * concatenated result — confirm whether callers expect global paging.
 */
@Override
public SearchResult searchByCompressability(double from, double to, int limit, int offset) {
    Logger.debug("Searching by compressability");
    long startTime = System.currentTimeMillis();
    List<SearchResultItem> concatenated = new ArrayList<SearchResultItem>();
    Long total = 0l;
    for (String logPath : listLogPaths()) {
        // Count matches for this log.
        String count = "SELECT COUNT(*) FROM " + TABLE_CRAWL_LOG + " WHERE " + CassandraProperties.FIELD_CRAWL_LOG_COMPRESSABILITY + " > " + from +
                " AND " + CassandraProperties.FIELD_CRAWL_LOG_COMPRESSABILITY + " < " + to + " AND " +
                CassandraProperties.FIELD_CRAWL_LOG_LOG_ID + " = '" + logPath + "' ALLOW FILTERING ;";
        ResultSet totalResults = session.execute(count);
        total += totalResults.one().getLong("count");
        String query = "SELECT * FROM " + TABLE_CRAWL_LOG + " WHERE " + CassandraProperties.FIELD_CRAWL_LOG_COMPRESSABILITY + " > " + from +
                " AND " + CassandraProperties.FIELD_CRAWL_LOG_COMPRESSABILITY + " < " + to + " AND " +
                CassandraProperties.FIELD_CRAWL_LOG_LOG_ID + " = '" + logPath + "'";
        // Fix: the old check (limit > 0 && offset > 0) dropped the LIMIT
        // clause entirely on the first page (offset == 0), fetching everything.
        if (limit > 0) {
            query += " LIMIT " + (limit + offset) + " ALLOW FILTERING ;";
        } else {
            query += " ALLOW FILTERING ;";
        }
        Iterator<Row> results = session.execute(query).iterator();
        // CQL has no OFFSET — skip the first 'offset' rows client-side.
        for (int i = 0; i < offset; i++) {
            if (results.hasNext())
                results.next();
        }
        while (results.hasNext()) {
            CrawlLogEntry entry = new CassandraCrawlLogEntry(results.next());
            concatenated.add(new SearchResultItem(entry.getURL(), entry.toString()));
        }
    }
    Logger.debug("Done - took " + (System.currentTimeMillis() - startTime));
    return new SearchResult(null, total, concatenated, limit, offset, System.currentTimeMillis() - startTime);
}
/**
 * Retrieves the full compressability histogram as (bucket, count) tuples,
 * sorted by bucket in ascending order. At most 2000 buckets are fetched.
 */
public List<Tuple> getCompressabilityHistogram() {
    Logger.debug("Retrieving entire compressability histogram");
    String query = "SELECT * FROM " + TABLE_COMPRESSABILITY_HISTOGRAM + " LIMIT 2000 ;";
    List<Row> result = session.execute(query).all();
    List<Tuple> tuples = new ArrayList<Tuple>(result.size());
    for (Row r : result) {
        tuples.add(new Tuple(r.getInt(CassandraProperties.FIELD_COMPRESSABILITY_BUCKET), (int) r.getLong(CassandraProperties.FIELD_COMPRESSABILITY_COUNT)));
    }
    Collections.sort(tuples, new Comparator<Tuple>() {
        @Override
        public int compare(Tuple a, Tuple b) {
            // Integer.compare avoids the overflow risk of the subtraction idiom.
            return Integer.compare(a._1, b._1);
        }
    });
    return tuples;
}
/**
 * Counts log entries with a compressability ratio in [from, to) using the
 * pre-aggregated histogram table.
 *
 * NOTE(review): buckets appear to be the ratio scaled by 1000 (see the
 * Math.round(from * 1000) in the original) — confirm against the ingest
 * side. The old loop compared the scaled start bucket against the
 * UNSCALED 'to' and never advanced currentBucket, so it either skipped the
 * work entirely or spun forever; both are fixed here.
 */
@Override
public long countByCompressability(double from, double to) {
    Logger.debug("Counting by compressability");
    long startTime = System.currentTimeMillis();
    long totalCount = 0l;
    int currentBucket = (int) Math.round(from * 1000);
    int endBucket = (int) Math.round(to * 1000);
    while (currentBucket < endBucket) {
        String query = "SELECT * FROM " + TABLE_COMPRESSABILITY_HISTOGRAM + " WHERE " +
                CassandraProperties.FIELD_COMPRESSABILITY_BUCKET + " = " + currentBucket;
        Row r = session.execute(query).one();
        if (r != null) // empty buckets have no row — avoid NPE
            totalCount += r.getLong(CassandraProperties.FIELD_COMPRESSABILITY_COUNT);
        currentBucket++;
    }
    Logger.debug("Done. Got " + totalCount + " - took " + (System.currentTimeMillis() - startTime));
    return totalCount;
}
/**
 * Counts all crawl log entries recorded for the given host.
 * Note: unlike the other query methods, this does not catch
 * NoHostAvailableException — it propagates to the caller.
 */
@Override
public long countEntriesForHost(String hostname) {
    String cql = "SELECT COUNT(*) FROM " + TABLE_CRAWL_LOG + " WHERE " +
            CassandraProperties.FIELD_CRAWL_LOG_HOST + " = '" + hostname + "';";
    return session.execute(cql).one().getLong("count");
}
/**
 * Lazily streams all crawl log entries for the given host, wrapping each
 * driver row in a CassandraCrawlLogEntry as it is consumed.
 */
@Override
public Iterator<CrawlLogEntry> getEntriesForHost(String hostname) {
    String cql = "SELECT * FROM " + TABLE_CRAWL_LOG +
            " WHERE " + CassandraProperties.FIELD_CRAWL_LOG_HOST + "='" + hostname + "';";
    final Iterator<Row> rowIterator = session.execute(cql).iterator();
    return new Iterator<CrawlLogEntry>() {
        @Override
        public boolean hasNext() {
            return rowIterator.hasNext();
        }

        @Override
        public CrawlLogEntry next() {
            return new CassandraCrawlLogEntry(rowIterator.next());
        }

        @Override
        public void remove() {
            rowIterator.remove();
        }
    };
}
/**
 * Extracts the distinct hosts carrying a given annotation.
 *
 * TODO not yet implemented for Cassandra — the sketched approach is to
 * page through crawl_uris.annotations by (annotation, host), one host per
 * query, collecting each distinct host. Until then this always returns an
 * empty (mutable) list.
 */
@Override
public List<String> extractHostsForAnnotation(String annotation) {
    return new ArrayList<String>();
}
}