package uk.bl.monitrix.database.cassandra.model;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import play.Logger;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.Row;
import uk.bl.monitrix.database.cassandra.CassandraProperties;
import uk.bl.monitrix.model.KnownHost;
import uk.bl.monitrix.model.KnownHostList;
import uk.bl.monitrix.model.SearchResult;
import uk.bl.monitrix.model.SearchResultItem;
/**
 * A CassandraDB-backed implementation of {@link KnownHostList}.
 *
 * <p>All queries are issued as CQL strings through the {@link Session} handed to the
 * constructor. Caller-supplied strings are embedded as CQL string literals via
 * {@link #quote(String)}, which escapes embedded single quotes.
 *
 * <p>Hosts returned by {@link #getKnownHost(String)} are kept in a plain in-memory
 * {@link HashMap}; the map is never evicted or refreshed by this class.
 *
 * @author Rainer Simon <rainer.simon@ait.ac.at>
 */
public class CassandraKnownHostList implements KnownHostList {

    private static final String TABLE_HOSTS = CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_KNOWN_HOSTS;

    private static final String TABLE_TLDS = CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_KNOWN_TLDS;

    protected Session session;

    // A simple in-memory buffer for quick host lookups
    protected Map<String, CassandraKnownHost> cache = new HashMap<String, CassandraKnownHost>();

    /**
     * @param session the Cassandra session all queries of this list are executed on
     */
    public CassandraKnownHostList(Session session) {
        this.session = session;
    }

    /**
     * Renders a value as a CQL string literal, doubling embedded single quotes so the
     * value cannot terminate the literal early. A {@code null} value is rendered as
     * the literal {@code 'null'}, which is what plain string concatenation produced.
     */
    private static String quote(String value) {
        return "'" + String.valueOf(value).replace("'", "''") + "'";
    }

    /** Builds the "select everything for one hostname" query shared by the lookup methods. */
    private static String selectHostByName(String hostname) {
        return "SELECT * FROM " + TABLE_HOSTS + " WHERE "
                + CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME + "=" + quote(hostname) + ";";
    }

    /** Advances the iterator by up to {@code count} rows, stopping early when it runs dry. */
    private static void skip(Iterator<Row> rows, int count) {
        for (int i = 0; i < count && rows.hasNext(); i++) {
            rows.next();
        }
    }

    /**
     * @return the number of rows in the known-hosts table
     */
    @Override
    public long count() {
        ResultSet results = session.execute("SELECT COUNT(*) FROM " + TABLE_HOSTS + " ;");
        return results.one().getLong("count");
    }

    /**
     * Counts the hosts with at least one successfully fetched URL. The filter is applied
     * client-side, so every row of the known-hosts table is read.
     */
    @Override
    public long countSuccessful() {
        String column = CassandraProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS;
        ResultSet results = session.execute("SELECT " + column + " FROM " + TABLE_HOSTS + ";");

        long successful = 0;
        for (Row row : results) {
            if (row.getLong(column) > 0) {
                successful++;
            }
        }
        return successful;
    }

    /**
     * Returns the largest average fetch duration stored for any host, rounded to the
     * nearest long, or 0 when the table is empty. Computed client-side over all rows.
     */
    @Override
    public long getMaxFetchDuration() {
        String column = CassandraProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION;
        // Only the one column is needed; avoid pulling whole rows over the wire.
        ResultSet results = session.execute("SELECT " + column + " FROM " + TABLE_HOSTS + ";");

        double max = 0;
        for (Row row : results) {
            double duration = row.getDouble(column);
            if (duration > max) {
                max = duration;
            }
        }
        return Math.round(max);
    }

    /**
     * @return {@code true} if the hostname is in the local cache or has a row in the
     *         known-hosts table
     */
    @Override
    public boolean isKnown(String hostname) {
        if (cache.containsKey(hostname)) {
            return true;
        }
        return !session.execute(selectHostByName(hostname)).isExhausted();
    }

    /**
     * Looks a host up, first in the local cache and then in the database. Hosts found in
     * the database are added to the cache.
     *
     * @return the host, or {@code null} if there is no row for the hostname
     */
    @Override
    public KnownHost getKnownHost(String hostname) {
        CassandraKnownHost knownHost = cache.get(hostname);
        if (knownHost != null) {
            return knownHost;
        }

        Row row = session.execute(selectHostByName(hostname)).one();
        if (row == null) {
            return null;
        }

        knownHost = new CassandraKnownHost(row);
        cache.put(hostname, knownHost);
        return knownHost;
    }

    /**
     * Searches hosts by exact hostname match. The query fetches {@code limit + offset}
     * rows and the first {@code offset} of them are discarded client-side.
     */
    @Override
    public SearchResult searchHosts(String query, int limit, int offset) {
        String cql = "SELECT * FROM " + TABLE_HOSTS + " WHERE "
                + CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME + "=" + quote(query)
                + " LIMIT " + (limit + offset) + ";";

        Iterator<Row> results = session.execute(cql).iterator();
        skip(results, offset);
        return search(query, results, limit, offset);
    }

    /**
     * Returns the hosts stored under the given top-level domain.
     *
     * NOTE(review): limit and offset are only echoed into the result; the query itself
     * is not paged, so all matching hosts are returned. Confirm whether that is intended.
     */
    @Override
    public SearchResult searchByTopLevelDomain(String tld, int limit, int offset) {
        ResultSet results = session.execute("SELECT * FROM " + TABLE_HOSTS + " WHERE "
                + CassandraProperties.FIELD_KNOWN_HOSTS_TLD + "=" + quote(tld) + ";");
        return search(tld, results.iterator(), limit, offset);
    }

    /**
     * Finds hosts whose {@code property} lies strictly between {@code min} and
     * {@code max}. One filtered query is issued per known top-level domain and the
     * rows are concatenated in TLD order.
     *
     * NOTE(review): the LIMIT clause is only added when both limit and offset are
     * positive (so a first page with offset 0 is unbounded), and the offset is skipped
     * once per TLD rather than once overall. Both look unintended but are preserved
     * here because callers may rely on the current result shape - confirm.
     */
    private SearchResult searchByRange(double min, double max, int limit, int offset, String property) {
        List<Row> concatenated = new ArrayList<Row>();

        for (String tld : getTopLevelDomains()) {
            StringBuilder cql = new StringBuilder();
            cql.append("SELECT * FROM ").append(TABLE_HOSTS)
               .append(" WHERE ").append(property).append(" > ").append(min)
               .append(" AND ").append(property).append(" < ").append(max)
               .append(" AND ").append(CassandraProperties.FIELD_KNOWN_HOSTS_TLD)
               .append(" =").append(quote(tld));
            if (limit > 0 && offset > 0) {
                cql.append(" LIMIT ").append(limit + offset);
            }
            cql.append(" ALLOW FILTERING ;");

            Iterator<Row> results = session.execute(cql.toString()).iterator();
            skip(results, offset);
            while (results.hasNext()) {
                concatenated.add(results.next());
            }
        }

        return search(null, concatenated.iterator(), limit, offset);
    }

    /** Range search over the average fetch duration column; see {@code searchByRange}. */
    @Override
    public SearchResult searchByAverageFetchDuration(long min, long max, int limit, int offset) {
        return searchByRange(min, max, limit, offset, CassandraProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION);
    }

    /** Range search over the average retry rate column; see {@code searchByRange}. */
    @Override
    public SearchResult searchByAverageRetries(int min, int max, int limit, int offset) {
        return searchByRange(min, max, limit, offset, CassandraProperties.FIELD_KNOWN_HOSTS_AVG_RETRY_RATE);
    }

    /** Range search over the robots-block percentage column; see {@code searchByRange}. */
    @Override
    public SearchResult searchByRobotsBlockPercentage(double min, double max, int limit, int offset) {
        return searchByRange(min, max, limit, offset, CassandraProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE);
    }

    /** Range search over the redirect percentage column; see {@code searchByRange}. */
    @Override
    public SearchResult searchByRedirectPercentage(double min, double max, int limit, int offset) {
        return searchByRange(min, max, limit, offset, CassandraProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE);
    }

    /**
     * Drains the cursor into a {@link SearchResult}. The reported total is the number of
     * rows left in the cursor, and the elapsed time covers only this conversion, not the
     * database query that produced the cursor.
     *
     * NOTE(review): {@code queryString} is currently unused; {@code null} is passed as the
     * first {@link SearchResult} argument. Confirm whether the query should be forwarded.
     */
    private SearchResult search(String queryString, Iterator<Row> cursor, int limit, int offset) {
        long startTime = System.currentTimeMillis();
        List<SearchResultItem> hostnames = new ArrayList<SearchResultItem>();

        // Right now the number of URLs per host are packed into the 'description field' - not ideal!
        // TODO we need to find a better way to handle 'search result metadata'
        while (cursor.hasNext()) {
            KnownHost host = new CassandraKnownHost(cursor.next());
            hostnames.add(new SearchResultItem(host.getHostname(), Long.toString(host.getCrawledURLs())));
        }

        // Every row yields exactly one item, so the list size is the result total.
        long total = hostnames.size();
        Logger.info("Total = "+total);
        return new SearchResult(null, total, hostnames, limit, offset, System.currentTimeMillis() - startTime);
    }

    /**
     * Not implemented for Cassandra: always returns a new, empty list regardless of
     * {@code since}.
     *
     * FIXME this seems not to be called anywhere, so take it out as it's not simple to implement in Cassandra?
     */
    @Override
    public List<KnownHost> getCrawledHosts(long since) {
        return new ArrayList<KnownHost>();
    }

    /**
     * @return every top-level domain stored in the known-TLDs table, in the order the
     *         database returns them
     */
    @Override
    public List<String> getTopLevelDomains() {
        String column = CassandraProperties.FIELD_KNOWN_TLDS_TLD;
        ResultSet results = session.execute("SELECT " + column + " from " + TABLE_TLDS + ";");

        List<String> tlds = new ArrayList<String>();
        for (Row row : results) {
            tlds.add(row.getString(column));
        }
        return tlds;
    }

    /**
     * @return the count stored for the given top-level domain, or 0 if the known-TLDs
     *         table has no row for it
     */
    @Override
    public long countForTopLevelDomain(String tld) {
        ResultSet results = session.execute("SELECT * FROM " + TABLE_TLDS + " WHERE "
                + CassandraProperties.FIELD_KNOWN_TLDS_TLD + "=" + quote(tld) + ";");

        // one() yields null for an empty result set; an unknown TLD simply has no hosts.
        Row row = results.one();
        if (row == null) {
            return 0;
        }
        return row.getLong(CassandraProperties.FIELD_KNOWN_TLDS_COUNT);
    }

    // FIXME Rounder
    protected double rounder_res = 100.0;

    // Not read anywhere in this class.
    protected double rounder_step = 1.0;

    /**
     * Scales the input by {@code rounder_res} and rounds down to a whole number
     * (the result is NOT divided back by the resolution).
     */
    protected double rounder(double in) {
        return Math.floor(rounder_res*in);
    }

}