package uk.bl.monitrix.database.cassandra.model;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.commons.lang.StringUtils;
import play.Logger;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import uk.bl.monitrix.model.CrawlStats;
import uk.bl.monitrix.model.CrawlStatsUnit;
import uk.bl.monitrix.model.IngestSchedule;
import uk.bl.monitrix.model.IngestedLog;
import uk.bl.monitrix.database.cassandra.CassandraProperties;
/**
* A CassandraDB-backed implementation of {@link CrawlStats}.
* @author Rainer Simon <rainer.simon@ait.ac.at>
*/
public class CassandraCrawlStats implements CrawlStats {

	/** Fully-qualified (keyspace.table) name of the crawl-stats table. */
	private final String TABLE_STATS = CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_CRAWL_STATS;

	protected Session session;

	private IngestSchedule ingestSchedule;

	// A simple in-memory buffer for quick stats lookups.
	// NOTE(review): keyed by timestamp only, so units from different crawls at
	// the same timestamp would collide. Acceptable while the "conflate stats
	// from different crawls" TODOs below remain open, but verify before
	// running multiple concurrent crawls. (Type left unchanged because the
	// field is protected and may be used by subclasses.)
	protected Map<Long, CassandraCrawlStatsUnit> cache = new HashMap<Long, CassandraCrawlStatsUnit>();

	public CassandraCrawlStats(Session session, IngestSchedule ingestSchedule) {
		this.session = session;
		this.ingestSchedule = ingestSchedule;
	}

	/**
	 * Returns crawl stats across all ingested logs, in chronological order.
	 * Units from individual crawls that share the same timestamp are conflated
	 * into a single {@link CrawlStatsUnit} whose numeric fields are the sums
	 * over the contributing units.
	 */
	@Override
	public Iterator<CrawlStatsUnit> getCrawlStats() {
		Logger.info("Getting crawl stats");

		// Collect the IDs of all ingested logs.
		List<String> logs = new ArrayList<String>();
		for (IngestedLog l : ingestSchedule.getLogs()) {
			logs.add(l.getId());
		}

		// FIX: IDs must be joined with "','" so each one is individually
		// quoted. The previous join with "," produced IN ('id1,id2') — a
		// single-element IN list that matches nothing once more than one log
		// is ingested.
		// NOTE(review): the CQL is still built by string concatenation; log
		// IDs are assumed not to contain quotes. Prefer a PreparedStatement
		// if IDs can ever be user-supplied.
		final Iterator<Row> cursor =
			session.execute("SELECT * FROM " + TABLE_STATS + " WHERE " + CassandraProperties.FIELD_CRAWL_STATS_CRAWL_ID +
				" IN ('" + StringUtils.join(logs, "','") + "') ORDER BY " + CassandraProperties.FIELD_CRAWL_STATS_TIMESTAMP + ";")
			.iterator();

		// Group the units by timestamp; TreeMap keeps keys in chronological order.
		Map<Long, List<CrawlStatsUnit>> groupedByTimestamp = new TreeMap<Long, List<CrawlStatsUnit>>();
		while (cursor.hasNext()) {
			CrawlStatsUnit u = new CassandraCrawlStatsUnit(cursor.next());
			List<CrawlStatsUnit> unitsFromIndividualCrawls = groupedByTimestamp.get(u.getTimestamp());
			if (unitsFromIndividualCrawls == null) {
				unitsFromIndividualCrawls = new ArrayList<CrawlStatsUnit>();
				groupedByTimestamp.put(u.getTimestamp(), unitsFromIndividualCrawls);
			}
			unitsFromIndividualCrawls.add(u);
		}

		// Conflate each timestamp's units into a single summed view.
		List<CrawlStatsUnit> conflated = new ArrayList<CrawlStatsUnit>();
		for (final Entry<Long, List<CrawlStatsUnit>> entry : groupedByTimestamp.entrySet()) {
			final List<CrawlStatsUnit> units = entry.getValue();
			conflated.add(new CrawlStatsUnit() {
				@Override
				public long getTimestamp() {
					return entry.getKey();
				}

				@Override
				public long getNumberOfURLsCrawled() {
					long urls = 0;
					for (CrawlStatsUnit u : units)
						urls += u.getNumberOfURLsCrawled();
					return urls;
				}

				@Override
				public long getNumberOfNewHostsCrawled() {
					long hosts = 0;
					for (CrawlStatsUnit u : units)
						hosts += u.getNumberOfNewHostsCrawled();
					return hosts;
				}

				@Override
				public long getDownloadVolume() {
					long volume = 0;
					for (CrawlStatsUnit u : units)
						volume += u.getDownloadVolume();
					return volume;
				}

				@Override
				public long countCompletedHosts() {
					long hosts = 0;
					for (CrawlStatsUnit u : units)
						hosts += u.countCompletedHosts();
					return hosts;
				}
			});
		}
		return conflated.iterator();
	}

	/**
	 * Returns the stats for a single crawl, in timestamp order. Rows are
	 * wrapped lazily as the returned iterator is consumed.
	 *
	 * @param crawl_id the ID of the crawl log
	 */
	@Override
	public Iterator<CrawlStatsUnit> getCrawlStats(String crawl_id) {
		final Iterator<Row> cursor =
			session.execute("SELECT * FROM " + TABLE_STATS + " WHERE " + CassandraProperties.FIELD_CRAWL_STATS_CRAWL_ID + "='" +
				crawl_id + "' ORDER BY " + CassandraProperties.FIELD_CRAWL_STATS_TIMESTAMP + ";")
			.iterator();

		return new Iterator<CrawlStatsUnit>() {
			@Override
			public boolean hasNext() {
				return cursor.hasNext();
			}

			@Override
			public CrawlStatsUnit next() {
				return new CassandraCrawlStatsUnit(cursor.next());
			}

			@Override
			public void remove() {
				cursor.remove();
			}
		};
	}

	/**
	 * Returns the stats unit for the given timestamp and crawl, or
	 * {@code null} if no matching row exists. Non-null results are memoized
	 * in {@link #cache} (keyed by timestamp only — see note on the field).
	 */
	@Override
	public CrawlStatsUnit getStatsForTimestamp(long timestamp, String crawl_id) {
		// TODO conflate stats from different crawls
		if (cache.containsKey(timestamp))
			return cache.get(timestamp);

		ResultSet results = session.execute("SELECT * FROM " + TABLE_STATS + " WHERE " + CassandraProperties.FIELD_CRAWL_STATS_TIMESTAMP +
			"=" + timestamp + " AND " + CassandraProperties.FIELD_CRAWL_STATS_CRAWL_ID + "='" + crawl_id + "' ORDER BY " +
			CassandraProperties.FIELD_CRAWL_STATS_TIMESTAMP + ";");

		if (results.isExhausted()) {
			return null;
		} else {
			CassandraCrawlStatsUnit stats = new CassandraCrawlStatsUnit(results.one());
			cache.put(timestamp, stats);
			return stats;
		}
	}

	/**
	 * Returns up to {@code n} stats units.
	 * NOTE(review): the query orders by timestamp ascending with LIMIT n,
	 * which looks like it returns the OLDEST n rows rather than the most
	 * recent — confirm against the table's clustering order (a descending
	 * clustering key or ORDER BY ... DESC may be intended).
	 *
	 * @param n the maximum number of units to return
	 */
	@Override
	public List<CrawlStatsUnit> getMostRecentStats(int n) {
		// TODO conflate stats from different crawls
		Iterator<Row> cursor =
			session.execute("SELECT * FROM " + TABLE_STATS + " ORDER BY " +
				CassandraProperties.FIELD_CRAWL_STATS_TIMESTAMP + " LIMIT " + n + ";").iterator();

		List<CrawlStatsUnit> recent = new ArrayList<CrawlStatsUnit>();
		while (cursor.hasNext())
			recent.add(new CassandraCrawlStatsUnit(cursor.next()));
		return recent;
	}
}