package uk.bl.monitrix.analytics;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import uk.bl.monitrix.model.CrawlStatsUnit;
/**
* Helper functions for computing various stats from CrawlStats collection.
* @author Rainer Simon <rainer.simon@ait.ac.at>
*/
public class CrawlStatsAnalytics {
/**
* Extracts the datavolume timeseries (i.e. the amount of data that has been downloaded over time)
* from the provided crawl stats. The resulting timeseries will be resampled from its original
* resolution, so that the result has at most <code>maxDatapoints</code>
* data points.
* @param crawlStats the source crawl stats
* @param maxDatapoints the maximum number of datapoints the timeseries should have
* @return the timeseries
*/
public static List<TimeseriesValue> getDatavolumeHistory(final List<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return resample(new AbstractList<TimeseriesValue>() {
@Override
public TimeseriesValue get(int index) {
CrawlStatsUnit unit = crawlStats.get(index);
return new TimeseriesValue(unit.getTimestamp(), unit.getDownloadVolume());
}
@Override
public int size() {
return crawlStats.size();
}
}, crawlStats.size() / maxDatapoints);
}
public static List<TimeseriesValue> getDatavolumeHistory(Iterator<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return getDatavolumeHistory(toList(crawlStats), maxDatapoints);
}
/**
* Extracts the crawled URLs timeseries (i.e. the number of URLs visited over time) from the provided
* crawl stats. The resulting timeseries will be resampled from its original resolution, so that the
* result has at most <code>maxDatapoints</code> data points.
* @param crawlStats the source crawl stats
* @param maxDatapoints the maximum number of datapoints the timeseries should have
* @return the timeseries
*/
public static List<TimeseriesValue> getCrawledURLsHistory(final List<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return resample(new AbstractList<TimeseriesValue>() {
@Override
public TimeseriesValue get(int index) {
CrawlStatsUnit unit = crawlStats.get(index);
return new TimeseriesValue(unit.getTimestamp(), unit.getNumberOfURLsCrawled());
}
@Override
public int size() {
return crawlStats.size();
}
}, crawlStats.size() / maxDatapoints);
}
public static List<TimeseriesValue> getCrawledURLsHistory(Iterator<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return getCrawledURLsHistory(toList(crawlStats), maxDatapoints);
}
/**
* Extracts the new hosts timeseries (i.e. the number of hosts that were visited for the
* first time over the duration of the crawl. The timeseries will be resampled from its
* internal base resolution, so that the result has at most <code>maxDatapoints</code>
* data points.
* @param crawlStats the source crawl stats
* @param maxDatapoints the maximum number of datapoints the timeseries should have
* @return
*/
public static List<TimeseriesValue> getNewHostsCrawledHistory(final List<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return resample(new AbstractList<TimeseriesValue>() {
@Override
public TimeseriesValue get(int index) {
CrawlStatsUnit unit = crawlStats.get(index);
return new TimeseriesValue(unit.getTimestamp(), unit.getNumberOfNewHostsCrawled());
}
@Override
public int size() {
return crawlStats.size();
}
}, crawlStats.size() / maxDatapoints);
}
public static List<TimeseriesValue> getNewHostsCrawledHistory(Iterator<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return getNewHostsCrawledHistory(toList(crawlStats), maxDatapoints);
}
public static List<TimeseriesValue> getCompletedHostsHistory(final List<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return resample(new AbstractList<TimeseriesValue>() {
@Override
public TimeseriesValue get(int index) {
CrawlStatsUnit unit = crawlStats.get(index);
return new TimeseriesValue(unit.getTimestamp(), unit.countCompletedHosts());
}
@Override
public int size() {
return crawlStats.size();
}
}, crawlStats.size() / maxDatapoints);
}
public static List<TimeseriesValue> getCompletedHostsHistory(Iterator<CrawlStatsUnit> crawlStats, int maxDatapoints) {
return getCompletedHostsHistory(toList(crawlStats), maxDatapoints);
}
private static List<TimeseriesValue> resample(List<TimeseriesValue> series, int factor) {
Iterator<TimeseriesValue> original = series.iterator();
List<TimeseriesValue> resampled = new ArrayList<TimeseriesValue>();
while (original.hasNext()) {
TimeseriesValue next = original.next();
int counter = 0;
long timestamp = next.getTimestamp();
long aggregatedValue = next.getValue();
while (original.hasNext() && counter < (factor - 1)) {
next = original.next();
aggregatedValue += next.getValue();
counter++;
}
resampled.add(new TimeseriesValue(timestamp, aggregatedValue));
}
return resampled;
}
private static List<CrawlStatsUnit> toList(Iterator<CrawlStatsUnit> iterator) {
List<CrawlStatsUnit> list = new ArrayList<CrawlStatsUnit>();
while (iterator.hasNext())
list.add(iterator.next());
return list;
}
}