package uk.bl.monitrix.analytics; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import play.Logger; import uk.bl.monitrix.model.CrawlLogEntry; /** * Helper functions for computing various stats from log entries. * @author Rainer Simon <rainer.simon@ait.ac.at> */ public class LogAnalytics { // String constant - 'text' prefix for checking MIME types private static final String TEXT = "text"; // Virus flag 'FOUND' private static final String FOUND = "FOUND"; /** * Extracts the unique crawler IDs from the log entries. * @param log the log entries * @return the IDs of the crawlers that crawled this host */ public static List<String> getCrawlerIDs(Iterator<CrawlLogEntry> log) { long computeStart = System.currentTimeMillis(); Set<String> crawlers = new HashSet<String>(); while (log.hasNext()) crawlers.add(log.next().getWorkerThread()); List<String> sorted = new ArrayList<String>(crawlers); Collections.sort(sorted); Logger.info("Extracted Crawler IDs - took " + (System.currentTimeMillis() - computeStart) + "ms"); return sorted; } /** * Computes the average crawl rate values (i.e. average number of URLs crawled per minute) * for the given log entries. * @param log the log entries * @return average download crawl rate in URLs/minute */ public static long getAverageCrawlRate(Iterator<CrawlLogEntry> log) { long startTime = Long.MAX_VALUE; long endTime = Long.MIN_VALUE; long totalURLs = 0; while (log.hasNext()) { CrawlLogEntry next = log.next(); long timestamp = next.getLogTimestamp().getTime(); if (timestamp > endTime) endTime = timestamp; if (timestamp < startTime) startTime = timestamp; totalURLs++; } double urlsPerMillisecond = ((double) totalURLs) / ((double) (endTime - startTime)); return Math.round(urlsPerMillisecond * 60000); } /** * Computes the average download rate value (in MB/minute) for the given log entries. * @param log the log entries * @return average download rate in MB/minute */ public static long getAverageDownloadRate(Iterator<CrawlLogEntry> log) { long startTime = Long.MAX_VALUE; long endTime = Long.MIN_VALUE; long downloadVolume = 0; while (log.hasNext()) { CrawlLogEntry next = log.next(); long timestamp = next.getLogTimestamp().getTime(); if (timestamp > endTime) endTime = timestamp; if (timestamp < startTime) startTime = timestamp; downloadVolume += next.getDownloadSize(); } double bytesPerMillisecond = ((double) downloadVolume) / ((double) (endTime - startTime)); return Math.round(bytesPerMillisecond * 60000); } public static String extractVirusName(CrawlLogEntry entry) { String annotations = entry.getAnnotations(); if (annotations.contains(FOUND)) { // Example: 84.45.47.58,1: stream: JS.Redir-11 FOUND String virusName = annotations.substring(0, annotations.indexOf(FOUND)).trim(); virusName = virusName.substring(virusName.lastIndexOf(' ') + 1); return virusName; } return null; } /** * Computes the distribution of fetch status codes in the log entries. * @param log the log entries * @return the distribution of fetch status codes */ public static List<PieChartValue> getFetchStatusDistribution(Iterator<CrawlLogEntry> log) { long computeStart = System.currentTimeMillis(); Map<Integer, Integer> codes = new HashMap<Integer, Integer>(); while (log.hasNext()) { Integer code = log.next().getHTTPCode(); Integer value = codes.get(code); if (value == null) value = Integer.valueOf(1); else value = Integer.valueOf(value.intValue() + 1); codes.put(code, value); } List<PieChartValue> pieChart = new ArrayList<PieChartValue>(); for (Entry<Integer, Integer> entry : codes.entrySet()) pieChart.add(new PieChartValue(entry.getKey().toString(), entry.getValue())); Logger.info("Computed fetch status distribution - took " + (System.currentTimeMillis() - computeStart) + "ms"); return pieChart; } /** * Computes the distribution of MIME types from the log entries. * @param log the log entries * @return the distribution of MIME types */ public static List<PieChartValue> getMimeTypeDistribution(Iterator<CrawlLogEntry> log) { long computeStart = System.currentTimeMillis(); Map<String, Integer> mimeTypes = new HashMap<String, Integer>(); while (log.hasNext()) { String mime = log.next().getContentType(); Integer count = mimeTypes.get(mime); if (count == null) count = Integer.valueOf(1); else count = Integer.valueOf(count.intValue() + 1); mimeTypes.put(mime, count); } List<PieChartValue> pieChart = new ArrayList<PieChartValue>(); for (Entry<String, Integer> entry : mimeTypes.entrySet()) pieChart.add(new PieChartValue(entry.getKey(), entry.getValue())); Logger.info("Computed MIME type distribution - took " + (System.currentTimeMillis() - computeStart) + "ms"); return pieChart; } /** * Helper function to compute the ratio of text MIME types (i.e. every type starting with 'text/') vs. * all other types. * @param log the log entries * @return the ratio text/non-text resources */ public static double getTextToNonTextResourceRatio(Iterator<CrawlLogEntry> log) { int text = 0; int nonText = 0; while (log.hasNext()) { String contentType = log.next().getContentType(); if (contentType.startsWith(TEXT)) text++; else nonText++; } if (nonText == 0) return 99; return ((double) text)/((double) nonText); } /** * Computes the distribution of clean vs. virused URLs from the log entries. * @param log the log entries * @return the virus distribution */ public static List<PieChartValue> getVirusDistribution(Iterator<CrawlLogEntry> log) { long computeStart = System.currentTimeMillis(); // TODO dummy impl only - improve so that viruses are recorded by name int clean = 0; int infected = 0; while (log.hasNext()) { String annotations = log.next().getAnnotations(); if (annotations.contains("FOUND")) { infected++; } else { clean++; } } List<PieChartValue> pieChart = new ArrayList<PieChartValue>(); pieChart.add(new PieChartValue("Clean", clean)); if (infected > 0) pieChart.add(new PieChartValue("Infected", infected)); Logger.info("Computed virus distribution - took " + (System.currentTimeMillis() - computeStart) + "ms"); return pieChart; } public static List<TimeseriesValue> getCrawledURLsHistory(Collection<CrawlLogEntry> log, int maxDatapoints) { Logger.info("Computing URL history timeseries for " + log.size() + " log entries"); // Get log start and end time long logStartTime = Long.MAX_VALUE; long logEndTime = 0; long count = 0; for (CrawlLogEntry entry : log) { long timestamp = entry.getLogTimestamp().getTime(); if (timestamp > logEndTime) logEndTime = timestamp; if (timestamp < logStartTime) logStartTime = timestamp; count++; } // Compute timeseries resolution (= # of millis in one data point bucket) if( count < 2 * maxDatapoints ) maxDatapoints = (int) (count / 2); long resolution = (logEndTime - logStartTime) / maxDatapoints; // (timeslot -> # of URLs) Map<Long, TimeseriesValue> graph = new HashMap<Long, TimeseriesValue>(maxDatapoints); for (CrawlLogEntry entry : log) { long timeslot = (entry.getLogTimestamp().getTime() - logStartTime) / resolution; TimeseriesValue urlCount = graph.get(timeslot); if (urlCount == null) graph.put(timeslot, new TimeseriesValue(logStartTime + timeslot * resolution, 1)); else urlCount.setValue(urlCount.getValue() + 1); } List<TimeseriesValue> timeseries = new ArrayList<TimeseriesValue>(graph.values()); Collections.sort(timeseries); Logger.info("Done."); return timeseries; } }