package uk.bl.monitrix.database.mongodb.model; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import play.Logger; import com.mongodb.BasicDBObject; import com.mongodb.DB; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import uk.bl.monitrix.database.mongodb.MongoProperties; import uk.bl.monitrix.model.CrawlLog; import uk.bl.monitrix.model.CrawlLogEntry; import uk.bl.monitrix.model.SearchResult; import uk.bl.monitrix.model.SearchResultItem; /** * A MongoDB-backed implementation of {@link CrawlLog}. * @author Rainer Simon <rainer.simon@ait.ac.at> * */ public class MongoCrawlLog extends CrawlLog { protected DBCollection collection; public MongoCrawlLog(DB db) { this.collection = db.getCollection(MongoProperties.COLLECTION_CRAWL_LOG); // The Heritrix Log collection is indexed by crawl log id, timestamp, url, hostname, annotations and HTTP retries this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_LOG_ID, 1)); this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_TIMESTAMP, 1)); this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_URL, 1)); this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_HOST, 1)); this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_ANNOTATIONS_TOKENIZED, 1)); this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_RETRIES, 1)); this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_COMPRESSABILITY, 1)); } @Override public long getCrawlStartTime() { // TODO cache long crawlStartTime = 0; DBCursor cursor = collection.find().limit(1).sort(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_TIMESTAMP, 1)); while (cursor.hasNext()) crawlStartTime = new MongoCrawlLogEntry(cursor.next()).getLogTimestamp().getTime(); return crawlStartTime; } @Override public long getTimeOfLastCrawlActivity() { // TODO cache long lastCrawlActivity = 0; DBCursor cursor = collection.find().limit(1).sort(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_TIMESTAMP, -1)); while (cursor.hasNext()) lastCrawlActivity = new MongoCrawlLogEntry(cursor.next()).getLogTimestamp().getTime(); return lastCrawlActivity; } @Override public List<CrawlLogEntry> getMostRecentEntries(int n) { DBCursor cursor = collection.find().sort(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_TIMESTAMP, -1)).limit(n); List<CrawlLogEntry> recent = new ArrayList<CrawlLogEntry>(); while(cursor.hasNext()) recent.add(new MongoCrawlLogEntry(cursor.next())); return recent; } @Override public long countEntries() { return collection.count(); } @Override public long countRevisits() { return searchByAnnotation("warcRevisit:digest", 1, 0).totalResults(); } @Override @SuppressWarnings("unchecked") public List<String> listLogIds() { return (List<String>) collection.distinct(MongoProperties.FIELD_CRAWL_LOG_LOG_ID); } @Override public long countEntriesForLog(String logId) { return collection.count(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_LOG_ID, logId)); } @Override public List<CrawlLogEntry> getEntriesForURL(String url) { DBObject q = new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_URL, url); List<CrawlLogEntry> entries = new ArrayList<CrawlLogEntry>(); DBCursor cursor = collection.find(q); while (cursor.hasNext()) entries.add(new MongoCrawlLogEntry(cursor.next())); return entries; } // TODO eliminate code duplication @Override public SearchResult searchByURL(String query, int limit, int offset) { long startTime = System.currentTimeMillis(); DBObject q = new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_URL, query); long total = collection.count(q); List<SearchResultItem> urls = new ArrayList<SearchResultItem>(); DBCursor cursor = collection.find(q).skip(offset).limit(limit); while (cursor.hasNext()) { CrawlLogEntry entry = new MongoCrawlLogEntry(cursor.next()); urls.add(new SearchResultItem(entry.getURL(), entry.toString())); } return new SearchResult(query, total, urls, limit, offset, System.currentTimeMillis() - startTime); } @Override public SearchResult searchByAnnotation(String annotation, int limit, int offset) { long startTime = System.currentTimeMillis(); DBObject q = new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_ANNOTATIONS_TOKENIZED, annotation); long total = collection.count(q); List<SearchResultItem> urls = new ArrayList<SearchResultItem>(); DBCursor cursor = collection.find(q).skip(offset).limit(limit); while (cursor.hasNext()) { CrawlLogEntry entry = new MongoCrawlLogEntry(cursor.next()); urls.add(new SearchResultItem(entry.getURL(), entry.toString())); } return new SearchResult(annotation, total, urls, limit, offset, System.currentTimeMillis() - startTime); } @Override public SearchResult searchByCompressability(double from, double to, int limit, int offset) { Logger.debug("Searching by compressability"); long startTime = System.currentTimeMillis(); DBObject query = new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_COMPRESSABILITY, new BasicDBObject("$gte", from).append("$lt", to)); long total = collection.count(query); List<SearchResultItem> urls = new ArrayList<SearchResultItem>(); if (limit > 0) { DBCursor cursor = collection.find(query).skip(offset).limit(limit); while (cursor.hasNext()) { CrawlLogEntry entry = new MongoCrawlLogEntry(cursor.next()); urls.add(new SearchResultItem(entry.getURL(), entry.toString())); } } Logger.debug("Done - took " + (System.currentTimeMillis() - startTime)); return new SearchResult(null, total, urls, limit, offset, System.currentTimeMillis() - startTime); } @Override public long countByCompressability(double from, double to) { return searchByCompressability(from, to, 0, 0).totalResults(); } @Override public long countEntriesForHost(String hostname) { return collection.count(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_HOST, hostname)); } @Override public Iterator<CrawlLogEntry> getEntriesForHost(String hostname) { long limit = collection.count(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_HOST, hostname)); // We're using a count first to improve performance (?) // Cf. http://docs.mongodb.org/manual/applications/optimization/ final DBCursor cursor = collection .find(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_HOST, hostname)) .hint(new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_HOST, 1)) .limit((int) limit); return new Iterator<CrawlLogEntry>() { @Override public boolean hasNext() { return cursor.hasNext(); } @Override public CrawlLogEntry next() { return new MongoCrawlLogEntry(cursor.next()); } @Override public void remove() { cursor.remove(); } }; } @Override @SuppressWarnings("unchecked") public List<String> extractHostsForAnnotation(String annotation) { DBObject q = new BasicDBObject(MongoProperties.FIELD_CRAWL_LOG_ANNOTATIONS_TOKENIZED, annotation); return (List<String>) collection.distinct(MongoProperties.FIELD_CRAWL_LOG_HOST, q); } }