package uk.bl.monitrix.database.mongodb.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import uk.bl.monitrix.database.mongodb.MongoProperties;
import uk.bl.monitrix.model.KnownHost;
import uk.bl.monitrix.model.KnownHostList;
import uk.bl.monitrix.model.SearchResult;
import uk.bl.monitrix.model.SearchResultItem;
/**
* A MongoDB-backed implementation of {@link KnownHostList}.
* @author Rainer Simon <rainer.simon@ait.ac.at>
*/
public class MongoKnownHostList implements KnownHostList {
// MongoDB query operator for selecting documents where the field value equals any value in a list
private static final String MONGO_QUERY_ALL = "$all";
// MongoDB query operator for selecting documents where the field value is greater or equal to a specified value
private static final String MONGO_QUERY_GREATER_OR_EQUAL = "$gte";
protected DBCollection collection;
// A simple in-memory buffer for quick host lookups
// private Set<String> knownHostsLookupCache = null;
protected Map<String, MongoKnownHost> cache = new HashMap<String, MongoKnownHost>();
public MongoKnownHostList(DB db) {
this.collection = db.getCollection(MongoProperties.COLLECTION_KNOWN_HOSTS);
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_HOSTNAME, 1));
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_HOSTNAME_TOKENIZED, 1));
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_TLD, 1));
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS, 1));
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION, 1));
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS, -1));
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE, -1));
this.collection.ensureIndex(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE, -1));
}
@Override
public long count() {
return collection.count();
}
@Override
public long countSuccessful() {
DBObject query = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS, new BasicDBObject("$exists", true));
return collection.count(query);
}
@Override
public long getMaxFetchDuration() {
DBCursor cursor = collection.find().sort(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION, -1)).limit(1);
if (cursor.hasNext()) {
MongoKnownHost h = new MongoKnownHost(cursor.next());
return (long) h.getAverageFetchDuration();
}
return 0;
}
@Override
public boolean isKnown(String hostname) {
if (cache.containsKey(hostname))
return true;
DBObject dbo = collection.findOne(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_HOSTNAME, hostname));
if (dbo == null)
return false;
MongoKnownHost wrapped = new MongoKnownHost(dbo);
cache.put(wrapped.getHostname(), wrapped);
return true;
}
@Override
public KnownHost getKnownHost(String hostname) {
if (cache.containsKey(hostname))
return cache.get(hostname);
DBObject dbo = collection.findOne(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_HOSTNAME, hostname));
if (dbo == null)
return null;
MongoKnownHost wrapped = new MongoKnownHost(dbo);
cache.put(hostname, wrapped);
return wrapped;
}
@Override
public SearchResult searchHosts(String query, int limit, int offset) {
// Parse query
List<String> tokens = Arrays.asList(KnownHost.tokenizeName(query));
DBObject q = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_HOSTNAME_TOKENIZED,
new BasicDBObject(MONGO_QUERY_ALL, tokens));
return search(query, q, limit, offset);
}
@Override
public SearchResult searchByTopLevelDomain(String tld, int limit, int offset) {
DBObject q = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_TLD, tld);
return search(tld, q, limit, offset);
}
@Override
public SearchResult searchByAverageFetchDuration(long min, long max, int limit, int offset) {
DBObject query = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION,
new BasicDBObject("$gt", min).append("$lte", max));
return search(null, query, limit, offset);
}
@Override
public SearchResult searchByAverageRetries(int min, int max, int limit, int offset) {
DBObject query = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_AVG_RETRY_RATE,
new BasicDBObject("$gte", min).append("$lt", max));
return search(null, query, limit, offset);
}
@Override
public SearchResult searchByRobotsBlockPercentage(double min, double max, int limit, int offset) {
DBObject query = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE,
new BasicDBObject("$gte", min).append("$lt", max));
return search(null, query, limit, offset);
}
@Override
public SearchResult searchByRedirectPercentage(double min, double max, int limit, int offset) {
DBObject query = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE,
new BasicDBObject("$gte", min).append("$lt", max));
return search(null, query, limit, offset);
}
private SearchResult search(String queryString, DBObject query, int limit, int offset) {
long startTime = System.currentTimeMillis();
long total = collection.count(query);
List<SearchResultItem> hostnames = new ArrayList<SearchResultItem>();
if (limit > 0) {
DBCursor cursor = collection.find(query).skip(offset).limit(limit);
// Right now the number of URLs per host are packed into the 'description field' - not ideal!
// TODO we need to find a better way to handle 'search result metadata'
while (cursor.hasNext()) {
KnownHost host = new MongoKnownHost(cursor.next());
hostnames.add(new SearchResultItem(host.getHostname(), Long.toString(host.getCrawledURLs())));
}
}
return new SearchResult(null, total, hostnames, limit, offset, System.currentTimeMillis() - startTime);
}
@Override
public List<KnownHost> getCrawledHosts(long since) {
DBObject query = new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS,
new BasicDBObject(MONGO_QUERY_GREATER_OR_EQUAL, since));
List<KnownHost> hostnames = new ArrayList<KnownHost>();
DBCursor cursor = collection.find(query).sort(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS, -1));
while (cursor.hasNext())
hostnames.add(new MongoKnownHost(cursor.next()));
return hostnames;
}
@Override
@SuppressWarnings("unchecked")
public List<String> getTopLevelDomains() {
return (List<String>) collection.distinct(MongoProperties.FIELD_KNOWN_HOSTS_TLD);
}
@Override
public long countForTopLevelDomain(String tld) {
return collection.count(new BasicDBObject(MongoProperties.FIELD_KNOWN_HOSTS_TLD, tld));
}
}