package uk.bl.monitrix.database.cassandra.model;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import uk.bl.monitrix.database.cassandra.CassandraProperties;
import uk.bl.monitrix.model.KnownHost;
/**
 * A CassandraDB-backed implementation of {@link KnownHost}.
 *
 * <p>The column values of a single known-hosts row are copied into an in-memory
 * map when the object is constructed. Getters and setters operate on that map
 * only; nothing is written back to the database until {@link #save(Session)}
 * is called.
 *
 * <p>Crawler IDs are stored in one text column as a ';'-separated list, and the
 * map-type fields (fetch status codes, content types, virus stats) are stored
 * as JSON strings.
 *
 * @author Rainer Simon <rainer.simon@ait.ac.at>
 */
public class CassandraKnownHost extends KnownHost {

    private static final String TABLE = CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_KNOWN_HOSTS;

    /** Separator used for the crawler ID list inside its text column. */
    private static final String CRAWLER_ID_SEPARATOR = ";";

    /** Shared JSON (de)serializer; only ever used with its default configuration. */
    private static final ObjectMapper JSON = new ObjectMapper();

    /** Column name -> value, as read from the row and modified by the setters. */
    private Map<String, Object> cachedRow = new HashMap<String, Object>();

    /**
     * Copies all known-host columns of the given row into the local cache.
     *
     * @param row the database row to read from
     */
    public CassandraKnownHost(Row row) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME, row.getString(CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_TLD, row.getString(CassandraProperties.FIELD_KNOWN_HOSTS_TLD));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_DOMAIN, row.getString(CassandraProperties.FIELD_KNOWN_HOSTS_DOMAIN));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_SUBDOMAIN, row.getString(CassandraProperties.FIELD_KNOWN_HOSTS_SUBDOMAIN));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_FIRST_ACCESS, row.getLong(CassandraProperties.FIELD_KNOWN_HOSTS_FIRST_ACCESS));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS, row.getLong(CassandraProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLED_URLS, row.getLong(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLED_URLS));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS, row.getLong(CassandraProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION, row.getDouble(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_RETRY_RATE, row.getDouble(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_RETRY_RATE));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE, row.getDouble(CassandraProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE, row.getDouble(CassandraProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE));
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_TEXT_TO_NONTEXT_RATIO, row.getDouble(CassandraProperties.FIELD_KNOWN_HOSTS_TEXT_TO_NONTEXT_RATIO));

        // Crawler IDs
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLERS,
                parseCrawlerIds(row.getString(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLERS)));

        // Map-type data fields
        deserializeMap(row, CassandraProperties.FIELD_KNOWN_HOSTS_FETCH_STATUS_CODES);
        deserializeMap(row, CassandraProperties.FIELD_KNOWN_HOSTS_CONTENT_TYPES);
        deserializeMap(row, CassandraProperties.FIELD_KNOWN_HOSTS_VIRUS_STATS);
    }

    /**
     * Splits the ';'-separated crawler ID column into a mutable set.
     *
     * @param serialized the raw column value, may be {@code null} or empty
     * @return the crawler IDs; empty (never {@code null}) if the column holds no IDs
     */
    private static Set<String> parseCrawlerIds(String serialized) {
        Set<String> ids = new HashSet<String>();
        if (serialized == null) {
            return ids;
        }

        for (String id : Arrays.asList(serialized.split(CRAWLER_ID_SEPARATOR))) {
            // "".split(";") yields a single empty token - it must not become a crawler ID,
            // otherwise it would be written back as a stray separator on save
            if (!id.isEmpty()) {
                ids.add(id);
            }
        }
        return ids;
    }

    /**
     * Reads the JSON-encoded map stored in the given column and caches it under
     * the same key. A missing, empty or unparseable value results in an empty
     * map, so that the map getters never return {@code null} after construction.
     *
     * @param row the database row to read from
     * @param key the column name
     */
    private void deserializeMap(Row row, String key) {
        Map<String, Integer> result = new HashMap<String, Integer>();
        try {
            // We're using JSON to (de)serialize map-like data
            String serialized = row.getString(key);
            if (serialized != null && !serialized.isEmpty()) {
                @SuppressWarnings("unchecked")
                Map<String, Integer> deserialized = JSON.readValue(serialized, HashMap.class);
                if (deserialized != null) {
                    result = deserialized;
                }
            }
        } catch (IOException e) {
            // Jackson's parse/mapping exceptions are IOExceptions as well;
            // report the problem and fall back to the empty map
            e.printStackTrace();
        }
        cachedRow.put(key, result);
    }

    @Override
    public String getHostname() {
        return (String) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME);
    }

    @Override
    public String getTopLevelDomain() {
        return (String) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_TLD);
    }

    @Override
    public String getDomain() {
        return (String) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_DOMAIN);
    }

    @Override
    public String getSubdomain() {
        return (String) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_SUBDOMAIN);
    }

    @Override
    public long getFirstAccess() {
        return (Long) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_FIRST_ACCESS);
    }

    @Override
    public long getLastAccess() {
        return (Long) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS);
    }

    public void setLastAccess(long lastAccess) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS, lastAccess);
    }

    /**
     * @return a new list holding a copy of the cached crawler IDs
     */
    @Override
    @SuppressWarnings("unchecked")
    public List<String> getCrawlerIDs() {
        return new ArrayList<String>((Set<String>) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLERS));
    }

    /**
     * Adds a crawler ID to the cached set (no effect if it is already contained).
     *
     * @param id the crawler ID
     */
    public void addCrawlerID(String id) {
        @SuppressWarnings("unchecked")
        Set<String> ids = (Set<String>) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLERS);
        // The set is held by reference, so it does not need to be put back
        ids.add(id);
    }

    /**
     * @return the cached number of crawled URLs, or 0 if no value is cached
     */
    @Override
    public long getCrawledURLs() {
        Long crawledURLs = (Long) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLED_URLS);
        if (crawledURLs == null) {
            return 0;
        }
        return crawledURLs;
    }

    public void setCrawledURLs(long crawledURLs) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLED_URLS, crawledURLs);
    }

    @Override
    public long getSuccessfullyFetchedURLs() {
        return (Long) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS);
    }

    public void setSuccessfullyFetchedURLs(long urls) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS, urls);
    }

    @Override
    public double getAverageFetchDuration() {
        return (Double) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION);
    }

    public void setAverageFetchDuration(double duration) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION, duration);
    }

    @Override
    public double getAverageRetryRate() {
        return (Double) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_RETRY_RATE);
    }

    public void setAverageRetryRate(double rate) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_AVG_RETRY_RATE, rate);
    }

    @Override
    @SuppressWarnings("unchecked")
    public Map<String, Integer> getFetchStatusDistribution() {
        return (Map<String, Integer>) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_FETCH_STATUS_CODES);
    }

    public void setFetchStatusDistribution(Map<String, Integer> fetchStatusDistribution) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_FETCH_STATUS_CODES, fetchStatusDistribution);
    }

    @Override
    @SuppressWarnings("unchecked")
    public Map<String, Integer> getContentTypeDistribution() {
        return (Map<String, Integer>) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_CONTENT_TYPES);
    }

    public void setContentTypeDistribution(Map<String, Integer> contentTypeDistribution) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_CONTENT_TYPES, contentTypeDistribution);
    }

    @Override
    @SuppressWarnings("unchecked")
    public Map<String, Integer> getVirusStats() {
        return (Map<String, Integer>) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_VIRUS_STATS);
    }

    public void setVirusStats(Map<String, Integer> virusStats) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_VIRUS_STATS, virusStats);
    }

    @Override
    public double getRobotsBlockPercentage() {
        return (Double) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE);
    }

    public void setRobotsBlockPercentage(double percentage) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE, percentage);
    }

    @Override
    public double getRedirectPercentage() {
        return (Double) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE);
    }

    public void setRedirectPercentage(double percentage) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE, percentage);
    }

    @Override
    public double getTextToNoneTextRatio() {
        return (Double) cachedRow.get(CassandraProperties.FIELD_KNOWN_HOSTS_TEXT_TO_NONTEXT_RATIO);
    }

    public void setTextToNoneTextRatio(double ratio) {
        cachedRow.put(CassandraProperties.FIELD_KNOWN_HOSTS_TEXT_TO_NONTEXT_RATIO, ratio);
    }

    /**
     * Writes all cached column values back to the known-hosts table with a
     * single UPDATE statement keyed on the hostname.
     *
     * <p>String, set and map values are written as single-quoted text literals
     * (with embedded single quotes doubled); every other value is written
     * verbatim. A map that fails to serialize is reported and left out of the
     * statement, the remaining columns are still written.
     *
     * @param session the session to execute the statement on
     */
    public void save(Session session) {
        List<String> assignments = new ArrayList<String>();

        for (Entry<String, Object> e : cachedRow.entrySet()) {
            String column = e.getKey();
            // The hostname only appears in the WHERE clause
            if (column.equals(CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME)) {
                continue;
            }

            Object value = e.getValue();
            if (value instanceof String) {
                assignments.add(column + "=" + quote((String) value));
            } else if (value instanceof Set) {
                @SuppressWarnings("unchecked")
                Set<String> set = (Set<String>) value;
                assignments.add(column + "=" + quote(StringUtils.join(set, CRAWLER_ID_SEPARATOR)));
            } else if (value instanceof Map) {
                try {
                    @SuppressWarnings("unchecked")
                    Map<String, Integer> map = (Map<String, Integer>) value;
                    StringWriter writer = new StringWriter();
                    JSON.writeValue(writer, map);
                    assignments.add(column + "=" + quote(writer.toString()));
                } catch (Exception exception) {
                    // Best effort: skip this column, but still persist the others
                    exception.printStackTrace();
                }
            } else {
                // Numbers (and null values) are written without quoting
                assignments.add(column + "=" + value);
            }
        }

        if (assignments.isEmpty()) {
            // "UPDATE ... SET WHERE ..." would not be a valid statement
            return;
        }

        String cql = "UPDATE " + TABLE + " SET " + StringUtils.join(assignments, ", ")
                + " WHERE " + CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME + "="
                + quote(String.valueOf(getHostname())) + ";";
        // Logger.info(cql);
        session.execute(cql);
    }

    /**
     * Renders a value as a single-quoted text literal, doubling any embedded
     * single quote so that it cannot terminate the literal early.
     *
     * @param value the raw value, must not be {@code null}
     * @return the quoted and escaped literal
     */
    private static String quote(String value) {
        return "'" + value.replace("'", "''") + "'";
    }

}