package uk.bl.monitrix.database.cassandra.ingest; import java.util.Map; import com.datastax.driver.core.BoundStatement; import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.Session; import play.Logger; import uk.bl.monitrix.analytics.HostAnalytics; import uk.bl.monitrix.database.cassandra.CassandraProperties; import uk.bl.monitrix.database.cassandra.model.CassandraIngestSchedule; import uk.bl.monitrix.database.cassandra.model.CassandraKnownHost; import uk.bl.monitrix.database.cassandra.model.CassandraKnownHostList; import uk.bl.monitrix.heritrix.LogFileEntry.DefaultAlert; import uk.bl.monitrix.model.Alert.AlertType; /** * An extended version of {@link CassandraKnownHostList} that adds insert/update capability. * * TODO this whole class really needs some cleanup! * * @author Rainer Simon <rainer.simon@ait.ac.at> */ class CassandraKnownHostImporter extends CassandraKnownHostList { // private static final String ALERT_MSG_TOO_MANY_SUBDOMAINS = "The host %s has a suspiciously high number of subdomains (%s)"; private static final String ALERT_MSG_TXT_TO_NONTEXT_RATIO = "The host %s serves a suspiciously high ratio of text vs. non-text resources"; private CassandraAlertLogImporter alertLog; private CassandraKnownTLDImporter knownTLDs; private PreparedStatement statementHosts; private PreparedStatement statementTLD; public CassandraKnownHostImporter(Session db, CassandraIngestSchedule ingestSchedule, CassandraAlertLogImporter alertLog) { super(db); this.alertLog = alertLog; this.knownTLDs = new CassandraKnownTLDImporter(db); this.statementHosts = session.prepare( "INSERT INTO " + CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_KNOWN_HOSTS + " (" + CassandraProperties.FIELD_KNOWN_HOSTS_HOSTNAME + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_TLD + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_DOMAIN + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_SUBDOMAIN + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_FIRST_ACCESS + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_LAST_ACCESS + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLERS + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_CRAWLED_URLS + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_SUCCESSFULLY_FETCHED_URLS + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_AVG_FETCH_DURATION + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_AVG_RETRY_RATE + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_FETCH_STATUS_CODES + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_CONTENT_TYPES + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_VIRUS_STATS + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_REDIRECT_PERCENTAGE + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_ROBOTS_BLOCK_PERCENTAGE + ", " + CassandraProperties.FIELD_KNOWN_HOSTS_TEXT_TO_NONTEXT_RATIO + ") " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"); this.statementTLD = session.prepare( "INSERT INTO " + CassandraProperties.KEYSPACE + "." + CassandraProperties.COLLECTION_KNOWN_TLDS + " (" + CassandraProperties.FIELD_KNOWN_TLDS_TLD + ") " + "VALUES (?);"); } /** * Adds a new host to the Known Hosts list. Note that this method ONLY writes to * the in-memory cache! In order to write to the database, execute the .commit() method * after your additions are done. * @param hostname the host name * @param accessTime the access time */ public CassandraKnownHost addToList(String hostname, String domain, String subdomain, long accessTime) { BoundStatement boundHostStatement = new BoundStatement(statementHosts); String tld = hostname.substring(hostname.lastIndexOf('.') + 1); session.execute(boundHostStatement.bind( hostname, tld, domain, subdomain, accessTime, accessTime, "", 0l, 0l, 0.0, 0.0, "", "", "", 0.0, 0.0, 0.0)); BoundStatement boundTLDStatement = new BoundStatement(statementTLD); session.execute(boundTLDStatement.bind(tld)); knownTLDs.incrementTLDCount(tld); return (CassandraKnownHost) getKnownHost(hostname); } /** * Updates the last access time for the specified host. Note that this method ONLY * writes to the in-memory cache! In order to write to the database, execute the .commit() * method after your additions are done. * @param hostname the hostname * @param lastAccess the new last access time */ public void setLastAccess(String hostname, long lastAccess) { CassandraKnownHost host = (CassandraKnownHost) getKnownHost(hostname); if (host != null) host.setLastAccess(lastAccess); else Logger.warn("Attempt to write last access info to unknown host: " + hostname); } public void addCrawlerID(String hostname, String crawlerId) { // In this case we know it's a safe cast CassandraKnownHost host = (CassandraKnownHost) getKnownHost(hostname); if (host != null) host.addCrawlerID(crawlerId); else Logger.warn("Attempt to write crawlerID info to unknown host: " + hostname); } public void incrementFetchStatusCounter(String hostname, int fetchStatus) { // In this case we know it's a safe cast CassandraKnownHost host = (CassandraKnownHost) getKnownHost(hostname); if (host != null) { String key = Integer.toString(fetchStatus); Map<String, Integer> fetchStatusMap = host.getFetchStatusDistribution(); Integer value = fetchStatusMap.get(key); if (value == null) fetchStatusMap.put(key, 1); else fetchStatusMap.put(key, value.intValue() + 1); host.setFetchStatusDistribution(fetchStatusMap); } else { Logger.warn("Attempt to write fetch status info to unknown host: " + hostname); } } public void incrementCrawledURLCounter(String hostname) { CassandraKnownHost host = (CassandraKnownHost) getKnownHost(hostname); if (host != null) { long crawledURLs = host.getCrawledURLs(); host.setCrawledURLs(crawledURLs + 1); } else { Logger.warn("Attempt to increment crawled URL counter for unknown host: " + hostname); } } public void incrementContentTypeCounter(String hostname, String contentType) { // In this case we know it's a safe cast CassandraKnownHost host = (CassandraKnownHost) getKnownHost(hostname); if (host != null) { Map<String, Integer> contentTypeMap = host.getContentTypeDistribution(); Integer value = contentTypeMap.get(contentType); if (value == null) contentTypeMap.put(contentType, 1); else contentTypeMap.put(contentType, value.intValue() + 1); host.setContentTypeDistribution(contentTypeMap); } else { Logger.warn("Attempt to write content type info to unknown host: " + hostname); } } public void incrementVirusStats(String hostname, String virusName) { // In this case we know it's a safe cast CassandraKnownHost host = (CassandraKnownHost) getKnownHost(hostname); if (host != null) { Map<String, Integer> virusMap = host.getVirusStats(); Integer value = virusMap.get(virusName); if (value == null) virusMap.put(virusName, 1); else virusMap.put(virusName, value.intValue() + 1); host.setVirusStats(virusMap); } else { Logger.warn("Attempt to write virus stats info to unknown host: " + hostname); } } public void updateAverageResponseTimeAndRetryRate(String hostname, int fetchDuration, int retries) { if (fetchDuration > 0) { CassandraKnownHost host = (CassandraKnownHost) getKnownHost(hostname); if (host != null) { long successCount = host.getSuccessfullyFetchedURLs(); double currentAvgResponseTime = host.getAverageFetchDuration(); double newAvgResponseTime = (currentAvgResponseTime * successCount + fetchDuration) / (successCount + 1); double currentAvgRetryRate = host.getAverageRetryRate(); double newAvgRetryRate = rounder((currentAvgRetryRate * successCount + retries) / (successCount + 1)); host.setSuccessfullyFetchedURLs(successCount + 1); host.setAverageFetchDuration(newAvgResponseTime); host.setAverageRetryRate(newAvgRetryRate); } else { Logger.warn("Attempt to update average response time for known host: " + hostname); } } } public void commit() { Logger.info("Updating known hosts list (" + cache.size() + " hosts)"); for (CassandraKnownHost knownHost : cache.values()) { knownHost.setRobotsBlockPercentage(HostAnalytics.computePercentageOfRobotsTxtBlocks(knownHost)); knownHost.setRedirectPercentage(HostAnalytics.computePercentagOfRedirects(knownHost)); knownHost.setTextToNoneTextRatio(HostAnalytics.computeTextToNonTextRatio(knownHost)); knownHost.save(session); } // Compute host-level alerts // Note: we only need to consider hosts that were added in this batch - i.e. those in the cache! // FIXME: Moving to the host-wise model means we've lost the alerts. Logger.info("Computing host-level alerts"); for (CassandraKnownHost host : cache.values()) { /* Subdomain limit int subdomains = 1; //host.getSubdomain().size(); if (subdomains > 100) { CassandraAlert alert = new MongoAlert(new BasicDBObject()); alert.setTimestamp(host.getLastAccess()); alert.setOffendingHost(host.getHostname()); alert.setAlertType(AlertType.TOO_MANY_SUBDOMAINS); alert.setAlertDescription(String.format(ALERT_MSG_TOO_MANY_SUBDOMAINS, host.getHostname(), Integer.toString(subdomains))); alertLog.insert(alert); } */ // Text-to-Nontext content type ratio limit if (host.getTextToNoneTextRatio() > 0.9) { DefaultAlert a = new DefaultAlert( host.getLastAccess(), host.getHostname(), AlertType.TXT_TO_NONTEXT_RATIO, String.format(ALERT_MSG_TXT_TO_NONTEXT_RATIO, host.getHostname())); alertLog.insert(a); } } knownTLDs.commit(); cache.clear(); } }