package org.archive.wayback.accesscontrol.robotstxt.redis;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.IOUtils;
import org.archive.wayback.accesscontrol.robotstxt.redis.RedisRobotsLogic.RedisValue;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
import org.archive.wayback.exception.LiveWebTimeoutException;
import org.archive.wayback.liveweb.LiveWebCache;
import org.archive.wayback.webapp.PerfStats;

import com.google.common.io.ByteStreams;

/**
 * {@link LiveWebCache} implementation that serves robots.txt bodies out of
 * Redis and falls back to a delegate {@link LiveWebCache} when Redis has no
 * entry (or cannot be reached).
 *
 * <p>
 * Successful lookups are stored as the robots.txt text itself. An empty
 * robots.txt is stored as {@link #ROBOTS_TOKEN_EMPTY}, and a failed lookup is
 * stored as {@link #ROBOTS_TOKEN_ERROR} followed by the status code.
 * </p>
 */
public class SimpleRedisRobotsCache implements LiveWebCache {

	private final static Logger LOGGER = Logger
			.getLogger(SimpleRedisRobotsCache.class.getName());

	/** Timer keys passed to {@link PerfStats}. */
	enum PerfStat {
		RobotsRedis,
		RobotsLive,
	};

	/* REDIS */
	protected RedisRobotsLogic redisCmds;

	/* EXTERNAL CACHE */
	protected LiveWebCache liveweb = null;

	protected boolean gzipRobots = false;

	final static int STATUS_OK = 200;
	final static int STATUS_ERROR = 502;

	/** Upper bound on the number of characters/bytes of robots.txt kept. */
	final static int MAX_ROBOTS_SIZE = 500000;

	/** 60 * 60 * 24, i.e. the number of seconds in one day. */
	final static int ONE_DAY = 60 * 60 * 24;

	// TTLs are expressed in multiples of ONE_DAY; presumably seconds, matching
	// the unit of RedisValue.ttl -- TODO confirm against RedisRobotsLogic.
	private int totalTTL = ONE_DAY * 10;
	private int refreshTTL = ONE_DAY;

	private int notAvailTotalTTL = ONE_DAY * 2;
	private int notAvailRefreshTTL = ONE_DAY / 2;

	final static String ROBOTS_TOKEN_EMPTY = "0_ROBOTS_EMPTY";

	final static String ROBOTS_TOKEN_ERROR = "0_ROBOTS_ERROR-";
	final static String ROBOTS_TOKEN_ERROR_UNKNOWN = "0_ROBOTS_ERROR-0";

	final static String UPDATE_QUEUE_KEY = "robots_update_queue";
	final static int MAX_UPDATE_QUEUE_SIZE = 50000;

	/**
	 * Returns the robots.txt for {@code urlURL}, preferring the Redis copy.
	 *
	 * <p>
	 * When Redis has no value (or is unavailable / not configured) the
	 * document is loaded through {@link #loadExternal} and written back via
	 * {@link #updateCache}. When Redis has a value that {@link #isExpired}
	 * reports as stale, the URL is pushed onto {@link #UPDATE_QUEUE_KEY} and
	 * the stale value is still served.
	 * </p>
	 *
	 * @param urlURL     robots.txt URL to look up
	 * @param maxCacheMS forwarded to the delegate cache on a Redis miss
	 * @param bUseOlder  forwarded to the delegate cache on a Redis miss
	 * @return a {@link RobotsTxtResource} wrapping the robots.txt text (empty
	 *         string when the cached entry is the empty token)
	 * @throws LiveDocumentNotAvailableException if the live load did not end
	 *         with status {@link #STATUS_OK}, or the cached entry is an error
	 *         token
	 */
	@Override
	public Resource getCachedResource(URL urlURL, long maxCacheMS,
			boolean bUseOlder) throws LiveDocumentNotAvailableException,
			LiveWebCacheUnavailableException, LiveWebTimeoutException,
			IOException {

		String url = urlURL.toExternalForm();

		RedisValue value = null;

		try {
			PerfStats.timeStart(PerfStat.RobotsRedis);

			if (redisCmds != null) {
				value = redisCmds.getValue(url);
			}
		} catch (LiveWebCacheUnavailableException lw) {
			// Redis is down: treat as a miss and fall through to liveweb.
			value = null;
		} finally {
			PerfStats.timeEnd(PerfStat.RobotsRedis);
		}

		// Use the old liveweb cache, if provided
		if (value == null) {
			RobotsResult result = loadExternal(urlURL, maxCacheMS, bUseOlder);

			// Check for null BEFORE touching the result; loadExternal is
			// protected and an override may return null.
			if (result == null) {
				throw new LiveDocumentNotAvailableException(
						"Error Loading Live Robots");
			}

			PerfStats.timeStart(PerfStat.RobotsRedis);
			try {
				this.updateCache(result.robots, url, null, result.status, true);
			} finally {
				PerfStats.timeEnd(PerfStat.RobotsRedis);
			}

			if (result.status != STATUS_OK) {
				throw new LiveDocumentNotAvailableException(
						"Error Loading Live Robots");
			}

			return new RobotsTxtResource(result.robots);
		}

		if (isExpired(value, url, 0)) {
			PerfStats.timeStart(PerfStat.RobotsRedis);
			try {
				redisCmds.pushKey(UPDATE_QUEUE_KEY, url, MAX_UPDATE_QUEUE_SIZE);
			} finally {
				PerfStats.timeEnd(PerfStat.RobotsRedis);
			}
		}

		String currentRobots = value.value;

		if (currentRobots.startsWith(ROBOTS_TOKEN_ERROR)) {
			throw new LiveDocumentNotAvailableException("Robots Error: "
					+ currentRobots);
		} else if (currentRobots.equals(ROBOTS_TOKEN_EMPTY)) {
			// Compare the cached STRING (not the RedisValue wrapper) with the
			// empty token, so an empty robots.txt is served as "".
			currentRobots = "";
		}

		return new RobotsTxtResource(currentRobots);
	}

	/** Closes the Redis command wrapper, if any, and drops the reference. */
	@Override
	public void shutdown() {
		if (redisCmds != null) {
			redisCmds.close();
			redisCmds = null;
		}
	}

	/**
	 * @param status numeric status code
	 * @return {@code true} when {@code status} is 0 or at least 500
	 */
	static boolean isFailedError(int status) {
		return (status == 0) || (status >= 500);
	}

	/**
	 * @param status numeric status code
	 * @return {@code true} when {@code status} is 301 or 302
	 */
	static boolean isRedirect(int status) {
		return (status == 301) || (status == 302);
	}

	/**
	 * String variant of {@link #isFailedError(int)}.
	 *
	 * @param code status code as text
	 * @return the result of {@link #isFailedError(int)} for the parsed code;
	 *         {@code true} when {@code code} is not a parsable integer
	 */
	static boolean isFailedError(String code) {
		try {
			int status = Integer.parseInt(code);
			return isFailedError(status);
		} catch (NumberFormatException n) {
			return true;
		}
	}

	/**
	 * Decides whether a cached entry is due for a refresh.
	 *
	 * <p>
	 * Entries holding a "failed" error token use the not-available TTL pair;
	 * everything else uses the regular pair. The entry is considered expired
	 * when {@code maxTime - value.ttl >= refreshTime}.
	 * </p>
	 *
	 * @param value             cached entry; {@code value.ttl} is presumably
	 *                          the remaining TTL -- TODO confirm
	 * @param url               URL of the entry (not used by the computation)
	 * @param customRefreshTime when positive, overrides the refresh interval
	 * @return {@code true} if the entry should be refreshed
	 */
	public boolean isExpired(RedisValue value, String url, int customRefreshTime) {

		int maxTime, refreshTime;

		boolean isFailedError = value.value.startsWith(ROBOTS_TOKEN_ERROR);

		if (isFailedError) {
			// The status code is whatever follows the error prefix.
			String code = value.value.substring(ROBOTS_TOKEN_ERROR.length());
			isFailedError = isFailedError(code);
		}

		if (isFailedError) {
			maxTime = notAvailTotalTTL;
			refreshTime = notAvailRefreshTTL;
		} else {
			maxTime = totalTTL;
			refreshTime = refreshTTL;
		}

		if (customRefreshTime > 0) {
			refreshTime = customRefreshTime;
		}

		return (maxTime - value.ttl) >= refreshTime;
	}

	/** Outcome of a robots.txt load: the new text, its status and the old text. */
	class RobotsResult {
		String oldRobots;
		String robots;
		int status;

		/** Result carrying only the previously cached text; status is 0. */
		RobotsResult(String oldRobots) {
			status = 0;
			this.oldRobots = oldRobots;
			robots = null;
		}

		/** Result of a fresh load. */
		RobotsResult(String robots, int status) {
			this.robots = robots;
			this.status = status;
		}

		/**
		 * @return {@code true} when either text is {@code null} or both are
		 *         equal
		 */
		boolean isSameRobots() {
			return (robots == null) || (oldRobots == null)
					|| robots.equals(oldRobots);
		}
	}

	/**
	 * Loads robots.txt through the delegate {@link #liveweb} cache.
	 *
	 * <p>
	 * Any exception raised while loading (including a missing delegate) is
	 * converted into status {@link #STATUS_ERROR}. The body is only read when
	 * the delegate reports {@link #STATUS_OK}, and is capped at
	 * {@link #MAX_ROBOTS_SIZE} bytes for non-{@link RobotsTxtResource}
	 * resources.
	 * </p>
	 *
	 * @return a result whose {@code robots} is {@code null} unless the status
	 *         is {@link #STATUS_OK}
	 */
	protected RobotsResult loadExternal(URL urlURL, long maxCacheMS,
			boolean bUseOlder) {

		Resource origResource = null;
		int status = 0;
		String contents = null;

		try {
			PerfStats.timeStart(PerfStat.RobotsLive);

			origResource = liveweb.getCachedResource(urlURL, maxCacheMS,
					bUseOlder);

			status = origResource.getStatusCode();

			if (status == STATUS_OK) {
				if (origResource instanceof RobotsTxtResource) {
					contents = ((RobotsTxtResource) origResource).getContents();
				} else {
					contents = IOUtils.toString(
							ByteStreams.limit(origResource, MAX_ROBOTS_SIZE),
							"UTF-8");
				}
			}
		} catch (Exception e) {
			// Deliberately broad: every failure maps to a 502-style result.
			status = STATUS_ERROR;
			if (LOGGER.isLoggable(Level.FINE)) {
				LOGGER.log(Level.FINE, "Live robots lookup failed for "
						+ urlURL, e);
			}
		} finally {
			if (origResource != null) {
				try {
					origResource.close();
				} catch (IOException ignored) {
					// Best effort: nothing useful can be done if close fails.
				}
			}
			PerfStats.timeEnd(PerfStat.RobotsLive);
		}

		return new RobotsResult(contents, status);
	}

	/**
	 * Writes the outcome of a robots.txt load to Redis.
	 *
	 * <p>
	 * Does nothing when no Redis connection is configured, or when the status
	 * is a failed error (see {@link #isFailedError(int)}) and
	 * {@code cacheFails} is {@code false}. When {@code current} is a valid
	 * robots.txt and the new value is an error/empty token that did not come
	 * from a redirect, the stored text is kept and only the TTL is updated.
	 * </p>
	 *
	 * @param contents   freshly loaded robots.txt text, may be {@code null}
	 * @param url        Redis key
	 * @param current    value currently stored for {@code url}, or
	 *                   {@code null} if unknown
	 * @param status     status code of the load
	 * @param cacheFails whether failed errors should be stored as well
	 */
	protected void updateCache(final String contents, final String url,
			final String current, int status, boolean cacheFails) {

		// getCachedResource tolerates a missing Redis connection on read;
		// tolerate it on write too instead of failing with an NPE.
		if (redisCmds == null) {
			return;
		}

		String newRedisValue = null;
		int newTTL = 0;
		boolean ttlOnly = false;

		if (status == STATUS_OK) {
			newTTL = totalTTL;

			if (contents == null || contents.isEmpty()) {
				newRedisValue = ROBOTS_TOKEN_EMPTY;
			} else if (contents.length() > MAX_ROBOTS_SIZE) {
				newRedisValue = contents.substring(0, MAX_ROBOTS_SIZE);
			} else {
				newRedisValue = contents;
			}
		} else {
			if (isFailedError(status)) {
				newTTL = notAvailTotalTTL;

				// Only caching successful lookups
				if (!cacheFails) {
					return;
				}
			} else {
				newTTL = totalTTL;
			}

			newRedisValue = ROBOTS_TOKEN_ERROR + status;
		}

		String currentValue = current;

		if (currentValue != null) {
			if (currentValue.equals(newRedisValue)) {
				ttlOnly = true;
			}

			// Don't override a valid robots with a timeout error
			if (!isRedirect(status) && !isValidRobots(newRedisValue)
					&& isValidRobots(currentValue)) {
				newTTL = totalTTL;
				ttlOnly = true;
				if (LOGGER.isLoggable(Level.INFO)) {
					LOGGER.info("REFRESH ERROR: " + status
							+ " - Keeping same robots for " + url);
				}
			}
		}

		// A null value with a TTL presumably means "refresh TTL only" --
		// see RedisRobotsLogic.updateValue.
		final RedisValue value = new RedisValue((ttlOnly ? null
				: newRedisValue), newTTL);
		redisCmds.updateValue(url, value, gzipRobots);
	}

	/**
	 * @param value stored Redis value, must not be {@code null}
	 * @return {@code true} unless {@code value} is an error token or the
	 *         empty token
	 */
	protected boolean isValidRobots(String value) {
		return !value.startsWith(ROBOTS_TOKEN_ERROR)
				&& !value.equals(ROBOTS_TOKEN_EMPTY);
	}

	/**
	 * Reloads robots.txt for {@code url} from the delegate cache and stores
	 * the outcome.
	 *
	 * @param url           robots.txt URL (also the Redis key)
	 * @param minUpdateTime when positive, skip the reload if the cached entry
	 *                      is not yet expired for this refresh interval
	 * @param cacheFails    whether a non-OK outcome should be stored
	 * @return the load result with {@code oldRobots} set to the previous
	 *         value; a result with status 0 when the reload was skipped or the
	 *         URL is malformed
	 */
	public RobotsResult forceUpdate(String url, int minUpdateTime,
			boolean cacheFails) {
		String current = null;

		try {
			RedisValue value = null;

			try {
				PerfStats.timeStart(PerfStat.RobotsRedis);
				value = redisCmds.getValue(url);
			} finally {
				PerfStats.timeEnd(PerfStat.RobotsRedis);
			}

			// Just in case, avoid too many updates
			if ((minUpdateTime > 0) && (value != null)
					&& !isExpired(value, url, minUpdateTime)) {
				return new RobotsResult(value.value);
			}

			current = (value != null ? value.value : null);
		} catch (LiveWebCacheUnavailableException lw) {
			// NOTE(review): the exception text becomes the "current" value and
			// is later treated like stored robots text by updateCache -- kept
			// as-is, confirm this is intended.
			current = lw.toString();
		}

		RobotsResult result = null;

		try {
			result = loadExternal(new URL(url), 0, false);
		} catch (MalformedURLException e) {
			return new RobotsResult(current);
		}

		if ((result.status == STATUS_OK) || cacheFails) {
			this.updateCache(result.robots, url, current, result.status,
					cacheFails);
		}

		result.oldRobots = current;
		return result;
	}

	/** Wraps {@code redisConn} in a new {@link RedisRobotsLogic}. */
	public void setRedisConnMan(RedisConnectionManager redisConn) {
		this.redisCmds = new RedisRobotsLogic(redisConn);
	}

	/** @return the delegate cache used on a Redis miss */
	public LiveWebCache getLiveweb() {
		return liveweb;
	}

	/** @param liveweb delegate cache used on a Redis miss */
	public void setLiveweb(LiveWebCache liveweb) {
		this.liveweb = liveweb;
	}

	/** @return the flag passed to Redis updates as the gzip option */
	public boolean isGzipRobots() {
		return gzipRobots;
	}

	/** @param gzipRobots flag passed to Redis updates as the gzip option */
	public void setGzipRobots(boolean gzipRobots) {
		this.gzipRobots = gzipRobots;
	}
}