/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.data.query.engine; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import com.addthis.basis.util.Parameter; import com.google.common.base.Objects; import com.google.common.cache.CacheBuilder; import com.google.common.cache.LoadingCache; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * This class implements an LRU cache to keep our QueryEngines. It is instantiated only from MeshQuerySource. * <p/> * It uses guava's cache loader to do most of the work. We periodically check to see if new data is available for * a job, and if so, asynchronously prepare the new database before swapping it in. Multiple get or refresh attempts * will block and wait on the existing one to finish. * <p/> * As per guava's specs, it is not guaranteed that we will wait until we are at maximum capacity to evict engines. * Also, we are okay with evicting non-idle engines, but we do not force them to close. Rather, we set a flag and * trust the query using them to close it when it is finished. This means we may have a number of engines open equal * to the cache capacity + number of running queries. It is also possible for a few engines to be transiently open * while waiting for the eviction listener to close engines. This is somewhat balanced by guava's more aggressive * eviction policy, but in general we should not rely on the capacity as being an absolute hard max. In practice, it * should be more than sufficient though. * <p/> * Basic flow is : * Constructed from MQSource * MQSource calls getAndLease() * See if we have a suitable engine * If so, return it, if not, make one and return it */ public class QueryEngineCache { private static final Logger log = LoggerFactory.getLogger(QueryEngineCache.class); /** * 'soft cap' on the number of engines to have open. this + concurrent queries +/- a few should closely * resemble the real cap on open engines */ private static final long DEFAULT_ENGINE_CACHE_SIZE = Parameter.longValue("queryEngineCache.engineCacheSize", 5); /** * seconds to let an engine be in cache before attempting to refresh it. Refreshing it means checking whether * or not the job has finished running and has a new data directory; it does not force the reopening of the same * directory. It is important to note that this scheduled refresh is not checked unless a get is called on it, * and that even if the refresh returns the old engine, it resets the fail timer. */ private static final long DEFAULT_REFRESH_INTERVAL = Parameter.longValue("queryEngineCache.refreshInterval", 2 * 60); /** * seconds in between cache malongenance runs. This helps query sources and jobs in lower throughput environments. * It does the guava api clean up method which handles any pending expiration events, and also attempts to provoke * refresh attempts on cached keys by calling get on them. The latter is more important for our purposes. Without it, * relatively idle engines would become stale or subject to undesired eviction by the fail longerval. 0 disables it. */ private static final long DEFAULT_MAINTENANCE_INTERVAL = Parameter.longValue("queryEngineCache.maintenanceInterval", 20 * 60); /** * seconds to let an engine be in cache after the most recent write. This is longended only for situations * where re-opening that engine is failing, and thus while the refresh is not occuring. it might appear that * an engine is alive and up to date and this attempts to limit that disparity if desired. Note that by failing, * we mean that the refresh method is throwing exceptions. */ private static final long DEFAULT_FAIL_INTERVAL = Parameter.longValue("queryEngineCache.failInterval", 70 * 60); /** * thread pool for cache maintenance runs. Should only need one thread. */ private final ScheduledExecutorService queryEngineCacheMaintainer = MoreExecutors .getExitingScheduledExecutorService(new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().setNameFormat("queryEngineCacheMaintainer=%d").build())); /** * The {@link LoadingCache} that provides the backing data structure for this class. * Acts like an intelligent semi-persistent Map that has logic for loading and reloading complex objects. */ protected final LoadingCache<String, QueryEngine> loadingEngineCache; private final long engineCacheSize; private final long refreshInterval; private final long failInterval; private final long maintenanceInterval; /** * Initialize a {@link LoadingCache} that is capable of loading and reloading * {@link QueryEngine}s. Reloads occur asynchronously to prevent blocking operations * during unrelated calls to the cache. When reload is called the current engine will be compared with the * newest available data directory. If the current engine is up to date it will be returned, otherwise a new * engine will be opened to replace the current engine with the latest available. * <p/> * On removal, we have a listener that will call closeWhenIdle on engines. It has a guard against removal events * generated by refreshes where we decide to keep the existing engine (no new job data is available). There is a * race condition where that test can be passed more than once so any clean up done there must be okay with that. * The race condition is such that the test will always be passed at least once, and never when the engine is still * available to new get calls. This meets our requirements. */ public QueryEngineCache() { this(DEFAULT_ENGINE_CACHE_SIZE, DEFAULT_REFRESH_INTERVAL, DEFAULT_FAIL_INTERVAL, DEFAULT_MAINTENANCE_INTERVAL); } public QueryEngineCache(long engineCacheSize, long refreshInterval, long failInterval, long maintenanceInterval) { this(engineCacheSize, refreshInterval, failInterval, maintenanceInterval, new EngineLoader()); } public QueryEngineCache(long engineCacheSize, long refreshInterval, long failInterval, long maintenanceInterval, EngineLoader engineLoader) { this.engineCacheSize = engineCacheSize; this.refreshInterval = refreshInterval; this.failInterval = failInterval; this.maintenanceInterval = maintenanceInterval; log.info("Initializing QueryEngineCache: {}", this); //using 'this' is just more efficient // no easy way around escaping 'this' here, but at least it is more obvious what is going on now loadingEngineCache = CacheBuilder.newBuilder() .maximumWeight(engineCacheSize * 100) .<String, QueryEngine>weigher( (dir, engine) -> (int) (100 * engine.getTree().getAdvancedSettings().cacheWeight())) .refreshAfterWrite(refreshInterval, TimeUnit.SECONDS) .expireAfterWrite(failInterval, TimeUnit.SECONDS) .removalListener(new EngineRemovalListener(this)) .build(engineLoader); //schedule maintenance runs maybeInitMaintenance(); } /** * schedules maintenance for the cache using the maintenanceInterval parameter. Values less than 1 * are treated as 'do not do maintenance'. Maintenance includes cache loader cleanUp() and an attempt * to trigger refreshes in relatively idle engines. This is done by the thread safe iterator from * the loading cache and performing getIfPresent calls on each entry. This will only trigger refreshes * if the refresh interval has passed, and avoids a potential race condition where doing refresh() could * end up re-loading an engine that was just evicted. This is important because in addition to being * incorrect cache behavior, refresh will block instead of being asynchronous while doing so -- possibly * leading to even more race conditions. * <p/> * since the thread safe iterator is weakly consistent, it is a good idea to configure the intervals so * that maintenance will be performed more than once before the fail interval occurs (if we do not desire * to evict and close 'relatively idle' engines). eg. maintenanceInterval * 2 < failInterval * <p/> * unfortunately, this somewhat confuses the eviction order heuristic because it considers these all to be * valid r/ws. This is one reason to keep this value relatively long. It is possible to optimize against this * somewhat, but probably at the cost of greatly increased complexity. It seems unlikely that it will have a * large impact if performed infrequently enough though, especially since the evictor is not a simple LRU. */ private void maybeInitMaintenance() { if (maintenanceInterval > 0) { queryEngineCacheMaintainer.scheduleAtFixedRate(() -> { loadingEngineCache.cleanUp(); loadingEngineCache.asMap().keySet().forEach(loadingEngineCache::getIfPresent); }, maintenanceInterval, maintenanceInterval, TimeUnit.SECONDS); } } /** * Takes an unresolved (usually the gold path) path to a bdb query directory. This is mostly a thin * layer between this class and the backing LoadingCache. * <p/> * Most importantly, it also attempts to lease the engine. This is because there is a rare race condition * where after acquiring the engine, but before leasing it ourselves, it is evicted from the cache. Probably * caused by refresh, since it is less likely that an engine we just acquired would be the target of size * eviction in most cases. It is relatively unlikely to happen even twice in a row, but we try three times * here anyway. I have never seen this exception but if we start to see it a lot, we can re-evaluate this approach. * * @param directoryPath The path of the engine directory * @return a QueryEngine from the cache or constructed on demand (constructing blocks this thread) * @throws Exception - any problem while getting the engine. Likely either an issue with leasing or with opening an engine */ public QueryEngine getAndLease(String directoryPath) throws Exception { for (int i = 0; i < 3; i++) { QueryEngine qe = loadingEngineCache.get(directoryPath); if (qe.lease()) { return qe; } } log.warn("Tried three times but unable to get lease for engine with path: {}", directoryPath); throw new RuntimeException("Can't lease engine"); } @Override public String toString() { return Objects.toStringHelper(this) .add("engineCacheSize", engineCacheSize) .add("refreshInterval", refreshInterval) .add("maintenanceInterval", maintenanceInterval) .add("failInterval", failInterval) .toString(); } }