QueryEngineCache.java example

Explorer
hydra-master
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.data.query.engine;

import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import com.addthis.basis.util.Parameter;

import com.google.common.base.Objects;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.LoadingCache;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class implements an LRU cache to keep our QueryEngines. It is instantiated only from MeshQuerySource.
 * <p/>
 * It uses guava's cache loader to do most of the work. We periodically check to see if new data is available for
 * a job, and if so, asynchronously prepare the new database before swapping it in. Multiple get or refresh attempts
 * will block and wait on the existing one to finish.
 * <p/>
 * As per guava's specs, it is not guaranteed that we will wait until we are at maximum capacity to evict engines.
 * Also, we are okay with evicting non-idle engines, but we do not force them to close. Rather, we set a flag and
 * trust the query using them to close it when it is finished. This means we may have a number of engines open equal
 * to the cache capacity + number of running queries. It is also possible for a few engines to be transiently open
 * while waiting for the eviction listener to close engines. This is somewhat balanced by guava's more aggressive
 * eviction policy, but in general we should not rely on the capacity as being an absolute hard max. In practice, it
 * should be more than sufficient though.
 * <p/>
 * Basic flow is:
 * <ol>
 * <li>Constructed from MeshQuerySource</li>
 * <li>MeshQuerySource calls getAndLease()</li>
 * <li>See if we have a suitable engine</li>
 * <li>If so, return it; if not, make one and return it</li>
 * </ol>
 */
public class QueryEngineCache {

    private static final Logger log = LoggerFactory.getLogger(QueryEngineCache.class);

    /**
     * 'soft cap' on the number of engines to have open. this + concurrent queries +/- a few should closely
     * resemble the real cap on open engines
     */
    private static final long DEFAULT_ENGINE_CACHE_SIZE = Parameter.longValue("queryEngineCache.engineCacheSize", 5);

    /**
     * seconds to let an engine be in cache before attempting to refresh it. Refreshing it means checking whether
     * or not the job has finished running and has a new data directory; it does not force the reopening of the same
     * directory. It is important to note that this scheduled refresh is not checked unless a get is called on it,
     * and that even if the refresh returns the old engine, it resets the fail timer.
     */
    private static final long DEFAULT_REFRESH_INTERVAL = Parameter.longValue("queryEngineCache.refreshInterval", 2 * 60);

    /**
     * seconds in between cache maintenance runs. This helps query sources and jobs in lower throughput environments.
     * It does the guava api clean up method which handles any pending expiration events, and also attempts to provoke
     * refresh attempts on cached keys by calling get on them. The latter is more important for our purposes. Without it,
     * relatively idle engines would become stale or subject to undesired eviction by the fail interval. 0 disables it.
     */
    private static final long DEFAULT_MAINTENANCE_INTERVAL = Parameter.longValue("queryEngineCache.maintenanceInterval", 20 * 60);

    /**
     * seconds to let an engine be in cache after the most recent write. This is intended only for situations
     * where re-opening that engine is failing, and thus while the refresh is not occurring. It might appear that
     * an engine is alive and up to date and this attempts to limit that disparity if desired. Note that by failing,
     * we mean that the refresh method is throwing exceptions.
     */
    private static final long DEFAULT_FAIL_INTERVAL = Parameter.longValue("queryEngineCache.failInterval", 70 * 60);

    /**
     * thread pool for cache maintenance runs. Should only need one thread.
     */
    private final ScheduledExecutorService queryEngineCacheMaintainer = MoreExecutors
            .getExitingScheduledExecutorService(new ScheduledThreadPoolExecutor(1,
                    new ThreadFactoryBuilder().setNameFormat("queryEngineCacheMaintainer=%d").build()));

    /**
     * The {@link LoadingCache} that provides the backing data structure for this class.
     * Acts like an intelligent semi-persistent Map that has logic for loading and reloading complex objects.
     */
    protected final LoadingCache<String, QueryEngine> loadingEngineCache;

    private final long engineCacheSize;
    private final long refreshInterval;
    private final long failInterval;
    private final long maintenanceInterval;

    /**
     * Creates a cache configured entirely from the {@code queryEngineCache.*} parameters
     * (or their built-in defaults) and a default {@link EngineLoader}.
     */
    public QueryEngineCache() {
        this(DEFAULT_ENGINE_CACHE_SIZE, DEFAULT_REFRESH_INTERVAL, DEFAULT_FAIL_INTERVAL, DEFAULT_MAINTENANCE_INTERVAL);
    }

    /**
     * Creates a cache with explicit settings and a default {@link EngineLoader}.
     *
     * @param engineCacheSize     soft cap on the number of cached engines
     * @param refreshInterval     seconds after a write before a get may trigger a refresh
     * @param failInterval        seconds after a write before an entry expires
     * @param maintenanceInterval seconds between maintenance runs; values less than 1 disable maintenance
     */
    public QueryEngineCache(long engineCacheSize, long refreshInterval, long failInterval, long maintenanceInterval) {
        this(engineCacheSize, refreshInterval, failInterval, maintenanceInterval, new EngineLoader());
    }

    /**
     * Initialize a {@link LoadingCache} that is capable of loading and reloading
     * {@link QueryEngine}s. Reloads occur asynchronously to prevent blocking operations
     * during unrelated calls to the cache.  When reload is called the current engine will be compared with the
     * newest available data directory.  If the current engine is up to date it will be returned, otherwise a new
     * engine will be opened to replace the current engine with the latest available.
     * <p/>
     * On removal, we have a listener that will call closeWhenIdle on engines. It has a guard against removal events
     * generated by refreshes where we decide to keep the existing engine (no new job data is available). There is a
     * race condition where that test can be passed more than once so any clean up done there must be okay with that.
     * The race condition is such that the test will always be passed at least once, and never when the engine is still
     * available to new get calls. This meets our requirements.
     * <p/>
     * The cache is bounded by weight rather than by entry count: the maximum weight is
     * {@code engineCacheSize * 100} and each engine weighs {@code 100 * cacheWeight()} (truncated to an int) as
     * reported by its tree's advanced settings, so an engine with a cache weight of 1 uses one unit of capacity.
     *
     * @param engineCacheSize     soft cap on the number of cached engines
     * @param refreshInterval     seconds after a write before a get may trigger a refresh
     * @param failInterval        seconds after a write before an entry expires
     * @param maintenanceInterval seconds between maintenance runs; values less than 1 disable maintenance
     * @param engineLoader        the loader used to open and reload engines
     */
    public QueryEngineCache(long engineCacheSize, long refreshInterval, long failInterval, long maintenanceInterval,
            EngineLoader engineLoader) {
        this.engineCacheSize = engineCacheSize;
        this.refreshInterval = refreshInterval;
        this.failInterval = failInterval;
        this.maintenanceInterval = maintenanceInterval;

        // toString() only reads the four settings assigned above, so it is safe to log 'this' here
        log.info("Initializing QueryEngineCache: {}", this); //using 'this' is just more efficient

        // no easy way around escaping 'this' here, but at least it is more obvious what is going on now
        loadingEngineCache = CacheBuilder.newBuilder()
                .maximumWeight(engineCacheSize * 100)
                .<String, QueryEngine>weigher(
                        (dir, engine) -> (int) (100 * engine.getTree().getAdvancedSettings().cacheWeight()))
                .refreshAfterWrite(refreshInterval, TimeUnit.SECONDS)
                .expireAfterWrite(failInterval, TimeUnit.SECONDS)
                .removalListener(new EngineRemovalListener(this))
                .build(engineLoader);

        //schedule maintenance runs
        maybeInitMaintenance();
    }


    /**
     * schedules maintenance for the cache using the maintenanceInterval parameter. Values less than 1
     * are treated as 'do not do maintenance'. Maintenance includes cache loader cleanUp() and an attempt
     * to trigger refreshes in relatively idle engines. This is done by the thread safe iterator from
     * the loading cache and performing getIfPresent calls on each entry. This will only trigger refreshes
     * if the refresh interval has passed, and avoids a potential race condition where doing refresh() could
     * end up re-loading an engine that was just evicted. This is important because in addition to being
     * incorrect cache behavior, refresh will block instead of being asynchronous while doing so -- possibly
     * leading to even more race conditions.
     * <p/>
     * since the thread safe iterator is weakly consistent, it is a good idea to configure the intervals so
     * that maintenance will be performed more than once before the fail interval occurs (if we do not desire
     * to evict and close 'relatively idle' engines). eg. maintenanceInterval * 2 < failInterval
     * <p/>
     * unfortunately, this somewhat confuses the eviction order heuristic because it considers these all to be
     * valid r/ws. This is one reason to keep this value relatively long. It is possible to optimize against this
     * somewhat, but probably at the cost of greatly increased complexity. It seems unlikely that it will have a
     * large impact if performed infrequently enough though, especially since the evictor is not a simple LRU.
     * <p/>
     * Each run catches and logs runtime exceptions rather than letting them propagate, because an exception
     * escaping a fixed-rate task suppresses every subsequent execution of that task.
     */
    private void maybeInitMaintenance() {
        if (maintenanceInterval > 0) {
            queryEngineCacheMaintainer.scheduleAtFixedRate(() -> {
                try {
                    loadingEngineCache.cleanUp();
                    loadingEngineCache.asMap().keySet().forEach(loadingEngineCache::getIfPresent);
                } catch (RuntimeException e) {
                    // swallow after logging so that one bad run does not silently end all future maintenance
                    log.warn("QueryEngineCache maintenance run failed; will try again at the next interval", e);
                }
            }, maintenanceInterval, maintenanceInterval, TimeUnit.SECONDS);
        }
    }

    /**
     * Takes an unresolved (usually the gold path) path to a bdb query directory. This is mostly a thin
     * layer between this class and the backing LoadingCache.
     * <p/>
     * Most importantly, it also attempts to lease the engine. This is because there is a rare race condition
     * where after acquiring the engine, but before leasing it ourselves, it is evicted from the cache. Probably
     * caused by refresh, since it is less likely that an engine we just acquired would be the target of size
     * eviction in most cases. It is relatively unlikely to happen even twice in a row, but we try three times
     * here anyway. I have never seen this exception but if we start to see it a lot, we can re-evaluate this approach.
     *
     * @param directoryPath The path of the engine directory
     * @return a QueryEngine from the cache or constructed on demand (constructing blocks this thread)
     * @throws Exception - any problem while getting the engine. Likely either an issue with leasing or with opening
     *                   an engine. A RuntimeException is thrown when all three lease attempts fail.
     */
    public QueryEngine getAndLease(String directoryPath) throws Exception {
        for (int i = 0; i < 3; i++) {
            // each attempt re-fetches from the cache, so an evicted engine can be replaced by a fresh one
            QueryEngine qe = loadingEngineCache.get(directoryPath);
            if (qe.lease()) {
                return qe;
            }
        }
        log.warn("Tried three times but unable to get lease for engine with path: {}", directoryPath);
        throw new RuntimeException("Can't lease engine");
    }

    /**
     * Reports the four configured settings; used when logging cache initialization.
     */
    @Override
    public String toString() {
        return Objects.toStringHelper(this)
                .add("engineCacheSize", engineCacheSize)
                .add("refreshInterval", refreshInterval)
                .add("maintenanceInterval", maintenanceInterval)
                .add("failInterval", failInterval)
                .toString();
    }

}