/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.client;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.classification.InterfaceAudience;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.DomainSocketWatcher;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Waitable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * The ShortCircuitCache tracks things which the client needs to access
 * HDFS block files via short-circuit.
 *
 * These things include: memory-mapped regions, file descriptors, and shared
 * memory areas for communicating with the DataNode.
 *
 * All mutable cache state is guarded by a single {@link ReentrantLock}.
 */
@InterfaceAudience.Private
public class ShortCircuitCache implements Closeable {
  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);

  /**
   * Expiry thread which makes sure that the file descriptors get closed
   * after a while.
   */
  private class CacheCleaner implements Runnable, Closeable {
    private ScheduledFuture<?> future;

    /**
     * Run the CacheCleaner thread.
     *
     * Whenever a thread requests a ShortCircuitReplica object, we will make
     * sure it gets one.  That ShortCircuitReplica object can then be re-used
     * when another thread requests a ShortCircuitReplica object for the same
     * block.  So in that sense, there is no maximum size to the cache.
     *
     * However, when a ShortCircuitReplica object is unreferenced by the
     * thread(s) that are using it, it becomes evictable.  There are two
     * separate eviction lists-- one for mmaped objects, and another for
     * non-mmaped objects.  We do this in order to avoid having the regular
     * files kick the mmaped files out of the cache too quickly.  Reusing
     * an already-existing mmap gives a huge performance boost, since the
     * page table entries don't have to be re-populated.  Both the mmap
     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
     */
    @Override
    public void run() {
      ShortCircuitCache.this.lock.lock();
      try {
        if (ShortCircuitCache.this.closed) return;
        long curMs = Time.monotonicNow();

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": cache cleaner running at " + curMs);
        }

        // First move expired (or surplus) mmapped replicas into the regular
        // evictable map...
        int numDemoted = demoteOldEvictableMmaped(curMs);
        int numPurged = 0;
        Long evictionTimeNs = Long.valueOf(0);
        // ...then purge everything in the regular evictable map that has
        // outlived the non-mmapped lifespan.  The map is ordered by
        // insertion time, so we can stop at the first entry that is still
        // young enough.
        while (true) {
          Entry<Long, ShortCircuitReplica> entry =
              evictable.ceilingEntry(evictionTimeNs);
          if (entry == null) break;
          evictionTimeNs = entry.getKey();
          long evictionTimeMs =
              TimeUnit.MILLISECONDS.convert(evictionTimeNs,
                  TimeUnit.NANOSECONDS);
          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
          ShortCircuitReplica replica = entry.getValue();
          if (LOG.isTraceEnabled()) {
            LOG.trace("CacheCleaner: purging " + replica + ": " +
                  StringUtils.getStackTrace(Thread.currentThread()));
          }
          purge(replica);
          numPurged++;
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": finishing cache cleaner run started at " +
            curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
            "purged " + numPurged + " replicas.");
        }
      } finally {
        ShortCircuitCache.this.lock.unlock();
      }
    }

    /**
     * Cancel the scheduled future for this cleaner, if one was set.
     * A run that is already in progress is not interrupted.
     */
    @Override
    public void close() throws IOException {
      if (future != null) {
        future.cancel(false);
      }
    }

    /**
     * Record the future returned when this cleaner was scheduled, so that
     * {@link #close()} can cancel it.
     *
     * @param future  The scheduled future for this cleaner.
     */
    public void setFuture(ScheduledFuture<?> future) {
      this.future = future;
    }

    /**
     * Get the rate at which this cleaner thread should be scheduled.
     *
     * We do this by taking the minimum expiration time and dividing by 4.
     *
     * @return the rate in milliseconds at which this thread should be
     *         scheduled.  Never less than 1.
     */
    public long getRateInMs() {
      long minLifespanMs =
          Math.min(maxNonMmappedEvictableLifespanMs,
              maxEvictableMmapedLifespanMs);
      long sampleTimeMs = minLifespanMs / 4;
      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
    }
  }

  /**
   * A task which asks the DataNode to release a short-circuit shared memory
   * slot.  If successful, this will tell the DataNode to stop monitoring
   * changes to the mlock status of the replica associated with the slot.
   * It will also allow us (the client) to re-use this slot for another
   * replica.  If we can't communicate with the DataNode for some reason,
   * we tear down the shared memory segment to avoid being in an inconsistent
   * state.
   */
  private class SlotReleaser implements Runnable {
    /**
     * The slot that we need to release.
     */
    private final Slot slot;

    SlotReleaser(Slot slot) {
      this.slot = slot;
    }

    @Override
    public void run() {
      if (LOG.isTraceEnabled()) {
        LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
      }
      final DfsClientShm shm = (DfsClientShm)slot.getShm();
      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
      DomainSocket sock = null;
      DataOutputStream out = null;
      final String path = shmSock.getPath();
      boolean success = false;
      try {
        // Open a fresh connection to the same domain socket path that the
        // shared memory segment's peer is using.
        sock = DomainSocket.connect(path);
        out = new DataOutputStream(
            new BufferedOutputStream(sock.getOutputStream()));
        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
        DataInputStream in = new DataInputStream(sock.getInputStream());
        ReleaseShortCircuitAccessResponseProto resp =
            ReleaseShortCircuitAccessResponseProto.parseFrom(
                PBHelper.vintPrefixed(in));
        if (resp.getStatus() != Status.SUCCESS) {
          String error = resp.hasError() ? resp.getError() : "(unknown)";
          throw new IOException(resp.getStatus().toString() + ": " + error);
        }
        if (LOG.isTraceEnabled()) {
          LOG.trace(ShortCircuitCache.this + ": released " + slot);
        }
        success = true;
      } catch (IOException e) {
        LOG.error(ShortCircuitCache.this + ": failed to release " +
            "short-circuit shared memory slot " + slot + " by sending " +
            "ReleaseShortCircuitAccessRequestProto to " + path +
            ".  Closing shared memory segment.", e);
      } finally {
        if (success) {
          // The DataNode acknowledged the release; the slot can be reused.
          shmManager.freeSlot(slot);
        } else {
          // We can't tell what the DataNode thinks; drop the whole segment.
          shm.getEndpointShmManager().shutdown(shm);
        }
        IOUtils.cleanup(LOG, sock, out);
      }
    }
  }

  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * @return a non-null ShortCircuitReplicaInfo object.
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }

  /**
   * Lock protecting the cache.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
          build());

  /**
   * The executor service that runs the SlotReleaser tasks.
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
          build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
      replicaInfoMap = new HashMap<ExtendedBlockId,
          Waitable<ShortCircuitReplicaInfo>>();

  /**
   * The CacheCleaner.  We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * Tree of evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictable =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum total size of the cache, including both mmapped and
   * non-mmapped elements.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * Tree of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum number of mmaped evictable elements.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements older than this will be closed.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.
   * Null when shared memory is disabled or could not be set up.
   */
  private final DfsClientShmManager shmManager;

  /**
   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
   */
  public static ShortCircuitCache fromConf(Configuration conf) {
    return new ShortCircuitCache(
        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
            DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
  }

  /**
   * Create a ShortCircuitCache.
   *
   * @param maxTotalSize                      Maximum number of evictable
   *                                          replicas (mmapped plus
   *                                          non-mmapped); must be >= 0.
   * @param maxNonMmappedEvictableLifespanMs  Maximum age of a non-mmapped
   *                                          evictable replica; must be >= 0.
   * @param maxEvictableMmapedSize            Maximum number of mmapped
   *                                          evictable replicas; must be >= 0.
   * @param maxEvictableMmapedLifespanMs      Maximum age of an mmapped
   *                                          evictable replica; must be >= 0.
   * @param mmapRetryTimeoutMs                Minimum wait after a failed mmap
   *                                          attempt before retrying.
   * @param staleThresholdMs                  Value reported by
   *                                          {@link #getStaleThresholdMs()}.
   * @param shmInterruptCheckMs               Passed to the
   *                                          DfsClientShmManager; a value
   *                                          <= 0 disables shared memory.
   */
  public ShortCircuitCache(int maxTotalSize,
      long maxNonMmappedEvictableLifespanMs,
      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
      long mmapRetryTimeoutMs, long staleThresholdMs,
      int shmInterruptCheckMs) {
    Preconditions.checkArgument(maxTotalSize >= 0);
    this.maxTotalSize = maxTotalSize;
    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
    this.staleThresholdMs = staleThresholdMs;
    DfsClientShmManager shmManager = null;
    if ((shmInterruptCheckMs > 0) &&
        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
      try {
        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
      } catch (IOException e) {
        // Best effort: the cache still works without shared memory.
        LOG.error("failed to create ShortCircuitShmManager", e);
      }
    }
    this.shmManager = shmManager;
  }

  public long getMmapRetryTimeoutMs() {
    return mmapRetryTimeoutMs;
  }

  public long getStaleThresholdMs() {
    return staleThresholdMs;
  }

  /**
   * Increment the reference count of a replica, and remove it from any free
   * list it may be in.
   *
   * This method acquires the (reentrant) cache lock itself, so it may be
   * called with or without the lock already held.
   *
   * @param replica      The replica we're removing.
   */
  private void ref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      Preconditions.checkArgument(replica.refCount > 0,
          "can't ref " + replica + " because its refCount reached " +
          replica.refCount);
      Long evictableTimeNs = replica.getEvictableTimeNs();
      replica.refCount++;
      if (evictableTimeNs != null) {
        // The replica was sitting in an eviction map; it is in use again.
        String removedFrom = removeEvictable(replica);
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": " + removedFrom +
              " no longer contains " + replica + ".  refCount " +
              (replica.refCount - 1) + " -> " + replica.refCount +
              StringUtils.getStackTrace(Thread.currentThread()));

        }
      } else if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": replica  refCount " +
            (replica.refCount - 1) + " -> " + replica.refCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Unreference a replica.
   *
   * This method acquires the (reentrant) cache lock itself, so it may be
   * called with or without the lock already held.
   *
   * @param replica   The replica being unreferenced.
   */
  void unref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      // If the replica is stale, but we haven't purged it yet, let's do that.
      // It would be a shame to evict a non-stale replica so that we could put
      // a stale one into the cache.
      if ((!replica.purged) && replica.isStale()) {
        purge(replica);
      }
      String addedString = "";
      boolean shouldTrimEvictionMaps = false;
      int newRefCount = --replica.refCount;
      if (newRefCount == 0) {
        // Close replica, since there are no remaining references to it.
        Preconditions.checkArgument(replica.purged,
            "Replica " + replica + " reached a refCount of 0 without " +
            "being purged");
        replica.close();
      } else if (newRefCount == 1) {
        // Only the cache's own reference remains.
        Preconditions.checkState(null == replica.getEvictableTimeNs(),
            "Replica " + replica + " had a refCount higher than 1, " +
              "but was still evictable (evictableTimeNs = " +
                replica.getEvictableTimeNs() + ")");
        if (!replica.purged) {
          // Add the replica to the end of an eviction list.
          // Eviction lists are sorted by time.
          if (replica.hasMmap()) {
            insertEvictable(System.nanoTime(), replica, evictableMmapped);
            addedString = "added to evictableMmapped, ";
          } else {
            insertEvictable(System.nanoTime(), replica, evictable);
            addedString = "added to evictable, ";
          }
          shouldTrimEvictionMaps = true;
        }
      } else {
        Preconditions.checkArgument(replica.refCount >= 0,
            "replica's refCount went negative (refCount = " +
            replica.refCount + " for " + replica + ")");
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": unref replica " + replica +
            ": " + addedString + " refCount " +
            (newRefCount + 1) + " -> " + newRefCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      if (shouldTrimEvictionMaps) {
        trimEvictionMaps();
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Demote old evictable mmaps into the regular eviction map.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param now   Current time in monotonic milliseconds.
   * @return      Number of replicas demoted.
   */
  private int demoteOldEvictableMmaped(long now) {
    int numDemoted = 0;
    boolean needMoreSpace = false;
    Long evictionTimeNs = Long.valueOf(0);

    while (true) {
      Entry<Long, ShortCircuitReplica> entry =
          evictableMmapped.ceilingEntry(evictionTimeNs);
      if (entry == null) break;
      evictionTimeNs = entry.getKey();
      long evictionTimeMs =
          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
        // This entry is not too old; only keep demoting if the mmapped
        // map is at or over its size limit.
        if (evictableMmapped.size() < maxEvictableMmapedSize) {
          break;
        }
        needMoreSpace = true;
      }
      ShortCircuitReplica replica = entry.getValue();
      if (LOG.isTraceEnabled()) {
        String rationale = needMoreSpace ? "because we need more space" :
            "because it's too old";
        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
            rationale + ": " +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      removeEvictable(replica, evictableMmapped);
      munmap(replica);
      // Keep the original eviction time so ordering in the regular map
      // still reflects when the replica became unreferenced.
      insertEvictable(evictionTimeNs, replica, evictable);
      numDemoted++;
    }
    return numDemoted;
  }

  /**
   * Trim the eviction lists.
   *
   * Demotes old mmapped replicas, then purges the oldest evictable replicas
   * until the combined size of both maps is at most maxTotalSize.
   */
  private void trimEvictionMaps() {
    long now = Time.monotonicNow();
    demoteOldEvictableMmaped(now);

    while (true) {
      long evictableSize = evictable.size();
      long evictableMmappedSize = evictableMmapped.size();
      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
        return;
      }
      ShortCircuitReplica replica;
      // Prefer purging non-mmapped replicas; fall back to mmapped ones only
      // when there is nothing else left.
      if (evictableSize == 0) {
        replica = evictableMmapped.firstEntry().getValue();
      } else {
        replica = evictable.firstEntry().getValue();
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
          StringUtils.getStackTrace(Thread.currentThread()));
      }
      purge(replica);
    }
  }

  /**
   * Munmap a replica, updating outstandingMmapCount.
   *
   * @param replica  The replica to munmap.
   */
  private void munmap(ShortCircuitReplica replica) {
    replica.munmap();
    outstandingMmapCount--;
  }

  /**
   * Remove a replica from an evictable map.
   *
   * @param replica   The replica to remove.
   * @return          The map it was removed from.
   */
  private String removeEvictable(ShortCircuitReplica replica) {
    if (replica.hasMmap()) {
      removeEvictable(replica, evictableMmapped);
      return "evictableMmapped";
    } else {
      removeEvictable(replica, evictable);
      return "evictable";
    }
  }

  /**
   * Remove a replica from an evictable map.
   *
   * @param replica   The replica to remove.
   * @param map       The map to remove it from.
   */
  private void removeEvictable(ShortCircuitReplica replica,
      TreeMap<Long, ShortCircuitReplica> map) {
    Long evictableTimeNs = replica.getEvictableTimeNs();
    Preconditions.checkNotNull(evictableTimeNs);
    ShortCircuitReplica removed = map.remove(evictableTimeNs);
    Preconditions.checkState(removed == replica,
        "failed to make " + replica + " unevictable");
    replica.setEvictableTimeNs(null);
  }

  /**
   * Insert a replica into an evictable map.
   *
   * If an element already exists with this eviction time, we add a nanosecond
   * to it until we find an unused key.
   *
   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
   * @param replica          The replica to insert.
   * @param map              The map to insert it into.
   */
  private void insertEvictable(Long evictionTimeNs,
      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
    while (map.containsKey(evictionTimeNs)) {
      evictionTimeNs++;
    }
    Preconditions.checkState(null == replica.getEvictableTimeNs());
    Long time = Long.valueOf(evictionTimeNs);
    replica.setEvictableTimeNs(time);
    map.put(time, replica);
  }

  /**
   * Purge a replica from the cache.
   *
   * This doesn't necessarily close the replica, since there may be
   * outstanding references to it.  However, it does mean the cache won't
   * hand it out to anyone after this.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param replica   The replica being removed.
   */
  private void purge(ShortCircuitReplica replica) {
    boolean removedFromInfoMap = false;
    String evictionMapName = null;
    Preconditions.checkArgument(!replica.purged);
    replica.purged = true;
    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
    if (val != null) {
      ShortCircuitReplicaInfo info = val.getVal();
      // Only remove the map entry if it still refers to this exact replica.
      if ((info != null) && (info.getReplica() == replica)) {
        replicaInfoMap.remove(replica.key);
        removedFromInfoMap = true;
      }
    }
    Long evictableTimeNs = replica.getEvictableTimeNs();
    if (evictableTimeNs != null) {
      evictionMapName = removeEvictable(replica);
    }
    if (LOG.isTraceEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append(this).append(": ").append(": purged ").
          append(replica).append(" from the cache.");
      if (removedFromInfoMap) {
        builder.append("  Removed from the replicaInfoMap.");
      }
      if (evictionMapName != null) {
        builder.append("  Removed from ").append(evictionMapName);
      }
      LOG.trace(builder.toString());
    }
    // Drop the cache's own reference to the replica.
    unref(replica);
  }

  /**
   * Fetch or create a replica.
   *
   * This method acquires the cache lock itself for the lookup, and releases
   * it before invoking the creator.
   *
   * @param key          Key to use for lookup.
   * @param creator      Replica creator callback.  Will be called without
   *                     the cache lock being held.
   *
   * @return             Null if the cache is closed.
   *                     Otherwise, a ShortCircuitReplicaInfo, which may or
   *                     may not contain a replica.
   */
  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator) {
    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
    lock.lock();
    try {
      ShortCircuitReplicaInfo info = null;
      do {
        if (closed) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": can't fetchOrCreate " + key +
                " because the cache is closed.");
          }
          return null;
        }
        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
        if (waitable != null) {
          try {
            info = fetch(key, waitable);
          } catch (RetriableException e) {
            if (LOG.isDebugEnabled()) {
              LOG.debug(this + ": retrying " + e.getMessage());
            }
            // NOTE: inside do { } while (false), `continue` jumps to the
            // loop condition, which is false.  So this does not repeat the
            // lookup; it falls through to loading the replica ourselves.
            continue;
          }
        }
      } while (false);
      if (info != null) return info;
      // We need to load the replica ourselves.
      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
      replicaInfoMap.put(key, newWaitable);
    } finally {
      lock.unlock();
    }
    return create(key, creator, newWaitable);
  }

  /**
   * Fetch an existing ReplicaInfo object.
   *
   * On success with a usable replica, the replica's reference count is
   * incremented on behalf of the caller.
   *
   * @param key       The key that we're using.
   * @param waitable  The waitable object to wait on.
   * @return          The existing ReplicaInfo object.  It may carry an
   *                  InvalidToken exception or no replica at all.
   *
   * @throws RetriableException   If the caller needs to retry.
   */
  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
    // Another thread is already in the process of loading this
    // ShortCircuitReplica.  So we simply wait for it to complete.
    ShortCircuitReplicaInfo info;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": found waitable for " + key);
      }
      info = waitable.await();
    } catch (InterruptedException e) {
      LOG.info(this + ": interrupted while waiting for " + key);
      // Preserve the interrupt status for callers further up the stack.
      Thread.currentThread().interrupt();
      throw new RetriableException("interrupted");
    }
    if (info.getInvalidTokenException() != null) {
      LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
            "exception.", info.getInvalidTokenException());
      return info;
    }
    ShortCircuitReplica replica = info.getReplica();
    if (replica == null) {
      LOG.warn(this + ": failed to get " + key);
      return info;
    }
    if (replica.purged) {
      // Ignore replicas that have already been purged from the cache.
      throw new RetriableException("Ignoring purged replica " +
          replica + ".  Retrying.");
    }
    // Check if the replica is stale before using it.
    // If it is, purge it and retry.
    if (replica.isStale()) {
      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
          "this replica from the replicaInfoMap and retrying.");
      // Remove the cache's reference to the replica.  This may or may not
      // trigger a close.
      purge(replica);
      throw new RetriableException("ignoring stale replica " + replica);
    }
    ref(replica);
    return info;
  }

  /**
   * Load a new replica using the creator, publish the result through the
   * waitable, and return it.
   *
   * The creator is invoked without the cache lock held.  A RuntimeException
   * from the creator is logged and treated as a failed load.
   *
   * @param key          The key being loaded.
   * @param creator      Replica creator callback.
   * @param newWaitable  The waitable previously registered in the
   *                     replicaInfoMap for this key.
   * @return             A non-null ShortCircuitReplicaInfo; it contains no
   *                     replica if the load failed.
   */
  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator,
      Waitable<ShortCircuitReplicaInfo> newWaitable) {
    // Handle loading a new replica.
    ShortCircuitReplicaInfo info = null;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": loading " + key);
      }
      info = creator.createShortCircuitReplicaInfo();
    } catch (RuntimeException e) {
      LOG.warn(this + ": failed to load " + key, e);
    }
    if (info == null) info = new ShortCircuitReplicaInfo();
    lock.lock();
    try {
      if (info.getReplica() != null) {
        // On success, make sure the cache cleaner thread is running.
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": successfully loaded " + info.getReplica());
        }
        startCacheCleanerThreadIfNeeded();
        // Note: new ShortCircuitReplicas start with a refCount of 2,
        // indicating that both this cache and whoever requested the
        // creation of the replica hold a reference.  So we don't need
        // to increment the reference count here.
      } else {
        // On failure, remove the waitable from the replicaInfoMap.
        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
        if (info.getInvalidTokenException() != null) {
          LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
              "exception.", info.getInvalidTokenException());
        } else {
          LOG.warn(this + ": failed to load " + key);
        }
      }
      // Hand the result to any threads waiting on this waitable.
      newWaitable.provide(info);
    } finally {
      lock.unlock();
    }
    return info;
  }

  /**
   * Create and schedule the CacheCleaner if that has not been done yet.
   *
   * You must hold the cache lock while calling this function.
   */
  private void startCacheCleanerThreadIfNeeded() {
    if (cacheCleaner == null) {
      cacheCleaner = new CacheCleaner();
      long rateMs = cacheCleaner.getRateInMs();
      ScheduledFuture<?> future =
          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
              TimeUnit.MILLISECONDS);
      cacheCleaner.setFuture(future);
      if (LOG.isDebugEnabled()) {
        LOG.debug(this + ": starting cache cleaner thread which will run " +
          "every " + rateMs + " ms");
      }
    }
  }

  /**
   * Get the existing mmap for a replica, or create one.
   *
   * replica.mmapData encodes the mmap state: a MappedByteBuffer when a
   * mapping exists, a Long (monotonic ms) when the last attempt failed, a
   * Condition while another thread is creating the mapping, or null when
   * nothing has been attempted.
   *
   * @param replica   The replica to map.
   * @param anchored  Passed through to the ClientMmap.
   * @return          A ClientMmap holding a reference to the replica, or
   *                  null if the mapping failed, or failed less than
   *                  mmapRetryTimeoutMs ago.
   */
  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
      boolean anchored) {
    Condition newCond;
    lock.lock();
    try {
      while (replica.mmapData != null) {
        if (replica.mmapData instanceof MappedByteBuffer) {
          ref(replica);
          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
          return new ClientMmap(replica, mmap, anchored);
        } else if (replica.mmapData instanceof Long) {
          long lastAttemptTimeMs = (Long)replica.mmapData;
          long delta = Time.monotonicNow() - lastAttemptTimeMs;
          // Back off for mmapRetryTimeoutMs after a failed mmap attempt.
          if (delta < mmapRetryTimeoutMs) {
            if (LOG.isTraceEnabled()) {
              LOG.trace(this + ": can't create client mmap for " +
                  replica + " because we failed to " +
                  "create one just " + delta + "ms ago.");
            }
            return null;
          }
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": retrying client mmap for " + replica +
                ", " + delta + " ms after the previous failure.");
          }
        } else if (replica.mmapData instanceof Condition) {
          // Another thread is creating the mmap; wait for it to finish and
          // then re-examine mmapData.
          Condition cond = (Condition)replica.mmapData;
          cond.awaitUninterruptibly();
        } else {
          Preconditions.checkState(false, "invalid mmapData type " +
              replica.mmapData.getClass().getName());
        }
      }
      // Mark the mmap as in progress so that other threads wait on us.
      newCond = lock.newCondition();
      replica.mmapData = newCond;
    } finally {
      lock.unlock();
    }
    // Perform the actual mmap without holding the cache lock.
    MappedByteBuffer map = replica.loadMmapInternal();
    lock.lock();
    try {
      if (map == null) {
        // Remember when we failed, so that retries can be rate-limited.
        replica.mmapData = Long.valueOf(Time.monotonicNow());
        newCond.signalAll();
        return null;
      } else {
        outstandingMmapCount++;
        replica.mmapData = map;
        ref(replica);
        newCond.signalAll();
        return new ClientMmap(replica, map, anchored);
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Close the cache and free all associated resources.
   *
   * Calling this on an already-closed cache has no effect.
   */
  @Override
  public void close() {
    // Acquire the lock before entering the try block, so that the finally
    // clause never unlocks a lock we do not hold.
    lock.lock();
    try {
      if (closed) return;
      closed = true;
      LOG.info(this + ": closing");
      maxNonMmappedEvictableLifespanMs = 0;
      maxEvictableMmapedSize = 0;
      // Close and join cacheCleaner thread.
      IOUtils.cleanup(LOG, cacheCleaner);
      // Purge all replicas.
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
        if (entry == null) break;
        purge(entry.getValue());
      }
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
        if (entry == null) break;
        purge(entry.getValue());
      }
    } finally {
      lock.unlock();
    }
    IOUtils.cleanup(LOG, shmManager);
  }

  @VisibleForTesting // ONLY for testing
  public interface CacheVisitor {
    void visit(int numOutstandingMmaps,
        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
        Map<ExtendedBlockId, InvalidToken> failedLoads,
        Map<Long, ShortCircuitReplica> evictable,
        Map<Long, ShortCircuitReplica> evictableMmapped);
  }

  /**
   * Invoke a visitor on a snapshot of the cache state, with the cache lock
   * held for the duration of the visit.
   *
   * @param visitor  The visitor to invoke.
   */
  @VisibleForTesting // ONLY for testing
  public void accept(CacheVisitor visitor) {
    lock.lock();
    try {
      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
      Map<ExtendedBlockId, InvalidToken> failedLoads =
          new HashMap<ExtendedBlockId, InvalidToken>();
      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
            replicaInfoMap.entrySet()) {
        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
        // Loads that are still in progress have no value yet; skip them.
        if (waitable.hasVal()) {
          if (waitable.getVal().getReplica() != null) {
            replicas.put(entry.getKey(), waitable.getVal().getReplica());
          } else {
            // The exception may be null here, indicating a failed load that
            // isn't the result of an invalid block token.
            failedLoads.put(entry.getKey(),
                waitable.getVal().getInvalidTokenException());
          }
        }
      }
      if (LOG.isDebugEnabled()) {
        StringBuilder builder = new StringBuilder();
        builder.append("visiting ").append(visitor.getClass().getName()).
            append("with outstandingMmapCount=").append(outstandingMmapCount).
            append(", replicas=");
        String prefix = "";
        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry :
            replicas.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", failedLoads=");
        for (Entry<ExtendedBlockId, InvalidToken> entry :
            failedLoads.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictable=");
        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictableMmapped=");
        for (Entry<Long, ShortCircuitReplica> entry :
            evictableMmapped.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        LOG.debug(builder.toString());
      }
      visitor.visit(outstandingMmapCount, replicas, failedLoads,
            evictable, evictableMmapped);
    } finally {
      lock.unlock();
    }
  }

  @Override
  public String toString() {
    return "ShortCircuitCache(0x" +
        Integer.toHexString(System.identityHashCode(this)) + ")";
  }

  /**
   * Allocate a new shared memory slot.
   *
   * @param datanode       The datanode to allocate a shm slot with.
   * @param peer           A peer connected to the datanode.
   * @param usedPeer       Will be set to true if we use up the provided peer.
   * @param blockId        The block id and block pool id of the block we're
   *                         allocating this slot for.
   * @param clientName     The name of the DFSClient allocating the shared
   *                         memory.
   * @return               Null if short-circuit shared memory is disabled;
   *                         a short-circuit memory slot otherwise.
   * @throws IOException   An exception if there was an error talking to
   *                         the datanode.
   */
  public Slot allocShmSlot(DatanodeInfo datanode,
      DomainPeer peer, MutableBoolean usedPeer,
      ExtendedBlockId blockId, String clientName) throws IOException {
    if (shmManager != null) {
      return shmManager.allocSlot(datanode, peer, usedPeer,
          blockId, clientName);
    } else {
      return null;
    }
  }

  /**
   * Free a slot immediately.
   *
   * ONLY use this if the DataNode is not yet aware of the slot.
   *
   * @param slot           The slot to free.
   */
  public void freeSlot(Slot slot) {
    Preconditions.checkState(shmManager != null);
    slot.makeInvalid();
    shmManager.freeSlot(slot);
  }

  /**
   * Schedule a shared memory slot to be released.
   *
   * @param slot           The slot to release.
   */
  public void scheduleSlotReleaser(Slot slot) {
    Preconditions.checkState(shmManager != null);
    releaserExecutor.execute(new SlotReleaser(slot));
  }

  @VisibleForTesting
  public DfsClientShmManager getDfsClientShmManager() {
    return shmManager;
  }
}