package org.dcache.pool.classic; import com.google.common.base.Throwables; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.InterruptedIOException; import java.nio.channels.CompletionHandler; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import java.util.stream.Stream; import diskCacheV111.util.CacheException; import diskCacheV111.util.DiskErrorCacheException; import diskCacheV111.vehicles.IoJobInfo; import diskCacheV111.vehicles.JobInfo; import diskCacheV111.vehicles.ProtocolInfo; import dmg.cells.nucleus.CDC; import org.dcache.pool.FaultAction; import org.dcache.pool.FaultEvent; import org.dcache.pool.FaultListener; import org.dcache.pool.movers.Mover; import org.dcache.util.AdjustableSemaphore; import org.dcache.util.IoPrioritizable; import org.dcache.util.IoPriority; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; import static java.util.stream.Collectors.joining; import static org.dcache.pool.classic.IoRequestState.*; public class MoverRequestScheduler { private static final Logger LOGGER = LoggerFactory.getLogger(MoverRequestScheduler.class); /** * The name of IoScheduler. */ private final String _name; /** * All movers, both queued and running, managed by the scheduler. */ private final Map<Integer, PrioritizedRequest> _jobs = new ConcurrentHashMap<>(128); /** * Requests by door unique request id. */ private final Map<String, PrioritizedRequest> _moverByRequests = new ConcurrentHashMap<>(128); /** * ID of the current queue. Used to identify queue in {@link * IoQueueManager}. */ private final int _queueId; private final List<FaultListener> _faultListeners = new CopyOnWriteArrayList<>(); /** * Number of free job slots. */ private final AdjustableSemaphore _semaphore = new AdjustableSemaphore(); /** * JTM timeout since last activity. */ private long _lastAccessed; /** * JTM timeout since transfer start. */ private long _total; /** * Current queue order. */ private Order _order; /** * Queued movers. */ private BlockingQueue<PrioritizedRequest> _queue; /** * Job id generator */ private int _nextId; /** * True when scheduler has been terminated. */ private volatile boolean _isShutdown; public enum Order { FIFO, LIFO } public MoverRequestScheduler(String name, int queueId, Order order) { _name = name; _queueId = queueId; _order = order; _queue = createQueue(order); _semaphore.setMaxPermits(2); } public void addFaultListener(FaultListener listener) { _faultListeners.add(listener); } public void removeFaultListener(FaultListener listener) { _faultListeners.remove(listener); } private PriorityBlockingQueue<PrioritizedRequest> createQueue(Order order) { /* PriorityBlockingQueue returns the least elements first, that is, the * the highest priority requests have to be first in the ordering. */ Comparator<IoPrioritizable> comparator = order == Order.FIFO ? Comparator .comparing(IoPrioritizable::getPriority) .reversed() .thenComparingLong(IoPrioritizable::getCreateTime) : Comparator .comparing(IoPrioritizable::getPriority) .thenComparingLong(IoPrioritizable::getCreateTime) .reversed(); return new PriorityBlockingQueue<>(16, comparator); } public Order getOrder() { return _order; } public synchronized void setOrder(Order order) { if (order != _order) { PriorityBlockingQueue<PrioritizedRequest> queue = createQueue(order); _queue.drainTo(queue); _queue = queue; _order = order; } } /** * Get mover id for given door request. If there is no mover associated with {@code doorUniqueueRequest} a new mover * will be created by using provided {@code moverSupplier}. * <p> * The returned mover id generated with following encoding: * | 31- queue id -24|23- job id -0| * * @param moverSupplier {@link MoverSupplier} which can create a mover for given requests. * @param doorUniqueId unique request identifier generated by the door. * @param priority * @return mover id */ public int getOrCreateMover(MoverSupplier moverSupplier, String doorUniqueId, IoPriority priority) throws CacheException { checkState(!_isShutdown); try { /* Create the request if it doesn't already exists. */ PrioritizedRequest request = _moverByRequests.computeIfAbsent(doorUniqueId, key -> { try { return createRequest(moverSupplier, key, priority); } catch (CacheException e) { throw new RuntimeException(e); } }); /* If not already queued, submit it. */ if (request.queue()) { if (submit(request)) { /* There was a free slot in the queue so we submit directly to execution. */ sendToExecution(request); } else if (_semaphore.getMaxPermits() <= 0) { LOGGER.warn("A task was added to queue '{}', however the queue is not " + "configured to execute any tasks.", _name); } } return request.getId(); } catch (RuntimeException e) { Throwables.throwIfInstanceOf(e.getCause(), CacheException.class); throw e; } } private PrioritizedRequest createRequest(MoverSupplier moverSupplier, String doorUniqueId, IoPriority priority) throws CacheException { return new PrioritizedRequest(_queueId << 24 | nextId(), doorUniqueId, moverSupplier.createMover(), priority); } /** * Add a request to the scheduler. * <p> * Returns true if the caller acquired a job slot and must send the job to execution. * * @param request * @return */ private synchronized boolean submit(PrioritizedRequest request) { if (_jobs.put(request.getId(), request) != null) { throw new RuntimeException("Duplicate mover id detected. Please report to support@dcache.org."); } if (_semaphore.tryAcquire()) { return true; } else { _queue.add(request); return false; } } /** * Returns the next job or releases a job slot. If a non-null value is returned, the caller * must submit the job to execution. Should only be caller by a caller than currently holds * a job slot. * * @return */ private synchronized PrioritizedRequest nextOrRelease() { PrioritizedRequest request = _queue.poll(); if (request == null) { _semaphore.release(); } return request; } private synchronized int nextId() { if (_nextId == 0x00FFFFFF) { _nextId = 0; } else { _nextId++; } return _nextId; } /** * Get current number of concurrently running jobs. * * @return number of running jobs. */ public synchronized int getActiveJobs() { return _jobs.size() - _queue.size(); } /** * Get job information * * @param id * @return * @throws NoSuchElementException if job with specified <code>id</code> does * not exist */ public JobInfo getJobInfo(int id) throws NoSuchElementException { PrioritizedRequest request = _jobs.get(id); if (request == null) { throw new NoSuchElementException("Job not found : Job-" + id); } return request.toJobInfo(); } /** * Get list of all jobs in this queue. * * @return list of all jobs */ public List<IoJobInfo> getJobInfos() { return Collections.unmodifiableList(_jobs.values().stream() .map(PrioritizedRequest::toJobInfo) .collect(Collectors.toList()) ); } /** * Get a {@link Stream} of all jobs in this queue. * * @return list of all jobs */ Stream<PrioritizedRequest> getJobs() { return _jobs.values().stream(); } /** * Get the maximal number allowed of concurrently running jobs by this scheduler. * * @return maximal number of jobs. */ public int getMaxActiveJobs() { return _semaphore.getMaxPermits(); } /** * Set maximal number of concurrently running jobs by this scheduler. All * pending jobs will be executed. * * @param maxJobs */ public void setMaxActiveJobs(int maxJobs) { _semaphore.setMaxPermits(maxJobs); PrioritizedRequest request; while (_semaphore.tryAcquire() && (request = nextOrRelease()) != null) { sendToExecution(request); } } /** * Get number of requests waiting for execution. * * @return number of pending requests. */ public int getQueueSize() { BlockingQueue<PrioritizedRequest> queue; synchronized (this) { queue = _queue; } return queue.size(); } /** * Get the number of write requests running or waiting to run. */ public int getCountByPriority(IoPriority priority) { BlockingQueue<PrioritizedRequest> queue; synchronized (this) { queue = _queue; } return (int) queue.stream() .filter(r -> r.getPriority() == priority) .count(); } /** * Get the name of this scheduler. * * @return name of the scheduler */ public String getName() { return _name; } public int getId() { return _queueId; } /** * Cancel the request. Any IO in progress will be interrupted. * * @param id * @param explanation A reason to log * @throws NoSuchElementException */ public synchronized void cancel(int id, @Nullable String explanation) throws NoSuchElementException { PrioritizedRequest request = _jobs.get(id); if (request == null) { throw new NoSuchElementException("Job " + id + " not found"); } request.kill(explanation); if (_queue.remove(request)) { postprocessWithoutJobSlot(request); } } private void postprocessWithoutJobSlot(PrioritizedRequest request) { try (CDC ignore = request.getCdc().restore()) { request.getMover().close( new CompletionHandler<Void, Void>() { @Override public void completed(Void result, Void attachment) { release(); } private void release() { request.done(); _jobs.remove(request.getId()); _moverByRequests.remove(request.getDoorUniqueId()); } @Override public void failed(Throwable exc, Void attachment) { release(); } }); } } /** * Shutdown the scheduler. All subsequent execution request will be rejected. */ public void shutdown() throws InterruptedException { checkState(!_isShutdown); _isShutdown = true; /* Drain jobs from the queue so they will never be started. Has to be done * before killing jobs as otherwise the queued jobs will immediatley fill * the freed job slot. */ Collection<PrioritizedRequest> toBeCancelled = new ArrayList<>(); _queue.drainTo(toBeCancelled); /* Kill both the jobs that were queued and which are running. */ _jobs.values().forEach(j -> j.kill("shutdown")); /* Jobs that were queued were never submitted for execution and thus we * manually trigger postprocessing. */ toBeCancelled.forEach(this::postprocessWithoutJobSlot); LOGGER.info("Waiting for movers on queue '{}' to finish", _name); if (!_semaphore.tryAcquire(_semaphore.getMaxPermits(), 2, TimeUnit.SECONDS)) { // This is often due to a mover not reacting to interrupt or the transfer // doing a lengthy checksum calculation during post processing. String versions = _jobs.values().stream() .map(PrioritizedRequest::getMover) .map(Mover::getProtocolInfo) .map(ProtocolInfo::getVersionString) .collect(joining(",")); LOGGER.warn("Failed to terminate some movers prior to shutdown: {}", versions); } } private void sendToExecution(final PrioritizedRequest request) { try (CDC ignore = request.getCdc().restore()) { request.transfer( new CompletionHandler<Void, Void>() { @Override public void completed(Void result, Void attachment) { postprocess(); } @Override public void failed(Throwable exc, Void attachment) { if (exc instanceof InterruptedException || exc instanceof InterruptedIOException) { request.getMover().setTransferStatus(CacheException.DEFAULT_ERROR_CODE, "Transfer was killed"); } else if (exc instanceof DiskErrorCacheException) { FaultEvent faultEvent = new FaultEvent("transfer", FaultAction.DISABLED, exc.getMessage(), exc); _faultListeners.forEach(l -> l.faultOccurred(faultEvent)); } postprocess(); } private void postprocess() { try (CDC ignore = request.getCdc().restore()) { request.getMover().close( new CompletionHandler<Void, Void>() { @Override public void completed(Void result, Void attachment) { release(); } @Override public void failed(Throwable exc, Void attachment) { if (exc instanceof DiskErrorCacheException) { FaultEvent faultEvent = new FaultEvent("post-processing", FaultAction.DISABLED, exc.getMessage(), exc); _faultListeners.forEach(l -> l.faultOccurred(faultEvent)); } release(); } private void release() { request.done(); _jobs.remove(request.getId()); _moverByRequests.remove(request.getDoorUniqueId()); PrioritizedRequest nextRequest = nextOrRelease(); if (nextRequest != null) { sendToExecution(nextRequest); } } }); } } }); } } public synchronized boolean isExpired(JobInfo job, long now) { long started = job.getStartTime(); long lastAccessed = job instanceof IoJobInfo ? ((IoJobInfo) job).getLastTransferred() : now; return ((getLastAccessed() > 0L) && (lastAccessed > 0L) && ((now - lastAccessed) > getLastAccessed())) || ((getTotal() > 0L) && (started > 0L) && ((now - started) > getTotal())); } public synchronized long getLastAccessed() { return _lastAccessed; } public synchronized void setLastAccessed(long lastAccessed) { checkArgument(lastAccessed >= 0L, "The lastAccess timeout must be greater than or equal to 0."); _lastAccessed = lastAccessed; } public synchronized long getTotal() { return _total; } public synchronized void setTotal(long total) { checkArgument(total >= 0L, "The total timeout must be greater than or equal to 0."); _total = total; } static class PrioritizedRequest implements IoPrioritizable { private final Mover<?> _mover; private final IoPriority _priority; private final long _ctime; private final int _id; private final CDC _cdc; /** * Request creation time. */ private final long _submitTime; private final String _doorUniqueId; private IoRequestState _state; /** * Transfer start time. */ private long _startTime; private Cancellable _cancellable; PrioritizedRequest(int id, String doorUniqueId, Mover<?> mover, IoPriority p) { _id = id; _mover = mover; _priority = p; _ctime = System.nanoTime(); _submitTime = System.currentTimeMillis(); _state = NEW; _doorUniqueId = doorUniqueId; _cdc = new CDC(); } public Mover<?> getMover() { return _mover; } public CDC getCdc() { return _cdc; } public int getId() { return _id; } public String getDoorUniqueId() { return _doorUniqueId; } @Override public IoPriority getPriority() { return _priority; } @Override public long getCreateTime() { return _ctime; } @Override public int hashCode() { return _id; } @Override public boolean equals(Object o) { if (o == this) { return true; } if (!(o instanceof PrioritizedRequest)) { return false; } final PrioritizedRequest other = (PrioritizedRequest) o; return _id == other._id; } @Override public synchronized String toString() { return _state + " : " + _mover.toString() + " si={" + _mover.getFileAttributes().getStorageClass() + "}"; } public synchronized IoJobInfo toJobInfo() { return new IoJobInfo(_submitTime, _startTime, _state.toString(), _id, _mover.getPathToDoor().getDestinationAddress().toString(), _mover.getClientId(), _mover.getFileAttributes().getPnfsId(), _mover.getBytesTransferred(), _mover.getTransferTime(), _mover.getLastTransferred()); } public synchronized boolean queue() { if (_state == NEW) { _state = QUEUED; return true; } return false; } public synchronized void transfer(CompletionHandler<Void, Void> completionHandler) { try { if (_state != QUEUED) { completionHandler.failed(new InterruptedException("Transfer cancelled"), null); } _state = RUNNING; _startTime = System.currentTimeMillis(); _cancellable = _mover.execute(completionHandler); } catch (RuntimeException e) { completionHandler.failed(e, null); } } public synchronized void kill(@Nullable String explanation) { if (_state == CANCELED || _state == DONE) { return; } if (_cancellable != null) { _cancellable.cancel(explanation); } else { String why = explanation == null ? "Transfer cancelled" : ("Transfer cancelled: " + explanation); _mover.setTransferStatus(CacheException.DEFAULT_ERROR_CODE, why); } _state = CANCELED; } public synchronized void done() { _state = DONE; } } }