/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.procedure2; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.DelayQueue; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ProcedureInfo; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.classification.InterfaceStability; import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException; import org.apache.hadoop.hbase.procedure2.Procedure.LockState; import org.apache.hadoop.hbase.procedure2.store.ProcedureStore; import org.apache.hadoop.hbase.procedure2.store.ProcedureStore.ProcedureIterator; import org.apache.hadoop.hbase.procedure2.util.DelayedUtil; import org.apache.hadoop.hbase.procedure2.util.DelayedUtil.DelayedWithTimeout; import org.apache.hadoop.hbase.procedure2.util.StringUtils; import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.NonceKey; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Threads; /** * Thread Pool that executes the submitted procedures. * The executor has a ProcedureStore associated. * Each operation is logged and on restart the pending procedures are resumed. * * Unless the Procedure code throws an error (e.g. invalid user input) * the procedure will complete (at some point in time), On restart the pending * procedures are resumed and the once failed will be rolledback. * * The user can add procedures to the executor via submitProcedure(proc) * check for the finished state via isFinished(procId) * and get the result via getResult(procId) */ @InterfaceAudience.Private @InterfaceStability.Evolving public class ProcedureExecutor<TEnvironment> { private static final Log LOG = LogFactory.getLog(ProcedureExecutor.class); public static final String CHECK_OWNER_SET_CONF_KEY = "hbase.procedure.check.owner.set"; private static final boolean DEFAULT_CHECK_OWNER_SET = false; public static final String WORKER_KEEP_ALIVE_TIME_CONF_KEY = "hbase.procedure.worker.keep.alive.time.msec"; private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = Long.MAX_VALUE; Testing testing = null; public static class Testing { protected boolean killIfSuspended = false; protected boolean killBeforeStoreUpdate = false; protected boolean toggleKillBeforeStoreUpdate = false; protected boolean shouldKillBeforeStoreUpdate() { final boolean kill = this.killBeforeStoreUpdate; if (this.toggleKillBeforeStoreUpdate) { this.killBeforeStoreUpdate = !kill; LOG.warn("Toggle KILL before store update to: " + this.killBeforeStoreUpdate); } return kill; } protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) { return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate(); } } public interface ProcedureExecutorListener { void procedureLoaded(long procId); void procedureAdded(long procId); void procedureFinished(long procId); } /** * Internal cleaner that removes the completed procedure results after a TTL. * NOTE: This is a special case handled in timeoutLoop(). * * Since the client code looks more or less like: * procId = master.doOperation() * while (master.getProcResult(procId) == ProcInProgress); * The master should not throw away the proc result as soon as the procedure is done * but should wait a result request from the client (see executor.removeResult(procId)) * The client will call something like master.isProcDone() or master.getProcResult() * which will return the result/state to the client, and it will mark the completed * proc as ready to delete. note that the client may not receive the response from * the master (e.g. master failover) so, if we delay a bit the real deletion of * the proc result the client will be able to get the result the next try. */ private static class CompletedProcedureCleaner<TEnvironment> extends ProcedureInMemoryChore<TEnvironment> { private static final Log LOG = LogFactory.getLog(CompletedProcedureCleaner.class); private static final String CLEANER_INTERVAL_CONF_KEY = "hbase.procedure.cleaner.interval"; private static final int DEFAULT_CLEANER_INTERVAL = 30 * 1000; // 30sec private static final String EVICT_TTL_CONF_KEY = "hbase.procedure.cleaner.evict.ttl"; private static final int DEFAULT_EVICT_TTL = 15 * 60000; // 15min private static final String EVICT_ACKED_TTL_CONF_KEY ="hbase.procedure.cleaner.acked.evict.ttl"; private static final int DEFAULT_ACKED_EVICT_TTL = 5 * 60000; // 5min private static final String BATCH_SIZE_CONF_KEY = "hbase.procedure.cleaner.evict.batch.size"; private static final int DEFAULT_BATCH_SIZE = 32; private final Map<Long, ProcedureInfo> completed; private final Map<NonceKey, Long> nonceKeysToProcIdsMap; private final ProcedureStore store; private Configuration conf; public CompletedProcedureCleaner(final Configuration conf, final ProcedureStore store, final Map<Long, ProcedureInfo> completedMap, final Map<NonceKey, Long> nonceKeysToProcIdsMap) { // set the timeout interval that triggers the periodic-procedure super(conf.getInt(CLEANER_INTERVAL_CONF_KEY, DEFAULT_CLEANER_INTERVAL)); this.completed = completedMap; this.nonceKeysToProcIdsMap = nonceKeysToProcIdsMap; this.store = store; this.conf = conf; } @Override protected void periodicExecute(final TEnvironment env) { if (completed.isEmpty()) { if (LOG.isTraceEnabled()) { LOG.trace("No completed procedures to cleanup."); } return; } final long evictTtl = conf.getInt(EVICT_TTL_CONF_KEY, DEFAULT_EVICT_TTL); final long evictAckTtl = conf.getInt(EVICT_ACKED_TTL_CONF_KEY, DEFAULT_ACKED_EVICT_TTL); final int batchSize = conf.getInt(BATCH_SIZE_CONF_KEY, DEFAULT_BATCH_SIZE); final long[] batchIds = new long[batchSize]; int batchCount = 0; final long now = EnvironmentEdgeManager.currentTime(); final Iterator<Map.Entry<Long, ProcedureInfo>> it = completed.entrySet().iterator(); final boolean debugEnabled = LOG.isDebugEnabled(); while (it.hasNext() && store.isRunning()) { final Map.Entry<Long, ProcedureInfo> entry = it.next(); final ProcedureInfo procInfo = entry.getValue(); // TODO: Select TTL based on Procedure type if ((procInfo.hasClientAckTime() && (now - procInfo.getClientAckTime()) >= evictAckTtl) || (now - procInfo.getLastUpdate()) >= evictTtl) { if (debugEnabled) { LOG.debug("Evict completed " + procInfo); } batchIds[batchCount++] = entry.getKey(); if (batchCount == batchIds.length) { store.delete(batchIds, 0, batchCount); batchCount = 0; } it.remove(); final NonceKey nonceKey = procInfo.getNonceKey(); if (nonceKey != null) { nonceKeysToProcIdsMap.remove(nonceKey); } } } if (batchCount > 0) { store.delete(batchIds, 0, batchCount); } } } /** * Map the the procId returned by submitProcedure(), the Root-ProcID, to the ProcedureInfo. * Once a Root-Procedure completes (success or failure), the result will be added to this map. * The user of ProcedureExecutor should call getResult(procId) to get the result. */ private final ConcurrentHashMap<Long, ProcedureInfo> completed = new ConcurrentHashMap<>(); /** * Map the the procId returned by submitProcedure(), the Root-ProcID, to the RootProcedureState. * The RootProcedureState contains the execution stack of the Root-Procedure, * It is added to the map by submitProcedure() and removed on procedure completion. */ private final ConcurrentHashMap<Long, RootProcedureState> rollbackStack = new ConcurrentHashMap<>(); /** * Helper map to lookup the live procedures by ID. * This map contains every procedure. root-procedures and subprocedures. */ private final ConcurrentHashMap<Long, Procedure> procedures = new ConcurrentHashMap<>(); /** * Helper map to lookup whether the procedure already issued from the same client. * This map contains every root procedure. */ private final ConcurrentHashMap<NonceKey, Long> nonceKeysToProcIdsMap = new ConcurrentHashMap<>(); private final CopyOnWriteArrayList<ProcedureExecutorListener> listeners = new CopyOnWriteArrayList<>(); private Configuration conf; private ThreadGroup threadGroup; private CopyOnWriteArrayList<WorkerThread> workerThreads; private TimeoutExecutorThread timeoutExecutor; private int corePoolSize; private volatile long keepAliveTime = Long.MAX_VALUE; /** * Scheduler/Queue that contains runnable procedures. */ private final ProcedureScheduler scheduler; private final AtomicLong lastProcId = new AtomicLong(-1); private final AtomicLong workerId = new AtomicLong(0); private final AtomicInteger activeExecutorCount = new AtomicInteger(0); private final AtomicBoolean running = new AtomicBoolean(false); private final TEnvironment environment; private final ProcedureStore store; private final boolean checkOwnerSet; public ProcedureExecutor(final Configuration conf, final TEnvironment environment, final ProcedureStore store) { this(conf, environment, store, new SimpleProcedureScheduler()); } public ProcedureExecutor(final Configuration conf, final TEnvironment environment, final ProcedureStore store, final ProcedureScheduler scheduler) { this.environment = environment; this.scheduler = scheduler; this.store = store; this.conf = conf; this.checkOwnerSet = conf.getBoolean(CHECK_OWNER_SET_CONF_KEY, DEFAULT_CHECK_OWNER_SET); refreshConfiguration(conf); } private void load(final boolean abortOnCorruption) throws IOException { Preconditions.checkArgument(completed.isEmpty(), "completed not empty"); Preconditions.checkArgument(rollbackStack.isEmpty(), "rollback state not empty"); Preconditions.checkArgument(procedures.isEmpty(), "procedure map not empty"); Preconditions.checkArgument(scheduler.size() == 0, "run queue not empty"); store.load(new ProcedureStore.ProcedureLoader() { @Override public void setMaxProcId(long maxProcId) { assert lastProcId.get() < 0 : "expected only one call to setMaxProcId()"; LOG.debug("Load maxProcId=" + maxProcId); lastProcId.set(maxProcId); } @Override public void load(ProcedureIterator procIter) throws IOException { loadProcedures(procIter, abortOnCorruption); } @Override public void handleCorrupted(ProcedureIterator procIter) throws IOException { int corruptedCount = 0; while (procIter.hasNext()) { ProcedureInfo proc = procIter.nextAsProcedureInfo(); LOG.error("Corrupt " + proc); corruptedCount++; } if (abortOnCorruption && corruptedCount > 0) { throw new IOException("found " + corruptedCount + " corrupted procedure(s) on replay"); } } }); } private void loadProcedures(final ProcedureIterator procIter, final boolean abortOnCorruption) throws IOException { final boolean debugEnabled = LOG.isDebugEnabled(); // 1. Build the rollback stack int runnablesCount = 0; while (procIter.hasNext()) { final NonceKey nonceKey; final long procId; if (procIter.isNextFinished()) { ProcedureInfo proc = procIter.nextAsProcedureInfo(); nonceKey = proc.getNonceKey(); procId = proc.getProcId(); completed.put(proc.getProcId(), proc); if (debugEnabled) { LOG.debug("Completed " + proc); } } else { Procedure proc = procIter.nextAsProcedure(); nonceKey = proc.getNonceKey(); procId = proc.getProcId(); if (!proc.hasParent()) { assert !proc.isFinished() : "unexpected finished procedure"; rollbackStack.put(proc.getProcId(), new RootProcedureState()); } // add the procedure to the map proc.beforeReplay(getEnvironment()); procedures.put(proc.getProcId(), proc); if (proc.getState() == ProcedureState.RUNNABLE) { runnablesCount++; } } // add the nonce to the map if (nonceKey != null) { nonceKeysToProcIdsMap.put(nonceKey, procId); } } // 2. Initialize the stacks final ArrayList<Procedure> runnableList = new ArrayList(runnablesCount); HashSet<Procedure> waitingSet = null; procIter.reset(); while (procIter.hasNext()) { if (procIter.isNextFinished()) { procIter.skipNext(); continue; } Procedure proc = procIter.nextAsProcedure(); assert !(proc.isFinished() && !proc.hasParent()) : "unexpected completed proc=" + proc; if (debugEnabled) { LOG.debug(String.format("Loading %s", proc)); } Long rootProcId = getRootProcedureId(proc); if (rootProcId == null) { // The 'proc' was ready to run but the root procedure was rolledback? scheduler.addBack(proc); continue; } if (proc.hasParent()) { Procedure parent = procedures.get(proc.getParentProcId()); // corrupted procedures are handled later at step 3 if (parent != null && !proc.isFinished()) { parent.incChildrenLatch(); } } RootProcedureState procStack = rollbackStack.get(rootProcId); procStack.loadStack(proc); proc.setRootProcId(rootProcId); switch (proc.getState()) { case RUNNABLE: runnableList.add(proc); break; case WAITING: if (!proc.hasChildren()) { runnableList.add(proc); } break; case WAITING_TIMEOUT: if (waitingSet == null) { waitingSet = new HashSet<>(); } waitingSet.add(proc); break; case FAILED: // add the proc to the scheduler to perform the rollback scheduler.addBack(proc); break; case ROLLEDBACK: case INITIALIZING: String msg = "Unexpected " + proc.getState() + " state for " + proc; LOG.error(msg); throw new UnsupportedOperationException(msg); default: break; } } // 3. Validate the stacks int corruptedCount = 0; Iterator<Map.Entry<Long, RootProcedureState>> itStack = rollbackStack.entrySet().iterator(); while (itStack.hasNext()) { Map.Entry<Long, RootProcedureState> entry = itStack.next(); RootProcedureState procStack = entry.getValue(); if (procStack.isValid()) continue; for (Procedure proc: procStack.getSubproceduresStack()) { LOG.error("Corrupted " + proc); procedures.remove(proc.getProcId()); runnableList.remove(proc); if (waitingSet != null) waitingSet.remove(proc); corruptedCount++; } itStack.remove(); } if (abortOnCorruption && corruptedCount > 0) { throw new IOException("found " + corruptedCount + " procedures on replay"); } // 4. Push the procedures to the timeout executor if (waitingSet != null && !waitingSet.isEmpty()) { for (Procedure proc: waitingSet) { proc.afterReplay(getEnvironment()); timeoutExecutor.add(proc); } } // 5. Push the procedure to the scheduler if (!runnableList.isEmpty()) { // TODO: See ProcedureWALFormatReader#hasFastStartSupport // some procedure may be started way before this stuff. for (int i = runnableList.size() - 1; i >= 0; --i) { Procedure proc = runnableList.get(i); proc.afterReplay(getEnvironment()); if (!proc.hasParent()) { sendProcedureLoadedNotification(proc.getProcId()); } if (proc.wasExecuted()) { scheduler.addFront(proc); } else { // if it was not in execution, it can wait. scheduler.addBack(proc); } } } } /** * Start the procedure executor. * It calls ProcedureStore.recoverLease() and ProcedureStore.load() to * recover the lease, and ensure a single executor, and start the procedure * replay to resume and recover the previous pending and in-progress perocedures. * * @param numThreads number of threads available for procedure execution. * @param abortOnCorruption true if you want to abort your service in case * a corrupted procedure is found on replay. otherwise false. */ public void start(int numThreads, boolean abortOnCorruption) throws IOException { if (running.getAndSet(true)) { LOG.warn("Already running"); return; } // We have numThreads executor + one timer thread used for timing out // procedures and triggering periodic procedures. this.corePoolSize = numThreads; LOG.info("Starting executor worker threads=" + corePoolSize); // Create the Thread Group for the executors threadGroup = new ThreadGroup("ProcedureExecutor"); // Create the timeout executor timeoutExecutor = new TimeoutExecutorThread(threadGroup); // Create the workers workerId.set(0); workerThreads = new CopyOnWriteArrayList<>(); for (int i = 0; i < corePoolSize; ++i) { workerThreads.add(new WorkerThread(threadGroup)); } long st, et; // Acquire the store lease. st = EnvironmentEdgeManager.currentTime(); store.recoverLease(); et = EnvironmentEdgeManager.currentTime(); LOG.info(String.format("Recover store (%s) lease: %s", store.getClass().getSimpleName(), StringUtils.humanTimeDiff(et - st))); // start the procedure scheduler scheduler.start(); // TODO: Split in two steps. // TODO: Handle corrupted procedures (currently just a warn) // The first one will make sure that we have the latest id, // so we can start the threads and accept new procedures. // The second step will do the actual load of old procedures. st = EnvironmentEdgeManager.currentTime(); load(abortOnCorruption); et = EnvironmentEdgeManager.currentTime(); LOG.info(String.format("Load store (%s): %s", store.getClass().getSimpleName(), StringUtils.humanTimeDiff(et - st))); // Start the executors. Here we must have the lastProcId set. if (LOG.isTraceEnabled()) { LOG.trace("Start workers " + workerThreads.size()); } timeoutExecutor.start(); for (WorkerThread worker: workerThreads) { worker.start(); } // Internal chores timeoutExecutor.add(new WorkerMonitor()); // Add completed cleaner chore addChore(new CompletedProcedureCleaner(conf, store, completed, nonceKeysToProcIdsMap)); } public void stop() { if (!running.getAndSet(false)) { return; } LOG.info("Stopping"); scheduler.stop(); timeoutExecutor.sendStopSignal(); } @VisibleForTesting public void join() { assert !isRunning() : "expected not running"; // stop the timeout executor timeoutExecutor.awaitTermination(); timeoutExecutor = null; // stop the worker threads for (WorkerThread worker: workerThreads) { worker.awaitTermination(); } workerThreads = null; // Destroy the Thread Group for the executors try { threadGroup.destroy(); } catch (IllegalThreadStateException e) { LOG.error("Thread group " + threadGroup + " contains running threads"); threadGroup.list(); } finally { threadGroup = null; } // reset the in-memory state for testing completed.clear(); rollbackStack.clear(); procedures.clear(); nonceKeysToProcIdsMap.clear(); scheduler.clear(); lastProcId.set(-1); } public void refreshConfiguration(final Configuration conf) { this.conf = conf; setKeepAliveTime(conf.getLong(WORKER_KEEP_ALIVE_TIME_CONF_KEY, DEFAULT_WORKER_KEEP_ALIVE_TIME), TimeUnit.MILLISECONDS); } // ========================================================================== // Accessors // ========================================================================== public boolean isRunning() { return running.get(); } /** * @return the current number of worker threads. */ public int getWorkerThreadCount() { return workerThreads.size(); } /** * @return the core pool size settings. */ public int getCorePoolSize() { return corePoolSize; } public int getActiveExecutorCount() { return activeExecutorCount.get(); } public TEnvironment getEnvironment() { return this.environment; } public ProcedureStore getStore() { return this.store; } protected ProcedureScheduler getScheduler() { return scheduler; } public void setKeepAliveTime(final long keepAliveTime, final TimeUnit timeUnit) { this.keepAliveTime = timeUnit.toMillis(keepAliveTime); this.scheduler.signalAll(); } public long getKeepAliveTime(final TimeUnit timeUnit) { return timeUnit.convert(keepAliveTime, TimeUnit.MILLISECONDS); } // ========================================================================== // Submit/Remove Chores // ========================================================================== /** * Add a chore procedure to the executor * @param chore the chore to add */ public void addChore(final ProcedureInMemoryChore chore) { chore.setState(ProcedureState.WAITING_TIMEOUT); timeoutExecutor.add(chore); } /** * Remove a chore procedure from the executor * @param chore the chore to remove * @return whether the chore is removed, or it will be removed later */ public boolean removeChore(final ProcedureInMemoryChore chore) { chore.setState(ProcedureState.SUCCESS); return timeoutExecutor.remove(chore); } // ========================================================================== // Nonce Procedure helpers // ========================================================================== /** * Create a NoneKey from the specified nonceGroup and nonce. * @param nonceGroup * @param nonce * @return the generated NonceKey */ public NonceKey createNonceKey(final long nonceGroup, final long nonce) { return (nonce == HConstants.NO_NONCE) ? null : new NonceKey(nonceGroup, nonce); } /** * Register a nonce for a procedure that is going to be submitted. * A procId will be reserved and on submitProcedure(), * the procedure with the specified nonce will take the reserved ProcId. * If someone already reserved the nonce, this method will return the procId reserved, * otherwise an invalid procId will be returned. and the caller should procede * and submit the procedure. * * @param nonceKey A unique identifier for this operation from the client or process. * @return the procId associated with the nonce, if any otherwise an invalid procId. */ public long registerNonce(final NonceKey nonceKey) { if (nonceKey == null) return -1; // check if we have already a Reserved ID for the nonce Long oldProcId = nonceKeysToProcIdsMap.get(nonceKey); if (oldProcId == null) { // reserve a new Procedure ID, this will be associated with the nonce // and the procedure submitted with the specified nonce will use this ID. final long newProcId = nextProcId(); oldProcId = nonceKeysToProcIdsMap.putIfAbsent(nonceKey, newProcId); if (oldProcId == null) return -1; } // we found a registered nonce, but the procedure may not have been submitted yet. // since the client expect the procedure to be submitted, spin here until it is. final boolean traceEnabled = LOG.isTraceEnabled(); while (isRunning() && !(procedures.containsKey(oldProcId) || completed.containsKey(oldProcId)) && nonceKeysToProcIdsMap.containsKey(nonceKey)) { if (traceEnabled) { LOG.trace("Waiting for procId=" + oldProcId.longValue() + " to be submitted"); } Threads.sleep(100); } return oldProcId.longValue(); } /** * Remove the NonceKey if the procedure was not submitted to the executor. * @param nonceKey A unique identifier for this operation from the client or process. */ public void unregisterNonceIfProcedureWasNotSubmitted(final NonceKey nonceKey) { if (nonceKey == null) return; final Long procId = nonceKeysToProcIdsMap.get(nonceKey); if (procId == null) return; // if the procedure was not submitted, remove the nonce if (!(procedures.containsKey(procId) || completed.containsKey(procId))) { nonceKeysToProcIdsMap.remove(nonceKey); } } /** * If the failure failed before submitting it, we may want to give back the * same error to the requests with the same nonceKey. * * @param nonceKey A unique identifier for this operation from the client or process * @param procName name of the procedure, used to inform the user * @param procOwner name of the owner of the procedure, used to inform the user * @param exception the failure to report to the user */ public void setFailureResultForNonce(final NonceKey nonceKey, final String procName, final User procOwner, final IOException exception) { if (nonceKey == null) return; final Long procId = nonceKeysToProcIdsMap.get(nonceKey); if (procId == null || completed.containsKey(procId)) return; final long currentTime = EnvironmentEdgeManager.currentTime(); final ProcedureInfo result = new ProcedureInfo(procId.longValue(), procName, procOwner != null ? procOwner.getShortName() : null, ProcedureUtil.convertToProcedureState(ProcedureState.ROLLEDBACK), -1, nonceKey, exception, currentTime, currentTime, null); completed.putIfAbsent(procId, result); } // ========================================================================== // Submit/Abort Procedure // ========================================================================== /** * Add a new root-procedure to the executor. * @param proc the new procedure to execute. * @return the procedure id, that can be used to monitor the operation */ public long submitProcedure(final Procedure proc) { return submitProcedure(proc, null); } /** * Add a new root-procedure to the executor. * @param proc the new procedure to execute. * @param nonceKey the registered unique identifier for this operation from the client or process. * @return the procedure id, that can be used to monitor the operation */ @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH", justification = "FindBugs is blind to the check-for-null") public long submitProcedure(final Procedure proc, final NonceKey nonceKey) { Preconditions.checkArgument(lastProcId.get() >= 0); Preconditions.checkArgument(isRunning(), "executor not running"); prepareProcedure(proc); final Long currentProcId; if (nonceKey != null) { currentProcId = nonceKeysToProcIdsMap.get(nonceKey); Preconditions.checkArgument(currentProcId != null, "Expected nonceKey=" + nonceKey + " to be reserved, use registerNonce(); proc=" + proc); } else { currentProcId = nextProcId(); } // Initialize the procedure proc.setNonceKey(nonceKey); proc.setProcId(currentProcId.longValue()); // Commit the transaction store.insert(proc, null); if (LOG.isDebugEnabled()) { LOG.debug("Stored " + proc); } // Add the procedure to the executor return pushProcedure(proc); } /** * Add a set of new root-procedure to the executor. * @param procs the new procedures to execute. */ // TODO: Do we need to take nonces here? public void submitProcedures(final Procedure[] procs) { Preconditions.checkArgument(lastProcId.get() >= 0); Preconditions.checkArgument(isRunning(), "executor not running"); // Prepare procedure for (int i = 0; i < procs.length; ++i) { prepareProcedure(procs[i]).setProcId(nextProcId()); } // Commit the transaction store.insert(procs); if (LOG.isDebugEnabled()) { LOG.debug("Stored " + Arrays.toString(procs)); } // Add the procedure to the executor for (int i = 0; i < procs.length; ++i) { pushProcedure(procs[i]); } } private Procedure prepareProcedure(final Procedure proc) { Preconditions.checkArgument(proc.getState() == ProcedureState.INITIALIZING); Preconditions.checkArgument(isRunning(), "executor not running"); Preconditions.checkArgument(!proc.hasParent(), "unexpected parent", proc); if (this.checkOwnerSet) { Preconditions.checkArgument(proc.hasOwner(), "missing owner"); } return proc; } private long pushProcedure(final Procedure proc) { final long currentProcId = proc.getProcId(); // Update metrics on start of a procedure proc.updateMetricsOnSubmit(getEnvironment()); // Create the rollback stack for the procedure RootProcedureState stack = new RootProcedureState(); rollbackStack.put(currentProcId, stack); // Submit the new subprocedures assert !procedures.containsKey(currentProcId); procedures.put(currentProcId, proc); sendProcedureAddedNotification(currentProcId); scheduler.addBack(proc); return proc.getProcId(); } /** * Send an abort notification the specified procedure. * Depending on the procedure implementation the abort can be considered or ignored. * @param procId the procedure to abort * @return true if the procedure exist and has received the abort, otherwise false. */ public boolean abort(final long procId) { return abort(procId, true); } /** * Send an abort notification the specified procedure. * Depending on the procedure implementation the abort can be considered or ignored. * @param procId the procedure to abort * @param mayInterruptIfRunning if the proc completed at least one step, should it be aborted? * @return true if the procedure exist and has received the abort, otherwise false. */ public boolean abort(final long procId, final boolean mayInterruptIfRunning) { final Procedure proc = procedures.get(procId); if (proc != null) { if (!mayInterruptIfRunning && proc.wasExecuted()) { return false; } return proc.abort(getEnvironment()); } return false; } // ========================================================================== // Executor query helpers // ========================================================================== public Procedure getProcedure(final long procId) { return procedures.get(procId); } public <T extends Procedure> T getProcedure(final Class<T> clazz, final long procId) { final Procedure proc = getProcedure(procId); if (clazz.isInstance(proc)) { return (T)proc; } return null; } public ProcedureInfo getResult(final long procId) { return completed.get(procId); } /** * Return true if the procedure is finished. * The state may be "completed successfully" or "failed and rolledback". * Use getResult() to check the state or get the result data. * @param procId the ID of the procedure to check * @return true if the procedure execution is finished, otherwise false. */ public boolean isFinished(final long procId) { return !procedures.containsKey(procId); } /** * Return true if the procedure is started. * @param procId the ID of the procedure to check * @return true if the procedure execution is started, otherwise false. */ public boolean isStarted(final long procId) { final Procedure proc = procedures.get(procId); if (proc == null) { return completed.get(procId) != null; } return proc.wasExecuted(); } /** * Mark the specified completed procedure, as ready to remove. * @param procId the ID of the procedure to remove */ public void removeResult(final long procId) { final ProcedureInfo result = completed.get(procId); if (result == null) { assert !procedures.containsKey(procId) : "procId=" + procId + " is still running"; if (LOG.isDebugEnabled()) { LOG.debug("procId=" + procId + " already removed by the cleaner."); } return; } // The CompletedProcedureCleaner will take care of deletion, once the TTL is expired. result.setClientAckTime(EnvironmentEdgeManager.currentTime()); } public Pair<ProcedureInfo, Procedure> getResultOrProcedure(final long procId) { ProcedureInfo result = completed.get(procId); Procedure proc = null; if (result == null) { proc = procedures.get(procId); if (proc == null) { result = completed.get(procId); } } return new Pair(result, proc); } /** * Check if the user is this procedure's owner * @param procId the target procedure * @param user the user * @return true if the user is the owner of the procedure, * false otherwise or the owner is unknown. */ public boolean isProcedureOwner(final long procId, final User user) { if (user == null) return false; final Procedure proc = procedures.get(procId); if (proc != null) { return proc.getOwner().equals(user.getShortName()); } final ProcedureInfo procInfo = completed.get(procId); if (procInfo == null) { // Procedure either does not exist or has already completed and got cleaned up. // At this time, we cannot check the owner of the procedure return false; } return ProcedureInfo.isProcedureOwner(procInfo, user); } /** * List procedures. * @return the procedures in a list */ public List<ProcedureInfo> listProcedures() { final List<ProcedureInfo> procedureLists = new ArrayList<>(procedures.size() + completed.size()); for (Map.Entry<Long, Procedure> p: procedures.entrySet()) { procedureLists.add(ProcedureUtil.convertToProcedureInfo(p.getValue())); } for (Map.Entry<Long, ProcedureInfo> e: completed.entrySet()) { // Note: The procedure could show up twice in the list with different state, as // it could complete after we walk through procedures list and insert into // procedureList - it is ok, as we will use the information in the ProcedureInfo // to figure it out; to prevent this would increase the complexity of the logic. procedureLists.add(e.getValue()); } return procedureLists; } // ========================================================================== // Listeners helpers // ========================================================================== public void registerListener(ProcedureExecutorListener listener) { this.listeners.add(listener); } public boolean unregisterListener(ProcedureExecutorListener listener) { return this.listeners.remove(listener); } private void sendProcedureLoadedNotification(final long procId) { if (!this.listeners.isEmpty()) { for (ProcedureExecutorListener listener: this.listeners) { try { listener.procedureLoaded(procId); } catch (Throwable e) { LOG.error("Listener " + listener + " had an error: " + e.getMessage(), e); } } } } private void sendProcedureAddedNotification(final long procId) { if (!this.listeners.isEmpty()) { for (ProcedureExecutorListener listener: this.listeners) { try { listener.procedureAdded(procId); } catch (Throwable e) { LOG.error("Listener " + listener + " had an error: " + e.getMessage(), e); } } } } private void sendProcedureFinishedNotification(final long procId) { if (!this.listeners.isEmpty()) { for (ProcedureExecutorListener listener: this.listeners) { try { listener.procedureFinished(procId); } catch (Throwable e) { LOG.error("Listener " + listener + " had an error: " + e.getMessage(), e); } } } } // ========================================================================== // Procedure IDs helpers // ========================================================================== private long nextProcId() { long procId = lastProcId.incrementAndGet(); if (procId < 0) { while (!lastProcId.compareAndSet(procId, 0)) { procId = lastProcId.get(); if (procId >= 0) break; } while (procedures.containsKey(procId)) { procId = lastProcId.incrementAndGet(); } } assert procId >= 0 : "Invalid procId " + procId; return procId; } @VisibleForTesting protected long getLastProcId() { return lastProcId.get(); } @VisibleForTesting public Set<Long> getActiveProcIds() { return procedures.keySet(); } private Long getRootProcedureId(Procedure proc) { return Procedure.getRootProcedureId(procedures, proc); } // ========================================================================== // Executions // ========================================================================== private void executeProcedure(final Procedure proc) { final Long rootProcId = getRootProcedureId(proc); if (rootProcId == null) { // The 'proc' was ready to run but the root procedure was rolledback executeRollback(proc); return; } final RootProcedureState procStack = rollbackStack.get(rootProcId); if (procStack == null) return; do { // Try to acquire the execution if (!procStack.acquire(proc)) { if (procStack.setRollback()) { // we have the 'rollback-lock' we can start rollingback switch (executeRollback(rootProcId, procStack)) { case LOCK_ACQUIRED: break; case LOCK_YIELD_WAIT: procStack.unsetRollback(); scheduler.yield(proc); break; case LOCK_EVENT_WAIT: procStack.unsetRollback(); break; default: throw new UnsupportedOperationException(); } } else { // if we can't rollback means that some child is still running. // the rollback will be executed after all the children are done. // If the procedure was never executed, remove and mark it as rolledback. if (!proc.wasExecuted()) { switch (executeRollback(proc)) { case LOCK_ACQUIRED: break; case LOCK_YIELD_WAIT: scheduler.yield(proc); break; case LOCK_EVENT_WAIT: break; default: throw new UnsupportedOperationException(); } } } break; } // Execute the procedure assert proc.getState() == ProcedureState.RUNNABLE : proc; switch (acquireLock(proc)) { case LOCK_ACQUIRED: execProcedure(procStack, proc); releaseLock(proc, false); break; case LOCK_YIELD_WAIT: scheduler.yield(proc); break; case LOCK_EVENT_WAIT: // someone will wake us up when the lock is available break; default: throw new UnsupportedOperationException(); } procStack.release(proc); // allows to kill the executor before something is stored to the wal. // useful to test the procedure recovery. if (testing != null && !isRunning()) { break; } if (proc.isSuccess()) { // update metrics on finishing the procedure proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true); if (LOG.isDebugEnabled()) { LOG.debug("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime())); } // Finalize the procedure state if (proc.getProcId() == rootProcId) { procedureFinished(proc); } else { execCompletionCleanup(proc); } break; } } while (procStack.isFailed()); } private LockState acquireLock(final Procedure proc) { final TEnvironment env = getEnvironment(); // hasLock() is used in conjunction with holdLock(). // This allows us to not rewrite or carry around the hasLock() flag // for every procedure. the hasLock() have meaning only if holdLock() is true. if (proc.holdLock(env) && proc.hasLock(env)) { return LockState.LOCK_ACQUIRED; } return proc.doAcquireLock(env); } private void releaseLock(final Procedure proc, final boolean force) { final TEnvironment env = getEnvironment(); // for how the framework works, we know that we will always have the lock // when we call releaseLock(), so we can avoid calling proc.hasLock() if (force || !proc.holdLock(env)) { proc.doReleaseLock(env); } } /** * Execute the rollback of the full procedure stack. * Once the procedure is rolledback, the root-procedure will be visible as * finished to user, and the result will be the fatal exception. */ private LockState executeRollback(final long rootProcId, final RootProcedureState procStack) { final Procedure rootProc = procedures.get(rootProcId); RemoteProcedureException exception = rootProc.getException(); if (exception == null) { exception = procStack.getException(); rootProc.setFailure(exception); store.update(rootProc); } final List<Procedure> subprocStack = procStack.getSubproceduresStack(); assert subprocStack != null : "Called rollback with no steps executed rootProc=" + rootProc; int stackTail = subprocStack.size(); boolean reuseLock = false; while (stackTail --> 0) { final Procedure proc = subprocStack.get(stackTail); LockState lockState; if (!reuseLock && (lockState = acquireLock(proc)) != LockState.LOCK_ACQUIRED) { // can't take a lock on the procedure, add the root-proc back on the // queue waiting for the lock availability return lockState; } lockState = executeRollback(proc); boolean abortRollback = lockState != LockState.LOCK_ACQUIRED; abortRollback |= !isRunning() || !store.isRunning(); // If the next procedure is the same to this one // (e.g. StateMachineProcedure reuse the same instance) // we can avoid to lock/unlock each step reuseLock = stackTail > 0 && (subprocStack.get(stackTail - 1) == proc) && !abortRollback; if (!reuseLock) { releaseLock(proc, false); } // allows to kill the executor before something is stored to the wal. // useful to test the procedure recovery. if (abortRollback) { return lockState; } subprocStack.remove(stackTail); // if the procedure is kind enough to pass the slot to someone else, yield if (proc.isYieldAfterExecutionStep(getEnvironment())) { return LockState.LOCK_YIELD_WAIT; } if (proc != rootProc) { execCompletionCleanup(proc); } } // Finalize the procedure state LOG.info("Rolled back " + rootProc + " exec-time=" + StringUtils.humanTimeDiff(rootProc.elapsedTime())); procedureFinished(rootProc); return LockState.LOCK_ACQUIRED; } /** * Execute the rollback of the procedure step. * It updates the store with the new state (stack index) * or will remove completly the procedure in case it is a child. */ private LockState executeRollback(final Procedure proc) { try { proc.doRollback(getEnvironment()); } catch (IOException e) { if (LOG.isDebugEnabled()) { LOG.debug("Roll back attempt failed for " + proc, e); } return LockState.LOCK_YIELD_WAIT; } catch (InterruptedException e) { handleInterruptedException(proc, e); return LockState.LOCK_YIELD_WAIT; } catch (Throwable e) { // Catch NullPointerExceptions or similar errors... LOG.fatal("CODE-BUG: Uncatched runtime exception for procedure: " + proc, e); } // allows to kill the executor before something is stored to the wal. // useful to test the procedure recovery. if (testing != null && testing.shouldKillBeforeStoreUpdate()) { LOG.debug("TESTING: Kill before store update"); stop(); return LockState.LOCK_YIELD_WAIT; } if (proc.removeStackIndex()) { proc.setState(ProcedureState.ROLLEDBACK); // update metrics on finishing the procedure (fail) proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), false); if (proc.hasParent()) { store.delete(proc.getProcId()); procedures.remove(proc.getProcId()); } else { final long[] childProcIds = rollbackStack.get(proc.getProcId()).getSubprocedureIds(); if (childProcIds != null) { store.delete(proc, childProcIds); } else { store.update(proc); } } } else { store.update(proc); } return LockState.LOCK_ACQUIRED; } /** * Executes the specified procedure * - calls the doExecute() of the procedure * - if the procedure execution didn't fail (e.g. invalid user input) * - ...and returned subprocedures * - the subprocedures are initialized. * - the subprocedures are added to the store * - the subprocedures are added to the runnable queue * - the procedure is now in a WAITING state, waiting for the subprocedures to complete * - ...if there are no subprocedure * - the procedure completed successfully * - if there is a parent (WAITING) * - the parent state will be set to RUNNABLE * - in case of failure * - the store is updated with the new state * - the executor (caller of this method) will start the rollback of the procedure */ private void execProcedure(final RootProcedureState procStack, final Procedure procedure) { Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE); // Execute the procedure boolean suspended = false; boolean reExecute = false; Procedure[] subprocs = null; do { reExecute = false; try { subprocs = procedure.doExecute(getEnvironment()); if (subprocs != null && subprocs.length == 0) { subprocs = null; } } catch (ProcedureSuspendedException e) { suspended = true; } catch (ProcedureYieldException e) { if (LOG.isTraceEnabled()) { LOG.trace("Yield " + procedure + ": " + e.getMessage()); } scheduler.yield(procedure); return; } catch (InterruptedException e) { handleInterruptedException(procedure, e); scheduler.yield(procedure); return; } catch (Throwable e) { // Catch NullPointerExceptions or similar errors... String msg = "CODE-BUG: Uncaught runtime exception: " + procedure; LOG.error(msg, e); procedure.setFailure(new RemoteProcedureException(msg, e)); } if (!procedure.isFailed()) { if (subprocs != null) { if (subprocs.length == 1 && subprocs[0] == procedure) { // quick-shortcut for a state machine like procedure subprocs = null; reExecute = true; } else { // yield the current procedure, and make the subprocedure runnable subprocs = initializeChildren(procStack, procedure, subprocs); } } else if (procedure.getState() == ProcedureState.WAITING_TIMEOUT) { timeoutExecutor.add(procedure); } else if (!suspended) { // No subtask, so we are done procedure.setState(ProcedureState.SUCCESS); } } // Add the procedure to the stack procStack.addRollbackStep(procedure); // allows to kill the executor before something is stored to the wal. // useful to test the procedure recovery. if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) { LOG.debug("TESTING: Kill before store update: " + procedure); stop(); return; } // TODO: The code here doesn't check if store is running before persisting to the store as // it relies on the method call below to throw RuntimeException to wind up the stack and // executor thread to stop. The statement following the method call below seems to check if // store is not running, to prevent scheduling children procedures, re-execution or yield // of this procedure. This may need more scrutiny and subsequent cleanup in future // Commit the transaction updateStoreOnExec(procStack, procedure, subprocs); // if the store is not running we are aborting if (!store.isRunning()) return; // if the procedure is kind enough to pass the slot to someone else, yield if (procedure.isRunnable() && !suspended && procedure.isYieldAfterExecutionStep(getEnvironment())) { scheduler.yield(procedure); return; } assert (reExecute && subprocs == null) || !reExecute; } while (reExecute); // Submit the new subprocedures if (subprocs != null && !procedure.isFailed()) { submitChildrenProcedures(subprocs); } // if the procedure is complete and has a parent, count down the children latch if (procedure.isFinished() && procedure.hasParent()) { countDownChildren(procStack, procedure); } } private Procedure[] initializeChildren(final RootProcedureState procStack, final Procedure procedure, final Procedure[] subprocs) { assert subprocs != null : "expected subprocedures"; final long rootProcId = getRootProcedureId(procedure); for (int i = 0; i < subprocs.length; ++i) { final Procedure subproc = subprocs[i]; if (subproc == null) { String msg = "subproc[" + i + "] is null, aborting the procedure"; procedure.setFailure(new RemoteProcedureException(msg, new IllegalArgumentIOException(msg))); return null; } assert subproc.getState() == ProcedureState.INITIALIZING : subproc; subproc.setParentProcId(procedure.getProcId()); subproc.setRootProcId(rootProcId); subproc.setProcId(nextProcId()); procStack.addSubProcedure(subproc); } if (!procedure.isFailed()) { procedure.setChildrenLatch(subprocs.length); switch (procedure.getState()) { case RUNNABLE: procedure.setState(ProcedureState.WAITING); break; case WAITING_TIMEOUT: timeoutExecutor.add(procedure); break; default: break; } } return subprocs; } private void submitChildrenProcedures(final Procedure[] subprocs) { for (int i = 0; i < subprocs.length; ++i) { final Procedure subproc = subprocs[i]; subproc.updateMetricsOnSubmit(getEnvironment()); assert !procedures.containsKey(subproc.getProcId()); procedures.put(subproc.getProcId(), subproc); scheduler.addFront(subproc); } } private void countDownChildren(final RootProcedureState procStack, final Procedure procedure) { final Procedure parent = procedures.get(procedure.getParentProcId()); if (parent == null) { assert procStack.isRollingback(); return; } // If this procedure is the last child awake the parent procedure final boolean traceEnabled = LOG.isTraceEnabled(); if (traceEnabled) { LOG.trace(parent + " child is done: " + procedure); } if (parent.childrenCountDown() && parent.getState() == ProcedureState.WAITING) { parent.setState(ProcedureState.RUNNABLE); store.update(parent); scheduler.addFront(parent); if (traceEnabled) { LOG.trace(parent + " all the children finished their work, resume."); } return; } } private void updateStoreOnExec(final RootProcedureState procStack, final Procedure procedure, final Procedure[] subprocs) { if (subprocs != null && !procedure.isFailed()) { if (LOG.isTraceEnabled()) { LOG.trace("Stored " + procedure + ", children " + Arrays.toString(subprocs)); } store.insert(procedure, subprocs); } else { if (LOG.isTraceEnabled()) { LOG.trace("Store update " + procedure); } if (procedure.isFinished() && !procedure.hasParent()) { // remove child procedures final long[] childProcIds = procStack.getSubprocedureIds(); if (childProcIds != null) { store.delete(procedure, childProcIds); for (int i = 0; i < childProcIds.length; ++i) { procedures.remove(childProcIds[i]); } } else { store.update(procedure); } } else { store.update(procedure); } } } private void handleInterruptedException(final Procedure proc, final InterruptedException e) { if (LOG.isTraceEnabled()) { LOG.trace("Interrupt during " + proc + ". suspend and retry it later.", e); } // NOTE: We don't call Thread.currentThread().interrupt() // because otherwise all the subsequent calls e.g. Thread.sleep() will throw // the InterruptedException. If the master is going down, we will be notified // and the executor/store will be stopped. // (The interrupted procedure will be retried on the next run) } private void execCompletionCleanup(final Procedure proc) { final TEnvironment env = getEnvironment(); if (proc.holdLock(env) && proc.hasLock(env)) { releaseLock(proc, true); } try { proc.completionCleanup(env); } catch (Throwable e) { // Catch NullPointerExceptions or similar errors... LOG.error("CODE-BUG: uncatched runtime exception for procedure: " + proc, e); } } private void procedureFinished(final Procedure proc) { // call the procedure completion cleanup handler execCompletionCleanup(proc); // update the executor internal state maps final ProcedureInfo procInfo = ProcedureUtil.convertToProcedureInfo(proc, proc.getNonceKey()); if (!proc.shouldWaitClientAck(getEnvironment())) { procInfo.setClientAckTime(0); } completed.put(procInfo.getProcId(), procInfo); rollbackStack.remove(proc.getProcId()); procedures.remove(proc.getProcId()); // call the runnableSet completion cleanup handler try { scheduler.completionCleanup(proc); } catch (Throwable e) { // Catch NullPointerExceptions or similar errors... LOG.error("CODE-BUG: uncatched runtime exception for completion cleanup: " + proc, e); } // Notify the listeners sendProcedureFinishedNotification(proc.getProcId()); } // ========================================================================== // Worker Thread // ========================================================================== private final class WorkerThread extends StoppableThread { private final AtomicLong executionStartTime = new AtomicLong(Long.MAX_VALUE); public WorkerThread(final ThreadGroup group) { super(group, "ProcExecWorker-" + workerId.incrementAndGet()); } @Override public void sendStopSignal() { scheduler.signalAll(); } @Override public void run() { final boolean traceEnabled = LOG.isTraceEnabled(); long lastUpdate = EnvironmentEdgeManager.currentTime(); while (isRunning() && keepAlive(lastUpdate)) { final Procedure procedure = scheduler.poll(keepAliveTime, TimeUnit.MILLISECONDS); if (procedure == null) continue; store.setRunningProcedureCount(activeExecutorCount.incrementAndGet()); executionStartTime.set(EnvironmentEdgeManager.currentTime()); try { if (traceEnabled) { LOG.trace("Trying to start the execution of " + procedure); } executeProcedure(procedure); } finally { store.setRunningProcedureCount(activeExecutorCount.decrementAndGet()); lastUpdate = EnvironmentEdgeManager.currentTime(); executionStartTime.set(Long.MAX_VALUE); } } LOG.debug("Worker thread terminated " + this); workerThreads.remove(this); } /** * @return the time since the current procedure is running */ public long getCurrentRunTime() { return EnvironmentEdgeManager.currentTime() - executionStartTime.get(); } private boolean keepAlive(final long lastUpdate) { if (workerThreads.size() <= corePoolSize) return true; return (EnvironmentEdgeManager.currentTime() - lastUpdate) < keepAliveTime; } } // ========================================================================== // Timeout Thread // ========================================================================== private final class TimeoutExecutorThread extends StoppableThread { private final DelayQueue<DelayedWithTimeout> queue = new DelayQueue<>(); public TimeoutExecutorThread(final ThreadGroup group) { super(group, "ProcedureTimeoutExecutor"); } @Override public void sendStopSignal() { queue.add(DelayedUtil.DELAYED_POISON); } @Override public void run() { final boolean isTraceEnabled = LOG.isTraceEnabled(); while (isRunning()) { final DelayedWithTimeout task = DelayedUtil.takeWithoutInterrupt(queue); if (task == null || task == DelayedUtil.DELAYED_POISON) { // the executor may be shutting down, // and the task is just the shutdown request continue; } if (isTraceEnabled) { LOG.trace("Trying to start the execution of " + task); } // execute the task if (task instanceof InlineChore) { execInlineChore((InlineChore)task); } else if (task instanceof DelayedProcedure) { execDelayedProcedure((DelayedProcedure)task); } else { LOG.error("CODE-BUG unknown timeout task type " + task); } } } public void add(final InlineChore chore) { chore.refreshTimeout(); queue.add(chore); } public void add(final Procedure procedure) { assert procedure.getState() == ProcedureState.WAITING_TIMEOUT; queue.add(new DelayedProcedure(procedure)); } public boolean remove(final Procedure procedure) { return queue.remove(new DelayedProcedure(procedure)); } private void execInlineChore(final InlineChore chore) { chore.run(); add(chore); } private void execDelayedProcedure(final DelayedProcedure delayed) { // TODO: treat this as a normal procedure, add it to the scheduler and // let one of the workers handle it. // Today we consider ProcedureInMemoryChore as InlineChores final Procedure procedure = delayed.getObject(); if (procedure instanceof ProcedureInMemoryChore) { executeInMemoryChore((ProcedureInMemoryChore)procedure); // if the procedure is in a waiting state again, put it back in the queue procedure.updateTimestamp(); if (procedure.isWaiting()) { delayed.setTimeout(procedure.getTimeoutTimestamp()); queue.add(delayed); } } else { executeTimedoutProcedure(procedure); } } private void executeInMemoryChore(final ProcedureInMemoryChore chore) { if (!chore.isWaiting()) return; // The ProcedureInMemoryChore is a special case, and it acts as a chore. // instead of bringing the Chore class in, we reuse this timeout thread for // this special case. try { chore.periodicExecute(getEnvironment()); } catch (Throwable e) { LOG.error("Ignoring " + chore + " exception: " + e.getMessage(), e); } } private void executeTimedoutProcedure(final Procedure proc) { // The procedure received a timeout. if the procedure itself does not handle it, // call abort() and add the procedure back in the queue for rollback. if (proc.setTimeoutFailure(getEnvironment())) { long rootProcId = Procedure.getRootProcedureId(procedures, proc); RootProcedureState procStack = rollbackStack.get(rootProcId); procStack.abort(); store.update(proc); scheduler.addFront(proc); } } } private static final class DelayedProcedure extends DelayedUtil.DelayedContainerWithTimestamp<Procedure> { public DelayedProcedure(final Procedure procedure) { super(procedure, procedure.getTimeoutTimestamp()); } } private static abstract class StoppableThread extends Thread { public StoppableThread(final ThreadGroup group, final String name) { super(group, name); } public abstract void sendStopSignal(); public void awaitTermination() { try { final long startTime = EnvironmentEdgeManager.currentTime(); for (int i = 0; isAlive(); ++i) { sendStopSignal(); join(250); if (i > 0 && (i % 8) == 0) { LOG.warn("Waiting termination of thread " + getName() + ", " + StringUtils.humanTimeDiff(EnvironmentEdgeManager.currentTime() - startTime)); } } } catch (InterruptedException e) { LOG.warn(getName() + " join wait got interrupted", e); } } } // ========================================================================== // Inline Chores (executors internal chores) // ========================================================================== private static abstract class InlineChore extends DelayedUtil.DelayedObject implements Runnable { private long timeout; public abstract int getTimeoutInterval(); protected void refreshTimeout() { this.timeout = EnvironmentEdgeManager.currentTime() + getTimeoutInterval(); } @Override public long getTimeout() { return timeout; } } // ---------------------------------------------------------------------------- // TODO-MAYBE: Should we provide a InlineChore to notify the store with the // full set of procedures pending and completed to write a compacted // version of the log (in case is a log)? // In theory no, procedures are have a short life, so at some point the store // will have the tracker saying everything is in the last log. // ---------------------------------------------------------------------------- private final class WorkerMonitor extends InlineChore { public static final String WORKER_MONITOR_INTERVAL_CONF_KEY = "hbase.procedure.worker.monitor.interval.msec"; private static final int DEFAULT_WORKER_MONITOR_INTERVAL = 5000; // 5sec public static final String WORKER_STUCK_THRESHOLD_CONF_KEY = "hbase.procedure.worker.stuck.threshold.msec"; private static final int DEFAULT_WORKER_STUCK_THRESHOLD = 10000; // 10sec public static final String WORKER_ADD_STUCK_PERCENTAGE_CONF_KEY = "hbase.procedure.worker.add.stuck.percentage"; private static final float DEFAULT_WORKER_ADD_STUCK_PERCENTAGE = 0.5f; // 50% stuck private float addWorkerStuckPercentage = DEFAULT_WORKER_ADD_STUCK_PERCENTAGE; private int timeoutInterval = DEFAULT_WORKER_MONITOR_INTERVAL; private int stuckThreshold = DEFAULT_WORKER_STUCK_THRESHOLD; public WorkerMonitor() { refreshConfig(); } @Override public void run() { final int stuckCount = checkForStuckWorkers(); checkThreadCount(stuckCount); // refresh interval (poor man dynamic conf update) refreshConfig(); } private int checkForStuckWorkers() { // check if any of the worker is stuck int stuckCount = 0; for (WorkerThread worker: workerThreads) { if (worker.getCurrentRunTime() < stuckThreshold) { continue; } // WARN the worker is stuck stuckCount++; LOG.warn("Worker stuck " + worker + " run time " + StringUtils.humanTimeDiff(worker.getCurrentRunTime())); } return stuckCount; } private void checkThreadCount(final int stuckCount) { // nothing to do if there are no runnable tasks if (stuckCount < 1 || !scheduler.hasRunnables()) return; // add a new thread if the worker stuck percentage exceed the threshold limit // and every handler is active. final float stuckPerc = ((float)stuckCount) / workerThreads.size(); if (stuckPerc >= addWorkerStuckPercentage && activeExecutorCount.get() == workerThreads.size()) { final WorkerThread worker = new WorkerThread(threadGroup); workerThreads.add(worker); worker.start(); LOG.debug("Added new worker thread " + worker); } } private void refreshConfig() { addWorkerStuckPercentage = conf.getFloat(WORKER_ADD_STUCK_PERCENTAGE_CONF_KEY, DEFAULT_WORKER_ADD_STUCK_PERCENTAGE); timeoutInterval = conf.getInt(WORKER_MONITOR_INTERVAL_CONF_KEY, DEFAULT_WORKER_MONITOR_INTERVAL); stuckThreshold = conf.getInt(WORKER_STUCK_THRESHOLD_CONF_KEY, DEFAULT_WORKER_STUCK_THRESHOLD); } @Override public int getTimeoutInterval() { return timeoutInterval; } } }