/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.procedure; import java.io.IOException; import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.errorhandling.ForeignException; import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher; import org.apache.hadoop.hbase.errorhandling.ForeignExceptionListener; import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; import org.apache.hadoop.hbase.errorhandling.TimeoutExceptionInjector; /** * Distributed procedure member's Subprocedure. A procedure is sarted on a ProcedureCoordinator * which communicates with ProcedureMembers who create and start its part of the Procedure. This * sub part is called a Subprocedure * * Users should subclass this and implement {@link #acquireBarrier()} (get local barrier for this * member), {@link #insideBarrier()} (execute while globally barriered and release barrier) and * {@link #cleanup(Exception)} (release state associated with subprocedure.) * * When submitted to a ProcedureMemeber, the call method is executed in a separate thread. * Latches are use too block its progress and trigger continuations when barrier conditions are * met. * * Exception that makes it out of calls to {@link #acquireBarrier()} or {@link #insideBarrier()} * gets converted into {@link ForeignException}, which will get propagated to the * {@link ProcedureCoordinator}. * * There is a category of procedure (ex: online-snapshots), and a user-specified instance-specific * barrierName. (ex: snapshot121126). */ abstract public class Subprocedure implements Callable<Void> { private static final Log LOG = LogFactory.getLog(Subprocedure.class); // Name of the procedure final private String barrierName; // // Execution state // /** wait on before allowing the in barrier phase to proceed */ private final CountDownLatch inGlobalBarrier; /** counted down when the Subprocedure has completed */ private final CountDownLatch releasedLocalBarrier; // // Error handling // /** monitor to check for errors */ protected final ForeignExceptionDispatcher monitor; /** frequency to check for errors (ms) */ protected final long wakeFrequency; protected final TimeoutExceptionInjector executionTimeoutTimer; protected final ProcedureMemberRpcs rpcs; private volatile boolean complete = false; /** * @param member reference to the member managing this subprocedure * @param procName name of the procedure this subprocedure is associated with * @param monitor notified if there is an error in the subprocedure * @param wakeFrequency time in millis to wake to check if there is an error via the monitor (in * milliseconds). * @param timeout time in millis that will trigger a subprocedure abort if it has not completed */ public Subprocedure(ProcedureMember member, String procName, ForeignExceptionDispatcher monitor, long wakeFrequency, long timeout) { // Asserts should be caught during unit testing assert member != null : "procedure member should be non-null"; assert member.getRpcs() != null : "rpc handlers should be non-null"; assert procName != null : "procedure name should be non-null"; assert monitor != null : "monitor should be non-null"; // Default to a very large timeout this.rpcs = member.getRpcs(); this.barrierName = procName; this.monitor = monitor; // forward any failures to coordinator. Since this is a dispatcher, resend loops should not be // possible. this.monitor.addListener(new ForeignExceptionListener() { @Override public void receive(ForeignException ee) { // if this is a notification from a remote source, just log if (ee.isRemote()) { LOG.debug("Was remote foreign exception, not redispatching error", ee); return; } // if it is local, then send it to the coordinator try { rpcs.sendMemberAborted(Subprocedure.this, ee); } catch (IOException e) { // this will fail all the running procedures, since the connection is down LOG.error("Can't reach controller, not propagating error", e); } } }); this.wakeFrequency = wakeFrequency; this.inGlobalBarrier = new CountDownLatch(1); this.releasedLocalBarrier = new CountDownLatch(1); // accept error from timer thread, this needs to be started. this.executionTimeoutTimer = new TimeoutExceptionInjector(monitor, timeout); } public String getName() { return barrierName; } public String getMemberName() { return rpcs.getMemberName(); } private void rethrowException() throws ForeignException { monitor.rethrowException(); } /** * Execute the Subprocedure {@link #acquireBarrier()} and {@link #insideBarrier()} methods * while keeping some state for other threads to access. * * This would normally be executed by the ProcedureMemeber when a acquire message comes from the * coordinator. Rpcs are used to spend message back to the coordinator after different phases * are executed. Any exceptions caught during the execution (except for InterruptedException) get * converted and propagated to coordinator via {@link ProcedureMemberRpcs#sendMemberAborted( * Subprocedure, ForeignException)}. */ @SuppressWarnings("finally") final public Void call() { LOG.debug("Starting subprocedure '" + barrierName + "' with timeout " + executionTimeoutTimer.getMaxTime() + "ms"); // start the execution timeout timer executionTimeoutTimer.start(); try { // start by checking for error first rethrowException(); LOG.debug("Subprocedure '" + barrierName + "' starting 'acquire' stage"); acquireBarrier(); LOG.debug("Subprocedure '" + barrierName + "' locally acquired"); rethrowException(); // vote yes to coordinator about being prepared rpcs.sendMemberAcquired(this); LOG.debug("Subprocedure '" + barrierName + "' coordinator notified of 'acquire', waiting on" + " 'reached' or 'abort' from coordinator"); // wait for the procedure to reach global barrier before proceding waitForReachedGlobalBarrier(); rethrowException(); // if Coordinator aborts, will bail from here with exception // In traditional 2PC, if a member reaches this state the TX has been committed and the // member is responsible for rolling forward and recovering and completing the subsequent // operations in the case of failure. It cannot rollback. // // This implementation is not 2PC since it can still rollback here, and thus has different // semantics. LOG.debug("Subprocedure '" + barrierName + "' received 'reached' from coordinator."); byte[] dataToCoordinator = insideBarrier(); LOG.debug("Subprocedure '" + barrierName + "' locally completed"); rethrowException(); // Ack that the member has executed and released local barrier rpcs.sendMemberCompleted(this, dataToCoordinator); LOG.debug("Subprocedure '" + barrierName + "' has notified controller of completion"); // make sure we didn't get an external exception rethrowException(); } catch (Exception e) { String msg = null; if (e instanceof InterruptedException) { msg = "Procedure '" + barrierName + "' aborting due to interrupt!" + " Likely due to pool shutdown."; Thread.currentThread().interrupt(); } else if (e instanceof ForeignException) { msg = "Subprocedure '" + barrierName + "' aborting due to a ForeignException!"; } else { msg = "Subprocedure '" + barrierName + "' failed!"; } cancel(msg, e); LOG.debug("Subprocedure '" + barrierName + "' running cleanup."); cleanup(e); } finally { releasedLocalBarrier.countDown(); // tell the timer we are done, if we get here successfully executionTimeoutTimer.complete(); complete = true; LOG.debug("Subprocedure '" + barrierName + "' completed."); return null; } } boolean isComplete() { return complete; } /** * exposed for testing. */ ForeignExceptionSnare getErrorCheckable() { return this.monitor; } /** * The implementation of this method should gather and hold required resources (locks, disk * space, etc) to satisfy the Procedures barrier condition. For example, this would be where * to make all the regions on a RS on the quiescent for an procedure that required all regions * to be globally quiesed. * * Users should override this method. If a quiescent is not required, this is overkill but * can still be used to execute a procedure on all members and to propagate any exceptions. * * @throws ForeignException */ abstract public void acquireBarrier() throws ForeignException; /** * The implementation of this method should act with the assumption that the barrier condition * has been satisfied. Continuing the previous example, a condition could be that all RS's * globally have been quiesced, and procedures that require this precondition could be * implemented here. * The implementation should also collect the result of the subprocedure as data to be returned * to the coordinator upon successful completion. * Users should override this method. * @return the data the subprocedure wants to return to coordinator side. * @throws ForeignException */ abstract public byte[] insideBarrier() throws ForeignException; /** * Users should override this method. This implementation of this method should rollback and * cleanup any temporary or partially completed state that the {@link #acquireBarrier()} may have * created. * @param e */ abstract public void cleanup(Exception e); /** * Method to cancel the Subprocedure by injecting an exception from and external source. * @param cause */ public void cancel(String msg, Throwable cause) { LOG.error(msg, cause); complete = true; if (cause instanceof ForeignException) { monitor.receive((ForeignException) cause); } else { monitor.receive(new ForeignException(getMemberName(), cause)); } } /** * Callback for the member rpcs to call when the global barrier has been reached. This * unblocks the main subprocedure exectuion thread so that the Subprocedure's * {@link #insideBarrier()} method can be run. */ public void receiveReachedGlobalBarrier() { inGlobalBarrier.countDown(); } // // Subprocedure Internal State interface // /** * Wait for the reached global barrier notification. * * Package visibility for testing * * @throws ForeignException * @throws InterruptedException */ void waitForReachedGlobalBarrier() throws ForeignException, InterruptedException { Procedure.waitForLatch(inGlobalBarrier, monitor, wakeFrequency, barrierName + ":remote acquired"); } /** * Waits until the entire procedure has globally completed, or has been aborted. * @throws ForeignException * @throws InterruptedException */ public void waitForLocallyCompleted() throws ForeignException, InterruptedException { Procedure.waitForLatch(releasedLocalBarrier, monitor, wakeFrequency, barrierName + ":completed"); } /** * Empty Subprocedure for testing. * * Must be public for stubbing used in testing to work. */ public static class SubprocedureImpl extends Subprocedure { public SubprocedureImpl(ProcedureMember member, String opName, ForeignExceptionDispatcher monitor, long wakeFrequency, long timeout) { super(member, opName, monitor, wakeFrequency, timeout); } @Override public void acquireBarrier() throws ForeignException {} @Override public byte[] insideBarrier() throws ForeignException { return new byte[0]; } @Override public void cleanup(Exception e) {} }; }