/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.bigdata.ha; import java.io.Externalizable; import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; import java.net.InetSocketAddress; import java.nio.ByteBuffer; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.UUID; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.CancellationException; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.FutureTask; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.RunnableFuture; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import org.apache.log4j.Logger; import com.bigdata.ha.msg.HAMessageWrapper; import com.bigdata.ha.msg.HASendState; import com.bigdata.ha.msg.IHAMessage; import com.bigdata.ha.msg.IHASendState; import com.bigdata.ha.msg.IHASyncRequest; import com.bigdata.ha.msg.IHAWriteMessage; import com.bigdata.ha.pipeline.HAReceiveService; import com.bigdata.ha.pipeline.HAReceiveService.IHAReceiveCallback; import com.bigdata.ha.pipeline.HASendService; import com.bigdata.ha.pipeline.ImmediateDownstreamReplicationException; import com.bigdata.ha.pipeline.NestedPipelineException; import com.bigdata.ha.pipeline.PipelineImmediateDownstreamReplicationException; import com.bigdata.io.DirectBufferPool; import com.bigdata.io.IBufferAccess; import com.bigdata.quorum.QCE; import com.bigdata.quorum.Quorum; import com.bigdata.quorum.QuorumException; import com.bigdata.quorum.QuorumMember; import com.bigdata.quorum.QuorumStateChangeEvent; import com.bigdata.quorum.QuorumStateChangeEventEnum; import com.bigdata.quorum.QuorumStateChangeListener; import com.bigdata.quorum.QuorumStateChangeListenerBase; import com.bigdata.quorum.ServiceLookup; import com.bigdata.util.InnerCause; import com.bigdata.util.concurrent.ExecutionExceptions; /** * {@link QuorumPipeline} implementation. * <p> * The {@link QuorumMember} must pass along the "pipeline" messages, including: * <ul> * <li>{@link QuorumMember#pipelineAdd()}</li> * <li>{@link QuorumMember#pipelineRemove()}</li> * <li>{@link QuorumMember#pipelineChange(UUID, UUID)}</li> * </ul> * When a quorum is met, the <i>leader</i> is always first in the write pipeline * since it is the node which receives writes from clients. When a service joins * the write pipeline, it always does so at the end of the chain. 
 * Services may enter the write pipeline before joining a quorum in order to
 * synchronize with the quorum. If a service in the middle of the chain leaves
 * the pipeline, then the upstream node will reconfigure and retransmit the
 * current cache block to its new downstream node. This prevents nodes which
 * are "bouncing" during synchronization from causing write sets to be
 * discarded. However, if the leader leaves the write pipeline, then the quorum
 * is broken and the write set will be discarded.
 * <p>
 * Since the write pipeline is used to synchronize services trying to join the
 * quorum as well as to replicate writes for services joined with the quorum,
 * the {@link HAReceiveService} may be live for a met quorum even though the
 * {@link QuorumMember} on whose behalf this class is acting is not joined with
 * the met quorum.
 *
 * <h3>Pipeline maintenance</h3>
 *
 * There are three broad categories which have to be handled: (1) leader
 * leaves; (2) pipeline leader election; and (3) follower leaves. A leader
 * leave causes the quorum to break, which will cause service leaves and
 * pipeline leaves for all joined services. However, services must add
 * themselves to the pipeline before they join the quorum, and the pipeline
 * will be reorganized if necessary when the quorum leader is elected. This
 * will result in a {@link #pipelineElectedLeader()} event. A follower leave
 * only causes the follower to leave the pipeline and results in a
 * {@link #pipelineChange(UUID, UUID)} event.
 * <p>
 * There are two cases for a follower leave: (A) when the follower does not
 * have a downstream node; and (B) when there is a downstream node. For (B),
 * the node upstream of the departed follower should reconfigure for the new
 * downstream node and retransmit the current cache block; the event should
 * otherwise go unnoticed.
 * <p>
 * Handling a follower join requires us to synchronize the follower first,
 * which requires some more infrastructure and should be done as part of the
 * HA synchronization test suite.
 * <p>
 * What follows is an example of how events will arrive for a quorum of three
 * services: A, B, and C.
 *
 * <pre>
 * A.getActor().pipelineAdd() => A.pipelineAdd()
 * B.getActor().pipelineAdd() => B.pipelineAdd(); A.pipelineChange(null,B);
 * C.getActor().pipelineAdd() => C.pipelineAdd(); B.pipelineChange(null,C);
 * </pre>
 *
 * At this point the pipeline order is <code>[A,B,C]</code>. Notice that the
 * {@link HASendService} for A is not established until the
 * <code>A.pipelineChange(null,B)</code> event sets B as the new downstream
 * service for A. Likewise, B will not relay to C until it handles the
 * <code>B.pipelineChange(null,C)</code> event.
 *
 * <p>
 *
 * Given the pipeline order <code>[A,B,C]</code>, if B were to leave, then the
 * events would be:
 *
 * <pre>
 * B.getActor().pipelineRemove() => B.pipelineRemove(); A.pipelineChange(B,C);
 * </pre>
 *
 * and when this class handles the <code>A.pipelineChange(B,C)</code> event, it
 * must update the {@link HASendService} for A such that it now sends data to
 * C.
 *
 * <p>
 *
 * On the other hand, given the pipeline order <code>[A,B,C]</code>, if C were
 * to leave, the events would be:
 *
 * <pre>
 * C.getActor().pipelineRemove() => C.pipelineRemove(); B.pipelineChange(C,null);
 * </pre>
 *
 * and when this class handles the <code>B.pipelineChange(C,null)</code> event,
 * it must update B's {@link HAReceiveService} such that it continues to
 * receive data, but no longer relays data to a downstream service.
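 * <p>
 * As a minimal sketch (not code from this class), a {@link QuorumMember}
 * implementation might simply forward those listener callbacks to its
 * {@link QuorumPipelineImpl} delegate; the field name <code>pipelineImpl</code>
 * below is hypothetical:
 *
 * <pre>
 * // Hedged illustration of the delegation described above.
 * public void pipelineAdd() {
 *     pipelineImpl.pipelineAdd(); // queued, then dispatched under the lock
 * }
 *
 * public void pipelineChange(final UUID oldDownStreamId, final UUID newDownStreamId) {
 *     pipelineImpl.pipelineChange(oldDownStreamId, newDownStreamId);
 * }
 * </pre>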
* * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @param <S> * * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/681" > * HAJournalServer deadlock: pipelineRemove() and getLeaderId() </a> */ abstract public class QuorumPipelineImpl<S extends HAPipelineGlue> /*extends QuorumStateChangeListenerBase */implements QuorumPipeline<S>, QuorumStateChangeListener { static private transient final Logger log = Logger .getLogger(QuorumPipelineImpl.class); /** * The timeout for a sleep before the next retry. These timeouts are * designed to allow some asynchronous processes to reconnect the * {@link HASendService} and the {@link HAReceiveService}s in write pipeline * such that a retransmit can succeed after a service has left the pipeline. * Depending on the nature of the error (i.e., a transient network problem * versus a pipeline reorganization), this can involve a number of zookeeper * events. * <p> * The retries will continue until {@link #getRetrySendTimeoutNanos()} has * elapsed. * * FIXME If this timeout is LTE 50ms, then ChecksumErrors can appear in the * sudden kill tests. The root cause is the failure to consistently tear * down and setup the pipeline when there are multiple requests queued for * the pipeline. This issue has also shown up when there is a replicated * write and a concurrent HALog replication. Specifically, an interrupt in * sendHALog() does not cause the socket channel on which the payload is * transmitted to be closed. See <a * href="https://sourceforge.net/apps/trac/bigdata/ticket/724">HA wire * pulling and sudden kill testing</a>. */ private final int RETRY_SLEEP = 30; //200; // 50; // milliseconds. /** * Once this timeout is elapsed, retrySend() will fail. * <p> * Note: This gets overridden in the ZK aware version and set to a constant * greater than the then-current negotiated timeout for the client. */ protected long getRetrySendTimeoutNanos() { return TimeUnit.MILLISECONDS.toNanos(5000/*ms*/); } /** * The {@link QuorumMember}. */ private final QuorumMember<S> member; /** * The service {@link UUID} for the {@link QuorumMember}. */ private final UUID serviceId; /** * Lock managing the various mutable aspects of the pipeline state. */ private final ReentrantLock lock = new ReentrantLock(); /** * Condition signalled when the write replication pipeline has been changed * (either the upstream and/or downstream service was changed). */ private final Condition pipelineChanged = lock.newCondition(); /** send service (iff this is the leader). */ private HASendService sendService; /** * The receive service (iff this is a follower in a met quorum). */ private HAReceiveService<HAMessageWrapper> receiveService; /** * The buffer used to relay the data. This is only allocated for a * follower. */ private IBufferAccess receiveBuffer; /** * Cached metadata about the downstream service. */ private final AtomicReference<PipelineState<S>> pipelineStateRef = new AtomicReference<PipelineState<S>>(); /** * Inner class does the actual work once to handle an event. */ private final InnerEventHandler innerEventHandler = new InnerEventHandler(); /** * One up message identifier. */ private final AtomicLong messageId = new AtomicLong(0L); /** * Core implementation of the handler for the various events. Always run * while holding the {@link #lock}. 
* * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> * * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/681" > * HAJournalServer deadlock: pipelineRemove() and getLeaderId() </a> */ private final class InnerEventHandler extends QuorumStateChangeListenerBase { /** * A queue of events that can only be handled when a write replication * operation owns the {@link QuorumPipelineImpl#lock}. * * @see QuorumPipelineImpl#lock() * @see #dispatchEvents() */ private final BlockingQueue<QuorumStateChangeEvent> queue = new LinkedBlockingQueue<QuorumStateChangeEvent>(); protected InnerEventHandler() { } /** * Enqueue an event. * * @param e * The event. */ private void queue(final QuorumStateChangeEvent e) { if (log.isInfoEnabled()) log.info("Adding StateChange: " + e); queue.add(e); } /** * Boolean controls whether or not event elision is used. See below. * * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/681" > * HAJournalServer deadlock: pipelineRemove() and getLeaderId() * </a> */ static private final boolean s_eventElission = true; /** * Event elission endeavours to ensure that events processed * represent current state change. * * This is best explained with an example from its original usage * in processing graphic events. Whilst a "button click" is a singular * event and all button clicks should be processed, a "mouse move" event * could be elided with the next "mouse move" event. Thus the move events * (L1 -> L2) and (L2 -> L3) would elide to a single (L1 -> L3). * * In HA RMI calls can trigger event processing, whilst other threads monitor * state changes - such as open sockets. Without elission, monitoring threads * will observe unnecessary transitional state changes. HOWEVER, there remains * a problem with this pattern of synchronization. */ private void elideEvents() { if (!s_eventElission) { return; } /* * Check for event elission: check for PIPELINE_UPSTREAM and * PIPELINE_CHANGE and remove earlier ones check for PIPELINE_ADD * and PIPELINE_REMOVE pairings. */ final Iterator<QuorumStateChangeEvent> events = queue.iterator(); QuorumStateChangeEvent uce = null; // UPSTREAM CHANGE QuorumStateChangeEvent dce = null; // DOWNSTREAM CHANGE QuorumStateChangeEvent add = null; // PIPELINE_ADD while (events.hasNext()) { final QuorumStateChangeEvent tst = events.next(); if (tst.getEventType() == QuorumStateChangeEventEnum.PIPELINE_UPSTREAM_CHANGE) { if (uce != null) { if (log.isDebugEnabled()) log.debug("Elission removal of: " + uce); queue.remove(uce); } uce = tst; } else if (tst.getEventType() == QuorumStateChangeEventEnum.PIPELINE_CHANGE) { if (dce != null) { // replace 'from' of new state with 'from' of old tst.getDownstreamOldAndNew()[0] = dce .getDownstreamOldAndNew()[0]; if (log.isDebugEnabled()) log.debug("Elission removal of: " + dce); queue.remove(dce); } dce = tst; } else if (tst.getEventType() == QuorumStateChangeEventEnum.PIPELINE_ADD) { add = tst; } else if (tst.getEventType() == QuorumStateChangeEventEnum.PIPELINE_REMOVE) { if (add != null) { if (log.isDebugEnabled()) { log.debug("Elission removal of: " + add); log.debug("Elission removal of: " + tst); } queue.remove(add); queue.remove(tst); add = null; } if (dce != null) { if (log.isDebugEnabled()) log.debug("Elission removal of: " + dce); queue.remove(dce); dce = null; } if (uce != null) { if (log.isDebugEnabled()) log.debug("Elission removal of: " + uce); queue.remove(uce); uce = null; } } } } // elideEvents() /** * Dispatch any events in the {@link #queue}. 
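     * <p>
     * Events are elided (see {@link #elideEvents()}) before being dispatched.
     * For example (a hedged illustration only, not captured queue state), a
     * queue of
     *
     * <pre>
     * [PIPELINE_CHANGE(A,B), PIPELINE_CHANGE(B,C), PIPELINE_UPSTREAM_CHANGE, PIPELINE_UPSTREAM_CHANGE]
     * </pre>
     *
     * elides to
     *
     * <pre>
     * [PIPELINE_CHANGE(A,C), PIPELINE_UPSTREAM_CHANGE]
     * </pre>
     *
     * since only the net transition matters to the thread that finally
     * observes the events.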
*/ private void dispatchEvents() { elideEvents(); QuorumStateChangeEvent e; // If an event is immediately available, dispatch it now. while ((e = queue.poll()) != null) { if (log.isInfoEnabled()) log.info("Dispatching: " + e); // An event is available. innerEventHandler.dispatchEvent(e); } } /** * Dispatch to the InnerEventHandler. * * @param e * The event. * * @throws IllegalMonitorStateException * if the caller does not own the {@link #lock}. */ private void dispatchEvent(final QuorumStateChangeEvent e) throws IllegalMonitorStateException { if(!lock.isHeldByCurrentThread()) { /* * The InnerEventHandler should be holding the outer lock. */ throw new IllegalMonitorStateException(); } if (log.isInfoEnabled()) log.info(e.toString()); switch (e.getEventType()) { case CONSENSUS: consensus(e.getLastCommitTimeConsensus()); break; case LOST_CONSENSUS: lostConsensus(); break; case MEMBER_ADD: memberAdd(); break; case MEMBER_REMOVE: memberRemove(); break; case PIPELINE_ADD: pipelineAdd(); break; case PIPELINE_CHANGE: { final UUID[] a = e.getDownstreamOldAndNew(); pipelineChange(a[0]/* oldDownStreamId */, a[1]/* newDownStreamId */); break; } case PIPELINE_ELECTED_LEADER: pipelineElectedLeader(); break; case PIPELINE_REMOVE: pipelineRemove(); break; case PIPELINE_UPSTREAM_CHANGE: pipelineUpstreamChange(); break; case QUORUM_BREAK: quorumBreak(); break; case QUORUM_MEET: quorumMeet(e.getToken(), e.getLeaderId()); break; case SERVICE_JOIN: serviceJoin(); break; case SERVICE_LEAVE: serviceLeave(); break; default: throw new UnsupportedOperationException(e.getEventType().toString()); } } // @Override // public void serviceLeave() { // } // // @Override // public void serviceJoin() { // } // // /** // * Extended to setup this service as a leader ({@link #setUpLeader()}), // * or a relay ({@link #setUpReceiveAndRelay()}. // */ // @Override // public void quorumMeet(final long token, final UUID leaderId) { // super.quorumMeet(token, leaderId); // lock.lock(); // try { // this.token = token; // if(leaderId.equals(serviceId)) { // setUpLeader(); // } else if(member.isPipelineMember()) { // setUpReceiveAndRelay(); // } // } finally { // lock.unlock(); // } // } // @Override // public void quorumBreak() { // super.quorumBreak(); // lock.lock(); // try { // tearDown(); // } finally { // lock.unlock(); // } // } /** * {@inheritDoc} * <p> * This implementation sets up the {@link HASendService} or the * {@link HAReceiveService} as appropriate depending on whether or not * this service is the first in the pipeline order. */ @Override public void pipelineAdd() { if (log.isInfoEnabled()) log.info(""); super.pipelineAdd(); lock.lock(); try { // The current pipeline order. final UUID[] pipelineOrder = member.getQuorum().getPipeline(); // The index of this service in the pipeline order. final int index = getIndex(serviceId, pipelineOrder); if (index == 0) { setUpSendService(); } else if (index > 0) { setUpReceiveService(); } } finally { lock.unlock(); } } @Override public void pipelineElectedLeader() { if (log.isInfoEnabled()) log.info(""); super.pipelineElectedLeader(); lock.lock(); try { tearDown(); setUpSendService(); } finally { lock.unlock(); } } /** * {@inheritDoc} * <p> * This implementation tears down the {@link HASendService} or * {@link HAReceiveService} associated with this service. 
*/ @Override public void pipelineRemove() { if (log.isInfoEnabled()) log.info(""); super.pipelineRemove(); lock.lock(); try { tearDown(); } finally { lock.unlock(); } } /** * {@inheritDoc} * <p> * This implementation changes the target of the {@link HASendService} * for the leader (or the {@link HAReceiveService} for a follower) to * send (or relay) write cache blocks to the specified service. */ @Override public void pipelineChange(final UUID oldDownStreamId, final UUID newDownStreamId) { super.pipelineChange(oldDownStreamId, newDownStreamId); lock.lock(); try { if (oldDownStreamId == newDownStreamId) { // Nothing to do (both null or same UUID reference). return; } if (oldDownStreamId != null && newDownStreamId != null && oldDownStreamId.equals(newDownStreamId)) { /* * Nothing to do. The pipeline is already configured * correctly. */ return; } // The address of the next service in the pipeline. // final InetSocketAddress addrNext = getAddrNext(newDownStreamId); final PipelineState<S> nextState = getAddrNext(newDownStreamId); final InetSocketAddress addrNext = nextState == null ? null : nextState.addr; if (log.isInfoEnabled()) log.info("oldDownStreamId=" + oldDownStreamId + ",newDownStreamId=" + newDownStreamId + ", addrNext=" + addrNext + ", sendService=" + sendService + ", receiveService=" + receiveService); if (sendService != null) { /* * Terminate the existing connection (we were the first * service in the pipeline). */ sendService.terminate(); if (addrNext != null) { if (log.isDebugEnabled()) log.debug("sendService.start(): addrNext=" + addrNext); sendService.start(addrNext); } } else if (receiveService != null) { /* * Reconfigure the receive service to change how it is * relaying (we were relaying, so the receiveService was * running but not the sendService). */ if (log.isDebugEnabled()) log.debug("receiveService.changeDownStream(): addrNext=" + addrNext); receiveService.changeDownStream(addrNext); } // populate and/or clear the cache. pipelineStateRef.set(nextState); // cachePipelineState(newDownStreamId); // Signal pipeline change. pipelineChanged.signalAll(); if (log.isDebugEnabled()) log.debug("pipelineChange - done."); } finally { lock.unlock(); } } @Override public void pipelineUpstreamChange() { super.pipelineUpstreamChange(); lock.lock(); try { if (receiveService != null) { /* * Make sure that the receiveService closes out its client * connection with the old upstream service. */ if (log.isInfoEnabled()) log.info("receiveService=" + receiveService); receiveService.changeUpStream(); // Signal pipeline change. pipelineChanged.signalAll(); } } finally { lock.unlock(); } } // @Override // public void memberRemove() { // } // // @Override // public void memberAdd() { // } // // @Override // public void lostConsensus() { // } // // @Override // public void consensus(long lastCommitTime) { // } /** * Request the {@link InetSocketAddress} of the write pipeline for a service * (RMI). * * @param downStreamId * The service. * * @return It's {@link InetSocketAddress} */ private PipelineState<S> getAddrNext(final UUID downStreamId) { if (downStreamId == null) return null; final S service = member.getService(downStreamId); try { final InetSocketAddress addrNext = service .getWritePipelineAddr(); return new PipelineState<S>(service, addrNext); } catch (IOException e) { throw new RuntimeException(e); } } /** * Tear down any state associated with the {@link QuorumPipelineImpl}. This * implementation tears down the send/receive service and releases the * receive buffer. 
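     * <p>
     * Note: this is invoked (while holding the {@link QuorumPipelineImpl#lock})
     * from {@link #pipelineRemove()}, {@link #pipelineElectedLeader()}, from
     * {@link QuorumPipelineImpl#finalize()}, and from the pipeline reset task.
     * All send/receive state is null-checked, so it is safe to call even when
     * some or all of that state has already been torn down.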
*/ private void tearDown() { if (log.isInfoEnabled()) log.info(""); lock.lock(); try { /* * Leader tear down. */ { if (sendService != null) { sendService.terminate(); sendService = null; } } /* * Follower tear down. */ { if (receiveService != null) { receiveService.terminate(); try { receiveService.awaitShutdown(); } catch (InterruptedException e) { throw new RuntimeException(e); } finally { receiveService = null; } } if (receiveBuffer != null) { try { /* * Release the buffer back to the pool. */ receiveBuffer.release(); } catch (InterruptedException e) { throw new RuntimeException(e); } finally { receiveBuffer = null; } } } // clear cache. pipelineStateRef.set(null); // Signal pipeline change. pipelineChanged.signalAll(); } finally { lock.unlock(); } } // /** // * Populate or clear the {@link #pipelineState} cache. // * <p> // * Note: The only times we need to populate the {@link #pipelineState} are // * in response to a {@link #pipelineChange(UUID, UUID)} event or in response // * to message a {@link #pipelineElectedLeader()} event. // * // * @param downStreamId // * The downstream service {@link UUID}. // */ // private void cachePipelineState(final UUID downStreamId) { // // if (downStreamId == null) { // // pipelineStateRef.set(null); // // return; // // } // // final S nextService = member.getService(downStreamId); // // final PipelineState<S> pipelineState = new PipelineState<S>(); // // try { // // pipelineState.addr = nextService.getWritePipelineAddr(); // // } catch (IOException e) { // // throw new RuntimeException(e); // // } // // pipelineState.service = nextService; // // pipelineStateRef.set(pipelineState); // // } /** * Setup the send service. */ private void setUpSendService() { if (log.isInfoEnabled()) log.info(""); lock.lock(); try { // Allocate the send service. sendService = new HASendService(); /* * The service downstream from this service. * * Note: The downstream service in the pipeline is not available * when the first service adds itself to the pipeline. In those * cases the pipelineChange() event is used to update the * HASendService to send to the downstream service. * * Note: When we handle a pipelineLeaderElected() message the * downstream service MAY already be available, which is why we * handle downstreamId != null conditionally. */ final UUID downstreamId = member.getDownstreamServiceId(); // The address of the next service in the pipeline. final PipelineState<S> nextState = getAddrNext(downstreamId); if (nextState != null) { // // The address of the next service in the pipeline. // final InetSocketAddress addrNext = member.getService( // downstreamId).getWritePipelineAddr(); // Start the send service. sendService.start(nextState.addr); } // populate and/or clear the cache. pipelineStateRef.set(nextState); // cachePipelineState(downstreamId); // Signal pipeline change. pipelineChanged.signalAll(); } catch (Throwable t) { try { tearDown(); } catch (Throwable t2) { log.error(t2, t2); } throw new RuntimeException(t); } finally { lock.unlock(); } } /** * Setup the service to receive pipeline writes and to relay them (if there * is a downstream service). */ private void setUpReceiveService() { lock.lock(); try { // The downstream service UUID. final UUID downstreamId = member.getDownstreamServiceId(); // Acquire buffer from the pool to receive data. try { receiveBuffer = DirectBufferPool.INSTANCE.acquire(); } catch (InterruptedException e) { throw new RuntimeException(e); } // The address of this service. 
final InetSocketAddress addrSelf = member.getService() .getWritePipelineAddr(); // Address of the downstream service (if any). // final InetSocketAddress addrNext = downstreamId == null ? null // : member.getService(downstreamId).getWritePipelineAddr(); // final InetSocketAddress addrNext = getAddrNext(downstreamId); final PipelineState<S> nextServiceState = getAddrNext(downstreamId); final InetSocketAddress addrNext = nextServiceState == null ? null : nextServiceState.addr; // Setup the receive service. receiveService = new HAReceiveService<HAMessageWrapper>( addrSelf, addrNext, new IHAReceiveCallback<HAMessageWrapper>() { @Override public void callback(final HAMessageWrapper msg, final ByteBuffer data) throws Exception { // delegate handling of write cache blocks. handleReplicatedWrite(msg.getHASyncRequest(), (IHAWriteMessage) msg .getHAWriteMessage(), data); } @Override public void incReceive(final HAMessageWrapper msg, final int nreads, final int rdlen, final int rem) throws Exception { // delegate handling of incremental receive notify. QuorumPipelineImpl.this.incReceive(// msg.getHASyncRequest(), (IHAWriteMessage) msg.getHAWriteMessage(), // nreads, rdlen, rem); } }); // Start the receive service - will not return until service is // running receiveService.start(); // Signal pipeline change. pipelineChanged.signalAll(); } catch (Throwable t) { /* * Always tear down if there was a setup problem to avoid leaking * threads or a native ByteBuffer. */ try { tearDown(); } catch (Throwable t2) { log.error(t2, t2); } finally { log.error(t, t); } throw new RuntimeException(t); } finally { lock.unlock(); } } }; /** * Acquire {@link #lock} and {@link #dispatchEvents()}. */ private void lock() { boolean ok = false; this.lock.lock(); try { innerEventHandler.dispatchEvents();// have lock, dispatch events. ok = true; // success. } finally { if (!ok) { // release lock if there was a problem. this.lock.unlock(); } } } /** * Acquire {@link #lock} and {@link #dispatchEvents()}. */ private void lockInterruptibly() throws InterruptedException { boolean ok = false; lock.lockInterruptibly(); try { innerEventHandler.dispatchEvents(); // have lock, dispatch events. ok = true; // success. } finally { if (!ok) { // release lock if there was a problem. this.lock.unlock(); } } } /** * {@link #dispatchEvents()} and release {@link #lock}. */ private void unlock() { try { innerEventHandler.dispatchEvents(); } finally { this.lock.unlock(); } } public QuorumPipelineImpl(final QuorumMember<S> member) { if (member == null) throw new IllegalArgumentException(); this.member = member; this.serviceId = member.getServiceId(); } /** * Extended to invoke {@link #tearDown()} in order to guarantee the eventual * release of the {@link #receiveBuffer} and the shutdown of the * {@link #sendService} or {@link #receiveService}. */ @Override protected void finalize() throws Throwable { innerEventHandler.tearDown(); super.finalize(); } /** * Return the index at which the given serviceId appears in the array of * serviceIds. * * @param serviceId * The {@link UUID} of some quorum member. * @param a * An array of service {@link UUID}s. * * @return The index of the service in the array -or- <code>-1</code> if the * service does not appear in the array. 
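     *
     * <p>
     * For example, {@link InnerEventHandler#pipelineAdd()} and the pipeline
     * reset task call
     * <code>getIndex(serviceId, member.getQuorum().getPipeline())</code> and
     * interpret index ZERO (0) as the leader position (set up the
     * {@link HASendService}) and any index GT ZERO as a follower position
     * (set up the {@link HAReceiveService}).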
*/ static private int getIndex(final UUID serviceId, final UUID[] a) { if (serviceId == null) throw new IllegalArgumentException(); for (int i = 0; i < a.length; i++) { if (serviceId.equals(a[i])) { return i; } } return -1; } /** * Return the NIO buffer used to receive payloads written on the HA write * pipeline. * * @return The buffer -or- <code>null</code> if the pipeline has been torn * down or if this is the leader. */ private ByteBuffer getReceiveBuffer() { if (!lock.isHeldByCurrentThread()) { // The caller MUST be holding the lock. throw new IllegalMonitorStateException(); } // trinary pattern is safe while thread has lock. return receiveBuffer == null ? null : receiveBuffer.buffer(); } /** * Return the {@link HAReceiveService} used to receive payloads written on * the HA write pipeline. * * @return The buffer -or- <code>null</code> if the pipeline has been torn * down or if this is the leader. */ private HAReceiveService<HAMessageWrapper> getHAReceiveService() { if (!lock.isHeldByCurrentThread()) { // The caller MUST be holding the lock. throw new IllegalMonitorStateException(); } return receiveService; } /** * Return the {@link HASendService} used to write payloads on the HA write * pipeline. * * @return The {@link HASendService} -or- <code>null</code> if the pipeline * has been torn down. */ private HASendService getHASendService() { if (!lock.isHeldByCurrentThread()) { // The caller MUST be holding the lock. throw new IllegalMonitorStateException(); } return sendService; } /* * QuorumStateChangeListener * * Note: This interface is delegated using a queue. The queue allows * the processing of the events to be deferred until the appropriate * lock is held. This prevents contention for the lock and avoids * lock ordering problems such as described at [1]. 
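     *
     * In other words, each listener method below simply queues a QCE for the
     * corresponding state change; those queued events are drained by
     * dispatchEvents() from lock(), lockInterruptibly() and unlock(), i.e.
     * only while the lock is held.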
* * @see InnerEventHandler */ @Override public void pipelineAdd() { innerEventHandler .queue(new QCE(QuorumStateChangeEventEnum.PIPELINE_ADD)); } @Override public void pipelineElectedLeader() { innerEventHandler.queue(new QCE( QuorumStateChangeEventEnum.PIPELINE_ELECTED_LEADER)); } @Override public void pipelineRemove() { innerEventHandler.queue(new QCE( QuorumStateChangeEventEnum.PIPELINE_REMOVE)); } @Override public void pipelineChange(final UUID oldDownStreamId, final UUID newDownStreamId) { innerEventHandler .queue(new QCE(QuorumStateChangeEventEnum.PIPELINE_CHANGE, new UUID[] { oldDownStreamId, newDownStreamId }, null/* lastCommitTimeConsensus */, null/* token */, null/* leaderId */)); } @Override public void pipelineUpstreamChange() { innerEventHandler.queue(new QCE( QuorumStateChangeEventEnum.PIPELINE_UPSTREAM_CHANGE)); } @Override public void memberAdd() { innerEventHandler.queue(new QCE(QuorumStateChangeEventEnum.MEMBER_ADD)); } @Override public void memberRemove() { innerEventHandler.queue(new QCE( QuorumStateChangeEventEnum.MEMBER_REMOVE)); } @Override public void consensus(final long lastCommitTime) { innerEventHandler.queue(new QCE(QuorumStateChangeEventEnum.CONSENSUS, null/* downstreamIds */, lastCommitTime/* lastCommitTimeConsensus */, null/* token */, null/* leaderId */)); } @Override public void lostConsensus() { innerEventHandler.queue(new QCE( QuorumStateChangeEventEnum.LOST_CONSENSUS)); } @Override public void serviceJoin() { innerEventHandler .queue(new QCE(QuorumStateChangeEventEnum.SERVICE_JOIN)); } @Override public void serviceLeave() { innerEventHandler.queue(new QCE( QuorumStateChangeEventEnum.SERVICE_LEAVE)); } @Override public void quorumMeet(final long token, final UUID leaderId) { innerEventHandler.queue(new QCE(QuorumStateChangeEventEnum.QUORUM_MEET, null/* downstreamIds */, null/* lastCommitTimeConsensus */, token, leaderId)); } @Override public void quorumBreak() { innerEventHandler .queue(new QCE(QuorumStateChangeEventEnum.QUORUM_BREAK)); } /* * End of QuorumStateChangeListener. */ private IHASendState newSendState() { final Quorum<?, ?> quorum = member.getQuorum(); final IHASendState snd = new HASendState(messageId.incrementAndGet(), serviceId/* originalSenderId */, serviceId/* senderId */, quorum.token(), quorum.replicationFactor()); return snd; } @Override public Future<IHAPipelineResetResponse> resetPipeline( final IHAPipelineResetRequest req) throws IOException { final FutureTask<IHAPipelineResetResponse> ft = new FutureTask<IHAPipelineResetResponse>( new ResetPipelineTaskImpl(req)); member.getExecutor().submit(ft); return ft; } /** * Task resets the pipeline on this service. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> */ private class ResetPipelineTaskImpl implements Callable<IHAPipelineResetResponse> { private final IHAPipelineResetRequest req; public ResetPipelineTaskImpl(final IHAPipelineResetRequest req) { this.req = req; } @Override public IHAPipelineResetResponse call() throws Exception { lock.lock(); try { return doRunWithLock(); } finally { lock.unlock(); } } /** * If there is an identified problem service and that service is either * our upstream or downstream service, then we need to wait until we * observe a pipeline change event such that it is no longer our * upstream or downstream service. Otherwise we can go ahead and reset * our pipeline. * * TODO What if this service is the problem service? 
         */
        private boolean isProblemServiceOurNeighbor() {
            final UUID psid = req.getProblemServiceId();
            if (psid == null) {
                return false;
            }
            final UUID[] priorAndNext = member.getQuorum()
                    .getPipelinePriorAndNext(member.getServiceId());
            if (psid.equals(priorAndNext[0]))
                return true;
            if (psid.equals(priorAndNext[1]))
                return true;
            return false;
        }

        private IHAPipelineResetResponse doRunWithLock() throws Exception {

            log.warn("Will reset pipeline: req=" + req);

            final long begin = System.nanoTime();
            final long timeout = req.getTimeoutNanos();
            long remaining = timeout;

            if (isProblemServiceOurNeighbor()) {

                log.warn("Problem service is our neighbor.");

                do {

                    pipelineChanged.await(remaining, TimeUnit.NANOSECONDS);

                    // remaining = timeout - elapsed
                    remaining = timeout - (System.nanoTime() - begin);

                } while (isProblemServiceOurNeighbor() && remaining > 0);

                if (isProblemServiceOurNeighbor()) {
                    /*
                     * Timeout elapsed.
                     *
                     * Note: This could be a false timeout, e.g., the problem
                     * service left and re-entered and is still our neighbor.
                     * However, the leader will just log and ignore the problem.
                     * If the pipeline is working again, then all is good. If
                     * not, then it will force out the problem service and reset
                     * the pipeline again.
                     */
                    throw new TimeoutException();
                }

            } else {

                log.warn("Problem service is not our neighbor.");

                // tear down send and/or receive services.
                innerEventHandler.tearDown();

                // The current pipeline order.
                final UUID[] pipelineOrder = member.getQuorum().getPipeline();

                // The index of this service in the pipeline order.
                final int index = getIndex(serviceId, pipelineOrder);

                if (index == 0) {

                    innerEventHandler.setUpSendService();

                } else if (index > 0) {

                    innerEventHandler.setUpReceiveService();

                }

            }

            return new HAPipelineResetResponse();

        }

    }

    /*
     * This is the leader, so send() the buffer.
     */
    @Override
    public Future<Void> replicate(final IHASyncRequest req,
            final IHAWriteMessage msg, final ByteBuffer b) throws IOException {

        final RunnableFuture<Void> ft;

        lock();
        try {
            ft = new FutureTask<Void>(new RobustReplicateTask(req,
                    newSendState(), msg, b));
        } finally {
            unlock();
        }

        // Submit Future for execution (outside of the lock).
        member.getExecutor().execute(ft);

        // Return Future. Caller must wait on the Future.
        return ft;

    }

    /**
     * Task robustly replicates an {@link IHAWriteMessage} and the associated
     * payload.
     */
    private class RobustReplicateTask implements Callable<Void> {

        /**
         * An historical message is indicated when the {@link IHASyncRequest}
         * is non-<code>null</code>.
         */
        private final IHASyncRequest req;

        /**
         * Metadata about the state of the sender for this message.
         */
        private final IHASendState snd;

        /**
         * The {@link IHAWriteMessage}.
         */
        private final IHAWriteMessage msg;

        /**
         * The associated payload.
         */
        private final ByteBuffer b;

        /**
         * The token for the leader. The service that initiates the replication
         * of a message MUST be the leader for this token.
         * <p>
         * The token is either taken from the {@link IHAWriteMessage} (if this
         * is a live write) or from the current {@link Quorum#token()}.
         * <p>
         * Either way, we verify that this service is (and remains) the leader
         * for that token throughout the {@link RobustReplicateTask}.
         */
        private final long quorumToken;

        public RobustReplicateTask(final IHASyncRequest req,
                final IHASendState snd, final IHAWriteMessage msg,
                final ByteBuffer b) {

            // Note: [req] MAY be null.
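            // A null [req] indicates a live write (token taken from the
            // message); a non-null [req] indicates historical HALog replay
            // (current quorum token is used instead).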
if (snd == null) throw new IllegalArgumentException(); if (msg == null) throw new IllegalArgumentException(); if (b == null) throw new IllegalArgumentException(); this.req = req; this.snd = snd; this.msg = msg; this.b = b; if (b.remaining() == 0) { // Empty buffer. throw new IllegalStateException("Empty buffer: req=" + req + ", msg=" + msg + ", buffer=" + b); } if (req == null) { /* * Live message. * * Use the quorum token on the message. It was put there by the * WriteCacheService. This allows us to ensure that the qourum * token remains valid for all messages replicated by the * leader. */ quorumToken = msg.getQuorumToken(); // Must be the leader for that token. member.assertLeader(quorumToken); } else { /* * Historical message. */ // Use the current quorum token. quorumToken = member.getQuorum().token(); // Must be the leader for that token. member.assertLeader(quorumToken); } } /** * This collects the assertion(s) that we make for the service that is * attempting to robustly replicate a write into a single method. This * was done in order to concentrate any conditional logic and design * rationale into a single method. * <p> * Note: IFF we allow non-leaders to replicate HALog messages then this * assert MUST be changed to verify that the quorum token remains valid * and that this service remains joined with the met quorum, i.e., * * <pre> * if (!quorum.isJoined(token)) * throw new QuorumException(); * </pre> */ private void assertQuorumState() throws QuorumException { // Must be the leader for that token. member.assertLeader(quorumToken); // if (req == null) { // // /* // * This service must be the leader since this is a LIVE // * write cache message (versus a historical message that is // * being replayed). // * // * Note: The [quorumToken] is from the message IFF this is a // * live message and is otherwise the current quorum token. // */ // member.assertLeader(quorumToken); // // } } /** * Robust replication of a write cache block along the pipeline. * <p> * Note: {@link Future} for {@link #call()} is [WCS.remoteWriteFuture]. * <p> * Note: In order for replication from the leader to be robust while * still permitting the WCS to shutdown, we need to distinguish two * different ways in which replication can be interrupted/cancelled: * <p> * 1. WCS.close()/reset() => throw; * <p> * 2. pipelineChange() => retrySend() * <p> * If the WCS is being shutdown, then we MUST NOT attempt to cure the * exception thrown out of innerReplicate(). This will show up as an * {@link InterruptedException} if the interrupt is encountered in this * thread (call()). * <p> * * If there is a {@link QuorumPipelineImpl#pipelineChange(UUID, UUID)}, * then that method will cause {@link SendBufferTask} and/or * {@link ReceiveAndReplicateTask} to be cancelled. {@link Future#get()} * for those tasks thus can report a {@link CancellationException}. * <p> * If we fail to distinguish these cases, then a pipelineChange() can * cause the {@link RobustReplicateTask} to fail, which is precisely * what it should not do. Instead the pipeline should be re-established * and the writes replicated by the leader along the new pipeline order. * <p> * The following stack traces are illustrative of a problem when the * pipeline is [A,C] and [B] joins while a write is being replicated * from [A] to [C]. The join of [B] triggers pipelineChange() which * causes the {@link SendBufferTask} to be cancelled. 
The untrapped * {@link CancellationException} propagates from this method to * WCS.WriteTask.writeCacheBlock() to fail, failing the transaction. The * correct behavior in this case is to enter retrySend() to cure the * pipelineChange(). * * <pre> * Leader(A): * ERROR: 10:47:39,027 29698 com.bigdata.rwstore.RWStore$11 com.bigdata.io.writecache.WriteCacheService$WriteTask.call(WriteCacheService.java:937): java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.util.concurrent.CancellationException * java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.util.concurrent.CancellationException * at java.util.concurrent.FutureTask$Sync.innerGet(FutureTask.java:252) * at java.util.concurrent.FutureTask.get(FutureTask.java:111) * at com.bigdata.io.writecache.WriteCacheService$WriteTask.writeCacheBlock(WriteCacheService.java:1466) * at com.bigdata.io.writecache.WriteCacheService$WriteTask.doRun(WriteCacheService.java:1015) * at com.bigdata.io.writecache.WriteCacheService$WriteTask.call(WriteCacheService.java:884) * at com.bigdata.io.writecache.WriteCacheService$WriteTask.call(WriteCacheService.java:1) * at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334) * at java.util.concurrent.FutureTask.run(FutureTask.java:166) * at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110) * at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603) * at java.lang.Thread.run(Thread.java:722) * Caused by: java.lang.RuntimeException: java.util.concurrent.CancellationException * at com.bigdata.ha.QuorumPipelineImpl$RobustReplicateTask.call(QuorumPipelineImpl.java:912) * at com.bigdata.ha.QuorumPipelineImpl$RobustReplicateTask.call(QuorumPipelineImpl.java:1) * ... 5 more * Caused by: java.util.concurrent.CancellationException * at java.util.concurrent.FutureTask$Sync.innerGet(FutureTask.java:250) * at java.util.concurrent.FutureTask.get(FutureTask.java:111) * at com.bigdata.service.proxy.ThickFuture.<init>(ThickFuture.java:66) * at com.bigdata.journal.jini.ha.HAJournal$HAGlueService.getProxy(HAJournal.java:1539) * at com.bigdata.journal.AbstractJournal$BasicHA.getProxy(AbstractJournal.java:5976) * at com.bigdata.journal.AbstractJournal$BasicHA.receiveAndReplicate(AbstractJournal.java:6718) * at com.bigdata.journal.jini.ha.HAJournalTest$HAGlueTestImpl.receiveAndReplicate(HAJournalTest.java:757) * at sun.reflect.GeneratedMethodAccessor7.invoke(Unknown Source) * at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) * at java.lang.reflect.Method.invoke(Method.java:601) * at net.jini.jeri.BasicInvocationDispatcher.invoke(BasicInvocationDispatcher.java:1126) * at net.jini.jeri.BasicInvocationDispatcher.dispatch(BasicInvocationDispatcher.java:608) * at com.sun.jini.jeri.internal.runtime.Target$2.run(Target.java:487) * at net.jini.export.ServerContext.doWithServerContext(ServerContext.java:103) * at com.sun.jini.jeri.internal.runtime.Target.dispatch(Target.java:484) * at com.sun.jini.jeri.internal.runtime.Target.access$000(Target.java:57) * at com.sun.jini.jeri.internal.runtime.Target$1.run(Target.java:464) * at java.security.AccessController.doPrivileged(Native Method) * at com.sun.jini.jeri.internal.runtime.Target.dispatch(Target.java:461) * at com.sun.jini.jeri.internal.runtime.Target.dispatch(Target.java:426) * at com.sun.jini.jeri.internal.runtime.DgcRequestDispatcher.dispatch(DgcRequestDispatcher.java:210) * at 
net.jini.jeri.connection.ServerConnectionManager$Dispatcher.dispatch(ServerConnectionManager.java:147) * at com.sun.jini.jeri.internal.mux.MuxServer$1$1.run(MuxServer.java:244) * at java.security.AccessController.doPrivileged(Native Method) * at com.sun.jini.jeri.internal.mux.MuxServer$1.run(MuxServer.java:241) * at com.sun.jini.thread.ThreadPool$Worker.run(ThreadPool.java:136) * at java.lang.Thread.run(Thread.java:722) * at com.sun.jini.jeri.internal.runtime.Util.__________EXCEPTION_RECEIVED_FROM_SERVER__________(Util.java:108) * at com.sun.jini.jeri.internal.runtime.Util.exceptionReceivedFromServer(Util.java:101) * at net.jini.jeri.BasicInvocationHandler.unmarshalThrow(BasicInvocationHandler.java:1303) * at net.jini.jeri.BasicInvocationHandler.invokeRemoteMethodOnce(BasicInvocationHandler.java:832) * at net.jini.jeri.BasicInvocationHandler.invokeRemoteMethod(BasicInvocationHandler.java:659) * at net.jini.jeri.BasicInvocationHandler.invoke(BasicInvocationHandler.java:528) * at $Proxy2.receiveAndReplicate(Unknown Source) * at com.bigdata.ha.QuorumPipelineImpl$SendBufferTask.doRunWithLock(QuorumPipelineImpl.java:1127) * at com.bigdata.ha.QuorumPipelineImpl$SendBufferTask.call(QuorumPipelineImpl.java:1105) * at com.bigdata.ha.QuorumPipelineImpl$RobustReplicateTask.innerReplicate(QuorumPipelineImpl.java:967) * at com.bigdata.ha.QuorumPipelineImpl$RobustReplicateTask.call(QuorumPipelineImpl.java:906) * ... 6 more * </pre> * * <pre> * First follower (C): * WARN : 10:47:39,002 14879 com.bigdata.ha.pipeline.HAReceiveService@705072408{addrSelf=localhost/127.0.0.1:9092} com.bigdata.ha.pipeline.HAReceiveService.runNoBlock(HAReceiveService.java:547): java.util.concurrent.CancellationException * java.util.concurrent.CancellationException * at java.util.concurrent.FutureTask$Sync.innerGet(FutureTask.java:250) * at java.util.concurrent.FutureTask.get(FutureTask.java:111) * at com.bigdata.ha.pipeline.HAReceiveService.runNoBlock(HAReceiveService.java:545) * at com.bigdata.ha.pipeline.HAReceiveService.run(HAReceiveService.java:431) * </pre> */ @Override public Void call() throws Exception { final long beginNanos = System.nanoTime(); /* * Note: This is tested outside of the try/catch. Do NOT retry if * the quorum state has become invalid. */ assertQuorumState(); try { innerReplicate(0/* retryCount */); } catch (Throwable t) { // Note: Also see retrySend()'s catch block. if (InnerCause.isInnerCause(t, InterruptedException.class) // || InnerCause.isInnerCause(t, CancellationException.class) ) { throw new RuntimeException(t); } // Log initial error. log.error(t, t); if (!retrySend()) { final long elapsedNanos = System.nanoTime() - beginNanos; // Rethrow the original exception. throw new RuntimeException( "Giving up. Could not send after " + TimeUnit.NANOSECONDS.toMillis(elapsedNanos) + "ms : " + t, t); } } return null; } /** * Replicate from the leader to the first follower. Each non-final * follower will receiveAndReplicate the write cache buffer. The last * follower will receive the buffer. * * @param retryCount * The #of attempts and ZERO (0) if this is the first * attempt. * * @throws Exception */ private void innerReplicate(final int retryCount) throws Exception { lockInterruptibly(); try { if (log.isInfoEnabled() || retryCount > 0) { final String msg2 = "Leader will send: " + b.remaining() + " bytes, retryCount=" + retryCount + ", req=" + req + ", msg=" + msg; if (retryCount > 0) log.warn(msg2); else log.info(msg2); } // retest while holding lock before sending the message. 
assertQuorumState(); final PipelineState<S> downstream = pipelineStateRef.get(); final HASendService sendService = getHASendService(); final ByteBuffer b = this.b.duplicate(); new SendBufferTask<S>(member, quorumToken, req, snd, msg, b, downstream, sendService, QuorumPipelineImpl.this, sendLock).call(); return; } finally { unlock(); } } // call() /** * Robust retransmit of the current cache block. This method is designed to * handle several kinds of recoverable errors, including: * <ul> * <li>downstream service leaves the pipeline</li> * <li>intermittent failure sending the RMI message</li> * <li>intermittent failure sending the payload</li> * </ul> * The basic pattern is that it will retry the operation a few times to see * if there is a repeatable error. Each time it attempts the operation it * will discover the current downstream serviceId and verify that the quorum * is still valid. Each error (including the first) is logged. If the * operation fails, the original error is rethrown. If the operation * succeeds, then the cache block was successfully transmitted to the * current downstream service and this method returns without error. * * @throws InterruptedException */ private boolean retrySend() throws InterruptedException { final long beginNanos = System.nanoTime(); final long nanos = getRetrySendTimeoutNanos(); final long retrySleepNanos = TimeUnit.MILLISECONDS.toNanos(RETRY_SLEEP); int tryCount = 1; // already retried once. // now try some more times. while (true) { long remaining = nanos - (System.nanoTime() - beginNanos); if (remaining <= retrySleepNanos) break; // Sleep before each retry (including the first). Thread.sleep(RETRY_SLEEP/* ms */); remaining = nanos - (System.nanoTime() - beginNanos); /* * Note: Tested OUTSIDE of the try/catch so a quorum break will * immediately stop the retry behavior. */ assertQuorumState(); try { // send to 1st follower. innerReplicate(tryCount++); // Success. return true; } catch (Throwable t) { // Note: Also see call()'s catch block. if (InnerCause.isInnerCause(t, InterruptedException.class) // || InnerCause.isInnerCause(t, CancellationException.class) ) { throw new RuntimeException( t ); } // log and retry. log.error( "retry=" + tryCount + ", elapsed=" + TimeUnit.NANOSECONDS.toMillis(System .nanoTime() - beginNanos) + "ms : " + t, t); continue; } } // Send was not successful. return false; } // retrySend() } // class RobustReplicateTask // /** // * The logic needs to support the asynchronous termination of the // * {@link Future} that is responsible for replicating the {@link WriteCache} // * block, which is why the API exposes the means to inform the caller about // * that {@link Future}. // * // * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan // * Thompson</a> // */ // public interface IRetrySendCallback { // /** // * // * @param remoteFuture // */ // void notifyRemoteFuture(final Future<Void> remoteFuture); // } /** * Task to send() a buffer to the follower. */ static private class SendBufferTask<S extends HAPipelineGlue> implements Callable<Void> { private final QuorumMember<S> member; private final long token; // member MUST remain leader for token. 
private final IHASyncRequest req; private final IHASendState snd; private final IHAWriteMessage msg; private final ByteBuffer b; private final PipelineState<S> downstream; private final HASendService sendService; private final QuorumPipelineImpl<S> outerClass; private final Semaphore sendLock; public SendBufferTask(final QuorumMember<S> member, final long token, final IHASyncRequest req, final IHASendState snd, final IHAWriteMessage msg, final ByteBuffer b, final PipelineState<S> downstream, final HASendService sendService, final QuorumPipelineImpl<S> outerClass, final Semaphore sendLock) { this.member = member; this.token = token; this.req = req; // Note: MAY be null. this.snd = snd; this.msg = msg; this.b = b; this.downstream = downstream; this.sendService = sendService; this.outerClass = outerClass; this.sendLock = sendLock; } @Override public Void call() throws Exception { /* * Lock ensures that we do not have more than one request on the * write pipeline at a time. */ sendLock.acquire(); try { doRunWithLock(); return null; } finally { sendLock.release(); } } private void doRunWithLock() throws InterruptedException, ExecutionException, IOException { // Get Future for send() outcome on local service. final Future<Void> futLoc = sendService.send(b, snd.getMarker()); try { try { // Get Future for receive outcome on the remote service // (RMI). final Future<Void> futRmt; try { futRmt = downstream.service.receiveAndReplicate(req, snd, msg); } catch (IOException ex) { // RMI error. throw new ImmediateDownstreamReplicationException(ex); } try { /* * Await the Futures, but spend more time waiting on the * local Future and only check the remote Future every * second. Timeouts are ignored during this loop - they * are used to let us wait longer on the local Future * than on the remote Future. ExecutionExceptions are * also ignored. We want to continue this loop until * both Futures are done. Interrupts are not trapped, so * an interrupt will still exit the loop. * * It appears that it is possible for futSnd to be * blocked and not generate an error. If we do not exit * the loop and check the futRec future in this case * then we coul loop continuously. This does rather beg * the question of whether we should only be checking * futRec at this stage. * * Note: [futRmt] is currently a ThickFuture to avoid * historical problems with DGC and is already done by * the time the RMI returns that ThickFuture to us. * Therefore the loop below can be commented out. All we * are really doing is waiting on futSnd and verifying * that the [token] remains valid. */ while ((!futLoc.isDone() || !futRmt.isDone())) { /* * Make sure leader's quorum token remains valid for * ALL writes. */ member.assertLeader(token); try { futLoc.get(500L, TimeUnit.MILLISECONDS); } catch (TimeoutException ignore) { } catch (ExecutionException ignore) { /* * Try the other Future with timeout and cancel * if not done. */ try { futRmt.get(500L, TimeUnit.MILLISECONDS); } catch(TimeoutException ex) { // Ignore. } catch(ExecutionException ex) { // Ignore. } finally { futRmt.cancel(true/* mayInterruptIfRunning */); } /* * Note: Both futures are DONE at this point. */ } try { futRmt.get(500L, TimeUnit.MILLISECONDS); } catch (TimeoutException ignore) { } catch (ExecutionException ignore) { /* * Try the other Future with timeout and cancel * if not done. */ try { futLoc.get(500L, TimeUnit.MILLISECONDS); } catch(TimeoutException ex) { // Ignore. } catch(ExecutionException ex) { // Ignore. 
} finally { futLoc.cancel(true/* mayInterruptIfRunning */); } /* * Note: Both futures are DONE at this point. */ } /* * Note: Both futures are DONE at this point. */ } /* * Note: We want to check the remote Future for the * downstream service first in order to accurately * report the service that was the source of a pipeline * replication problem. */ futRmt.get(); futLoc.get(); } finally { if (!futRmt.isDone()) { // cancel remote Future unless done. futRmt.cancel(true/* mayInterruptIfRunning */); } } } finally { // cancel the local Future. futLoc.cancel(true/* mayInterruptIfRunning */); } } catch (Throwable t) { launderPipelineException(true/* isLeader */, token, member, outerClass, t); } } } // class SendBufferTask /** * Launder an exception thrown during pipeline replication. * * @param isLeader * <code>true</code> iff this service is the quorum leader. * @param token * The quorum token. * @param member * The {@link QuorumMember} for this service. * @param outerClass * The outer class - required for {@link #resetPipeline()}. * @param t * The throwable. */ static private <S extends HAPipelineGlue> void launderPipelineException( final boolean isLeader, final long token, final QuorumMember<S> member, final QuorumPipelineImpl<S> outerClass, final Throwable t) { log.warn("isLeader=" + isLeader + ", t=" + t, t); /* * When non-null, some service downstream of this service had a problem * replicating to a follower. */ final PipelineImmediateDownstreamReplicationException remoteCause = (PipelineImmediateDownstreamReplicationException) InnerCause.getInnerCause(t, PipelineImmediateDownstreamReplicationException.class); /* * When non-null, this service has a problem with replication to its * immediate follower. * * Note: if [remoteCause!=null], then we DO NOT look for a direct cause * (since there will be one wrapped up in the exception trace for some * remote service rather than for this service). */ final ImmediateDownstreamReplicationException directCause = remoteCause == null ? (ImmediateDownstreamReplicationException) InnerCause .getInnerCause(t, ImmediateDownstreamReplicationException.class) : null; final UUID thisService = member.getServiceId(); final UUID[] priorAndNext = member.getQuorum().getPipelinePriorAndNext( member.getServiceId()); if (isLeader) { // The problem service (iff identified). UUID problemServiceId = null; try { /* * If we can identify the problem service, then force it out of * the pipeline. It can re-enter the pipeline once it * transitions through its ERROR state. */ if (directCause != null) { problemServiceId = priorAndNext[1]; } else if (remoteCause != null) { problemServiceId = remoteCause.getProblemServiceId(); } else { // Do not remove anybody. } if (problemServiceId != null) { // Force out the problem service. member.getActor().forceRemoveService(problemServiceId); } } catch (Throwable e) { // Log and continue. log.error("Problem on force remove: problemServiceId=" + problemServiceId, e); if(InnerCause.isInnerCause(e, InterruptedException.class)) { // Propagate interrupt. Thread.currentThread().interrupt(); } } /** * Reset the pipeline on each service (including the leader). If * replication fails, the socket connections both upstream and * downstream of the point of failure can be left in an * indeterminate state with partially buffered data. In order to * bring the pipeline back into a known state (without forcing a * quorum break) we message each service in the pipeline to reset * its HAReceiveService (including the inner HASendService. 
The next * message and payload relayed from the leader will cause new socket * connections to be established. * * @see <a * href="https://sourceforge.net/apps/trac/bigdata/ticket/724" * > HA Wire Pulling and Sudden Kills </a> */ try { outerClass.resetPipeline(token, problemServiceId); } catch (Throwable e) { // Log and continue. Details are logged by resetPipeline(). log.warn("Problem(s) on reset pipeline: " + e); if(InnerCause.isInnerCause(e, InterruptedException.class)) { // Propagate interrupt. Thread.currentThread().interrupt(); } } } if (directCause != null) { throw new PipelineImmediateDownstreamReplicationException( thisService, priorAndNext, t); } else if (remoteCause != null) { throw new NestedPipelineException(t); } else { throw new RuntimeException(t); } } /** * Issue concurrent requests to each service in the pipeline to reset the * pipeline on that service. The request to the leader is executed in the * caller's thread so it will own whatever locks the caller already owns - * this is done to avoid deadlock. * * @param token * The quorum token on the leader. * @param problemServiceId * The problem service in the write pipeline (if known). */ private void resetPipeline(final long token, final UUID problemServiceId) { log.error("Leader will reset pipeline: token=" + token+", problemServiceId=" + problemServiceId); /* * We will only message the services that are in the pipeline. * * For services (other than the leader) in the quorum, submit the * RunnableFutures to an Executor. * * For the leader, we do this in the caller's thread (to avoid * possible deadlocks). */ final UUID[] pipelineIds = member.getQuorum().getPipeline(); member.assertLeader(token); // TODO Configure timeout on HAJournalServer. final IHAPipelineResetRequest msg = new HAPipelineResetRequest(token, problemServiceId, TimeUnit.MILLISECONDS.toNanos(5000)); /* * To minimize latency, we first submit the futures for the other * services and then do f.run() on the leader. */ final List<Future<IHAPipelineResetResponse>> localFutures = new LinkedList<Future<IHAPipelineResetResponse>>(); try { for (int i = 1; i < pipelineIds.length; i++) { final UUID serviceId = pipelineIds[i]; /* * Submit task on local executor. The task will do an RMI to the * remote service. */ final Future<IHAPipelineResetResponse> rf = member .getExecutor().submit( new PipelineResetMessageTask<S>(member, serviceId, msg)); // add to list of futures we will check. localFutures.add(rf); } { /* * Run the operation on the leader using a local method call * (non-RMI) in the caller's thread to avoid deadlock. */ member.assertLeader(token); final FutureTask<IHAPipelineResetResponse> ft = new FutureTask<IHAPipelineResetResponse>( new ResetPipelineTaskImpl(msg)); localFutures.add(ft); ft.run();// run on the leader. } /* * Check the futures for the other services in the quorum. */ final List<Throwable> causes = new LinkedList<Throwable>(); for (Future<IHAPipelineResetResponse> ft : localFutures) { try { ft.get(); // TODO Timeout? } catch (InterruptedException ex) { log.error(ex, ex); causes.add(ex); } catch (ExecutionException ex) { log.error(ex, ex); causes.add(ex); } catch (RuntimeException ex) { /* * Note: ClientFuture.get() can throw a RuntimeException * if there is a problem with the RMI call. In this case * we do not know whether the Future is done. */ log.error(ex, ex); causes.add(ex); } finally { // Note: cancelling a *local* Future wrapping an RMI. 
                    ft.cancel(true/* mayInterruptIfRunning */);

                }

            }

            /*
             * If there were any errors, then throw an exception listing them.
             */
            if (!causes.isEmpty()) {

                // Throw exception back to the leader.
                if (causes.size() == 1)
                    throw new RuntimeException(causes.get(0));

                throw new RuntimeException("remote errors: nfailures="
                        + causes.size(), new ExecutionExceptions(causes));

            }

        } finally {

            // Ensure that all futures are cancelled.
            QuorumServiceBase.cancelFutures(localFutures);

        }

    }

    static private class PipelineResetMessageTask<S extends HAPipelineGlue>
            extends
            AbstractMessageTask<S, IHAPipelineResetResponse, IHAPipelineResetRequest> {

        public PipelineResetMessageTask(final ServiceLookup<S> serviceLookup,
                final UUID serviceId, final IHAPipelineResetRequest msg) {

            super(serviceLookup, serviceId, msg);

        }

        @Override
        protected Future<IHAPipelineResetResponse> doRMI(final S service)
                throws IOException {

            return service.resetPipeline(msg);

        }

    }

    /**
     * Lock used to ensure that at most one message is being sent along the
     * write pipeline at a time.
     */
//    private final Lock sendLock = new ReentrantLock();
    private final Semaphore sendLock = new Semaphore(1/*permits*/);

    @Override
    public Future<Void> receiveAndReplicate(final IHASyncRequest req,
            final IHASendState snd, final IHAWriteMessage msg)
            throws IOException {

        /*
         * FIXME We should probably pass the quorum token through from the
         * leader for ALL replicated writes [this is now done by the
         * IHASendState but the code is not really using that data yet]. This
         * uses the leader's quorum token when it is available (for a live
         * write) and otherwise uses the current quorum token (for historical
         * writes, since we are not providing the leader's token in this case).
         */
        final long token = req == null ? msg.getQuorumToken() : member
                .getQuorum().token();

        final RunnableFuture<Void> ft;

        lock();

        try {

            // Must be valid quorum.
            member.getQuorum().assertQuorum(token);

            if (receiveBuffer == null) {

                /*
                 * The quorum broke and the receive buffer was cleared or
                 * possibly we have become a leader (a distinct test since
                 * otherwise we can hit an NPE on the receiveBuffer).
                 */
                throw new QuorumException();

            }

            final PipelineState<S> downstream = pipelineStateRef.get();

            if (log.isTraceEnabled())
                log.trace("Will receive "
                        + ((downstream != null) ? " and replicate" : "")
                        + ": msg=" + msg);

            final ByteBuffer b = getReceiveBuffer();

            final HAReceiveService<HAMessageWrapper> receiveService = getHAReceiveService();

            if (downstream == null) {

                /*
                 * This is the last service in the write pipeline, so just
                 * receive the buffer.
                 * 
                 * Note: The receive service is executing this Future locally
                 * on this host. However, we still want the receiveData()
                 * method to run while we are not holding the [lock] so we wrap
                 * it up as a task and submit it.
                 */
                ft = new FutureTask<Void>(new ReceiveTask<S>(member, token,
                        req, snd, msg, b, receiveService));

//                try {
//
//                    // wrap the messages together.
//                    final HAMessageWrapper wrappedMsg = new HAMessageWrapper(
//                            req, msg);
//
//                    // receive.
//                    return receiveService.receiveData(wrappedMsg, b);
//
//                } catch (InterruptedException e) {
//
//                    throw new RuntimeException(e);
//
//                }

            } else {

                /*
                 * A service in the middle of the write pipeline (not the first
                 * and not the last).
                 */
                ft = new FutureTask<Void>(new ReceiveAndReplicateTask<S>(
                        member, token, req, snd, msg, b, downstream,
                        receiveService, QuorumPipelineImpl.this));

            }

        } finally {

            unlock();

        }

        // Execute the FutureTask (w/o the lock).
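        /*
         * The pattern used by this method (an illustrative sketch only;
         * "task" and "executor" below are generic placeholders, not members
         * of this class):
         * 
         *    lock();
         *    try {
         *        ft = new FutureTask<Void>(task); // choose + wrap while locked.
         *    } finally {
         *        unlock();
         *    }
         *    executor.execute(ft); // run the (potentially slow) I/O unlocked.
         * 
         * Wrapping the receive as a FutureTask while holding the lock, but
         * only executing it after the lock has been released, keeps the lock
         * from being held across network I/O.
         */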
        member.getExecutor().execute(ft);

        return ft;

    }

    /**
     * Task sets up the {@link Future} for the receive on the last follower.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @param <S>
     */
    private static class ReceiveTask<S extends HAPipelineGlue> implements
            Callable<Void> {

        private final QuorumMember<S> member;
        private final long token;
        private final IHASyncRequest req;
        private final IHASendState snd;
        private final IHAWriteMessage msg;
        private final ByteBuffer b;
        private final HAReceiveService<HAMessageWrapper> receiveService;

        public ReceiveTask(final QuorumMember<S> member, final long token,
                final IHASyncRequest req, final IHASendState snd,
                final IHAWriteMessage msg, final ByteBuffer b,
                final HAReceiveService<HAMessageWrapper> receiveService) {

            this.member = member;
            this.token = token;
            this.req = req; // Note: MAY be null.
            this.snd = snd;
            this.msg = msg;
            this.b = b;
            this.receiveService = receiveService;

        }

        @Override
        public Void call() throws Exception {

            // wrap the messages together.
            final HAMessageWrapper wrappedMsg = new HAMessageWrapper(req, snd,
                    msg);

            // Get Future for receive() outcome on local service.
            final Future<Void> futRec = receiveService.receiveData(wrappedMsg,
                    b);

            try {

                // Await outcome while monitoring the quorum token.
                while (true) {

                    try {

                        // Verify token remains valid.
                        member.getQuorum().assertQuorum(token);

                        // Await the future.
                        return futRec.get(1000, TimeUnit.MILLISECONDS);

                    } catch (TimeoutException ex) {

                        // Timeout. Ignore and retry loop.
                        Thread.sleep(100/* ms */);
                        continue;

                    }

                }

            } finally {

                // cancel the local Future.
                futRec.cancel(true/* mayInterruptIfRunning */);

            }

        }

    }

    /**
     * A service in the middle of the write pipeline (not the first and not the
     * last).
     */
    private static class ReceiveAndReplicateTask<S extends HAPipelineGlue>
            implements Callable<Void> {

        private final QuorumMember<S> member;
        private final long token;
        private final IHASyncRequest req;
        private final IHASendState snd;
        private final IHAWriteMessage msg;
        private final ByteBuffer b;
        private final PipelineState<S> downstream;
        private final HAReceiveService<HAMessageWrapper> receiveService;
        private final QuorumPipelineImpl<S> outerClass;

        public ReceiveAndReplicateTask(final QuorumMember<S> member,
                final long token, final IHASyncRequest req,
                final IHASendState snd, final IHAWriteMessage msg,
                final ByteBuffer b, final PipelineState<S> downstream,
                final HAReceiveService<HAMessageWrapper> receiveService,
                final QuorumPipelineImpl<S> outerClass) {

            this.member = member;
            this.token = token;
            this.req = req; // Note: MAY be null.
            this.snd = snd;
            this.msg = msg;
            this.b = b;
            this.downstream = downstream;
            this.receiveService = receiveService;
            this.outerClass = outerClass;

        }

        @Override
        public Void call() throws Exception {

            // wrap the messages together.
            final HAMessageWrapper wrappedMsg = new HAMessageWrapper(req, snd,
                    msg);

            // Get Future for receive() outcome on local service.
            final Future<Void> futLoc = receiveService.receiveData(wrappedMsg,
                    b);

            try {

                try {

                    // Get Future for receive outcome on the remote service
                    // (RMI).
                    final Future<Void> futRmt;
                    try {

                        futRmt = downstream.service.receiveAndReplicate(req,
                                snd, msg);

                    } catch (IOException ex) {

                        // RMI error.
                        throw new ImmediateDownstreamReplicationException(ex);

                    }

                    try {

                        /*
                         * Await the Futures, but spend more time waiting on
                         * the local Future and only check the remote Future
                         * every second. Timeouts are ignored during this loop
                         * - they are used to let us wait longer on the local
                         * Future than on the remote Future.
                         * ExecutionExceptions are also ignored. We want to
                         * continue this loop until both Futures are done.
                         * Interrupts are not trapped, so an interrupt will
                         * still exit the loop.
                         * 
                         * Note: [futRmt] is currently a ThickFuture to avoid
                         * historical problems with DGC and is already done by
                         * the time the RMI returns that ThickFuture to us.
                         * Therefore the loop below can be commented out. All
                         * we are really doing is waiting on [futLoc] and
                         * verifying that the [token] remains valid.
                         */
                        while ((!futLoc.isDone() || !futRmt.isDone())) {

                            /*
                             * The token must remain valid, even if this
                             * service is not joined with the met quorum. In
                             * fact, services MUST replicate writes regardless
                             * of whether or not they are joined with the met
                             * quorum, but only while there is a met quorum.
                             */
                            member.getQuorum().assertQuorum(token);

                            try {
                                futLoc.get(500L, TimeUnit.MILLISECONDS);
                            } catch (TimeoutException ignore) {
                            } catch (ExecutionException ignore) {
                                /*
                                 * Try the other Future with timeout and cancel
                                 * if not done.
                                 */
                                try {
                                    futRmt.get(500L, TimeUnit.MILLISECONDS);
                                } catch (TimeoutException ex) {
                                    // Ignore.
                                } catch (ExecutionException ex) {
                                    // Ignore.
                                } finally {
                                    futRmt.cancel(true/* mayInterruptIfRunning */);
                                }
                                /*
                                 * Note: Both futures are DONE at this point.
                                 */
                            }

                            try {
                                futRmt.get(500L, TimeUnit.MILLISECONDS);
                            } catch (TimeoutException ignore) {
                            } catch (ExecutionException ignore) {
                                /*
                                 * Try the other Future with timeout and cancel
                                 * if not done.
                                 */
                                try {
                                    futLoc.get(500L, TimeUnit.MILLISECONDS);
                                } catch (TimeoutException ex) {
                                    // Ignore.
                                } catch (ExecutionException ex) {
                                    // Ignore.
                                } finally {
                                    futLoc.cancel(true/* mayInterruptIfRunning */);
                                }
                                /*
                                 * Note: Both futures are DONE at this point.
                                 */
                            }

                            /*
                             * Note: Both futures are DONE at this point.
                             */
                        }

                        /*
                         * Note: We want to check the remote Future for the
                         * downstream service first in order to accurately
                         * report the service that was the source of a pipeline
                         * replication problem.
                         */
                        futRmt.get();
                        futLoc.get();

                    } finally {
                        if (!futRmt.isDone()) {
                            // cancel remote Future unless done.
                            futRmt.cancel(true/* mayInterruptIfRunning */);
                        }
                    }

                } finally {

                    // cancel the local Future.
                    futLoc.cancel(true/* mayInterruptIfRunning */);

                }

            } catch (Throwable t) {

                launderPipelineException(false/* isLeader */, token, member,
                        outerClass, t);

            }

            // done
            return null;

        }

    }

    /**
     * Core implementation handles the message and payload when received on a
     * service.
     * <p>
     * Note: Replication of the message and payload is handled by the caller.
     * The implementation of this method is NOT responsible for replication.
     * 
     * @param req
     *            The synchronization request (optional). When non-
     *            <code>null</code> the msg and payload are historical data.
     *            When <code>null</code> they are live data.
     * @param msg
     *            Metadata about a buffer containing data replicated to this
     *            node.
     * @param data
     *            The buffer containing the data.
     * 
     * @throws Exception
     */
    abstract protected void handleReplicatedWrite(final IHASyncRequest req,
            final IHAWriteMessage msg, final ByteBuffer data) throws Exception;

    /**
     * Notify that some payload bytes have been incrementally received for an
     * {@link IHAMessage}.
     * 
     * @param req
     *            The synchronization request (optional).
     * @param msg
     *            The message.
     * @param nreads
     *            The number of reads performed against the upstream socket for
     *            this message.
     * @param rdlen
     *            The number of bytes read from the socket in this read.
     * @param rem
     *            The number of bytes remaining before the payload has been
     *            fully read.
     * 
     * @throws Exception
     */
    abstract protected void incReceive(final IHASyncRequest req,
            final IHAWriteMessage msg, final int nreads, final int rdlen,
            final int rem) throws Exception;

    /**
     * A utility class that bundles together the Internet address and port at
     * which the downstream service will accept and relay cache blocks for the
     * write pipeline, and the remote interface which is used to communicate
     * with that service using RMI.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     */
    private static class PipelineState<S extends HAPipelineGlue> implements
            Externalizable {

        /**
         * 
         */
        private static final long serialVersionUID = 1L;

        /**
         * The Internet address and port at which the downstream service will
         * accept and relay cache blocks for the write pipeline.
         */
        public InetSocketAddress addr;

        /**
         * The remote interface for the downstream service which will accept
         * and relay cache blocks from this service.
         * <p>
         * Note: In order for an instance of this class to be serializable, an
         * exported proxy for the {@link HAGlue} object must be used here
         * rather than the local object reference.
         */
        public S service;

        /** Deserialization constructor. */
        @SuppressWarnings("unused")
        public PipelineState() {

        }

        public PipelineState(final S service, final InetSocketAddress addr) {

            this.service = service;

            this.addr = addr;

        }

        @SuppressWarnings("unchecked")
        public void readExternal(final ObjectInput in) throws IOException,
                ClassNotFoundException {

            addr = (InetSocketAddress) in.readObject();

            service = (S) in.readObject();

        }

        public void writeExternal(final ObjectOutput out) throws IOException {

            out.writeObject(addr);

            out.writeObject(service);

        }

    }

    /**
     * Called from ErrorTask in HAJournalServer to ensure that events are
     * processed before entering SeekConsensus.
     * 
     * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/695">
     *      HAJournalServer reports "follower" but is in SeekConsensus and is
     *      not participating in commits</a>
     */
    public void processEvents() {

        this.lock.lock();

        try {

            innerEventHandler.dispatchEvents(); // have lock, dispatch events.

        } finally {

            this.lock.unlock();

        }

    }

}