/** * Licensed to the zk1931 under one or more contributor license * agreements. See the NOTICE file distributed with this work * for additional information regarding copyright ownership. * The ASF licenses this file to you under the Apache License, * Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the * License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.github.zk1931.jzab; import com.github.zk1931.jzab.proto.ZabMessage.Message; import com.github.zk1931.jzab.proto.ZabMessage.Message.MessageType; import com.github.zk1931.jzab.ZabException.InvalidPhase; import com.github.zk1931.jzab.ZabException.TooManyPendingRequests; import java.io.IOException; import java.nio.ByteBuffer; import java.security.GeneralSecurityException; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingQueue; import java.util.List; import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.MDC; /** * Zab is a fault-tolerant, replicated protocol that guarantees all requests * submitted to it will be delivered in same order to all servers in the * cluster. The Zab class exposes all the operations of Jzab library. */ public class Zab { private static final Logger LOG = LoggerFactory.getLogger(Zab.class); /** * Future for background "main" thread. */ private final Future<Void> ft; /** * Server Id for Zab. */ private String serverId; /** * Configuration for Zab. */ private final ZabConfig config; /** * StateMachine callback. */ private final StateMachine stateMachine; /** * Background thread for Zab. */ private final MainThread mainThread; /** * Constructs a Zab instance by recovering from the log directory. * * @param stateMachine the state machine implementation of application. * @param config the configuration for Jzab, see {@link ZabConfig}. */ public Zab(StateMachine stateMachine, ZabConfig config) { this(stateMachine, config, null, null, null); } /** * Constructs a Zab instance by joining an existing cluster. This constructor * is supposed to be called only for the very first time to initialize the * log directory, once the log directory gets initialized you should call * {@link #Zab(StateMachine, ZabConfig) Zab} which recovers the Zab instance * from log directory. * * @param stateMachine the state machine implementation of application. * @param config the configuration for Jzab, see {@link ZabConfig}. * @param serverId ID ("host:port") of this server. * @param joinPeer the ID of peer you want to join in, the ID is a host:port * string of the peer. The first server bootstraps the cluster by joining * itself. */ public Zab(StateMachine stateMachine, ZabConfig config, String serverId, String joinPeer) { this(stateMachine, config, serverId, joinPeer, null, null, null); } /** * Constructs a Zab instance by booting from static cluster configuration. * This constructor is supposed to be called only for the very first time to * initialize the log directory, once the log directory gets initialized you * should call {@link #Zab(StateMachine, ZabConfig) Zab} which recovers the * Zab instance from log directory. * * @param stateMachine the state machine implementation of application. * @param config the configuration for Jzab, see {@link ZabConfig}. * @param serverId ID ("host:port") of this server. * @param peers the IDs of the servers in cluster, including itself. */ public Zab(StateMachine stateMachine, ZabConfig config, String serverId, Set<String> peers) { this(stateMachine, config, serverId, peers, null, null, null); } // This constructor is for internal testing purpose. "initState" allows us to // setup initial state of Jzab before starting Jzab. "stateCallback" allows // us catch the state transition happend in the runtime. "failureCallback" // allows us to inject failures to different points of code path. Zab(StateMachine stateMachine, ZabConfig config, PersistentState initState, StateChangeCallback stateCallback, FailureCaseCallback failureCallback) { this(stateMachine, config, null, null, null, initState, stateCallback, failureCallback); } // Same as the above, but for joining a peer. Zab(StateMachine stateMachine, ZabConfig config, String serverId, String joinPeer, PersistentState initState, StateChangeCallback stateCallback, FailureCaseCallback failureCallback) { this(stateMachine, config, serverId, joinPeer, null, initState, stateCallback, failureCallback); } // Starts with static configuration. Zab(StateMachine stateMachine, ZabConfig config, String serverId, Set<String> peers, PersistentState initState, StateChangeCallback stateCallback, FailureCaseCallback failureCallback) { this(stateMachine, config, serverId, null, peers, initState, stateCallback, failureCallback); } Zab(StateMachine stateMachine, ZabConfig config, String serverId, String joinPeer, Set<String> peers, PersistentState initState, StateChangeCallback stateCallback, FailureCaseCallback failureCallback) { this.config = config; this.stateMachine = stateMachine; this.serverId = serverId; try { // Initialize. this.mainThread = new MainThread(joinPeer, peers, stateCallback, failureCallback, initState); } catch (Exception e) { LOG.warn("Caught an exception while initializing Zab."); throw new IllegalStateException("Failed to initialize Zab.", e); } ExecutorService es = Executors.newSingleThreadExecutor(DaemonThreadFactory.FACTORY); // Starts main thread. this.ft = es.submit(this.mainThread); es.shutdown(); } /** * Get the future of the background working thread of Zab. Users can check * the status of the thread via the future. * * @return the future object of MainThread. */ public Future<Void> getFuture() { return this.ft; } /** * Submits a request to Zab. Under the hood, followers forward requests to the * leader and the leader will be responsible for converting this request to * idempotent transaction and broadcasting. If you send request in * non-broadcasting phase, the operation will fail. * * @param request the request to send through Zab * @param ctx context to be provided to the callback * @throws ZabException.InvalidPhase if Zab is not in broadcasting phase. * @throws ZabException.TooManyPendingRequests if the pending requests exceeds * the certain size, for example: if there are more pending requests than * ZabConfig.MAX_PENDING_REQS. */ public void send(ByteBuffer request, Object ctx) throws InvalidPhase, TooManyPendingRequests { this.mainThread.send(request, ctx); } /** * Flushes a request through pipeline. The flushed request will be delivered * in order with other sending requests, but it will not be convereted to * idempotent transaction and will not be persisted in log. And it will only * be delivered on the server who issued this request. The purpose of flush * is to allow implementing a consistent read-after-write. If you send flush * request in non-broadcasting phase, the operation will fail. * * @param request the request to be flushed. * @param ctx context to be provided to the callback * @throws ZabException.InvalidPhase if Zab is not in broadcasting phase. * @throws ZabException.TooManyPendingRequests if the pending requests exceeds * the certain size, for example: if there are more pending requests than * ZabConfig.MAX_PENDING_REQS. */ public void flush(ByteBuffer request, Object ctx) throws InvalidPhase, TooManyPendingRequests { this.mainThread.flush(request, ctx); } /** * Removes a peer from the cluster. If you send remove request in * non-broadcasting phase, the operation will fail. * * @param peerId the id of the peer who will be removed from the cluster. * @param ctx context to be provided to the callback * @throws ZabException.InvalidPhase if Zab is not in broadcasting phase. * @throws ZabException.TooManyPendingRequests if there is a pending snapshot * request. */ public void remove(String peerId, Object ctx) throws InvalidPhase, TooManyPendingRequests { this.mainThread.remove(peerId, ctx); } /** * Issues the request to take a snapshot. The {@link StateMachine#save} * callback will be called for serializing the application's state to disk. * * @param ctx context to be provided to the callback * @throws ZabException.InvalidPhase if Zab is not in broadcasting phase. * @throws ZabException.TooManyPendingRequests if there is a pending snapshot * request. */ public void takeSnapshot(Object ctx) throws InvalidPhase, TooManyPendingRequests { this.mainThread.takeSnapshot(ctx); } /** * Shut down the Zab. * * @throws InterruptedException in case of it's interrupted. */ public void shutdown() throws InterruptedException { this.mainThread.shutdown(); LOG.debug("Shutdown successfully."); } /** * Returns the server Id for this Zab instance. The application which * recovers from log directory probably needs to know the server Id of Zab. * * @return the server Id of this Zab instance. */ public String getServerId() { return this.serverId; } /** * Interface of callbacks which will be called when phase change happens. * Used for testing purpose. * * Phase changes : * * leaderDiscovering - leaderSynchronizating - leaderBroadcasting * / \ * electing Exit * \ / * followerDiscovering - followerSynchronizating - followerBroadcasting * */ interface StateChangeCallback { /** * Will be called when entering electing phase. */ void electing(); /** * Will be called when entering discovering phase of leader. * * @param electedLeader the elected leader. */ void leaderDiscovering(String electedLeader); /** * Will be called when entering discovery phase of follower. * * @param electedLeader the elected leader of this follower. */ void followerDiscovering(String electedLeader); /** * Will be called on leader side when the owner of initial history is * chosen. * * @param server the id of the server whose history is selected for * synchronization. * @param aEpoch the acknowledged epoch of the node whose initial history * is chosen for synchronization. * @param zxid the last transaction id of the node whose initial history * is chosen for synchronization. */ void initialHistoryOwner(String server, long aEpoch, Zxid zxid); /** * Will be called when entering synchronization phase of leader. * * @param epoch the established epoch. */ void leaderSynchronizing(long epoch); /** * Will be called when entering synchronization phase of follower. * * @param epoch the established epoch. */ void followerSynchronizing(long epoch); /** * Will be called when entering broadcasting phase of leader. * * @param epoch the acknowledged epoch (f.a). * @param history the initial history (f.h) of broadcasting phase. */ void leaderBroadcasting(long epoch, List<Transaction> history, ClusterConfiguration config); /** * Will be called when entering broadcasting phase of follower. * * @param epoch the current epoch (f.a). * @param history the initial history (f.h) of broadcasting phase. */ void followerBroadcasting(long epoch, List<Transaction> history, ClusterConfiguration config); /** * Will be called when Zab stops running. */ void leftCluster(); /** * Will be called once a COP is committed on leader side. */ void commitCop(); } /** * Will be thrown to force servers go back to electing phase, for test * purpose only. */ static class SimulatedException extends RuntimeException { private static final long serialVersionUID = 1L; public SimulatedException(String desc) { super(desc); } public SimulatedException() {} } /** * Interface of callbacks which simulate different kinds of failure cases for * testing purpose. */ abstract static class FailureCaseCallback { /** * Will be called when entering discovering phase of leader. * * @throws SimulatedException forces leader goes back to electing phase. */ void leaderDiscovering() {}; /** * Will be called when entering discovering phase of followers. * * @throws SimulatedException forces followers goes back to electing phase. */ void followerDiscovering() {}; /** * Will be called when entering synchronizing phase of leader. * * @throws SimulatedException forces leader goes back to electing phase. */ void leaderSynchronizing() {}; /** * Will be called when entering synchronizing phase of followers. * * @throws SimulatedException forces followers goes back to electing phase. */ void followerSynchronizing() {}; /** * Will be called when entering broadcasting phase of leader. * * @throws SimulatedException forces leader goes back to electing phase. */ void leaderBroadcasting() {}; /** * Will be called when entering discovering phase of followers. * * @throws SimulatedException forces followers goes back to electing phase. */ void followerBroadcasting() {}; } /** * Main working thread for Zab. */ class MainThread implements Callable<Void>, Transport.Receiver { /** * The state of Zab, it will be shared through different instance of * Participant object. */ private ParticipantState participantState; /** * Message queue. The receiving callback simply parses the message and puts * it in queue, it's up to Leader/Follower/Election to take out * and process the message. */ private final BlockingQueue<MessageTuple> messageQueue = new LinkedBlockingQueue<>(); private final String joinPeer; private final StateChangeCallback stateChangeCallback; private final Transport transport; private final Election election; private final PersistentState persistence; private Participant participant = null; MainThread(String joinPeer, Set<String> peers, StateChangeCallback stateChangeCallback, FailureCaseCallback failureCallback, PersistentState initState) throws IOException, InterruptedException, GeneralSecurityException { this.joinPeer = joinPeer; this.stateChangeCallback = stateChangeCallback; if (initState == null) { // If there's no initial state, we'll constructs the PersistenState // from the the log directory. persistence = new PersistentState(config.getLogDir()); } else { persistence = initState; } if (joinPeer != null) { // First time start up. Joining someone. if (!persistence.isEmpty()) { LOG.error("The log directory is not empty while joining."); throw new RuntimeException("Log directory must be empty."); } } else { // Means either it starts booting from static configuration or // recovering from a log directory. if (serverId != null) { LOG.debug("Boots from static configuration."); Zxid version = new Zxid(0, -1); ClusterConfiguration cnf = new ClusterConfiguration(version, peers, serverId); persistence.setLastSeenConfig(cnf); } else { // Restore from log directory. LOG.debug("Restores from log directory {}", config.getLogDir()); ClusterConfiguration cnf = persistence.getLastSeenConfig(); if (cnf == null) { throw new RuntimeException("Can't find configuration file."); } serverId = cnf.getServerId(); persistence.cleanupClusterConfigFiles(); } } MDC.put("serverId", serverId); // Creates transport. this.transport = new NettyTransport(serverId, this, config.getSslParameters(), persistence.getLogDir()); election = new FastLeaderElection(persistence, transport, messageQueue); participantState = new ParticipantState(persistence, serverId, transport, messageQueue, stateChangeCallback, failureCallback, config.getMinSyncTimeoutMs(), election); } @Override public Void call() throws Exception { try { if (this.joinPeer != null) { join(this.joinPeer); } while (true) { if (stateChangeCallback != null) { stateChangeCallback.electing(); } LOG.debug("Waiting for electing a leader."); String leader = this.election.electLeader(); LOG.debug("Select {} as leader.", leader); if (leader.equals(serverId)) { participant = new Leader(participantState, stateMachine, config); ((Leader)participant).lead(); } else { participant = new Follower(participantState, stateMachine, config); ((Follower)participant).follow(leader); } } } catch (InterruptedException e) { LOG.debug("Caught Interrupted exception, it has been shut down?"); Thread.currentThread().interrupt(); } catch (Participant.LeftCluster e) { LOG.debug("Zab has been shutdown."); } catch (Exception e) { LOG.error("Caught exception :", e); throw e; } finally { participantState.getTransport().shutdown(); } if (stateChangeCallback != null) { stateChangeCallback.leftCluster(); } return null; } @Override public void onReceived(String source, Message message) { MessageTuple tuple = new MessageTuple(source, message); this.messageQueue.add(tuple); } @Override public void onDisconnected(String server) { LOG.debug("ONDISCONNECTED from {}", server); Message disconnected = MessageBuilder.buildDisconnected(server); this.participantState.enqueueMessage(new MessageTuple(serverId, disconnected)); } void join(String peer) throws Exception { if (peer.equals(serverId)) { LOG.debug("Trying to join itself. Becomes leader directly."); participant = new Leader(participantState, stateMachine, config); } else { LOG.debug("Trying to join {}.", peer); participant = new Follower(participantState, stateMachine, config); } participant.join(peer); } void send(ByteBuffer buffer, Object ctx) throws InvalidPhase, TooManyPendingRequests { if (this.participant == null) { throw new InvalidPhase("Zab.send() called while recovering"); } this.participant.send(buffer, ctx); } void remove(String peerId, Object ctx) throws InvalidPhase, TooManyPendingRequests { if (this.participant == null) { throw new InvalidPhase("Zab.remove() called while recovering"); } this.participant.remove(peerId, ctx); } void flush(ByteBuffer buffer, Object ctx) throws InvalidPhase, TooManyPendingRequests { if (this.participant == null) { throw new InvalidPhase("Zab.flush() called while recovering"); } this.participant.flush(buffer, ctx); } void takeSnapshot(Object ctx) throws InvalidPhase, TooManyPendingRequests { if (this.participant == null) { throw new InvalidPhase("Zab.takeSnapshot() called while recovering"); } this.participant.takeSnapshot(ctx); } // Waits until MainThread thread has been shutdown. This function should be // called from a different thread. void shutdown() throws InterruptedException { this.participantState.enqueueShutdown(); try { ft.get(); } catch (ExecutionException ex) { throw new RuntimeException(ex); } finally { // Make sure we shutdown the transport in the end. this.transport.shutdown(); } } /** * Clears all the messages in the message queue, clears the peer in * transport if it's the DISCONNECTED message. This function should be * called only right before going back to recovery. */ protected void clearMessageQueue() { MessageTuple tuple = null; while ((tuple = messageQueue.poll()) != null) { Message msg = tuple.getMessage(); if (msg.getType() == MessageType.DISCONNECTED) { this.transport.clear(msg.getDisconnected().getServerId()); } else if (msg.getType() == MessageType.SHUT_DOWN) { throw new Participant.LeftCluster("Shutdown Zab."); } } } } }