/* * Copyright 2010 Outerthought bvba * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.lilyproject.util.zookeeper; import java.util.Collections; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; import org.apache.zookeeper.Watcher.Event.EventType; import org.apache.zookeeper.Watcher.Event.KeeperState; import org.apache.zookeeper.ZooDefs; import org.apache.zookeeper.data.Stat; import org.lilyproject.util.Logs; /** * Simple leader election system, not optimized for large numbers of potential leaders (could be improved * for herd effect, see ZK recipe). * * <p>It currently only reports if this client is the leader or not, it does not report who the leader is, * but that could be added. * * <p>It is intended for 'active leaders', which should give up their leader role as soon as we * are disconnected from the ZK cluster, rather than only when we get a session expiry event * (see http://markmail.org/message/o6whuii7wlf2a64c). * * <p>The leader state is reported via the {@link LeaderElectionCallback}, which is called from * within a different Thread than the ZooKeeper event thread. */ public class LeaderElection { private ZooKeeperItf zk; private String position; private String electionPath; private LeaderElectionCallback callback; private boolean elected = false; private ChildrenWatcher watcher = new ChildrenWatcher(); private ConnectStateWatcher connectStateWatcher = new ConnectStateWatcher(); private boolean stopped = false; private LeaderProvisioner leaderProvisioner = new LeaderProvisioner(); enum LeaderState { I_AM_LEADER, I_AM_NOT_LEADER } private Log log = LogFactory.getLog(this.getClass()); /** * * @param position a name for what position this leader election is about, used in informational messages. * @param electionPath path under which the ephemeral leader election nodes should be created. The path * will be created if it does not exist. The path should not end on a slash. */ public LeaderElection(ZooKeeperItf zk, String position, String electionPath, LeaderElectionCallback callback) throws LeaderElectionSetupException, InterruptedException, KeeperException { this.zk = zk; this.position = position; this.electionPath = electionPath; this.callback = callback; proposeAsLeader(); zk.addDefaultWatcher(connectStateWatcher); leaderProvisioner.start(); } public void stop() throws InterruptedException { // Note that ZooKeeper does not have a way to remove watches (see ZOOKEEPER-422) stopped = true; zk.removeDefaultWatcher(connectStateWatcher); leaderProvisioner.shutdown(); if (leaderProvisioner.currentState == LeaderState.I_AM_LEADER) { try { callback.deactivateAsLeader(); } catch (InterruptedException e) { throw e; } catch (Throwable t) { log.error("Error stopping the leader for " + position, t); } } } private void proposeAsLeader() throws LeaderElectionSetupException, InterruptedException, KeeperException { ZkUtil.createPath(zk, electionPath); try { // In case of connection loss, a node might have been created for us (we do not know it). Therefore, // retrying upon connection loss is important, so that we can continue with watching the leaders. // Later on, we do not look at the name of the node we created here, but at the owner. zk.retryOperation(new ZooKeeperOperation<String>() { @Override public String execute() throws KeeperException, InterruptedException { return zk.create(electionPath + "/n_", null, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); } }); } catch (KeeperException e) { throw new LeaderElectionSetupException("Error creating leader election zookeeper node below " + electionPath, e); } watchLeaders(); } private synchronized void watchLeaders() { try { // Here the code could be improved: providing the watcher here can give the so-called // "herd-effect", especially when there are many potential leaders. List<String> children = zk.getChildren(electionPath, watcher); // The child sequence numbers are fixed-with, prefixed with zeros, so we can sort them as strings Collections.sort(children); if (log.isDebugEnabled()) { log.debug("Leaders changed for the position of " + position + ", they are now:"); for (String child : children) { log.debug(child); } } // This list should never be empty, at least we are in it final String leader = children.get(0); // While we could compare the leader name with our own ephemeral node name, it is safer to compare // with the Stat.ephemeralOwner field. This is because the creation of the ephemeral node might have // failed with a ConnectionLoss exception, in which case we might have retried and hence have two // leader nodes allocated. Stat stat = zk.exists(electionPath + "/" + leader, false); if (stat.getEphemeralOwner() == zk.getSessionId() && !elected) { elected = true; log.info("Elected as leader for the position of " + position); leaderProvisioner.setRequiredState(LeaderState.I_AM_LEADER); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); log.error("Error getting children of path " + electionPath, e); } catch (KeeperException e) { // If the exception happened on the zk.getChildren() call, then the watcher will not have been // set, and we not be notified of future changes anymore. Thus we will not know anymore if we could // be the leader, but if this is only a disconnected exception, we still could be without knowing! // However, above we keep retrying the operation forever, so we will only get here if the session // is really expired. log.error("Error getting children of path " + electionPath, e); } } public class ChildrenWatcher implements Watcher { @Override public void process(WatchedEvent event) { if (stopped) { return; } if (event.getType() == EventType.NodeChildrenChanged && event.getPath().equals(electionPath)) { watchLeaders(); } } } public class ConnectStateWatcher implements Watcher { @Override public void process(WatchedEvent event) { if (stopped) { return; } if (event.getType() == EventType.None && (event.getState().equals(Event.KeeperState.Disconnected) || event.getState().equals(Event.KeeperState.Expired))) { if (elected) { elected = false; log.info("No longer leader for the position of " + position); leaderProvisioner.setRequiredState(LeaderState.I_AM_NOT_LEADER); } // Note that if we get a disconnected event here, our watcher is not unregistered, thus we will // get a connected event on this watcher once we are reconnected. // Since we are not owner of the ZooKeeper handle, we assume Expired states are handled // elsewhere in the application. } else if (event.getType() == EventType.None && event.getState() == KeeperState.SyncConnected) { // Upon reconnect, since our session was not expired, our ephemeral node will still // exist (it might even be the leader), therefore have a look at the leaders. watchLeaders(); } } } /** * Activates or deactivates the leader. This is done in a separate thread, rather than in the * ZooKeeper event thread, in order not to block delivery of messages to other ZK watchers. A simple solution * to this would be to simply launch a thread in the LeaderElectionCallback methods. But then the handling * of the activateAsLeader() and deactivateAsLeader() could run in parallel, which we do not desire. An * improvement would be to put their processing in a queue. But when we have a lot of disconnected/connected * events in a short time frame, faster than they are processed, it would not make sense to process them one by * one, we are only interested in bringing the leader to latest requested state. * * Therefore, the solution used here just keeps a 'requiredState' variable and notifies a monitor when * it is changed. */ private class LeaderProvisioner implements Runnable { private volatile LeaderState currentState = LeaderState.I_AM_NOT_LEADER; private volatile LeaderState requiredState = LeaderState.I_AM_NOT_LEADER; private final Object stateLock = new Object(); private Thread thread; private volatile boolean stop; // do not rely only on Thread.interrupt since some libraries eat interruptions public synchronized void shutdown() throws InterruptedException { if (thread == null || !thread.isAlive()) { return; } stop = true; thread.interrupt(); Logs.logThreadJoin(thread); thread.join(); thread = null; } public synchronized void start() { stop = false; thread = new Thread(this, "LeaderProvisioner for " + position); thread.start(); } @Override public void run() { while (!Thread.interrupted() && !stop) { try { if (currentState != requiredState) { if (requiredState == LeaderState.I_AM_LEADER) { callback.activateAsLeader(); currentState = LeaderState.I_AM_LEADER; } else if (requiredState == LeaderState.I_AM_NOT_LEADER) { callback.deactivateAsLeader(); currentState = LeaderState.I_AM_NOT_LEADER; } } synchronized (stateLock) { if (currentState == requiredState && !stop) { stateLock.wait(); } } } catch (InterruptedException e) { Thread.currentThread().interrupt(); // we stop working return; } catch (Throwable t) { log.error("Error in leader provisioner for " + position, t); } } } public void setRequiredState(LeaderState state) { synchronized (stateLock) { this.requiredState = state; stateLock.notifyAll(); } } } }