/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltcore.zk; import java.util.Collections; import java.util.List; import java.util.ListIterator; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.zookeeper_voltpatches.CreateMode; import org.apache.zookeeper_voltpatches.KeeperException; import org.apache.zookeeper_voltpatches.WatchedEvent; import org.apache.zookeeper_voltpatches.Watcher; import org.apache.zookeeper_voltpatches.ZooDefs.Ids; import org.apache.zookeeper_voltpatches.ZooKeeper; import org.voltcore.utils.CoreUtils; import com.google_voltpatches.common.collect.ImmutableSet; import com.google_voltpatches.common.collect.Sets; public class LeaderElector { // The root is always created as INITIALIZING until the first participant is added, // then it's changed to INITIALIZED. public static final byte INITIALIZING = 0; public static final byte INITIALIZED = 1; private final ZooKeeper zk; private final String dir; private final String prefix; private final byte[] data; private final LeaderNoticeHandler cb; private String node = null; private Set<String> knownChildren = null; private volatile String leader = null; private volatile boolean isLeader = false; private final ExecutorService es; private final AtomicBoolean m_done = new AtomicBoolean(false); private final Runnable electionEventHandler = new Runnable() { @Override public void run() { try { leader = watchNextLowerNode(); } catch (KeeperException.SessionExpiredException e) { // lost the full connection. some test cases do this... // means zk shutdown without the elector being shutdown. // ignore. e.printStackTrace(); } catch (KeeperException.ConnectionLossException e) { // lost the full connection. some test cases do this... // means shutdoown without the elector being // shutdown; ignore. e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } catch (Exception e) { org.voltdb.VoltDB.crashLocalVoltDB( "Unexepected failure in LeaderElector.", true, e); } if (node != null && node.equals(leader)) { // become the leader isLeader = true; if (cb != null) { cb.becomeLeader(); } } } }; private final Runnable childrenEventHandler = new Runnable() { @Override public void run() { try { checkForChildChanges(); } catch (KeeperException.SessionExpiredException e) { // lost the full connection. some test cases do this... // means zk shutdown without the elector being shutdown. // ignore. e.printStackTrace(); } catch (KeeperException.ConnectionLossException e) { // lost the full connection. some test cases do this... // means shutdoown without the elector being // shutdown; ignore. e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } catch (Exception e) { org.voltdb.VoltDB.crashLocalVoltDB( "Unexepected failure in LeaderElector.", true, e); } } }; private class ChildrenWatcher implements Watcher { @Override public void process(WatchedEvent event) { try { if (!m_done.get()) { es.submit(childrenEventHandler); } } catch (RejectedExecutionException e) { } } } private final ChildrenWatcher childWatcher = new ChildrenWatcher(); private class ElectionWatcher implements Watcher { @Override public void process(final WatchedEvent event) { try { if (!m_done.get()) { es.submit(electionEventHandler); } } catch (RejectedExecutionException e) { } } } private final ElectionWatcher electionWatcher = new ElectionWatcher(); public LeaderElector(ZooKeeper zk, String dir, String prefix, byte[] data, LeaderNoticeHandler cb) { this.zk = zk; this.dir = dir; this.prefix = prefix; this.data = data; this.cb = cb; es = CoreUtils.getCachedSingleThreadExecutor("Leader elector-" + dir, 15000); } /** * Provide a way for clients to create nodes which comply with the leader election * format without participating in a leader election * @throws InterruptedException * @throws KeeperException */ public static String createParticipantNode(ZooKeeper zk, String dir, String prefix, byte[] data) throws KeeperException, InterruptedException { createRootIfNotExist(zk, dir); String node = zk.create(ZKUtil.joinZKPath(dir, prefix + "_"), data, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); // Unlock the dir as initialized zk.setData(dir, new byte[] {INITIALIZED}, -1); return node; } public static void createRootIfNotExist(ZooKeeper zk, String dir) throws KeeperException, InterruptedException { // create the election root node if it doesn't exist. try { zk.create(dir, new byte[] {INITIALIZING}, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } catch (KeeperException.NodeExistsException e) { // expected on all nodes that don't start() first. } } /** * Start leader election. * * Creates an ephemeral sequential node under the given directory and check * if we are the first one who created it. * * For details about the leader election algorithm, @see <a href= * "http://zookeeper.apache.org/doc/trunk/recipes.html#sc_leaderElection" * >Zookeeper Leader Election</a> * * @param block true for blocking operation, false for nonblocking * @throws Exception */ public void start(boolean block) throws KeeperException, InterruptedException, ExecutionException { node = createParticipantNode(zk, dir, prefix, data); Future<?> task = es.submit(electionEventHandler); if (block) { task.get(); } //Only do the extra work for watching children if a callback is registered if (cb != null) { task = es.submit(childrenEventHandler); if (block) { task.get(); } } } public boolean isLeader() { return isLeader; } public String getNode() { return node; } /** * Deletes the ephemeral node. Make sure that no future watches will fire. * * @throws InterruptedException * @throws KeeperException */ synchronized public void shutdown() throws InterruptedException, KeeperException { m_done.set(true); es.shutdown(); es.awaitTermination(365, TimeUnit.DAYS); } /** * Set a watch on the node that comes before the specified node in the * directory. * @return The lowest sequential node * @throws Exception */ private String watchNextLowerNode() throws KeeperException, InterruptedException { /* * Iterate through the sorted list of children and find the given node, * then setup a electionWatcher on the previous node if it exists, otherwise the * previous of the previous...until we reach the beginning, then we are * the lowest node. */ List<String> children = zk.getChildren(dir, false); Collections.sort(children); ListIterator<String> iter = children.listIterator(); String me = null; //Go till I find myself. while (iter.hasNext()) { me = ZKUtil.joinZKPath(dir, iter.next()); if (me.equals(node)) { break; } } assert (me != null); //Back on me iter.previous(); String lowest = null; //Until we have previous nodes and we set a watch on previous node. while (iter.hasPrevious()) { //Proess my lower nodes and put a watch on whats live String previous = ZKUtil.joinZKPath(dir, iter.previous()); if (zk.exists(previous, electionWatcher) != null) { lowest = previous; break; } } //If we could not watch any lower node we are lowest and must become leader. if (lowest == null) { return node; } return lowest; } /* * Check for a change in present nodes */ private void checkForChildChanges() throws KeeperException, InterruptedException { /* * Iterate through the sorted list of children and find the given node, * then setup a electionWatcher on the previous node if it exists, otherwise the * previous of the previous...until we reach the beginning, then we are * the lowest node. */ Set<String> children = ImmutableSet.copyOf(zk.getChildren(dir, childWatcher)); boolean topologyChange = false; boolean removed = false; boolean added = false; if (knownChildren != null) { if (!knownChildren.equals(children)) { removed = !Sets.difference(knownChildren, children).isEmpty(); added = !Sets.difference(children, knownChildren).isEmpty(); topologyChange = true; } } knownChildren = children; if (topologyChange && cb != null) { cb.noticedTopologyChange(added, removed); } } public static String electionDirForPartition(String path, int partition) { return ZKUtil.path(path, "partition_" + partition); } public static int getPartitionFromElectionDir(String partitionDir) { return Integer.parseInt(partitionDir.substring("partition_".length())); } public static String getPrefixFromChildName(String childName) { return childName.split("_")[0]; } }