package org.apache.hadoop.hdfs;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.ZooDefs.Perms;
import org.apache.zookeeper.data.ACL;
import org.apache.zookeeper.data.Id;
import org.apache.zookeeper.data.Stat;

/**
 * Reads and writes the registration znode of a filesystem's primary in
 * ZooKeeper.
 *
 * <p>The znode path is derived from the configured prefix and the logical
 * address of the filesystem (see {@link #getRegistrationNode(String)}).
 * Unless the client was built with a watcher and
 * {@code fs.ha.zookeeper.watch} is true, a ZooKeeper connection is opened
 * for each read and closed again afterwards. Writes always close the
 * connection when they finish.
 *
 * <p>All methods that touch the ZooKeeper handle are synchronized on this
 * instance.
 */
public class AvatarZooKeeperClient {

  // Making it large enough to be sure that the cluster is down
  // these retries go one after another so they do not take long
  public static final int ZK_CONNECTION_RETRIES = 100;

  // Payload written to the ancestors of a registration znode
  private static final byte[] EMPTY_PAYLOAD = new byte[0];

  private final String connection;
  private final int timeout;
  private final boolean watch;
  // Prefix under which the data for this client will be stored
  private final String prefix;
  private final Watcher watcher;

  // Lazily created by initZK() and reset to null by stopZK()
  private ZooKeeper zk;

  /**
   * Watcher that ignores every event. This is a stub for ZK compatibility:
   * if no watcher is supplied ZK will keep writing errors to the log.
   */
  private static final class NoOpWatcher implements Watcher {
    @Override
    public void process(WatchedEvent event) {
      // intentionally empty
    }
  }

  /**
   * Builds a client from the {@code fs.ha.zookeeper.*} configuration keys
   * (quorum, timeout, watch, prefix).
   *
   * @param conf configuration to read the ZooKeeper settings from
   * @param watcher watcher to register with ZooKeeper; if it is null the
   *          watch policy from the configuration is ignored and no watches
   *          are set
   */
  public AvatarZooKeeperClient(Configuration conf, Watcher watcher) {
    this.connection = conf.get("fs.ha.zookeeper.quorum");
    this.timeout = conf.getInt("fs.ha.zookeeper.timeout", 3000);
    boolean watchRequested = conf.getBoolean("fs.ha.zookeeper.watch", false);
    this.prefix = conf.get("fs.ha.zookeeper.prefix", "/hdfs");

    // Without a watcher there is nothing to deliver events to, so watching
    // is switched off regardless of the configured policy.
    this.watch = watchRequested && watcher != null;
    this.watcher = this.watch ? watcher : new NoOpWatcher();
  }

  /**
   * Overwrites the registration znode of the given address with null data.
   *
   * @param address logical address of the filesystem
   * @throws IOException if the znode could not be written
   */
  public void clearPrimary(String address) throws IOException {
    String node = getRegistrationNode(address);
    zkCreateRecursively(node, null);
  }

  /**
   * Stores {@code realAddress}, encoded as UTF-8, in the registration znode
   * of the given address.
   *
   * @param address logical address of the filesystem
   * @param realAddress value to store in the znode
   * @throws UnsupportedEncodingException if UTF-8 is not supported
   * @throws IOException if the znode could not be written
   */
  public void registerPrimary(String address, String realAddress)
      throws UnsupportedEncodingException, IOException {
    String node = getRegistrationNode(address);
    zkCreateRecursively(node, realAddress.getBytes("UTF-8"));
  }

  /**
   * Writes {@code data} to {@code zNode}, creating the znode and any missing
   * ancestors as persistent nodes. The connection is always closed when the
   * method returns.
   *
   * <p>NOTE(review): ancestors that already exist are overwritten with an
   * empty payload on every call. This is kept from the original
   * implementation; confirm that no ancestor is expected to carry data of
   * its own.
   *
   * @param zNode absolute path of the znode to write
   * @param data payload for the last path component; may be null
   * @throws IOException if the connection cannot be created, if ZooKeeper
   *           reports an error other than a retried connection loss, or if
   *           the thread is interrupted before the write completes
   */
  private synchronized void zkCreateRecursively(String zNode, byte[] data)
      throws IOException {
    initZK();
    FileSystem.LOG.debug("create " + zNode);

    List<ACL> acls = new ArrayList<ACL>(1);
    acls.add(new ACL(Perms.ALL, new Id("world", "anyone")));

    String[] parts = zNode.split("/");
    StringBuilder path = new StringBuilder();
    try {
      for (int i = 0; i < parts.length; i++) {
        if (parts[i].isEmpty()) {
          continue;
        }
        path.append('/').append(parts[i]);
        boolean last = (i == parts.length - 1);
        writeNode(path.toString(), last ? data : EMPTY_PAYLOAD, acls);
      }
      FileSystem.LOG.info("Wrote zNode " + zNode);
    } catch (KeeperException e) {
      // Surface the failure: the caller must not assume the znode was written
      throw new IOException("Failed to write zNode " + zNode, e);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new IOException("Interrupted while writing zNode " + zNode, e);
    } finally {
      closeQuietly();
    }
  }

  /**
   * Sets the data of {@code path} if it exists, otherwise creates it.
   * Connection losses are retried up to {@link #ZK_CONNECTION_RETRIES}
   * times; any other ZooKeeper error is rethrown immediately.
   */
  private void writeNode(String path, byte[] payload, List<ACL> acls)
      throws KeeperException, InterruptedException {
    int failures = 0;
    while (true) {
      try {
        if (zk.exists(path, false) != null) {
          // -1 indicates that we should update zNode regardless of its
          // version since we are not utilizing versions in zNode
          zk.setData(path, payload, -1);
        } else {
          zk.create(path, payload, acls, CreateMode.PERSISTENT);
        }
        return;
      } catch (KeeperException ex) {
        if (KeeperException.Code.CONNECTIONLOSS != ex.code()
            || failures >= ZK_CONNECTION_RETRIES) {
          throw ex;
        }
        failures++;
        FileSystem.LOG.warn("Connection loss while writing zNode " + path
            + ", attempt " + failures, ex);
      }
    }
  }

  /**
   * Tries to connect to ZooKeeper. To be used when we need to test if the
   * ZooKeeper cluster is available and the config is correct. The connection
   * is closed again unless this client sets watches.
   *
   * @throws IOException
   * @throws InterruptedException
   */
  public synchronized void primeConnection()
      throws IOException, InterruptedException {
    initZK();
    if (!watch) {
      stopZK();
    }
  }

  /** Creates the ZooKeeper handle if there is none. */
  private void initZK() throws IOException {
    if (zk == null) {
      zk = new ZooKeeper(connection, timeout, watcher);
    }
  }

  /** Closes the ZooKeeper handle, if any, and forgets it. */
  private void stopZK() throws InterruptedException {
    if (zk == null) {
      return;
    }
    zk.close();
    zk = null;
  }

  /**
   * Closes the ZooKeeper handle on a cleanup path. An interrupt during the
   * close is recorded on the thread instead of being thrown so that it
   * cannot mask the exception (or result) already in flight.
   */
  private void closeQuietly() {
    try {
      stopZK();
    } catch (InterruptedException ex) {
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Get the information stored in the node of zookeeper. If retry is set
   * to true it will keep retrying until the data in that node is available
   * (failover case). If the retry is set to false it will return the first
   * value that it gets from the zookeeper.
   *
   * <p>Connection losses are retried up to {@link #ZK_CONNECTION_RETRIES}
   * times in a row. When this client does not set watches the connection is
   * closed before the method returns or throws.
   *
   * @param node the path of zNode in zookeeper
   * @param stat {@link Stat} object that will contain stats of the node
   * @param retry if true will retry until the data in znode is not null
   * @return byte[] the data in the znode
   * @throws IOException
   * @throws KeeperException
   * @throws InterruptedException if interrupted while talking to ZooKeeper
   *           or while waiting for a failover to complete
   */
  private synchronized byte[] getNodeData(String node, Stat stat,
      boolean retry) throws IOException, KeeperException,
      InterruptedException {
    int failures = 0;
    try {
      while (true) {
        initZK();
        byte[] data;
        try {
          data = zk.getData(node, watch, stat);
        } catch (KeeperException kex) {
          if (KeeperException.Code.CONNECTIONLOSS == kex.code()
              && failures < ZK_CONNECTION_RETRIES) {
            // This means there was a failure connecting to zookeeper
            // we should retry since some nodes might be down.
            failures++;
            continue;
          }
          throw kex;
        }

        if (!watch) {
          // Do not hold a connection while returning or waiting
          stopZK();
        }
        if (data != null || !retry) {
          return data;
        }

        // Failover is in progress
        // reset the failures
        failures = 0;
        DistributedAvatarFileSystem.LOG.info(
            "Failover is in progress. Waiting");
        // An interrupt ends the wait: let it propagate instead of looping
        // again with the interrupt flag set.
        Thread.sleep(DistributedAvatarFileSystem.FAILOVER_CHECK_PERIOD);
      }
    } finally {
      if (!watch) {
        // No-op on the success path; closes the handle on failure paths
        closeQuietly();
      }
    }
  }

  /**
   * Reads the registration znode of the given filesystem address and decodes
   * its content as UTF-8.
   *
   * @param address URI whose authority identifies the filesystem
   * @param stat {@link Stat} object that will contain stats of the node
   * @param retry if true, keeps polling while the znode holds no data
   * @return the stored address, or null if the znode holds no data
   * @throws IOException
   * @throws KeeperException
   * @throws InterruptedException
   */
  public String getPrimaryAvatarAddress(URI address, Stat stat, boolean retry)
      throws IOException, KeeperException, InterruptedException {
    String node = getRegistrationNode(address.getAuthority());
    byte[] data = getNodeData(node, stat, retry);
    if (data == null) {
      return null;
    }
    return new String(data, "UTF-8");
  }

  /**
   * Gets the {@link Stat} of the node. Will create and destroy connection if
   * the AvatarZooKeeperClient is not configured to set watchers
   *
   * @param node
   *          the path of zNode to get {@link Stat} for
   * @return {@link Stat} of the node; the value returned by
   *         {@code ZooKeeper.exists}, which may be null
   * @throws IOException
   * @throws KeeperException
   * @throws InterruptedException
   */
  private synchronized Stat getNodeStats(String node) throws IOException,
      KeeperException, InterruptedException {
    int failures = 0;
    try {
      while (true) {
        initZK();
        try {
          Stat res = zk.exists(node, watch);
          if (!watch) {
            stopZK();
          }
          // res may legitimately be null, so success is signalled by
          // returning rather than by inspecting the value
          return res;
        } catch (KeeperException kex) {
          if (KeeperException.Code.CONNECTIONLOSS == kex.code()
              && failures < ZK_CONNECTION_RETRIES) {
            failures++;
            continue;
          }
          throw kex;
        }
      }
    } finally {
      if (!watch) {
        // No-op on the success path; closes the handle on failure paths
        closeQuietly();
      }
    }
  }

  /**
   * Returns the modification time of the registration znode of the given
   * filesystem address.
   *
   * @param address URI whose authority identifies the filesystem
   * @return the mtime recorded in the znode's {@link Stat}
   * @throws IOException if the registration znode does not exist
   * @throws KeeperException
   * @throws InterruptedException
   */
  public long getPrimaryRegistrationTime(URI address) throws IOException,
      KeeperException, InterruptedException {
    String node = getRegistrationNode(address.getAuthority());
    Stat stat = getNodeStats(node);
    if (stat == null) {
      throw new IOException("Registration zNode " + node + " does not exist");
    }
    return stat.getMtime();
  }

  /*
   * ZNode address is formed by the logical name of the filesystem:
   * dfs.data.xxx.com:9000 will be represented by zNode
   * /prefix/dfs.data.xxx.com/9000 in ZooKeeper
   */
  private String getRegistrationNode(String clusterAddress) {
    return prefix + "/" + clusterAddress.replace(':', '/');
  }

  /**
   * Closes the ZooKeeper connection if one is open.
   *
   * @throws InterruptedException
   */
  public synchronized void shutdown() throws InterruptedException {
    stopZK();
  }
}