package org.apache.hadoop.hdfs;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.server.namenode.ZookeeperTxId;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.hadoop.util.SerializableUtils;
import org.apache.zookeeper.AsyncCallback;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooDefs.Perms;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.ACL;
import org.apache.zookeeper.data.Id;
import org.apache.zookeeper.data.Stat;
public class AvatarZooKeeperClient {
// Logger for this client (also used by the nested SyncUtil helper).
private static Log LOG = LogFactory.getLog(AvatarZooKeeperClient.class.getName());
// Connect string handed to the ZooKeeper constructor ("fs.ha.zookeeper.quorum").
private String connection;
// Value of "fs.ha.zookeeper.timeout"; passed as the second argument of the
// ZooKeeper constructor.
private int timeout;
// Maximum time, in milliseconds, initZK() waits for the connected notification.
private int connectTimeout;
// Watch flag passed to getData()/exists(); forced to false when no watcher
// was supplied to the constructor.
private boolean watch;
// Prefix under which the data for this client will be stored
private String prefix;
// The directory under which the session id for the current avatar is stored.
private static final String ssid = "ssid";
// The directory under which the last transaction id for the primary avatar is
// stored.
private static final String txid = "txid";
// ProxyWatcher wrapping the caller's watcher; also used as the monitor on
// which initZK() waits for the connection to come up.
private Watcher watcher;
// Current ZooKeeper handle; null after stopZK() has closed the connection.
private ZooKeeper zk;
// Sleep, in milliseconds, between reads while a znode still holds no data.
private final int failoverCheckPeriod;
// When true, the connection is closed after every read/write operation.
private final boolean closeConnOnEachOp;
// Making it large enough to be sure that the cluster is down
// these retries go one after another so they do not take long
public static final int ZK_CONNECTION_RETRIES = 10;
// Number of attempts made to construct the ZooKeeper handle in initZK().
private static final int ZK_INIT_RETRIES = 5;
public static final int ZK_CONNECT_TIMEOUT_DEFAULT = 10000; // 10 seconds
/**
 * Creates a client that closes its ZooKeeper connection after every
 * operation.
 *
 * @param conf
 *          configuration holding the {@code fs.ha.zookeeper.*} settings
 * @param watcher
 *          watcher to wrap, may be {@code null}
 */
public AvatarZooKeeperClient(Configuration conf, Watcher watcher) {
  this(conf, watcher, /* closeConnOnEachOp */ true);
}
public AvatarZooKeeperClient(Configuration conf, Watcher watcher,
boolean closeConnOnEachOp) {
this.connection = conf.get("fs.ha.zookeeper.quorum");
this.timeout = conf.getInt("fs.ha.zookeeper.timeout", 3000);
this.connectTimeout = conf.getInt("fs.ha.zookeeper.connect.timeout",
ZK_CONNECT_TIMEOUT_DEFAULT);
this.watch = conf.getBoolean("fs.ha.zookeeper.watch", false);
this.prefix = conf.get("fs.ha.zookeeper.prefix", "/hdfs");
this.watcher = new ProxyWatcher(watcher);
if (watcher == null) {
// If there was no watcher regardless of the watch policy in the conf
// set it to false. Since there is no watcher being set
watch = false;
}
this.failoverCheckPeriod = conf.getInt("fs.avatar.failover.checkperiod",
FailoverClientHandler.FAILOVER_CHECK_PERIOD);
this.closeConnOnEachOp = closeConnOnEachOp;
}
private static class ProxyWatcher implements Watcher {
private Watcher impl;
ProxyWatcher(Watcher impl) {
this.impl = impl;
}
public void process(WatchedEvent event) {
if (event.getType() == Event.EventType.None
&& event.getState() == Event.KeeperState.SyncConnected) {
// The ZooKeeper client is connected
synchronized (this) {
this.notifyAll();
}
}
}
}
/**
 * Overwrites the registration znode of the given cluster with a
 * {@code null} payload.
 *
 * @param address
 *          the address of the cluster, used to build the znode path
 * @throws IOException
 *           if the write fails
 */
public synchronized void clearPrimary(String address) throws IOException {
  zkCreateRecursively(getRegistrationNode(address), null, true, null);
}
/**
 * Stores the session id of the primary avatarnode in the cluster's ssid
 * znode, overwriting any previous value.
 *
 * @param address
 *          the address of the cluster, used to create the path name for the
 *          znode
 * @param ssid
 *          the session id of the primary avatarnode
 * @throws IOException
 *           if the write fails
 */
public synchronized void registerPrimarySsId(String address, Long ssid)
    throws IOException {
  final String ssIdPath = getSsIdNode(address);
  final byte[] serialized = SerializableUtils.toBytes(ssid);
  final String printable = ssid.toString();
  zkCreateRecursively(ssIdPath, serialized, true, printable);
}
/**
 * Stores {@code lastTxid} in the cluster's txid znode, overwriting any
 * previous value.
 *
 * @param address
 *          the address of the cluster, used to create the path name for the
 *          znode
 * @param lastTxid
 *          the record to serialize into the znode
 * @throws IOException
 *           if the write fails
 */
public synchronized void registerLastTxId(String address,
    ZookeeperTxId lastTxid)
    throws IOException {
  final String txIdPath = getLastTxIdNode(address);
  final byte[] serialized = lastTxid.toBytes();
  final String printable = lastTxid.toString();
  zkCreateRecursively(txIdPath, serialized, true, printable);
}
/**
 * Writes {@code realAddress}, UTF-8 encoded, into the registration znode of
 * the cluster identified by {@code address}.
 *
 * @param address
 *          the address of the cluster, used to build the znode path
 * @param realAddress
 *          the value to store in the znode
 * @param overwrite
 *          whether an already existing registration znode may be replaced
 * @throws UnsupportedEncodingException
 *           if UTF-8 is not supported
 * @throws IOException
 *           if the write fails
 */
public synchronized void registerPrimary(String address, String realAddress,
    boolean overwrite)
    throws UnsupportedEncodingException, IOException {
  final String registrationPath = getRegistrationNode(address);
  final byte[] encoded = realAddress.getBytes("UTF-8");
  zkCreateRecursively(registrationPath, encoded, overwrite, realAddress);
}
/**
 * Writes {@code data} to {@code zNode}, creating every missing znode along
 * the path. Path components that already exist are updated in place:
 * ancestors receive an empty payload, the last component receives
 * {@code data}. Operations that fail with a connection loss are retried
 * until they succeed; any other ZooKeeper failure is reported to the caller.
 *
 * @param zNode
 *          absolute path of the znode to write
 * @param data
 *          payload for the last path component, may be {@code null}
 * @param overwrite
 *          if {@code false} and the last path component already exists, a
 *          {@link FileAlreadyExistsException} is thrown instead of updating
 * @param strData
 *          printable form of {@code data}, used for logging only
 * @throws IOException
 *           if the connection cannot be established, the write fails, or
 *           the calling thread is interrupted
 */
private void zkCreateRecursively(String zNode, byte[] data,
    boolean overwrite, String strData) throws IOException {
  try {
    initZK();
  } catch (InterruptedException ie) {
    // Keep the interrupt visible to callers further up the stack.
    Thread.currentThread().interrupt();
    throw new IOException(ie);
  }
  LOG.info("updating (" + zNode + ")-> (" + strData + ")");
  String[] parts = zNode.split("/");
  String path = "";
  byte[] payLoad = new byte[0];
  List<ACL> acls = new ArrayList<ACL>(1);
  acls.add(new ACL(Perms.ALL, new Id("world", "anyone")));
  try {
    for (int i = 0; i < parts.length; i++) {
      if (parts[i].isEmpty()) {
        continue;
      }
      final boolean lastPart = (i == parts.length - 1);
      path += "/" + parts[i];
      if (lastPart) {
        payLoad = data;
      }
      boolean created = false;
      while (!created) {
        // While loop to keep trying through the ConnectionLoss exceptions
        try {
          if (zk.exists(path, false) != null) {
            if (lastPart && !overwrite) {
              throw new FileAlreadyExistsException("ZNode " + path
                  + " already exists.");
            }
            // -1 indicates that we should update zNode regardless of its
            // version since we are not utilizing versions in zNode
            zk.setData(path, payLoad, -1);
          } else {
            zk.create(path, payLoad, acls, CreateMode.PERSISTENT);
          }
          created = true;
        } catch (KeeperException ex) {
          if (KeeperException.Code.CONNECTIONLOSS != ex.code()) {
            throw ex;
          }
          LOG.warn("Connection loss while writing " + path + ", retrying", ex);
        }
      }
    }
    FileSystem.LOG.info("Wrote zNode " + zNode);
  } catch (KeeperException e) {
    // Surface the failure: callers must not believe the znode was written.
    throw new IOException("Failed to write zNode " + zNode, e);
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    throw new IOException("Interrupted while writing zNode " + zNode, e);
  } finally {
    try {
      if (closeConnOnEachOp) {
        stopZK();
      }
    } catch (InterruptedException ex) {
      Thread.currentThread().interrupt();
    }
  }
}
/**
 * Opens a connection to ZooKeeper to verify that the ensemble is reachable
 * and the configuration is usable. The connection is closed again unless
 * watches are enabled for this client.
 *
 * @throws IOException
 *           if the connection cannot be established
 * @throws InterruptedException
 *           if interrupted while connecting or closing
 */
public synchronized void primeConnection() throws IOException,
    InterruptedException {
  initZK();
  if (watch) {
    return;
  }
  stopZK();
}
/**
 * Makes sure {@code zk} refers to a connected ZooKeeper handle. A new handle
 * is created when there is none or the current one is closed; construction
 * is attempted up to {@code ZK_INIT_RETRIES} times, three seconds apart.
 * The method then waits up to {@code connectTimeout} milliseconds for the
 * watcher to signal that the session is connected (a timeout of 0 waits
 * without limit).
 *
 * @throws IOException
 *           if the handle cannot be created or the session does not become
 *           connected in time
 * @throws InterruptedException
 *           if interrupted while sleeping or waiting
 */
private void initZK() throws IOException, InterruptedException {
  synchronized (watcher) {
    for (int attempt = 0; attempt < ZK_INIT_RETRIES; attempt++) {
      try {
        if (zk == null || zk.getState() == ZooKeeper.States.CLOSED) {
          zk = new ZooKeeper(connection, timeout, watcher);
        }
        break;
      } catch (IOException ie) {
        if (attempt == ZK_INIT_RETRIES - 1) {
          throw ie;
        }
        FileSystem.LOG.info("Connection to zookeeper could not be "
            + "established, retrying....", ie);
        Thread.sleep(3000);
      }
    }
    // Object.wait() may return without a notification, so keep waiting
    // until the session is connected or the deadline has really passed.
    final long deadlineNanos = System.nanoTime() + connectTimeout * 1000000L;
    while (zk.getState() != ZooKeeper.States.CONNECTED) {
      if (connectTimeout == 0) {
        watcher.wait();
        continue;
      }
      final long remainingMs = (deadlineNanos - System.nanoTime()) / 1000000L;
      if (remainingMs <= 0) {
        break;
      }
      watcher.wait(remainingMs);
    }
    if (zk.getState() != ZooKeeper.States.CONNECTED) {
      // Close any open connections to avoid fd leak.
      zk.close();
      throw new IOException("Timed out trying to connect to ZooKeeper");
    }
  }
}
/**
 * Closes the current ZooKeeper handle, if any, and forgets it.
 *
 * @throws InterruptedException
 *           if interrupted while closing the handle
 */
private void stopZK() throws InterruptedException {
  if (zk != null) {
    zk.close();
    zk = null;
  }
}
/**
 * Reads the payload of a znode. With {@code retry} set, a {@code null}
 * payload is treated as "failover in progress": the method sleeps for the
 * failover check period and reads again until data shows up. Without
 * {@code retry} the first value read is returned as is. Connection losses
 * are retried up to {@code ZK_CONNECTION_RETRIES} times; the counter is
 * reset whenever a read succeeds.
 *
 * @param node the path of zNode in zookeeper
 * @param stat {@link Stat} object that will contain stats of the node
 * @param retry if true will retry until the data in znode is not null
 * @param sync if true a ZooKeeper sync is issued for the path before reading
 * @return byte[] the data in the znode
 * @throws IOException
 * @throws KeeperException
 * @throws InterruptedException
 */
private synchronized byte[] getNodeData(String node, Stat stat,
    boolean retry, boolean sync)
    throws IOException, KeeperException, InterruptedException {
  int connectionLosses = 0;
  for (;;) {
    initZK();
    try {
      if (sync) {
        new SyncUtil().sync(zk, node);
      }
      final byte[] payload = zk.getData(node, watch, stat);
      if (payload != null || !retry) {
        return payload;
      }
      // Failover is in progress: start counting connection losses afresh
      // and give the new primary time to register.
      connectionLosses = 0;
      DistributedAvatarFileSystem.LOG.info("Failover is in progress. Waiting");
      try {
        Thread.sleep(failoverCheckPeriod);
      } catch (InterruptedException iex) {
        Thread.currentThread().interrupt();
      }
    } catch (KeeperException kex) {
      final boolean retriable =
          KeeperException.Code.CONNECTIONLOSS == kex.code()
              && connectionLosses < ZK_CONNECTION_RETRIES;
      if (!retriable) {
        throw kex;
      }
      // Some ensemble members might be down; try again.
      connectionLosses++;
    } finally {
      if (closeConnOnEachOp) {
        stopZK();
      }
    }
  }
}
/**
 * Reads the session id stored in the cluster's ssid znode.
 *
 * @param address
 *          the address of the cluster
 * @param sync
 *          whether or not to perform a sync before read
 * @return the deserialized session id, or {@code null} if the znode holds no
 *         data
 * @throws IOException
 * @throws KeeperException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Long getPrimarySsId(String address, boolean sync) throws IOException,
    KeeperException, InterruptedException, ClassNotFoundException {
  final Stat nodeStat = new Stat();
  final byte[] raw = getNodeData(getSsIdNode(address), nodeStat, false, sync);
  return raw == null
      ? null
      : (Long) SerializableUtils.getFromBytes(raw, Long.class);
}
/**
 * Reads the record stored in the cluster's txid znode.
 *
 * @param address
 *          the address of the cluster
 * @param sync
 *          whether or not to perform a sync before read
 * @return the deserialized record, or {@code null} if the znode holds no
 *         data
 * @throws IOException
 * @throws KeeperException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public ZookeeperTxId getPrimaryLastTxId(String address, boolean sync)
    throws IOException,
    KeeperException, InterruptedException, ClassNotFoundException {
  final Stat nodeStat = new Stat();
  final byte[] raw =
      getNodeData(getLastTxIdNode(address), nodeStat, false, sync);
  return raw == null ? null : ZookeeperTxId.getFromBytes(raw);
}
/**
 * Reads the cluster's registration znode without issuing a sync first.
 * Equivalent to calling the four-argument overload with {@code sync} set to
 * {@code false}.
 */
public String getPrimaryAvatarAddress(String address, Stat stat, boolean retry)
    throws IOException, KeeperException, InterruptedException {
  final boolean syncBeforeRead = false;
  return getPrimaryAvatarAddress(address, stat, retry, syncBeforeRead);
}
/**
 * Reads the cluster's registration znode and decodes its payload as UTF-8.
 *
 * @param address
 *          the address of the cluster
 * @param stat
 *          {@link Stat} object filled in by the read
 * @param retry
 *          if true, keep reading until the znode holds data
 * @param sync
 *          whether or not to perform a sync before read
 * @return the decoded payload, or {@code null} if the znode holds no data
 */
public String getPrimaryAvatarAddress(String address, Stat stat,
    boolean retry, boolean sync)
    throws IOException, KeeperException, InterruptedException {
  final byte[] raw =
      getNodeData(getRegistrationNode(address), stat, retry, sync);
  return raw == null ? null : new String(raw, "UTF-8");
}
/**
 * Gets the {@link Stat} of the node. The connection is closed again after
 * the lookup when the client is configured to close the connection on each
 * operation. Connection losses are retried up to
 * {@code ZK_CONNECTION_RETRIES} times.
 *
 * @param node
 *          the path of zNode to get {@link Stat} for
 * @return {@link Stat} of the node, or {@code null} if the node does not
 *         exist
 * @throws IOException
 * @throws KeeperException
 * @throws InterruptedException
 */
private synchronized Stat getNodeStats(String node) throws IOException,
    KeeperException, InterruptedException {
  int failures = 0;
  while (true) {
    initZK();
    try {
      // exists() legitimately returns null for a missing node, so the
      // result is returned directly rather than used as a loop condition.
      return zk.exists(node, watch);
    } catch (KeeperException kex) {
      if (KeeperException.Code.CONNECTIONLOSS != kex.code()
          || failures >= ZK_CONNECTION_RETRIES) {
        throw kex;
      }
      failures++;
    } finally {
      // Release the connection like getNodeData() does; leaving it open
      // leaks one connection per lookup.
      if (closeConnOnEachOp) {
        stopZK();
      }
    }
  }
}
/**
 * Returns the modification time of the cluster's registration znode.
 *
 * @param address
 *          URI whose authority identifies the cluster
 * @return the {@code mtime} recorded in the znode's {@link Stat}
 * @throws IOException
 *           if the connection to ZooKeeper cannot be established
 * @throws KeeperException
 *           if the lookup fails; a {@link KeeperException.NoNodeException}
 *           is thrown when the registration znode does not exist
 * @throws InterruptedException
 *           if interrupted while talking to ZooKeeper
 */
public long getPrimaryRegistrationTime(URI address) throws IOException,
    KeeperException, InterruptedException {
  InjectionHandler
      .processEvent(InjectionEvent.AVATARZK_GET_REGISTRATION_TIME);
  String node = getRegistrationNode(address.getAuthority());
  Stat stat = getNodeStats(node);
  if (stat == null) {
    // A missing znode yields a null Stat; report it explicitly instead of
    // failing with a NullPointerException.
    throw new KeeperException.NoNodeException(node);
  }
  return stat.getMtime();
}
/**
 * Computes the registration znode of a cluster. The path is formed from the
 * logical name of the filesystem: dfs.data.xxx.com:9000 is represented by
 * the znode /prefix/dfs.data.xxx.com/9000. The address is lower-cased
 * independently of the JVM's default locale so that every host computes the
 * same path.
 *
 * @param clusterAddress
 *          the address of the cluster
 * @return the registration znode path
 */
private String getRegistrationNode(String clusterAddress) {
  return prefix + "/"
      + clusterAddress.replace(':', '/').toLowerCase(Locale.ROOT);
}
/**
 * Computes the znode for the session id of the primary avatar, the format is
 * /prefix/ssid/dfs.data.xxx.com/9000. The address is lower-cased
 * independently of the JVM's default locale so that every host computes the
 * same path.
 *
 * @param clusterAddress
 *          the address of the cluster
 * @return the znode to store the ssid in the following format :
 *         /prefix/ssid/dfs.data.xxx.com/9000
 */
private String getSsIdNode(String clusterAddress) {
  return prefix + "/" + ssid + "/"
      + clusterAddress.replace(':', '/').toLowerCase(Locale.ROOT);
}
/**
 * Computes the znode for the last transaction id of the primary avatar, the
 * format is /prefix/txid/dfs.data.xxx.com/9000. The address is lower-cased
 * independently of the JVM's default locale so that every host computes the
 * same path.
 *
 * @param clusterAddress
 *          the address of the cluster
 * @return the znode to store the txid in the following format :
 *         /prefix/txid/dfs.data.xxx.com/9000
 */
private String getLastTxIdNode(String clusterAddress) {
  return prefix + "/" + txid + "/"
      + clusterAddress.replace(':', '/').toLowerCase(Locale.ROOT);
}
/**
 * Closes the ZooKeeper connection held by this client, if one is open.
 *
 * @throws InterruptedException
 *           if interrupted while closing the connection
 */
public synchronized void shutdown() throws InterruptedException {
  this.stopZK();
}
/**
 * Helper class for syncing data to ZK. Issues an asynchronous
 * {@code sync} and blocks the caller until the callback fires or
 * {@code MAX_SYNC_WAIT_TIME} elapses.
 */
static class SyncUtil implements AsyncCallback.VoidCallback {
  private static final int MAX_SYNC_WAIT_TIME = 5 * 1000; // 5 sec
  /** Return code delivered by the callback; -1 until a result arrives. */
  volatile int rc = -1;
  /** Set by the callback; guarded by this object's monitor. */
  private boolean completed = false;

  @Override
  public synchronized void processResult(int rc, String path, Object ctx) {
    this.rc = rc;
    this.completed = true;
    this.notifyAll();
  }

  /**
   * Syncs {@code path} and waits for the outcome.
   *
   * @param zk
   *          the handle to issue the sync on
   * @param path
   *          the znode path to sync
   * @return true if the callback reported {@code OK} within the wait time
   * @throws InterruptedException
   *           if interrupted while waiting for the callback
   */
  synchronized boolean sync(ZooKeeper zk, String path)
      throws InterruptedException {
    // Start from a clean slate so an earlier result cannot be mistaken
    // for the outcome of this call.
    completed = false;
    rc = -1;
    zk.sync(path, this, null);
    // Object.wait() may return without a notification; only the flag or
    // the deadline ends the wait.
    final long deadlineNanos =
        System.nanoTime() + MAX_SYNC_WAIT_TIME * 1000000L;
    while (!completed) {
      final long remainingMs = (deadlineNanos - System.nanoTime()) / 1000000L;
      if (remainingMs <= 0) {
        break;
      }
      this.wait(remainingMs);
    }
    if (!completed || rc != KeeperException.Code.OK.intValue()) {
      LOG.info("Cannot sync ZK for path: " + path + " return code: " + rc);
      return false;
    }
    LOG.info("Synced ZK for path: " + path);
    return true;
  }
}
}