/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.zookeeper.server.upgrade; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import org.apache.jute.InputArchive; import org.apache.jute.OutputArchive; import org.apache.jute.Record; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.Watcher; import org.apache.zookeeper.KeeperException.Code; import org.apache.zookeeper.KeeperException.NoNodeException; import org.apache.zookeeper.Watcher.Event; import org.apache.zookeeper.Watcher.Event.EventType; import org.apache.zookeeper.ZooDefs.OpCode; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.data.StatPersistedV1; import org.apache.zookeeper.server.WatchManager; import org.apache.zookeeper.txn.CreateTxn; import org.apache.zookeeper.txn.DeleteTxn; import org.apache.zookeeper.txn.ErrorTxn; import org.apache.zookeeper.txn.SetACLTxn; import org.apache.zookeeper.txn.SetDataTxn; import org.apache.zookeeper.txn.TxnHeader; /** * This class maintains the tree data structure. It doesn't have any networking * or client connection code in it so that it can be tested in a stand alone * way. * <p> * The tree maintains two parallel data structures: a hashtable that maps from * full paths to DataNodes and a tree of DataNodes. All accesses to a path is * through the hashtable. The tree is traversed only when serializing to disk. */ public class DataTreeV1 { private static final Logger LOG = LoggerFactory.getLogger(DataTreeV1.class); /** * This hashtable provides a fast lookup to the datanodes. The tree is the * source of truth and is where all the locking occurs */ private ConcurrentHashMap<String, DataNodeV1> nodes = new ConcurrentHashMap<String, DataNodeV1>(); private WatchManager dataWatches = new WatchManager(); private WatchManager childWatches = new WatchManager(); /** * This hashtable lists the paths of the ephemeral nodes of a session. */ private Map<Long, HashSet<String>> ephemerals = new ConcurrentHashMap<Long, HashSet<String>>(); /** * return the ephemerals for this tree * @return the ephemerals for this tree */ public Map<Long, HashSet<String>> getEphemeralsMap() { return this.ephemerals; } public void setEphemeralsMap(Map<Long, HashSet<String>> ephemerals) { this.ephemerals = ephemerals; } @SuppressWarnings("unchecked") public HashSet<String> getEphemerals(long sessionId) { HashSet<String> retv = ephemerals.get(sessionId); if (retv == null) { return new HashSet<String>(); } HashSet<String> cloned = null; synchronized(retv) { cloned = (HashSet<String>) retv.clone(); } return cloned; } public Collection<Long> getSessions() { return ephemerals.keySet(); } public DataNodeV1 getNode(String path) { return nodes.get(path); } /** * This is a pointer to the root of the DataTree. It is the source of truth, * but we usually use the nodes hashmap to find nodes in the tree. */ private DataNodeV1 root = new DataNodeV1(null, new byte[0], null, new StatPersistedV1()); public DataTreeV1() { /* Rather than fight it, let root have an alias */ nodes.put("", root); nodes.put("/", root); } static public void copyStatPersisted(StatPersistedV1 from, StatPersistedV1 to) { to.setAversion(from.getAversion()); to.setCtime(from.getCtime()); to.setCversion(from.getCversion()); to.setCzxid(from.getCzxid()); to.setMtime(from.getMtime()); to.setMzxid(from.getMzxid()); to.setVersion(from.getVersion()); to.setEphemeralOwner(from.getEphemeralOwner()); } static public void copyStat(Stat from, Stat to) { to.setAversion(from.getAversion()); to.setCtime(from.getCtime()); to.setCversion(from.getCversion()); to.setCzxid(from.getCzxid()); to.setMtime(from.getMtime()); to.setMzxid(from.getMzxid()); to.setVersion(from.getVersion()); to.setEphemeralOwner(from.getEphemeralOwner()); to.setDataLength(from.getDataLength()); to.setNumChildren(from.getNumChildren()); } // public void remooveInterest(String path, Watcher nw) { // DataNode n = nodes.get(path); // if (n == null) { // synchronized (nonExistentWatches) { // HashSet<Watcher> list = nonExistentWatches.get(path); // if (list != null) { // list.remove(nw); // } // } // } // synchronized (n) { // n.dataWatchers.remove(nw); // n.childWatchers.remove(nw); // } // } /** * @param path * @param data * @param acl * @param ephemeralOwner * the session id that owns this node. -1 indicates this is * not an ephemeral node. * @param zxid * @param time * @return the patch of the created node * @throws KeeperException */ public String createNode(String path, byte data[], List<ACL> acl, long ephemeralOwner, long zxid, long time) throws KeeperException.NoNodeException, KeeperException.NodeExistsException { int lastSlash = path.lastIndexOf('/'); String parentName = path.substring(0, lastSlash); String childName = path.substring(lastSlash + 1); StatPersistedV1 stat = new StatPersistedV1(); stat.setCtime(time); stat.setMtime(time); stat.setCzxid(zxid); stat.setMzxid(zxid); stat.setVersion(0); stat.setAversion(0); stat.setEphemeralOwner(ephemeralOwner); DataNodeV1 parent = nodes.get(parentName); if (parent == null) { throw new KeeperException.NoNodeException(); } synchronized (parent) { if (parent.children.contains(childName)) { throw new KeeperException.NodeExistsException(); } int cver = parent.stat.getCversion(); cver++; parent.stat.setCversion(cver); DataNodeV1 child = new DataNodeV1(parent, data, acl, stat); parent.children.add(childName); nodes.put(path, child); if (ephemeralOwner != 0) { HashSet<String> list = ephemerals.get(ephemeralOwner); if (list == null) { list = new HashSet<String>(); ephemerals.put(ephemeralOwner, list); } synchronized(list) { list.add(path); } } } dataWatches.triggerWatch(path, Event.EventType.NodeCreated); childWatches.triggerWatch(parentName.equals("")?"/":parentName, Event.EventType.NodeChildrenChanged); return path; } public void deleteNode(String path) throws KeeperException.NoNodeException { int lastSlash = path.lastIndexOf('/'); String parentName = path.substring(0, lastSlash); String childName = path.substring(lastSlash + 1); DataNodeV1 node = nodes.get(path); if (node == null) { throw new KeeperException.NoNodeException(); } nodes.remove(path); DataNodeV1 parent = nodes.get(parentName); if (parent == null) { throw new KeeperException.NoNodeException(); } synchronized (parent) { parent.children.remove(childName); parent.stat.setCversion(parent.stat.getCversion() + 1); long eowner = node.stat.getEphemeralOwner(); if (eowner != 0) { HashSet<String> nodes = ephemerals.get(eowner); if (nodes != null) { synchronized(nodes) { nodes.remove(path); } } } node.parent = null; } Set<Watcher> processed = dataWatches.triggerWatch(path, EventType.NodeDeleted); childWatches.triggerWatch(path, EventType.NodeDeleted, processed); childWatches.triggerWatch(parentName.equals("")?"/":parentName, EventType.NodeChildrenChanged); } public Stat setData(String path, byte data[], int version, long zxid, long time) throws KeeperException.NoNodeException { Stat s = new Stat(); DataNodeV1 n = nodes.get(path); if (n == null) { throw new KeeperException.NoNodeException(); } synchronized (n) { n.data = data; n.stat.setMtime(time); n.stat.setMzxid(zxid); n.stat.setVersion(version); n.copyStat(s); } dataWatches.triggerWatch(path, EventType.NodeDataChanged); return s; } public byte[] getData(String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException { DataNodeV1 n = nodes.get(path); if (n == null) { throw new KeeperException.NoNodeException(); } synchronized (n) { n.copyStat(stat); if (watcher != null) { dataWatches.addWatch(path, watcher); } return n.data; } } public Stat statNode(String path, Watcher watcher) throws KeeperException.NoNodeException { Stat stat = new Stat(); DataNodeV1 n = nodes.get(path); if (watcher != null) { dataWatches.addWatch(path, watcher); } if (n == null) { throw new KeeperException.NoNodeException(); } synchronized (n) { n.copyStat(stat); return stat; } } public ArrayList<String> getChildren(String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException { DataNodeV1 n = nodes.get(path); if (n == null) { throw new KeeperException.NoNodeException(); } synchronized (n) { ArrayList<String> children = new ArrayList<String>(); children.addAll(n.children); if (watcher != null) { childWatches.addWatch(path, watcher); } return children; } } public Stat setACL(String path, List<ACL> acl, int version) throws KeeperException.NoNodeException { Stat stat = new Stat(); DataNodeV1 n = nodes.get(path); if (n == null) { throw new KeeperException.NoNodeException(); } synchronized (n) { n.stat.setAversion(version); n.acl = acl; n.copyStat(stat); return stat; } } @SuppressWarnings("unchecked") public List<ACL> getACL(String path, Stat stat) throws KeeperException.NoNodeException { DataNodeV1 n = nodes.get(path); if (n == null) { throw new KeeperException.NoNodeException(); } synchronized (n) { n.copyStat(stat); return new ArrayList<ACL>(n.acl); } } static public class ProcessTxnResult { public long clientId; public int cxid; public long zxid; public int err; public int type; public String path; public Stat stat; /** * Equality is defined as the clientId and the cxid being the same. This * allows us to use hash tables to track completion of transactions. * * @see java.lang.Object#equals(java.lang.Object) */ @Override public boolean equals(Object o) { if (o instanceof ProcessTxnResult) { ProcessTxnResult other = (ProcessTxnResult) o; return other.clientId == clientId && other.cxid == cxid; } return false; } /** * See equals() to find the rational for how this hashcode is generated. * * @see ProcessTxnResult#equals(Object) * @see java.lang.Object#hashCode() */ @Override public int hashCode() { return (int) ((clientId ^ cxid) % Integer.MAX_VALUE); } } public volatile long lastProcessedZxid = 0; @SuppressWarnings("unchecked") public ProcessTxnResult processTxn(TxnHeader header, Record txn) { ProcessTxnResult rc = new ProcessTxnResult(); String debug = ""; try { rc.clientId = header.getClientId(); rc.cxid = header.getCxid(); rc.zxid = header.getZxid(); rc.type = header.getType(); rc.err = 0; if (rc.zxid > lastProcessedZxid) { lastProcessedZxid = rc.zxid; } switch (header.getType()) { case OpCode.create: CreateTxn createTxn = (CreateTxn) txn; debug = "Create transaction for " + createTxn.getPath(); createNode(createTxn.getPath(), createTxn.getData(), createTxn .getAcl(), createTxn.getEphemeral() ? header .getClientId() : 0, header.getZxid(), header.getTime()); rc.path = createTxn.getPath(); break; case OpCode.delete: DeleteTxn deleteTxn = (DeleteTxn) txn; debug = "Delete transaction for " + deleteTxn.getPath(); deleteNode(deleteTxn.getPath()); break; case OpCode.setData: SetDataTxn setDataTxn = (SetDataTxn) txn; debug = "Set data for transaction for " + setDataTxn.getPath(); rc.stat = setData(setDataTxn.getPath(), setDataTxn.getData(), setDataTxn.getVersion(), header.getZxid(), header .getTime()); break; case OpCode.setACL: SetACLTxn setACLTxn = (SetACLTxn) txn; debug = "Set ACL for transaction for " + setACLTxn.getPath(); rc.stat = setACL(setACLTxn.getPath(), setACLTxn.getAcl(), setACLTxn.getVersion()); break; case OpCode.closeSession: killSession(header.getClientId()); break; case OpCode.error: ErrorTxn errTxn = (ErrorTxn) txn; rc.err = errTxn.getErr(); break; } } catch (KeeperException e) { // These are expected errors since we take a lazy snapshot if (initialized || (e.code() != Code.NONODE && e.code() != Code.NODEEXISTS)) { LOG.warn("Failed:" + debug, e); } } return rc; } void killSession(long session) { // the list is already removed from the ephemerals // so we do not have to worry about synchronyzing on // the list. This is only called from FinalRequestProcessor // so there is no need for synchornization. The list is not // changed here. Only create and delete change the list which // are again called from FinalRequestProcessor in sequence. HashSet<String> list = ephemerals.remove(session); if (list != null) { for (String path : list) { try { deleteNode(path); if (LOG.isDebugEnabled()) { LOG.debug("Deleting ephemeral node " + path + " for session 0x" + Long.toHexString(session)); } } catch (NoNodeException e) { LOG.warn("Ignoring NoNodeException for path " + path + " while removing ephemeral for dead session 0x" + Long.toHexString(session)); } } } } /** * this method uses a stringbuilder to create a new * path for children. This is faster than string * appends ( str1 + str2). * @param oa OutputArchive to write to. * @param path a string builder. * @throws IOException * @throws InterruptedException */ void serializeNode(OutputArchive oa, StringBuilder path) throws IOException, InterruptedException { String pathString = path.toString(); DataNodeV1 node = getNode(pathString); if (node == null) { return; } String children[] = null; synchronized (node) { scount++; oa.writeString(pathString, "path"); oa.writeRecord(node, "node"); children = node.children.toArray(new String[node.children.size()]); } path.append('/'); int off = path.length(); if (children != null) { for (String child : children) { //since this is single buffer being resused // we need // to truncate the previous bytes of string. path.delete(off, Integer.MAX_VALUE); path.append(child); serializeNode(oa, path); } } } int scount; public boolean initialized = false; public void serialize(OutputArchive oa, String tag) throws IOException, InterruptedException { scount = 0; serializeNode(oa, new StringBuilder("")); // / marks end of stream // we need to check if clear had been called in between the snapshot. if (root != null) { oa.writeString("/", "path"); } } public void deserialize(InputArchive ia, String tag) throws IOException { nodes.clear(); String path = ia.readString("path"); while (!path.equals("/")) { DataNodeV1 node = new DataNodeV1(); ia.readRecord(node, "node"); nodes.put(path, node); int lastSlash = path.lastIndexOf('/'); if (lastSlash == -1) { root = node; } else { String parentPath = path.substring(0, lastSlash); node.parent = nodes.get(parentPath); node.parent.children.add(path.substring(lastSlash + 1)); long eowner = node.stat.getEphemeralOwner(); if (eowner != 0) { HashSet<String> list = ephemerals.get(eowner); if (list == null) { list = new HashSet<String>(); ephemerals.put(eowner, list); } list.add(path); } } path = ia.readString("path"); } nodes.put("/", root); } public String dumpEphemerals() { Set<Long> keys = ephemerals.keySet(); StringBuilder sb = new StringBuilder("Sessions with Ephemerals (" + keys.size() + "):\n"); for (long k : keys) { sb.append("0x" + Long.toHexString(k)); sb.append(":\n"); HashSet<String> tmp = ephemerals.get(k); synchronized(tmp) { for (String path : tmp) { sb.append("\t" + path + "\n"); } } } return sb.toString(); } public void removeCnxn(Watcher watcher) { dataWatches.removeWatcher(watcher); childWatches.removeWatcher(watcher); } public void clear() { root = null; nodes.clear(); ephemerals.clear(); // dataWatches = null; // childWatches = null; } }