/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.emc.storageos.coordinator.client.service;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.utils.ZKPaths;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.data.Stat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.curator.utils.PathUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * <p>
 * A double barrier as described in the ZK recipes. Quoting the recipe:
 * </p>
 *
 * <blockquote>
 * Double barriers enable
 * clients to synchronize the beginning and the end of a computation. When enough processes
 * have joined the barrier, processes start their computation and leave the barrier
 * once they have finished.
 * </blockquote>
 *
 * <p>
 * Blocking is implemented on this instance's monitor: callers park in {@code wait()} and the
 * ZooKeeper watcher wakes them via {@code notifyFromWatcher()}, which raises the "notified"
 * flags before calling {@code notifyAll()}.
 * </p>
 */
public class DistributedDoubleBarrier
{
    private static final Logger logger = LoggerFactory.getLogger(DistributedDoubleBarrier.class);

    /** Name of the child znode whose existence signals that enough members have entered. */
    private static final String READY_NODE = "ready";

    private final CuratorFramework client;
    private final String barrierPath;
    /** Threshold of children under {@link #barrierPath} required to open the barrier. */
    private final int memberQty;
    /** Path of this member's ephemeral znode (barrier path + random UUID). */
    private final String ourPath;
    /** Full path of the {@link #READY_NODE} znode. */
    private final String readyPath;

    /** Raised by the watcher; consumed by the enter-side waits. */
    private final AtomicBoolean hasBeenNotifiedEnter = new AtomicBoolean(false);
    /** Raised by the watcher; consumed by the leave-side timed wait. */
    private final AtomicBoolean hasBeenNotifiedLeave = new AtomicBoolean(false);
    /** True while the most recent watcher event reported a state other than SyncConnected. */
    private final AtomicBoolean connectionLost = new AtomicBoolean(false);

    /** Single watcher used for every watch this barrier registers. */
    private final Watcher watcher = new Watcher()
    {
        @Override
        public void process(WatchedEvent event)
        {
            connectionLost.set(event.getState() != Event.KeeperState.SyncConnected);
            notifyFromWatcher();
        }
    };

    /**
     * Creates the barrier abstraction. <code>memberQty</code> is the number of members in the
     * barrier. When {@link #enter()} is called, it blocks until all members have entered. When
     * {@link #leave()} is called, it blocks until all members have left.
     *
     * @param client the client
     * @param barrierPath path to use
     * @param memberQty the number of members in the barrier. NOTE: more than <code>memberQty</code>
     *                  can enter the barrier. <code>memberQty</code> is a threshold, not a limit
     * @throws IllegalStateException if <code>memberQty</code> is not positive
     */
    public DistributedDoubleBarrier(CuratorFramework client, String barrierPath, int memberQty)
    {
        Preconditions.checkState(memberQty > 0, "memberQty cannot be 0");

        this.client = client;
        this.barrierPath = PathUtils.validatePath(barrierPath);
        this.memberQty = memberQty;
        ourPath = ZKPaths.makePath(barrierPath, UUID.randomUUID().toString());
        readyPath = ZKPaths.makePath(barrierPath, READY_NODE);
    }

    /**
     * Enter the barrier and block until all members have entered
     *
     * @throws Exception interruptions, errors, etc.
     */
    public void enter() throws Exception
    {
        enter(-1, null);
    }

    /**
     * Enter the barrier and block until all members have entered or the timeout has
     * elapsed. Its znode is cleaned up before return in case of timeout.
     *
     * @param maxWait max time to block
     * @param unit time unit; <code>null</code> means wait without a time limit
     * @return true if the entry was successful, false if the timeout elapsed first
     * @throws Exception interruptions, errors, etc. A
     *                   {@link KeeperException.ConnectionLossException} is thrown when the last
     *                   watcher event reported a lost connection
     */
    public boolean enter(long maxWait, TimeUnit unit) throws Exception
    {
        long startMs = System.currentTimeMillis();
        boolean hasMaxWait = (unit != null);
        long maxWaitMs = hasMaxWait ? TimeUnit.MILLISECONDS.convert(maxWait, unit) : Long.MAX_VALUE;

        // The flag must be cleared BEFORE the watch is registered. This method does not hold the
        // monitor, so the watcher may fire at any point after checkExists(); clearing the flag
        // later (e.g. inside internalEnter) would erase that notification and leave us waiting
        // for an event that has already been delivered.
        hasBeenNotifiedEnter.set(false);
        boolean readyPathExists = (client.checkExists().usingWatcher(watcher).forPath(readyPath) != null);

        try
        {
            // COPRHD FIX - if current node has been created, just go ahead
            client.create().creatingParentContainersIfNeeded().withMode(CreateMode.EPHEMERAL).forPath(ourPath);
        }
        catch ( KeeperException.NodeExistsException ignore )
        {
            // ignore if current node has been there
        }

        boolean result = (readyPathExists || internalEnter(startMs, hasMaxWait, maxWaitMs));
        if ( connectionLost.get() )
        {
            throw new KeeperException.ConnectionLossException();
        }

        return result;
    }

    /**
     * Leave the barrier and block until all members have left
     *
     * @throws Exception interruptions, errors, etc.
     */
    public synchronized void leave() throws Exception
    {
        leave(-1, null);
    }

    /**
     * Leave the barrier and block until all members have left or the timeout has
     * elapsed.
     *
     * If timeout happens for one member, all other members are going to leave with timeout
     * and the return values are false for all members. Znode is left over under the barrier
     * until the zookeeper client session terminates.
     *
     * @param maxWait max time to block
     * @param unit time unit; <code>null</code> means wait without a time limit
     * @return true if leaving was successful, false if the timeout elapsed first
     * @throws Exception interruptions, errors, etc.
     */
    public synchronized boolean leave(long maxWait, TimeUnit unit) throws Exception
    {
        long startMs = System.currentTimeMillis();
        boolean hasMaxWait = (unit != null);
        long maxWaitMs = hasMaxWait ? TimeUnit.MILLISECONDS.convert(maxWait, unit) : Long.MAX_VALUE;

        return internalLeave(startMs, hasMaxWait, maxWaitMs);
    }

    /**
     * Lists the children of the barrier path that are counted against <code>memberQty</code>
     * when entering. The list is returned unfiltered.
     *
     * @return names of the children currently under the barrier path
     * @throws Exception ZooKeeper errors
     */
    @VisibleForTesting
    protected List<String> getChildrenForEntering() throws Exception
    {
        return client.getChildren().forPath(barrierPath);
    }

    /**
     * Drops the {@link #READY_NODE} entry from the given child names and sorts the remainder.
     *
     * @param children raw child names of the barrier path
     * @return a new, sorted list without the ready node; never <code>null</code>
     */
    private List<String> filterAndSortChildren(List<String> children)
    {
        Iterable<String> filtered = Iterables.filter
        (
            children,
            new Predicate<String>()
            {
                @Override
                public boolean apply(String name)
                {
                    return !name.equals(READY_NODE);
                }
            }
        );

        ArrayList<String> filteredList = Lists.newArrayList(filtered);
        Collections.sort(filteredList);
        return filteredList;
    }

    /**
     * Leave loop. Must be called with this instance's monitor held (it calls {@code wait()}).
     * Each iteration re-reads the member list, so a wakeup for any reason simply causes the
     * state to be re-evaluated.
     *
     * @param startMs time at which the public leave call started
     * @param hasMaxWait whether a time limit applies
     * @param maxWaitMs time limit in milliseconds (ignored unless <code>hasMaxWait</code>)
     * @return true if the barrier was left normally, false if the timed wait expired
     * @throws Exception interruptions, ZooKeeper errors, or {@link IllegalStateException} when
     *                   our znode is unexpectedly missing
     */
    private boolean internalLeave(long startMs, boolean hasMaxWait, long maxWaitMs) throws Exception
    {
        logger.trace(">>> internalLeave {}", ourPath);
        String ourPathName = ZKPaths.getNodeFromPath(ourPath);
        boolean ourNodeShouldExist = true;
        boolean result = true;
        hasBeenNotifiedLeave.set(false);

        for ( ;; )
        {
            if ( connectionLost.get() )
            {
                throw new KeeperException.ConnectionLossException();
            }

            List<String> children;
            try
            {
                children = client.getChildren().forPath(barrierPath);
            }
            catch ( KeeperException.NoNodeException dummy )
            {
                children = Lists.newArrayList();
            }
            children = filterAndSortChildren(children);
            if ( children.isEmpty() )
            {
                // nobody is left under the barrier
                break;
            }

            int ourIndex = children.indexOf(ourPathName);
            if ( (ourIndex < 0) && ourNodeShouldExist )
            {
                if ( connectionLost.get() )
                {
                    break; // connection was lost but we've reconnected. However, our ephemeral node is gone
                }
                else
                {
                    throw new IllegalStateException(String.format("Our path (%s) is missing", ourPathName));
                }
            }

            logger.trace("children:{}", children);
            if ( children.size() == 1 )
            {
                if ( ourNodeShouldExist && !children.get(0).equals(ourPathName) )
                {
                    throw new IllegalStateException(String.format("Last path (%s) is not ours (%s)", children.get(0), ourPathName));
                }
                checkDeleteOurPath(ourNodeShouldExist);
                break;
            }

            String watchPath; // Watch somebody else that still exists
            if ( ourIndex == 0 )
            {
                // we sort first: keep our node and watch the highest-sorted member
                watchPath = ZKPaths.makePath(barrierPath, children.get(children.size() - 1));
            }
            else
            {
                // otherwise remove our node (once) and watch the lowest-sorted member
                watchPath = ZKPaths.makePath(barrierPath, children.get(0));
                checkDeleteOurPath(ourNodeShouldExist);
                ourNodeShouldExist = false;
            }

            Stat stat = client.checkExists().usingWatcher(watcher).forPath(watchPath);
            if ( stat != null )
            {
                if ( hasMaxWait )
                {
                    // COPRHD FIX - reset the notified flag before waiting again.
                    // BUGFIX - If it has been notified once before, timedWait() doesn't sleep anymore and return value is always true even
                    // it fails with timeout
                    hasBeenNotifiedLeave.set(false);
                    result = timedWait(startMs, maxWaitMs, hasBeenNotifiedLeave);
                    if ( !result )
                    {
                        // COPRHD FIX - if it is leaving due to time out, keep our node there so that other watchers can notice
                        // this unsuccessful leave. In this case, leave() on other nodes should time out and return false as well
                        if ( !ourNodeShouldExist && ourIndex > 0 )
                        {
                            try
                            {
                                client.create().creatingParentContainersIfNeeded().withMode(CreateMode.EPHEMERAL).forPath(ourPath);
                            }
                            catch ( KeeperException.NodeExistsException ignore )
                            {
                                // ignore if current node has been there
                            }
                        }
                        break;
                    }
                }
                else
                {
                    // No flag check needed here: the enclosing loop re-reads the children after
                    // every wakeup, whatever its cause.
                    wait();
                }
            }
        }

        try
        {
            client.delete().forPath(readyPath);
        }
        catch ( KeeperException.NoNodeException ignore )
        {
            // ignore
        }

        return result;
    }

    /**
     * Deletes this member's znode when the caller believes it still exists.
     *
     * @param shouldExist whether our znode is expected to be present
     * @throws Exception ZooKeeper errors from the delete
     */
    private void checkDeleteOurPath(boolean shouldExist) throws Exception
    {
        if ( shouldExist )
        {
            client.delete().forPath(ourPath);
        }
    }

    /**
     * Enter-side wait. If enough children exist the ready node is created and the method returns
     * immediately; otherwise it blocks until the watcher reports an event or the time limit is
     * reached. The "notified" flag is cleared by {@link #enter(long, TimeUnit)} before the watch
     * is registered, so it is intentionally NOT cleared here.
     *
     * @param startMs time at which the public enter call started
     * @param hasMaxWait whether a time limit applies
     * @param maxWaitMs time limit in milliseconds (ignored unless <code>hasMaxWait</code>)
     * @return true if the barrier was entered, false if the timed wait expired (our znode is
     *         deleted in that case)
     * @throws Exception interruptions, ZooKeeper errors
     */
    private synchronized boolean internalEnter(long startMs, boolean hasMaxWait, long maxWaitMs) throws Exception
    {
        boolean result = true;

        List<String> children = getChildrenForEntering();
        int count = (children != null) ? children.size() : 0;
        if ( count >= memberQty )
        {
            try
            {
                client.create().forPath(readyPath);
            }
            catch ( KeeperException.NodeExistsException ignore )
            {
                // ignore
            }
        }
        else if ( hasMaxWait )
        {
            result = timedWait(startMs, maxWaitMs, hasBeenNotifiedEnter);
            // COPRHD FIX - if we fail to enter due to timeout, we should remove our node. Otherwise another member
            // may count the leftover and enter into barrier wrongly
            if ( !result )
            {
                checkDeleteOurPath(true);
            }
        }
        else
        {
            // Guard against spurious wakeups: only a watcher notification may release us.
            while ( !hasBeenNotifiedEnter.get() )
            {
                wait();
            }
        }

        return result;
    }

    /**
     * Waits on this instance's monitor until <code>notified</code> becomes true or
     * <code>maxWaitMs</code> milliseconds have passed since <code>startMs</code>. The caller
     * must hold the monitor.
     *
     * @param startMs reference start time in milliseconds
     * @param maxWaitMs maximum total wait in milliseconds, measured from <code>startMs</code>
     * @param notified flag raised by the watcher
     * @return the value of <code>notified</code> when the wait ends
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    private boolean timedWait(long startMs, long maxWaitMs, AtomicBoolean notified) throws InterruptedException
    {
        long elapsed = System.currentTimeMillis() - startMs;
        while ( !notified.get() && elapsed < maxWaitMs )
        {
            // the loop condition guarantees a strictly positive timeout, so this never
            // degenerates into wait(0) (which would block indefinitely)
            wait(maxWaitMs - elapsed);
            elapsed = System.currentTimeMillis() - startMs;
            logger.trace("max:{} elapsed:{}", maxWaitMs, elapsed);
        }
        return notified.get();
    }

    /**
     * Called from the watcher: raises both "notified" flags and wakes every thread waiting on
     * this instance.
     */
    private synchronized void notifyFromWatcher()
    {
        hasBeenNotifiedEnter.set(true);
        hasBeenNotifiedLeave.set(true);
        notifyAll();
    }
}