// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.zookeeper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.collect.Ordering;
import com.twitter.common.zookeeper.Group.GroupChangeListener;
import com.twitter.common.zookeeper.Group.JoinException;
import com.twitter.common.zookeeper.Group.Membership;
import com.twitter.common.zookeeper.Group.UpdateException;
import com.twitter.common.zookeeper.Group.WatchException;
import org.apache.zookeeper.data.ACL;
import javax.annotation.Nullable;
import java.util.List;
import java.util.logging.Logger;
/**
* A distributed mechanism for eventually arriving at an evenly partitioned space of long values.
* A typical usage would have a client on each of several hosts joining a logical partition (a
* "partition group") that represents some shared work. Clients could then process a subset of a
* full body of work by testing any given item of work with their partition filter.
*
* <p>Note that clients must be able to tolerate periods of duplicate processing by more than 1
* partition as explained in {@link #join()}.
*
* @author John Sirois
*/
public class Partitioner {

  private static final Logger LOG = Logger.getLogger(Partitioner.class.getName());

  // Current view of the partition group: written by the listener built in
  // createGroupChangeListener and read by the partitions handed out by join(). Both fields are
  // volatile so that readers observe the most recently written values.
  private volatile int groupSize;
  private volatile int groupIndex;

  private final Group group;

  /**
   * Constructs a representation of a partition group but does not join it.  Note that the partition
   * group path will be created as a persistent zookeeper path if it does not already exist.
   *
   * @param zkClient a client to use for joining the partition group and watching its membership
   * @param acl the acl for this partition group
   * @param path a zookeeper path that represents the partition group
   */
  public Partitioner(ZooKeeperClient zkClient, List<ACL> acl, String path) {
    group = new Group(zkClient, acl, path);
  }

  /**
   * Returns the group size recorded by the most recent group change notification, or 0 if no
   * notification has been recorded yet.
   */
  @VisibleForTesting
  int getGroupSize() {
    return groupSize;
  }

  /**
   * Decides whether the slot at {@code index} in a group of {@code size} members owns
   * {@code value}.
   *
   * <p>The remainder is normalized into {@code [0, size)} because Java's {@code %} yields a
   * negative result for a negative dividend; without the normalization negative values would
   * never match a non-negative index and so would be owned by no member at all.
   *
   * @param value the value to test
   * @param size the number of members in the group
   * @param index the zero-based position of the member within the group, or a negative number if
   *     the member is not part of the group
   * @return {@code true} if the slot owns the value; always {@code false} when {@code size} is
   *     not positive or {@code index} is negative
   */
  private static boolean ownsValue(long value, int size, int index) {
    if (size <= 0 || index < 0) {
      // Empty or unknown group view, or a member that is absent from the view: claim nothing
      // instead of dividing by zero or matching a negative remainder by accident.
      return false;
    }
    long slot = value % size;
    if (slot < 0) {
      slot += size;
    }
    return slot == index;
  }

  /**
   * Represents a slice of a partition group.  The partition is dynamic and will adjust its size as
   * members join and leave its partition group.
   */
  public abstract static class Partition implements Predicate<Long>, Membership {

    /**
     * Returns {@code true} if the given {@code value} is a member of this partition at this time.
     */
    public abstract boolean isMember(long value);

    /**
     * Gets number of members in the group at this time.
     *
     * @return number of members in the ZK group at this time.
     */
    public abstract int getNumPartitions();

    /**
     * Evaluates partition membership based on the given {@code value}'s hash code.  If the value
     * is null it is never a member of a partition.
     */
    boolean isMember(Object value) {
      // The explicit widening cast pins the call to isMember(long).
      return (value != null) && isMember((long) value.hashCode());
    }

    /**
     * Equivalent to {@link #isMember(long)} for all non-null values; however incurs unboxing
     * overhead.  A null input is never a member of a partition.
     */
    @Override
    public boolean apply(@Nullable Long input) {
      // Unbox explicitly: passing the boxed Long would bind to isMember(Object), since overload
      // resolution prefers subtyping over unboxing, and would test the Long's hash code rather
      // than its value.
      return (input != null) && isMember(input.longValue());
    }
  }

  /**
   * Attempts to join the partition group and claim a slice.  When successful, a predicate is
   * returned that can be used to test whether or not an item belongs to this partition.  The
   * predicate is dynamic such that as the group is further partitioned or partitions merge the
   * predicate will claim a narrower or wider swath of the partition space respectively.  Partition
   * creation and merging is not instantaneous and clients should expect independent partitions to
   * claim ownership of some items when partition membership is in flux.  It is only in the steady
   * state that a client should expect independent partitions to divide the partition space evenly
   * and without overlap.
   *
   * <p>The returned partition claims no values while the recorded group size is not positive or
   * while this member is absent from the most recent membership view.
   *
   * <p>TODO(John Sirois): consider adding a version with a global timeout for the join operation.
   *
   * @return the partition representing the slice of the partition group this member can claim
   * @throws JoinException if there was a problem joining the partition group
   * @throws InterruptedException if interrupted while waiting to join the partition group
   */
  public final Partition join() throws JoinException, InterruptedException {
    final Membership membership = group.join();
    try {
      group.watch(createGroupChangeListener(membership));
    } catch (WatchException e) {
      // Without a watch the partition could never track membership changes, so give up the
      // membership that was just acquired and report the failure as a join problem.
      membership.cancel();
      throw new JoinException("Problem establishing watch on group after joining it", e);
    }
    return new Partition() {
      @Override public boolean isMember(long value) {
        // Each volatile field is read exactly once so the guard and the arithmetic in ownsValue
        // operate on the same numbers.
        return ownsValue(value, groupSize, groupIndex);
      }

      @Override public int getNumPartitions() {
        return groupSize;
      }

      // The remaining Membership operations simply delegate to the underlying group membership.

      @Override public String getGroupPath() {
        return membership.getGroupPath();
      }

      @Override public String getMemberId() {
        return membership.getMemberId();
      }

      @Override public String getMemberPath() {
        return membership.getMemberPath();
      }

      @Override public byte[] updateMemberData() throws UpdateException {
        return membership.updateMemberData();
      }

      @Override public void cancel() throws JoinException {
        membership.cancel();
      }
    };
  }

  /**
   * Creates a listener that recomputes this partitioner's group size and the given membership's
   * index within the group each time it is notified of a group change.
   *
   * @param membership the membership whose position within the group should be tracked
   * @return a listener that updates the partitioner's view of the group
   */
  @VisibleForTesting GroupChangeListener createGroupChangeListener(final Membership membership) {
    return new GroupChangeListener() {
      @Override public void onGroupChange(Iterable<String> memberIds) {
        // The index is the member's position in the naturally sorted list of member ids;
        // indexOf yields -1 when the member's own id is not in the list.
        List<String> members = Ordering.natural().sortedCopy(memberIds);
        int newSize = members.size();
        int newIndex = members.indexOf(membership.getMemberId());

        LOG.info(String.format("Rebuilding group %s:%s [%d:%d]->[%d:%d]",
            membership.getGroupPath(), members, groupSize, groupIndex, newSize, newIndex));

        groupSize = newSize;
        groupIndex = newIndex;
      }
    };
  }
}