/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.util.consistenthash;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
/**
* This is an implementation of a consistent hash. T is the type of a bin.
*
* It is mostly copied from Tom White's implementation found here:
* http://www.lexemetech.com/2007/11/consistent-hashing.html
*
* Blog comments mention that there may be a bug in this implementation -- if
* there is a key collision we may lose bins. The probability of this is small,
* and even smaller with a higher replication factor. It could be made even
* rarer by enlarging the circle by using Long instead of Integer.
*
* getNBinsFor and getNUniqueBinsFor return ordered lists of bins for a
* particular object. This is useful for assigning backups if the first bin fails.
*
* This datastructure is not threadsafe.
*/
public class ConsistentHash<T> {
  /**
   * When looking for n unique bins, only the first MAX_DUPES duplicates of a
   * streak are retried for free; every further duplicate in the same streak
   * uses up one of the n requested slots.
   */
  public static final int MAX_DUPES = 10;

  // # of times a bin is replicated in hash circle. (for better load balancing)
  private final int numberOfReplicas;

  private final HashFunction hashFunction;

  // The hash circle: point on the circle -> bin that owns the point.
  private final SortedMap<Integer, T> circle = new TreeMap<Integer, T>();

  /**
   * Builds a consistent hash over the given bins using an
   * {@link MD5HashFunction}.
   *
   * @param numberOfReplicas number of points each bin gets on the circle
   * @param nodes the initial bins
   */
  public ConsistentHash(int numberOfReplicas, Collection<T> nodes) {
    this(new MD5HashFunction(), numberOfReplicas, nodes);
  }

  /**
   * Builds a consistent hash over the given bins.
   *
   * @param hashFunction maps bin replicas and lookup keys onto the circle
   * @param numberOfReplicas number of points each bin gets on the circle
   * @param nodes the initial bins; each is added via {@link #addBin(Object)}
   */
  public ConsistentHash(HashFunction hashFunction, int numberOfReplicas,
      Collection<T> nodes) {
    this.hashFunction = hashFunction;
    this.numberOfReplicas = numberOfReplicas;
    for (T node : nodes) {
      addBin(node);
    }
  }

  /**
   * Add a new bin to the consistent hash.
   *
   * This assumes that the bin's toString method is immutable. If a replica
   * hashes to a point that is already taken, the existing entry is
   * overwritten (last writer wins).
   *
   * This is not thread safe.
   */
  public void addBin(T bin) {
    for (int i = 0; i < numberOfReplicas; i++) {
      circle.put(replicaKey(bin, i), bin);
    }
  }

  /**
   * Remove a bin from the consistent hash.
   *
   * This assumes that the bin's toString method is immutable. Only points
   * that are still owned by a bin with the same toString value are removed,
   * so a point that a colliding bin has taken over is left in place.
   *
   * This is not thread safe.
   */
  public void removeBin(T bin) {
    for (int i = 0; i < numberOfReplicas; i++) {
      int point = replicaKey(bin, i);
      T owner = circle.get(point);
      // Bins are identified by toString (that is what the keys are derived
      // from), so ownership is compared the same way.
      if (owner != null && owner.toString().equals(bin.toString())) {
        circle.remove(point);
      }
    }
  }

  /**
   * This returns the closest bin for the object. If the object hashes exactly
   * onto a point of the circle that point's bin is returned, otherwise the bin
   * of the closest subsequent point (wrapping around) is returned.
   *
   * @return the bin for the key, or null if there are no bins
   */
  public T getBinFor(Object key) {
    if (circle.isEmpty()) {
      return null;
    }
    return circle.get(pointAtOrAfter(hashFunction.hash(key)));
  }

  /**
   * This returns the closest n bins in order for the object. There may be
   * duplicates.
   *
   * @return a list of n bins, or an empty list if there are no bins
   */
  public List<T> getNBinsFor(Object key, int n) {
    if (circle.isEmpty()) {
      return Collections.<T> emptyList();
    }

    List<T> list = new ArrayList<T>(n);
    int hash = hashFunction.hash(key);
    for (int i = 0; i < n; i++) {
      hash = pointAtOrAfter(hash);
      list.add(circle.get(hash));
      // Step past the point just used. Overflow wraps to Integer.MIN_VALUE,
      // which is the start of the circle, so the wrap-around is preserved.
      hash++;
    }
    return list;
  }

  /**
   * This returns the closest bins in order for the object, without
   * duplicates.
   *
   * At most n bins are returned. Fewer are returned when there are fewer than
   * n distinct bins, and also when the walk around the circle hits a streak of
   * more than MAX_DUPES duplicates: each duplicate beyond that limit uses up
   * one of the n slots, so bins located behind a long streak may be missing.
   *
   * @return a list of up to n distinct bins, or an empty list if there are no
   *         bins
   */
  public List<T> getNUniqueBinsFor(Object key, int n) {
    if (circle.isEmpty()) {
      return Collections.<T> emptyList();
    }

    int points = circle.size();
    // There can never be more distinct bins than points on the circle.
    List<T> list = new ArrayList<T>(Math.min(n, points));
    int hash = hashFunction.hash(key);
    int slotsLeft = n;
    int dupeStreak = 0;

    // One full lap visits every point exactly once; after that nothing new
    // can be found, so the walk stops there at the latest.
    for (int visited = 0; slotsLeft > 0 && visited < points; visited++) {
      hash = pointAtOrAfter(hash);
      T candidate = circle.get(hash);
      if (!list.contains(candidate)) {
        dupeStreak = 0;
        list.add(candidate);
        slotsLeft--;
      } else if (++dupeStreak > MAX_DUPES) {
        // we've been duped too many times, give up one slot and return
        // fewer than n
        slotsLeft--;
      }
      // find the next element in the circle
      hash++;
    }
    return list;
  }

  /**
   * Computes the circle point of the i-th replica of a bin. The string
   * addition forces each replica to have a different hash. addBin and
   * removeBin must resolve to the same keys, so both go through here.
   */
  private int replicaKey(T bin, int i) {
    return hashFunction.hash(bin.toString() + i);
  }

  /**
   * Returns the first occupied point at or after the given hash, wrapping
   * around to the first point of the circle if there is none. The circle must
   * not be empty.
   */
  private int pointAtOrAfter(int hash) {
    // tailMap includes the entry at 'hash' itself, so an exact hit is covered.
    SortedMap<Integer, T> tailMap = circle.tailMap(hash);
    return tailMap.isEmpty() ? circle.firstKey() : tailMap.firstKey();
  }
}