package avrobase.shard; import avrobase.AvroBase; import avrobase.AvroBaseException; import avrobase.Row; import org.apache.avro.specific.SpecificRecord; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; /** * The sharding strategy controls how keys are assigned to shards * <p/> * User: sam * Date: 10/9/10 * Time: 6:44 PM */ public interface ShardingStrategy<T extends SpecificRecord, K> { /** * Locate a shard for a row. * @param row * @return */ Shard<T, K> find(K row); /** * Finish using a found shard. * @param shard */ void done(Shard<T, K> shard); /** * Add a new shardable avrobase to the system with a particular * weighting. * @param avroBase * @param weight */ void add(ShardableAvroBase<T, K> avroBase, double weight); /** * Some operations require balance to be achieved before they can * be performed. * @throws InterruptedException */ void waitForBalance() throws InterruptedException; /** * The partition strategy divides the key space between multiple shardable avrobases. * @param <T> * @param <K> */ public static class Partition<T extends SpecificRecord, K> implements ShardingStrategy<T, K> { private final ReadWriteLock lock = new ReentrantReadWriteLock(); private final Lock writeShards = lock.writeLock(); private final Lock readShards = lock.readLock(); private final List<PartitionedShard<T, K>> activeShards = new ArrayList<PartitionedShard<T, K>>(); private final List<PartitionedShard<T, K>> usedShards = new ArrayList<PartitionedShard<T, K>>(); private final Set<ShardableAvroBase<T, K>> addedAvroBase = Collections.synchronizedSet(new HashSet<ShardableAvroBase<T, K>>()); private final Comparator<K> comparator; private final ExecutorService balancePool; private final ExecutorService es; public Partition(@Inject(SC.KEY_COMPARATOR) Comparator<K> comparator) { this.comparator = comparator; balancePool = Executors.newFixedThreadPool(1); es = Executors.newCachedThreadPool(); } static class PartitionedShard<T extends SpecificRecord, K> extends Shard<T, K> { private K start; public PartitionedShard(ShardableAvroBase<T, K> tkAvroBase, double weight, K start) { super(tkAvroBase, weight); this.start = start; } } /** * Scan the list of shards for the shard that contains the row. * @param row * @return */ @Override public Shard<T, K> find(K row) { // TODO: convert to binary search readShards.lock(); try { for (int i = 0; i < activeShards.size(); i++) { PartitionedShard<T, K> shard = activeShards.get(i); if (shard.start == null || comparator.compare(shard.start, row) <= 0) { if (i + 1 == activeShards.size() || comparator.compare(activeShards.get(i + 1).start, row) > 0) { synchronized (usedShards) { usedShards.add(shard); } return shard; } } } throw new AvroBaseException("No active shard matches row"); } finally { readShards.unlock(); } } /** * Remove the shard from the list of shards currently being used. * @param tkShard */ @Override public void done(Shard<T, K> tkShard) { synchronized (usedShards) { usedShards.remove(tkShard); usedShards.notifyAll(); } } /** * Add a new shardable avrobase then balance the shards. The new shard is added to the * end of the shard list and keys are then balanced starting at the first shard through * the last shard. The weighting is done at sliding point in time and assumes that * the weights dont change much during the counting process. * @param avroBase * @param weight */ @Override public void add(final ShardableAvroBase<T, K> avroBase, final double weight) { if (activeShards.size() == 0) { // The first shard handles all keys writeShards.lock(); try { activeShards.add(new PartitionedShard<T, K>(avroBase, weight, null)); } finally { writeShards.unlock(); } } else { addedAvroBase.add(avroBase); balancePool.submit(new Runnable() { @Override public void run() { // Repartition the data given a new shard try { // Count all records double totalWeight = weight; List<Future<Long>> counts = new ArrayList<Future<Long>>(); for (int i = 0; i < activeShards.size(); i++) { final PartitionedShard<T, K> shard = activeShards.get(i); totalWeight += shard.weight(); final int finalI = i; counts.add(es.submit(new Callable<Long>() { public Long call() throws Exception { long count = 0; K end = finalI + 1 == activeShards.size() ? null : activeShards.get(finalI + 1).start; for (K row : shard.avrobase().scanKeys(shard.start, end)) { count++; } return count; } })); } long total = 0; for (Future<Long> count : counts) { try { total += count.get(); } catch (Exception e) { throw new AvroBaseException("Corrupt shard: " + e); } } // The new list of shards includes the new shard on the end List<PartitionedShard<T, K>> newShards = new ArrayList<PartitionedShard<T, K>>(activeShards); newShards.add(new PartitionedShard<T, K>(avroBase, weight, null)); // Stop the world implementation // Lock, wait for all shards to be returned writeShards.lock(); synchronized (usedShards) { while (usedShards.size() != 0) { usedShards.wait(); } } // Copy between shards to make new shard distributions and set the start / end key K current; for (int j = 0; j < newShards.size(); j++) { PartitionedShard<T, K> shard = newShards.get(j); PartitionedShard<T, K> nextShard = j + 1 == newShards.size() ? null : newShards.get(j + 1); // This is the new number of records the partition should contain long newcount = (long) (total * shard.weight() / totalWeight); current = null; K end = j + 1 == newShards.size() ? null : newShards.get(j + 1).start; for (K tRow : shard.avrobase().scanKeys(shard.start, end)) { if (newcount-- <= 0) { if (nextShard == null) { break; } else { // The start is the first row that wasn't in the last shard if (current != null) { nextShard.start = tRow; current = null; } // Copy all the remaining rows from this shard to the next nextShard.avrobase().put(tRow, shard.avrobase().get(tRow).value); // Remove them from this shard shard.avrobase().delete(tRow); } } else { current = tRow; } } } // Set the new active shards activeShards.clear(); activeShards.addAll(newShards); } catch (InterruptedException e) { throw new AvroBaseException("Shard add interrupted", e); } finally { // Notify that we are done adding this avrobase synchronized (addedAvroBase) { addedAvroBase.remove(avroBase); addedAvroBase.notifyAll(); } // Release the lock writeShards.unlock(); } } }); } } public void waitForBalance() throws InterruptedException { synchronized (addedAvroBase) { while (!addedAvroBase.isEmpty()) { addedAvroBase.wait(); } } } } }