package edu.berkeley.thebes.hat.server.antientropy.clustering; import com.yammer.metrics.Metrics; import com.yammer.metrics.core.Histogram; import com.yammer.metrics.core.Meter; import org.apache.thrift.TException; import org.apache.thrift.transport.TTransportException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Queues; import com.google.common.util.concurrent.Uninterruptibles; import edu.berkeley.thebes.common.clustering.RoutingHash; import edu.berkeley.thebes.common.config.Config; import edu.berkeley.thebes.common.data.Version; import edu.berkeley.thebes.common.thrift.ServerAddress; import edu.berkeley.thebes.common.thrift.ThriftDataItem; import edu.berkeley.thebes.common.thrift.ThriftVersion; import edu.berkeley.thebes.hat.common.thrift.AntiEntropyService; import edu.berkeley.thebes.hat.common.thrift.ThriftUtil; import edu.berkeley.thebes.hat.server.dependencies.PendingWrite; import org.xerial.snappy.Snappy; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; public class AntiEntropyServiceRouter { private static Logger logger = LoggerFactory.getLogger(AntiEntropyServiceRouter.class); Meter writeForwardCount = Metrics.newMeter(AntiEntropyServiceRouter.class, "write-forward-events", "events", TimeUnit.SECONDS); Meter announceWriteCount = Metrics.newMeter(AntiEntropyServiceRouter.class, "write-announce-events", "events", TimeUnit.SECONDS); Histogram aeBatchSize = Metrics.newHistogram(AntiEntropyServiceRouter.class, "anti-entropy-batch-size"); Histogram taBatchSize = Metrics.newHistogram(AntiEntropyServiceRouter.class, "ta-batch-size"); Histogram taUncompressedSize = Metrics.newHistogram(AntiEntropyServiceRouter.class, "ta-uncompressed-batch-bytes"); Histogram taCompressedSize = Metrics.newHistogram(AntiEntropyServiceRouter.class, "ta-compressed-batch-bytes"); public void bootstrapAntiEntropyRouting() throws TTransportException { if (Config.isStandaloneServer()) { logger.debug("Server marked as standalone; not starting anti-entropy (jk)!"); // TODO: Fix this. // return; } Uninterruptibles.sleepUninterruptibly(Config.getAntiEntropyBootstrapTime(), TimeUnit.MILLISECONDS); logger.debug("Bootstrapping anti-entropy..."); logger.trace("Starting thread to forward writes to siblings..."); for (int i = 0; i < Config.getNumAntiEntropyThreads(); i ++) { Thread t = new Thread() { public void run() { List<AntiEntropyService.Client> replicaSiblingClients = createClientsFromAddresses(Config.getSiblingServers()); while (true) { writeForwardCount.mark(); forwardNextQueuedWriteToSiblings(replicaSiblingClients); } } }; t.setPriority(Thread.NORM_PRIORITY-2); t.start(); } logger.trace("Starting thread to announce new pending writes..."); for (int i = 0; i < Config.getNumTAAntiEntropyThreads(); i ++) { Thread t = new Thread() { public void run() { List<AntiEntropyService.Client> neighborClients = createClientsFromAddresses(Config.getServersInCluster()); while (true) { announceWriteCount.mark(); announceNextQueuedPendingWrite(neighborClients); } } }; t.setPriority(Thread.NORM_PRIORITY-3); t.start(); } logger.debug("...anti-entropy bootstrapped"); } /** Stores the writes we receive and need to forward to all siblings */ private final LinkedBlockingQueue<QueuedWrite> writesToForwardSiblings; /** Stores the writes we've put into pending, and need to notify all dependent neighbors. */ private final LinkedBlockingQueue<QueuedTransactionAnnouncement> pendingTransactionAnnouncements; public AntiEntropyServiceRouter() { this.writesToForwardSiblings = Queues.newLinkedBlockingQueue(); this.pendingTransactionAnnouncements = Queues.newLinkedBlockingQueue(); } /** Our cluster got a new write, forward to the replicas in other clusters. */ public void sendWriteToSiblings(String key, ThriftDataItem value) { writesToForwardSiblings.add(new QueuedWrite(key, value)); } /** Actually does the forwarding! Called in its own thread. */ private void forwardNextQueuedWriteToSiblings(List<AntiEntropyService.Client> siblings) { ServerAddress tryServer = null; try { List<QueuedWrite> writes = Lists.newArrayList(); writes.add(writesToForwardSiblings.take()); Uninterruptibles.sleepUninterruptibly(200, TimeUnit.MILLISECONDS); writesToForwardSiblings.drainTo(writes); List<String> keys = Lists.newArrayListWithExpectedSize(writes.size()); List<ThriftDataItem> values = Lists.newArrayListWithExpectedSize(writes.size()); for (QueuedWrite write : writes) { keys.add(write.key); values.add(write.value); } aeBatchSize.update(writes.size()); int i = 0; for (AntiEntropyService.Client sibling : siblings) { tryServer = Config.getSiblingServers().get(i++); sibling.put(keys, values); } } catch (TException e) { logger.error("Failure while forwarding write to siblings (" + tryServer + "): ", e); } catch (InterruptedException e) { logger.error("Interrupted: ", e); } } /** Announce that a transaction is ready to some set of servers. */ public void announceTransactionReady(Version transactionID, Set<Integer> servers) { pendingTransactionAnnouncements.add( new QueuedTransactionAnnouncement(transactionID, servers)); } /** Actually does the announcement! Called in its own thread. */ private void announceNextQueuedPendingWrite(List<AntiEntropyService.Client> neighbors) { ServerAddress tryServer = null; try { List<QueuedTransactionAnnouncement> announcements = Lists.newArrayList(); announcements.add(pendingTransactionAnnouncements.take()); Uninterruptibles.sleepUninterruptibly(Config.getTABatchTime(), TimeUnit.MILLISECONDS); pendingTransactionAnnouncements.drainTo(announcements); Map<Integer, List<Long>> versionByServer = Maps.newHashMap(); int numSending = 0; for (QueuedTransactionAnnouncement ann : announcements) { for (Integer serverIndex : ann.servers) { if (!versionByServer.containsKey(serverIndex)) { versionByServer.put(serverIndex, new ArrayList<Long>()); } numSending ++; versionByServer.get(serverIndex).add(ann.transactionID.getThriftVersion().getVersion()); } } taBatchSize.update(numSending); for (Integer serverIndex : versionByServer.keySet()) { AntiEntropyService.Client neighborClient = neighbors.get(serverIndex); List<Long> versionsToSend = versionByServer.get(serverIndex); ByteArrayOutputStream baos = new ByteArrayOutputStream(versionsToSend.size()*Long.SIZE); DataOutputStream dos = new DataOutputStream(baos); for(long toSend : versionByServer.get(serverIndex)) { dos.writeLong(toSend); } byte[] uncompressedIds = baos.toByteArray(); byte[] compressedIds = Snappy.compress(uncompressedIds); taUncompressedSize.update(uncompressedIds.length); taCompressedSize.update(compressedIds.length); neighborClient.ackTransactionPending(ByteBuffer.wrap(compressedIds)); } } catch (IOException e) { logger.error("Failure while serializing ", e); } catch (TException e) { logger.error("Failure while announcing dpending write to " + tryServer + ": ", e); } catch (InterruptedException e) { logger.error("Interrupted: ", e); } } private List<AntiEntropyService.Client> createClientsFromAddresses( List<ServerAddress> addresses) { List<AntiEntropyService.Client> clients = Lists.newArrayList(); for (ServerAddress address : addresses) { while (true) { try { clients.add(ThriftUtil.getAntiEntropyServiceClient( address.getIP(), Config.getAntiEntropyServerPort())); break; } catch (Exception e) { logger.error("Exception while bootstrapping connection with cluster server: " + address); e.printStackTrace(); } } } return clients; } private static class QueuedWrite { public final String key; public final ThriftDataItem value; public QueuedWrite(String key, ThriftDataItem value) { this.key = key; this.value = value; } } private static class QueuedTransactionAnnouncement { public final Version transactionID; public final Set<Integer> servers; public QueuedTransactionAnnouncement(Version transactionID, Set<Integer> servers) { this.transactionID = transactionID; this.servers = servers; } } }