/**
* Copyright (C) 2015 meltmedia (christian.trimble@meltmedia.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.meltmedia.dropwizard.etcd.cluster;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import java.math.RoundingMode;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.IntStream;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.ToStringBuilder;
import org.joda.time.DateTime;
import org.junit.After;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.MetricRegistryListener;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.math.IntMath;
import com.meltmedia.dropwizard.etcd.cluster.ClusterService.ProcessService;
import com.meltmedia.dropwizard.etcd.json.EtcdDirectoryDao;
import com.meltmedia.dropwizard.etcd.json.EtcdEvent;
import com.meltmedia.dropwizard.etcd.json.EtcdJson.EtcdDirectory;
import com.meltmedia.dropwizard.etcd.json.EtcdJson.MappedEtcdDirectory;
import com.meltmedia.dropwizard.etcd.json.WatchService;
import com.meltmedia.dropwizard.etcd.junit.EtcdClientRule;
import com.meltmedia.dropwizard.etcd.junit.EtcdJsonRule;
/**
* The first version of this application just needs to keep all of the twitter streams
* open as best as possible. There are 2 concerns here:
*
* 1. Getting the correct set of stream states published to etcd.
* 2. Updating stream state as nodes come up and down.
*
* To accomplish the first task, it would help to have a leader that updates the state.
*
* @author Christian Trimble
*
*/
public class ClusterAssignmentIT {
// Class-wide rule providing the etcd client; it targets http://127.0.0.1:2379.
// NOTE(review): 1014*1000 may be a typo for 1024*1000 — confirm the intended max frame size.
@ClassRule
public static EtcdClientRule clientRule = new EtcdClientRule("http://127.0.0.1:2379").withMaxFrameSize(1014*1000);
// Per-test rule built from the shared client and the "/cluster-test" path.
@Rule
public EtcdJsonRule factoryRule = new EtcdJsonRule(clientRule::getClient, "/cluster-test");
// Node descriptors with ids "node1", "node2" and "node3"; created in setUp().
ClusterNode node1;
ClusterNode node2;
ClusterNode node3;
// Scheduled pool shared by every cluster service built in this test; shut down in tearDown().
ScheduledExecutorService executor;
// DAO over the "/processes" directory; tests use it to publish ClusterProcess entries.
EtcdDirectoryDao<ClusterProcess> dao;
// DAO over the "/processors" directory; used to publish ProcessorNode entries.
EtcdDirectoryDao<ProcessorNode> processorDao;
// Creates latches that watch the "/processes" directory for an expected assignment state.
AssignmentLatchFactory latchFactory;
// "/app/nodes" directory handed to every cluster service as its nodes directory.
MappedEtcdDirectory<ClusterNode> nodeDir;
// "/app/streams" directory; parent of "/processes" and "/processors".
EtcdDirectory processDir;
// One cluster service per node, built (but not started) in setUp().
private ClusterService clusterService1;
private ClusterService clusterService2;
private ClusterService clusterService3;
// One process service per cluster service, built (but not started) in setUp().
private ProcessService<ObjectNode> processService1;
private ProcessService<ObjectNode> processService2;
private ProcessService<ObjectNode> processService3;
// Mock listener attached to node1's metric registry; inspected by registerMetrics().
private MetricRegistryListener listener1;
/**
 * Builds the fixture shared by every test: the executor, the etcd directories, three
 * cluster/process service pairs (node1..node3, none of them started), the DAOs and the
 * latch factory watching the processes directory.
 *
 * Only node1's metric registry gets the mock listener attached.
 */
@Before
public void setUp() throws Exception {
  executor = Executors.newScheduledThreadPool(10);

  nodeDir = factoryRule.getFactory().newDirectory("/app/nodes", new TypeReference<ClusterNode>() {});
  processDir = factoryRule.getFactory().newDirectory("/app/streams");
  MappedEtcdDirectory<ClusterProcess> processesDir =
      processDir.newDirectory("/processes", new TypeReference<ClusterProcess>() {});
  MappedEtcdDirectory<ProcessorNode> processorsDir =
      processDir.newDirectory("/processors", new TypeReference<ProcessorNode>() {});

  // node1 is the only node whose metric registrations are observed.
  node1 = new ClusterNode().withId("node1").withStartedAt(new DateTime());
  MetricRegistry observedRegistry = new MetricRegistry();
  listener1 = mock(MetricRegistryListener.class);
  observedRegistry.addListener(listener1);
  clusterService1 = clusterServiceFor(node1, observedRegistry);
  processService1 = processServiceFor(clusterService1);

  node2 = new ClusterNode().withId("node2").withStartedAt(new DateTime());
  clusterService2 = clusterServiceFor(node2, new MetricRegistry());
  processService2 = processServiceFor(clusterService2);

  node3 = new ClusterNode().withId("node3").withStartedAt(new DateTime());
  clusterService3 = clusterServiceFor(node3, new MetricRegistry());
  processService3 = processServiceFor(clusterService3);

  dao = processesDir.newDao();
  processorDao = processorsDir.newDao();
  latchFactory =
      new AssignmentLatchFactory(factoryRule.getFactory().getWatchService(), processesDir.getPath());
}

/** Builds (without starting) a cluster service for the given node on the shared executor and nodes directory. */
private ClusterService clusterServiceFor(ClusterNode node, MetricRegistry registry) {
  return ClusterService.builder()
      .withEtcdFactory(factoryRule.getFactory())
      .withExecutor(executor)
      .withNodesDirectory(nodeDir)
      .withThisNode(node)
      .withMetricRegistry(registry)
      .build();
}

/** Creates (without starting) a process service over the shared process directory using the no-op lifecycle. */
private ProcessService<ObjectNode> processServiceFor(ClusterService cluster) {
  return cluster.newProcessService(processDir, ClusterAssignmentIT::toLifecycle, new TypeReference<ObjectNode>() {});
}
/**
 * Lifecycle factory passed to newProcessService(...). The process configuration is
 * ignored and the returned lifecycle does nothing on start or stop.
 *
 * @param process the process configuration (unused)
 * @return a lifecycle whose callbacks are no-ops
 */
public static ClusterProcessLifecycle toLifecycle( ObjectNode process ) {
  final ClusterProcessLifecycle noOp = new ClusterProcessLifecycle() {
    @Override
    public void start() {
      // nothing to start
    }

    @Override
    public void stop() {
      // nothing to stop
    }
  };
  return noOp;
}
/**
 * Shuts the shared executor down after each test.
 *
 * The three cluster services are not stopped here (those calls were left disabled in this
 * method), so tests are expected to stop the services they start.
 */
@After
public void tearDown() throws Exception {
  executor.shutdown();
}
/** With only node1 running, a single published process ends up assigned to node1. */
@Test
public void shouldAssignJob() throws InterruptedException {
  clusterService1.start();
  processService1.start();

  dao.put("id", processNode(null, "name"));
  Predicate<AssignmentState> oneOnNode1 = state -> state.assignments("node1") == 1;
  assertState("job assigned", oneOnNode1);

  processService1.stop();
  clusterService1.stop();
}
/** With only node1 running, two published processes both end up assigned to node1. */
@Test
public void shouldAssignMultipleWithOneNode() throws InterruptedException {
  clusterService1.start();
  processService1.start();

  // publishes id1/name1 and id2/name2, in that order
  for (int n = 1; n <= 2; n++) {
    dao.put("id" + n, processNode(null, "name" + n));
  }
  Predicate<AssignmentState> twoOnNode1 = state -> state.assignments("node1") == 2;
  assertState("job assigned", twoOnNode1);

  processService1.stop();
  clusterService1.stop();
}
/**
 * Processes that already exist with assignedTo values "junk" and "moreJunk" are both
 * assigned to node1 once it starts; waits up to 20 seconds.
 */
@Test
public void shouldAssignExistingBrokenNodes() throws InterruptedException {
  ClusterProcess strandedOnJunk = processNode(null, "name1").withAssignedTo("junk");
  dao.put("id1", strandedOnJunk);
  ClusterProcess strandedOnMoreJunk = processNode(null, "name2").withAssignedTo("moreJunk");
  dao.put("id2", strandedOnMoreJunk);

  clusterService1.start();
  processService1.start();

  Predicate<AssignmentState> twoOnNode1 = state -> state.assignments("node1") == 2;
  assertState("job assigned", twoOnNode1, 20, TimeUnit.SECONDS);

  processService1.stop();
  clusterService1.stop();
}
/**
 * Runs 25 rounds of: start node1, see the process assigned to it, stop node1, see the
 * process unassigned, then start node2, see the process assigned to it and stop node2.
 */
@Test
public void shouldAllowRestart() throws InterruptedException {
  final Predicate<AssignmentState> onNode1 = state -> state.assignments("node1") == 1;
  final Predicate<AssignmentState> onNode2 = state -> state.assignments("node2") == 1;
  final Predicate<AssignmentState> nobodyHasIt = state -> state.unassigned() == 1;

  dao.put("id", processNode(null, "name"));

  int round = 0;
  while (round < 25) {
    clusterService1.start();
    processService1.start();
    assertState("job assigned", onNode1);
    processService1.stop();
    clusterService1.stop();
    assertState("job unnassigned", nobodyHasIt);

    clusterService2.start();
    processService2.start();
    assertState("job assigned", onNode2);
    processService2.stop();
    clusterService2.stop();

    round++;
  }
}
/** With node1 and node2 running, two published processes are split one per node. */
@Test
public void shouldAssignJobsEvenly() throws InterruptedException {
  clusterService1.start();
  processService1.start();
  clusterService2.start();
  processService2.start();

  dao.put("id1", processNode(null, "name1"));
  dao.put("id2", processNode(null, "name2"));

  Predicate<AssignmentState> oneEach =
      state -> state.assignments("node1") == 1 && state.assignments("node2") == 1;
  assertState("jobs evenly assigned", oneEach);

  processService1.stop();
  clusterService1.stop();
  processService2.stop();
  clusterService2.stop();
}
/**
 * node1 first holds both processes; after node2 starts, the assignments are expected to
 * settle at one process per node.
 */
@Test
public void shouldReassignWhenServiceAdded() throws InterruptedException {
  clusterService1.start();
  processService1.start();

  dao.put("id1", processNode(null, "name1"));
  dao.put("id2", processNode(null, "name2"));
  Predicate<AssignmentState> allOnNode1 =
      state -> state.assignments("node1") == 2 && state.assignments("node2") == 0;
  assertState("initial state reached", allOnNode1);

  clusterService2.start();
  processService2.start();
  Predicate<AssignmentState> oneEach =
      state -> state.assignments("node1") == 1 && state.assignments("node2") == 1;
  assertState("reassigned after start", oneEach);

  processService1.stop();
  clusterService1.stop();
  processService2.stop();
  clusterService2.stop();
}
/**
 * Asserts that the watched assignment state satisfies {@code test} within 100 seconds.
 *
 * @param message assertion reason, also used as the latch name
 * @param test the state that must be reached
 */
private void assertState(String message, Predicate<AssignmentState> test)
    throws InterruptedException {
  // Same latch as the timed overload, with the default 100 second wait.
  assertState(message, test, 100, TimeUnit.SECONDS);
}
/**
 * Asserts that the watched assignment state satisfies {@code test} within 100 seconds,
 * using a latch that also checks {@code illegalStateTest}.
 *
 * @param message assertion reason, also used as the latch name
 * @param test the state that must be reached
 * @param illegalStateTest a state that must not be observed while waiting
 */
private void assertState(String message, Predicate<AssignmentState> test, Predicate<AssignmentState> illegalStateTest)
    throws InterruptedException {
  // Same latch as the timed overload, with the default 100 second wait.
  assertState(message, test, illegalStateTest, 100, TimeUnit.SECONDS);
}
/**
 * Asserts that the watched assignment state satisfies {@code test} within the given time.
 *
 * @param message assertion reason, also used as the latch name
 * @param test the state that must be reached
 * @param duration how long to wait
 * @param unit unit of {@code duration}
 */
private void assertState(String message, Predicate<AssignmentState> test, long duration,
    TimeUnit unit) throws InterruptedException {
  AssignmentLatch stateLatch = latchFactory.newLatch(message, test);
  boolean reached = stateLatch.await(duration, unit);
  assertThat(message, reached, equalTo(true));
}
/**
 * Asserts that the watched assignment state satisfies {@code test} within the given time,
 * using a latch that also checks {@code illegalStateTest}.
 *
 * @param message assertion reason, also used as the latch name
 * @param test the state that must be reached
 * @param illegalStateTest a state that must not be observed while waiting
 * @param duration how long to wait
 * @param unit unit of {@code duration}
 */
@SuppressWarnings("unused")
private void assertState(String message, Predicate<AssignmentState> test, Predicate<AssignmentState> illegalStateTest, long duration,
    TimeUnit unit) throws InterruptedException {
  AssignmentLatch stateLatch = latchFactory.newLatch(message, test, illegalStateTest);
  boolean reached = stateLatch.await(duration, unit);
  assertThat(message, reached, equalTo(true));
}
/**
 * Two nodes start with one process each; after node2 is stopped, both processes are
 * expected to be assigned to node1.
 */
@Test
public void shouldReassignWhenServiceLost() throws InterruptedException {
  clusterService1.start();
  processService1.start();
  clusterService2.start();
  processService2.start();

  dao.put("id1", processNode(null, "name1"));
  dao.put("id2", processNode(null, "name2"));
  Predicate<AssignmentState> oneEach =
      state -> state.assignments("node1") == 1 && state.assignments("node2") == 1;
  assertState("the initial state was reached", oneEach);

  processService2.stop();
  clusterService2.stop();
  Predicate<AssignmentState> allOnNode1 =
      state -> state.assignments("node1") == 2 && state.assignments("node2") == 0;
  assertState("the jobs were reassigned", allOnNode1);

  processService1.stop();
  clusterService1.stop();
}
/** Once both running nodes are stopped, both processes are expected to become unassigned. */
@Test
public void shouldUnassignWhenAllStopped() throws InterruptedException {
  clusterService1.start();
  processService1.start();
  clusterService2.start();
  processService2.start();

  dao.put("id1", processNode(null, "name1"));
  dao.put("id2", processNode(null, "name2"));
  Predicate<AssignmentState> nothingUnassigned = state -> state.unassigned() == 0;
  assertState("the initial state was reached", nothingUnassigned);

  processService1.stop();
  clusterService1.stop();
  processService2.stop();
  clusterService2.stop();

  Predicate<AssignmentState> bothUnassigned = state -> state.unassigned() == 2;
  assertState("all jobs were unassigned", bothUnassigned);
}
/**
 * Rolls a blue/green hand-over across the three pre-built process services, reusing them
 * in round-robin order for nine hand-overs.
 *
 * Ignored: there is a known bug with reusing a cluster assignment service. The same
 * scenario works with new instances (see shouldBlueGreenDeployWithNewServices).
 */
@Ignore
@Test
public void shouldBlueGreenDeploy() throws InterruptedException {
  @SuppressWarnings("unchecked")
  List<ProcessService<ObjectNode>> services = Lists.newArrayList(processService1, processService2, processService3);
  Function<Integer, ProcessService<ObjectNode>> serviceLookup =
      index -> services.get(index % services.size());

  // publishes id1/name1 .. id5/name5, in that order
  for (int n = 1; n <= 5; n++) {
    dao.put("id" + n, processNode(null, "name" + n));
  }

  ProcessService<ObjectNode> green = serviceLookup.apply(0);
  green.start();
  assertState("only green running", state -> state.assignments("node1") == 5);

  for (int step = 1; step < 10; step++) {
    ProcessService<ObjectNode> blue = serviceLookup.apply(step);
    blue.start();
    Predicate<AssignmentState> splitThreeTwo =
        state -> state.maxAssignments() == 3 && state.minAssignments() == 2 && state.unassigned() == 0;
    assertState("blue and green running", splitThreeTwo);

    green.stop();
    assertState("blue now green", state -> state.assignments(blue.getId()) == 5);
    green = blue;
  }

  green.stop();
  assertState("all stopped", state -> state.unassigned() == 5);
}
/**
 * Blue/green hand-over using a freshly built cluster/process service pair for every step.
 *
 * Twenty processes start on "node0". For each of node1..node3 a new pair is started, the
 * load is expected to split in half, the previous pair is stopped from the executor, and
 * the new node is expected to end up with every process. While waiting, seeing more than
 * one unassigned process is treated as an illegal state.
 */
@Test
public void shouldBlueGreenDeployWithNewServices() throws InterruptedException {
  final int processCount = 20;
  for (int n = 0; n < processCount; n++) {
    dao.put("id" + n, processNode(null, "name" + n));
  }
  final int lowerHalf = IntMath.divide(processCount, 2, RoundingMode.FLOOR);
  final int upperHalf = IntMath.divide(processCount, 2, RoundingMode.CEILING);
  final Predicate<AssignmentState> tooManyUnassigned = state -> state.unassigned() > 1;

  ClusterNode firstNode = new ClusterNode().withId("node0").withStartedAt(new DateTime());
  MetricRegistry firstRegistry = new MetricRegistry();
  ClusterService greenCluster =
      ClusterService.builder()
          .withEtcdFactory(factoryRule.getFactory())
          .withExecutor(executor)
          .withNodesDirectory(nodeDir)
          .withThisNode(firstNode)
          .withMetricRegistry(firstRegistry)
          .build();
  greenCluster.start();
  ProcessService<ObjectNode> greenProcess =
      greenCluster.newProcessService(processDir, ClusterAssignmentIT::toLifecycle, new TypeReference<ObjectNode>() {});
  greenProcess.start();
  assertState("only green running", state -> state.assignments("node0") == processCount);

  for (int step = 1; step < 4; step++) {
    ClusterNode blueNode = new ClusterNode().withId("node" + step).withStartedAt(new DateTime());
    MetricRegistry blueRegistry = new MetricRegistry();
    ClusterService blueCluster =
        ClusterService.builder()
            .withEtcdFactory(factoryRule.getFactory())
            .withExecutor(executor)
            .withNodesDirectory(nodeDir)
            .withThisNode(blueNode)
            .withMetricRegistry(blueRegistry)
            .build();
    blueCluster.start();
    ProcessService<ObjectNode> blueProcess =
        blueCluster.newProcessService(processDir, ClusterAssignmentIT::toLifecycle, new TypeReference<ObjectNode>() {});
    blueProcess.start();

    Predicate<AssignmentState> splitInHalf =
        state -> state.maxAssignments() == upperHalf && state.minAssignments() == lowerHalf
            && state.unassigned() == 0;
    assertState("blue and green running", splitInHalf, tooManyUnassigned);

    // Retire the old pair from the executor so the next latch is already waiting.
    final ProcessService<ObjectNode> retiringProcess = greenProcess;
    final ClusterService retiringCluster = greenCluster;
    executor.schedule(() -> {
      retiringProcess.stop();
      retiringCluster.stop();
    }, 1, TimeUnit.MILLISECONDS);

    Predicate<AssignmentState> allOnBlue =
        state -> state.assignments(blueProcess.getId()) == processCount;
    assertState("blue now green", allOnBlue, tooManyUnassigned);

    greenProcess = blueProcess;
    greenCluster = blueCluster;
  }

  greenProcess.stop();
  greenCluster.stop();
  assertState("all stopped", state -> state.unassigned() == processCount);
}
/**
 * After a process is assigned, the same key is overwritten with an unassigned entry; the
 * process is expected to be assigned again to node1 or node2, and to become unassigned
 * once both nodes stop.
 */
@Test
public void shouldReassignWhenProcessStopped() throws InterruptedException {
  clusterService1.start();
  processService1.start();
  clusterService2.start();
  processService2.start();

  dao.put("id1", processNode(null, "name1"));
  Predicate<AssignmentState> assignedOnce =
      state -> state.unassigned() == 0 && state.totalAssignments() == 1;
  assertState("the initial state was reached", assignedOnce);

  // Overwrite the entry with assignedTo == null.
  dao.put("id1", processNode(null, "name1"));
  Predicate<AssignmentState> backOnEitherNode =
      state -> state.assignments(processService1.getId(), processService2.getId()) == 1
          && state.unassigned() == 0;
  assertState("the node was reassigned", backOnEitherNode);

  processService1.stop();
  clusterService1.stop();
  processService2.stop();
  clusterService2.stop();

  Predicate<AssignmentState> nobodyHasIt =
      state -> state.unassigned() == 1 && state.totalAssignments() == 0;
  assertState("all jobs were unassigned", nobodyHasIt);
}
/**
 * A process published as assigned to node2 (which is never started) is expected to move
 * to node1 once node1's process service starts, and to become unassigned when node1 stops.
 */
@Test
public void shouldRecoverCrashedProcesses() throws InterruptedException {
  clusterService1.start();

  dao.put("id1", processNode(node2.getId(), "name1"));
  Predicate<AssignmentState> strandedOnNode2 =
      state -> state.unassigned() == 0 && state.assignments(node2.getId()) == 1;
  assertState("the initial state was reached", strandedOnNode2);

  processService1.start();
  Predicate<AssignmentState> recoveredByNode1 =
      state -> state.unassigned() == 0 && state.assignments(node1.getId()) == 1;
  assertState("the stranded process was recovered", recoveredByNode1);

  processService1.stop();
  clusterService1.stop();
  assertState("all jobs were unassigned", state -> state.unassigned() == 1);
}
/**
 * With a processor entry published for node2 (which is never started) and two unassigned
 * processes, node1 is expected to take both processes once its process service starts.
 */
@Test
public void shouldRecoverCrashedProcessor() throws InterruptedException {
  clusterService1.start();

  String strandedId = node2.getId();
  processorDao.put(strandedId, processorNode(node2.getId()));
  dao.put("id1", processNode(null, "name1"));
  dao.put("id2", processNode(null, "name2"));

  Predicate<AssignmentState> bothUnassigned = state -> state.unassigned() == 2;
  assertState("the initial state was reached", bothUnassigned);

  processService1.start();
  Predicate<AssignmentState> bothOnNode1 =
      state -> state.unassigned() == 0 && state.assignments(node1.getId()) == 2;
  assertState("the stranded process was recovered", bothOnNode1);

  processService1.stop();
  clusterService1.stop();
  assertState("all jobs were unassigned", bothUnassigned);
}
/**
 * Starting process service 1 is expected to register three gauges and five meters, named
 * from ClusterAssignmentService and "streams", with node1's metric registry. The mock
 * listener attached to that registry is used to check this.
 */
@Test
public void registerMetrics() {
  processService1.start();

  String[] gaugeNames = {
      ClusterAssignmentTracker.TOTAL,
      ClusterAssignmentTracker.ASSIGNED,
      ClusterAssignmentTracker.UNASSIGNED
  };
  for (String gaugeName : gaugeNames) {
    verify(listener1).onGaugeAdded(eq(MetricRegistry.name(ClusterAssignmentService.class, "streams", gaugeName)), any());
  }

  String[] meterNames = {
      ClusterAssignmentService.ASSIGNMENT_FAILURES,
      ClusterAssignmentService.UNASSIGNMENT_FAILURES,
      ClusterAssignmentService.EXCEPTIONS,
      ClusterAssignmentService.ASSIGNMENT_TASK,
      ClusterAssignmentService.CLEAN_UP_TASK
  };
  for (String meterName : meterNames) {
    verify(listener1).onMeterAdded(eq(MetricRegistry.name(ClusterAssignmentService.class, "streams", meterName)), any());
  }
}
/**
 * Creates a ClusterProcess whose configuration is {@code nodeData(name)}.
 *
 * @param assignedTo id of the node the process is assigned to, or null for none
 * @param name value stored under "name" in the process configuration
 * @return the new process entry
 */
public static ClusterProcess processNode(String assignedTo, String name) {
  ClusterProcess process = new ClusterProcess();
  return process
      .withAssignedTo(assignedTo)
      .withConfiguration(nodeData(name));
}
/**
 * Creates a ProcessorNode with the given id and a start time of now.
 *
 * @param id the processor id
 * @return the new processor entry
 */
private static ProcessorNode processorNode( String id ) {
  ProcessorNode processor = new ProcessorNode();
  return processor
      .withId(id)
      .withStartedAt(new DateTime());
}
/**
 * Builds the JSON configuration used for test processes: an object with one "name" field.
 *
 * @param name value of the "name" field
 * @return the new object node
 */
public static ObjectNode nodeData(String name) {
  ObjectNode data = JsonNodeFactory.instance.objectNode();
  return data.put("name", name);
}
/**
 * Simple bean with a single {@code name} property.
 *
 * {@code toString} and {@code equals} are reflection based; {@code hashCode} is derived
 * from {@code name} so that equal instances always share a hash code.
 */
public static class NodeData {
  protected String name;

  public String getName() {
    return name;
  }

  public void setName(String name) {
    this.name = name;
  }

  /** Fluent variant of {@link #setName(String)}; returns this instance. */
  public NodeData withName(String name) {
    this.name = name;
    return this;
  }

  @Override
  public String toString() {
    return ToStringBuilder.reflectionToString(this);
  }

  @Override
  public boolean equals(Object o) {
    return EqualsBuilder.reflectionEquals(this, o);
  }

  /**
   * Hash code consistent with {@link #equals(Object)}: instances that compare equal have
   * equal names, and therefore equal hash codes. Without this override, equal instances
   * would get identity hash codes and misbehave in hash-based collections.
   */
  @Override
  public int hashCode() {
    return Objects.hashCode(name);
  }
}
/**
 * Creates {@link AssignmentLatch} instances that all share one watch service and one
 * watched directory.
 */
public static class AssignmentLatchFactory {
  private WatchService service;
  private String directory;

  /**
   * @param service watch service handed to every latch
   * @param directory directory path handed to every latch
   */
  public AssignmentLatchFactory(WatchService service, String directory) {
    this.directory = directory;
    this.service = service;
  }

  /** Creates a latch for {@code test} whose illegal-state predicate never matches. */
  public AssignmentLatch newLatch(String name, Predicate<AssignmentState> test) {
    Predicate<AssignmentState> neverIllegal = s->false;
    return newLatch(name, test, neverIllegal);
  }

  /** Creates a latch for {@code test} that also checks {@code illegalStateTest}. */
  public AssignmentLatch newLatch(String name, Predicate<AssignmentState> test, Predicate<AssignmentState> illegalStateTest) {
    AssignmentLatch created = new AssignmentLatch(service, directory, test, illegalStateTest, name);
    return created;
  }
}
public static class AssignmentState {
public static String UNASSIGNED = "unassigned";
Map<String, Integer> assignedCount = Maps.newConcurrentMap();
public void assign(String key) {
assignedCount.compute(Optional.ofNullable(key).orElse(UNASSIGNED), (k, count) -> {
return count == null ? 1 : count + 1;
});
}
public int minAssignments() {
return assignedCount.entrySet().stream().filter(e -> !UNASSIGNED.equals(e.getKey()))
.mapToInt(Map.Entry::getValue).min().orElse(0);
}
public int maxAssignments() {
return assignedCount.entrySet().stream().filter(e -> !UNASSIGNED.equals(e.getKey()))
.mapToInt(Map.Entry::getValue).max().orElse(0);
}
public void unassign(String key) {
assignedCount.compute(Optional.ofNullable(key).orElse(UNASSIGNED), (k, count) -> {
if (count == null) {
throw new IllegalStateException("node unassigned when not assigned.");
}
if (count < 1) {
throw new IllegalStateException("node count less than zero.");
}
return count == 1 ? null : count - 1;
});
}
public int totalAssignments() {
return assignedCount.entrySet().stream().filter(e -> !UNASSIGNED.equals(e.getKey()))
.mapToInt(Map.Entry::getValue).sum();
}
public int unassigned() {
return assignedCount.getOrDefault(UNASSIGNED, 0);
}
public int assignments(String... keys) {
Set<String> keySet = Sets.newHashSet(keys);
return assignedCount.entrySet().stream().filter(e -> keySet.contains(e.getKey()))
.mapToInt(Map.Entry::getValue).sum();
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("{");
assignedCount.entrySet().stream().forEach(e -> {
sb.append(String.format(" %s:%s", e.getKey(), e.getValue()));
});
sb.append(" }");
return sb.toString();
}
}
/**
 * One-shot latch over the assignment state of a watched directory.
 *
 * {@link #await(long, TimeUnit)} registers {@link #handle(EtcdEvent)} as a directory watch.
 * Each event updates an {@link AssignmentState}, after which both predicates are checked.
 * The latch opens when either the expected state or the illegal state is observed.
 */
public static class AssignmentLatch {
  private static final Logger logger = LoggerFactory.getLogger(AssignmentLatch.class);

  AssignmentState state = new AssignmentState();
  WatchService service;
  String directory;
  Predicate<AssignmentState> test;
  Predicate<AssignmentState> illegalStateTest;
  CountDownLatch latch;
  private String name;
  // Set by the watch callback and read by the waiting thread.
  volatile boolean illegalState = false;

  /**
   * @param service watch service used to observe the directory
   * @param directory path of the directory to watch
   * @param test predicate describing the state being waited for
   * @param illegalStateTest predicate describing a state that must not be observed
   * @param name label used in log messages and in the illegal-state exception
   */
  public AssignmentLatch(WatchService service, String directory, Predicate<AssignmentState> test, Predicate<AssignmentState> illegalStateTest,
      String name) {
    this.name = name;
    this.service = service;
    this.directory = directory;
    this.test = test;
    this.illegalStateTest = illegalStateTest;
    this.latch = new CountDownLatch(1);
  }

  /**
   * Watch callback: folds the event into the tracked state and re-evaluates the predicates.
   * Events arriving after the latch has opened are ignored. Exceptions thrown while
   * evaluating the predicates are logged and not rethrown.
   */
  public void handle(EtcdEvent<ClusterProcess> event) {
    if (latch.getCount() == 0) {
      return;
    }
    applyToState(event);
    try {
      evaluatePredicates();
    } catch (Exception e) {
      logger.warn("bad latch predicate " + name, e);
    }
  }

  /** Updates the tally from one added / updated / removed event. */
  private void applyToState(EtcdEvent<ClusterProcess> event) {
    switch (event.getType()) {
      case added:
        state.assign(event.getValue().getAssignedTo());
        break;
      case updated:
        // Move the process from its previous owner to its new one.
        state.unassign(event.getPrevValue().getAssignedTo());
        state.assign(event.getValue().getAssignedTo());
        break;
      case removed:
        state.unassign(event.getPrevValue().getAssignedTo());
        break;
    }
  }

  /** Checks the illegal-state predicate first, then the expected-state predicate. */
  private void evaluatePredicates() {
    if (illegalStateTest.test(state)) {
      logger.debug("{} in illegalState {}", name, state);
      illegalState = true;
      latch.countDown();
    }
    boolean matched = test.test(state);
    if (!matched) {
      logger.debug("{} did not match {}", name, state);
      return;
    }
    logger.debug("{} did match {}", name, state);
    latch.countDown();
  }

  /**
   * Watches the directory until the latch opens or the timeout elapses. The watch is
   * always stopped before returning.
   *
   * @return true if the expected state was reached, false on timeout
   * @throws IllegalStateException if the latch opened after the illegal state was observed
   */
  public boolean await(long timeout, TimeUnit unit) throws InterruptedException {
    WatchService.Watch watch =
        service.registerDirectoryWatch(directory, new TypeReference<ClusterProcess>() {}, this::handle);
    try {
      if (!latch.await(timeout, unit)) {
        return false;
      }
      if (illegalState) {
        throw new IllegalStateException(name+" reached an illegal state.");
      }
      return true;
    } finally {
      watch.stop();
    }
  }
}
}