/*
* Copyright 2017 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
package com.github.ambry.clustermap;
import com.codahale.metrics.Counter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;
import com.github.ambry.clustermap.TestUtils.*;
import com.github.ambry.config.ClusterMapConfig;
import com.github.ambry.config.VerifiableProperties;
import com.github.ambry.utils.ByteBufferInputStream;
import com.github.ambry.utils.Utils;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import org.apache.helix.HelixManager;
import org.apache.helix.InstanceType;
import org.json.JSONObject;
import org.junit.After;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import static com.github.ambry.clustermap.TestUtils.*;
import static org.junit.Assert.*;
/**
* Tests the {@link HelixClusterManager} directly and also via the {@link CompositeClusterManager}.
*/
@RunWith(Parameterized.class)
public class HelixClusterManagerTest {
  private final HashMap<String, ZkInfo> dcsToZkInfo = new HashMap<>();
  private final String[] dcs = new String[]{"DC0", "DC1"};
  private final TestUtils.TestHardwareLayout testHardwareLayout;
  private final TestPartitionLayout testPartitionLayout;
  private final String clusterNameStatic = "HelixClusterManagerTestCluster";
  private final String clusterNamePrefixInHelix = "Ambry-";
  private final ClusterMapConfig clusterMapConfig;
  private final MockHelixCluster helixCluster;
  private final String hostname;
  private final ClusterMap clusterManager;
  private MetricRegistry metricRegistry;
  private Map<String, Gauge> gauges;
  private Map<String, Counter> counters;
  // Whether the cluster manager under test is a CompositeClusterManager (true) or a bare
  // HelixClusterManager (false). Supplied by the @Parameterized runner.
  private final boolean useComposite;

  /**
   * Run every test twice: once against {@link HelixClusterManager} directly and once via
   * {@link CompositeClusterManager}.
   * @return the parameter sets for the {@link Parameterized} runner.
   */
  @Parameterized.Parameters
  public static List<Object[]> data() {
    return Arrays.asList(new Object[][]{{false}, {true}});
  }

  /**
   * Construct the static layout files and use that to instantiate a {@link MockHelixCluster}.
   * Instantiate a {@link MockHelixManagerFactory} for use by the cluster manager.
   * @param useComposite whether or not the test are to be done for the {@link CompositeClusterManager}
   * @throws Exception
   */
  public HelixClusterManagerTest(boolean useComposite) throws Exception {
    this.useComposite = useComposite;
    Random random = new Random();
    File tempDir = Files.createTempDirectory("helixClusterManager-" + random.nextInt(1000)).toFile();
    String tempDirPath = tempDir.getAbsolutePath();
    tempDir.deleteOnExit();
    // Assign a distinct (fake) ZK port per datacenter.
    int port = 2200;
    for (String dcName : dcs) {
      dcsToZkInfo.put(dcName, new ZkInfo(tempDirPath, dcName, port++, false));
    }
    String hardwareLayoutPath = tempDirPath + File.separator + "hardwareLayoutTest.json";
    String partitionLayoutPath = tempDirPath + File.separator + "partitionLayoutTest.json";
    String zkLayoutPath = tempDirPath + File.separator + "zkLayoutPath.json";
    JSONObject zkJson = constructZkLayoutJSON(dcsToZkInfo.values());
    testHardwareLayout = constructInitialHardwareLayoutJSON(clusterNameStatic);
    testPartitionLayout = constructInitialPartitionLayoutJSON(testHardwareLayout, 3);
    // add 3 partitions with read_only state.
    testPartitionLayout.partitionState = PartitionState.READ_ONLY;
    testPartitionLayout.addNewPartitions(3);
    testPartitionLayout.partitionState = PartitionState.READ_WRITE;
    Utils.writeJsonToFile(zkJson, zkLayoutPath);
    Utils.writeJsonToFile(testHardwareLayout.getHardwareLayout().toJSONObject(), hardwareLayoutPath);
    Utils.writeJsonToFile(testPartitionLayout.getPartitionLayout().toJSONObject(), partitionLayoutPath);
    helixCluster =
        new MockHelixCluster(clusterNamePrefixInHelix, hardwareLayoutPath, partitionLayoutPath, zkLayoutPath);
    // Mirror the static layout's READ_ONLY partitions into the mock Helix cluster. The static
    // partition name is of the form "Partition[id]"; Helix uses just the bracketed id.
    for (PartitionId partitionId : testPartitionLayout.getPartitionLayout().getPartitions()) {
      if (partitionId.getPartitionState().equals(PartitionState.READ_ONLY)) {
        String partitionName = partitionId.toString();
        String helixPartitionName = partitionName.substring(partitionName.indexOf('[') + 1, partitionName.indexOf(']'));
        helixCluster.setPartitionState(helixPartitionName, PartitionState.READ_ONLY);
      }
    }
    hostname = "localhost";
    Properties props = new Properties();
    props.setProperty("clustermap.host.name", hostname);
    props.setProperty("clustermap.cluster.name", clusterNamePrefixInHelix + clusterNameStatic);
    props.setProperty("clustermap.datacenter.name", "DC0");
    props.setProperty("clustermap.dcs.zk.connect.strings", zkJson.toString(2));
    clusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
    MockHelixManagerFactory helixManagerFactory = new MockHelixManagerFactory(helixCluster);
    if (useComposite) {
      StaticClusterAgentsFactory staticClusterAgentsFactory =
          new StaticClusterAgentsFactory(clusterMapConfig, hardwareLayoutPath, partitionLayoutPath);
      metricRegistry = staticClusterAgentsFactory.getMetricRegistry();
      clusterManager = new CompositeClusterManager(staticClusterAgentsFactory.getClusterMap(),
          new HelixClusterManager(clusterMapConfig, hostname, helixManagerFactory, metricRegistry));
    } else {
      metricRegistry = new MetricRegistry();
      clusterManager = new HelixClusterManager(clusterMapConfig, hostname, helixManagerFactory, metricRegistry);
    }
  }

  /**
   * Close the cluster managers created.
   */
  @After
  public void after() {
    if (clusterManager != null) {
      clusterManager.close();
    }
  }

  /**
   * Test bad instantiation.
   * @throws Exception
   */
  @Test
  public void badInstantiationTest() throws Exception {
    // Good test happened in the constructor
    assertEquals(0L,
        metricRegistry.getGauges().get(HelixClusterManager.class.getName() + ".instantiationFailed").getValue());
    // Bad test: invalidate one ZK endpoint (port 0) and expect instantiation to fail.
    Set<ZkInfo> zkInfos = new HashSet<>(dcsToZkInfo.values());
    zkInfos.iterator().next().port = 0;
    JSONObject invalidZkJson = constructZkLayoutJSON(zkInfos);
    Properties props = new Properties();
    props.setProperty("clustermap.host.name", hostname);
    props.setProperty("clustermap.cluster.name", clusterNamePrefixInHelix + clusterNameStatic);
    props.setProperty("clustermap.datacenter.name", "DC0");
    props.setProperty("clustermap.dcs.zk.connect.strings", invalidZkJson.toString(2));
    ClusterMapConfig invalidClusterMapConfig = new ClusterMapConfig(new VerifiableProperties(props));
    metricRegistry = new MetricRegistry();
    try {
      new HelixClusterManager(invalidClusterMapConfig, hostname, new MockHelixManagerFactory(helixCluster),
          metricRegistry);
      fail("Instantiation should have failed with invalid zk addresses");
    } catch (IOException e) {
      assertEquals(1L,
          metricRegistry.getGauges().get(HelixClusterManager.class.getName() + ".instantiationFailed").getValue());
    }
  }

  /**
   * Tests all the interface methods.
   * @throws Exception
   */
  @Test
  public void basicInterfaceTest() throws Exception {
    for (String metricName : clusterManager.getMetricRegistry().getNames()) {
      System.out.println(metricName);
    }
    testPartitionReplicaConsistency();
    testInvalidPartitionId();
    testDatacenterDatanodeReplicas();
    assertStateEquivalency();
  }

  /**
   * Test that everything works as expected in the presence of liveness changes initiated by Helix itself.
   * @throws Exception
   */
  @Test
  public void helixInitiatedLivenessChangeTest() throws Exception {
    // this test is not intended for the composite cluster manager.
    if (useComposite) {
      return;
    }
    // all instances are up initially.
    assertStateEquivalency();
    // Bring one instance down in each dc.
    for (String zkAddr : helixCluster.getZkAddrs()) {
      helixCluster.bringInstanceDown(helixCluster.getUpInstances(zkAddr).get(0));
    }
    assertStateEquivalency();
    // Bring all instances down in all dcs.
    helixCluster.bringAllInstancesDown();
    assertStateEquivalency();
    // Bring one instance up in each dc.
    for (String zkAddr : helixCluster.getZkAddrs()) {
      helixCluster.bringInstanceUp(helixCluster.getDownInstances(zkAddr).get(0));
    }
    assertStateEquivalency();
  }

  /**
   * Test that everything works as expected in the presence of liveness changes initiated by clients of the cluster
   * manager.
   * @throws Exception
   */
  @Test
  public void clientInitiatedLivenessChangeTest() throws Exception {
    ReplicaId replica = clusterManager.getWritablePartitionIds().get(0).getReplicaIds().get(0);
    DataNodeId dataNode = replica.getDataNodeId();
    assertTrue(clusterManager.getReplicaIds(dataNode).contains(replica));
    DiskId disk = replica.getDiskId();

    // Verify that everything is up in the beginning.
    assertFalse(replica.isDown());
    assertEquals(HardwareState.AVAILABLE, dataNode.getState());
    assertEquals(HardwareState.AVAILABLE, disk.getState());

    // Trigger node failure events for the replica until the datanode error threshold is crossed.
    for (int i = 0; i < clusterMapConfig.clusterMapFixedTimeoutDatanodeErrorThreshold; i++) {
      clusterManager.onReplicaEvent(replica, ReplicaEventType.Node_Timeout);
    }

    // When node times out, all replicas and all disks on the node should also become unavailable.
    assertTrue(replica.isDown());
    assertEquals(HardwareState.UNAVAILABLE, dataNode.getState());
    assertEquals(HardwareState.UNAVAILABLE, disk.getState());

    // Trigger a successful event to bring the resources up.
    clusterManager.onReplicaEvent(replica, ReplicaEventType.Node_Response);
    assertFalse(replica.isDown());
    assertEquals(HardwareState.AVAILABLE, dataNode.getState());
    assertEquals(HardwareState.AVAILABLE, disk.getState());

    // Similar tests for disks.
    for (int i = 0; i < clusterMapConfig.clusterMapFixedTimeoutDiskErrorThreshold; i++) {
      clusterManager.onReplicaEvent(replica, ReplicaEventType.Disk_Error);
    }
    assertTrue(replica.isDown());
    assertEquals(HardwareState.UNAVAILABLE, disk.getState());
    // node should still be available even on disk error.
    assertEquals(HardwareState.AVAILABLE, dataNode.getState());
    clusterManager.onReplicaEvent(replica, ReplicaEventType.Disk_Ok);
    assertFalse(replica.isDown());
    assertEquals(HardwareState.AVAILABLE, dataNode.getState());
    assertEquals(HardwareState.AVAILABLE, disk.getState());

    // The following does not do anything currently.
    clusterManager.onReplicaEvent(replica, ReplicaEventType.Partition_ReadOnly);
    assertStateEquivalency();
  }

  /**
   * Test that the metrics in {@link HelixClusterManagerMetrics} are updated as expected. This also tests and ensures
   * coverage of the methods in {@link HelixClusterManager} that are used only by {@link HelixClusterManagerMetrics}.
   */
  @Test
  public void metricsTest() throws Exception {
    counters = clusterManager.getMetricRegistry().getCounters();
    gauges = clusterManager.getMetricRegistry().getGauges();

    // live instance trigger happens once initially.
    long instanceTriggerCount = dcs.length;

    // Bring one instance down in each dc in order to test the metrics more generally.
    for (String zkAddr : helixCluster.getZkAddrs()) {
      helixCluster.bringInstanceDown(helixCluster.getUpInstances(zkAddr).get(0));
      instanceTriggerCount++;
    }

    // trigger for live instance change event should have come in twice per dc - the initial one, and the one due to a
    // node brought down in each DC.
    assertEquals(instanceTriggerCount, getCounterValue("liveInstanceChangeTriggerCount"));
    assertEquals(dcs.length, getCounterValue("externalViewChangeTriggerCount"));
    assertEquals(dcs.length, getCounterValue("instanceConfigChangeTriggerCount"));
    assertEquals(helixCluster.getDataCenterCount(), getGaugeValue("datacenterCount"));
    assertEquals(helixCluster.getDownInstances().size() + helixCluster.getUpInstances().size(),
        getGaugeValue("dataNodeCount"));
    assertEquals(helixCluster.getDownInstances().size(), getGaugeValue("dataNodeDownCount"));
    assertEquals(helixCluster.getDiskCount(), getGaugeValue("diskCount"));
    assertEquals(helixCluster.getDiskDownCount(), getGaugeValue("diskDownCount"));
    assertEquals(helixCluster.getAllPartitions().size(), getGaugeValue("partitionCount"));
    assertEquals(helixCluster.getAllWritablePartitions().size(), getGaugeValue("partitionReadWriteCount"));
    assertEquals(helixCluster.getAllPartitions().size() - helixCluster.getAllWritablePartitions().size(),
        getGaugeValue("partitionSealedCount"));
    assertEquals(helixCluster.getDiskCapacity(), getGaugeValue("rawTotalCapacityBytes"));
    assertEquals(0L, getGaugeValue("isMajorityReplicasDownForAnyPartition"));
    // Per-instance resource-state gauge names use '-' where instance names use '_'.
    assertEquals(0L,
        getGaugeValue(helixCluster.getDownInstances().iterator().next().replace('_', '-') + "-DataNodeResourceState"));
    assertEquals(1L,
        getGaugeValue(helixCluster.getUpInstances().iterator().next().replace('_', '-') + "-DataNodeResourceState"));
    helixCluster.bringAllInstancesDown();
    assertEquals(1L, getGaugeValue("isMajorityReplicasDownForAnyPartition"));
    if (useComposite) {
      // Exercise the CompositeClusterManager mismatch counters.
      helixCluster.bringAllInstancesUp();
      PartitionId partition = clusterManager.getWritablePartitionIds().get(0);
      assertEquals(0L, getCounterValue("getPartitionIdFromStreamMismatchCount"));
      ReplicaId replicaId = partition.getReplicaIds().get(0);
      assertEquals(0L, getCounterValue("getReplicaIdsMismatchCount"));
      // bring the replica down.
      for (int i = 0; i < clusterMapConfig.clusterMapFixedTimeoutDiskErrorThreshold; i++) {
        clusterManager.onReplicaEvent(replicaId, ReplicaEventType.Disk_Error);
      }
      clusterManager.getWritablePartitionIds();
      // NOTE(review): the counter names asserted after this call and the getPartitionIdFromStream
      // call below look swapped relative to the methods being exercised. Left as-is since
      // changing them could alter the test's outcome in this degraded-replica state — confirm intent.
      assertEquals(0L, getCounterValue("getPartitionIdFromStreamMismatchCount"));
      InputStream partitionStream = new ByteBufferInputStream(ByteBuffer.wrap(partition.getBytes()));
      clusterManager.getPartitionIdFromStream(partitionStream);
      assertEquals(0L, getCounterValue("getWritablePartitionIdsMismatchCount"));
      clusterManager.hasDatacenter("invalid");
      clusterManager.hasDatacenter(dcs[0]);
      assertEquals(0L, getCounterValue("hasDatacenterMismatchCount"));
      DataNodeId dataNodeId = clusterManager.getDataNodeIds().get(0);
      assertEquals(0L, getCounterValue("getDataNodeIdsMismatchCount"));
      clusterManager.getDataNodeId(dataNodeId.getHostname(), dataNodeId.getPort());
      assertEquals(0L, getCounterValue("getDataNodeIdMismatchCount"));
    }
  }

  // Helpers

  /**
   * Get the counter value for the metric in {@link HelixClusterManagerMetrics} with the given suffix.
   * @param suffix the suffix of the metric that distinguishes it from other metrics in the class.
   * @return the value of the counter.
   */
  private long getCounterValue(String suffix) {
    return counters.get(HelixClusterManager.class.getName() + "." + suffix).getCount();
  }

  /**
   * Get the gauge value for the metric in {@link HelixClusterManagerMetrics} with the given suffix.
   * @param suffix the suffix of the metric that distinguishes it from other metrics in the class.
   * @return the value of the gauge.
   */
  private long getGaugeValue(String suffix) {
    return (long) gauges.get(HelixClusterManager.class.getName() + "." + suffix).getValue();
  }

  /**
   * Tests that the writable partitions returned by the {@link HelixClusterManager} is the same as the writable
   * partitions in the cluster.
   */
  private void testWritablePartitions() {
    Set<String> writableInClusterManager = new HashSet<>();
    for (PartitionId partition : clusterManager.getWritablePartitionIds()) {
      String partitionStr =
          useComposite ? ((Partition) partition).toPathString() : ((AmbryPartition) partition).toPathString();
      writableInClusterManager.add(partitionStr);
    }
    Set<String> writableInCluster = helixCluster.getWritablePartitions();
    if (writableInCluster.isEmpty()) {
      // When no partition is writable from the Helix view, fall back to the static notion of
      // writability for the comparison.
      writableInCluster = helixCluster.getAllWritablePartitions();
    }
    assertEquals(writableInCluster, writableInClusterManager);
  }

  /**
   * Tests that all partitions returned by the {@link HelixClusterManager} is equivalent to all
   * partitions in the cluster.
   */
  private void testAllPartitions() {
    Set<String> partitionsInClusterManager = new HashSet<>();
    for (PartitionId partition : clusterManager.getAllPartitionIds()) {
      String partitionStr =
          useComposite ? ((Partition) partition).toPathString() : ((AmbryPartition) partition).toPathString();
      partitionsInClusterManager.add(partitionStr);
    }
    Set<String> allPartitions = helixCluster.getAllPartitions();
    assertEquals(allPartitions, partitionsInClusterManager);
  }

  /**
   * Tests that the replica count and replica to partition id mappings as reported by the cluster manager is the same as
   * those in the cluster. Also verifies that each partition round-trips through serialization.
   */
  private void testPartitionReplicaConsistency() throws Exception {
    for (PartitionId partition : clusterManager.getWritablePartitionIds()) {
      assertEquals(partition.getReplicaIds().size(), testPartitionLayout.getTotalReplicaCount());
      InputStream partitionStream = new ByteBufferInputStream(ByteBuffer.wrap(partition.getBytes()));
      PartitionId fetchedPartition = clusterManager.getPartitionIdFromStream(partitionStream);
      assertEquals(partition, fetchedPartition);
    }
  }

  /**
   * Test that invalid partition id deserialization fails as expected.
   */
  private void testInvalidPartitionId() {
    PartitionId partition = clusterManager.getWritablePartitionIds().get(0);
    try {
      // Copy a valid serialized partition and overwrite its trailing id (the last long) with an
      // id that cannot exist in the cluster.
      byte[] fakePartition = Arrays.copyOf(partition.getBytes(), partition.getBytes().length);
      for (int i = fakePartition.length; i > fakePartition.length - Long.SIZE / Byte.SIZE; i--) {
        fakePartition[i - 1] = (byte) 0xff;
      }
      // Fix: stream the corrupted bytes. The original streamed ByteBuffer.allocate(...) — an
      // all-zero buffer — which left fakePartition unused and failed for the wrong reason.
      InputStream partitionStream = new ByteBufferInputStream(ByteBuffer.wrap(fakePartition));
      clusterManager.getPartitionIdFromStream(partitionStream);
      fail("partition id deserialization should have failed");
    } catch (IOException e) {
      // OK
    }
  }

  /**
   * Test clustermap interface methods related to datanodes and datacenter.
   */
  private void testDatacenterDatanodeReplicas() {
    for (Datacenter datacenter : testHardwareLayout.getHardwareLayout().getDatacenters()) {
      assertTrue(clusterManager.hasDatacenter(datacenter.getName()));
      for (DataNode dataNode : datacenter.getDataNodes()) {
        DataNodeId dataNodeId = clusterManager.getDataNodeId(dataNode.getHostname(), dataNode.getPort());
        assertEquals(dataNode.toString(), dataNodeId.toString());
        if (!useComposite) {
          // A bare HelixClusterManager must reject datanodes belonging to the static cluster map.
          try {
            clusterManager.getReplicaIds(dataNode);
            fail("HelixClusterManager methods should throw when passed in a static manager datanode");
          } catch (IllegalArgumentException e) {
            // OK
          }
        } else {
          clusterManager.getReplicaIds(dataNode);
        }
        for (ReplicaId replica : clusterManager.getReplicaIds(dataNodeId)) {
          assertEquals(dataNodeId, replica.getDataNodeId());
        }
      }
    }
  }

  /**
   * Assert that the state of datanodes in the cluster manager's view are consistent with their actual states in the
   * cluster.
   */
  private void assertStateEquivalency() {
    Set<String> upInstancesInCluster = helixCluster.getUpInstances();
    Set<String> downInstancesInCluster = helixCluster.getDownInstances();
    Set<String> upInstancesInClusterManager = new HashSet<>();
    Set<String> downInstancesInClusterManager = new HashSet<>();
    for (DataNodeId dataNode : clusterManager.getDataNodeIds()) {
      if (dataNode.getState() == HardwareState.UNAVAILABLE) {
        downInstancesInClusterManager.add(ClusterMapUtils.getInstanceName(dataNode.getHostname(), dataNode.getPort()));
      } else {
        upInstancesInClusterManager.add(ClusterMapUtils.getInstanceName(dataNode.getHostname(), dataNode.getPort()));
      }
    }
    assertEquals(downInstancesInCluster, downInstancesInClusterManager);
    assertEquals(upInstancesInCluster, upInstancesInClusterManager);
    testWritablePartitions();
    testAllPartitions();
  }

  /**
   * A Mock implementation of {@link HelixFactory} that returns the {@link MockHelixManager}
   */
  private static class MockHelixManagerFactory extends HelixFactory {
    private final MockHelixCluster helixCluster;

    /**
     * Construct this factory
     * @param helixCluster the {@link MockHelixCluster} that this factory's manager will be associated with.
     */
    MockHelixManagerFactory(MockHelixCluster helixCluster) {
      this.helixCluster = helixCluster;
    }

    /**
     * Return a {@link MockHelixManager}
     * @param clusterName the name of the cluster for which the manager is to be gotten.
     * @param instanceName the name of the instance on whose behalf the manager is to be gotten.
     * @param instanceType the {@link InstanceType} of the requester.
     * @param zkAddr the address identifying the zk service to which this request is to be made.
     * @return the {@link MockHelixManager}
     */
    HelixManager getZKHelixManager(String clusterName, String instanceName, InstanceType instanceType, String zkAddr) {
      if (helixCluster.getZkAddrs().contains(zkAddr)) {
        return new MockHelixManager(instanceName, instanceType, zkAddr, helixCluster);
      } else {
        throw new IllegalArgumentException("Invalid ZkAddr");
      }
    }
  }
}