/*
* Copyright 2017 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
package com.github.ambry.clustermap;
import com.codahale.metrics.MetricRegistry;
import com.github.ambry.config.ClusterMapConfig;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import org.apache.helix.ExternalViewChangeListener;
import org.apache.helix.HelixAdmin;
import org.apache.helix.HelixManager;
import org.apache.helix.InstanceConfigChangeListener;
import org.apache.helix.InstanceType;
import org.apache.helix.LiveInstanceChangeListener;
import org.apache.helix.NotificationContext;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.model.LiveInstance;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static com.github.ambry.clustermap.ClusterMapUtils.*;
/**
* An implementation of {@link ClusterMap} that makes use of Helix to dynamically manage the cluster information.
*
* @see <a href="http://helix.apache.org">http://helix.apache.org</a>
*/
class HelixClusterManager implements ClusterMap {
  private final Logger logger = LoggerFactory.getLogger(getClass());
  private final String clusterName;
  private final MetricRegistry metricRegistry;
  private final ClusterMapConfig clusterMapConfig;
  // Per-datacenter ZK/Helix bookkeeping; populated in the constructor, cleared in close().
  private final Map<String, DcZkInfo> dcToDcZkInfo = new HashMap<>();
  // Topology indexes. Concurrent maps because listener callbacks may read them while
  // API methods iterate.
  private final Map<String, AmbryPartition> partitionNameToAmbryPartition = new ConcurrentHashMap<>();
  private final Map<String, AmbryDataNode> instanceNameToAmbryDataNode = new ConcurrentHashMap<>();
  private final Map<AmbryPartition, Set<AmbryReplica>> ambryPartitionToAmbryReplicas = new ConcurrentHashMap<>();
  private final Map<AmbryDataNode, Set<AmbryReplica>> ambryDataNodeToAmbryReplicas = new ConcurrentHashMap<>();
  private final Map<AmbryDataNode, Set<AmbryDisk>> ambryDataNodeToAmbryDisks = new ConcurrentHashMap<>();
  // Serialized-partition-id bytes -> partition, used to resolve partitions read off the wire.
  private final Map<ByteBuffer, AmbryPartition> partitionMap = new ConcurrentHashMap<>();
  // Cluster-wide capacity aggregates, computed once during initialize().
  private long clusterWideRawCapacityBytes;
  private long clusterWideAllocatedRawCapacityBytes;
  // Fixed field-name typo: was "clusterWiseAllocatedUsableCapacityBytes" — every sibling
  // aggregate uses the "clusterWide" prefix.
  private long clusterWideAllocatedUsableCapacityBytes;
  private final HelixClusterManagerCallback helixClusterManagerCallback;
  final HelixClusterManagerMetrics helixClusterManagerMetrics;

  /**
   * Instantiate a HelixClusterManager.
   * @param clusterMapConfig the {@link ClusterMapConfig} associated with this manager.
   * @param instanceName the String representation of the instance associated with this manager.
   * @throws IOException if there is an error in parsing the clusterMapConfig or in connecting with the associated
   *                     remote Zookeeper services.
   */
  HelixClusterManager(ClusterMapConfig clusterMapConfig, String instanceName, HelixFactory helixFactory,
      MetricRegistry metricRegistry) throws IOException {
    this.clusterMapConfig = clusterMapConfig;
    this.metricRegistry = metricRegistry;
    clusterName = clusterMapConfig.clusterMapClusterName;
    helixClusterManagerCallback = new HelixClusterManagerCallback();
    helixClusterManagerMetrics = new HelixClusterManagerMetrics(metricRegistry, helixClusterManagerCallback);
    try {
      Map<String, String> dataCenterToZkAddress =
          parseZkJsonAndPopulateZkInfo(clusterMapConfig.clusterMapDcsZkConnectStrings);
      // Connect to every datacenter's ZK as a Helix spectator before pulling any state.
      for (Map.Entry<String, String> entry : dataCenterToZkAddress.entrySet()) {
        String dcName = entry.getKey();
        String zkConnectStr = entry.getValue();
        HelixManager manager =
            helixFactory.getZKHelixManager(clusterName, instanceName, InstanceType.SPECTATOR, zkConnectStr);
        logger.info("Connecting to Helix manager at {}", zkConnectStr);
        manager.connect();
        logger.info("Established connection");
        ClusterChangeListener clusterChangeListener = new ClusterChangeListener();
        DcZkInfo dcZkInfo = new DcZkInfo(dcName, zkConnectStr, manager, clusterChangeListener);
        dcToDcZkInfo.put(dcName, dcZkInfo);
      }
      // Now initialize by pulling information from Helix.
      initialize();
      // Now register listeners to get notified on change. Registration is done only after
      // initialize() so the listener's allInstances set is fully populated first.
      for (DcZkInfo dcZkInfo : dcToDcZkInfo.values()) {
        logger.info("Registering listeners for Helix manager at {}", dcZkInfo.zkConnectStr);
        dcZkInfo.helixManager.addExternalViewChangeListener(dcZkInfo.clusterChangeListener);
        dcZkInfo.helixManager.addInstanceConfigChangeListener(dcZkInfo.clusterChangeListener);
        dcZkInfo.helixManager.addLiveInstanceChangeListener(dcZkInfo.clusterChangeListener);
        logger.info("Registered, now waiting for initial calls");
        // Block until all three listeners have received their INIT notification so that
        // node liveness is accurate by the time this constructor returns.
        dcZkInfo.clusterChangeListener.waitForInitialization();
        logger.info("Received initial calls for every listener from this Helix manager");
      }
    } catch (Exception e) {
      helixClusterManagerMetrics.initializeInstantiationMetric(false);
      // Best-effort cleanup of any connections established before the failure.
      close();
      throw new IOException("Encountered exception while parsing json, connecting to Helix or initializing", e);
    }
    helixClusterManagerMetrics.initializeInstantiationMetric(true);
    helixClusterManagerMetrics.initializeDatacenterMetrics();
    helixClusterManagerMetrics.initializeDataNodeMetrics();
    helixClusterManagerMetrics.initializeDiskMetrics();
    helixClusterManagerMetrics.initializePartitionMetrics();
    helixClusterManagerMetrics.initializeCapacityMetrics();
  }

  /**
   * Populate the initial data from the admin connection. Create nodes, disks, partitions and replicas for the entire
   * cluster, then compute the cluster-wide capacity aggregates.
   * @throws Exception if creation of {@link AmbryDataNode}s or {@link AmbryDisk}s throw an Exception.
   */
  private void initialize() throws Exception {
    for (DcZkInfo dcZkInfo : dcToDcZkInfo.values()) {
      logger.info("Initializing cluster information from {}", dcZkInfo.zkConnectStr);
      HelixAdmin admin = dcZkInfo.helixManager.getClusterManagmentTool();
      for (String instanceName : admin.getInstancesInCluster(clusterName)) {
        logger.info("Adding node {} and its disks and replicas", instanceName);
        InstanceConfig instanceConfig = admin.getInstanceConfig(clusterName, instanceName);
        // parseInt instead of Integer.valueOf: the primitive parse avoids needless boxing.
        AmbryDataNode datanode = new AmbryDataNode(dcZkInfo.dcName, clusterMapConfig, instanceConfig.getHostName(),
            Integer.parseInt(instanceConfig.getPort()), getRackId(instanceConfig), getSslPortStr(instanceConfig));
        initializeDisksAndReplicasOnNode(datanode, instanceConfig);
        instanceNameToAmbryDataNode.put(instanceName, datanode);
        // Record the instance so the liveness listener can mark it up/down later.
        dcZkInfo.clusterChangeListener.allInstances.add(instanceName);
      }
      logger.info("Initialized cluster information from {}", dcZkInfo.zkConnectStr);
    }
    // Raw capacity: sum of all disk capacities across the cluster.
    for (Set<AmbryDisk> disks : ambryDataNodeToAmbryDisks.values()) {
      for (AmbryDisk disk : disks) {
        clusterWideRawCapacityBytes += disk.getRawCapacityInBytes();
      }
    }
    // Allocated raw capacity counts every replica; allocated usable capacity counts each
    // partition once (replicas of a partition are copies, not extra usable space).
    for (Set<AmbryReplica> partitionReplicas : ambryPartitionToAmbryReplicas.values()) {
      long replicaCapacity = partitionReplicas.iterator().next().getCapacityInBytes();
      clusterWideAllocatedRawCapacityBytes += replicaCapacity * partitionReplicas.size();
      clusterWideAllocatedUsableCapacityBytes += replicaCapacity;
    }
  }

  /**
   * Initialize the disks and replicas on the given node. Create partitions if this is the first time a replica of
   * that partition is being constructed.
   * @param datanode the {@link AmbryDataNode} that is being initialized.
   * @param instanceConfig the {@link InstanceConfig} associated with this datanode.
   * @throws Exception if creation of {@link AmbryDisk} throws an Exception.
   */
  private void initializeDisksAndReplicasOnNode(AmbryDataNode datanode, InstanceConfig instanceConfig)
      throws Exception {
    ambryDataNodeToAmbryReplicas.put(datanode, new HashSet<>());
    ambryDataNodeToAmbryDisks.put(datanode, new HashSet<>());
    List<String> sealedReplicas = getSealedReplicas(instanceConfig);
    // Each map field in the InstanceConfig record describes one disk, keyed by mount path.
    Map<String, Map<String, String>> diskInfos = instanceConfig.getRecord().getMapFields();
    for (Map.Entry<String, Map<String, String>> entry : diskInfos.entrySet()) {
      String mountPath = entry.getKey();
      Map<String, String> diskInfo = entry.getValue();
      long capacityBytes = Long.parseLong(diskInfo.get(DISK_CAPACITY_STR));
      HardwareState state =
          diskInfo.get(DISK_STATE).equals(AVAILABLE_STR) ? HardwareState.AVAILABLE : HardwareState.UNAVAILABLE;
      // Constants below use the static import of ClusterMapUtils.* (consistent with
      // DISK_CAPACITY_STR / DISK_STATE above).
      String replicasStr = diskInfo.get(REPLICAS_STR);
      // Create disk
      AmbryDisk disk = new AmbryDisk(clusterMapConfig, datanode, mountPath, state, capacityBytes);
      ambryDataNodeToAmbryDisks.get(datanode).add(disk);
      if (!replicasStr.isEmpty()) {
        List<String> replicaInfoList = Arrays.asList(replicasStr.split(REPLICAS_DELIM_STR));
        for (String replicaInfo : replicaInfoList) {
          String[] info = replicaInfo.split(REPLICAS_STR_SEPARATOR);
          // partition name and replica name are the same.
          String partitionName = info[0];
          long replicaCapacity = Long.parseLong(info[1]);
          AmbryPartition partition = partitionNameToAmbryPartition.get(partitionName);
          if (partition == null) {
            // Create partition: first time any replica of this partition is seen.
            partition = new AmbryPartition(Long.parseLong(partitionName), helixClusterManagerCallback);
            partitionNameToAmbryPartition.put(partitionName, partition);
            ambryPartitionToAmbryReplicas.put(partition, new HashSet<>());
            partitionMap.put(ByteBuffer.wrap(partition.getBytes()), partition);
          } else {
            // Partition already known: validate no duplicate on this node and capacity match.
            ensurePartitionAbsenceOnNodeAndValidateCapacity(partition, datanode, replicaCapacity);
          }
          if (sealedReplicas.contains(partitionName)) {
            partition.setState(PartitionState.READ_ONLY);
          }
          // Create replica
          AmbryReplica replica = new AmbryReplica(partition, disk, replicaCapacity);
          ambryPartitionToAmbryReplicas.get(partition).add(replica);
          ambryDataNodeToAmbryReplicas.get(datanode).add(replica);
        }
      }
    }
  }

  /**
   * Ensure that the given partition is absent on the given datanode. This is called as part of an inline validation
   * done to ensure that two replicas of the same partition do not exist on the same datanode, and that all replicas
   * of a partition have the same capacity.
   * @param partition the {@link AmbryPartition} to check.
   * @param datanode the {@link AmbryDataNode} on which to check.
   * @param expectedReplicaCapacity the capacity expected for the replicas of the partition.
   * @throws IllegalStateException if a replica of the partition already exists on the node, or the capacity differs.
   */
  private void ensurePartitionAbsenceOnNodeAndValidateCapacity(AmbryPartition partition, AmbryDataNode datanode,
      long expectedReplicaCapacity) {
    for (AmbryReplica replica : ambryPartitionToAmbryReplicas.get(partition)) {
      if (replica.getDataNodeId().equals(datanode)) {
        throw new IllegalStateException("Replica already exists on " + datanode + " for " + partition);
      } else if (replica.getCapacityInBytes() != expectedReplicaCapacity) {
        throw new IllegalStateException("Expected replica capacity " + expectedReplicaCapacity + " is different from "
            + "the capacity of an existing replica " + replica.getCapacityInBytes());
      }
    }
  }

  @Override
  public boolean hasDatacenter(String datacenterName) {
    return dcToDcZkInfo.containsKey(datacenterName);
  }

  @Override
  public AmbryDataNode getDataNodeId(String hostname, int port) {
    return instanceNameToAmbryDataNode.get(getInstanceName(hostname, port));
  }

  @Override
  public List<AmbryReplica> getReplicaIds(DataNodeId dataNodeId) {
    if (!(dataNodeId instanceof AmbryDataNode)) {
      throw new IllegalArgumentException("Incompatible type passed in");
    }
    AmbryDataNode datanode = (AmbryDataNode) dataNodeId;
    return new ArrayList<>(ambryDataNodeToAmbryReplicas.get(datanode));
  }

  @Override
  public List<AmbryDataNode> getDataNodeIds() {
    return new ArrayList<>(instanceNameToAmbryDataNode.values());
  }

  @Override
  public MetricRegistry getMetricRegistry() {
    return metricRegistry;
  }

  @Override
  public void onReplicaEvent(ReplicaId replicaId, ReplicaEventType event) {
    AmbryReplica replica = (AmbryReplica) replicaId;
    switch (event) {
      case Node_Response:
        replica.getDataNodeId().onNodeResponse();
        break;
      case Node_Timeout:
        replica.getDataNodeId().onNodeTimeout();
        break;
      case Disk_Error:
        replica.getDiskId().onDiskError();
        break;
      case Disk_Ok:
        replica.getDiskId().onDiskOk();
        break;
      case Partition_ReadOnly:
        replica.getPartitionId().onPartitionReadOnly();
        break;
    }
  }

  @Override
  public PartitionId getPartitionIdFromStream(InputStream stream) throws IOException {
    byte[] partitionBytes = AmbryPartition.readPartitionBytesFromStream(stream);
    AmbryPartition partition = partitionMap.get(ByteBuffer.wrap(partitionBytes));
    if (partition == null) {
      throw new IOException("Partition id from stream is null");
    }
    return partition;
  }

  /**
   * @return list of partition ids that are in {@link PartitionState#READ_WRITE}. Partitions with all replicas up are
   *         preferred; the full writable list is returned only when no fully-healthy writable partition exists.
   */
  @Override
  public List<AmbryPartition> getWritablePartitionIds() {
    List<AmbryPartition> writablePartitions = new ArrayList<>();
    List<AmbryPartition> healthyWritablePartitions = new ArrayList<>();
    for (AmbryPartition partition : partitionNameToAmbryPartition.values()) {
      if (partition.getPartitionState() == PartitionState.READ_WRITE) {
        writablePartitions.add(partition);
        if (areAllReplicasForPartitionUp(partition)) {
          healthyWritablePartitions.add(partition);
        }
      }
    }
    return healthyWritablePartitions.isEmpty() ? writablePartitions : healthyWritablePartitions;
  }

  /**
   * @return list of all partition ids in the cluster
   */
  @Override
  public List<AmbryPartition> getAllPartitionIds() {
    return new ArrayList<>(partitionNameToAmbryPartition.values());
  }

  /**
   * Disconnect from the HelixManagers associated with each and every datacenter.
   */
  @Override
  public void close() {
    for (DcZkInfo dcZkInfo : dcToDcZkInfo.values()) {
      dcZkInfo.helixManager.disconnect();
    }
    dcToDcZkInfo.clear();
  }

  /**
   * Check whether all replicas of the given {@link AmbryPartition} are up.
   * @param partition the {@link AmbryPartition} to check.
   * @return true if all associated replicas are up; false otherwise.
   */
  private boolean areAllReplicasForPartitionUp(AmbryPartition partition) {
    for (AmbryReplica replica : ambryPartitionToAmbryReplicas.get(partition)) {
      if (replica.isDown()) {
        return false;
      }
    }
    return true;
  }

  /**
   * Return the unique {@link AmbryReplica} for a {@link AmbryPartition} on a {@link AmbryDataNode}.
   * @param hostname the hostname associated with the {@link AmbryDataNode}.
   * @param port the port associated with the {@link AmbryDataNode}.
   * @param partitionString the partition id string associated with the {@link AmbryPartition}.
   * @return the {@link AmbryReplica} associated with the given parameters, or null if no match is found.
   */
  AmbryReplica getReplicaForPartitionOnNode(String hostname, int port, String partitionString) {
    for (AmbryReplica replica : ambryDataNodeToAmbryReplicas.get(getDataNodeId(hostname, port))) {
      if (replica.getPartitionId().toString().equals(partitionString)) {
        return replica;
      }
    }
    return null;
  }

  /**
   * An instance of this object is used to register as listener for Helix related changes in each datacenter.
   */
  private class ClusterChangeListener
      implements ExternalViewChangeListener, InstanceConfigChangeListener, LiveInstanceChangeListener {
    // All instance names in this listener's datacenter; populated during initialize().
    final Set<String> allInstances = new HashSet<>();
    private boolean liveInstanceChangeTriggered = false;
    private boolean externalViewChangeTriggered = false;
    private boolean configChangeTriggered = false;
    private final Object notificationLock = new Object();
    // Counts down once per listener type on its first notification; see waitForInitialization().
    private final CountDownLatch initialized = new CountDownLatch(3);

    /**
     * Triggered whenever there is a change in the list of live instances.
     * @param liveInstances the list of all live instances (not a change set) at the time of this call.
     * @param changeContext the {@link NotificationContext} associated.
     */
    @Override
    public void onLiveInstanceChange(List<LiveInstance> liveInstances, NotificationContext changeContext) {
      synchronized (notificationLock) {
        if (changeContext.getType() == NotificationContext.Type.INIT) {
          logger.info("Received initial notification for live instance change");
        }
        logger.trace("Live instance change triggered with: {}", liveInstances);
        Set<String> liveInstancesSet = new HashSet<>();
        for (LiveInstance liveInstance : liveInstances) {
          liveInstancesSet.add(liveInstance.getInstanceName());
        }
        // Mark every known instance AVAILABLE or UNAVAILABLE based on current liveness.
        for (String instanceName : allInstances) {
          if (liveInstancesSet.contains(instanceName)) {
            instanceNameToAmbryDataNode.get(instanceName).setState(HardwareState.AVAILABLE);
          } else {
            instanceNameToAmbryDataNode.get(instanceName).setState(HardwareState.UNAVAILABLE);
          }
        }
        if (!liveInstanceChangeTriggered) {
          liveInstanceChangeTriggered = true;
          initialized.countDown();
        }
        helixClusterManagerMetrics.liveInstanceChangeTriggerCount.inc();
      }
    }

    /**
     * Triggered whenever the external view changes. Currently only tracked for metrics and initialization.
     * @param externalViewList the list of {@link ExternalView}s at the time of this call.
     * @param changeContext the {@link NotificationContext} associated.
     */
    @Override
    public void onExternalViewChange(List<ExternalView> externalViewList, NotificationContext changeContext) {
      synchronized (notificationLock) {
        if (changeContext.getType() == NotificationContext.Type.INIT) {
          logger.info("Received initial notification for external view change");
        }
        logger.trace("ExternalView change triggered with: {}", externalViewList);
        // No action taken at this time.
        if (!externalViewChangeTriggered) {
          externalViewChangeTriggered = true;
          initialized.countDown();
        }
        helixClusterManagerMetrics.externalViewChangeTriggerCount.inc();
      }
    }

    /**
     * Triggered whenever an instance config changes. Currently only tracked for metrics and initialization.
     * @param configs the list of {@link InstanceConfig}s at the time of this call.
     * @param changeContext the {@link NotificationContext} associated.
     */
    @Override
    public void onInstanceConfigChange(List<InstanceConfig> configs, NotificationContext changeContext) {
      synchronized (notificationLock) {
        if (changeContext.getType() == NotificationContext.Type.INIT) {
          logger.info("Received initial notification for instance config change");
        }
        logger.trace("Config change triggered with: {}", configs);
        // No action taken at this time. Going forward, changes like marking partitions back to read-write will go here.
        if (!configChangeTriggered) {
          configChangeTriggered = true;
          initialized.countDown();
        }
        helixClusterManagerMetrics.instanceConfigChangeTriggerCount.inc();
      }
    }

    /**
     * Wait until the first set of notifications come in for live instance change, external view change and config
     * changes.
     * @throws InterruptedException if the wait gets interrupted unexpectedly.
     */
    void waitForInitialization() throws InterruptedException {
      initialized.await();
    }
  }

  /**
   * Class that stores all ZK related information associated with a datacenter.
   */
  private static class DcZkInfo {
    final String dcName;
    final String zkConnectStr;
    final HelixManager helixManager;
    final ClusterChangeListener clusterChangeListener;

    /**
     * Construct a DcZkInfo object with the given parameters.
     * @param dcName the associated datacenter name.
     * @param zkConnectStr the associated ZK connect string for this datacenter.
     * @param helixManager the associated {@link HelixManager} for this datacenter.
     * @param clusterChangeListener the associated {@link ClusterChangeListener} for this datacenter.
     */
    DcZkInfo(String dcName, String zkConnectStr, HelixManager helixManager,
        ClusterChangeListener clusterChangeListener) {
      this.dcName = dcName;
      this.zkConnectStr = zkConnectStr;
      this.helixManager = helixManager;
      this.clusterChangeListener = clusterChangeListener;
    }
  }

  /**
   * A callback class used to query information from the {@link HelixClusterManager}
   */
  class HelixClusterManagerCallback implements ClusterManagerCallback {
    /**
     * Get all replica ids associated with the given {@link AmbryPartition}
     * @param partition the {@link AmbryPartition} for which to get the list of replicas.
     * @return the list of {@link AmbryReplica}s associated with the given partition.
     */
    @Override
    public List<AmbryReplica> getReplicaIdsForPartition(AmbryPartition partition) {
      return new ArrayList<>(ambryPartitionToAmbryReplicas.get(partition));
    }

    /**
     * @return the count of datacenters in this cluster.
     */
    long getDatacenterCount() {
      return dcToDcZkInfo.size();
    }

    /**
     * @return a collection of datanodes in this cluster.
     */
    Collection<AmbryDataNode> getDatanodes() {
      return new ArrayList<>(instanceNameToAmbryDataNode.values());
    }

    /**
     * @return the count of the datanodes in this cluster.
     */
    long getDatanodeCount() {
      // size() on the map directly; values().size() was an unnecessary indirection.
      return instanceNameToAmbryDataNode.size();
    }

    /**
     * @return the count of datanodes in this cluster that are down.
     */
    long getDownDatanodesCount() {
      long count = 0;
      for (AmbryDataNode datanode : instanceNameToAmbryDataNode.values()) {
        if (datanode.getState() == HardwareState.UNAVAILABLE) {
          count++;
        }
      }
      return count;
    }

    /**
     * @return a collection of all the disks in this cluster.
     */
    Collection<AmbryDisk> getDisks() {
      List<AmbryDisk> disksToReturn = new ArrayList<>();
      for (Set<AmbryDisk> disks : ambryDataNodeToAmbryDisks.values()) {
        disksToReturn.addAll(disks);
      }
      return disksToReturn;
    }

    /**
     * @return the count of disks in this cluster.
     */
    long getDiskCount() {
      long count = 0;
      for (Set<AmbryDisk> disks : ambryDataNodeToAmbryDisks.values()) {
        count += disks.size();
      }
      return count;
    }

    /**
     * @return the count of disks in this cluster that are down.
     */
    long getDownDisksCount() {
      long count = 0;
      for (Set<AmbryDisk> disks : ambryDataNodeToAmbryDisks.values()) {
        for (AmbryDisk disk : disks) {
          if (disk.getState() == HardwareState.UNAVAILABLE) {
            count++;
          }
        }
      }
      return count;
    }

    /**
     * @return a collection of partitions in this cluster.
     */
    Collection<AmbryPartition> getPartitions() {
      return new ArrayList<>(partitionMap.values());
    }

    /**
     * @return the count of partitions in this cluster.
     */
    long getPartitionCount() {
      return partitionMap.size();
    }

    /**
     * @return the count of partitions in this cluster that are in read-write state.
     */
    long getPartitionReadWriteCount() {
      long count = 0;
      for (AmbryPartition partition : partitionMap.values()) {
        if (partition.getPartitionState() == PartitionState.READ_WRITE) {
          count++;
        }
      }
      return count;
    }

    /**
     * @return the count of partitions that are in sealed (read-only) state.
     */
    long getPartitionSealedCount() {
      long count = 0;
      for (AmbryPartition partition : partitionMap.values()) {
        if (partition.getPartitionState() == PartitionState.READ_ONLY) {
          count++;
        }
      }
      return count;
    }

    /**
     * @return the cluster wide raw capacity in bytes.
     */
    long getRawCapacity() {
      return clusterWideRawCapacityBytes;
    }

    /**
     * @return the cluster wide allocated raw capacity in bytes.
     */
    long getAllocatedRawCapacity() {
      return clusterWideAllocatedRawCapacityBytes;
    }

    /**
     * @return the cluster wide allocated usable capacity in bytes.
     */
    long getAllocatedUsableCapacity() {
      return clusterWideAllocatedUsableCapacityBytes;
    }
  }
}