/**
 * Copyright 2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
package com.github.ambry.clustermap;

import com.codahale.metrics.MetricRegistry;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.json.JSONException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.github.ambry.utils.Utils.*;


/**
 * StaticClusterManager allows components in Ambry to query the topology. This covers the {@link HardwareLayout} and
 * the {@link PartitionLayout}.
 */
class StaticClusterManager implements ClusterMap {
  protected final HardwareLayout hardwareLayout;
  protected final PartitionLayout partitionLayout;
  private final MetricRegistry metricRegistry;
  private final ClusterMapMetrics clusterMapMetrics;

  private final Logger logger = LoggerFactory.getLogger(getClass());

  /**
   * How many data nodes to put in a random sample for partition allocation. 2 node samples provided the best balance
   * of speed and allocation quality in testing. Larger samples (e.g. 3 nodes) took longer to generate but did not
   * improve the quality of the allocated partitions.
   */
  private static final int NUM_CHOICES = 2;

  StaticClusterManager(PartitionLayout partitionLayout, MetricRegistry metricRegistry) {
    logger.trace("StaticClusterManager {}", partitionLayout);
    this.hardwareLayout = partitionLayout.getHardwareLayout();
    this.partitionLayout = partitionLayout;
    this.metricRegistry = metricRegistry;
    this.clusterMapMetrics = new ClusterMapMetrics(this.hardwareLayout, this.partitionLayout, this.metricRegistry);
  }

  void persist(String hardwareLayoutPath, String partitionLayoutPath) throws IOException, JSONException {
    logger.trace("persist {}, {}", hardwareLayoutPath, partitionLayoutPath);
    writeJsonToFile(hardwareLayout.toJSONObject(), hardwareLayoutPath);
    writeJsonToFile(partitionLayout.toJSONObject(), partitionLayoutPath);
  }

  // Implementation of ClusterMap interface
  // --------------------------------------

  @Override
  public List<PartitionId> getWritablePartitionIds() {
    return partitionLayout.getWritablePartitions();
  }

  @Override
  public List<PartitionId> getAllPartitionIds() {
    return partitionLayout.getPartitions();
  }

  @Override
  public PartitionId getPartitionIdFromStream(InputStream stream) throws IOException {
    PartitionId partitionId = partitionLayout.getPartition(stream);
    if (partitionId == null) {
      throw new IOException("Partition id from stream is null");
    }
    return partitionId;
  }

  @Override
  public boolean hasDatacenter(String datacenterName) {
    return hardwareLayout.findDatacenter(datacenterName) != null;
  }

  @Override
  public DataNodeId getDataNodeId(String hostname, int port) {
    return hardwareLayout.findDataNode(hostname, port);
  }

  @Override
  public List<ReplicaId> getReplicaIds(DataNodeId dataNodeId) {
    List<Replica> replicas = getReplicas(dataNodeId);
    return new ArrayList<ReplicaId>(replicas);
  }
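  /**
   * Gets all {@link Replica}s that reside on the given data node by scanning the replica lists of all partitions.
   *
   * @param dataNodeId the {@link DataNodeId} to find replicas for
   * @return the list of {@link Replica}s on the given node
   */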
  List<Replica> getReplicas(DataNodeId dataNodeId) {
    List<Replica> replicas = new ArrayList<Replica>();
    for (PartitionId partition : partitionLayout.getPartitions()) {
      for (Replica replica : ((Partition) partition).getReplicas()) {
        if (replica.getDataNodeId().equals(dataNodeId)) {
          replicas.add(replica);
        }
      }
    }
    return replicas;
  }

  @Override
  public List<DataNodeId> getDataNodeIds() {
    List<DataNodeId> dataNodeIds = new ArrayList<DataNodeId>();
    for (Datacenter datacenter : hardwareLayout.getDatacenters()) {
      dataNodeIds.addAll(datacenter.getDataNodes());
    }
    return dataNodeIds;
  }

  @Override
  public MetricRegistry getMetricRegistry() {
    return metricRegistry;
  }

  // Administrative API
  // -----------------------

  long getRawCapacityInBytes() {
    return hardwareLayout.getRawCapacityInBytes();
  }

  long getAllocatedRawCapacityInBytes() {
    return partitionLayout.getAllocatedRawCapacityInBytes();
  }

  long getAllocatedUsableCapacityInBytes() {
    return partitionLayout.getAllocatedUsableCapacityInBytes();
  }

  long getAllocatedRawCapacityInBytes(Datacenter datacenter) {
    long allocatedRawCapacityInBytes = 0;
    for (PartitionId partition : partitionLayout.getPartitions()) {
      for (Replica replica : ((Partition) partition).getReplicas()) {
        Disk disk = (Disk) replica.getDiskId();
        if (disk.getDataNode().getDatacenter().equals(datacenter)) {
          allocatedRawCapacityInBytes += replica.getCapacityInBytes();
        }
      }
    }
    return allocatedRawCapacityInBytes;
  }

  long getAllocatedRawCapacityInBytes(DataNodeId dataNode) {
    long allocatedRawCapacityInBytes = 0;
    for (PartitionId partition : partitionLayout.getPartitions()) {
      for (Replica replica : ((Partition) partition).getReplicas()) {
        Disk disk = (Disk) replica.getDiskId();
        if (disk.getDataNode().equals(dataNode)) {
          allocatedRawCapacityInBytes += replica.getCapacityInBytes();
        }
      }
    }
    return allocatedRawCapacityInBytes;
  }

  long getAllocatedRawCapacityInBytes(Disk disk) {
    long allocatedRawCapacityInBytes = 0;
    for (PartitionId partition : partitionLayout.getPartitions()) {
      for (Replica replica : ((Partition) partition).getReplicas()) {
        Disk currentDisk = (Disk) replica.getDiskId();
        if (currentDisk.equals(disk)) {
          allocatedRawCapacityInBytes += replica.getCapacityInBytes();
        }
      }
    }
    return allocatedRawCapacityInBytes;
  }

  long getUnallocatedRawCapacityInBytes() {
    return getRawCapacityInBytes() - getAllocatedRawCapacityInBytes();
  }

  long getUnallocatedRawCapacityInBytes(Datacenter datacenter) {
    return datacenter.getRawCapacityInBytes() - getAllocatedRawCapacityInBytes(datacenter);
  }

  long getUnallocatedRawCapacityInBytes(DataNode dataNode) {
    return dataNode.getRawCapacityInBytes() - getAllocatedRawCapacityInBytes(dataNode);
  }

  long getUnallocatedRawCapacityInBytes(Disk disk) {
    return disk.getRawCapacityInBytes() - getAllocatedRawCapacityInBytes(disk);
  }

  DataNode getDataNodeWithMostUnallocatedRawCapacity(Datacenter dc, Set<DataNode> nodesToExclude) {
    DataNode maxCapacityNode = null;
    List<DataNode> dataNodes = dc.getDataNodes();
    for (DataNode dataNode : dataNodes) {
      if (!nodesToExclude.contains(dataNode) && (maxCapacityNode == null
          || getUnallocatedRawCapacityInBytes(dataNode) > getUnallocatedRawCapacityInBytes(maxCapacityNode))) {
        maxCapacityNode = dataNode;
      }
    }
    return maxCapacityNode;
  }

  Disk getDiskWithMostUnallocatedRawCapacity(DataNode node, long minCapacity) {
    Disk maxCapacityDisk = null;
    List<Disk> disks = node.getDisks();
    for (Disk disk : disks) {
      if ((maxCapacityDisk == null || getUnallocatedRawCapacityInBytes(disk) > getUnallocatedRawCapacityInBytes(
          maxCapacityDisk)) && getUnallocatedRawCapacityInBytes(disk) >= minCapacity) {
        maxCapacityDisk = disk;
      }
    }
    return maxCapacityDisk;
  }
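  /**
   * Adds a new partition to the {@link PartitionLayout}, with its replicas placed on the given disks.
   *
   * @param disks the {@link Disk}s on which to place the new partition's replicas
   * @param replicaCapacityInBytes the capacity in bytes of each replica
   * @return the new {@link PartitionId}
   */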
  PartitionId addNewPartition(List<Disk> disks, long replicaCapacityInBytes) {
    return partitionLayout.addNewPartition(disks, replicaCapacityInBytes);
  }

  // Determine if there is enough capacity to allocate a PartitionId.
  private boolean checkEnoughUnallocatedRawCapacity(int replicaCountPerDatacenter, long replicaCapacityInBytes) {
    for (Datacenter datacenter : hardwareLayout.getDatacenters()) {
      if (getUnallocatedRawCapacityInBytes(datacenter) < replicaCountPerDatacenter * replicaCapacityInBytes) {
        logger.warn("Insufficient unallocated space in datacenter {} ({} bytes unallocated)", datacenter.getName(),
            getUnallocatedRawCapacityInBytes(datacenter));
        return false;
      }

      int rcpd = replicaCountPerDatacenter;
      for (DataNode dataNode : datacenter.getDataNodes()) {
        for (Disk disk : dataNode.getDisks()) {
          if (getUnallocatedRawCapacityInBytes(disk) >= replicaCapacityInBytes) {
            rcpd--;
            break; // Only one replica per DataNodeId.
          }
        }
      }
      if (rcpd > 0) {
        logger.warn("Insufficient data nodes ({} more needed) with unallocated space in datacenter {} for {} replicas",
            rcpd, datacenter.getName(), replicaCountPerDatacenter);
        return false;
      }
    }
    return true;
  }

  /**
   * Get a sampling of {@code numChoices} random disks from a list of {@link DataNode}s and choose the
   * {@link Disk} on the {@link DataNode} with the most free space.
   * NOTE 1: This method will change the ordering of the nodes in {@code dataNodes}
   * NOTE 2: This method can return null if a disk with enough free space could not be found.
   *
   * @param dataNodes the list of {@link DataNode}s to sample from
   * @param dataNodesUsed the set of {@link DataNode}s to exclude from the sample
   * @param replicaCapacityInBytes the minimum amount of free space that a disk in the sample should have
   * @param rackAware if {@code true}, only return disks in nodes that do not share racks with the nodes
   *                  in {@code dataNodesUsed}
   * @param numChoices how many disks in the sample to choose between
   * @return the {@link Disk} on the {@link DataNode} in the sample with the most free space, or {@code null} if a
   *         disk with enough free space could not be found.
   */
  private Disk getBestDiskCandidate(List<DataNode> dataNodes, Set<DataNode> dataNodesUsed, long replicaCapacityInBytes,
      boolean rackAware, int numChoices) {
    Set<Long> rackIdsUsed = new HashSet<>();
    if (rackAware) {
      for (DataNode dataNode : dataNodesUsed) {
        rackIdsUsed.add(dataNode.getRackId());
      }
    }
    int numFound = 0;
    int selectionBound = dataNodes.size();
    Random randomGen = new Random();
    Disk bestDisk = null;
    while ((selectionBound > 0) && (numFound < numChoices)) {
      int selectionIndex = randomGen.nextInt(selectionBound);
      DataNode candidate = dataNodes.get(selectionIndex);
      if (!dataNodesUsed.contains(candidate) && !rackIdsUsed.contains(candidate.getRackId())) {
        Disk diskCandidate = getDiskWithMostUnallocatedRawCapacity(candidate, replicaCapacityInBytes);
        if (diskCandidate != null) {
          if ((bestDisk == null) || (getUnallocatedRawCapacityInBytes(diskCandidate.getDataNode())
              >= getUnallocatedRawCapacityInBytes(bestDisk.getDataNode()))) {
            bestDisk = diskCandidate;
          }
          numFound++;
        }
      }
      selectionBound--;
      Collections.swap(dataNodes, selectionIndex, selectionBound);
    }
    return bestDisk;
  }
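  // Example: with numChoices == 2 and dataNodes [A, B, C, D], up to two random candidates with a qualifying
  // disk (say B and D) are examined, and the disk on the node with more unallocated capacity wins. Every
  // examined node is swapped behind selectionBound so it cannot be drawn twice in the same call.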
  /**
   * Return a list of disks for a new partition in the specified {@link Datacenter}. Does not retry if fewer than
   * {@code replicaCountPerDatacenter} disks can be allocated.
   *
   * @param replicaCountPerDatacenter how many replicas to attempt to allocate in the datacenter
   * @param replicaCapacityInBytes the minimum amount of free space on a disk for a replica
   * @param datacenter the {@link Datacenter} to allocate replicas in
   * @param rackAware if {@code true}, attempt a rack-aware allocation
   * @return a list of {@link Disk}s
   */
  private List<Disk> getDiskCandidatesForPartition(int replicaCountPerDatacenter, long replicaCapacityInBytes,
      Datacenter datacenter, boolean rackAware) {
    ArrayList<Disk> disksToAllocate = new ArrayList<Disk>();
    Set<DataNode> nodesToExclude = new HashSet<>();
    List<DataNode> dataNodes = new ArrayList<>(datacenter.getDataNodes());
    for (int i = 0; i < replicaCountPerDatacenter; i++) {
      Disk bestDisk = getBestDiskCandidate(dataNodes, nodesToExclude, replicaCapacityInBytes, rackAware, NUM_CHOICES);
      if (bestDisk != null) {
        disksToAllocate.add(bestDisk);
        nodesToExclude.add(bestDisk.getDataNode());
      } else {
        break;
      }
    }
    return disksToAllocate;
  }

  /**
   * Return a list of disks for a new partition in the specified {@link Datacenter}. Retry a non rack-aware allocation
   * in certain cases described below if {@code attemptNonRackAwareOnFailure} is enabled.
   *
   * @param replicaCountPerDatacenter how many replicas to attempt to allocate in the datacenter
   * @param replicaCapacityInBytes the minimum amount of free space on a disk for a replica
   * @param datacenter the {@link Datacenter} to allocate replicas in
   * @param attemptNonRackAwareOnFailure {@code true} if we should attempt a non rack-aware allocation if a rack-aware
   *                                     one is not possible.
   * @return a list of {@code replicaCountPerDatacenter} or fewer disks that can be allocated for a new partition in
   *         the specified datacenter
   */
  private List<Disk> allocateDisksForPartition(int replicaCountPerDatacenter, long replicaCapacityInBytes,
      Datacenter datacenter, boolean attemptNonRackAwareOnFailure) {
    List<Disk> disks;
    if (datacenter.isRackAware()) {
      disks = getDiskCandidatesForPartition(replicaCountPerDatacenter, replicaCapacityInBytes, datacenter, true);
      if ((disks.size() < replicaCountPerDatacenter) && attemptNonRackAwareOnFailure) {
        logger.warn("Rack-aware allocation failed for a partition on datacenter: {};"
            + " attempting to perform a non rack-aware allocation.", datacenter.getName());
        disks = getDiskCandidatesForPartition(replicaCountPerDatacenter, replicaCapacityInBytes, datacenter, false);
      }
    } else if (!attemptNonRackAwareOnFailure) {
      throw new IllegalArgumentException(
          "attemptNonRackAwareOnFailure is false, but the datacenter: " + datacenter.getName()
              + " does not have rack information");
    } else {
      disks = getDiskCandidatesForPartition(replicaCountPerDatacenter, replicaCapacityInBytes, datacenter, false);
    }
    if (disks.size() < replicaCountPerDatacenter) {
      logger.warn("Could only allocate {}/{} replicas in datacenter: {}", disks.size(), replicaCountPerDatacenter,
          datacenter.getName());
    }
    return disks;
  }
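  // Example: in a datacenter with rack information and replicaCountPerDatacenter == 3, each replica is placed
  // on a node from a distinct rack. If only two racks still have qualifying disks and
  // attemptNonRackAwareOnFailure is true, the allocation is retried without the rack constraint.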
  /**
   * Allocate partitions for {@code numPartitions} new partitions on all datacenters.
   *
   * @param numPartitions how many partitions to allocate
   * @param replicaCountPerDatacenter the number of replicas per partition on each datacenter
   * @param replicaCapacityInBytes how large each replica (of a partition) should be
   * @param attemptNonRackAwareOnFailure {@code true} if we should attempt a non rack-aware allocation if a rack-aware
   *                                     one is not possible.
   * @return a list of the new {@link PartitionId}s.
   */
  List<PartitionId> allocatePartitions(int numPartitions, int replicaCountPerDatacenter, long replicaCapacityInBytes,
      boolean attemptNonRackAwareOnFailure) {
    ArrayList<PartitionId> partitions = new ArrayList<PartitionId>(numPartitions);
    int partitionsAllocated = 0;
    while (checkEnoughUnallocatedRawCapacity(replicaCountPerDatacenter, replicaCapacityInBytes)
        && partitionsAllocated < numPartitions) {
      List<Disk> disksToAllocate = new ArrayList<>();
      for (Datacenter datacenter : hardwareLayout.getDatacenters()) {
        List<Disk> disks = allocateDisksForPartition(replicaCountPerDatacenter, replicaCapacityInBytes, datacenter,
            attemptNonRackAwareOnFailure);
        disksToAllocate.addAll(disks);
      }
      partitions.add(partitionLayout.addNewPartition(disksToAllocate, replicaCapacityInBytes));
      partitionsAllocated++;
      logger.info("Allocated {} new partitions so far.", partitionsAllocated);
    }
    return partitions;
  }

  /**
   * Add a set of replicas on a new datacenter for an existing partition.
   *
   * @param partitionId the partition to add to the new datacenter
   * @param dataCenterName the name of the new datacenter
   * @param attemptNonRackAwareOnFailure {@code true} if a non rack-aware allocation should be attempted if a
   *                                     rack-aware one is not possible.
   */
  void addReplicas(PartitionId partitionId, String dataCenterName, boolean attemptNonRackAwareOnFailure) {
    List<? extends ReplicaId> replicaIds = partitionId.getReplicaIds();
    Map<String, Integer> replicaCountByDatacenter = new HashMap<String, Integer>();
    long capacityOfReplicasInBytes = 0;
    // Ensure that the provided datacenter does not already have replicas for this partition.
    for (ReplicaId replicaId : replicaIds) {
      if (replicaId.getDataNodeId().getDatacenterName().equalsIgnoreCase(dataCenterName)) {
        throw new IllegalArgumentException(
            "Data center " + dataCenterName + " provided already contains replica for partition " + partitionId);
      }
      capacityOfReplicasInBytes = replicaId.getCapacityInBytes();
      Integer numberOfReplicas = replicaCountByDatacenter.get(replicaId.getDataNodeId().getDatacenterName());
      if (numberOfReplicas == null) {
        numberOfReplicas = 0;
      }
      numberOfReplicas++;
      replicaCountByDatacenter.put(replicaId.getDataNodeId().getDatacenterName(), numberOfReplicas);
    }
    if (replicaCountByDatacenter.isEmpty()) {
      throw new IllegalArgumentException("No existing replicas present for partition " + partitionId + " in cluster.");
    }
    // Verify that all datacenters have the same replica count.
    int numberOfReplicasPerDatacenter = 0;
    int index = 0;
    for (Map.Entry<String, Integer> entry : replicaCountByDatacenter.entrySet()) {
      if (index == 0) {
        numberOfReplicasPerDatacenter = entry.getValue();
      }
      if (numberOfReplicasPerDatacenter != entry.getValue()) {
        throw new IllegalStateException("Datacenters have different replica counts for partition " + partitionId);
      }
      index++;
    }
    Datacenter datacenterToAdd = hardwareLayout.findDatacenter(dataCenterName);
    List<Disk> disksForReplicas = allocateDisksForPartition(numberOfReplicasPerDatacenter, capacityOfReplicasInBytes,
        datacenterToAdd, attemptNonRackAwareOnFailure);
    partitionLayout.addNewReplicas((Partition) partitionId, disksForReplicas);
    logger.info("Added partition {} to datacenter {}", partitionId, dataCenterName);
  }
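  // Equality is defined purely by the hardware and partition layouts; the metric registry is ignored.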
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    StaticClusterManager that = (StaticClusterManager) o;

    if (hardwareLayout != null ? !hardwareLayout.equals(that.hardwareLayout) : that.hardwareLayout != null) {
      return false;
    }
    return !(partitionLayout != null ? !partitionLayout.equals(that.partitionLayout) : that.partitionLayout != null);
  }

  @Override
  public void onReplicaEvent(ReplicaId replicaId, ReplicaEventType event) {
    switch (event) {
      case Disk_Error:
        ((Disk) replicaId.getDiskId()).onDiskError();
        break;
      case Disk_Ok:
        ((Disk) replicaId.getDiskId()).onDiskOk();
        break;
      case Node_Timeout:
        ((DataNode) replicaId.getDataNodeId()).onNodeTimeout();
        break;
      case Node_Response:
        ((DataNode) replicaId.getDataNodeId()).onNodeResponse();
        break;
      case Partition_ReadOnly:
        ((Partition) replicaId.getPartitionId()).onPartitionReadOnly();
        break;
    }
  }

  @Override
  public void close() {
    // No-op.
  }
}