/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.OptionalInt;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;
import org.apache.commons.lang3.ArrayUtils;
import org.json_voltpatches.JSONArray;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.json_voltpatches.JSONStringer;
import com.google_voltpatches.common.base.Preconditions;
import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.collect.ImmutableSet;
import com.google_voltpatches.common.collect.ImmutableSortedSet;
import com.google_voltpatches.common.collect.LinkedListMultimap;
import com.google_voltpatches.common.collect.Lists;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.collect.Multimap;
import com.google_voltpatches.common.collect.Sets;
public class AbstractTopology {
//Topology JSON keys
public final static String TOPO_PARTITIONS = "partitions";
public final static String TOPO_PARTITION_ID = "partition_id";
public final static String TOPO_MASTER = "master";
public final static String TOPO_REPLICA = "replicas";
public final static String TOPO_HOST_ID = "host_id";
public final static String TOPO_KFACTOR = "kfactor";
public final static String TOPO_VERSION = "version";
public final static String TOPO_HAGROUPS = "haGroups";
public final static String TOPO_HAGROUP = "haGroup";
public final static String TOPO_HOSTS = "hosts";
public final static String TOPO_HOST = "host";
public final static String TOPO_SPH = "targetSiteCount";
public final static String TOPO_HOST_MISSING = "missing";
    public final static String PLACEMENT_GROUP_DEFAULT = "0";
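    // Illustrative shape of the serialized topology JSON, per the toJSON()
    // methods below (values here are hypothetical):
    // {
    //   "version": 2,
    //   "haGroups": [{"token": "g0", "host": [0, 1]}],
    //   "partitions": [{"partition_id": 0, "kfactor": 1, "master": 0, "replicas": [0, 1]}],
    //   "hosts": [{"host_id": 0, "targetSiteCount": 1, "haGroup": "g0",
    //              "missing": false, "partitions": [0]}]
    // }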
public final long version;
public final ImmutableMap<Integer, Host> hostsById;
public final ImmutableMap<Integer, Partition> partitionsById;
public static final AbstractTopology EMPTY_TOPOLOGY = new AbstractTopology(0, new TreeSet<>());
/////////////////////////////////////
//
// PUBLIC CLASSES
//
/////////////////////////////////////
public static class Partition implements Comparable<Partition>{
public final int id;
public final int k;
public int leaderHostId;
public final ImmutableSortedSet<Integer> hostIds;
private Partition(int id, int k, int leaderHostId, Collection<Integer> hostIds) {
this.id = id;
this.k = k;
this.leaderHostId = leaderHostId;
this.hostIds = ImmutableSortedSet.copyOf(hostIds);
assert(k >= 0);
}
@Override
public String toString() {
String[] hostIdStrings = hostIds.stream().map(id -> String.valueOf(id)).toArray(String[]::new);
return String.format("Partition %d (leader %d, hosts %s)", id, leaderHostId, String.join(",", hostIdStrings));
}
private void toJSON(JSONStringer stringer) throws JSONException {
stringer.object();
stringer.key(TOPO_PARTITION_ID).value(id);
stringer.key(TOPO_KFACTOR).value(k);
stringer.key(TOPO_MASTER).value(leaderHostId);
stringer.key(TOPO_REPLICA).array();
for (Integer hostId : hostIds) {
stringer.value(hostId);
}
stringer.endArray();
stringer.endObject();
}
private static Partition fromJSON(JSONObject json) throws JSONException {
int id = json.getInt(TOPO_PARTITION_ID);
int k = json.getInt(TOPO_KFACTOR);
int leaderHostId = json.getInt(TOPO_MASTER);
List<Integer> mutableHostIds = new ArrayList<>();
JSONArray jsonHostIds = json.getJSONArray(TOPO_REPLICA);
for (int i = 0; i < jsonHostIds.length(); i++) {
mutableHostIds.add(jsonHostIds.getInt(i));
}
return new Partition(id, k, leaderHostId, mutableHostIds);
}
@Override
public int compareTo(Partition o) {
return (this.id - o.id);
}
}
public static class HAGroup implements Comparable<HAGroup> {
public final String token;
public final ImmutableSortedSet<Integer> hostIds;
private HAGroup(String token, int[] hostIds) {
this.token = token;
Integer[] hostIdsInteger = ArrayUtils.toObject(hostIds);
this.hostIds = ImmutableSortedSet.copyOf(hostIdsInteger);
}
@Override
public String toString() {
String[] hostIdStrings = hostIds.stream().map(id -> id.toString()).toArray(String[]::new);
return String.format("HAGroup %s (Hosts %s)", token, String.join(",", hostIdStrings));
}
private void toJSON(JSONStringer stringer) throws JSONException {
stringer.object();
stringer.key("token").value(token);
stringer.key(TOPO_HOST).array();
for (int hostId : hostIds) {
stringer.value(hostId);
}
stringer.endArray();
stringer.endObject();
}
private static HAGroup fromJSON(JSONObject json) throws JSONException {
String token = json.getString("token");
JSONArray jsonHosts = json.getJSONArray(TOPO_HOST);
int[] hostIds = new int[jsonHosts.length()];
for (int i = 0; i < jsonHosts.length(); i++) {
hostIds[i] = jsonHosts.getInt(i);
}
return new HAGroup(token, hostIds);
}
@Override
public int compareTo(HAGroup o) {
return this.token.compareTo(o.token);
}
}
public static class Host implements Comparable<Host> {
public final int id;
public final int targetSiteCount;
public final HAGroup haGroup;
public final ImmutableSortedSet<Partition> partitions;
        // a flag indicating whether the host is missing
public boolean isMissing = false;
private Host(int id, int targetSiteCount, HAGroup haGroup, Collection<Partition> partitions) {
assert(id >= 0);
assert(targetSiteCount >= 0);
assert(haGroup != null);
assert(partitions != null);
assert(partitions.size() >= 0);
this.id = id;
this.targetSiteCount = targetSiteCount;
this.haGroup = haGroup;
this.partitions = ImmutableSortedSet.copyOf(partitions);
}
public List<Integer> getSortedPartitionIdList() {
return partitions.stream()
.map(p -> p.id)
.sorted()
.collect(Collectors.toList());
}
public void markHostMissing(boolean isMissing) {
this.isMissing = isMissing;
}
public int getleaderCount() {
int leaders = 0;
            for (Partition p : partitions) {
if (p.leaderHostId == id) {
leaders++;
}
}
return leaders;
}
@Override
public String toString() {
String[] partitionIdStrings = partitions.stream().map(p -> String.valueOf(p.id)).toArray(String[]::new);
return String.format("Host %d sph:%d ha:%s (Partitions %s)",
id, targetSiteCount, haGroup.token, String.join(",", partitionIdStrings));
}
private void toJSON(JSONStringer stringer) throws JSONException {
stringer.object();
stringer.key(TOPO_HOST_ID).value(id);
stringer.key(TOPO_SPH).value(targetSiteCount);
stringer.key(TOPO_HAGROUP).value(haGroup.token);
stringer.key(TOPO_HOST_MISSING).value(isMissing);
stringer.key(TOPO_PARTITIONS).array();
for (Partition partition : partitions) {
stringer.value(partition.id);
}
stringer.endArray();
stringer.endObject();
}
private static Host fromJSON(
JSONObject json,
final Map<String, HAGroup> haGroupsByToken,
final Map<Integer, Partition> partitionsById)
throws JSONException
{
int id = json.getInt(TOPO_HOST_ID);
int targetSiteCount = json.getInt(TOPO_SPH);
String haGroupToken = json.getString(TOPO_HAGROUP);
HAGroup haGroup = haGroupsByToken.get(haGroupToken);
JSONArray jsonPartitions = json.getJSONArray(TOPO_PARTITIONS);
ArrayList<Partition> partitions = new ArrayList<>();
for (int i = 0; i < jsonPartitions.length(); i++) {
int partitionId = jsonPartitions.getInt(i);
partitions.add(partitionsById.get(partitionId));
}
Host host = new Host(id, targetSiteCount, haGroup, partitions);
host.markHostMissing(json.getBoolean(TOPO_HOST_MISSING));
return host;
}
@Override
public int compareTo(Host o) {
return (this.id - o.id);
}
}
public static class KSafetyViolationException extends Exception {
private static final long serialVersionUID = 1L;
public final int failedHostId;
public final ImmutableSet<Integer> missingPartitionIds;
public KSafetyViolationException(int failedHostId, Set<Integer> missingPartitionIds) {
assert(missingPartitionIds != null);
this.failedHostId = failedHostId;
this.missingPartitionIds = ImmutableSet.copyOf(missingPartitionIds);
}
@Override
public String getMessage() {
// convert set of ints to array of strings
String[] strIds = missingPartitionIds.stream().map(i -> String.valueOf(i)).toArray(String[]::new);
return String.format("After Host %d failure, non-viable cluster due to k-safety violation. "
+ "Missing partitions: %s", failedHostId, String.join(",", strIds));
}
}
/////////////////////////////////////
//
// PRIVATE BUILDER CLASSES
//
/////////////////////////////////////
private static class MutablePartition implements Comparable<MutablePartition> {
final int id;
final int k;
final Set<MutableHost> hosts = new TreeSet<>();
MutableHost leader = null;
MutablePartition(int id, int k) {
this.id = id;
this.k = k;
}
@Override
public int compareTo(MutablePartition o) {
return (id - o.id);
}
}
private static class MutableHost implements Comparable<MutableHost> {
final int id;
int targetSiteCount;
        // a flag indicating whether the host is missing
boolean isMissing = false;
HAGroup haGroup;
Set<MutablePartition> partitions = new TreeSet<MutablePartition>();
MutableHost(int id, int targetSiteCount, HAGroup haGroup) {
this.id = id;
this.targetSiteCount = targetSiteCount;
this.haGroup = haGroup;
}
int freeSpace() {
return Math.max(targetSiteCount - partitions.size(), 0);
}
/** Count the number of partitions that consider this host a leader */
int leaderCount() {
return (int) partitions.stream().filter(p -> p.leader == this).count();
}
public void markHostMissing(boolean isMissing) {
this.isMissing = isMissing;
}
@Override
public int compareTo(MutableHost o) {
return (id - o.id);
}
}
/////////////////////////////////////
//
// PUBLIC STATIC API
//
/////////////////////////////////////
public static class HostDescription {
public final int hostId;
public final int targetSiteCount;
public final String haGroupToken;
public HostDescription(int hostId, int targetSiteCount, String haGroupToken) {
this.hostId = hostId;
this.targetSiteCount = targetSiteCount;
this.haGroupToken = haGroupToken;
}
}
public static class PartitionDescription {
public final int k;
public PartitionDescription(int k) {
this.k = k;
}
}
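    /**
     * Add new, empty hosts to an existing topology, merging them into the HA
     * groups named by their tokens. No partitions are placed by this call; use
     * mutateAddPartitionsToEmptyHosts for that. Bumps the topology version by one.
     */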
public static AbstractTopology mutateAddHosts(AbstractTopology currentTopology,
HostDescription[] hostDescriptions)
{
// validate input
assert(currentTopology != null);
Arrays.stream(hostDescriptions).forEach(hd -> {
assert(hd != null);
assert(hd.targetSiteCount >= 0);
assert(hd.haGroupToken != null);
});
// validate no duplicate host ids
Set<Integer> hostIds = new HashSet<>(currentTopology.hostsById.keySet());
for (HostDescription hostDescription : hostDescriptions) {
if (hostIds.contains(hostDescription.hostId)) {
throw new RuntimeException("New host descriptions must contain unique and unused hostid.");
}
hostIds.add(hostDescription.hostId);
}
// for now, just add empty nodes to the topology -- not much else to do here
// get immutable HAGroups - these are fixed by user command line
final HAGroup[] haGroups = getHAGroupsForHosts(currentTopology, hostDescriptions);
// get a map of hostid => hostdescription for new hosts
Map<Integer, HostDescription> hostDescriptionsById = Arrays.stream(hostDescriptions)
.collect(Collectors.toMap(hd -> hd.hostId, hd -> hd));
// build the full set of immutable hosts, using the HAGroups
Set<Host> fullHostSet = new TreeSet<>();
for (HAGroup haGroup : haGroups) {
for (int hostId : haGroup.hostIds) {
Host currentHost = currentTopology.hostsById.get(hostId);
Host newHost = null;
if (currentHost != null) {
newHost = new Host(hostId, currentHost.targetSiteCount, haGroup, currentHost.partitions);
}
else {
HostDescription hostDescription = hostDescriptionsById.get(hostId);
assert(hostDescription != null);
newHost = new Host(hostId, hostDescription.targetSiteCount, haGroup, new TreeSet<>());
}
fullHostSet.add(newHost);
}
}
return new AbstractTopology(currentTopology.version + 1, fullHostSet);
}
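    /**
     * Create the described partitions and place their replicas on hosts that
     * currently have no partitions, spreading replicas across HA groups by
     * token distance and assigning a leader to each new partition.
     */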
public static AbstractTopology mutateAddPartitionsToEmptyHosts(AbstractTopology currentTopology,
PartitionDescription[] partitionDescriptions)
{
// validate input
assert(currentTopology != null);
Arrays.stream(partitionDescriptions).forEach(pd -> {
assert(pd != null);
assert(pd.k >= 0);
});
/////////////////////////////////
// convert all hosts to mutable hosts to add partitions and sites
/////////////////////////////////
final Map<Integer, MutableHost> mutableHostMap = new TreeMap<>();
final Map<Integer, MutablePartition> mutablePartitionMap = new TreeMap<>();
convertTopologyToMutables(currentTopology, mutableHostMap, mutablePartitionMap);
// get max used site and partition ids so new ones will be unique
int largestPartitionId = getNextFreePartitionId(currentTopology);
/////////////////////////////////
// find eligible mutable hosts (those without any partitions and with sph > 0)
/////////////////////////////////
Map<Integer, MutableHost> eligibleHosts = mutableHostMap.values().stream()
.filter(h -> h.partitions.size() == 0)
.filter(h -> h.targetSiteCount > 0)
.collect(Collectors.toMap(h -> h.id, h -> h));
/////////////////////////////////
// generate partitions
/////////////////////////////////
Map<Integer, MutablePartition> partitionsToAdd = new TreeMap<>();
for (PartitionDescription partitionDescription : partitionDescriptions) {
MutablePartition partition = new MutablePartition(largestPartitionId++, partitionDescription.k);
partitionsToAdd.put(partition.id, partition);
mutablePartitionMap.put(partition.id, partition);
}
// group partitions by k
Map<Integer, List<MutablePartition>> newPartitionsByK = partitionsToAdd.values().stream()
.collect(Collectors.groupingBy(mp -> mp.k));
// sort partitions by k
newPartitionsByK = new TreeMap<>(newPartitionsByK);
/////////////////////////////////
// validate eligible hosts have enough space for partitions
/////////////////////////////////
int totalFreeSpace = mutableHostMap.values().stream()
.mapToInt(h -> h.freeSpace())
.sum();
int totalReplicasToPlace = partitionsToAdd.values().stream()
.mapToInt(p -> p.k + 1)
.sum();
if (totalFreeSpace < totalReplicasToPlace) {
throw new RuntimeException("Hosts have inadequate space to hold all partition replicas.");
}
/////////////////////////////////
// compute HAGroup distances
/////////////////////////////////
List<HAGroup> haGroups = mutableHostMap.values().stream()
.map(h -> h.haGroup)
.distinct()
.collect(Collectors.toList());
Map<HAGroup, Map<HAGroup, Integer>> haGroupDistances = new TreeMap<>();
for (HAGroup haGroup1 : haGroups) {
Map<HAGroup, Integer> distances = new TreeMap<>();
haGroupDistances.put(haGroup1, distances);
for (HAGroup haGroup2 : haGroups) {
int distance = computeHADistance(haGroup1.token, haGroup2.token);
distances.put(haGroup2, distance);
}
}
/////////////////////////////////
// place partitions with hosts
/////////////////////////////////
while (partitionsToAdd.size() > 0) {
// PLAN:
// 1. Start with the largest k, over all partitions
// 2. Find k+1 eligible hosts, starting with ha groups that have low max distance to other ha groups
            Entry<Integer, List<MutablePartition>> partitionsWithLargestK =
                    newPartitionsByK.entrySet().stream()
                            .filter(e -> e.getValue().size() > 0) // ignore k with empty partition lists
                            .max((e1, e2) -> e1.getKey() - e2.getKey()).get();
            // goal is to find a set of k + 1 hosts, containing the starter host, that
            //  a) have space for at least one partition
            //  b) are reasonably distributed w.r.t. ha groups
int targetReplicaCount = partitionsWithLargestK.getKey() + 1;
// verify enough hosts exist
if (eligibleHosts.size() < targetReplicaCount) {
throw new RuntimeException(String.format(
"Partition requesting %d replicas " +
"but there are only %d eligable hosts on which to place them. " +
"Topology request invalid.",
targetReplicaCount, eligibleHosts.size()));
}
// if there isn't space for a partition, shift partitions around until there is
// or give up if shifting can't free up enough space
while (countHostsWithFreeSpace(eligibleHosts) < targetReplicaCount) {
// if there aren't k + 1 good nodes, then move around some partition replicas until there are
if (!shiftAPartition(eligibleHosts, haGroupDistances)) {
throw new RuntimeException(String.format(
"Partition requesting %d replicas " +
"but unable to find more than %d hosts with free space to place them. " +
"Topology request invalid.",
targetReplicaCount, countHostsWithFreeSpace(eligibleHosts)));
}
}
// pick one host to be part of a partition group
MutableHost starterHost = findBestStarterHost(eligibleHosts, haGroupDistances);
// find k + 1 peers for starter host
Set<MutableHost> peerHostsForPartition = findBestPeerHosts(starterHost, targetReplicaCount, eligibleHosts, haGroupDistances, false);
assert(peerHostsForPartition.size() == targetReplicaCount);
// determine how many partitions this group of hosts can handle
int minAvailableSitesForSet = peerHostsForPartition.stream()
.mapToInt(h -> h.freeSpace())
.min().getAsInt();
// determine how many partitions we have with this k value
int availablePartitionCount = partitionsWithLargestK.getValue().size();
// assign the partitions
for (int i = 0; i < Math.min(minAvailableSitesForSet, availablePartitionCount); i++) {
                // pop a partition off the list
MutablePartition partition = partitionsWithLargestK.getValue().remove(0);
// assign it to the host set
for (MutableHost host : peerHostsForPartition) {
host.partitions.add(partition);
partition.hosts.add(host);
}
// remove the partition from the tracking
partitionsToAdd.remove(partition.id);
}
}
/////////////////////////////////
// pick leaders for partitions that need them
/////////////////////////////////
assignLeadersToPartitionsThatNeedThem(mutableHostMap, mutablePartitionMap);
/////////////////////////////////
// convert mutable hosts to hosts to prep a return value
/////////////////////////////////
        // note: convertMutablesToTopology increments the version itself
        return convertMutablesToTopology(
                currentTopology.version,
                mutableHostMap,
                mutablePartitionMap);
}
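    /**
     * Remove a host from the topology, dropping its partition replicas,
     * removing it from its HA group and reassigning any partition leaders it held.
     * @throws KSafetyViolationException if removing the host would leave some
     *         partition with no replicas at all
     */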
public static AbstractTopology mutateRemoveHost(AbstractTopology currentTopology, int hostId)
throws KSafetyViolationException
{
/////////////////////////////////
// convert all hosts to mutable hosts to add partitions and sites
/////////////////////////////////
final Map<Integer, MutableHost> mutableHostMap = new TreeMap<>();
final Map<Integer, MutablePartition> mutablePartitionMap = new TreeMap<>();
convertTopologyToMutables(currentTopology, mutableHostMap, mutablePartitionMap);
Set<Integer> missingPartitionIds = new HashSet<>();
MutableHost hostToRemove = mutableHostMap.remove(hostId);
if (hostToRemove == null) {
throw new RuntimeException("Can't remove host; host id not present in current topology.");
}
for (MutablePartition partition : hostToRemove.partitions) {
partition.hosts.remove(hostToRemove);
if (partition.hosts.size() == 0) {
missingPartitionIds.add(partition.id);
}
}
// check for k-safety violation
if (missingPartitionIds.size() > 0) {
throw new KSafetyViolationException(hostId, missingPartitionIds);
}
        int[] hostIdArray = hostToRemove.haGroup.hostIds.stream()
                .filter(candidateId -> candidateId != hostId)
                .mapToInt(id -> id)
                .toArray();
HAGroup newHaGroup = new HAGroup(hostToRemove.haGroup.token, hostIdArray);
mutableHostMap.values().forEach(h -> {
if (h.haGroup.token.equals(newHaGroup.token)) {
h.haGroup = newHaGroup;
}
});
/////////////////////////////////
// pick leaders for partitions that need them (naive)
/////////////////////////////////
assignLeadersToPartitionsThatNeedThem(mutableHostMap, mutablePartitionMap);
/////////////////////////////////
// convert mutable hosts to hosts to prep a return value
/////////////////////////////////
        // note: convertMutablesToTopology increments the version itself
        return convertMutablesToTopology(
                currentTopology.version,
                mutableHostMap,
                mutablePartitionMap);
}
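    /**
     * Add a replica of the given partition to the given host, bumping that
     * host's target site count and the partition's k-factor by one. The host
     * must not already hold a replica of the partition.
     */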
public static AbstractTopology mutateAddReplicaSite(AbstractTopology topology, int hostId, int partitionId) {
Map<Integer, MutableHost> mutableHostMap = new TreeMap<>();
Map<Integer, MutablePartition> mutablePartitionMap = new TreeMap<>();
// create mutable hosts without partitions
for (Host host : topology.hostsById.values()) {
int sph = host.targetSiteCount;
if (host.id == hostId) {
assert(host.partitions.stream().filter(p->p.id == partitionId).collect(Collectors.toList()).isEmpty());
sph++;
}
final MutableHost mutableHost = new MutableHost(host.id, sph, host.haGroup);
mutableHostMap.put(host.id, mutableHost);
}
for (Partition partition : topology.partitionsById.values()) {
int k = partition.k;
if (partition.id == partitionId) {
assert(!(partition.hostIds.contains(hostId)));
k++;
}
MutablePartition mp = new MutablePartition(partition.id, k);
mutablePartitionMap.put(mp.id, mp);
for (Integer hId : partition.hostIds) {
final MutableHost mutableHost = mutableHostMap.get(hId);
mp.hosts.add(mutableHost);
mutableHost.partitions.add(mp);
}
mp.leader = mutableHostMap.get(partition.leaderHostId);
if (partition.id == partitionId) {
final MutableHost mutableHost = mutableHostMap.get(hostId);
mp.hosts.add(mutableHost);
mutableHost.partitions.add(mp);
}
}
return convertMutablesToTopology(topology.version, mutableHostMap, mutablePartitionMap);
}
/**
     * Get the total number of missing replicas across all partitions.
     * Note this doesn't say how many partitions are under-represented.
     * For example, a k=1 partition currently down to a single replica contributes one.
     *
     * If this returns a value greater than zero, a node should rejoin rather
     * than perform an elastic join.
     *
     * @return The number of missing replicas.
*/
public int countMissingPartitionReplicas() {
// sum up, for all partitions, the diff between k+1 and replica count
return partitionsById.values().stream()
.mapToInt(p -> (p.k + 1) - p.hostIds.size())
.sum();
}
/**
     * Rejoin a host to the topology, assigning it under-replicated partitions
     * left behind by a previously departed host.
     * @param currentTopology the topology to rejoin into
     * @param hostDescription the description of the rejoining host
     * @return a new topology that includes the rejoined host
*/
public static AbstractTopology mutateRejoinHost(AbstractTopology currentTopology, HostDescription hostDescription) {
// add the node
currentTopology = AbstractTopology.mutateAddHosts(currentTopology, new HostDescription[] { hostDescription });
/////////////////////////////////
// convert all hosts to mutable hosts to add partitions and sites
/////////////////////////////////
final Map<Integer, MutableHost> mutableHostMap = new TreeMap<>();
final Map<Integer, MutablePartition> mutablePartitionMap = new TreeMap<>();
convertTopologyToMutables(currentTopology, mutableHostMap, mutablePartitionMap);
// collect under-replicated partitions by hostid
List<MutablePartition> underReplicatedPartitions = mutablePartitionMap.values().stream()
.filter(p -> (p.k + 1) > p.hosts.size())
.collect(Collectors.toList());
// find hosts with under-replicated partitions
Map<Integer, List<MutablePartition>> urPartitionsByHostId = new TreeMap<>();
underReplicatedPartitions.forEach(urPartition -> {
urPartition.hosts.forEach(host -> {
List<MutablePartition> partitionsForHost = urPartitionsByHostId.get(host.id);
if (partitionsForHost == null) {
partitionsForHost = new ArrayList<>();
urPartitionsByHostId.put(host.id, partitionsForHost);
}
partitionsForHost.add(urPartition);
});
});
// divide partitions into groups
Set<MutablePartition> partitionsToScan = new HashSet<>(underReplicatedPartitions);
List<List<MutablePartition>> partitionGroups = new ArrayList<>();
while (!partitionsToScan.isEmpty()) {
List<MutablePartition> partitionGroup = new ArrayList<MutablePartition>();
partitionGroups.add(partitionGroup);
// get any partition from the set to scan
MutablePartition starter = partitionsToScan.iterator().next();
            scanPeerPartitions(partitionsToScan, partitionGroup, starter);
}
// sort partition groups from largest to smallest
partitionGroups = partitionGroups.stream()
.sorted((l1, l2) -> l2.size() - l1.size())
.collect(Collectors.toList());
// look for a group with the right number of partitions
List<MutablePartition> match = null;
// look for a fallback where host covers exactly 1/X (for some X) of a partition group
List<MutablePartition> altMatch1 = null;
// look for a second fallback where host covers some of a group, but not perfectly 1/X
List<MutablePartition> altMatch2 = null;
// look for a third fallback where host joins two groups
List<MutablePartition> altMatch3 = null;
for (List<MutablePartition> partitionGroup : partitionGroups) {
if (partitionGroup.size() == hostDescription.targetSiteCount) {
match = partitionGroup;
break;
}
if ((partitionGroup.size() % hostDescription.targetSiteCount) == 0) {
altMatch1 = partitionGroup;
continue;
}
if (partitionGroup.size() > hostDescription.targetSiteCount) {
altMatch2 = partitionGroup;
continue;
}
            // NOTE: the third fallback (a host joining two partition groups) was
            // never implemented, so altMatch3 always remains null.
}
// collapse the alternates to pick the best one we can
if (match == null) match = altMatch1;
if (match == null) match = altMatch2;
if (match == null) match = altMatch3;
// if no match or alternates, combine groups until you get a fit
if (match == null) {
match = new ArrayList<>();
// remember: list of partition groups are sorted by size
for (List<MutablePartition> partitionGroup : partitionGroups) {
match.addAll(partitionGroup);
// break when we have enough partitions
if (match.size() >= hostDescription.targetSiteCount) {
break;
}
// though we might add all of them if target SPH > under-replicated partitions
}
}
// now we can assume match is correct!
MutableHost rejoiningHost = mutableHostMap.get(hostDescription.hostId);
assert(rejoiningHost.targetSiteCount == hostDescription.targetSiteCount);
assert(rejoiningHost.id == hostDescription.hostId);
assert(rejoiningHost.partitions.isEmpty());
rejoiningHost.partitions.addAll(match);
match.forEach(p -> p.hosts.add(rejoiningHost));
mutableHostMap.put(rejoiningHost.id, rejoiningHost);
/////////////////////////////////
// convert mutable hosts to hosts to prep a return value
/////////////////////////////////
        // note: convertMutablesToTopology increments the version itself
        return convertMutablesToTopology(
                currentTopology.version,
                mutableHostMap,
                mutablePartitionMap);
}
/////////////////////////////////////
//
// SERIALIZATION API
//
/////////////////////////////////////
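    /**
     * Serialize this topology to JSON. The output round-trips through
     * {@link #topologyFromJSON(JSONObject)}, for example:
     * <pre>{@code
     * JSONObject json = topology.topologyToJSON();
     * AbstractTopology copy = AbstractTopology.topologyFromJSON(json);
     * }</pre>
     */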
public JSONObject topologyToJSON() throws JSONException {
JSONStringer stringer = new JSONStringer();
stringer.object();
stringer.keySymbolValuePair(TOPO_VERSION, version);
stringer.key(TOPO_HAGROUPS).array();
List<HAGroup> haGroups = hostsById.values().stream()
.map(h -> h.haGroup)
.distinct()
.collect(Collectors.toList());
for (HAGroup haGroup : haGroups) {
haGroup.toJSON(stringer);
}
stringer.endArray();
stringer.key(TOPO_PARTITIONS).array();
for (Partition partition : partitionsById.values()) {
partition.toJSON(stringer);
}
stringer.endArray();
stringer.key(TOPO_HOSTS).array();
for (Host host : hostsById.values()) {
host.toJSON(stringer);
}
stringer.endArray();
stringer.endObject();
return new JSONObject(stringer.toString());
}
public static AbstractTopology topologyFromJSON(String jsonTopology) throws JSONException {
JSONObject jsonObj = new JSONObject(jsonTopology);
return topologyFromJSON(jsonObj);
}
public static AbstractTopology topologyFromJSON(JSONObject jsonTopology) throws JSONException {
Map<Integer, Partition> partitionsById = new TreeMap<>();
Map<String, HAGroup> haGroupsByToken = new TreeMap<>();
List<Host> hosts = new ArrayList<>();
long version = jsonTopology.getLong(TOPO_VERSION);
JSONArray haGroupsJSON = jsonTopology.getJSONArray(TOPO_HAGROUPS);
for (int i = 0; i < haGroupsJSON.length(); i++) {
HAGroup haGroup = HAGroup.fromJSON(haGroupsJSON.getJSONObject(i));
haGroupsByToken.put(haGroup.token, haGroup);
}
JSONArray partitionsJSON = jsonTopology.getJSONArray(TOPO_PARTITIONS);
for (int i = 0; i < partitionsJSON.length(); i++) {
Partition partition = Partition.fromJSON(partitionsJSON.getJSONObject(i));
partitionsById.put(partition.id, partition);
}
JSONArray hostsJSON = jsonTopology.getJSONArray(TOPO_HOSTS);
for (int i = 0; i < hostsJSON.length(); i++) {
Host host = Host.fromJSON(hostsJSON.getJSONObject(i), haGroupsByToken, partitionsById);
hosts.add(host);
}
return new AbstractTopology(version, hosts);
}
/////////////////////////////////////
//
// PRIVATE TOPOLOGY CONSTRUCTOR
//
/////////////////////////////////////
private AbstractTopology(long version, Collection<Host> hosts) {
assert(hosts != null);
assert(version >= 0);
this.version = version;
// get a sorted map of hosts across the cluster by id
Map<Integer, Host> hostsByIdTemp = new TreeMap<>();
for (Host host : hosts) {
hostsByIdTemp.put(host.id, host);
}
this.hostsById = ImmutableMap.copyOf(hostsByIdTemp);
// get a sorted map of unique partitions across the cluster by id
        Map<Integer, Partition> partitionsByIdTemp = new TreeMap<>();
        for (Host host : hosts) {
            for (Partition partition : host.partitions) {
                partitionsByIdTemp.put(partition.id, partition);
            }
        }
        this.partitionsById = ImmutableMap.copyOf(partitionsByIdTemp);
}
/////////////////////////////////////
//
// PRIVATE STATIC HELPER METHODS
//
/////////////////////////////////////
private static void convertTopologyToMutables(
final AbstractTopology topology,
final Map<Integer, MutableHost> mutableHostMap,
final Map<Integer, MutablePartition> mutablePartitionMap)
{
// create mutable hosts without partitions
for (Host host : topology.hostsById.values()) {
final MutableHost mutableHost = new MutableHost(
host.id, host.targetSiteCount, host.haGroup);
mutableHostMap.put(host.id, mutableHost);
}
// create partitions
for (Partition partition : topology.partitionsById.values()) {
MutablePartition mp = new MutablePartition(partition.id, partition.k);
mutablePartitionMap.put(mp.id, mp);
for (Integer hostId : partition.hostIds) {
mp.hosts.add(mutableHostMap.get(hostId));
}
mp.leader = mutableHostMap.get(partition.leaderHostId);
}
// link partitions and hosts
for (Host host : topology.hostsById.values()) {
final MutableHost mutableHost = mutableHostMap.get(host.id);
host.partitions.stream().forEach(p -> {
MutablePartition mp = mutablePartitionMap.get(p.id);
mutableHost.partitions.add(mp);
});
}
}
private static AbstractTopology convertMutablesToTopology(
final long currentVersion,
final Map<Integer, MutableHost> mutableHostMap,
final Map<Integer, MutablePartition> mutablePartitionMap)
{
final Map<Integer, Partition> partitionsById = new TreeMap<>();
mutablePartitionMap.values().stream().forEach(mp -> {
assert(mp.leader != null);
List<Integer> hostIds = mp.hosts.stream()
.map(h -> h.id)
.collect(Collectors.toList());
Partition p = new Partition(mp.id, mp.k, mp.leader.id, hostIds);
partitionsById.put(p.id, p);
});
Set<Host> fullHostSet = new HashSet<>();
for (MutableHost mutableHost : mutableHostMap.values()) {
List<Partition> hostPartitions = mutableHost.partitions.stream()
.map(mp -> partitionsById.get(mp.id))
.collect(Collectors.toList());
Host newHost = new Host(mutableHost.id, mutableHost.targetSiteCount, mutableHost.haGroup, hostPartitions);
newHost.markHostMissing(mutableHost.isMissing);
fullHostSet.add(newHost);
}
return new AbstractTopology(currentVersion + 1, fullHostSet);
}
private static int getNextFreePartitionId(AbstractTopology topology) {
OptionalInt maxPartitionIdOptional = topology.partitionsById.values().stream()
.mapToInt(p -> p.id)
.max();
if (maxPartitionIdOptional.isPresent()) {
return maxPartitionIdOptional.getAsInt() + 1;
}
else {
return 0;
}
}
private static HAGroup[] getHAGroupsForHosts(AbstractTopology currentTopology, HostDescription[] hostDescriptions) {
class MutableHAGroup {
String token = "";
List<Integer> hostsIds;
private MutableHAGroup(String token) {
this.token = token;
this.hostsIds = new ArrayList<>();
}
}
final Map<String, MutableHAGroup> groupsByToken = new TreeMap<>();
// deal with all pre-existing hosts
for (Host host : currentTopology.hostsById.values()) {
MutableHAGroup haGroup = groupsByToken.get(host.haGroup.token);
if (haGroup == null) {
haGroup = new MutableHAGroup(host.haGroup.token);
groupsByToken.put(host.haGroup.token, haGroup);
}
haGroup.hostsIds.add(host.id);
}
// deal with all new hosts
for (HostDescription host : hostDescriptions) {
MutableHAGroup haGroup = groupsByToken.get(host.haGroupToken);
if (haGroup == null) {
haGroup = new MutableHAGroup(host.haGroupToken);
groupsByToken.put(host.haGroupToken, haGroup);
}
haGroup.hostsIds.add(host.hostId);
}
// convert mutable to immutable
return groupsByToken.values().stream()
.map(g -> new HAGroup(g.token, g.hostsIds.stream().mapToInt(i -> i).toArray()))
.toArray(HAGroup[]::new);
}
/**
     * Compute the tree-edge distance between any two HA group tokens.
     * Not the most efficient way to do this, but even n^2 for 100 nodes is computable.
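     * For example (tokens here are hypothetical):
     * <pre>{@code
     * computeHADistance("rack1.shelf2", "rack1.shelf3") == 2  // 2 + 2 - 2 * 1
     * computeHADistance("rack1", "rack1")               == 0  // identical paths
     * computeHADistance("rack1", "rack2.shelf1")        == 3  // 1 + 2 - 2 * 0
     * }</pre>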
*/
private static int computeHADistance(String token1, String token2) {
// break into arrays of graph edges
String[] token1parts = token1.split("\\.");
String[] token2parts = token2.split("\\.");
int size = Math.min(token1parts.length, token2parts.length);
int index = 0;
while (index < size) {
if (!token1parts[index].equals(token2parts[index])) break;
index++;
}
        // distance is the sum of the two diverging path lengths
return token1parts.length + token2parts.length - 2 * index;
}
/**
     * First, find the HA group with the lowest maximum distance to the other HA groups.
     * Second, within that group, prefer the hosts with the most free target sites.
*/
    private static MutableHost findBestStarterHost(
            Map<Integer, MutableHost> eligibleHosts,
            Map<HAGroup, Map<HAGroup, Integer>> haGroupDistances)
    {
        // special-case the one-node setup
        if (eligibleHosts.size() == 1) {
            MutableHost host = eligibleHosts.values().iterator().next();
            // we assume if there was one host, and it was full, we wouldn't be here
            assert(host.partitions.size() < host.targetSiteCount);
            return host;
        }
Map<HAGroup, Integer> distances = new HashMap<>();
for (Entry<HAGroup, Map<HAGroup, Integer>> e : haGroupDistances.entrySet()) {
int distance = 0;
if (e.getValue().size() > 0) {
distance = e.getValue().values().stream()
.mapToInt(i -> i)
.max().getAsInt();
}
distances.put(e.getKey(), distance);
}
List<HAGroup> haGroupsByMinimalMaxHADistance = distances.entrySet().stream()
.sorted((e1,e2) -> e1.getValue() - e2.getValue())
.map(e -> e.getKey())
.collect(Collectors.toList());
for (HAGroup haGroup : haGroupsByMinimalMaxHADistance) {
List<MutableHost> hostsByAvailability = haGroup.hostIds.stream()
                    .map(id -> eligibleHosts.get(id))
.filter(h -> h != null)
.filter(h -> h.targetSiteCount > h.partitions.size())
.sorted((h1, h2) -> h2.freeSpace() - h1.freeSpace())
.collect(Collectors.toList());
if (!hostsByAvailability.isEmpty()) {
return hostsByAvailability.get(0);
}
}
assert(false);
return null;
}
private static MutableHost findNextPeerHost(
final Set<MutableHost> peers,
final Map<Integer, MutableHost> eligibleHosts,
final Map<HAGroup, Map<HAGroup, Integer>> haGroupDistances,
boolean findFullHosts)
{
List<MutableHost> hostsInOrder = null;
MutableHost anyHost = peers.iterator().next();
        Set<HAGroup> undesirableHAGroups = peers.stream()
                .map(h -> h.haGroup)
                .collect(Collectors.toSet());
        Map<HAGroup, Integer> distancesByHAGroup = haGroupDistances.get(anyHost.haGroup);
        List<HAGroup> haGroupsByDistance = distancesByHAGroup.entrySet().stream()
                .sorted((e1, e2) -> e1.getValue() - e2.getValue())
                .map(e -> e.getKey())
                .filter(hag -> !undesirableHAGroups.contains(hag))
                .collect(Collectors.toList());
for (HAGroup haGroup : haGroupsByDistance) {
List<MutableHost> validHosts = haGroup.hostIds.stream()
.map(id -> eligibleHosts.get(id))
.filter(h -> h != null)
.collect(Collectors.toList());
if (findFullHosts) {
// find full and non-full hosts sorted by availability (less -> more)
hostsInOrder = validHosts.stream()
.sorted((h1, h2) -> h1.freeSpace() - h2.freeSpace())
.collect(Collectors.toList());
}
else {
// find non-full hosts sorted by availability (more -> less)
hostsInOrder = validHosts.stream()
.filter(h -> h.targetSiteCount > h.partitions.size())
.sorted((h1, h2) -> h2.freeSpace() - h1.freeSpace())
.collect(Collectors.toList());
}
if (!hostsInOrder.isEmpty()) {
return hostsInOrder.get(0);
}
}
// at this point, give up on using distinct ha groups and just find a host
if (findFullHosts) {
hostsInOrder = eligibleHosts.values().stream()
.filter(h -> h.targetSiteCount == h.partitions.size()) // is full
.filter(h -> peers.contains(h) == false) // not chosen yet
.collect(Collectors.toList());
}
else {
// sort candidate hosts by free space
hostsInOrder = eligibleHosts.values().stream()
.filter(h -> h.freeSpace() > 0) // has space
.filter(h -> peers.contains(h) == false) // not chosen yet
.sorted((h1, h2) -> h2.freeSpace() - h1.freeSpace()) // pick most free space
.collect(Collectors.toList());
}
if (hostsInOrder.isEmpty()) {
return null;
}
// pick the most empty hosts that haven't been selected
return hostsInOrder.get(0);
}
private static Set<MutableHost> findBestPeerHosts(
MutableHost starterHost,
int peerCount,
Map<Integer, MutableHost> eligibleHosts,
Map<HAGroup, Map<HAGroup, Integer>> haGroupDistances,
boolean findFullHosts)
{
final Set<MutableHost> peers = new HashSet<>();
peers.add(starterHost);
// special case k = 0
if (peerCount == 1) {
return peers;
}
while (peers.size() < peerCount) {
MutableHost nextPeer = findNextPeerHost(peers, eligibleHosts, haGroupDistances, findFullHosts);
if (nextPeer == null) {
return peers;
}
peers.add(nextPeer);
}
return peers;
}
/**
     * Move one partition replica from a full host to a host with at least two
     * free slots, preferring donor hosts close (by HA group distance) to the
     * receiving host. Returns true if a replica was moved.
*/
private static boolean shiftAPartition(
Map<Integer, MutableHost> eligibleHosts,
Map<HAGroup, Map<HAGroup, Integer>> haGroupDistances)
{
// find a host that has at least two open slots to move a partition to
List<MutableHost> hostsWithSpaceInOrder = eligibleHosts.values().stream()
.filter(h -> h.freeSpace() >= 2) // need at least two open slots
.sorted((h1, h2) -> h2.freeSpace() - h1.freeSpace()) // sorted by emptiness
.collect(Collectors.toList());
// iterate over all hosts with space free
for (MutableHost starterHost : hostsWithSpaceInOrder) {
// get candidate hosts to donate a partition to a starter host
List<MutableHost> fullHostsInOrder = eligibleHosts.values().stream()
.filter(h -> h.freeSpace() == 0) // full hosts
.filter(h -> h.targetSiteCount > 0) // not slotless hosts
.sorted((h1, h2) ->
computeHADistance(starterHost.haGroup.token, h1.haGroup.token) -
computeHADistance(starterHost.haGroup.token, h2.haGroup.token)
) // by distance from starter host
.collect(Collectors.toList());
// walk through all candidate hosts, swapping at most one partition
for (MutableHost fullHost : fullHostsInOrder) {
assert(fullHost.freeSpace() == 0);
for (MutablePartition partition : fullHost.partitions) {
// skip moving this one if we're already a replica
if (starterHost.partitions.contains(partition)) continue;
// move it!
starterHost.partitions.add(partition);
partition.hosts.add(starterHost);
fullHost.partitions.remove(partition);
partition.hosts.remove(fullHost);
assert(starterHost.partitions.size() <= starterHost.targetSiteCount);
                    assert(fullHost.partitions.size() <= fullHost.targetSiteCount);
return true;
}
}
}
// didn't shift anything
return false;
}
private static int countHostsWithFreeSpace(Map<Integer, MutableHost> eligibleHosts) {
int freeSpaceHostCount = eligibleHosts.values().stream()
.mapToInt(h -> (h.targetSiteCount > h.partitions.size()) ? 1 : 0)
.sum();
return freeSpaceHostCount;
}
    private static void scanPeerPartitions(Set<MutablePartition> partitionsToScan,
List<MutablePartition> partitionGroup,
MutablePartition partition)
{
partitionsToScan.remove(partition);
partitionGroup.add(partition);
for (MutableHost host : partition.hosts) {
for (MutablePartition peer : host.partitions) {
if (partitionsToScan.contains(peer)) {
                    scanPeerPartitions(partitionsToScan, partitionGroup, peer);
}
}
}
}
private static void assignLeadersToPartitionsThatNeedThem(
Map<Integer, MutableHost> mutableHostMap,
Map<Integer, MutablePartition> mutablePartitionMap)
{
// clean up any partitions with leaders that don't exist
// (this is used by remove node, not during new cluster forming)
mutablePartitionMap.values().stream()
.filter(p -> p.leader != null)
// if a leader isn't in the current set of hosts, set to null
.filter(p -> mutableHostMap.containsKey(p.leader.id) == false)
.forEach(p -> p.leader = null);
// sort partitions by small k, so we can assign less flexible partitions first
List<MutablePartition> leaderlessPartitionsSortedByK = mutablePartitionMap.values().stream()
.filter(p -> p.leader == null)
.sorted((p1, p2) -> p1.k - p2.k)
.collect(Collectors.toList());
// pick a leader for each partition based on the host that is least full of leaders
for (MutablePartition partition : leaderlessPartitionsSortedByK) {
// find host with fewest leaders
MutableHost leaderHost = partition.hosts.stream()
.min((h1, h2) -> h1.leaderCount() - h2.leaderCount()).get();
partition.leader = leaderHost;
assert(partition.hosts.contains(leaderHost));
}
// run through and shift leaders from hosts with high partition counts to those with low ones
// iterate until it's not possible to shift things this way
//
        // There might be better ways to do this that involve multi-swaps, but this is probably decent
boolean foundAMove;
do {
foundAMove = false;
for (MutablePartition partition : leaderlessPartitionsSortedByK) {
int loadOfLeadHost = partition.leader.leaderCount();
                MutableHost hostWithFewestLeaders = partition.hosts.stream()
                        .min((h1, h2) -> h1.leaderCount() - h2.leaderCount()).get();
                if ((loadOfLeadHost - hostWithFewestLeaders.leaderCount()) >= 2) {
                    foundAMove = true;
                    partition.leader = hostWithFewestLeaders;
break;
}
}
} while (foundAMove);
}
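    /**
     * Convenience builder: create a topology from per-host site counts, host
     * placement-group tokens, and a k-factor. The partition count is
     * totalSites / (kfactor + 1). A hypothetical example:
     * <pre>{@code
     * // three hosts with 4 sites each and k = 1 -> 12 / 2 = 6 partitions
     * Map<Integer, Integer> sitesPerHost = ImmutableMap.of(0, 4, 1, 4, 2, 4);
     * Map<Integer, String> hostGroups = ImmutableMap.of(0, "g0", 1, "g0", 2, "g1");
     * AbstractTopology topo = AbstractTopology.getTopology(sitesPerHost, hostGroups, 1);
     * }</pre>
     */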
public static AbstractTopology getTopology(
Map<Integer, Integer> sitesPerHostMap,
Map<Integer, String> hostGroups,
int kfactor)
{
// host descriptions
        HostDescription[] hosts = new HostDescription[sitesPerHostMap.size()];
int i = 0;
for (Map.Entry<Integer, Integer> e : sitesPerHostMap.entrySet()) {
int hostId = e.getKey();
int sitesCount = e.getValue();
hosts[i++] = new HostDescription(hostId, sitesCount, hostGroups.get(hostId));
}
// partition descriptions
int totalSites = 0;
for (Map.Entry<Integer, Integer> entry : sitesPerHostMap.entrySet()) {
totalSites += entry.getValue();
}
int partitionCount = totalSites / (kfactor + 1);
PartitionDescription[] partitions = new PartitionDescription[partitionCount];
for (int j = 0; j < partitionCount; j++) {
partitions[j] = new PartitionDescription(kfactor);
}
// get topology
AbstractTopology abstractTopo =
AbstractTopology.mutateAddHosts(AbstractTopology.EMPTY_TOPOLOGY, hosts);
        abstractTopo = AbstractTopology.mutateAddPartitionsToEmptyHosts(abstractTopo, partitions);
return abstractTopo;
}
public int getHostCount() {
return hostsById.size();
}
public int getPartitionCount() {
return partitionsById.size();
}
/**
     * Get all the host ids in the partition group to which the host with the given host id belongs.
* @param hostId the given hostId
* @return all the hostIds in the partition group
*/
public Set<Integer> getPartitionGroupHostIds(int hostId) {
Set<Integer> partitionGroupHostIds = Sets.newHashSet();
for (Integer pid : getPartitionIdList(hostId)) {
Partition p = partitionsById.get(pid);
if (p != null) {
partitionGroupHostIds.addAll(p.hostIds);
}
}
return partitionGroupHostIds;
}
public List<Integer> getPartitionIdList(int hostId) {
Host h = hostsById.get(hostId);
return (h != null) ? h.getSortedPartitionIdList() : null;
}
    public int getReplicationFactor() {
        // assumes a non-empty topology in which all partitions share the same k-factor
        Partition partition = partitionsById.values().iterator().next();
        return partition.k;
    }
public boolean hasMissingPartitions() {
Set<Partition> partitions = Sets.newHashSet();
for (Host host : hostsById.values()) {
if (!host.isMissing) {
partitions.addAll(host.partitions);
}
}
return getPartitionCount() > partitions.size();
}
/**
     * Reassign partition leaders from the missing hosts to surviving hosts.
     * @param topology current topology
     * @param missingHosts the hosts whose partition leaders will be reassigned
* @return new AbstractTopology
*/
public static AbstractTopology shiftPartitionLeaders(AbstractTopology topology, Set<Integer> missingHosts) {
if (missingHosts == null || missingHosts.isEmpty()) {
return topology;
}
Map<Integer, MutableHost> mutableHostMap = new TreeMap<>();
Map<Integer, MutablePartition> mutablePartitionMap = new TreeMap<>();
// create mutable missing Hosts without partitions
for (Host host : topology.hostsById.values()) {
final MutableHost mutableHost = new MutableHost(host.id, host.targetSiteCount, host.haGroup);
if (missingHosts.contains(host.id)) {
mutableHost.markHostMissing(true);
}
mutableHostMap.put(host.id, mutableHost);
}
for (Partition partition : topology.partitionsById.values()) {
MutablePartition mp = new MutablePartition(partition.id, partition.k);
mutablePartitionMap.put(mp.id, mp);
for (Integer hId : partition.hostIds) {
final MutableHost mutableHost = mutableHostMap.get(hId);
mp.hosts.add(mutableHost);
mutableHost.partitions.add(mp);
}
int leaderId = partition.leaderHostId;
if (missingHosts.contains(leaderId)) {
                List<Host> prospectiveHosts = Lists.newArrayList();
                for (Host host : topology.hostsById.values()) {
                    if (!missingHosts.contains(host.id)) {
                        List<Integer> partitionListOnNonMissingHost = topology.getPartitionIdList(host.id);
                        if (partitionListOnNonMissingHost.contains(partition.id)) {
                            prospectiveHosts.add(host);
                        }
                    }
                }
                // place the partition master on a node that hosts the partition and has the fewest masters
                assert(!prospectiveHosts.isEmpty());
                if (prospectiveHosts.size() > 1) {
                    prospectiveHosts.sort((Host a, Host b) -> {
                        return (a.getleaderCount() - b.getleaderCount());
                    });
                }
                leaderId = prospectiveHosts.get(0).id;
partition.leaderHostId = leaderId;
}
mp.leader = mutableHostMap.get(leaderId);
}
return convertMutablesToTopology(topology.version, mutableHostMap, mutablePartitionMap);
}
/**
     * Sort all nodes by host-group distance from the local host, farthest first, grouped by distance; the local host id is excluded.
* @param hostId the local host id
* @param hostGroups a host id to group map
* @return sorted grouped host ids from farthest to nearest
*/
@SuppressWarnings("unchecked")
public static List<Collection<Integer>> sortHostIdByHGDistance(int hostId, Map<Integer, String> hostGroups) {
String localHostGroup = hostGroups.get(hostId);
Preconditions.checkArgument(localHostGroup != null);
// Memorize the distance, map the distance to host ids.
Multimap<Integer, Integer> distanceMap = LinkedListMultimap.create();
for (Map.Entry<Integer, String> entry : hostGroups.entrySet()) {
if (hostId == entry.getKey()) continue;
distanceMap.put(computeHADistance(localHostGroup, entry.getValue()), entry.getKey());
}
        // sort the multimap of distance to host ids by distance in descending order
        // and collect the host ids in lists. For example, if the distance map contains
        // 1=[0,2,3], 3=[1,4], 4=[5,6,7], the result will be [5,6,7], [1,4], [0,2,3].
List<Collection<Integer>> result = distanceMap.asMap().entrySet().stream()
.sorted(Comparator.comparingInt(k->((Entry<Integer, Integer>) k).getKey()).reversed())
.map(x->x.getValue())
.collect(Collectors.toList());
return result;
}
/**
* Best effort to find the matching host on the existing topology from ZK
* Use the placement group of the recovering host to match a lost node in the topology
* @param topology The topology
* @param liveHosts The live host ids
* @param localHostId The rejoining host id
* @param placementGroup The rejoining placement group
     * @return the recovered topology if a matching node is found, otherwise null
*/
public static AbstractTopology mutateRecoverTopology(AbstractTopology topology,
Set<Integer> liveHosts, int localHostId, String placementGroup) {
Map<Integer, MutableHost> mutableHostMap = new TreeMap<>();
Map<Integer, MutablePartition> mutablePartitionMap = new TreeMap<>();
// create mutable hosts without partitions
int recoveredHostId = -1;
Map<String, Set<Integer>> haGroupMaps = Maps.newHashMap();
for (Host host : topology.hostsById.values()) {
int hostId = host.id;
            // recover onto the first non-live node in the same placement group
if (host.haGroup.token.equalsIgnoreCase(placementGroup) &&
!liveHosts.contains(hostId) && recoveredHostId < 0) {
recoveredHostId = host.id;
hostId = localHostId;
}
Set<Integer> groupHostIds = haGroupMaps.get(host.haGroup.token);
if (groupHostIds == null) {
groupHostIds = Sets.newHashSet();
haGroupMaps.put(host.haGroup.token, groupHostIds);
}
groupHostIds.add(hostId);
final MutableHost mutableHost = new MutableHost(hostId, host.targetSiteCount, null);
mutableHostMap.put(hostId, mutableHost);
}
//no matching candidate found.
if (recoveredHostId < 0) {
return null;
}
//update placement groups with recovering host
for (Map.Entry<String, Set<Integer>> entry : haGroupMaps.entrySet()) {
HAGroup haGroup = new HAGroup(entry.getKey(), entry.getValue().stream().mapToInt(Number::intValue).toArray());
for (Integer hostId : entry.getValue()) {
final MutableHost mutableHost = mutableHostMap.get(hostId);
mutableHost.haGroup = haGroup;
}
}
//move partitions to host
for (Partition partition : topology.partitionsById.values()) {
MutablePartition mp = new MutablePartition(partition.id, partition.k);
mutablePartitionMap.put(mp.id, mp);
for (Integer hId : partition.hostIds) {
int hostId = (hId == recoveredHostId) ? localHostId : hId;
final MutableHost mutableHost = mutableHostMap.get(hostId);
mp.hosts.add(mutableHost);
mutableHost.partitions.add(mp);
}
int leader = (partition.leaderHostId == recoveredHostId) ? localHostId : partition.leaderHostId;
mp.leader = mutableHostMap.get(leader);
}
return convertMutablesToTopology(topology.version, mutableHostMap, mutablePartitionMap);
}
}