/** * Copyright 2011 LiveRamp * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.liveramp.hank.ring_group_conductor; import java.io.IOException; import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.liveramp.hank.coordinator.Domain; import com.liveramp.hank.coordinator.DomainAndVersion; import com.liveramp.hank.coordinator.DomainGroup; import com.liveramp.hank.coordinator.Host; import com.liveramp.hank.coordinator.HostCommand; import com.liveramp.hank.coordinator.HostDomain; import com.liveramp.hank.coordinator.HostDomainPartition; import com.liveramp.hank.coordinator.HostState; import com.liveramp.hank.coordinator.Hosts; import com.liveramp.hank.coordinator.Ring; import com.liveramp.hank.coordinator.RingGroup; import com.liveramp.hank.partition_assigner.PartitionAssigner; public class RingGroupUpdateTransitionFunctionImpl implements RingGroupUpdateTransitionFunction { private static Logger LOG = LoggerFactory.getLogger(RingGroupUpdateTransitionFunctionImpl.class); private final PartitionAssigner partitionAssigner; private final int minRingFullyServingObservations; private final int minServingReplicas; private final int minServingAvailabilityBucketReplicas; private final String availabilityBucketKey; private final Map<String, Integer> hostToFullyServingObservations = new HashMap<String, Integer>(); public RingGroupUpdateTransitionFunctionImpl(PartitionAssigner partitionAssigner, int minRingFullyServingObservations, int minServingReplicas, int minServingAvailabilityBucketReplicas, String availabilityBucketKey) throws IOException { this.partitionAssigner = partitionAssigner; this.minRingFullyServingObservations = minRingFullyServingObservations; this.minServingReplicas = minServingReplicas; this.minServingAvailabilityBucketReplicas = minServingAvailabilityBucketReplicas; this.availabilityBucketKey = availabilityBucketKey; } private static boolean isServingAndAboutToServe(Host host) throws IOException { return host.getState().equals(HostState.SERVING) && host.getCurrentCommand() == null && host.getCommandQueue().size() == 0; } /** * Return true iff host is serving and is not about to * stop serving (i.e. there is no current or pending command). And we have observed * that enough times in a row. * * @param host * @param isObserved * @return * @throws IOException */ protected boolean isFullyServing(Host host, boolean isObserved) throws IOException { String key = host.getAddress().toString(); if (!hostToFullyServingObservations.containsKey(key)) { hostToFullyServingObservations.put(key, 0); } if (!isServingAndAboutToServe(host)) { hostToFullyServingObservations.put(key, 0); return false; } // Host is fully serving, but have we observed that enough times? if (hostToFullyServingObservations.get(key) >= minRingFullyServingObservations) { return true; } else { if (isObserved) { // Increment number of observations hostToFullyServingObservations.put(key, hostToFullyServingObservations.get(key) + 1); } return false; } } @Override public void manageTransitions(RingGroup ringGroup) throws IOException { DomainGroup domainGroup = ringGroup.getDomainGroup(); if (domainGroup == null) { // Nothing to do LOG.info("Domain group not found. Nothing to do."); return; } Map<Domain, Map<Integer, Set<Host>>> domainToPartitionToHostsFullyServing = computeDomainToPartitionToHostsFullyServing(ringGroup); for (Ring ring : ringGroup.getRingsSorted()) { partitionAssigner.prepare(ring, domainGroup.getDomainVersions(), ringGroup.getRingGroupConductorMode()); for (Host host : ring.getHostsSorted()) { manageTransitions(host, domainGroup, domainToPartitionToHostsFullyServing); } } } private void manageTransitions(Host host, DomainGroup domainGroup, Map<Domain, Map<Integer, Set<Host>>> domainToPartitionToHostsFullyServing) throws IOException { boolean isAssigned = partitionAssigner.isAssigned(host); boolean isUpToDate = Hosts.isUpToDate(host, domainGroup); boolean isFullyServing = isFullyServing(host, true); // Host is serving, assigned and up-to-date. Do nothing. if (Hosts.isServing(host) && isAssigned && isUpToDate) { LOG.info("Host " + host.getAddress() + " is serving, assigned, and up-to-date. Do nothing."); return; } // Note: numReplicasFullyServing can be null if the host is not serving relevant data LiveReplicaStatus status = computeDataReplicationStatus( domainToPartitionToHostsFullyServing, domainGroup.getDomainVersions(), host); // Not enough replicas are fully serving and the current host is servable. Serve. if (Hosts.isIdle(host) && Hosts.isServable(host) && status == LiveReplicaStatus.UNDER_REPLICATED) { LOG.info("Host " + host.getAddress() + " is idle, servable, and not enough replicas are fully serving. Serve."); Hosts.enqueueCommandIfNotPresent(host, HostCommand.SERVE_DATA); return; } // Host is idle, assigned and up-to-date. Attempt to serve. if (Hosts.isIdle(host) && isAssigned && isUpToDate) { LOG.info("Host " + host.getAddress() + " is idle, assigned and up-to-date. Serve."); Hosts.enqueueCommandIfNotPresent(host, HostCommand.SERVE_DATA); return; } if (Hosts.isIdle(host) && isAssigned && !isUpToDate && (status.isFullyReplicated() || !Hosts.isServable(host))) { // Host is idle, assigned, not up-to-date and there are enough replicas serving or it's not servable. Update. LOG.info("Host " + host.getAddress() + " is idle, assigned, not up-to-date, and there are enough replicas serving (or it's not servable). Update."); Hosts.enqueueCommandIfNotPresent(host, HostCommand.EXECUTE_UPDATE); return; } if (isFullyServing && isAssigned && !isUpToDate && status == LiveReplicaStatus.OVER_REPLICATED) { // Host is serving, assigned, not up-to-date and there are more than enough replicas serving. Go idle. LOG.info("Host " + host.getAddress() + " is serving, assigned, not up-to-date, and there are more than enough replicas serving. Go idle."); Hosts.enqueueCommandIfNotPresent(host, HostCommand.GO_TO_IDLE); removeFromReplicasFullyServing(domainToPartitionToHostsFullyServing, host); return; } // Host is idle, and not assigned. Assign. if (Hosts.isIdle(host) && !isAssigned) { LOG.info("Host " + host.getAddress() + " is idle, and not assigned. Assign."); partitionAssigner.assign(host); return; } // Host is serving, not assigned, and there are more than enough replicas serving. Go idle. if (isFullyServing && !isAssigned && status == LiveReplicaStatus.OVER_REPLICATED) { LOG.info("Host " + host.getAddress() + " is serving, not assigned, and there are more than enough replicas serving. Go idle."); Hosts.enqueueCommandIfNotPresent(host, HostCommand.GO_TO_IDLE); removeFromReplicasFullyServing(domainToPartitionToHostsFullyServing, host); return; } LOG.info("Host " + host.getAddress() + ": Nothing to do" + ", isAssigned: " + isAssigned + ", isUpToDate: " + isUpToDate + ", isFullyServing: " + isFullyServing + ", state: " + host.getState() ); } private void removeFromReplicasFullyServing(Map<Domain, Map<Integer, Set<Host>>> domainToPartitionToHostsFullyServing, Host host) { for (Map<Integer, Set<Host>> partitionToHostsFullyServing : domainToPartitionToHostsFullyServing.values()) { for (Set<Host> hosts : partitionToHostsFullyServing.values()) { hosts.remove(host); } } hostToFullyServingObservations.put(host.getAddress().toString(), 0); } private Map<Domain, Map<Integer, Set<Host>>> computeDomainToPartitionToHostsFullyServing(RingGroup ringGroup) throws IOException { Map<Domain, Map<Integer, Set<Host>>> result = new HashMap<Domain, Map<Integer, Set<Host>>>(); // Compute num replicas fully serving for all partitions for (Ring ring : ringGroup.getRings()) { for (Host h : ring.getHosts()) { if (isFullyServing(h, false)) { for (HostDomain hostDomain : h.getAssignedDomains()) { Domain domain = hostDomain.getDomain(); for (HostDomainPartition partition : hostDomain.getPartitions()) { if (!partition.isDeletable() && partition.getCurrentDomainVersion() != null) { int partitionNumber = partition.getPartitionNumber(); Map<Integer, Set<Host>> partitionToNumFullyServing = result.get(domain); if (partitionToNumFullyServing == null) { partitionToNumFullyServing = new HashMap<Integer, Set<Host>>(); result.put(domain, partitionToNumFullyServing); } if (!partitionToNumFullyServing.containsKey(partitionNumber)) { partitionToNumFullyServing.put(partitionNumber, new HashSet<Host>()); } partitionToNumFullyServing.get(partitionNumber).add(h); } } } } } } return result; } enum LiveReplicaStatus { UNDER_REPLICATED, REPLICATED, OVER_REPLICATED; public boolean isFullyReplicated() { return this == REPLICATED || this == OVER_REPLICATED; } } private LiveReplicaStatus computeDataReplicationStatus(Map<Domain, Map<Integer, Set<Host>>> domainToPartitionToHostsFullyServing, Set<DomainAndVersion> domainVersions, Host host) throws IOException { // Build set of relevant domains Set<Domain> relevantDomains = new HashSet<Domain>(); for (DomainAndVersion domainVersion : domainVersions) { relevantDomains.add(domainVersion.getDomain()); } // Compute num replicas fully serving for given host, which is the minimum of the number of replicas // fully serving across all partitions assigned to it (for relevant domains) Set<LiveReplicaStatus> allStatuses = EnumSet.of(LiveReplicaStatus.OVER_REPLICATED); for (HostDomain hostDomain : host.getAssignedDomains()) { Domain domain = hostDomain.getDomain(); // Only consider relevant domains if (relevantDomains.contains(domain)) { Map<Integer, Set<Host>> partitionToNumFullyServing = domainToPartitionToHostsFullyServing.get(hostDomain.getDomain()); if (partitionToNumFullyServing == null) { return LiveReplicaStatus.UNDER_REPLICATED; } for (HostDomainPartition partition : hostDomain.getPartitions()) { if (partitionToNumFullyServing.containsKey(partition.getPartitionNumber())) { Set<Host> servingHosts = partitionToNumFullyServing.get(partition.getPartitionNumber()); allStatuses.add(statusFor(servingHosts.size(), minServingReplicas)); if (availabilityBucketKey != null) { allStatuses.add(statusFor( servingHosts.stream().filter(input -> sameBucket(host, input)).count(), minServingAvailabilityBucketReplicas )); } } } } } return Collections.min(allStatuses); } private LiveReplicaStatus statusFor(long numServing, long numRequired) { if (numServing < numRequired) { return LiveReplicaStatus.UNDER_REPLICATED; } else if (numServing == numRequired) { return LiveReplicaStatus.REPLICATED; } else{ return LiveReplicaStatus.OVER_REPLICATED; } } private boolean sameBucket(Host host1, Host host2) { if (availabilityBucketKey == null) { return true; } return Objects.equals( host1.getEnvironmentFlags().get(availabilityBucketKey), host2.getEnvironmentFlags().get(availabilityBucketKey) ); } }