/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.locator;

import java.net.InetAddress;
import java.util.*;
import java.util.Map.Entry;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.locator.TokenMetadata.Topology;
import org.apache.cassandra.utils.FBUtilities;

import com.google.common.collect.Multimap;

/**
 * This Replication Strategy takes a property file that gives the intended
 * replication factor in each datacenter. The sum total of the datacenter
 * replication factor values should be equal to the keyspace replication
 * factor.
 * <p/>
 * So for example, if the keyspace replication factor is 6, the
 * datacenter replication factors could be 3, 2, and 1 - so 3 replicas in
 * one datacenter, 2 in another, and 1 in another - totalling 6.
 * <p/>
 * This class also caches the Endpoints and invalidates the cache if there is a
 * change in the number of tokens.
 */
public class NetworkTopologyStrategy extends AbstractReplicationStrategy
{
    private final IEndpointSnitch snitch;
    private final Map<String, Integer> datacenters;
    private static final Logger logger = LoggerFactory.getLogger(NetworkTopologyStrategy.class);

    public NetworkTopologyStrategy(String keyspaceName, TokenMetadata tokenMetadata, IEndpointSnitch snitch, Map<String, String> configOptions) throws ConfigurationException
    {
        super(keyspaceName, tokenMetadata, snitch, configOptions);
        this.snitch = snitch;

        Map<String, Integer> newDatacenters = new HashMap<String, Integer>();
        if (configOptions != null)
        {
            for (Entry<String, String> entry : configOptions.entrySet())
            {
                String dc = entry.getKey();
                if (dc.equalsIgnoreCase("replication_factor"))
                    throw new ConfigurationException("replication_factor is an option for SimpleStrategy, not NetworkTopologyStrategy");
                Integer replicas = Integer.valueOf(entry.getValue());
                newDatacenters.put(dc, replicas);
            }
        }

        datacenters = Collections.unmodifiableMap(newDatacenters);
        logger.debug("Configured datacenter replicas are {}", FBUtilities.toString(datacenters));
    }
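
    /*
     * Illustrative sketch (not part of the original source; keyspace and DC names are
     * hypothetical): a keyspace created with
     *
     *     CREATE KEYSPACE example_ks WITH replication =
     *         {'class': 'NetworkTopologyStrategy', 'DC1': '3', 'DC2': '2', 'DC3': '1'};
     *
     * would reach the constructor above with configOptions = {DC1=3, DC2=2, DC3=1}.
     * With that configuration, getReplicationFactor("DC1") returns 3 and the
     * keyspace-wide getReplicationFactor() returns the sum of the per-DC values, 6.
     */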

    /**
     * calculate endpoints in one pass through the tokens by tracking our progress in each DC, rack etc.
     */
    @SuppressWarnings("serial")
    public List<InetAddress> calculateNaturalEndpoints(Token searchToken, TokenMetadata tokenMetadata)
    {
        // we want to preserve insertion order so that the first added endpoint becomes primary
        Set<InetAddress> replicas = new LinkedHashSet<InetAddress>();

        // replicas we have found in each DC
        Map<String, Set<InetAddress>> dcReplicas = new HashMap<String, Set<InetAddress>>(datacenters.size())
        {{
            for (Map.Entry<String, Integer> dc : datacenters.entrySet())
                put(dc.getKey(), new HashSet<InetAddress>(dc.getValue()));
        }};

        Topology topology = tokenMetadata.getTopology();
        // all endpoints in each DC, so we can check when we have exhausted all the members of a DC
        Multimap<String, InetAddress> allEndpoints = topology.getDatacenterEndpoints();
        // all racks in a DC so we can check when we have exhausted all racks in a DC
        Map<String, Multimap<String, InetAddress>> racks = topology.getDatacenterRacks();
        assert !allEndpoints.isEmpty() && !racks.isEmpty() : "not aware of any cluster members";

        // tracks the racks we have already placed replicas in
        Map<String, Set<String>> seenRacks = new HashMap<String, Set<String>>(datacenters.size())
        {{
            for (Map.Entry<String, Integer> dc : datacenters.entrySet())
                put(dc.getKey(), new HashSet<String>());
        }};

        // tracks the endpoints that we skipped over while looking for unique racks
        // when we relax the rack uniqueness we can append this to the current result so we don't have to wind back the iterator
        Map<String, Set<InetAddress>> skippedDcEndpoints = new HashMap<String, Set<InetAddress>>(datacenters.size())
        {{
            for (Map.Entry<String, Integer> dc : datacenters.entrySet())
                put(dc.getKey(), new LinkedHashSet<InetAddress>());
        }};

        Iterator<Token> tokenIter = TokenMetadata.ringIterator(tokenMetadata.sortedTokens(), searchToken, false);
        while (tokenIter.hasNext() && !hasSufficientReplicas(dcReplicas, allEndpoints))
        {
            Token next = tokenIter.next();
            InetAddress ep = tokenMetadata.getEndpoint(next);
            String dc = snitch.getDatacenter(ep);
            // have we already found all replicas for this dc?
            if (!datacenters.containsKey(dc) || hasSufficientReplicas(dc, dcReplicas, allEndpoints))
                continue;
            // can we skip checking the rack?
            if (seenRacks.get(dc).size() == racks.get(dc).keySet().size())
            {
                dcReplicas.get(dc).add(ep);
                replicas.add(ep);
            }
            else
            {
                String rack = snitch.getRack(ep);
                // is this a new rack?
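                // Endpoints in a rack we have already used are parked in skippedDcEndpoints
                // rather than discarded; once every rack in this DC has contributed a replica,
                // they are appended in ring order (below) until the DC reaches its replication
                // factor, which avoids rewinding the token iterator.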
                if (seenRacks.get(dc).contains(rack))
                {
                    skippedDcEndpoints.get(dc).add(ep);
                }
                else
                {
                    dcReplicas.get(dc).add(ep);
                    replicas.add(ep);
                    seenRacks.get(dc).add(rack);
                    // if we've run out of distinct racks, add the hosts we skipped past already (up to RF)
                    if (seenRacks.get(dc).size() == racks.get(dc).keySet().size())
                    {
                        Iterator<InetAddress> skippedIt = skippedDcEndpoints.get(dc).iterator();
                        while (skippedIt.hasNext() && !hasSufficientReplicas(dc, dcReplicas, allEndpoints))
                        {
                            InetAddress nextSkipped = skippedIt.next();
                            dcReplicas.get(dc).add(nextSkipped);
                            replicas.add(nextSkipped);
                        }
                    }
                }
            }
        }

        return new ArrayList<InetAddress>(replicas);
    }

    private boolean hasSufficientReplicas(String dc, Map<String, Set<InetAddress>> dcReplicas, Multimap<String, InetAddress> allEndpoints)
    {
        return dcReplicas.get(dc).size() >= Math.min(allEndpoints.get(dc).size(), getReplicationFactor(dc));
    }

    private boolean hasSufficientReplicas(Map<String, Set<InetAddress>> dcReplicas, Multimap<String, InetAddress> allEndpoints)
    {
        for (String dc : datacenters.keySet())
            if (!hasSufficientReplicas(dc, dcReplicas, allEndpoints))
                return false;
        return true;
    }

    public int getReplicationFactor()
    {
        int total = 0;
        for (int repFactor : datacenters.values())
            total += repFactor;
        return total;
    }

    public int getReplicationFactor(String dc)
    {
        Integer replicas = datacenters.get(dc);
        return replicas == null ? 0 : replicas;
    }

    public Set<String> getDatacenters()
    {
        return datacenters.keySet();
    }

    public void validateOptions() throws ConfigurationException
    {
        for (Entry<String, String> e : this.configOptions.entrySet())
        {
            if (e.getKey().equalsIgnoreCase("replication_factor"))
                throw new ConfigurationException("replication_factor is an option for SimpleStrategy, not NetworkTopologyStrategy");
            validateReplicationFactor(e.getValue());
        }
    }

    public Collection<String> recognizedOptions()
    {
        // We explicitly allow all options
        return null;
    }
}