/*
* @(#) RackAwareOdklStrategy.java
* Created May 9, 2012 by oleg
* (C) ONE, SIA
*/
package org.apache.cassandra.locator;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.NavigableSet;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.cassandra.config.ConfigurationException;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.dht.OdklDomainPartitioner;
import org.apache.cassandra.dht.StringToken;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.utils.FBUtilities;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
/**
* In addition to even (re)distribution of replicas across all cluster nodes, it ensures no replicas of the same key hit
* the endpoint in the same rack.
*
* It uses {@link IEndPointSnitch} to get information about location of nodes.
*
* This strategy requires the number of unique racks to match the replication factor.
*
* It works by extracting a subring from the ring for every unique rack. Each replica is placed considering only the
* nodes of one individual subring. Replica #0 (AKA master) is placed on the 1st subring, the next replica on the 2nd subring and so on.
*
* For example, imagine cluster with RF=2 and ring
*
* token endpoint rack
* 0 127.0.0.0 RACK0
* 1 127.0.0.1 RACK1
* 2 127.0.0.2 RACK0
* 3 127.0.0.3 RACK1
*
* 2 rings will be extracted.
* 1st:
* token endpoint rack
* 0 127.0.0.0 RACK0
* 2 127.0.0.2 RACK0
*
* and 2nd
* token endpoint rack
* 1 127.0.0.1 RACK1
* 3 127.0.0.3 RACK1
*
* for the 1st subring domain value will be used as is.
* for the 2nd, it will be shuffled according to {@link #shuffle(int)} to 0x55
*
* So when placing a row with key = 0, the first replica, according to the standard cassandra ring algorithm, will be
* placed on node 127.0.0.0 and the second replica will be placed on node 127.0.0.1 (because, considering only the 2nd
* subring, we see min token == 1 with the wrapping range (3,1], which includes rows with token == 0x55).
*
*
* @author Oleg Anastasyev<oa@hq.one.lv>
*
*/
public class RackAwareOdklStrategy extends OdklEvenStrategy
{
public RackAwareOdklStrategy(TokenMetadata tokenMetadata, IEndPointSnitch snitch) throws ConfigurationException
{
super(tokenMetadata,snitch);
if (! (snitch instanceof AbstractNetworkTopologySnitch) )
throw new ConfigurationException("Invalid EndPoint snith configured for this replication strategy. You must select one with network topology information");
if (snitch instanceof GossipNetworkTopologySnith) {
GossipNetworkTopologySnith pfs = (GossipNetworkTopologySnith) snitch;
validate(pfs);
}
}
/**
 * Checks that every non-system table has a replication factor equal to the number of
 * racks configured in the snitch, logging the known racks first.
 *
 * @param pfs snitch supplying the configured racks
 * @throws ConfigurationException for the first table whose replication factor differs
 *         from the number of configured racks
 */
void validate(GossipNetworkTopologySnith pfs)
        throws ConfigurationException
{
    final Set<String> knownRacks = pfs.getConfiguredRacks();
    logger_.info("RackAwareOdklStrategy (re)configuring with the following known racks: "+knownRacks);

    for (String table : DatabaseDescriptor.getNonSystemTables())
    {
        final boolean matchesRackCount = DatabaseDescriptor.getReplicationFactor(table) == knownRacks.size();
        if (matchesRackCount)
        {
            continue;
        }
        throw new ConfigurationException("Number of unique racks in AllowedLocations must match replication factor of "+table);
    }
}
/**
 * Picks one endpoint per rack for the given key token.
 *
 * The domain is parsed from the first two hex characters of the key token. A per-rack
 * token is prepared by repeatedly applying {@code shuffle} to the domain, and for each
 * rack the endpoint owning that token within the rack's own subring is selected. The
 * walk over racks starts at {@code domain % replicas} and wraps around.
 *
 * NOTE(review): the per-rack token array is sized by the replication factor but filled
 * once per rack, so this relies on the number of racks being equal to the replication
 * factor of the table - confirm this is guaranteed for every snitch in use.
 *
 * @param keyToken token of the row key; its string form must start with two hex digits
 * @param metadata ring to place the key on
 * @param table table whose replication factor determines the number of endpoints
 * @return the chosen endpoints, or an empty list if the ring has no tokens
 */
protected ArrayList<InetAddress> doCalculateEndpoints(Token keyToken,
        TokenMetadata metadata, String table)
{
    final int replicas = DatabaseDescriptor.getReplicationFactor(table);
    final ArrayList<InetAddress> chosen = new ArrayList<InetAddress>(replicas);

    final List<Token> ring = metadata.sortedTokens();
    if (ring.isEmpty())
    {
        return chosen;
    }

    final String[] racks = ringRacks(metadata, ring).toArray(new String[replicas]);
    final StringToken[] rackDomain = new StringToken[replicas];

    final String key = keyToken.toString();
    int domain = Integer.parseInt(key.substring(0, 2), 16) & 0xFF;

    // starting from different index each time to make calls even distributed
    // across replicas in each rack
    final int firstRack = domain % replicas;

    // Rack #0 sees the domain as is, every following rack sees it shuffled once more.
    for (int r = 0; r < racks.length; r++)
    {
        rackDomain[r] = odklPartitioner.toStringToken(domain, key);
        domain = shuffle(domain);
    }

    // Exactly one endpoint is added per pass, so this runs once per replica.
    for (int placed = 0; placed < replicas; placed++)
    {
        final int rackIndex = (firstRack + placed) % replicas;
        final String rack = racks[rackIndex];

        final List<Token> rackTokens = getReplicaTokens(metadata, rack);
        assert !rackTokens.isEmpty() : "No nodes in ring found for rack "+rack+". This replication strategy requires you to have nodes for all racks you configured in AllowedLocations";

        final Token owner = TokenMetadata.firstToken(rackTokens, rackDomain[rackIndex]);
        chosen.add(metadata.getEndPoint(owner));
    }
    return chosen;
}
/**
 * Returns the sorted set of racks this strategy works with.
 *
 * With a {@link GossipNetworkTopologySnith} the racks come from the snitch configuration;
 * with any other snitch they are collected by asking the snitch for the rack of the
 * endpoint behind every token in {@code sortedTokens}.
 *
 * @param metadata ring used to resolve tokens to endpoints
 * @param sortedTokens tokens of the ring to inspect
 * @return the racks in their natural (sorted) order
 */
public NavigableSet<String> ringRacks(TokenMetadata metadata, List<Token> sortedTokens)
{
    if (!(snitch_ instanceof GossipNetworkTopologySnith))
    {
        final TreeSet<String> discovered = new TreeSet<String>();
        for (Token token : sortedTokens)
        {
            final InetAddress owner = metadata.getEndPoint(token);
            discovered.add(snitch_.getRack(owner));
        }
        return discovered;
    }

    final GossipNetworkTopologySnith gossipSnitch = (GossipNetworkTopologySnith) snitch_;
    return gossipSnitch.getConfiguredRacks();
}
/**
 * Looks up the rack of an endpoint by delegating to the configured snitch.
 *
 * @param endp endpoint to look up
 * @return whatever the snitch reports as the rack of {@code endp}
 */
public String getRack(InetAddress endp)
{
    final String rack = snitch_.getRack(endp);
    return rack;
}
/**
 * Extracts the subring of one rack: every token of the ring whose endpoint the snitch
 * places in {@code rack}, in ring order.
 *
 * @param metadata ring to filter
 * @param rack rack name to match
 * @return tokens owned by endpoints of that rack; empty if none match
 */
protected List<Token> getReplicaTokens(TokenMetadata metadata, String rack)
{
    final List<Token> ring = metadata.sortedTokens();
    final ArrayList<Token> subring = new ArrayList<Token>(ring.size());
    for (Token candidate : ring)
    {
        final String candidateRack = snitch_.getRack(metadata.getEndPoint(candidate));
        if (!candidateRack.equals(rack))
        {
            continue;
        }
        subring.add(candidate);
    }
    return subring;
}
/**
 * Re-validates the snitch configuration (gossip snitch only) and then clears the
 * endpoint cache through the superclass. If validation fails, the failure is logged
 * and the superclass method is not invoked.
 *
 * @see org.apache.cassandra.locator.AbstractReplicationStrategy#clearEndpointCache()
 */
@Override
public void clearEndpointCache()
{
    try
    {
        final boolean gossipDriven = snitch_ instanceof GossipNetworkTopologySnith;
        if (gossipDriven)
        {
            validate((GossipNetworkTopologySnith) snitch_);
        }
        super.clearEndpointCache();
    }
    catch (ConfigurationException e)
    {
        logger_.error("Cannot reconfigure: "+e);
    }
}
/**
 * Ad-hoc console harness for this strategy.
 *
 * It builds a hard-coded 12-token ring and an address-to-"DC:RACK" topology, then for
 * domains 0..255 of table "Likes":
 * prints the natural endpoints, prints per-endpoint domain lists, writes copy/check
 * shell scripts under /tmp/ram, replays a list of bootstraps through
 * {@link #testBootstrap}, and finally prints fallback-endpoint counters for simulated
 * primary, rack and double failures.
 *
 * Any exception is printed to stderr and otherwise ignored.
 *
 * @param args unused
 */
public static void main(String[] args)
{
try {
TokenMetadata meta = new TokenMetadata();
OdklDomainPartitioner pp = new OdklDomainPartitioner();
Properties topology = new Properties();
// for (int i=0;i<255;i++)
// {
// meta.updateNormalToken( pp.toStringToken(i) , InetAddress.getByName("127.0.0."+i) );
//
// topology.put("127.0.0."+i, "DC1:RAC"+i % 3);
// }
// NOTE(review): 'e' is only referenced by the commented-out ring definitions below.
int e=0;
// meta.updateNormalToken( new StringToken("00") , InetAddress.getByName("192.168.36.86") );
// meta.updateNormalToken( new StringToken("2a") , InetAddress.getByName("192.168.36.87") );
// meta.updateNormalToken( new StringToken("55") , InetAddress.getByName("192.168.36.88") );
// meta.updateNormalToken( new StringToken("80") , InetAddress.getByName("192.168.36.89") );
// meta.updateNormalToken( new StringToken("aa") , InetAddress.getByName("192.168.10.247") );
// meta.updateNormalToken( new StringToken("d5") , InetAddress.getByName("192.168.10.248") );
//
//
//
// topology.put("192.168.36.86","DL:RACK1");
// topology.put("192.168.36.87","DL:RACK2");
// topology.put("192.168.36.88","DL:RACK3");
// topology.put("192.168.36.89","DL:RACK1");
//
// topology.put("192.168.10.247","DL:RACK2");
// topology.put("192.168.10.248","DL:RACK3");
// int e=0;
// meta.updateNormalToken( new StringToken("00") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("15") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("2a") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("40") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("55") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("6a") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("80") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("95") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("aa") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("c0") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("d5") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("ea") , InetAddress.getByName("127.0.0."+e++) );
// meta.updateNormalToken( new StringToken("00") , InetAddress.getByName("192.168.38.112") );
// meta.updateNormalToken( new StringToken("15") , InetAddress.getByName("192.168.11.244") );
// meta.updateNormalToken( new StringToken("2a") , InetAddress.getByName("192.168.48.169") );
// meta.updateNormalToken( new StringToken("40") , InetAddress.getByName("192.168.38.113") );
// meta.updateNormalToken( new StringToken("55") , InetAddress.getByName("192.168.11.245") );
// meta.updateNormalToken( new StringToken("6a") , InetAddress.getByName("192.168.48.170") );
// meta.updateNormalToken( new StringToken("80") , InetAddress.getByName("192.168.38.114") );
// meta.updateNormalToken( new StringToken("95") , InetAddress.getByName("192.168.11.246") );
// meta.updateNormalToken( new StringToken("aa") , InetAddress.getByName("192.168.48.171") );
// meta.updateNormalToken( new StringToken("c0") , InetAddress.getByName("192.168.38.115") );
// meta.updateNormalToken( new StringToken("d5") , InetAddress.getByName("192.168.11.247") );
// meta.updateNormalToken( new StringToken("ea") , InetAddress.getByName("192.168.48.172") );
//
// Topology entries for the addresses that are only added to the ring later, by the
// 'bootstraps' list further down; they are not part of 'meta'.
topology.put("192.168.38.112","DL:DL" );
topology.put("192.168.11.244","M100:M100");
topology.put("192.168.48.169","KV:KV" );
topology.put("192.168.38.113","DL:DL" );
topology.put("192.168.11.245","M100:M100");
topology.put("192.168.48.170","KV:KV" );
topology.put("192.168.38.114","DL:DL" );
topology.put("192.168.11.246","M100:M100");
topology.put("192.168.48.171","KV:KV" );
topology.put("192.168.38.115","DL:DL" );
topology.put("192.168.11.247","M100:M100");
topology.put("192.168.48.172","KV:KV" );
// The initial ring: 12 tokens, with the owning addresses cycling through three address groups.
meta.updateNormalToken( new StringToken("0a") , InetAddress.getByName("192.168.49.135") );
meta.updateNormalToken( new StringToken("20") , InetAddress.getByName("192.168.38.227") );
meta.updateNormalToken( new StringToken("35") , InetAddress.getByName("192.168.12.76") );
meta.updateNormalToken( new StringToken("4a") , InetAddress.getByName("192.168.49.136") );
meta.updateNormalToken( new StringToken("60") , InetAddress.getByName("192.168.38.228") );
meta.updateNormalToken( new StringToken("75") , InetAddress.getByName("192.168.12.77") );
meta.updateNormalToken( new StringToken("8a") , InetAddress.getByName("192.168.49.137") );
meta.updateNormalToken( new StringToken("a0") , InetAddress.getByName("192.168.38.229") );
meta.updateNormalToken( new StringToken("b5") , InetAddress.getByName("192.168.12.78") );
meta.updateNormalToken( new StringToken("ca") , InetAddress.getByName("192.168.49.138") );
meta.updateNormalToken( new StringToken("e0") , InetAddress.getByName("192.168.38.230") );
meta.updateNormalToken( new StringToken("f5") , InetAddress.getByName("192.168.12.79") );
// Topology entries ("DC:RACK") for the addresses of the initial ring.
topology.put("192.168.49.135","KV:KV" );
topology.put("192.168.38.227","DL:DL" );
topology.put("192.168.12.76","M100:M100" );
topology.put("192.168.49.136","KV:KV" );
topology.put("192.168.38.228","DL:DL" );
topology.put("192.168.12.77","M100:M100" );
topology.put("192.168.49.137","KV:KV" );
topology.put("192.168.38.229","DL:DL" );
topology.put("192.168.12.78","M100:M100" );
topology.put("192.168.49.138","KV:KV" );
topology.put("192.168.38.230","DL:DL" );
topology.put("192.168.12.79","M100:M100" );
// for (int i=0;i<12;i++)
// {
// topology.put("127.0.0."+i, "DC1:RAC"+i % 3);
// }
// The strategy is built on a fresh, empty TokenMetadata; the ring under test ('meta')
// is passed explicitly to every getNaturalEndpoints call below. validate() is
// overridden to a no-op, so no replication-factor check is performed in this harness.
RackAwareOdklStrategy o = new RackAwareOdklStrategy(new TokenMetadata(), new PropertyFileSnitch(topology)) {
void validate(GossipNetworkTopologySnith pfs)
throws ConfigurationException
{
}
// protected int shuffle(int domain)
// {
// return super.shuffle( super.shuffle(domain) );
// }
};
// Counter map ordered by the LAST octet of the address only.
// NOTE(review): two addresses sharing the last octet would be treated as the same key;
// the addresses hard-coded above all have distinct last octets.
TreeMap<InetAddress, AtomicInteger> cc = new TreeMap<InetAddress, AtomicInteger>(new Comparator<InetAddress>()
{
@Override
public int compare(InetAddress o1, InetAddress o2)
{
return new Integer(o1.getAddress()[3] & 0xFF).compareTo(new Integer(o2.getAddress()[3] & 0xFF));
}
});
int startd=0, endd=256;
// Phase 1: print the endpoints of every domain and remember which domains land on which endpoint.
Multimap<InetAddress, String> endpDomains = ArrayListMultimap.create();
for (int i=startd;i<endd;i++)
{
List<InetAddress> endpoints = o.getNaturalEndpoints(pp.toStringToken(i,"000"), meta, "Likes");
System.out.println( Integer.toHexString(i)+" => "+Arrays.toString(endpoints.toArray()));
for (InetAddress end : endpoints) {
// NOTE(review): the counts gathered in 'cc' here are never printed; 'cc' is cleared
// before its first use in the failure simulations below.
AtomicInteger c = cc.get(end);
if (c==null)
cc.put(end,new AtomicInteger(1));
else
c.incrementAndGet();
endpDomains.put(end, Integer.toHexString(i));
}
}
System.out.println("Per endpoint Counters:");
for (InetAddress i : endpDomains.asMap().keySet()) {
System.out.println(i.toString()+"="+endpDomains.get(i).size()+" "+endpDomains.get(i));
}
// Phase 2: write the copy and check shell scripts for every endpoint (files under /tmp/ram).
for (InetAddress i : endpDomains.asMap().keySet()) {
generateCopyScript(i,endpDomains.get(i),(String)topology.get(i.getHostAddress()));
generateCheckScript(i,endpDomains.get(i),(String)topology.get(i.getHostAddress()));
}
// Phase 3: {token, address} pairs bootstrapped one after another on top of the initial ring.
String[][] bootstraps =
{
// {"d5","192.168.10.248"},
// {"80","192.168.36.89"},
// {"2a","192.168.36.87"},
// {"0a","192.168.49.135"},
// {"4a","192.168.49.136"},
// {"8a","192.168.49.137"},
// {"ca","192.168.49.138"},
//
// {"f5","192.168.12.79"},
// {"b5","192.168.12.78"},
// {"75","192.168.12.77"},
// {"35","192.168.12.76"},
//
// {"e0","192.168.38.230"},
// {"a0","192.168.38.229"},
// {"60","192.168.38.228"},
// {"20","192.168.38.227"},
{"00","192.168.38.112"},
{"15","192.168.11.244"},
{"2a","192.168.48.169"},
{"40","192.168.38.113"},
{"55","192.168.11.245"},
{"6a","192.168.48.170"},
{"80","192.168.38.114"},
{"95","192.168.11.246"},
{"aa","192.168.48.171"},
{"c0","192.168.38.115"},
{"d5","192.168.11.247"},
{"ea","192.168.48.172"},
};
// Each step works on the ring returned by the previous step; 'meta' itself is passed
// only to the first call, and the later phases keep using 'meta'.
TokenMetadata btm = meta;
for (String[] task : bootstraps) {
btm = testBootstrap(pp, o, btm, task[0], task[1]);
}
// Phase 4: for every address in the topology, pretend it failed as the first endpoint
// of a domain and count which endpoint testChooseEndpoint picks instead.
for (Object failed : topology.keySet())
{
System.out.println("Calls when primary fail:"+failed);
cc.clear();
for (int i=0;i<256;i++)
{
// NOTE(review): the fixed array size 3 assumes three endpoints per key - confirm.
InetAddress[] endpoints = o.getNaturalEndpoints(pp.toStringToken(i,"000"), meta, "Likes").toArray(new InetAddress[3]);
if (endpoints[0].getHostAddress().equals(failed))
endpoints[0]=null;
else
continue;
InetAddress end=testChooseEndpoint(i, endpoints, 0);
// A second try (tryCount 1) is expected to pick a different endpoint than the first try.
if (end.equals(testChooseEndpoint(i, endpoints, 1)))
System.out.println("Dup choosen: "+i+"=>"+end);
if (topology.get(failed).equals(topology.get(end.getHostAddress()) ) )
continue; // same rack
AtomicInteger c = cc.get(end);
if (c==null)
cc.put(end,new AtomicInteger(1));
else
c.incrementAndGet();
}
for (java.util.Map.Entry<InetAddress, AtomicInteger> i : cc.entrySet()) {
System.out.println(i.getKey().toString()+"="+i.getValue());
}
}
// Phase 5: same simulation, but keyed by a whole "DC:RACK" value of the topology
// instead of a single address.
for (Object failed : new HashSet( topology.values() ))
{
System.out.println("Calls when rack fail:"+failed);
cc.clear();
for (int i=0;i<256;i++)
{
InetAddress[] endpoints = o.getNaturalEndpoints(pp.toStringToken(i,"000"), meta, "Likes").toArray(new InetAddress[3]);
if ( topology.get( endpoints[0].getHostAddress() ).equals(failed))
endpoints[0]=null;
else
continue;
InetAddress[] copyOf = Arrays.copyOf(endpoints,3);
// copyOf[1] = testChooseEndpoint(i, endpoints, 0);
// copyOf[2] = testChooseEndpoint(i, endpoints, 1);
InetAddress end=testChooseEndpoint(i, copyOf, 0);
if (end.equals(testChooseEndpoint(i, copyOf, 1)))
System.out.println("Dup choosen: "+i+"=>"+end);
AtomicInteger c = cc.get(end);
if (c==null)
cc.put(end,new AtomicInteger(1));
else
c.incrementAndGet();
}
for (java.util.Map.Entry<InetAddress, AtomicInteger> i : cc.entrySet()) {
System.out.println(i.getKey().toString()+"="+i.getValue());
}
}
// Phase 6: the first two endpoints of every domain are nulled out and the pick among
// the remaining slots is counted.
System.out.println("Calls when 2 fail:");
cc.clear();
for (int i=startd;i<endd;i++)
{
InetAddress[] endpoints = o.getNaturalEndpoints(pp.toStringToken(i,"000"), meta, "Likes").toArray(new InetAddress[3]);
endpoints[0]=null;
endpoints[1]=null;
InetAddress end=testChooseEndpoint(i, endpoints, 0);
AtomicInteger c = cc.get(end);
if (c==null)
cc.put(end,new AtomicInteger(1));
else
c.incrementAndGet();
}
for (java.util.Map.Entry<InetAddress, AtomicInteger> i : cc.entrySet()) {
System.out.println(i.getKey().toString()+"="+i.getValue());
}
// System.out.println("Address ranges:"+o.getAddressRanges(meta,null).asMap().size());
// for (Entry<InetAddress, Collection<Range>> en : o.getAddressRanges(meta,null).asMap().entrySet()) {
// System.out.println(en.getKey()+" => "+en.getValue());
// }
//
// System.out.println("Range addresses:"+o.getRangeAddresses(meta,null).asMap().size());
// for (Entry<Range, Collection<InetAddress>> en : new TreeMap<Range, Collection<InetAddress>>( o.getRangeAddresses(meta,null).asMap() ).entrySet()) {
// System.out.println(en.getKey()+" => "+en.getValue());
//
// StringToken stringToken = new StringToken( en.getKey().left.toString()+"01091" );
// Set<InetAddress> s1= new HashSet<InetAddress>(o.calculateNaturalEndpoints( stringToken, meta, null)), s2=new HashSet<InetAddress>(en.getValue());
// if (!s1.equals(s2))
// System.out.println("OILOLO: "+s1+" != "+s2+" "+en.getKey().contains(stringToken));
// }
//
} catch (Exception e) {
// Harness only: report the failure and fall through to the end of main.
e.printStackTrace();
}
}
/**
 * Appends shell commands for one endpoint to two script files under /tmp/ram.
 *
 * The per-endpoint script {@code <dc>-<address>.sh} receives one {@code scp} command per
 * domain; the per-datacenter script {@code <dc>.sh} receives a line that launches the
 * per-endpoint script and a line that ships the endpoint's check script. {@code <dc>} is
 * the part of {@code rack} before the first ':'. Both files are opened in append mode.
 *
 * An {@link IOException} while opening a file is printed and otherwise ignored.
 *
 * @param i endpoint the data is copied to
 * @param domains hex domain names assigned to the endpoint; single-digit names are
 *        left-padded with '0'
 * @param rack topology value of the endpoint in "DC:RACK" form
 */
private static void generateCopyScript(InetAddress i,
        Collection<String> domains, String rack)
{
    final String host = i.getHostAddress();
    final String dc = rack.split(":")[0];
    try {
        PrintWriter script = new PrintWriter( new FileOutputStream("/tmp/ram/"+dc+"-"+host+".sh",true) );
        // try/finally so the file handle is released even if writing fails half way
        try {
            script.format("echo Copying data to %s... \n", host);
            int c=1;
            for (String domain : domains) {
                if (domain.length()<2)
                    domain="0"+domain;
                script.format("echo ... domain %s, which is %d of %d\n", domain,c++,domains.size());
                script.format("scp -B -q -c arcfour /mnt/db/Likes/*_%s* %s:/mnt/db/Likes/\n\n", domain,host);
            }
            script.format("echo Finished with %s... \n", host);
        } finally {
            script.close();
        }

        script = new PrintWriter( new FileOutputStream("/tmp/ram/"+dc+".sh",true) );
        try {
            script.format("./%s-%s.sh &\n",dc,host);
            script.format("scp -B -q -c arcfour check-%s.sh %s:/mnt/\n\n",host,host);
        } finally {
            script.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Appends, to {@code /tmp/ram/check-<address>.sh}, one shell line per domain that echoes
 * the number of non-"Compacted" files matching that domain under /mnt/db/Likes.
 * The file is opened in append mode.
 *
 * An {@link IOException} while opening the file is printed and otherwise ignored.
 *
 * @param i endpoint the check script is generated for
 * @param domains hex domain names assigned to the endpoint; single-digit names are
 *        left-padded with '0'
 * @param rack not used by this method
 */
private static void generateCheckScript(InetAddress i,
        Collection<String> domains, String rack)
{
    try {
        PrintWriter script = new PrintWriter( new FileOutputStream("/tmp/ram/check-"+i.getHostAddress()+".sh",true) );
        // try/finally so the file handle is released even if writing fails half way
        try {
            for (String domain : domains) {
                if (domain.length()<2)
                    domain="0"+domain;
                script.format("echo 'domain %s check (must divide to 3)': `ls -1 /mnt/db/Likes/*_%s* | grep -v Compacted | wc | awk '{print $1}'`\n", domain, domain);
            }
        } finally {
            script.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Picks a fallback endpoint from slots 1..length-1 of {@code endpoints} (slot 0 is never
 * looked at). The starting slot is derived from the twice-shuffled partition plus
 * {@code tryCount}; slots that hold {@code null} are skipped.
 *
 * @param partition domain number fed through {@link #sh(int)} twice
 * @param endpoints candidate endpoints; entries may be {@code null}
 * @param tryCount offset added to the probe position
 * @return the first non-null endpoint found within {@code endpoints.length - 1} probes,
 *         or {@code null} if none is found
 */
private static InetAddress testChooseEndpoint(int partition,InetAddress[] endpoints,int tryCount)
{
    final int fallbackSlots = endpoints.length - 1;
    for (int probe = 1; probe < endpoints.length; probe++)
    {
        final int offset = (probe + tryCount + sh(sh(partition))) % fallbackSlots;
        final InetAddress candidate = endpoints[1 + offset];
        if (candidate != null)
        {
            return candidate;
        }
    }
    return null;
}
/**
 * Shuffles a domain value: four special bit patterns are remapped explicitly
 * (0 -> 0x55 -> 0xFF -> 0xAA -> 0); every other value has its lowest bit moved to
 * bit 7 while the remaining bits shift right by one.
 *
 * @param domain value to shuffle
 * @return the shuffled value
 */
private static int sh(int domain)
{
    // these special bit patterns need special cure
    if (domain == 0)
    {
        return 0x55;
    }
    if (domain == 0x55)
    {
        return 0xFF;
    }
    if (domain == 0xAA)
    {
        return 0;
    }
    if (domain == 0xFF)
    {
        return 0xAA;
    }

    // others work good with the following
    final int shiftedDown = domain >> 1;
    final int lowBitOnTop = (domain & 1) << 7;
    return shiftedDown | lowBitOnTop;
}
/**
 * Simulates bootstrapping one node and reports how domain ownership changes.
 *
 * The domain-to-endpoint mapping is computed before and after adding {@code token} for
 * {@code endpoint} to a clone of {@code meta}. The mapping after the bootstrap is
 * printed, and an error line is printed for every previously known endpoint that
 * received domains it did not own before.
 *
 * @param pp partitioner used to build the domain tokens
 * @param o strategy under test
 * @param meta ring before the bootstrap; it is cloned, not modified here
 * @param token token of the joining node
 * @param endpoint address of the joining node
 * @return the cloned ring including the new node
 * @throws Exception if the endpoint address cannot be resolved or a callee fails
 */
private static TokenMetadata testBootstrap(OdklDomainPartitioner pp, RackAwareOdklStrategy o, TokenMetadata meta, String token, String endpoint) throws Exception
{
    final Multimap<InetAddress, String> ownedBefore = domainMM(pp, o, meta);

    final TokenMetadata grownRing = meta.cloneOnlyTokenMap();
    grownRing.updateNormalToken(new StringToken(token), InetAddress.getByName(endpoint));
    final Multimap<InetAddress, String> ownedAfter = domainMM(pp, o, grownRing);

    System.out.println("After bootstrap of "+endpoint);
    for (InetAddress node : ownedAfter.asMap().keySet())
    {
        final Collection<String> nodeDomains = ownedAfter.get(node);
        System.out.println(node.toString()+"="+nodeDomains.size()+" "+ownedAfter.get(node));
    }

    // now validating
    for (InetAddress oldNode : ownedBefore.keySet())
    {
        final Set<String> had = new TreeSet<String>(ownedBefore.get(oldNode));
        final Set<String> gained = new TreeSet<String>(ownedAfter.get(oldNode));
        // after bootstrap on new node no domains must be added to old nodes
        gained.removeAll(had);
        if (!gained.isEmpty())
        {
            System.out.println("ERROR WHEN ADDING "+endpoint+" on "+oldNode+" added domains "+gained);
        }
    }
    return grownRing;
}
/**
 * Builds the endpoint-to-domains mapping for domains 0..255 of table "Likes" on the
 * given ring. The strategy's endpoint cache is cleared before endpoints are computed.
 *
 * @param pp partitioner used to build the domain tokens
 * @param o strategy under test
 * @param meta ring to evaluate
 * @return for every endpoint, the hex names of the domains it is an endpoint for
 */
private static Multimap<InetAddress, String> domainMM(OdklDomainPartitioner pp, RackAwareOdklStrategy o, TokenMetadata meta)
{
    o.clearEndpointCache();

    final Multimap<InetAddress, String> domainsByEndpoint = ArrayListMultimap.create();
    for (int domain = 0; domain < 256; domain++)
    {
        final List<InetAddress> replicas = o.getNaturalEndpoints(pp.toStringToken(domain,"000"), meta, "Likes");
        for (InetAddress replica : replicas)
        {
            domainsByEndpoint.put(replica, Integer.toHexString(domain));
        }
    }
    return domainsByEndpoint;
}
}