/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.infrastructure.service;
import com.facebook.infrastructure.analytics.AnalyticsContext;
import com.facebook.infrastructure.concurrent.*;
import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.db.*;
import com.facebook.infrastructure.dht.BootStrapper;
import com.facebook.infrastructure.dht.BootstrapInitiateMessage;
import com.facebook.infrastructure.dht.BootstrapMetadataVerbHandler;
import com.facebook.infrastructure.dht.Range;
import com.facebook.infrastructure.gms.*;
import com.facebook.infrastructure.locator.*;
import com.facebook.infrastructure.net.*;
import com.facebook.infrastructure.net.http.HttpConnection;
import com.facebook.infrastructure.net.io.StreamContextManager;
import com.facebook.infrastructure.tools.MembershipCleanerVerbHandler;
import com.facebook.infrastructure.tools.TokenUpdateVerbHandler;
import com.facebook.infrastructure.utils.LogUtil;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.WatchedEvent;
import org.apache.commons.math.linear.RealMatrix;
import org.apache.commons.math.linear.RealMatrixImpl;
import org.apache.log4j.Logger;
import javax.management.MBeanServer;
import javax.management.ObjectName;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.management.ManagementFactory;
import java.math.BigInteger;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/*
 * This abstraction contains the token/identifier of this node
 * on the identifier space. This token gets gossiped around.
 * This class also maintains histograms of the load information
 * of other nodes in the cluster.
 * Author : Avinash Lakshman ( alakshman@facebook.com ) & Prashant Malik ( pmalik@facebook.com )
 */
public final class StorageService implements IEndPointStateChangeSubscriber, StorageServiceMBean
{
private static Logger logger_ = Logger.getLogger(StorageService.class);
private static final BigInteger prime_ = BigInteger.valueOf(31);
private final static String nodeId_ = "NODE-IDENTIFIER";
private final static String loadAll_ = "LOAD-ALL";
public final static String mutationStage_ = "ROW-MUTATION-STAGE";
public final static String readStage_ = "ROW-READ-STAGE";
public final static String mutationVerbHandler_ = "ROW-MUTATION-VERB-HANDLER";
public final static String rangeVerbHandler_ = "RANGE-VERB-HANDLER";
public final static String tokenVerbHandler_ = "TOKEN-VERB-HANDLER";
public final static String loadVerbHandler_ = "LOAD-VERB-HANDLER";
public final static String binaryVerbHandler_ = "BINARY-VERB-HANDLER";
public final static String readRepairVerbHandler_ = "READ-REPAIR-VERB-HANDLER";
public final static String readVerbHandler_ = "ROW-READ-VERB-HANDLER";
public final static String bootStrapInitiateVerbHandler_ = "BOOTSTRAP-INITIATE-VERB-HANDLER";
public final static String bootStrapInitiateDoneVerbHandler_ = "BOOTSTRAP-INITIATE-DONE-VERB-HANDLER";
public final static String bootStrapTerminateVerbHandler_ = "BOOTSTRAP-TERMINATE-VERB-HANDLER";
public final static String tokenInfoVerbHandler_ = "TOKENINFO-VERB-HANDLER";
public final static String mbrshipCleanerVerbHandler_ = "MBRSHIP-CLEANER-VERB-HANDLER";
public final static String bsMetadataVerbHandler_ = "BS-METADATA-VERB-HANDLER";
public static enum ConsistencyLevel
{
WEAK,
STRONG
};
private static StorageService instance_;
/* Used to lock the factory for creation of StorageService instance */
private static Lock createLock_ = new ReentrantLock();
private static EndPoint tcpAddr_;
private static EndPoint udpAddr_;
public static EndPoint getLocalStorageEndPoint()
{
return tcpAddr_;
}
public static EndPoint getLocalControlEndPoint()
{
return udpAddr_;
}
public static String getHostUrl()
{
return "http://" + tcpAddr_.getHost() + ":" + DatabaseDescriptor.getHttpPort();
}
/*
 * Order preserving hash for the specified key: keys that compare
 * lexicographically byte-wise (up to the first 16 bytes) map to
 * tokens in the same order.
 */
public static BigInteger hash(String key)
{
BigInteger h = BigInteger.ZERO;
int MAX_LENGTH = 16; // old hash generated up to 256**16, but node tokens are up to 256**20. either is probably ok.
byte[] bytes;
try {
bytes = key.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
for (int i = 0; i < MAX_LENGTH && i < bytes.length; i++)
{
int v = (0xFF & bytes[i]);
h = h.add(BigInteger.valueOf(v).multiply(BigInteger.valueOf(256).pow(MAX_LENGTH - i)));
}
return h;
}
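/*
 * A quick worked example of the hash above (illustrative only): for the
 * key "ab", h = 97 * 256^16 + 98 * 256^15. The byte at position i
 * dominates everything that can follow it, since the remaining positions
 * together contribute strictly less than 256^(16 - i). Hence keys that
 * share no common prefix hash in lexicographic order, which is what makes
 * this hash order preserving.
 */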
public static enum BootstrapMode
{
HINT,
FULL
};
public static class BootstrapInitiateDoneVerbHandler implements IVerbHandler<byte[]>
{
private static Logger logger_ = Logger.getLogger( BootstrapInitiateDoneVerbHandler.class );
public void doVerb(Message<byte[]> message)
{
logger_.debug("Received a bootstrap initiate done message ...");
/* Let the Stream Manager do its thing. */
StreamManager.instance(message.getFrom()).start();
}
}
private class ShutdownTimerTask extends TimerTask
{
public void run()
{
StorageService.instance().shutdown();
}
}
/*
 * Factory method that returns the singleton instance of the
 * StorageService class, creating it lazily on first use.
 */
public static StorageService instance()
{
if ( instance_ == null )
{
StorageService.createLock_.lock();
try
{
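/* Check again under the lock: another thread may have created the instance while we waited. */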
if ( instance_ == null )
{
try
{
instance_ = new StorageService();
}
catch ( Throwable th )
{
logger_.error(LogUtil.throwableToString(th));
System.exit(1);
}
}
}
finally
{
createLock_.unlock();
}
}
return instance_;
}
/*
 * This is the endpoint snitch which depends on the network architecture. We
 * need to keep this information for each endpoint so that we can make
 * decisions for things like replication.
 */
private IEndPointSnitch endPointSnitch_;
/* Time at which this node started - we use this to determine if a bootstrap can be performed by this node */
private long uptime_ = 0L;
/* This abstraction maintains the token/endpoint metadata information */
private TokenMetadata tokenMetadata_ = new TokenMetadata();
private DBManager.StorageMetadata storageMetadata_;
/*
* Maintains a list of all components that need to be shutdown
* for a clean exit.
*/
private Set<IComponentShutdown> components_ = new HashSet<IComponentShutdown>();
/*
 * This boolean indicates if we are in loading state. If we are, we do not
 * want any distributed algorithms w.r.t. changes in token state to kick in.
 */
private boolean isLoadState_ = false;
/*
* This variable indicates if the local storage instance
* has been shutdown.
*/
private AtomicBoolean isShutdown_ = new AtomicBoolean(false);
/* This thread pool is used to do the bootstrap for a new node */
private ExecutorService bootStrapper_ = new DebuggableThreadPoolExecutor(1, 1,
Integer.MAX_VALUE, TimeUnit.SECONDS,
new LinkedBlockingQueue<Runnable>(), new ThreadFactoryImpl(
"BOOT-STRAPPER"));
/* This thread pool does consistency checks when the client doesn't care about consistency */
private ExecutorService consistencyManager_;
/* This is the entity that tracks load information of all nodes in the cluster */
private StorageLoadBalancer storageLoadBalancer_;
/* We use this interface to determine where replicas need to be placed */
private IReplicaPlacementStrategy nodePicker_;
/* Handle to a ZooKeeper instance */
private ZooKeeper zk_;
/*
* Registers with Management Server
*/
private void init()
{
// Register this instance with JMX
try
{
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
mbs.registerMBean(this, new ObjectName(
"com.facebook.infrastructure.service:type=StorageService"));
}
catch (Exception e)
{
logger_.error(LogUtil.throwableToString(e));
}
}
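/*
 * Client-side sketch (illustrative, not part of this class): the MBean
 * registered above can be reached through the platform MBeanServer, e.g.
 *
 *   MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
 *   ObjectName name = new ObjectName("com.facebook.infrastructure.service:type=StorageService");
 *   String token = (String) mbs.getAttribute(name, "Token");
 *
 * The attribute name "Token" is an assumption based on the getToken()
 * accessor below; adjust to whatever StorageServiceMBean actually declares.
 */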
public StorageService() throws Throwable
{
init();
uptime_ = System.currentTimeMillis();
storageLoadBalancer_ = new StorageLoadBalancer(this);
endPointSnitch_ = new EndPointSnitch();
/* register the verb handlers */
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.tokenVerbHandler_, new TokenUpdateVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.binaryVerbHandler_, new BinaryVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.loadVerbHandler_, new LoadVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.mutationVerbHandler_, new RowMutationVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.readRepairVerbHandler_, new ReadRepairVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.readVerbHandler_, new ReadVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.rangeVerbHandler_, new RangeVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.bootStrapInitiateVerbHandler_, new Table.BootStrapInitiateVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.bootStrapInitiateDoneVerbHandler_, new StorageService.BootstrapInitiateDoneVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.bootStrapTerminateVerbHandler_, new StreamManager.BootstrapTerminateVerbHandler());
MessagingService.getMessagingInstance().registerVerbHandlers(HttpConnection.httpRequestVerbHandler_, new HttpRequestVerbHandler(this) );
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.tokenInfoVerbHandler_, new TokenInfoVerbHandler() );
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.mbrshipCleanerVerbHandler_, new MembershipCleanerVerbHandler() );
MessagingService.getMessagingInstance().registerVerbHandlers(StorageService.bsMetadataVerbHandler_, new BootstrapMetadataVerbHandler() );
/* register the stage for the mutations */
int threadCount = DatabaseDescriptor.getThreadsPerPool();
consistencyManager_ = new DebuggableThreadPoolExecutor(threadCount,
threadCount,
Integer.MAX_VALUE, TimeUnit.SECONDS,
new LinkedBlockingQueue<Runnable>(), new ThreadFactoryImpl(
"CONSISTENCY-MANAGER"));
StageManager.registerStage(StorageService.mutationStage_, new MultiThreadedStage("ROW-MUTATION", threadCount));
StageManager.registerStage(StorageService.readStage_, new MultiThreadedStage("ROW-READ", threadCount));
/* Stage for handling the HTTP messages. */
StageManager.registerStage(HttpConnection.httpStage_, new SingleThreadedStage("HTTP-REQUEST"));
if ( DatabaseDescriptor.isRackAware() )
nodePicker_ = new RackAwareStrategy(tokenMetadata_);
else
nodePicker_ = new RackUnawareStrategy(tokenMetadata_);
}
private void reportToZookeeper() throws Throwable
{
try
{
zk_ = new ZooKeeper(DatabaseDescriptor.getZkAddress(), DatabaseDescriptor.getZkSessionTimeout(), new Watcher()
{
public void process(WatchedEvent we)
{
String path = "/Cassandra/" + DatabaseDescriptor.getClusterName() + "/Leader";
String eventPath = we.getPath();
logger_.debug("PROCESS EVENT : " + eventPath);
if ( eventPath != null && (eventPath.indexOf(path) != -1) )
{
logger_.debug("Signalling the leader instance ...");
LeaderElector.instance().signal();
}
}
});
Stat stat = zk_.exists("/", false);
if ( stat != null )
{
stat = zk_.exists("/Cassandra", false);
if ( stat == null )
{
logger_.debug("Creating the Cassandra znode ...");
zk_.create("/Cassandra", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
}
String path = "/Cassandra/" + DatabaseDescriptor.getClusterName();
stat = zk_.exists(path, false);
if ( stat == null )
{
logger_.debug("Creating the cluster znode " + path);
zk_.create(path, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
}
/* Create the Leader, Locks and Misc znode */
stat = zk_.exists(path + "/Leader", false);
if ( stat == null )
{
logger_.debug("Creating the leader znode " + path);
zk_.create(path + "/Leader", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
}
stat = zk_.exists(path + "/Locks", false);
if ( stat == null )
{
logger_.debug("Creating the locks znode " + path);
zk_.create(path + "/Locks", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
}
stat = zk_.exists(path + "/Misc", false);
if ( stat == null )
{
logger_.debug("Creating the misc znode " + path);
zk_.create(path + "/Misc", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
}
}
}
catch ( KeeperException ke )
{
logger_.error(LogUtil.throwableToString(ke));
/* retry the initialization. */
reportToZookeeper();
}
}
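/*
 * Znode layout created by reportToZookeeper() (sketch):
 *
 *   /Cassandra
 *       /<ClusterName>
 *           /Leader   - watched above to signal the LeaderElector
 *           /Locks
 *           /Misc
 */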
protected ZooKeeper getZooKeeperHandle()
{
return zk_;
}
public boolean isLeader(EndPoint endpoint)
{
EndPoint leader = getLeader();
return leader.equals(endpoint);
}
public EndPoint getLeader()
{
return LeaderElector.instance().getLeader();
}
public void registerComponentForShutdown(IComponentShutdown component)
{
components_.add(component);
}
public void registerExternalVerbHandler(String verb, IVerbHandler verbHandler)
{
MessagingService.getMessagingInstance().registerVerbHandlers(verb, verbHandler);
}
public void start() throws IOException {
storageMetadata_ = DBManager.instance().start();
/* Set up TCP endpoint */
tcpAddr_ = new EndPoint(DatabaseDescriptor.getStoragePort());
/* Set up UDP endpoint */
udpAddr_ = new EndPoint(DatabaseDescriptor.getControlPort());
/* Listen for application messages */
MessagingService.getMessagingInstance().listen(tcpAddr_, false);
/* Listen for control messages */
MessagingService.getMessagingInstance().listenUDP(udpAddr_);
/* Listen for HTTP messages */
MessagingService.getMessagingInstance().listen( new EndPoint(DatabaseDescriptor.getHttpPort() ), true );
/* start the analytics context package */
AnalyticsContext.instance().start();
/* report our existence to ZooKeeper instance and start the leader election service */
// reportToZookeeper();
// LeaderElector.instance().start();
/* Start the storage load balancer */
storageLoadBalancer_.start();
/* Register with the Gossiper for EndPointState notifications */
Gossiper.instance().register(this);
/*
* Start the gossiper with the generation # retrieved from the System
* table
*/
Gossiper.instance().start(udpAddr_, storageMetadata_.getGeneration());
/* Make sure this token gets gossiped around. */
tokenMetadata_.update(storageMetadata_.getStorageId(), StorageService.tcpAddr_);
Gossiper.instance().addApplicationState(StorageService.nodeId_, new ApplicationState(storageMetadata_.getStorageId().toString()));
}
public void killMe() throws Throwable
{
isShutdown_.set(true);
/*
* Shutdown the Gossiper to stop responding to/sending Gossip messages.
* This causes other nodes to detect this node as dead and start hinting
* data for the local endpoint.
*/
Gossiper.instance().shutdown();
final long nodeDeadDetectionTime = 25000L;
Thread.sleep(nodeDeadDetectionTime);
/* Now perform a force flush of the table */
String table = DatabaseDescriptor.getTables().get(0);
Table.open(table).flush(false);
/* Now wait for the flush to complete */
Thread.sleep(nodeDeadDetectionTime);
/* Shutdown all other components */
StorageService.instance().shutdown();
}
public boolean isShutdown()
{
return isShutdown_.get();
}
public void shutdown()
{
bootStrapper_.shutdownNow();
/* shut down all stages */
StageManager.shutdown();
/* shut down the messaging service */
MessagingService.shutdown();
/* shut down all memtables */
Memtable.shutdown();
/* shut down the cleaner thread in FileUtils */
FileUtils.shutdown();
/* shut down all registered components */
for ( IComponentShutdown component : components_ )
{
component.shutdown();
}
}
public TokenMetadata getTokenMetadata()
{
return tokenMetadata_.cloneMe();
}
/* TODO: remove later */
public void updateTokenMetadata(BigInteger token, EndPoint endpoint)
{
tokenMetadata_.update(token, endpoint);
}
public IEndPointSnitch getEndPointSnitch()
{
return endPointSnitch_;
}
/*
* Given an EndPoint this method will report if the
* endpoint is in the same data center as the local
* storage endpoint.
*/
public boolean isInSameDataCenter(EndPoint endpoint) throws IOException
{
return endPointSnitch_.isInSameDataCenter(StorageService.tcpAddr_, endpoint);
}
/*
* This method performs the requisite operations to make
* sure that the N replicas are in sync. We do this in the
* background when we do not care much about consistency.
*/
public void doConsistencyCheck(Row row, List<EndPoint> endpoints, ReadParameters message)
{
Runnable consistencySentinel = new ConsistencyManager(row.cloneMe(), endpoints, message.columnFamily_column,
message.start, message.count, message.sinceTimestamp, message.getColumnNames());
consistencyManager_.submit(consistencySentinel);
}
/*
* This method displays all the ranges and the replicas
* that are responsible for the individual ranges. The
* format of this string is the following:
*
* R1 : A B C
* R2 : D E F
* R3 : G H I
*/
public String showTheRing()
{
StringBuilder sb = new StringBuilder();
/* Get the token to endpoint map. */
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
Set<BigInteger> tokens = tokenToEndPointMap.keySet();
/* All the ranges for the tokens */
Range[] ranges = getAllRanges(tokens);
Map<Range, List<EndPoint>> oldRangeToEndPointMap = constructRangeToEndPointMap(ranges);
Set<Range> rangeSet = oldRangeToEndPointMap.keySet();
for ( Range range : rangeSet )
{
sb.append(range);
sb.append(" : ");
List<EndPoint> replicas = oldRangeToEndPointMap.get(range);
for ( EndPoint replica : replicas )
{
sb.append(replica);
sb.append(" ");
}
sb.append(System.getProperty("line.separator"));
}
return sb.toString();
}
public Map<Range, List<EndPoint>> getRangeToEndPointMap()
{
/* Get the token to endpoint map. */
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
Set<BigInteger> tokens = tokenToEndPointMap.keySet();
/* All the ranges for the tokens */
Range[] ranges = getAllRanges(tokens);
Map<Range, List<EndPoint>> oldRangeToEndPointMap = constructRangeToEndPointMap(ranges);
return oldRangeToEndPointMap;
}
/**
* Construct the range to endpoint mapping based on the true view
* of the world.
* @param ranges the ranges on the ring
* @return mapping of ranges to the replicas responsible for them.
*/
public Map<Range, List<EndPoint>> constructRangeToEndPointMap(Range[] ranges)
{
logger_.debug("Constructing range to endpoint map ...");
Map<Range, List<EndPoint>> rangeToEndPointMap = new HashMap<Range, List<EndPoint>>();
for ( Range range : ranges )
{
EndPoint[] endpoints = getNStorageEndPoint(range.right());
rangeToEndPointMap.put(range, new ArrayList<EndPoint>( Arrays.asList(endpoints) ) );
}
logger_.debug("Done constructing range to endpoint map ...");
return rangeToEndPointMap;
}
/**
* Construct the range to endpoint mapping based on the view as dictated
* by the mapping of token to endpoints passed in.
* @param ranges the ranges on the ring
* @param tokenToEndPointMap mapping of token to endpoints.
* @return mapping of ranges to the replicas responsible for them.
*/
public Map<Range, List<EndPoint>> constructRangeToEndPointMap(Range[] ranges, Map<BigInteger, EndPoint> tokenToEndPointMap)
{
logger_.debug("Constructing range to endpoint map ...");
Map<Range, List<EndPoint>> rangeToEndPointMap = new HashMap<Range, List<EndPoint>>();
for ( Range range : ranges )
{
EndPoint[] endpoints = getNStorageEndPoint(range.right(), tokenToEndPointMap);
rangeToEndPointMap.put(range, new ArrayList<EndPoint>( Arrays.asList(endpoints) ) );
}
logger_.debug("Done constructing range to endpoint map ...");
return rangeToEndPointMap;
}
/**
* Construct a mapping from endpoint to ranges that endpoint is
* responsible for.
* @return the mapping from endpoint to the ranges it is responsible
* for.
*/
public Map<EndPoint, List<Range>> constructEndPointToRangesMap()
{
Map<EndPoint, List<Range>> endPointToRangesMap = new HashMap<EndPoint, List<Range>>();
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
Collection<EndPoint> mbrs = tokenToEndPointMap.values();
for ( EndPoint mbr : mbrs )
{
endPointToRangesMap.put(mbr, getRangesForEndPoint(mbr));
}
return endPointToRangesMap;
}
/**
* Get the estimated disk space of the target endpoint in its
* primary range.
* @param target whose primary range we are interested in.
* @return disk space of the target in the primary range.
*/
private double getDiskSpaceForPrimaryRange(EndPoint target)
{
double primaryDiskSpace = 0d;
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
Set<BigInteger> tokens = tokenToEndPointMap.keySet();
Range[] allRanges = getAllRanges(tokens);
Arrays.sort(allRanges);
/* Mapping from Range to its ordered position on the ring */
Map<Range, Integer> rangeIndex = new HashMap<Range, Integer>();
for ( int i = 0; i < allRanges.length; ++i )
{
rangeIndex.put(allRanges[i], i);
}
/* Get the coefficients for the equations */
List<double[]> equations = new ArrayList<double[]>();
/* Get the endpoint to range map */
Map<EndPoint, List<Range>> endPointToRangesMap = constructEndPointToRangesMap();
Set<EndPoint> eps = endPointToRangesMap.keySet();
for ( EndPoint ep : eps )
{
List<Range> ranges = endPointToRangesMap.get(ep);
double[] equation = new double[allRanges.length];
for ( Range range : ranges )
{
int index = rangeIndex.get(range);
equation[index] = 1;
}
equations.add(equation);
}
double[][] coefficients = equations.toArray( new double[0][0] );
/* Get the constants which are the aggregate disk space for each endpoint */
double[] constants = new double[allRanges.length];
int index = 0;
for ( EndPoint ep : eps )
{
/* reset the port back to control port */
ep.setPort(DatabaseDescriptor.getControlPort());
String lInfo = null;
if ( ep.equals(StorageService.udpAddr_) )
lInfo = getLoadInfo();
else
lInfo = getLoadInfo(ep);
LoadInfo li = new LoadInfo(lInfo);
constants[index++] = FileUtils.stringToFileSize(li.diskSpace());
}
RealMatrix matrix = new RealMatrixImpl(coefficients);
double[] solutions = matrix.solve(constants);
Range primaryRange = getPrimaryRangeForEndPoint(target);
primaryDiskSpace = solutions[rangeIndex.get(primaryRange)];
return primaryDiskSpace;
}
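/*
 * Sketch of the linear system solved above, for a 3-node ring with a
 * replication factor of 2. Each endpoint holds its primary range plus its
 * predecessor's, so with unknown per-range sizes x0..x2 and measured
 * per-endpoint aggregates d0..d2 the equations are:
 *
 *   x0 + x2 = d0
 *   x0 + x1 = d1
 *   x1 + x2 = d2
 *
 * Solving A * x = d recovers the per-range sizes, and the caller reads off
 * the entry for the target's primary range.
 */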
/**
* This is very dangerous. This is used only on the client
* side to set up the client library. This is then used to
* find the appropriate nodes to route the key to.
*/
public void setTokenMetadata(TokenMetadata tokenMetadata)
{
tokenMetadata_ = tokenMetadata;
}
/**
* Called when there is a change in application state. In particular
* we are interested in new tokens as a result of a new node or an
* existing node moving to a new location on the ring.
*/
public void onChange(EndPoint endpoint, EndPointState epState)
{
EndPoint ep = new EndPoint(endpoint.getHost(), DatabaseDescriptor.getStoragePort());
/* node identifier for this endpoint on the identifier space */
ApplicationState nodeIdState = epState.getApplicationState(StorageService.nodeId_);
if (nodeIdState != null)
{
BigInteger newToken = new BigInteger(nodeIdState.getState());
logger_.debug("CHANGE IN STATE FOR " + endpoint + " - has token " + nodeIdState.getState());
BigInteger oldToken = tokenMetadata_.getToken(ep);
if ( oldToken != null )
{
/*
* If oldToken equals the newToken then the node had crashed
* and is coming back up again. If oldToken is not equal to
* the newToken this means that the node is being relocated
* to another position in the ring.
*/
if ( !oldToken.equals(newToken) )
{
logger_.debug("Relocation for endpoint " + ep);
tokenMetadata_.update(newToken, ep);
}
else
{
/*
* This means the node crashed and is coming back up.
* Deliver the hints that we have for this endpoint.
*/
logger_.debug("Sending hinted data to " + ep);
doBootstrap(endpoint, BootstrapMode.HINT);
}
}
else
{
/*
* This is a new node and we just update the token map.
*/
tokenMetadata_.update(newToken, ep);
}
}
else
{
/*
 * If we are here, this node is UP and already has an entry
 * in the token map, which means it was behind a network partition.
 */
if ( epState.isAlive() && tokenMetadata_.isKnownEndPoint(endpoint) )
{
logger_.debug("EndPoint " + ep + " just recovered from a partition. Sending hinted data.");
doBootstrap(ep, BootstrapMode.HINT);
}
}
/* Check if a bootstrap is in order */
ApplicationState loadAllState = epState.getApplicationState(StorageService.loadAll_);
if ( loadAllState != null )
{
String nodes = loadAllState.getState();
if ( nodes != null )
{
doBootstrap(ep, BootstrapMode.FULL);
}
}
}
/**
 * Get the load info (primary count and used disk space) for the local endpoint.
 */
public String getLoadInfo()
{
long diskSpace = FileUtils.getUsedDiskSpace();
LoadInfo li = new LoadInfo(0, diskSpace);
return li.toString();
}
/**
* Get the primary count info for this endpoint.
* This is gossiped around and cached in the
* StorageLoadBalancer.
*/
public String getLoadInfo(EndPoint ep)
{
LoadInfo li = storageLoadBalancer_.getLoad(ep);
return ( li == null ) ? "N/A" : li.toString();
}
/**
 * Get the endpoint that has the largest primary count.
 * @return the endpoint with the largest primary count.
 */
EndPoint getEndPointWithLargestPrimaryCount()
{
Set<EndPoint> allMbrs = Gossiper.instance().getAllMembers();
Map<LoadInfo, EndPoint> loadInfoToEndPointMap = new HashMap<LoadInfo, EndPoint>();
List<LoadInfo> lInfos = new ArrayList<LoadInfo>();
for ( EndPoint mbr : allMbrs )
{
mbr.setPort(DatabaseDescriptor.getStoragePort());
LoadInfo li = null;
if ( mbr.equals(StorageService.tcpAddr_) )
{
li = new LoadInfo( getLoadInfo() );
lInfos.add( li );
}
else
{
li = storageLoadBalancer_.getLoad(mbr);
lInfos.add( li );
}
loadInfoToEndPointMap.put(li, mbr);
}
Collections.sort(lInfos, new LoadInfo.PrimaryCountComparator());
return loadInfoToEndPointMap.get( lInfos.get(lInfos.size() - 1) );
}
/*
* This method updates the token on disk and modifies the cached
* StorageMetadata instance. This is only for the local endpoint.
*/
public void updateToken(BigInteger token) throws IOException
{
/* update the token on disk */
SystemTable.openSystemTable(SystemTable.name_).updateToken(token);
/* Update the storageMetadata cache */
storageMetadata_.setStorageId(token);
/* Update the token maps */
/* Get the old token. This needs to be removed. */
tokenMetadata_.update(token, StorageService.tcpAddr_);
/* Gossip this new token for the local storage instance */
Gossiper.instance().addApplicationState(StorageService.nodeId_, new ApplicationState(token.toString()));
}
/*
 * This method removes the state associated with this endpoint
 * from the TokenMetadata instance.
 *
 * @param endpoint the endpoint whose token state should be removed.
 */
public void removeTokenState(EndPoint endpoint)
{
tokenMetadata_.remove(endpoint);
/* Remove the state from the Gossiper */
Gossiper.instance().removeFromMembership(endpoint);
}
/*
 * This method is invoked by the Loader process to force the
 * node to move from its current position on the token ring to
 * a position determined by the keys. This helps all nodes start
 * off perfectly load balanced. The array passed in is built as
 * follows by the loader process: if there are 10 keys in the
 * system and a total of 5 nodes, then each node needs to own 2
 * keys, i.e. the array is made up of every 2nd key in the sorted
 * list of keys.
 */
public void relocate(String[] keys) throws IOException
{
if ( keys.length > 0 )
{
isLoadState_ = true;
BigInteger token = tokenMetadata_.getToken(StorageService.tcpAddr_);
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
BigInteger[] tokens = tokenToEndPointMap.keySet().toArray( new BigInteger[0] );
Arrays.sort(tokens);
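/* Map this node's ordinal position on the ring onto the evenly spaced sample of keys. */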
int index = Arrays.binarySearch(tokens, token) * (keys.length/tokens.length);
BigInteger newToken = hash( keys[index] );
/* update the token */
updateToken(newToken);
}
}
/*
* This is used to indicate that this node is done
* with the loading of data.
*/
public void resetLoadState()
{
isLoadState_ = false;
}
/**
 * This method takes a colon separated string of nodes that need
 * to be bootstrapped, optionally followed by "-" and a colon
 * separated list of source nodes to filter on. If the nodes to
 * be bootstrapped are A, B and C, <i>nodes</i> must be specified
 * as "A:B:C"; to restrict the data sources to D and E, pass
 * "A:B:C-D:E".
 */
private void doBootstrap(String nodes)
{
String[] allNodesAndFilter = nodes.split("-");
String nodesToLoad = null;
String filterSources = null;
if ( allNodesAndFilter.length == 2 )
{
nodesToLoad = allNodesAndFilter[0];
filterSources = allNodesAndFilter[1];
}
else
{
nodesToLoad = allNodesAndFilter[0];
}
String[] allNodes = nodesToLoad.split(":");
EndPoint[] endpoints = new EndPoint[allNodes.length];
BigInteger[] tokens = new BigInteger[allNodes.length];
for ( int i = 0; i < allNodes.length; ++i )
{
endpoints[i] = new EndPoint( allNodes[i].trim(), DatabaseDescriptor.getStoragePort() );
tokens[i] = tokenMetadata_.getToken(endpoints[i]);
}
/* Start the bootstrap algorithm */
if ( filterSources == null )
bootStrapper_.submit( new BootStrapper(endpoints, tokens) );
else
{
String[] allFilters = filterSources.split(":");
EndPoint[] filters = new EndPoint[allFilters.length];
for ( int i = 0; i < allFilters.length; ++i )
{
filters[i] = new EndPoint( allFilters[i].trim(), DatabaseDescriptor.getStoragePort() );
}
bootStrapper_.submit( new BootStrapper(endpoints, tokens, filters) );
}
}
/**
 * Starts the bootstrap operations for the specified endpoint.
 * The name of this method is something of a misnomer, since it
 * also hands off data to the specified node when it has crashed
 * and come back up, when it is marked as alive after a network
 * partition, and when it joins the ring either as an old node
 * being relocated or as a brand new node.
 */
public final void doBootstrap(EndPoint endpoint, BootstrapMode mode)
{
switch ( mode )
{
case FULL:
BigInteger token = tokenMetadata_.getToken(endpoint);
bootStrapper_.submit( new BootStrapper(new EndPoint[]{endpoint}, new BigInteger[]{token}) );
break;
case HINT:
/* Deliver the hinted data to this endpoint. */
HintedHandOffManager.instance().deliverHints(endpoint);
break;
default:
break;
}
}
/* These methods belong to the MBean interface */
public long getRequestHandled()
{
return 0;
}
public String getToken(EndPoint ep)
{
EndPoint ep2 = new EndPoint(ep.getHost(), DatabaseDescriptor.getStoragePort());
BigInteger token = tokenMetadata_.getToken(ep2);
return ( token == null ) ? BigInteger.ZERO.toString() : token.toString();
}
public String getToken()
{
return tokenMetadata_.getToken(StorageService.tcpAddr_).toString();
}
public void updateToken(String token)
{
try
{
updateToken(new BigInteger(token));
}
catch ( IOException ex )
{
logger_.error(LogUtil.throwableToString(ex));
}
}
public String getLiveNodes()
{
return stringify(Gossiper.instance().getLiveMembers());
}
public String getUnreachableNodes()
{
return stringify(Gossiper.instance().getUnreachableMembers());
}
/* Helper for the MBean interface */
private String stringify(Set<EndPoint> eps)
{
StringBuilder sb = new StringBuilder("");
for (EndPoint ep : eps)
{
sb.append(ep);
sb.append(" ");
}
return sb.toString();
}
public void loadAll(String nodes)
{
// Gossiper.instance().addApplicationState(StorageService.loadAll_, new ApplicationState(nodes));
doBootstrap(nodes);
}
public String getAppropriateToken(int count)
{
BigInteger token = BootstrapAndLbHelper.getTokenBasedOnPrimaryCount(count);
return token.toString();
}
public void doGC()
{
List<String> tables = DatabaseDescriptor.getTables();
for ( String tName : tables )
{
Table table = Table.open(tName);
table.doGC();
}
}
public void forceHandoff(String directories, String host) throws IOException
{
List<File> filesList = new ArrayList<File>();
String[] sources = directories.split(":");
for (String source : sources)
{
File directory = new File(source);
Collections.addAll(filesList, directory.listFiles());
}
File[] files = filesList.toArray(new File[0]);
StreamContextManager.StreamContext[] streamContexts = new StreamContextManager.StreamContext[files.length];
int i = 0;
for ( File file : files )
{
streamContexts[i] = new StreamContextManager.StreamContext(file.getAbsolutePath(), file.length());
logger_.debug("Stream context metadata " + streamContexts[i]);
++i;
}
if ( files.length > 0 )
{
EndPoint target = new EndPoint(host, DatabaseDescriptor.getStoragePort());
/* Set up the stream manager with the files that need to streamed */
StreamManager.instance(target).addFilesToStream(streamContexts);
/* Send the bootstrap initiate message */
BootstrapInitiateMessage biMessage = new BootstrapInitiateMessage(streamContexts);
Message message = BootstrapInitiateMessage.makeBootstrapInitiateMessage(biMessage);
logger_.debug("Sending a bootstrap initiate message to " + target + " ...");
MessagingService.getMessagingInstance().sendOneWay(message, target);
logger_.debug("Waiting for transfer to " + target + " to complete");
StreamManager.instance(target).waitForStreamCompletion();
logger_.debug("Done with transfer to " + target);
}
}
/* End of MBean interface methods */
/*
* This method returns the predecessor of the endpoint ep on the identifier
* space.
*/
EndPoint getPredecessor(EndPoint ep)
{
BigInteger token = tokenMetadata_.getToken(ep);
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
List<BigInteger> tokens = new ArrayList<BigInteger>(tokenToEndPointMap.keySet());
Collections.sort(tokens);
int index = Collections.binarySearch(tokens, token);
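/* Wrap around the ring: the first token's predecessor is the last token. */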
EndPoint predecessor = (index == 0) ? tokenToEndPointMap.get(tokens
.get(tokens.size() - 1)) : tokenToEndPointMap.get(tokens
.get(--index));
return predecessor;
}
/*
* This method returns the successor of the endpoint ep on the identifier
* space.
*/
public EndPoint getSuccessor(EndPoint ep)
{
BigInteger token = tokenMetadata_.getToken(ep);
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
List<BigInteger> tokens = new ArrayList<BigInteger>(tokenToEndPointMap.keySet());
Collections.sort(tokens);
int index = Collections.binarySearch(tokens, token);
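/* Wrap around the ring: the last token's successor is the first token. */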
EndPoint successor = (index == (tokens.size() - 1)) ? tokenToEndPointMap
.get(tokens.get(0))
: tokenToEndPointMap.get(tokens.get(++index));
return successor;
}
/**
* This method returns the range handled by this node.
*/
public Range getMyRange()
{
BigInteger myToken = tokenMetadata_.getToken(StorageService.tcpAddr_);
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
List<BigInteger> allTokens = new ArrayList<BigInteger>(tokenToEndPointMap.keySet());
Collections.sort(allTokens);
int index = Collections.binarySearch(allTokens, myToken);
/* Calculate the lhs for the range */
BigInteger lhs = (index == 0) ? allTokens.get(allTokens.size() - 1) : allTokens.get( index - 1);
return new Range( lhs, myToken );
}
/**
* Get the primary range for the specified endpoint.
* @param ep endpoint we are interested in.
* @return range for the specified endpoint.
*/
public Range getPrimaryRangeForEndPoint(EndPoint ep)
{
BigInteger right = tokenMetadata_.getToken(ep);
EndPoint predecessor = getPredecessor(ep);
BigInteger left = tokenMetadata_.getToken(predecessor);
return new Range(left, right);
}
/**
* Get all ranges an endpoint is responsible for.
* @param ep endpoint we are interested in.
* @return ranges for the specified endpoint.
*/
List<Range> getRangesForEndPoint(EndPoint ep)
{
List<Range> ranges = new ArrayList<Range>();
ranges.add( getPrimaryRangeForEndPoint(ep) );
EndPoint predecessor = ep;
int count = DatabaseDescriptor.getReplicationFactor() - 1;
for ( int i = 0; i < count; ++i )
{
predecessor = getPredecessor(predecessor);
ranges.add( getPrimaryRangeForEndPoint(predecessor) );
}
return ranges;
}
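/*
 * Example: with a replication factor of 3, an endpoint is responsible for
 * its own primary range plus the primary ranges of its two immediate
 * predecessors on the ring, mirroring the replica placement walk.
 */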
/**
 * Get all ranges that span the ring given a set
 * of tokens. The ranges are returned in sorted
 * order.
 */
public Range[] getAllRanges(Set<BigInteger> tokens)
{
List<Range> ranges = new ArrayList<Range>();
List<BigInteger> allTokens = new ArrayList<BigInteger>(tokens);
Collections.sort(allTokens);
int size = allTokens.size();
for ( int i = 1; i < size; ++i )
{
Range range = new Range( allTokens.get(i - 1), allTokens.get(i) );
ranges.add(range);
}
Range range = new Range( allTokens.get(size - 1), allTokens.get(0) );
ranges.add(range);
return ranges.toArray( new Range[0] );
}
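/*
 * Example: sorted tokens {10, 20, 30} yield the ranges (10, 20], (20, 30]
 * and the wrapping range (30, 10]. (Whether the bounds are inclusive or
 * exclusive is up to Range; the pairing of consecutive sorted tokens plus
 * the wrap is what matters here.)
 */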
/**
 * Get the primary ranges for the given set
 * of endpoints.
 */
public Range[] getPrimaryRangesForEndPoints(Set<EndPoint> endpoints)
{
List<Range> allRanges = new ArrayList<Range>();
for ( EndPoint endpoint : endpoints )
{
allRanges.add( getPrimaryRangeForEndPoint( endpoint) );
}
return allRanges.toArray(new Range[0]);
}
/**
* This method returns the endpoint that is responsible for storing the
* specified key.
*
* @param key key for which we need to find the endpoint
* @return the endpoint responsible for this key
*/
public EndPoint getPrimary(String key)
{
EndPoint endpoint = StorageService.tcpAddr_;
BigInteger token = hash(key);
Map<BigInteger, EndPoint> tokenToEndPointMap = tokenMetadata_.cloneTokenEndPointMap();
List<BigInteger> tokens = new ArrayList<BigInteger>(tokenToEndPointMap.keySet());
if (tokens.size() > 0)
{
Collections.sort(tokens);
int index = Collections.binarySearch(tokens, token);
if (index >= 0)
{
/*
* retrieve the endpoint based on the token at this index in the
* tokens list
*/
endpoint = tokenToEndPointMap.get(tokens.get(index));
}
else
{
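/* binarySearch returned -(insertionPoint) - 1; recover the insertion point, i.e. the first token greater than ours. */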
index = (index + 1) * (-1);
if (index < tokens.size())
endpoint = tokenToEndPointMap.get(tokens.get(index));
else
endpoint = tokenToEndPointMap.get(tokens.get(0));
}
}
return endpoint;
}
/**
* This method determines whether the local endpoint is the
* primary for the given key.
* @param key
* @return true if the local endpoint is the primary replica.
*/
public boolean isPrimary(String key)
{
EndPoint endpoint = getPrimary(key);
return StorageService.tcpAddr_.equals(endpoint);
}
/**
* This method determines whether the target endpoint is the
* primary for the given key.
* @param key
* @param target the target endpoint
* @return true if the target endpoint is the primary replica.
*/
public boolean isPrimary(String key, EndPoint target)
{
EndPoint endpoint = getPrimary(key);
return target.equals(endpoint);
}
/**
* This method determines whether the local endpoint is the
* secondary replica for the given key.
* @param key
* @return true if the local endpoint is the secondary replica.
*/
public boolean isSecondary(String key)
{
EndPoint[] topN = getNStorageEndPoint(key);
if ( topN.length < DatabaseDescriptor.getReplicationFactor() )
return false;
return topN[1].equals(StorageService.tcpAddr_);
}
/**
* This method determines whether the local endpoint is the
* tertiary replica for the given key.
* @param key
* @return true if the local endpoint is the tertiary replica.
*/
public boolean isTertiary(String key)
{
EndPoint[] topN = getNStorageEndPoint(key);
if ( topN.length < DatabaseDescriptor.getReplicationFactor() )
return false;
return topN[2].equals(StorageService.tcpAddr_);
}
/**
 * This method determines if the local endpoint is
 * among the top N replicas for the given key.
 */
public boolean isInTopN(String key)
{
EndPoint[] topN = getNStorageEndPoint(key);
for ( EndPoint ep : topN )
{
if ( ep.equals( StorageService.tcpAddr_ ) )
return true;
}
return false;
}
/**
* This method returns the N endpoints that are responsible for storing the
* specified key i.e for replication.
*
* @param key key for which we need to find the endpoints
* @return the endpoints responsible for this key
*/
public EndPoint[] getNStorageEndPoint(String key)
{
BigInteger token = hash(key);
return nodePicker_.getStorageEndPoints(token);
}
/**
 * This method attempts to return N endpoints that are responsible for storing
 * the specified key, i.e. for replication, keeping only those that are live
 * as reported by the failure detector.
 *
 * @param key key for which we need to find the endpoints
 * @return the live endpoints responsible for this key
 */
public List<EndPoint> getNLiveStorageEndPoint(String key)
{
List<EndPoint> liveEps = new ArrayList<EndPoint>();
EndPoint[] endpoints = getNStorageEndPoint(key);
for ( EndPoint endpoint : endpoints )
{
if ( FailureDetector.instance().isAlive(endpoint) )
liveEps.add(endpoint);
}
return liveEps;
}
/**
 * This method returns the N endpoints that are responsible for storing the
 * specified key, i.e. for replication, as a map that includes hint
 * information.
 *
 * @param key key for which we need to find the endpoints
 */
public Map<EndPoint, EndPoint> getNStorageEndPointMap(String key)
{
BigInteger token = hash(key);
return nodePicker_.getHintedStorageEndPoints(token);
}
/**
 * This method returns the N endpoints that are responsible for storing the
 * specified key, i.e. for replication. It makes sure that the endpoints
 * returned are live as reported by the failure detector, and returns hint
 * information for any nodes in the top N that are not live.
 *
 * @param key key for which we need to find the endpoints
 */
public Map<EndPoint, EndPoint> getNHintedStorageEndPoint(String key)
{
BigInteger token = hash(key);
return nodePicker_.getHintedStorageEndPoints(token);
}
/**
 * This method returns the N endpoints that are responsible for storing the
 * specified token, i.e. for replication.
 *
 * @param token position on the ring
 */
public EndPoint[] getNStorageEndPoint(BigInteger token)
{
return nodePicker_.getStorageEndPoints(token);
}
/**
 * This method returns the N endpoints that are responsible for storing the
 * specified token, i.e. for replication, based on the token to endpoint
 * mapping that is passed in.
 *
 * @param token position on the ring
 * @param tokenToEndPointMap mapping of token to endpoints
 */
protected EndPoint[] getNStorageEndPoint(BigInteger token, Map<BigInteger, EndPoint> tokenToEndPointMap)
{
return nodePicker_.getStorageEndPoints(token, tokenToEndPointMap);
}
/**
 * This method returns the N endpoints that are responsible for storing the
 * specified token, i.e. for replication. It makes sure that the endpoints
 * returned are live as reported by the failure detector, and returns hint
 * information for any nodes in the top N that are not live.
 *
 * @param token position on the ring
 */
public Map<EndPoint, EndPoint> getNHintedStorageEndPoint(BigInteger token)
{
return nodePicker_.getHintedStorageEndPoints(token);
}
/**
 * This function finds the most suitable endpoint given a key.
 * It checks for locality and liveness.
 */
protected EndPoint findSuitableEndPoint(String key) throws IOException
{
EndPoint[] endpoints = getNStorageEndPoint(key);
for(EndPoint endPoint: endpoints)
{
if(endPoint.equals(StorageService.getLocalStorageEndPoint()))
{
return endPoint;
}
}
int j = 0;
for ( ; j < endpoints.length; ++j )
{
if ( StorageService.instance().isInSameDataCenter(endpoints[j]) && FailureDetector.instance().isAlive(endpoints[j]) )
{
logger_.trace("EndPoint " + endpoints[j] + " is in the same data center as local storage endpoint.");
return endpoints[j];
}
}
// We have tried to be really nice but it looks like there are no servers
// in the local data center that are alive and can service this request, so
// just send it to the first alive node and see if we get anything.
j = 0;
for ( ; j < endpoints.length; ++j )
{
if ( FailureDetector.instance().isAlive(endpoints[j]) )
{
logger_.trace("No local endpoints alive. EndPoint " + endpoints[j] + " is alive so get data from it.");
return endpoints[j];
}
}
return null;
}
}