package com.splout.db.qnode;
/*
* #%L
* Splout SQL Server
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.thrift.transport.TTransportException;
import com.google.common.base.Joiner;
import com.google.common.collect.Ordering;
import com.google.common.collect.TreeMultimap;
import com.splout.db.common.SploutConfiguration;
import com.splout.db.common.Tablespace;
import com.splout.db.dnode.DNodeClient;
import com.splout.db.hazelcast.CoordinationStructures;
import com.splout.db.hazelcast.DNodeInfo;
import com.splout.db.hazelcast.TablespaceVersion;
import com.splout.db.qnode.ReplicaBalancer.BalanceAction;
import com.splout.db.thrift.DNodeService;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Gauge;
/**
* This class contains the basic context of {@link QNodeHandler}. This context involves in-memory information of the
* system such as: list of alive DNodes, list of tablespaces, versions and so forth. In addition, this class also
* maintains a pool of connections to the DNodes. This class is shared among all different {@link QNodeHandlerModule}
* such as {@link Deployer} so that each specialized module can have access to the context.
*/
public class QNodeHandlerContext {

  protected final static Log log = LogFactory.getLog(QNodeHandlerContext.class);

  // This map indicates which is the current version being served. It has to be updated atomically.
  private final Map<String, Long> currentVersionsMap = new ConcurrentHashMap<String, Long>();
  // Address of this QNode, set externally via setQNodeAddress().
  private String qNodeAddress;
  // The SploutConfiguration
  private SploutConfiguration config;
  // The coordination structures that use Hazelcast underneath
  private CoordinationStructures coordinationStructures;
  // In-memory view of tablespaces/versions; also used as the lock that guards re-balancing scans.
  private TablespaceMemoryState tablespaceState = new TablespaceMemoryState();
  // Computes balance actions for under-replicated partitions (see getBalanceActions()).
  private ReplicaBalancer replicaBalancer;
  // This flag is set to "false" after WARMING_TIME seconds (qnode.warming.time)
  // Some actions will only be taken after warming time, just in case some nodes still didn't join the cluster.
  private final AtomicBoolean isWarming = new AtomicBoolean(true);
  // The per-DNode Thrift client pools
  private ConcurrentMap<String, BlockingQueue<DNodeService.Client>> thriftClientCache = new ConcurrentHashMap<String, BlockingQueue<DNodeService.Client>>();
  // Guards creation/destruction of per-DNode pools (a single coarse lock for simplicity).
  private ReentrantLock thriftClientCacheLock = new ReentrantLock();
  // Max number of pooled Thrift connections per DNode (qnode.dnode.pool.size).
  private final int thriftClientPoolSize;
  // How long getDNodeClientFromPool() waits for a free connection before giving up.
  private final long dnodePoolTimeoutMillis;
  /**
   * Creates the context, reading pool sizing/timeout settings from the given configuration
   * and registering the Thrift-pool metrics gauges.
   */
  public QNodeHandlerContext(SploutConfiguration config, CoordinationStructures coordinationStructures) {
    this.config = config;
    this.coordinationStructures = coordinationStructures;
    this.thriftClientPoolSize = config.getInt(QNodeProperties.DNODE_POOL_SIZE);
    this.dnodePoolTimeoutMillis = config.getLong(QNodeProperties.QNODE_DNODE_POOL_TAKE_TIMEOUT);
    this.replicaBalancer = new ReplicaBalancer(this);
    initMetrics();
  }
  /** Kind of membership event for a DNode: it left the cluster, joined it, or updated its info. */
  public static enum DNodeEvent {
    LEAVE, ENTRY, UPDATE
  }
private void initMetrics() {
Metrics.newGauge(QNodeHandlerContext.class, "thrift-total-connections-iddle", new Gauge<Integer>() {
@Override
public Integer value() {
int count = 0;
for (Entry<String, BlockingQueue<DNodeService.Client>> queue : thriftClientCache.entrySet()) {
count += queue.getValue().size();
}
return count;
}
});
Metrics.newGauge(QNodeHandlerContext.class, "thrift-total-connections-being-used", new Gauge<Integer>() {
@Override
public Integer value() {
int queues = 0;
int count = 0;
for (Entry<String, BlockingQueue<DNodeService.Client>> queue : thriftClientCache.entrySet()) {
queues++;
count += queue.getValue().size();
}
return (QNodeHandlerContext.this.thriftClientPoolSize * queues) - count;
}
});
Metrics.newGauge(QNodeHandlerContext.class, "thrift-pools", new Gauge<String>() {
@Override
public String value() {
ArrayList<String> fullPools = new ArrayList<String>();
for (Entry<String, BlockingQueue<DNodeService.Client>> queue : thriftClientCache.entrySet()) {
int idle = queue.getValue().size();
int size = QNodeHandlerContext.this.thriftClientPoolSize;
fullPools.add("Pool: " + queue.getKey() + " (" + (size - idle) + " of " + size + ") being used");
}
return Joiner.on(", ").join(fullPools);
}
});
Metrics.newGauge(QNodeHandlerContext.class, "thrift-pools", new Gauge<Integer>() {
@Override
public Integer value() {
return thriftClientCache.size();
}
});
Metrics.newGauge(QNodeHandlerContext.class, "thrift-total-configured-connections", new Gauge<Integer>() {
@Override
public Integer value() {
return (QNodeHandlerContext.this.thriftClientPoolSize * thriftClientCache.size());
}
});
}
  /** Thrown when tablespace/version in-memory information is inconsistent. */
  @SuppressWarnings("serial")
  public final static class TablespaceVersionInfoException extends Exception {
    public TablespaceVersionInfoException(String msg) {
      super(msg);
    }
  }
  /**
   * Get the list of possible actions to take for balancing the cluster in case of under-replicated partitions.
   */
  public List<BalanceAction> getBalanceActions() {
    // we have this in this class to be able to use this lock (the same that recreates the in-memory TablespaceVersion map)
    // so that the balancer never scans partitions while the tablespace state is being rebuilt.
    synchronized (tablespaceState) {
      return replicaBalancer.scanPartitions();
    }
  }
/**
* Get the list of DNodes
*/
public List<String> getDNodeList() {
List<String> dNodeList = new ArrayList<String>();
for (DNodeInfo dnode : getCoordinationStructures().getDNodes().values()) {
dNodeList.add(dnode.getAddress());
}
return dNodeList;
}
  /**
   * This method can be called to initialize a pool of connections to a dnode. This method may be called from multiple
   * threads so it should be safe to call it concurrently.
   */
  public void initializeThriftClientCacheFor(String dnode) throws TTransportException,
      InterruptedException {
    // this lock is on the whole cache but we would actually be interested in a per-DNode lock...
    // there's only one lock for simplicity.
    thriftClientCacheLock.lock();
    try {
      // initialize queue for this DNode
      BlockingQueue<DNodeService.Client> dnodeQueue = thriftClientCache.get(dnode);
      if (dnodeQueue == null) {
        // this assures that the per-DNode queue is only created once and then reused.
        dnodeQueue = new LinkedBlockingDeque<DNodeService.Client>(thriftClientPoolSize);
      }
      if (dnodeQueue.isEmpty()) {
        try {
          // NOTE(review): starting at dnodeQueue.size() suggests "top up a partial queue" was
          // intended, but this branch only runs when the queue is empty (size() == 0) - a queue
          // that exists but is partially drained is never topped up here. Confirm if intentional.
          for (int i = dnodeQueue.size(); i < thriftClientPoolSize; i++) {
            dnodeQueue.put(DNodeClient.get(dnode));
          }
          // we only put the queue if all connections have been populated
          thriftClientCache.put(dnode, dnodeQueue);
        } catch (TTransportException e) {
          // roll back: close whatever connections were opened before the failure
          log.error("Error while trying to populate queue for " + dnode
              + ", will discard created connections.", e);
          while (!dnodeQueue.isEmpty()) {
            dnodeQueue.poll().getOutputProtocol().getTransport().close();
          }
          throw e;
        }
      } else {
        // it should be safe to call this method from different places concurrently
        // so we contemplate the case where another Thread already populated the queue
        // and only populate it if it's really empty.
        log.warn(Thread.currentThread().getName() + " : queue for [" + dnode
            + "] is not empty - it was populated before.");
      }
    } finally {
      thriftClientCacheLock.unlock();
    }
  }
/**
* This method can be called by {@link QNodeHandler} to cancel the Thrift client cache when a DNode disconnects.
* Usually this happens when Hazelcast notifies it.
*/
public void discardThriftClientCacheFor(String dnode) throws InterruptedException {
thriftClientCacheLock.lock();
try {
// discarding all connections to a DNode who leaved
log.info(Thread.currentThread().getName() + " : trashing queue for [" + dnode + "] as it leaved.");
BlockingQueue<DNodeService.Client> dnodeQueue = thriftClientCache.get(dnode);
// release connections until empty
while (!dnodeQueue.isEmpty()) {
dnodeQueue.take().getOutputProtocol().getTransport().close();
}
thriftClientCache.remove(dnode); // to indicate that the DNode is not present
} finally {
thriftClientCacheLock.unlock();
}
}
  /**
   * Get the Thrift client for this DNode.
   * <p/>
   * Can throw a TTransportException in the rare case when
   * a new pool is initialized here. In this case, you shouldn't call
   * the method {@link #returnDNodeClientToPool(String, com.splout.db.thrift.DNodeService.Client, boolean)}
   * to return the connection.
   * <p/>
   * This method never returns null.
   *
   * @throws java.lang.InterruptedException if somebody interrupts the thread meanwhile the method is waiting in the pool
   * @throws com.splout.db.qnode.PoolCreationException if there is failure when a new pool is created.
   * @throws com.splout.db.qnode.DNodePoolFullException if the pool for the given dnode is empty and the timeout
   *                                                    for waiting for a connection is reached.
   */
  public DNodeService.Client getDNodeClientFromPool(String dnode) throws InterruptedException, PoolCreationException,
      DNodePoolFullException {
    BlockingQueue<DNodeService.Client> dnodeQueue = thriftClientCache.get(dnode);
    if (dnodeQueue == null) {
      // This shouldn't happen in real life because it is initialized by the QNode, but it is useful for unit
      // testing.
      // Under some rare race conditions the pool may be required before the QNode creates it, but this method
      // assures that the queue will only be created once and, if it's not possible to create it, an exception
      // will be thrown and nothing bad will happen.
      try {
        initializeThriftClientCacheFor(dnode);
        // NOTE(review): if the DNode disconnects between the init above and this get(), the
        // re-fetched queue could be null again and poll() below would NPE - confirm whether
        // discardThriftClientCacheFor can race here.
        dnodeQueue = thriftClientCache.get(dnode);
      } catch (TTransportException e) {
        throw new PoolCreationException(e);
      }
    }
    // bounded wait for a free connection; null means the timeout elapsed
    DNodeService.Client client = dnodeQueue.poll(dnodePoolTimeoutMillis, TimeUnit.MILLISECONDS);
    // Timeout waiting for poll
    if (client == null) {
      throw new DNodePoolFullException("Pool for dnode[" + dnode + "] is full and timeout of [" + dnodePoolTimeoutMillis
          + "] reached when waiting for a connection.");
    }
    return client;
  }
/**
* Return a Thrift client to the pool. This method is a bit tricky since we may want to return a connection when a
* DNode already disconnected. Also, if the QNode is closing, we don't want to leave opened sockets around. To do it
* safely, we check whether 1) we are closing / cleaning the QNode or 2) the DNode has disconnected.
* <p/>
* The given client never can be null.
*/
public void returnDNodeClientToPool(String dnode, DNodeService.Client client, boolean renew) {
if (closing.get()) { // don't return to the pool if the system is already closing! we must close everything!
if (client != null) {
client.getOutputProtocol().getTransport().close();
}
return;
}
BlockingQueue<DNodeService.Client> dnodeQueue = thriftClientCache.get(dnode);
if (dnodeQueue == null) {
// dnode is not connected, so we exit.
if (client != null) {
client.getOutputProtocol().getTransport().close();
}
return;
}
if (renew) { // we have to try to renew the connection
try {
DNodeService.Client newClient = DNodeClient.get(dnode);
if (client != null) {
client.getOutputProtocol().getTransport().close();
client = newClient;
}
} catch (TTransportException e) {
// Was not possible to renew connection. We'll keep the broken one.
log.warn(
"TTransportException while renewing client to dnode[" + dnode + "]. Broken client is returned to the pool as is to continue.");
}
}
try {
dnodeQueue.add(client);
} catch (IllegalStateException e) {
client.getOutputProtocol().getTransport().close();
log.error("Trying to return a connection for dnode ["
+ dnode
+ "] but the pool already has the maximum number of connections. This is likely a software bug!.");
}
// one last check to avoid not closing every socket.
// here we avoid leaking a socket in case a close has happened in parallel or a DNode disconnected right in the
// middle
if (closing.get() || thriftClientCache.get(dnode) == null) {
if (client != null) {
client.getOutputProtocol().getTransport().close();
}
}
}
/**
* Rotates the versions (deletes versions that are old or useless). To be executed at startup and after a deployment.
*/
public List<com.splout.db.thrift.TablespaceVersion> synchronizeTablespaceVersions()
throws InterruptedException {
log.info("Starting to look for old tablespace versions to remove...");
int maxVersionsPerTablespace = config.getInt(QNodeProperties.VERSIONS_PER_TABLESPACE);
// Will contain the list of versions per each tablespace, sorted by creation date descendant
TreeMultimap<String, Tablespace> tablespaces = TreeMultimap.create(Ordering.natural(),
new Comparator<Tablespace>() {
@Override
public int compare(Tablespace tb1, Tablespace tb2) {
// reverse ordering. Older dates appears LAST. If same date, then version is compared.
int comp = -((Long) tb1.getCreationDate()).compareTo(tb2.getCreationDate());
if (comp == 0) {
return -((Long) tb1.getVersion()).compareTo(tb2.getVersion());
} else {
return comp;
}
}
});
Map<TablespaceVersion, Tablespace> myTablespaces = getTablespaceVersionsMap();
// We build a in memory version of tablespaces for analyzing it
// and prune old ones.
for (Entry<TablespaceVersion, Tablespace> entry : myTablespaces.entrySet()) {
tablespaces.put(entry.getKey().getTablespace(), entry.getValue());
}
log.info("Analyzing " + tablespaces.keySet().size() + " tablespaces with a total of " + tablespaces.size() + " versions...");
// We will remove only versions older than the one being served
Map<String, Long> hzVersionsBeingServed = coordinationStructures.getCopyVersionsBeingServed();
if (hzVersionsBeingServed == null) {
log.info("... No versions yet being served.");
return null; // nothing to do yet
}
log.info("Number of versions being served: " + hzVersionsBeingServed.size());
List<com.splout.db.thrift.TablespaceVersion> tablespacesToRemove = new ArrayList<com.splout.db.thrift.TablespaceVersion>();
for (Entry<String, Long> entry : hzVersionsBeingServed.entrySet()) {
String tablespace = entry.getKey();
Long versionBeingServed = entry.getValue();
// Tablespaces are sorted by creation date desc.
SortedSet<Tablespace> allVersions = tablespaces.get(tablespace);
Iterator<Tablespace> it = allVersions.iterator();
boolean foundVersionBeingServed = false;
int countVersionsAfter = 0;
while (it.hasNext()) {
Tablespace tb = it.next();
if (versionBeingServed.equals(tb.getVersion())) {
foundVersionBeingServed = true;
} else {
if (foundVersionBeingServed) {
countVersionsAfter++;
if (countVersionsAfter >= maxVersionsPerTablespace) {
// This is the case where we remove the version
// 1 - This tablespace has a version being served
// 2 - This version is older than the current tablespace being served
// 3 - We are already keeping maxVersionsPerTablespace versions
tablespacesToRemove.add(new com.splout.db.thrift.TablespaceVersion(tablespace, tb
.getVersion()));
log.info("Tablespace [" + tablespace + "] Version [" + tb.getVersion() + "] "
+ "created at [" + new Date(tb.getCreationDate())
+ "] REMOVED. We already keep younger versions.");
} else {
log.info("Tablespace [" + tablespace + "] Version [" + tb.getVersion() + "] "
+ "created at [" + new Date(tb.getCreationDate())
+ "] KEPT.");
}
} else {
log.info("Tablespace [" + tablespace + "] Version [" + tb.getVersion() + "] "
+ "created at [" + new Date(tb.getCreationDate()) + "] either younger than served one or without version being served. Keeping.");
}
}
}
if (!foundVersionBeingServed) {
log.info("Tablespace [" + tablespace
+ "] without any version being served. Please, have a look, and remove them if not used");
}
if (tablespacesToRemove.size() > 0) {
log.info("Sending [" + tablespacesToRemove + "] to all alive DNodes.");
for (DNodeInfo dnode : coordinationStructures.getDNodes().values()) {
DNodeService.Client client = null;
boolean renew = false;
try {
client = getDNodeClientFromPool(dnode.getAddress());
client.deleteOldVersions(tablespacesToRemove);
} catch (TTransportException e) {
renew = true;
log.warn("Failed sending delete TablespaceVersions order to (" + dnode
+ "). Not critical as they will be removed after other deployments.", e);
} catch (Exception e) {
log.warn("Failed sending delete TablespaceVersions order to (" + dnode
+ "). Not critical as they will be removed after other deployments.", e);
} finally {
if (client != null) {
returnDNodeClientToPool(dnode.getAddress(), client, renew);
}
}
}
}
log.info("... done looking for old tablespace versions to remove...");
}
return tablespacesToRemove; // Return for unit test
}
private AtomicBoolean closing = new AtomicBoolean(false);
public void close() {
closing.set(true); // will indicate other parts of this code that things have to be closed!
for (Map.Entry<String, BlockingQueue<DNodeService.Client>> entry : thriftClientCache.entrySet()) {
while (entry.getValue().size() > 0) {
try {
entry.getValue().take().getOutputProtocol().getTransport().close();
} catch (InterruptedException e) {
log.error("Interrupted!", e);
}
}
}
}
public void maybeBalance() {
// do this only after warming
if (!isWarming.get() && config.getBoolean(QNodeProperties.REPLICA_BALANCE_ENABLE)) {
// check if we could balance some partitions
List<ReplicaBalancer.BalanceAction> balanceActions = getBalanceActions();
// we will only re-balance versions being served
// otherwise strange things may happen: to re-balance a version in the middle of its deployment...
Map<String, Long> versionsBeingServed = coordinationStructures.getCopyVersionsBeingServed();
for (ReplicaBalancer.BalanceAction action : balanceActions) {
if (versionsBeingServed != null && versionsBeingServed.get(action.getTablespace()) != null
&& versionsBeingServed.get(action.getTablespace()) == action.getVersion()) {
// put if absent + TTL
coordinationStructures.getDNodeReplicaBalanceActionsSet().putIfAbsent(action, "",
config.getLong(QNodeProperties.BALANCE_ACTIONS_TTL), TimeUnit.SECONDS);
}
}
}
}
  // ---- Getters ---- //

  /** Map of tablespace name -> version currently being served. Mutated atomically by callers. */
  public Map<String, Long> getCurrentVersionsMap() {
    return currentVersionsMap;
  }

  /** In-memory map of every known (tablespace, version) pair to its Tablespace metadata. */
  public Map<TablespaceVersion, Tablespace> getTablespaceVersionsMap() {
    return tablespaceState.getTablespaceVersionsMap();
  }

  public CoordinationStructures getCoordinationStructures() {
    return coordinationStructures;
  }

  public TablespaceMemoryState getTablespaceState() {
    return tablespaceState;
  }

  public SploutConfiguration getConfig() {
    return config;
  }

  /** Exposes the live per-DNode Thrift pool map; intended for modules and tests, not for mutation. */
  public ConcurrentMap<String, BlockingQueue<DNodeService.Client>> getThriftClientCache() {
    return thriftClientCache;
  }

  /** True while the QNode is within its warming period (qnode.warming.time). */
  public AtomicBoolean getIsWarming() {
    return isWarming;
  }

  public String getQNodeAddress() { return qNodeAddress; }

  public void setQNodeAddress(String QNodeAddress) {
    this.qNodeAddress = QNodeAddress;
  }
}