package com.splout.db.qnode;
/*
* #%L
* Splout SQL Server
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
import com.google.common.util.concurrent.*;
import com.hazelcast.core.ICountDownLatch;
import com.hazelcast.core.IMap;
import com.splout.db.common.JSONSerDe;
import com.splout.db.common.PartitionEntry;
import com.splout.db.common.ReplicationEntry;
import com.splout.db.common.Tablespace;
import com.splout.db.hazelcast.CoordinationStructures;
import com.splout.db.hazelcast.TablespaceVersion;
import com.splout.db.qnode.beans.*;
import com.splout.db.thrift.DNodeService;
import com.splout.db.thrift.DeployAction;
import com.splout.db.thrift.PartitionMetadata;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.thrift.transport.TTransportException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
/**
* The Deployer is a specialized module ({@link com.splout.db.qnode.QNodeHandlerModule}) of the
* {@link com.splout.db.qnode.QNode} that performs the business logic associated with a distributed deployment. It is
* used by the {@link com.splout.db.qnode.QNodeHandler}.
*/
@SuppressWarnings({"unchecked", "rawtypes"})
@SuppressWarnings({"unchecked", "rawtypes"})
public class Deployer extends QNodeHandlerModule {

  private final static Log log = LogFactory.getLog(Deployer.class);

  /** Executor that runs the asynchronous {@link ManageDeploy} tasks, one per ongoing deployment. */
  private final ListeningExecutorService deployExecutor;

  // Registry of currently running deployments, keyed by version. Useful for cancelling them.
  private final ConcurrentHashMap<Long, Future<?>> runningDeployments = new ConcurrentHashMap<Long, Future<?>>();

  /**
   * Thrown by {@link #switchVersions(List)} when a switch targets a tablespace version that is not
   * present in the in-memory tablespace versions map (i.e. no DNode reported it through Hazelcast).
   */
  @SuppressWarnings("serial")
  public static class UnexistingVersion extends Exception {

    public UnexistingVersion() {
      super();
    }

    public UnexistingVersion(String message) {
      super(message);
    }
  }

  /**
   * Runnable that deals with the asynchronous part of the deployment. Particularly, it waits until
   * DNodes finish their work, and then performs the version switch.
   */
  public class ManageDeploy implements Runnable {

    // Number of seconds to wait until another check to see if timeout was reached or
    // if a DNode failed.
    private long secondsToCheckFailureOrTimeout = 15l;
    private long version;
    private List<String> dnodes;
    private long timeoutSeconds;
    private List<DeployRequest> deployRequests;
    private long dnodesSpreadMetadataTimeout;
    private boolean isReplicaBalancingEnabled;

    /**
     * @param dnodes                        ids (host:port) of the DNodes involved in this deployment
     * @param deployRequests                the original deploy requests, one per tablespace
     * @param version                       the new version number being deployed
     * @param timeoutSeconds                max seconds to wait for DNodes to finish; negative waits forever
     * @param secondsToCheckFailureOrTimeout polling period for failure/timeout checks
     * @param dnodesSpreadMetadataTimeout   max seconds to wait for metadata to spread via Hazelcast
     *                                      (clamped to at least 1 second)
     * @param isReplicaBalancingEnabled     if true, a deploy survives DNode failures as long as every
     *                                      partition still has at least one live replica
     */
    public ManageDeploy(List<String> dnodes, List<DeployRequest> deployRequests, long version,
                        long timeoutSeconds, long secondsToCheckFailureOrTimeout,
                        long dnodesSpreadMetadataTimeout, boolean isReplicaBalancingEnabled) {
      this.dnodes = dnodes;
      this.deployRequests = deployRequests;
      this.version = version;
      this.timeoutSeconds = timeoutSeconds;
      this.secondsToCheckFailureOrTimeout = secondsToCheckFailureOrTimeout;
      // Guard against non-positive configuration: always wait at least 1 second for metadata.
      this.dnodesSpreadMetadataTimeout = Math.max(dnodesSpreadMetadataTimeout, 1);
      this.isReplicaBalancingEnabled = isReplicaBalancingEnabled;
    }

    @Override
    public void run() {
      log.info(context.getConfig().getProperty(QNodeProperties.PORT) + " Executing deploy for version ["
          + version + "]");
      CoordinationStructures.DEPLOY_IN_PROGRESS.incrementAndGet();
      try {
        long waitSeconds = 0;
        ICountDownLatch countDownLatchForDeploy = context.getCoordinationStructures()
            .getCountDownLatchForDeploy(version);
        boolean finished;
        do {
          finished = countDownLatchForDeploy.await(secondsToCheckFailureOrTimeout, TimeUnit.SECONDS);
          // We have to do this check as the await method seems to ignore the interrupt signal. Grrrrr!!
          // We use interrupted() as we want the interrupt flag to be cleared. Otherwise cancelling code
          // could throw another InterruptedException further on.
          if (Thread.interrupted()) {
            throw new InterruptedException("Deployment for version ["
                + version + "] received an interrupt. Probably somebody is cancelling this deployment.");
          }
          waitSeconds += secondsToCheckFailureOrTimeout;
          if (!finished) {
            // If any of the DNodes failed, then we cancel the deployment.
            if (checkForFailure()) {
              explainErrors();
              abortDeploy(dnodes, "One or more DNodes failed", version);
              return;
            }
            // Let's see if we reached the timeout.
            // Negative timeoutSeconds => waits forever
            if (waitSeconds > timeoutSeconds && timeoutSeconds >= 0) {
              log.warn("Deploy of version [" + version + "] timed out. Reached [" + waitSeconds
                  + "] seconds.");
              abortDeploy(dnodes, "Timeout reached", version);
              return;
            }
          }
        } while (!finished);
        // It's still possible that the deploy failed so let's check it again
        if (checkForFailure()) {
          explainErrors();
          abortDeploy(dnodes, "One or more DNodes failed.", version);
          return;
        }
        // Check after the wait that the complete tablespaces are available to this QNode. If that is the
        // case for this QNode it will be probably the case for the rest of QNodes.
        long millisToWait = 50;
        double accumulatedMillis = 0.;
        List<SwitchVersionRequest> versionsToCheck = switchActions();
        do {
          Thread.sleep(millisToWait);
          accumulatedMillis += millisToWait;
          // dnodesSpreadMetadataTimeout is clamped to >= 1 in the constructor, so this
          // loop always has a finite deadline.
          if ((accumulatedMillis / 1000) > dnodesSpreadMetadataTimeout) {
            log.warn("Deploy of version [" + version + "] timed out when waiting DNodes to spread the metadata. Reached [" + (accumulatedMillis / 1000)
                + "] seconds.");
            abortDeploy(dnodes, "Timeout reached", version);
            return;
          }
          Iterator<SwitchVersionRequest> it = versionsToCheck.iterator();
          while (it.hasNext()) {
            SwitchVersionRequest req = it.next();
            Tablespace t = context.getTablespaceVersionsMap().get(
                new TablespaceVersion(req.getTablespace(), req.getVersion()));
            // Check that this TablespaceVersion has been reported by some node through Hazelcast
            if (t != null && t.getReplicationMap() != null && t.getPartitionMap() != null
                && t.getPartitionMap().getPartitionEntries() != null
                && t.getReplicationMap().getReplicationEntries() != null
                && t.getReplicationMap().getReplicationEntries().size() > 0) {
              if (t.getPartitionMap().getPartitionEntries().size() == t.getReplicationMap()
                  .getReplicationEntries().size()) {
                log.info("Ok, TablespaceVersion [" + req.getTablespace() + ", " + req.getVersion()
                    + "] being handled by enough DNodes as reported by Hazelcast.");
                it.remove();
              }
            }
          }
        } while (versionsToCheck.size() > 0);
        log.info("All DNodes performed the deploy of version [" + version
            + "]. Publishing tablespaces...");
        // We finish by publishing the versions table with the new versions.
        try {
          switchVersions(switchActions());
        } catch (UnexistingVersion e) {
          throw new RuntimeException(
              "Unexisting version after deploying this version. Sounds like a bug.", e);
        }
        // If some replicas are under-replicated, start a balancing process
        context.maybeBalance();
        log.info("Deploy of version [" + version + "] Finished PROPERLY. :-)");
        context.getCoordinationStructures().logDeployMessage(version,
            "Deploy of version [" + version + "] finished properly.");
        context.getCoordinationStructures().getDeploymentsStatusPanel()
            .put(version, DeployStatus.FINISHED);
      } catch (InterruptedException e) {
        // Case when a deployment is cancelled.
        log.info("Deployment of [" + version + "] interrupted.");
        abortDeploy(dnodes, e.getMessage(), version);
      } catch (Throwable t) {
        // Log with the full stack trace (instead of printStackTrace) so the failure reaches the
        // QNode logs, then rethrow so the associated future completes exceptionally.
        log.error("Unexpected error in deployment of version [" + version + "]", t);
        throw new RuntimeException(t);
      } finally {
        CoordinationStructures.DEPLOY_IN_PROGRESS.decrementAndGet();
      }
    }

    /**
     * Compose the list of switch actions to perform: one {@link SwitchVersionRequest} per deployed
     * tablespace, all pointing to the version being deployed.
     */
    private List<SwitchVersionRequest> switchActions() {
      ArrayList<SwitchVersionRequest> actions = new ArrayList<SwitchVersionRequest>();
      for (DeployRequest req : deployRequests) {
        actions.add(new SwitchVersionRequest(req.getTablespace(), version));
      }
      return actions;
    }

    /**
     * Log DNodes errors in deployment. We log both to the QNode logger and to Hazelcast so the info
     * is persisted in the session.
     */
    private void explainErrors() {
      IMap<String, String> deployErrorPanel = context.getCoordinationStructures().getDeployErrorPanel(
          version);
      String msg = "Deployment of version [" + version + "] failed in DNode[";
      for (Entry<String, String> entry : deployErrorPanel.entrySet()) {
        String fMsg = msg + entry.getKey() + "] - it failed with the error [" + entry.getValue() + "]";
        log.error(fMsg);
        context.getCoordinationStructures().logDeployMessage(version, fMsg);
      }
    }

    /**
     * Return true if the deployment must be considered failed: either any DNode reported an error
     * (when replica balancing is disabled), or the failed DNodes together hold ALL replicas of some
     * partition (when replica balancing is enabled, partial failures can be rebalanced later).
     */
    private boolean checkForFailure() {
      IMap<String, String> deployErrorPanel = context.getCoordinationStructures().getDeployErrorPanel(
          version);
      if (!isReplicaBalancingEnabled) {
        return !deployErrorPanel.isEmpty();
      }
      // If replica balancing is enabled we check whether we could survive after the failed DNodes
      Set<String> failedDNodes = new HashSet<String>(deployErrorPanel.keySet());
      // Check if deploy needs to be canceled or if the system could auto-rebalance itself afterwards
      for (DeployRequest deployRequest : deployRequests) {
        for (ReplicationEntry repEntry : deployRequest.getReplicationMap()) {
          if (failedDNodes.containsAll(repEntry.getNodes())) {
            // There is AT LEAST one partition that depends on the failed DNodes so the deploy must fail!
            return true;
          }
        }
      }
      return false;
    }
  } /* End ManageDeploy */

  /**
   * The Deployer deals with deploy and switch version requests.
   */
  public Deployer(QNodeHandlerContext context) {
    super(context);
    deployExecutor = MoreExecutors.listeningDecorator(Executors.newCachedThreadPool(
        new ThreadFactoryBuilder().setNameFormat("deploy-%d").build()
    ));
  }

  /**
   * Call this method for starting an asynchronous deployment given a proper deploy request - proxy
   * method for {@link QNodeHandler}. Returns a {@link DeployInfo} with the status of the request.
   *
   * @param deployRequests one request per tablespace to deploy; all share the same new version
   * @return a {@link DeployInfo} describing the started (or immediately failed) deployment
   * @throws InterruptedException if interrupted while obtaining a DNode client from the pool
   */
  public DeployInfo deploy(List<DeployRequest> deployRequests) throws InterruptedException {
    // A new unique version number is generated.
    final long version = context.getCoordinationStructures().uniqueVersionId();
    DeployInfo deployInfo = fillDeployInfo(deployRequests, version, context.getQNodeAddress());
    context.getCoordinationStructures().logDeployMessage(version,
        "Deploy [" + version + "] for tablespaces" + deployInfo.getTablespacesDeployed() + " started.");
    context.getCoordinationStructures().getDeploymentsStatusPanel().put(version, DeployStatus.ONGOING);
    // Generate the list of actions per DNode
    Map<String, List<DeployAction>> actionsPerDNode = generateDeployActionsPerDNode(deployRequests,
        version);
    // Starting the countdown latch: each involved DNode counts down once when done.
    ICountDownLatch countDownLatchForDeploy = context.getCoordinationStructures()
        .getCountDownLatchForDeploy(version);
    Set<String> dnodesInvolved = actionsPerDNode.keySet();
    countDownLatchForDeploy.trySetCount(dnodesInvolved.size());
    // Sending deploy signals to each DNode
    for (Map.Entry<String, List<DeployAction>> actionPerDNode : actionsPerDNode.entrySet()) {
      DNodeService.Client client = null;
      boolean renew = false;
      try {
        client = context.getDNodeClientFromPool(actionPerDNode.getKey());
        client.deploy(actionPerDNode.getValue(), version);
      } catch (TTransportException e) {
        // Transport-level failure: ask the pool to renew this client connection.
        renew = true;
      } catch (Exception e) {
        String errorMsg = "Error sending deploy actions to DNode [" + actionPerDNode.getKey() + "]";
        log.error(errorMsg, e);
        abortDeploy(new ArrayList<String>(actionsPerDNode.keySet()), errorMsg, version);
        deployInfo.setError("Error connecting to DNode " + actionPerDNode.getKey());
        context.getCoordinationStructures().getDeployInfoPanel().put(version, deployInfo);
        return deployInfo;
      } finally {
        if (client != null) {
          context.returnDNodeClientToPool(actionPerDNode.getKey(), client, renew);
        }
      }
    }
    // Initiating an asynchronous process to manage the deployment
    ListenableFuture<?> future = deployExecutor.submit(new ManageDeploy(
        new ArrayList<String>(actionsPerDNode.keySet()), deployRequests,
        version, context.getConfig().getLong(QNodeProperties.DEPLOY_TIMEOUT, -1), context.getConfig()
        .getLong(QNodeProperties.DEPLOY_SECONDS_TO_CHECK_ERROR),
        context.getConfig().getLong(QNodeProperties.DEPLOY_DNODES_SPREAD_METADATA_TIMEOUT, 180),
        context.getConfig().getBoolean(QNodeProperties.REPLICA_BALANCE_ENABLE)));
    registerAsRunning(version, future);
    context.getCoordinationStructures().getDeployInfoPanel().put(version, deployInfo);
    return deployInfo;
  }

  /**
   * Registers a future as being running. Also registers an automatic callback
   * that will unregister the future once finished (successfully or not).
   */
  protected void registerAsRunning(final long version, ListenableFuture<?> future) {
    runningDeployments.put(version, future);
    Futures.addCallback(future, new FutureCallback<Object>() {
      @Override
      public void onSuccess(Object result) {
        unregister();
      }

      @Override
      public void onFailure(Throwable t) {
        unregister();
      }

      public void unregister() {
        runningDeployments.remove(version);
      }
    });
  }

  /**
   * Builds the initial {@link DeployInfo} for a deployment: version, tablespaces, data URIs,
   * start timestamp and the QNode address that initiated it.
   */
  protected DeployInfo fillDeployInfo(List<DeployRequest> deployRequests, long version, String qNodeAddress) {
    DeployInfo deployInfo = new DeployInfo();
    deployInfo.setVersion(version);
    List<String> tablespaces = new ArrayList<String>();
    List<String> dataURIs = new ArrayList<String>();
    for (DeployRequest request : deployRequests) {
      tablespaces.add(request.getTablespace());
      dataURIs.add(request.getData_uri());
    }
    deployInfo.setTablespacesDeployed(tablespaces);
    deployInfo.setDataURIs(dataURIs);
    Date startTime = new Date();
    deployInfo.setStartedAt(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(startTime));
    deployInfo.setqNode(qNodeAddress);
    return deployInfo;
  }

  /**
   * DNodes are informed to stop the deployment, as something failed. Errors contacting individual
   * DNodes are logged but do not stop the abort broadcast to the remaining DNodes.
   */
  public void abortDeploy(List<String> dnodes, String deployerErrorMessage, long version) {
    for (String dnode : dnodes) {
      DNodeService.Client client = null;
      boolean renew = false;
      try {
        client = context.getDNodeClientFromPool(dnode);
        client.abortDeploy(version);
      } catch (TTransportException e) {
        // Transport-level failure: ask the pool to renew this client connection.
        renew = true;
      } catch (Exception e) {
        log.error("Error sending abort deploy flag to DNode [" + dnode + "]", e);
      } finally {
        if (client != null) {
          context.returnDNodeClientToPool(dnode, client, renew);
        }
      }
    }
    context.getCoordinationStructures().logDeployMessage(version,
        "Deploy failed due to: " + deployerErrorMessage);
    context.getCoordinationStructures().getDeploymentsStatusPanel().put(version, DeployStatus.FAILED);
  }

  /**
   * Switches current versions being served for some tablespaces, in an atomic way.
   *
   * @throws UnexistingVersion if any requested tablespace version is not known to this QNode
   */
  public void switchVersions(List<SwitchVersionRequest> switchRequest) throws UnexistingVersion {
    // We compute the new versions table, and then try to update it.
    // We use optimistic locking: we read the original
    // map and try to update it. If the original has changed during
    // this process, we retry: reload the original map, ...
    Map<String, Long> versionsTable;
    Map<String, Long> newVersionsTable;
    do {
      versionsTable = context.getCoordinationStructures().getCopyVersionsBeingServed();
      newVersionsTable = new HashMap<String, Long>();
      if (versionsTable != null) {
        newVersionsTable.putAll(versionsTable);
      }
      for (SwitchVersionRequest req : switchRequest) {
        TablespaceVersion tsv = new TablespaceVersion(req.getTablespace(), req.getVersion());
        if (context.getTablespaceVersionsMap().get(tsv) == null) {
          throw new UnexistingVersion("Trying to switch to unexisting version[" + req.getVersion() + "] for tablespace[" + req.getTablespace() + "]");
        }
        newVersionsTable.put(tsv.getTablespace(), tsv.getVersion());
      }
    } while (!context.getCoordinationStructures().updateVersionsBeingServed(versionsTable,
        newVersionsTable));
  }

  /**
   * Generates the list of individual deploy actions that has to be sent to each DNode.
   *
   * @throws RuntimeException if a replication entry has no matching partition entry (a bug upstream)
   */
  private static Map<String, List<DeployAction>> generateDeployActionsPerDNode(
      List<DeployRequest> deployRequests, long version) {
    HashMap<String, List<DeployAction>> actions = new HashMap<String, List<DeployAction>>();
    long deployDate = System.currentTimeMillis(); // Here is where we decide the date of the deployment for all deployed
    // tablespaces
    for (DeployRequest req : deployRequests) {
      for (Object obj : req.getReplicationMap()) {
        ReplicationEntry rEntry = (ReplicationEntry) obj;
        PartitionEntry pEntry = null;
        for (PartitionEntry partEntry : req.getPartitionMap()) {
          if (partEntry.getShard().equals(rEntry.getShard())) {
            pEntry = partEntry;
          }
        }
        if (pEntry == null) {
          String msg = "No Partition metadata for shard: " + rEntry.getShard()
              + " this is very likely to be a software bug.";
          log.error(msg);
          try {
            log.error("Partition map: " + JSONSerDe.ser(req.getPartitionMap()));
            log.error("Replication map: " + JSONSerDe.ser(req.getReplicationMap()));
          } catch (JSONSerDe.JSONSerDeException e) {
            log.error("JSON error", e);
          }
          throw new RuntimeException(msg);
        }
        // Normalize DNode ids -> The convention is that DNodes are identified by host:port . So we need to strip the
        // protocol, if any
        for (int i = 0; i < rEntry.getNodes().size(); i++) {
          String dnodeId = rEntry.getNodes().get(i);
          if (dnodeId.startsWith("tcp://")) {
            dnodeId = dnodeId.substring("tcp://".length(), dnodeId.length());
          }
          rEntry.getNodes().set(i, dnodeId);
        }
        for (String dNode : rEntry.getNodes()) {
          // MapUtils.getObject returns the fresh ArrayList default when dNode has no entry yet;
          // the subsequent put makes sure it is registered in the map either way.
          List<DeployAction> actionsSoFar = (List<DeployAction>) MapUtils.getObject(actions, dNode,
              new ArrayList<DeployAction>());
          actions.put(dNode, actionsSoFar);
          DeployAction deployAction = new DeployAction();
          deployAction.setDataURI(req.getData_uri() + "/" + rEntry.getShard() + ".db");
          deployAction.setTablespace(req.getTablespace());
          deployAction.setVersion(version);
          deployAction.setPartition(rEntry.getShard());
          // Add partition metadata to the deploy action for DNodes to save it
          PartitionMetadata metadata = new PartitionMetadata();
          metadata.setMinKey(pEntry.getMin());
          metadata.setMaxKey(pEntry.getMax());
          metadata.setNReplicas(rEntry.getNodes().size());
          metadata.setDeploymentDate(deployDate);
          metadata.setInitStatements(req.getInitStatements());
          metadata.setEngineId(req.getEngine());
          deployAction.setMetadata(metadata);
          actionsSoFar.add(deployAction);
        }
      }
    }
    return actions;
  }

  /**
   * Cancels a running deployment by interrupting its {@link ManageDeploy} task.
   *
   * @return an OK status if a deployment for the given version was found and cancellation was
   *         requested, an ERROR status otherwise
   */
  public StatusMessage cancelDeployment(long version) {
    Future<?> future = runningDeployments.get(version);
    if (future == null) {
      return new StatusMessage(StatusMessage.Status.ERROR, "No deployment running for " + version + " found.");
    }
    future.cancel(true);
    return new StatusMessage(StatusMessage.Status.OK, "Deployment for version " + version + " being cancelled.");
  }
}