package com.splout.db.dnode;
/*
* #%L
* Splout SQL Server
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.hazelcast.core.*;
import com.splout.db.benchmark.PerformanceTool;
import com.splout.db.common.JSONSerDe;
import com.splout.db.common.JSONSerDe.JSONSerDeException;
import com.splout.db.common.SploutConfiguration;
import com.splout.db.common.ThriftReader;
import com.splout.db.common.ThriftWriter;
import com.splout.db.dnode.beans.BalanceFileReceivingProgress;
import com.splout.db.dnode.beans.DNodeStatusResponse;
import com.splout.db.dnode.beans.DNodeSystemStatus;
import com.splout.db.engine.EngineManager;
import com.splout.db.engine.ManagerFactory;
import com.splout.db.engine.ResultSerializer;
import com.splout.db.hazelcast.*;
import com.splout.db.hazelcast.HazelcastConfigBuilder.HazelcastConfigBuilderException;
import com.splout.db.qnode.ReplicaBalancer;
import com.splout.db.qnode.ReplicaBalancer.BalanceAction;
import com.splout.db.thrift.DNodeException;
import com.splout.db.thrift.DeployAction;
import com.splout.db.thrift.PartitionMetadata;
import com.splout.db.thrift.RollbackAction;
import net.sf.ehcache.Cache;
import net.sf.ehcache.Element;
import org.apache.commons.io.FileSystemUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
* The business logic for the DNode: responding to queries, downloading new
* deployments, handling Hazelcast events and so forth.
*/
public class DNodeHandler implements IDNodeHandler {
private final static Log log = LogFactory.getLog(DNodeHandler.class);
protected SploutConfiguration config;
private HazelcastInstance hz;
private DistributedRegistry dnodesRegistry;
private CoordinationStructures coord;
private HttpFileExchanger httpExchanger;
// EHCache holding the engine managers (SQL connection pools) for each tablespace version and partition.
Cache dbCache;
protected ExecutorService deployExecutor;
protected Object deployLock = new Object();
// This counter is needed for unit testing.
protected AtomicInteger deployInProgress = new AtomicInteger(0);
// Indicates whether the last deploy failed because of a timeout. This info
// can then be reported via a status() request.
AtomicBoolean lastDeployTimedout = new AtomicBoolean(false);
// Thrift exception codes used in DNodeException:
// ORDINARY: exceptions that are expected when Splout is used incorrectly,
// e.g. SQL syntax errors or a missing table. These exceptions are returned
// to the user without retrying at other DNodes, because the result there
// would be the same. They are also not logged, since they are expected.
// UNEXPECTED: exceptions that are not expected or that represent a real
// error: database corruption, inability to open a connection to the
// database, etc. These exceptions are logged, and queries are retried at
// other DNodes.
public final static int EXCEPTION_ORDINARY = 0;
public final static int EXCEPTION_UNEXPECTED = 1;
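// Illustrative sketch (not part of this class): a caller on the QNode side
// can use these codes to decide whether to retry a query at another replica.
// Only getMsg() is used elsewhere in this class; how the code itself is
// exposed depends on the Thrift definition, so treat that as an assumption:
//
//   try {
//     client.sqlQuery(tablespace, version, partition, sql);
//   } catch (DNodeException e) {
//     // EXCEPTION_ORDINARY -> fail fast, the query itself is wrong
//     // EXCEPTION_UNEXPECTED -> retry the query at another replica
//   }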
// A hard limit on the number of results that this DNode can return per SQL
// query
private int maxResultsPerQuery;
// The following variables are used for monitoring and providing statistics:
private PerformanceTool performanceTool = new PerformanceTool();
private String lastException = null;
private long lastExceptionTime;
private AtomicInteger failedQueries = new AtomicInteger(0);
private long upSince;
// The {@link Fetcher} is responsible for downloading new deployment data.
private Fetcher fetcher;
// Queries taking longer than this are logged as slow queries
private long absoluteSlowQueryLimit;
private long slowQueries = 0;
// Deploy parallelism
private int deployParallelism;
protected HashMap<Long, Future<?>> deploysBeingExecuted = new HashMap<Long, Future<?>>();
// This map holds the state of all balance file transfers currently in progress
private ConcurrentHashMap<String, BalanceFileReceivingProgress> balanceActionsStateMap = new ConcurrentHashMap<String, BalanceFileReceivingProgress>();
// The factory we will use to instantiate managers for each partition
// associated with a {@link SploutEngine}
private ManagerFactory factory;
public DNodeHandler(Fetcher fetcher) {
this.fetcher = fetcher;
}
public DNodeHandler() {
}
/**
* Returns the address (host:port) of this DNode.
*/
public String whoAmI() {
return config.getString(DNodeProperties.HOST) + ":" + config.getInt(DNodeProperties.PORT);
}
public String httpExchangerAddress() {
return "http://" + config.getString(DNodeProperties.HOST) + ":" + config.getInt(HttpFileExchangerProperties.HTTP_PORT);
}
public String getTCPAPIAddress() {
return config.getString(DNodeProperties.HOST) + ":" + config.getInt(DNodeProperties.STREAMING_PORT);
}
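// For illustration only (hypothetical values): with host "localhost",
// port 9002, HTTP port 8086 and streaming port 9003, the three methods
// above return "localhost:9002", "http://localhost:8086" and
// "localhost:9003" respectively.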
/**
* This inner class will listen for additions to the balance actions map, so
* that if a balance action has to be taken and this DNode is the one that has
* to send the file, it will start doing so.
*/
private class BalanceActionItemListener implements EntryListener<ReplicaBalancer.BalanceAction, String> {
@Override
public void entryAdded(EntryEvent<BalanceAction, String> event) {
BalanceAction action = event.getKey();
if (action.getOriginNode().equals(whoAmI())) {
// I must do a balance action!
File toSend = new File(getLocalStorageFolder(action.getTablespace(), action.getPartition(), action.getVersion()),
action.getPartition() + ".db");
File metadataFile = getLocalMetadataFile(action.getTablespace(), action.getPartition(), action.getVersion());
// send both the .db and the .meta file -> when the other end has both
// files it will move them atomically...
httpExchanger.send(action.getTablespace(), action.getPartition(), action.getVersion(), toSend, action.getFinalNode(), false);
httpExchanger.send(action.getTablespace(), action.getPartition(), action.getVersion(), metadataFile, action.getFinalNode(), false);
}
}
@Override
public void entryRemoved(EntryEvent<BalanceAction, String> event) {
// usually we won't care - but the final DNode might have proactively
// removed this action
}
@Override
public void entryUpdated(EntryEvent<BalanceAction, String> event) {
}
@Override
public void entryEvicted(EntryEvent<BalanceAction, String> event) {
}
@Override
public void mapEvicted(MapEvent mapEvent) {
}
@Override
public void mapCleared(MapEvent mapEvent) {
}
}
/**
* This inner class will perform the business logic associated with receiving
* files: what to do on failures, bad CRC, file received OK...
*/
private class FileReceiverCallback implements HttpFileExchanger.ReceiveFileCallback {
@Override
public void onProgress(String tablespace, Integer partition, Long version, File file, long totalSize, long sizeDownloaded) {
if (file.getName().endsWith(".db")) {
getProgressFromLocalPanel(tablespace, partition, version).progressBinaryFile(totalSize, sizeDownloaded);
}
}
@Override
public void onFileReceived(String tablespace, Integer partition, Long version, File file) {
BalanceFileReceivingProgress progress = getProgressFromLocalPanel(tablespace, partition, version);
if (file.getName().endsWith(".meta")) {
progress.metaFileReceived(file);
} else if (file.getName().endsWith(".db")) {
progress.binaryFileReceived(file);
}
// this can be reached simultaneously by two different threads (the one
// that downloaded the .meta file and the one that downloaded the .db
// file), so we must synchronize it
synchronized (FileReceiverCallback.this) {
if (progress.isReceivedMetaFile() && progress.isReceivedBinaryFile()) {
// This ensures that the move is only done once
if (new File(progress.getMetaFile()).exists() && new File(progress.getBinaryFile()).exists()) {
// we already have both the binary & the meta file -> move the partition
// and then remove this action from the panel so that it's
// completed.
try {
// we need to remove existing files if they exist;
// they might be stale from old deployments
File meta = getLocalMetadataFile(tablespace, partition, version);
if (meta.exists()) {
meta.delete();
}
FileUtils.moveFile(new File(progress.getMetaFile()), meta);
File binaryToMove = new File(progress.getBinaryFile());
File binary = new File(getLocalStorageFolder(tablespace, partition, version), binaryToMove.getName());
if (binary.exists()) {
binary.delete();
}
FileUtils.moveToDirectory(binaryToMove, getLocalStorageFolder(tablespace, partition, version), true);
log.info("Balance action successfully completed, received both .db and .meta files (" + tablespace + ", " + partition + ", "
+ version + ")");
// Publish new changes to HZ
dnodesRegistry.changeInfo(new DNodeInfo(config));
} catch (IOException e) {
log.error(e);
} finally {
removeBalanceActionFromHZPanel(tablespace, partition, version);
}
}
}
}
}
@Override
public void onBadCRC(String tablespace, Integer partition, Long version, File file) {
removeBalanceActionFromHZPanel(tablespace, partition, version);
}
@Override
public void onError(Throwable t, String tablespace, Integer partition, Long version, File file) {
removeBalanceActionFromHZPanel(tablespace, partition, version);
}
// --- Helper methods --- //
/**
* Removes the BalanceAction associated with this file transfer from the
* Hazelcast data structure.
*/
private synchronized void removeBalanceActionFromHZPanel(String tablespace, Integer partition, Long version) {
// first remove the local tracking of this action
String lookupKey = tablespace + "_" + partition + "_" + version;
if (balanceActionsStateMap.containsKey(lookupKey)) {
balanceActionsStateMap.remove(lookupKey);
// then remove from HZ
BalanceAction actionToRemove = null;
for (Map.Entry<BalanceAction, String> actionEntry : coord.getDNodeReplicaBalanceActionsSet().entrySet()) {
BalanceAction action = actionEntry.getKey();
if (action.getTablespace().equals(tablespace) && action.getPartition() == partition && action.getVersion() == version
&& action.getFinalNode().equals(httpExchanger.address())) {
actionToRemove = action;
}
}
if (actionToRemove != null) {
coord.getDNodeReplicaBalanceActionsSet().remove(actionToRemove);
log.info("Removed balance action [" + actionToRemove + "] from HZ panel.");
} else {
// no need to worry - another thread might have gone through this code
// almost simultaneously
}
}
}
/**
* Obtains the progress bean for this transfer from a local map, creating
* and registering it if it doesn't exist yet.
*/
private synchronized BalanceFileReceivingProgress getProgressFromLocalPanel(String tablespace, Integer partition, Long version) {
String lookupKey = tablespace + "_" + partition + "_" + version;
BalanceFileReceivingProgress progress = balanceActionsStateMap.get(lookupKey);
if (progress == null) {
progress = new BalanceFileReceivingProgress(tablespace, partition, version);
balanceActionsStateMap.put(lookupKey, progress);
}
return progress;
}
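// For illustration (hypothetical values): a transfer of partition 0,
// version 1346 of tablespace "pagecounts" is tracked under the key
// "pagecounts_0_1346" until both the .db and the .meta file have arrived.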
}
/**
* Initialization logic: create the caches, start the HTTP file exchanger,
* connect to the Hazelcast cluster, etc.
*
* @see com.splout.db.dnode.IDNodeHandler#init(com.splout.db.common.SploutConfiguration)
*/
public void init(SploutConfiguration config) throws Exception {
this.config = config;
long evictionSeconds = config.getLong(DNodeProperties.EH_CACHE_SECONDS);
maxResultsPerQuery = config.getInt(DNodeProperties.MAX_RESULTS_PER_QUERY);
int maxCachePools = config.getInt(DNodeProperties.EH_CACHE_N_ELEMENTS);
absoluteSlowQueryLimit = config.getLong(DNodeProperties.SLOW_QUERY_ABSOLUTE_LIMIT);
deployParallelism = config.getInt(DNodeProperties.DEPLOY_PARALLELISM);
factory = new ManagerFactory();
factory.init(config);
// We create a Cache for holding SQL connection pools to different
// tablespace versions
// http://stackoverflow.com/questions/2583429/how-to-differentiate-between-time-to-live-and-time-to-idle-in-ehcache
dbCache = new Cache("dbCache", maxCachePools, false, false, Integer.MAX_VALUE, evictionSeconds);
dbCache.initialise();
if (fetcher == null) {
// The Fetcher in charge of downloading new deployments
this.fetcher = new Fetcher(config);
}
// When a tablespace version is expired, the connection pool is closed by an
// expiration handler
dbCache.getCacheEventNotificationService().registerListener(new CacheListener());
// The executor that will execute deployments asynchronously
deployExecutor = Executors.newCachedThreadPool(new ThreadFactoryBuilder().setNameFormat("deploy-%d").build());
// A thread that will listen to file exchanges through HTTP
httpExchanger = new HttpFileExchanger(config, new FileReceiverCallback());
httpExchanger.init();
httpExchanger.start();
// Connect with the cluster.
hz = Hazelcast.newHazelcastInstance(HazelcastConfigBuilder.build(config));
coord = new CoordinationStructures(hz);
coord.getDNodeReplicaBalanceActionsSet().addEntryListener(new BalanceActionItemListener(), false);
// Add shutdown hook
Runtime.getRuntime().addShutdownHook(new Thread() {
@Override
public void run() {
try {
log.info("Shutdown hook called - trying to gently stop DNodeHandler " + whoAmI() + " ...");
DNodeHandler.this.stop();
} catch (Throwable e) {
log.error("Error in ShutdownHook", e);
}
}
});
upSince = System.currentTimeMillis();
}
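// Minimal usage sketch (illustrative; assumes SploutConfiguration.get()
// loads the default configuration):
//
//   SploutConfiguration config = SploutConfiguration.get();
//   DNodeHandler handler = new DNodeHandler();
//   handler.init(config); // caches, HTTP exchanger, Hazelcast connection
//   handler.giveGreenLigth(); // register this DNode in the cluster
//   // ... serve queries ...
//   handler.stop();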
/**
* Registers the DNode in the cluster. This gives the green light to use it.
*/
@Override
public void giveGreenLigth() {
int minutesToCheckRegister = config.getInt(HazelcastProperties.MAX_TIME_TO_CHECK_REGISTRATION, 5);
int oldestMembersLeading = config.getInt(HazelcastProperties.OLDEST_MEMBERS_LEADING_COUNT, 3);
dnodesRegistry = new DistributedRegistry(CoordinationStructures.DNODES, new DNodeInfo(config), hz, minutesToCheckRegister,
oldestMembersLeading);
dnodesRegistry.register();
}
/**
* Deletes the files and folders kept by the DNode for a particular tablespace
* and version.
*/
private void deleteLocalVersion(com.splout.db.thrift.TablespaceVersion version) throws IOException {
File dataFolder = new File(config.getString(DNodeProperties.DATA_FOLDER));
File tablespaceFolder = new File(dataFolder, version.getTablespace());
File versionFolder = new File(tablespaceFolder, version.getVersion() + "");
if (versionFolder.exists()) {
File[] partitions = versionFolder.listFiles();
if (partitions != null) {
for (File partition : partitions) {
if (partition.isDirectory()) {
// remove references to the engine in EHCache
// so that disk space is immediately available
String dbKey = version.getTablespace() + "_" + version.getVersion() + "_" + partition.getName();
synchronized (dbCache) {
if (dbCache.get(dbKey) != null) {
dbCache.remove(dbKey);
log.info("-- Removing references from ECache: " + dbKey);
}
}
}
}
}
FileUtils.deleteDirectory(versionFolder);
log.info("-- Successfully removed " + versionFolder);
} else {
// Could happen; nothing to worry about
}
}
/**
* This method will be called either before publishing a new tablespace after
* a deploy or when a query is issued to a tablespace/version which is not
* "warmed" (e.g. after Splout restart, or after long inactivity).
*/
private Element loadManagerInEHCache(String tablespace, long version, int partition, File dbFolder, PartitionMetadata partitionMetadata)
throws DNodeException {
try {
// Create a new EHCache entry whose value is an {@link EngineManager}
EngineManager manager = factory.getManagerIn(dbFolder, partitionMetadata);
String dbKey = tablespace + "_" + version + "_" + partition;
Element dbPoolInCache = new Element(dbKey, manager);
dbCache.put(dbPoolInCache);
return dbPoolInCache;
} catch (Exception e) {
log.error("Error instantiating a manager for a data partition", e);
throw new DNodeException(EXCEPTION_ORDINARY, "Error (" + e.getMessage() + ") instantiating a manager for a data partition");
}
}
public EngineManager getManager(String tablespace, long version, int partition) throws DNodeException, IOException {
// Look for the EHCache database pool cache
String dbKey = tablespace + "_" + version + "_" + partition;
Element dbPoolInCache = null;
synchronized (dbCache) {
dbPoolInCache = dbCache.get(dbKey);
if (dbPoolInCache == null) {
File dbFolder = getLocalStorageFolder(tablespace, partition, version);
if (!dbFolder.exists()) {
log.warn("Asked for " + dbFolder + " but it doesn't exist!");
throw new DNodeException(EXCEPTION_ORDINARY, "Requested tablespace (" + tablespace + ") + version (" + version
+ ") is not available.");
}
File metadata = getLocalMetadataFile(tablespace, partition, version);
ThriftReader reader = new ThriftReader(metadata);
PartitionMetadata partitionMetadata = (PartitionMetadata) reader.read(new PartitionMetadata());
reader.close();
dbPoolInCache = loadManagerInEHCache(tablespace, version, partition, dbFolder, partitionMetadata);
}
}
return ((EngineManager) dbPoolInCache.getObjectValue());
}
/**
* Called by both the binary and the JSON RPC query methods.
*/
private Object sqlQueryHelperMethod(String tablespace, long version, int partition, boolean binary, String query) throws DNodeException {
String msg = "query served tablespace[" + tablespace + "]" + " version[" + version + "] partition[" + partition + "] sql[" + query
+ "]";
String status = "ERROR";
String errMsg = "";
performanceTool.startQuery();
try {
try {
EngineManager manager = getManager(tablespace, version, partition);
Object result = null;
// Query the {@link EngineManager} and return the result
if (binary) {
result = ResultSerializer.serialize(manager.query(query, maxResultsPerQuery));
} else {
result = manager.query(query, maxResultsPerQuery).jsonize();
}
status = "OK";
return result;
} catch (EngineManager.ShouldRetryInReplicaException e) {
throw new DNodeException(EXCEPTION_ORDINARY, e.getMessage());
} catch (Throwable e) {
unexpectedException(e);
throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
}
} catch (DNodeException e) {
errMsg = e.getMsg();
failedQueries.incrementAndGet();
throw e;
} finally {
long time = performanceTool.endQuery();
msg += " time[" + time + "] status[" + status + "]";
if ("ERROR".equals(status)) {
msg += " errorMessage[" + errMsg + "]";
}
log.info(msg);
if (time > absoluteSlowQueryLimit) {
// slow query!
log.warn("[SLOW QUERY] Query time over absolute slow query time (" + absoluteSlowQueryLimit
+ ") : sql[" + query + "] time[" + time + "]");
slowQueries++;
}
}
}
/**
* Thrift RPC method -> Given a tablespace and a version, execute the SQL
* query. Returns the result as JSON.
*/
@Override
public String sqlQuery(String tablespace, long version, int partition, String query) throws DNodeException {
return (String) sqlQueryHelperMethod(tablespace, version, partition, false, query);
}
/**
* Thrift RPC method -> Given a tablespace and a version, execute the SQL
* query. Supports more efficient serialization through Kryo.
*/
@Override
public ByteBuffer binarySqlQuery(String tablespace, long version, int partition, String query) throws DNodeException {
return (ByteBuffer) sqlQueryHelperMethod(tablespace, version, partition, true, query);
}
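// Illustrative call (hypothetical tablespace/version/partition values):
//
//   String json = handler.sqlQuery("pagecounts", 1346L, 0,
//       "SELECT * FROM pagecounts LIMIT 10;");
//
// The same query through binarySqlQuery() returns a Kryo-serialized
// ByteBuffer instead of a JSON string.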
private void markDeployAsAborted(long version, String errorMessage) {
ConcurrentMap<String, String> panel = coord.getDeployErrorPanel(version);
panel.put(whoAmI(), errorMessage);
}
/**
* Thrift RPC method -> Given a list of {@link DeployAction}s and a version
* identifying the deployment, perform an asynchronous deploy.
*/
@Override
public String deploy(final List<DeployAction> deployActions, final long version) throws DNodeException {
try {
synchronized (deployLock) {
Thread deployWait = getDeployControllerThread(deployActions, version);
deployWait.start();
}
// Everything is asynchronous so this is quickly reached - it just means
// the process has started
return JSONSerDe.ser(new DNodeStatusResponse("Ok. Deploy initiated"));
} catch (Throwable t) {
unexpectedException(t);
throw new DNodeException(EXCEPTION_UNEXPECTED, t.getMessage());
}
}
/**
* Here we instantiate a Thread that waits for the deploy so that we
* are able to implement a deploy timeout... If the deploy takes too long,
* we cancel it. We achieve this by using Java asynchronous Future
* objects.
*/
protected Thread getDeployControllerThread(final List<DeployAction> deployActions, final long version) {
return new Thread() {
public void run() {
Future<?> future = deployExecutor.submit(newDeployRunnable(deployActions, version));
deploysBeingExecuted.put(version, future);
try {
// This line makes the wait thread wait for the deploy for as long as
// the configuration allows.
// If the timeout passes a TimeoutException is thrown
future.get(config.getInt(DNodeProperties.DEPLOY_TIMEOUT_SECONDS), TimeUnit.SECONDS);
} catch (CancellationException e) {
log.info("Cancelation when waiting for local deploy to finish - killing deployment " + "version[" + version + "]");
markDeployAsAborted(version, ExceptionUtils.getStackTrace(e));
} catch (InterruptedException e) {
log.info("Interrupted exception waiting for local deploy to finish - killing deployment" + "version[" + version + "]");
markDeployAsAborted(version, ExceptionUtils.getStackTrace(e));
} catch (ExecutionException e) {
log.warn("Execution exception waiting for local deploy to finish - killing deployment." + "version[" + version + "]", e);
markDeployAsAborted(version, ExceptionUtils.getStackTrace(e));
} catch (TimeoutException e) {
log.info("Timeout waiting for local deploy to finish - killing deployment." + "version[" + version + "]", e);
markDeployAsAborted(version, "Timeout reached - " + config.getInt(DNodeProperties.DEPLOY_TIMEOUT_SECONDS) + " seconds");
lastDeployTimedout.set(true);
} finally {
// If the future didn't end, we just send an interrupt signal to it.
future.cancel(true);
deploysBeingExecuted.remove(version);
}
}
};
}
protected DeployRunnable newDeployRunnable(List<DeployAction> deployActions, long version) {
return new DeployRunnable(deployActions, version);
}
/**
* Thrift RPC method -> Given a list of {@link RollbackAction}s, perform a
* synchronous rollback
*/
@Override
public String rollback(List<RollbackAction> rollbackActions, String ignoreMe) throws DNodeException {
// The DNode doesn't need to do anything special for rolling back a version.
// It can serve any version that is stored locally.
try {
return JSONSerDe.ser(new DNodeStatusResponse("Ok. Rollback order received."));
} catch (JSONSerDeException e) {
unexpectedException(e);
throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
}
}
/*
* Any unexpected exception must be redirected to this method. In this way we
* can monitor it using state variables. The state variables will then be
* passed onto the appropriate bean in the status() RPC call.
*/
private void unexpectedException(Throwable t) {
log.error("Unexpected Exception", t);
lastException = t.getMessage();
lastExceptionTime = System.currentTimeMillis();
}
/**
* Returns an {@link com.splout.db.dnode.beans.DNodeSystemStatus} filled with
* the appropriate data.
*/
@Override
public String status() throws DNodeException {
try {
DNodeSystemStatus status = new DNodeSystemStatus();
if (lastException == null) {
status.setSystemStatus("UP");
status.setLastExceptionTime(-1);
} else {
status.setSystemStatus("Last exception: " + lastException);
status.setLastExceptionTime(lastExceptionTime);
}
status.setUpSince(upSince);
status.setFailedQueries(failedQueries.get());
status.setnQueries(performanceTool.getNQueries());
status.setAverage(performanceTool.getAverage());
status.setSlowQueries(slowQueries);
status.setDeploysInProgress(deployInProgress.get());
status.setHttpExchangerAddress(httpExchangerAddress());
status.setTcpAddress(getTCPAPIAddress());
status.setBalanceActionsStateMap(balanceActionsStateMap);
File folder = new File(config.getString(DNodeProperties.DATA_FOLDER));
if (folder.exists()) {
status.setFreeSpaceInDisk(FileSystemUtils.freeSpaceKb(folder.toString()));
status.setOccupiedSpaceInDisk(FileUtils.sizeOfDirectory(folder));
Collection<File> files = FileUtils.listFilesAndDirs(folder, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE);
status.setFiles(new ArrayList<String>(Lists.transform(Lists.newArrayList(files), new Function<File, String>() {
@Override
public String apply(File file) {
return file.getAbsolutePath() + " (" + FileUtils.sizeOf(file) + " bytes)";
}
})));
Collections.sort(status.getFiles());
} else {
status.setOccupiedSpaceInDisk(0);
status.setFreeSpaceInDisk(FileSystemUtils.freeSpaceKb("."));
status.setFiles(new ArrayList<String>());
}
return JSONSerDe.ser(status);
} catch (Throwable t) {
unexpectedException(t);
throw new DNodeException(EXCEPTION_UNEXPECTED, t.getMessage());
}
}
protected File getLocalStorageFolder(String tablespace, int partition, long version) {
return getLocalStorageFolder(config, tablespace, partition, version);
}
/**
* Returns the folder where the DNode that uses the provided Configuration
* will store the binary data for this tablespace, version and partition.
*/
public static File getLocalStorageFolder(SploutConfiguration config, String tablespace, int partition, long version) {
String dataFolder = config.getString(DNodeProperties.DATA_FOLDER);
return new File(dataFolder + "/" + getLocalStoragePartitionRelativePath(tablespace, partition, version));
}
public static String getLocalStoragePartitionRelativePath(String tablespace, int partition, long version) {
return tablespace + "/" + version + "/" + partition;
}
protected File getLocalMetadataFile(String tablespace, int partition, long version) {
return getLocalMetadataFile(config, tablespace, partition, version);
}
/**
* Returns the file where the DNode that uses the provided Configuration will
* store the metadata for this tablespace, version and partition.
*/
public static File getLocalMetadataFile(SploutConfiguration config, String tablespace, int partition, long version) {
String dataFolder = config.getString(DNodeProperties.DATA_FOLDER);
return new File(dataFolder + "/" + getLocalMetadataFileRelativePath(tablespace, partition, version));
}
public static String getLocalMetadataFileRelativePath(String tablespace, int partition, long version) {
return tablespace + "/" + version + "/" + partition + ".meta";
}
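// For illustration (hypothetical values): with DATA_FOLDER "/opt/splout/data",
// tablespace "pagecounts", partition 0 and version 1346:
//   getLocalStoragePartitionRelativePath(...) -> "pagecounts/1346/0"
//   getLocalMetadataFileRelativePath(...)     -> "pagecounts/1346/0.meta"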
public boolean isDeployInProgress() {
return deployInProgress.get() > 0;
}
/**
* Properly disposes of this DNode.
*/
public void stop() throws Exception {
dbCache.dispose();
deployExecutor.shutdownNow();
factory.close();
httpExchanger.close();
hz.getLifecycleService().shutdown();
}
@Override
public String abortDeploy(long version) throws DNodeException {
try {
synchronized (deployLock) {
// No new deploys to be handled until we
// cancel the current one
Future<?> future = deploysBeingExecuted.get(version);
if (future == null) {
return JSONSerDe.ser(new DNodeStatusResponse("Not deployment running with version["+version+"]"));
}
future.cancel(true);
return JSONSerDe.ser(new DNodeStatusResponse("status[OK]. Deploy with version["+version+"] cancelled."));
}
} catch (JSONSerDeException e) {
unexpectedException(e);
throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
}
}
@Override
public String deleteOldVersions(List<com.splout.db.thrift.TablespaceVersion> versions) throws DNodeException {
for (com.splout.db.thrift.TablespaceVersion version : versions) {
log.info("Going to remove " + version + " as I have been told to do so.");
try {
deleteLocalVersion(version);
} catch (Throwable e) {
unexpectedException(e);
throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
}
}
try {
// Publish new DNodeInfo in distributed registry.
// This makes QNodes notice that a new version is available...
// PartitionMap and ReplicationMap will be built incrementally as DNodes
// finish.
dnodesRegistry.changeInfo(new DNodeInfo(config));
return JSONSerDe.ser(new DNodeStatusResponse("Ok. Delete old versions executed."));
} catch (JSONSerDeException e) {
unexpectedException(e);
throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
}
}
// ----------------- TEST API ----------------- //
private AtomicBoolean shutDownByTestAPI = new AtomicBoolean(false);
/*
* This method is called by unit / integration tests in order to simulate
* failures and recoveries in DNodes and such.
*/
@Override
public String testCommand(String commandStr) throws DNodeException {
if (!config.getBoolean(DNodeProperties.HANDLE_TEST_COMMANDS)) {
throw new DNodeException(EXCEPTION_ORDINARY, "Can't handle test commands as " + DNodeProperties.HANDLE_TEST_COMMANDS
+ " is not set to true.");
}
TestCommands command;
try {
command = TestCommands.valueOf(commandStr);
} catch (IllegalArgumentException e) {
// valueOf() throws IllegalArgumentException rather than returning null
throw new DNodeException(EXCEPTION_ORDINARY, "Unknown test command: " + commandStr);
}
if (command.equals(TestCommands.SHUTDOWN)) {
// on-demand shutdown
// This is a "soft-shutdown" so we can recover from it.
// It is designed for unit and integration testing.
shutDownByTestAPI.set(true);
dnodesRegistry.unregister();
log.info("Received a shutdown by test API.");
hz.getLifecycleService().shutdown();
} else if (command.equals(TestCommands.RESTART)) {
// on-demand restart
// This is a "soft-restart" after a "soft-shutdown".
// It is designed for unit and integration testing.
shutDownByTestAPI.set(false);
try {
hz = Hazelcast.newHazelcastInstance(HazelcastConfigBuilder.build(config));
coord = new CoordinationStructures(hz);
log.info("Received a restart by test API.");
giveGreenLigth();
} catch (HazelcastConfigBuilderException e) {
log.error("Error while trying to connect to Hazelcast", e);
throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
}
}
try {
return JSONSerDe.ser(new DNodeStatusResponse("Ok. Test command " + commandStr + " received properly."));
} catch (JSONSerDeException e) {
unexpectedException(e);
throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
}
}
// --- Getters mainly for testing --- //
public CoordinationStructures getCoord() {
return coord;
}
public DistributedRegistry getDnodesRegistry() {
return dnodesRegistry;
}
protected class DeployRunnable implements Runnable {
private final List<DeployAction> deployActions;
private final long version;
public DeployRunnable(List<DeployAction> deployActions, long version) {
this.deployActions = deployActions;
this.version = version;
}
// This code is executed by the deploy thread itself, not by the one
// that waits for it
@Override
public void run() {
try {
deployInProgress.incrementAndGet();
lastDeployTimedout.set(false);
log.info("Starting [" + deployActions.size() + "] deploy actions.");
long start = System.currentTimeMillis();
long totalSize = 0;
// Ask for the total size of the deployment first.
for (DeployAction action : deployActions) {
long plusSize = fetcher.sizeOf(action.getDataURI());
if (plusSize == Fetcher.SIZE_UNKNOWN) {
totalSize = Fetcher.SIZE_UNKNOWN;
break;
}
totalSize += plusSize;
}
// Keep the SIZE_UNKNOWN sentinel intact so the checks below still match it
final double totalKnownSize = totalSize == Fetcher.SIZE_UNKNOWN ? Fetcher.SIZE_UNKNOWN : totalSize / (1024d * 1024d);
final long startTime = System.currentTimeMillis();
final AtomicLong bytesSoFar = new AtomicLong(0L);
final Fetcher.Reporter reporter = new Fetcher.Reporter() {
@Override
public void progress(long consumed) {
long now = System.currentTimeMillis();
double totalSoFar = bytesSoFar.addAndGet(consumed) / (1024d * 1024d);
double secondsSoFar = (now - startTime) / 1000d;
double mBytesPerSec = totalSoFar / secondsSoFar;
String msg = "[" + whoAmI() + " progress/speed report]: Fetched [";
if (totalSoFar > 999) {
msg += String.format("%.3f", (totalSoFar / 1024d)) + "] GBs so far ";
} else {
msg += String.format("%.3f", totalSoFar) + "] MBs so far ";
}
if (totalKnownSize != Fetcher.SIZE_UNKNOWN) {
msg += "(out of [";
if (totalKnownSize > 999) {
msg += String.format("%.3f", (totalKnownSize / 1024d)) + "] GBs) ";
} else {
msg += String.format("%.3f", totalKnownSize) + "] MBs) ";
}
}
msg += "- Current deployment speed is [" + String.format("%.3f", mBytesPerSec) + "] MB/s.";
// Add a report of the estimated remaining time if we can
if (totalKnownSize != Fetcher.SIZE_UNKNOWN) {
double missingSize = (totalKnownSize - totalSoFar);
long remainingSecs = (long) (missingSize / mBytesPerSec);
String timeRemaining = "";
if (remainingSecs > 3600) { // hours, minutes and secs
int hours = (int) (remainingSecs / 3600);
int restOfSeconds = (int) (remainingSecs % 3600);
timeRemaining = hours + " hours and " + (int) (restOfSeconds / 60) + " minutes and " + (restOfSeconds % 60)
+ " seconds";
} else if (remainingSecs > 60) { // minutes and secs
timeRemaining = (int) (remainingSecs / 60) + " minutes and " + (remainingSecs % 60) + " seconds";
} else { // secs
timeRemaining = remainingSecs + " seconds";
}
msg += " Estimated remaining time is [" + timeRemaining + "].";
}
coord.logDeploySpeed(version, whoAmI(), msg);
}
};
// Parallel execution of deploy actions
ExecutorService executor = Executors.newFixedThreadPool(deployParallelism);
ExecutorCompletionService<Boolean> ecs = new ExecutorCompletionService<Boolean>(executor);
ArrayList<Future<Boolean>> deployFutures = new ArrayList<Future<Boolean>>();
for (final DeployAction action : deployActions) {
deployFutures.add(ecs.submit(new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
// Downloads data and updates some structs
runDeployAction(reporter, action, version);
return true;
}
}));
}
// Wait for all tasks to finish.
for (int i = 0; i < deployActions.size(); i++) {
// get() will rethrow any exception thrown by the callable.
try {
ecs.take().get();
} catch (ExecutionException e) {
// One job failed. Stopping the rest.
cancelAndShutdown(executor, deployFutures);
throw e.getCause();
} catch (InterruptedException e) {
// Somebody interrupted the deployment thread. Stopping
// the rest of tasks.
cancelAndShutdown(executor, deployFutures);
throw e;
} catch (CancellationException e) {
// Somebody cancelled the deployment thread. Stopping
// the rest of tasks.
cancelAndShutdown(executor, deployFutures);
throw new InterruptedException();
}
}
executor.shutdown();
// Publish new DNodeInfo in distributed registry.
// This makes QNodes notice that a new version is available...
// PartitionMap and ReplicationMap will be built incrementally
// as DNodes finish.
dnodesRegistry.changeInfo(new DNodeInfo(config));
long end = System.currentTimeMillis();
log.info("Local [" + deployActions.size() + "] deploy actions successfully finished in " + (end - start) + " ms.");
} catch (InterruptedException e) {
// Somebody interrupted the thread. Probably somebody is aborting.
log.info("Version[" + version + "] deployment interrupted");
} catch (Throwable t) {
// In order to avoid stale deployments, we flag this deploy to
// be aborted
log.warn("Error deploying[" + deployActions + "] barrier + version[" + version + "]", t);
markDeployAsAborted(version, ExceptionUtils.getStackTrace(t));
} finally {
deployInProgress.decrementAndGet();
// Decrement the countdown latch. On 0, the deployer knows that the
// deploy finished.
ICountDownLatch countdown = coord.getCountDownLatchForDeploy(version);
countdown.countDown();
}
}
protected void cancelAndShutdown(ExecutorService executor, ArrayList<Future<Boolean>> deployFutures) {
for (Future<Boolean> task : deployFutures) {
task.cancel(true);
}
executor.shutdown();
}
}
/**
* Runs a deploy action. Downloads the file and warms up the data.
* Interruptible.
*/
private void runDeployAction(Fetcher.Reporter reporter, DeployAction action, long version) throws IOException, URISyntaxException,
DNodeException, InterruptedException {
log.info("Running deployAction[" + action + "] for version[" + version + "].");
// 1- Call the fetcher for fetching
File fetchedContent = fetcher.fetch(action.getDataURI(), reporter);
// If we reach this point then the fetch has been OK
// 2- Create the local folder where the fetched data will be moved
File dbFolder = getLocalStorageFolder(action.getTablespace(), action.getPartition(), version);
if (dbFolder.exists()) {
// If the folder we want to deploy to already exists, it is stale
// data from a previous failed deploy - it is ok to delete it
FileUtils.deleteDirectory(dbFolder);
}
// 3- Perform a "mv" for finally making the data available
FileUtils.moveDirectory(fetchedContent, dbFolder);
// 4- Check if interrupted. In this case, we remove the folder before returning
if (Thread.interrupted()) {
try {
FileUtils.deleteDirectory(dbFolder);
} catch (IOException e) {
log.warn("Not possible to remove " + dbFolder + " when trying to cancel de deployment.");
}
throw new InterruptedException();
}
// 5- Store metadata about the partition
writePartitionMetadata(action, version);
// 6- Preemptively load the Manager in case initialization is slow
// Managers might warm up for a while (e.g. loading data into memory)
loadManagerInEHCache(action.getTablespace(), action.getVersion(), action.getPartition(), dbFolder, action.getMetadata());
log.info("Finished deployAction[" + action + "] for version[" + version + "].");
}
private void writePartitionMetadata(DeployAction action, long version) throws IOException {
File metadataFile = getLocalMetadataFile(action.getTablespace(), action.getPartition(), version);
if (!metadataFile.getParentFile().exists()) {
metadataFile.getParentFile().mkdirs();
}
ThriftWriter writer = new ThriftWriter(metadataFile);
writer.write(action.getMetadata());
writer.close();
}
}