/*
*
* * RHQ Management Platform
* * Copyright (C) 2005-2014 Red Hat, Inc.
* * All rights reserved.
* *
* * This program is free software; you can redistribute it and/or modify
* * it under the terms of the GNU General Public License, version 2, as
* * published by the Free Software Foundation, and/or the GNU Lesser
* * General Public License, version 2.1, also as published by the Free
* * Software Foundation.
* *
* * This program is distributed in the hope that it will be useful,
* * but WITHOUT ANY WARRANTY; without even the implied warranty of
* * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* * GNU General Public License and the GNU Lesser General Public License
* * for more details.
* *
* * You should have received a copy of the GNU General Public License
* * and the GNU Lesser General Public License along with this program;
* * if not, write to the Free Software Foundation, Inc.,
* * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
*/
package org.rhq.plugins.storage;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hyperic.sigar.SigarException;
import org.mc4j.ems.connection.EmsConnection;
import org.mc4j.ems.connection.EmsInvocationException;
import org.mc4j.ems.connection.bean.EmsBean;
import org.mc4j.ems.connection.bean.attribute.EmsAttribute;
import org.mc4j.ems.connection.bean.operation.EmsOperation;
import org.yaml.snakeyaml.error.YAMLException;
import org.rhq.cassandra.util.ConfigEditor;
import org.rhq.cassandra.util.ConfigEditorException;
import org.rhq.core.domain.configuration.Configuration;
import org.rhq.core.domain.configuration.ConfigurationUpdateStatus;
import org.rhq.core.domain.configuration.Property;
import org.rhq.core.domain.configuration.PropertyList;
import org.rhq.core.domain.configuration.PropertyMap;
import org.rhq.core.domain.configuration.PropertySimple;
import org.rhq.core.pluginapi.configuration.ConfigurationFacet;
import org.rhq.core.pluginapi.configuration.ConfigurationUpdateReport;
import org.rhq.core.pluginapi.inventory.ProcessScanResult;
import org.rhq.core.pluginapi.inventory.ResourceContext;
import org.rhq.core.pluginapi.operation.OperationFacet;
import org.rhq.core.pluginapi.operation.OperationResult;
import org.rhq.core.system.ProcessInfo;
import org.rhq.core.util.StringUtil;
import org.rhq.core.util.exception.ThrowableUtil;
import org.rhq.core.util.file.FileUtil;
import org.rhq.core.util.stream.StreamUtil;
import org.rhq.plugins.cassandra.CassandraNodeComponent;
import org.rhq.plugins.cassandra.util.KeyspaceService;
/**
* @author John Sanda
*/
public class StorageNodeComponent extends CassandraNodeComponent implements OperationFacet, ConfigurationFacet {
// Logger used by every operation in this component.
private Log log = LogFactory.getLog(StorageNodeComponent.class);
// Names of the keyspaces this component passes to the snapshot, repair and cleanup calls.
private static final String SYSTEM_AUTH_KEYSPACE = "system_auth";
private static final String RHQ_KEYSPACE = "rhq";
private static final String SYSTEM_KEYSPACE = "system";
/**
 * Loads the resource configuration by delegating to a {@link StorageNodeConfigDelegate}
 * created for this node's base directory.
 *
 * @return the configuration produced by the delegate
 * @throws Exception whatever the delegate throws while loading
 */
@Override
public Configuration loadResourceConfiguration() throws Exception {
    StorageNodeConfigDelegate delegate = new StorageNodeConfigDelegate(getBasedir(), this);
    return delegate.loadResourceConfiguration();
}
/**
 * Applies a resource configuration update by handing the report to a
 * {@link StorageNodeConfigDelegate} created for this node's base directory.
 *
 * @param configurationUpdateReport the report carrying the requested configuration; the
 *                                  delegate is responsible for recording the outcome on it
 */
@Override
public void updateResourceConfiguration(ConfigurationUpdateReport configurationUpdateReport) {
    new StorageNodeConfigDelegate(getBasedir(), this).updateResourceConfiguration(configurationUpdateReport);
}
/**
 * Shuts the storage node down unless there is no sign of it running.
 * <p>
 * The node is considered "not running" only when the resource context has no native process
 * AND no <code>cassandra.pid</code> file exists in the bin directory.
 *
 * @return the result of {@link #shutdownStorageNode()}, or a result stating that the node is
 *         not running
 */
private OperationResult shutdownIfNecessary() {
    log.info("Shutting down " + getResourceContext().getResourceKey());
    ProcessInfo nativeProcess = getResourceContext().getNativeProcess();

    // The pid file is only consulted when no native process is known.
    if (nativeProcess == null && !new File(getBinDir(), "cassandra.pid").exists()) {
        return new OperationResult("Storage node is not running");
    }
    return shutdownStorageNode();
}
/**
 * @return the directory named by the <code>baseDir</code> property of the plugin configuration
 */
private File getBasedir() {
    String baseDirPath = getResourceContext().getPluginConfiguration().getSimpleValue("baseDir");
    return new File(baseDirPath);
}
/**
 * @return the <code>bin</code> directory underneath the base directory
 */
private File getBinDir() {
    File baseDir = getBasedir();
    return new File(baseDir, "bin");
}
/**
 * @return the <code>conf</code> directory underneath the base directory
 */
private File getConfDir() {
    File baseDir = getBasedir();
    return new File(baseDir, "conf");
}
/**
 * @return the <code>rhq-storage-auth.conf</code> file inside the conf directory
 */
private File getInternodeAuthConfFile() {
    File confDir = getConfDir();
    return new File(confDir, "rhq-storage-auth.conf");
}
/**
 * Dispatches an operation by name to the matching handler of this component. Names that are
 * not handled here are forwarded to the superclass.
 *
 * @param name       the operation name
 * @param parameters the operation parameters, passed through to the handler where it takes any
 * @return the handler's result
 * @throws Exception whatever the selected handler (or the superclass) throws
 */
@Override
public OperationResult invokeOperation(String name, Configuration parameters) throws Exception {
    if (name.equals("addNodeMaintenance")) {
        return nodeAdded(parameters);
    }
    if (name.equals("removeNodeMaintenance")) {
        return nodeRemoved(parameters);
    }
    if (name.equals("prepareForUpgrade")) {
        return prepareForUpgrade(parameters);
    }
    if (name.equals("repair")) {
        return repair();
    }
    if (name.equals("updateConfiguration")) {
        return updateConfiguration(parameters);
    }
    if (name.equals("announce")) {
        return announce(parameters);
    }
    if (name.equals("unannounce")) {
        return unannounce(parameters);
    }
    if (name.equals("prepareForBootstrap")) {
        return prepareForBootstrap(parameters);
    }
    if (name.equals("shutdown")) {
        return shutdownStorageNode();
    }
    if (name.equals("decommission")) {
        return decommission();
    }
    if (name.equals("uninstall")) {
        return uninstall();
    }
    if (name.equals("moveDataFiles")) {
        return moveDataFiles(parameters);
    }
    // Not one of ours: let the generic Cassandra node component deal with it.
    return super.invokeOperation(name, parameters);
}
/**
 * Shuts down the storage node.
 * <p>
 * When <code>cassandra.pid</code> exists in the bin directory and a matching process is found,
 * that process is sent the "KILL" signal, the method waits for the node to go down and then
 * removes the pid file. When the pid file is missing, or no process matches the recorded pid,
 * the shutdown falls back to {@link #shutdownUsingNativeProcessInfo()}.
 *
 * @return a result carrying either a success message or an error message; this method reports
 *         pid file problems, kill failures and interruption through the result rather than by
 *         throwing
 */
private OperationResult shutdownStorageNode() {
    OperationResult result = new OperationResult();
    File pidFile = new File(getBinDir(), "cassandra.pid");

    try {
        if (pidFile.exists()) {
            long pid = readPidFile(pidFile);
            log.info("Shutting down storage node with pid " + pid);
            ProcessInfo process = findProcessInfo(pid);
            if (process != null) {
                try {
                    process.kill("KILL");
                    waitForNodeToGoDown();
                    if (!pidFile.delete()) {
                        log.warn("Could not delete pid file " + pidFile);
                    }
                    result.setSimpleResult("Successfully shut down storage node with pid " + pid);
                } catch (SigarException e) {
                    log.error("Failed to shut down storage node with pid " + pid, e);
                    result.setErrorMessage("Failed to shut down storage node with pid " + pid + ": " +
                        ThrowableUtil.getAllMessages(e));
                }
            } else {
                log.warn("Could not find process info for pid " + pid);
                result = shutdownUsingNativeProcessInfo();
            }
        } else {
            log.warn("Did not find pid file " + pidFile + ". It should not be modified, deleted, or moved.");
            result = shutdownUsingNativeProcessInfo();
        }
    } catch (FileNotFoundException e) {
        log.error("Could not read pid file " + pidFile, e);
        result.setErrorMessage("Could not read pid file " + pidFile + ": " + ThrowableUtil.getAllMessages(e));
    } catch (NumberFormatException e) {
        // A pid file that does not contain a number is reported like an unreadable one instead
        // of escaping as an unchecked exception.
        log.error("Could not parse pid file " + pidFile, e);
        result.setErrorMessage("Could not parse pid file " + pidFile + ": " + ThrowableUtil.getAllMessages(e));
    } catch (InterruptedException e) {
        log.warn("The shutdown operation was cancelled or interrupted. This interruption occurred while trying " +
            "to verify that the storage node process has exited.");
        result.setErrorMessage("The operation was cancelled or interrupted while trying to verify that the " +
            "storage node process has exited.");
        // Preserve the interrupt for whoever is running this operation.
        Thread.currentThread().interrupt();
    }
    return result;
}
/**
 * Reads the process id stored in the given pid file.
 *
 * @param pidFile the file to read
 * @return the pid contained in the file
 * @throws FileNotFoundException if the file cannot be opened
 * @throws NumberFormatException if the file content, ignoring surrounding whitespace, is not a
 *                               valid long
 */
private long readPidFile(File pidFile) throws FileNotFoundException {
    String contents = StreamUtil.slurp(new FileReader(pidFile));
    // Pid files commonly end with a newline, which Long.parseLong rejects.
    return Long.parseLong(contents.trim());
}
/**
 * Looks for a process with the given pid among the native processes the resource context
 * reports for this resource type.
 *
 * @param pid the process id to look for
 * @return the matching process info, or <code>null</code> when none of the scanned processes
 *         has that pid
 */
@SuppressWarnings("unchecked")
private ProcessInfo findProcessInfo(long pid) {
    List<ProcessScanResult> candidates = getResourceContext().getNativeProcessesForType();
    for (ProcessScanResult candidate : candidates) {
        ProcessInfo info = candidate.getProcessInfo();
        if (info.getPid() == pid) {
            return info;
        }
    }
    return null;
}
/**
 * Fallback shutdown used when the pid file could not be used: delegates to
 * <code>shutdownNode()</code> and then waits for the node to go down.
 *
 * @return the result returned by <code>shutdownNode()</code>
 * @throws InterruptedException if interrupted while waiting for the node to go down
 */
private OperationResult shutdownUsingNativeProcessInfo() throws InterruptedException {
    log.warn("Could not obtain process info from pid file");
    log.info("Obtaining process info from the system to perform the shutdown");

    final OperationResult shutdownOutcome = shutdownNode();
    waitForNodeToGoDown();
    return shutdownOutcome;
}
/**
 * Updates the storage node's JVM settings only (JMX port, heap sizes, thread stack size).
 * <p>
 * The <code>heapSize</code> parameter is applied to both <code>minHeapSize</code> and
 * <code>maxHeapSize</code>. The update is delegated to a {@link StorageNodeConfigDelegate},
 * which is also told whether it may restart the node (<code>restartIfRequired</code>, treated
 * as <code>false</code> when absent).
 *
 * @param params the operation parameters
 * @return a result with the simple value "Configuration updated."; its error message is set
 *         from the update report when the report status is not SUCCESS
 */
private OperationResult updateConfiguration(Configuration params) {
    String heapSize = params.getSimpleValue("heapSize");

    Configuration config = new Configuration();
    config.put(new PropertySimple("jmxPort", params.getSimpleValue("jmxPort")));
    config.put(new PropertySimple("minHeapSize", heapSize));
    config.put(new PropertySimple("maxHeapSize", heapSize));
    config.put(new PropertySimple("heapNewSize", params.getSimpleValue("heapNewSize")));
    config.put(new PropertySimple("threadStackSize", params.getSimpleValue("threadStackSize")));

    // Boolean.parseBoolean(null) is false, so a missing parameter means "do not restart".
    boolean restartIfRequired = Boolean.parseBoolean(params.getSimpleValue("restartIfRequired"));

    ConfigurationUpdateReport configurationUpdateReport = new ConfigurationUpdateReport(config);
    StorageNodeConfigDelegate configDelegate = new StorageNodeConfigDelegate(getBasedir(), this);
    configDelegate.updateResourceConfigurationAndRestartIfNecessary(configurationUpdateReport, restartIfRequired);

    OperationResult result = new OperationResult("Configuration updated.");
    if (!configurationUpdateReport.getStatus().equals(ConfigurationUpdateStatus.SUCCESS)) {
        result.setErrorMessage(configurationUpdateReport.getErrorMessage());
    }
    return result;
}
/**
 * Decommissions this storage node through the StorageService MBean.
 * <p>
 * Nothing is invoked when the MBean already reports the DECOMMISSIONED operation mode.
 * Otherwise the <code>decommission</code> operation is invoked and the operation mode is read
 * again; any value other than DECOMMISSIONED is reported as an error.
 *
 * @return a result whose error message is set when the decommission failed
 */
private OperationResult decommission() {
    log.info("Decommissioning " + getResourceContext().getResourceKey());
    OperationResult result = new OperationResult();

    try {
        EmsBean storageService = getEmsConnection().getBean("org.apache.cassandra.db:type=StorageService");
        EmsAttribute modeAttribute = storageService.getAttribute("OperationMode");
        String mode = (String) modeAttribute.refresh();

        if (mode.equals("DECOMMISSIONED")) {
            log.info("The storage node " + getHost() + " is already decommissioned.");
            return result;
        }

        Class<?>[] noParams = new Class<?>[0];
        storageService.getOperation("decommission", noParams).invoke((Object[]) noParams);

        // Re-read the mode to verify that the operation really took effect.
        mode = (String) modeAttribute.refresh();
        if (!mode.equals("DECOMMISSIONED")) {
            result.setErrorMessage("Failed to decommission storage node " + getHost() + ". The " +
                "StorageService is reporting " + mode + " for its operation mode but it should be " +
                "reporting DECOMMISSIONED. The StorageService operation mode is not to be confused with the " +
                "Storage Node operation mode.");
        }
    } catch (EmsInvocationException e) {
        result.setErrorMessage("Decommission operation failed: " + ThrowableUtil.getAllMessages(e));
    }
    return result;
}
/**
 * Uninstalls the storage node: shuts it down if needed, then purges the data directories
 * listed in the YAML configuration followed by the installation (base) directory.
 *
 * @return a result whose error message is set when the shutdown failed; nothing is purged in
 *         that case
 */
private OperationResult uninstall() {
    log.info("Uninstalling storage node at " + getResourceContext().getResourceKey());
    OperationResult result = new OperationResult();

    String shutdownError = shutdownIfNecessary().getErrorMessage();
    if (shutdownError != null) {
        result.setErrorMessage("Failed to shut down storage node: " + shutdownError);
        return result;
    }

    File basedir = getBasedir();
    if (!basedir.exists()) {
        log.info(basedir + " does not exist. Storage node files have already been purged.");
        return result;
    }

    log.info("Purging data directories");
    ConfigEditor yamlEditor = getYamlConfigEditor();
    yamlEditor.load();
    purgeDataDirs(yamlEditor);

    log.info("Purging installation directory " + basedir);
    purgeDir(basedir);

    log.info("Finished deleting storage node " + getResourceContext().getResourceKey());
    return result;
}
/**
 * Announces new nodes to this node: takes a snapshot, adds the given addresses to the
 * internode authentication file and asks the authenticator MBean to reload it.
 *
 * @param params operation parameters; the <code>addresses</code> list holds the addresses to add
 * @return a result with a "details" entry on success, or an error message when the
 *         authentication file could not be read or written
 */
private OperationResult announce(Configuration params) {
    OperationResult result = new OperationResult();
    try {
        Set<String> newAddresses = getAddreses(params);
        log.info("Announcing " + newAddresses);

        String snapshotPrefix = "pre_" + StringUtil.collectionToString(newAddresses) + "_bootstrap_";
        createSnapshots(newAddresses, snapshotPrefix);

        Set<String> authorized = getAuthAddresses();
        authorized.addAll(newAddresses);
        setAuthAddresses(authorized);
        reloadInternodeAuthConfig();

        String details = "Successfully announced " + newAddresses;
        result.getComplexResults().put(new PropertySimple("details", details));
    } catch (InternodeAuthConfUpdateException e) {
        result.setErrorMessage("Failed to update authorized nodes due to the following error(s): " +
            ThrowableUtil.getAllMessages(e));
    }
    return result;
}
/**
 * Withdraws nodes from this node's set of authorized peers: takes a snapshot, removes the
 * given addresses from the internode authentication file and asks the authenticator MBean to
 * reload it.
 *
 * @param params operation parameters; the <code>addresses</code> list holds the addresses to
 *               remove
 * @return a result with a "details" entry on success, or an error message when the
 *         authentication file could not be read or written
 */
private OperationResult unannounce(Configuration params) {
    OperationResult result = new OperationResult();
    try {
        Set<String> departing = getAddreses(params);
        log.info("Unannouncing " + departing);

        String snapshotPrefix = "pre_" + StringUtil.collectionToString(departing) + "_decommission_";
        createSnapshots(departing, snapshotPrefix);

        Set<String> authorized = getAuthAddresses();
        authorized.removeAll(departing);
        setAuthAddresses(authorized);
        reloadInternodeAuthConfig();

        String details = "Successfully unannounced " + departing;
        result.getComplexResults().put(new PropertySimple("details", details));
    } catch (InternodeAuthConfUpdateException e) {
        result.setErrorMessage("Failed to update authorized nodes due to the following error(s): " +
            ThrowableUtil.getAllMessages(e));
    }
    return result;
}
/**
 * Collects the string values of the <code>addresses</code> list parameter into a set.
 *
 * @param params operation parameters containing an <code>addresses</code> list of simple
 *               properties
 * @return the distinct address strings
 */
private Set<String> getAddreses(Configuration params) {
    List<Property> entries = params.getList("addresses").getList();
    Set<String> collected = new HashSet<String>();
    for (Property entry : entries) {
        collected.add(((PropertySimple) entry).getStringValue());
    }
    return collected;
}
/**
 * Takes a snapshot of the system, system_auth and rhq keyspaces. The snapshot name is the
 * given prefix followed by the current time in milliseconds.
 *
 * @param addressesToAdd not used by this method
 * @param snapshotPrefix prefix of the snapshot name
 */
private void createSnapshots(Set<String> addressesToAdd, String snapshotPrefix) {
    KeyspaceService service = new KeyspaceService(getEmsConnection());
    String[] keyspaces = new String[] { SYSTEM_KEYSPACE, SYSTEM_AUTH_KEYSPACE, RHQ_KEYSPACE };
    String snapshotName = snapshotPrefix + System.currentTimeMillis();
    service.takeSnapshot(keyspaces, snapshotName);
}
/**
 * Invokes the <code>reloadConfiguration</code> operation on the RhqInternodeAuthenticator MBean.
 */
private void reloadInternodeAuthConfig() {
    getEmsConnection()
        .getBean("org.rhq.cassandra.auth:type=RhqInternodeAuthenticator")
        .getOperation("reloadConfiguration")
        .invoke();
}
/**
 * Replaces the content of the internode authentication file with the given addresses and asks
 * the authenticator MBean to reload it.
 *
 * @param params operation parameters; the <code>addresses</code> list holds the complete set
 *               of trusted node addresses
 * @return a result with a "details" entry on success, or an error message when the file could
 *         not be updated
 */
@SuppressWarnings("deprecation")
private OperationResult updateKnownNodes(Configuration params) {
    OperationResult result = new OperationResult();
    Set<String> trustedAddresses = getAddreses(params);

    try {
        updateInternodeAuthConfFile(trustedAddresses);
        reloadInternodeAuthConfig();
        result.getComplexResults().put(
            new PropertySimple("details", "Successfully updated the set of known nodes."));
    } catch (InternodeAuthConfUpdateException e) {
        File authFile = getInternodeAuthConfFile();
        String failure = "Failed to update set of trusted nodes in " + authFile +
            " due to the following error(s): " + ThrowableUtil.getAllMessages(e);
        log.error(failure);
        result.setErrorMessage(failure);
    }
    return result;
}
/**
 * Prepares this storage node for bootstrapping into a cluster and restarts it.
 * <p>
 * Steps, in order: stop the node (abort with an error if that fails), load the YAML
 * configuration, purge the data directories, write the new seeds / native transport port /
 * storage port, rewrite the internode authentication file with all given addresses, and
 * start the node again.
 *
 * @param params operation parameters; reads <code>cqlPort</code>, <code>gossipPort</code> and
 *               the <code>addresses</code> list
 * @return a result with a success message, or an error message describing the step that failed
 */
@SuppressWarnings("rawtypes")
private OperationResult prepareForBootstrap(Configuration params) {
log.info("Preparing " + this + " for bootstrap...");
ResourceContext context = getResourceContext();
OperationResult result = new OperationResult();
log.info("Stopping storage node");
OperationResult shutdownResult = shutdownIfNecessary();
if (shutdownResult.getErrorMessage() != null) {
log.error("Failed to stop storage node " + getResourceContext().getResourceKey() + ". The storage node " +
"must be shut down in order for the changes made by this operation to take effect.");
result.setErrorMessage("Failed to stop the storage node. The storage node must be shut down in order " +
"for the changes made by this operation to take effect. The attempt to stop shut down the storage " +
"node failed with this error: " + shutdownResult.getErrorMessage());
return result;
}
Configuration pluginConfig = context.getPluginConfiguration();
ConfigEditor configEditor = getYamlConfigEditor();
try {
configEditor.load();
// Existing data is removed before the new cluster settings are written.
purgeDataDirs(configEditor);
log.info("Updating cluster settings");
String address = pluginConfig.getSimpleValue("host");
// NOTE(review): a non-numeric port raises NumberFormatException, which is not caught here.
int cqlPort = Integer.parseInt(params.getSimpleValue("cqlPort"));
int gossipPort = Integer.parseInt(params.getSimpleValue("gossipPort"));
List<String> addresses = getAddresses(params.getList("addresses"));
// Make sure this node's address is not in the list; otherwise, it
// won't bootstrap properly.
List<String> seeds = new ArrayList<String>(addresses);
seeds.remove(address);
log.info("Updating seeds property to " + seeds);
configEditor.setSeeds(seeds.toArray(new String[seeds.size()]));
configEditor.setNativeTransportPort(cqlPort);
configEditor.setStoragePort(gossipPort);
configEditor.save();
log.info("Cluster configuration settings have been applied to " + configEditor.getConfigFile());
// The auth file receives the full address list, including this node's own address.
updateInternodeAuthConfFile(new HashSet<String>(addresses));
log.info(this + " is ready to be bootstrap. Restarting storage node...");
OperationResult startResult = startNode();
if (startResult.getErrorMessage() != null) {
log.error("Failed to restart storage node:\n" + startResult.getErrorMessage());
result.setErrorMessage("Failed to restart storage node:\n" + startResult.getErrorMessage());
} else {
result.setSimpleResult("The storage node was successfully updated is now bootstrapping into the cluster.");
}
return result;
} catch (ConfigEditorException e) {
log.error("There was an error while trying to update " + configEditor.getConfigFile(), e);
// A restore is only attempted when the failure was caused by a YAMLException.
if (e.getCause() instanceof YAMLException) {
log.info("Attempting to restore " + configEditor.getConfigFile());
try {
configEditor.restore();
result.setErrorMessage("Failed to update configuration file [" + configEditor.getConfigFile() + "]: " +
ThrowableUtil.getAllMessages(e.getCause()));
} catch (ConfigEditorException e1) {
log.error("Failed to restore " + configEditor.getConfigFile() + ". A copy of the file prior to any modifications " +
"can be found at " + configEditor.getBackupFile());
result.setErrorMessage("There was an error updating [" + configEditor.getConfigFile() + "] and undoing the changes " +
"Failed. A copy of the file can be found at " + configEditor.getBackupFile() + ". See the " +
"agent logs for more details");
}
}
// NOTE(review): this check runs after the node was stopped above, and when it sets an error
// message it replaces the one set in the restore branch. If the cause is not a YAMLException
// and the mode is NORMAL, the result carries no error at all. Confirm this is intended.
EmsConnection emsConnection = getEmsConnection();
EmsBean storageService = emsConnection.getBean("org.apache.cassandra.db:type=StorageService");
EmsAttribute attribute = storageService.getAttribute("OperationMode");
String operationMode = (String) attribute.refresh();
if (!operationMode.equals("NORMAL")) {
result.setErrorMessage("Bootstrapping " + getHost() + " failed. The StorageService is reporting " +
operationMode + " for its operation mode but it should be reporting NORMAL. The StorageService " +
"operation mode is not to be confused with the Storage Node operation mode.");
}
return result;
} catch (InternodeAuthConfUpdateException e) {
File authFile = getInternodeAuthConfFile();
result.setErrorMessage("Failed to update " + authFile + " due to the following error(s): " +
ThrowableUtil.getAllMessages(e));
return result;
}
}
/**
 * Copies the content of the current directory to a new location when a new location was
 * requested and it really is a different directory. The original directory is left in place;
 * removing it is up to the caller.
 * <p>
 * The two locations are compared by canonical path rather than by raw string, so spellings of
 * the same directory (a trailing separator, <code>.</code>/<code>..</code> segments, symbolic
 * links) are not mistaken for a change. Without this, a directory could be copied onto itself
 * and then be purged by the caller as the "old" location.
 *
 * @param origDir Current data directory in the config
 * @param destDir Potential new data directory; <code>null</code> or empty means "no change"
 * @return true if files were copied to <code>destDir</code>
 * @throws IOException if a canonical path cannot be resolved or the copy fails
 */
private boolean copyDataDirectoryIfChanged(String origDir, String destDir) throws IOException {
    if (destDir == null || destDir.length() == 0) {
        return false;
    }

    File currentDir = new File(origDir);
    File newDir = new File(destDir);
    if (currentDir.getCanonicalFile().equals(newDir.getCanonicalFile())) {
        return false;
    }

    log.debug("Moving data from " + origDir + " to " + destDir);
    FileUtil.copyDirectory(currentDir, newDir);
    return true;
}
/**
 * Moves the storage node's data file, commit log and saved caches directories to new locations.
 * <p>
 * The node is stopped through the superclass <code>shutdownNode()</code>, each directory whose
 * requested location differs from the configured one is copied, the in-memory YAML settings
 * are pointed at the new locations and the node is started again. If the start fails, the
 * configuration is restored and the newly created directories are purged; if it succeeds, the
 * configuration is saved and the original directories are purged.
 * <p>
 * Only a single data file directory is supported, both in the request and in the existing
 * configuration.
 *
 * @param params operation parameters; reads <code>CommitLogLocation</code>,
 *               <code>SavedCachesLocation</code> and the <code>AllDataFileLocations</code> list
 * @return a result with a success message, or an error message describing what failed
 */
private OperationResult moveDataFiles(Configuration params) {
    OperationResult result = new OperationResult();

    log.info("Preparing to move " + this + "'s datafiles to new locations");

    String newCommitLogDirectory = params.getSimpleValue("CommitLogLocation");
    String newSavedCachesDirectory = params.getSimpleValue("SavedCachesLocation");
    PropertyList allDataFileLocations = params.getList("AllDataFileLocations");
    String newDataFileDirectory = null;
    if (allDataFileLocations != null) {
        List<String> dataDirectories = new LinkedList<String>();
        for (Property property : allDataFileLocations.getList()) {
            PropertySimple dataFileLocation = (PropertySimple) property;
            dataDirectories.add(dataFileLocation.getStringValue());
        }
        if (dataDirectories.size() > 1) {
            result.setErrorMessage("This process does not support more than one active directory for StorageNode data locations. ");
            return result;
        }
        // An empty list means no new data directory was requested.
        if (!dataDirectories.isEmpty()) {
            newDataFileDirectory = dataDirectories.get(0);
        }
    }

    if (newCommitLogDirectory == null && newSavedCachesDirectory == null && newDataFileDirectory == null) {
        return new OperationResult("No new directories were specified");
    }

    log.info("Stopping storage node");
    OperationResult shutdownResult = super.shutdownNode(); // CassandraNodeComponent.shutDownNode() does draining before shutting down
    try {
        waitForNodeToGoDown();
    } catch (InterruptedException e) {
        log.error("Received " + e.getLocalizedMessage() + " while waiting for storage node " + getResourceContext().getResourceKey() + " to shutdown", e);
        result.setErrorMessage("Failed to stop the storage node. The storage node must be shut down in order " +
            "for the changes made by this operation to take effect. The attempt to stop shut down the storage " +
            "node failed with this error: " + shutdownResult.getErrorMessage());
        // Preserve the interrupt for whoever is running this operation.
        Thread.currentThread().interrupt();
        return result;
    }
    if (shutdownResult.getErrorMessage() != null) {
        log.error("Failed to stop storage node " + getResourceContext().getResourceKey() + ". The storage node " +
            "must be shut down in order for the changes made by this operation to take effect.");
        result.setErrorMessage("Failed to stop the storage node. The storage node must be shut down in order " +
            "for the changes made by this operation to take effect. The attempt to stop shut down the storage " +
            "node failed with this error: " + shutdownResult.getErrorMessage());
        return result;
    }

    log.info("Storage node shutdown, preparing to move datafiles");

    // Directories that are copied away from (purged on success) and directories that are
    // created by the copy (purged on rollback).
    List<String> originalDataDirectories = new LinkedList<String>();
    List<String> createdDataDirectories = new LinkedList<String>();

    ConfigEditor configEditor = getYamlConfigEditor();
    try {
        configEditor.load();

        // Moving the data directory
        List<String> dataFileDirectories = configEditor.getDataFileDirectories();
        if (dataFileDirectories.size() > 1) {
            // We do not support this scenario
            log.error("More than one datadirectory configured for the StorageNode. This operation mode is not supported by this tool");
            StringBuilder pathListBuilder = new StringBuilder();
            for (String dataFileDir : dataFileDirectories) {
                pathListBuilder.append(dataFileDir).append(", ");
            }
            result.setErrorMessage("Could not proceed with moving datafiles from " + pathListBuilder.toString() + "this tool does not support"
                + " multiple datafile paths.");
            return result;
        } else if (dataFileDirectories.size() == 1) {
            String currentDataFileLocation = dataFileDirectories.get(0);
            boolean dataFilesMoved = copyDataDirectoryIfChanged(currentDataFileLocation, newDataFileDirectory);
            if (dataFilesMoved) {
                originalDataDirectories.add(currentDataFileLocation);
                createdDataDirectories.add(newDataFileDirectory);
                List<String> newDataFileDirectories = new LinkedList<String>();
                newDataFileDirectories.add(newDataFileDirectory);
                configEditor.setDataFileDirectories(newDataFileDirectories);
            }
        }

        // In theory we wouldn't need to copy these, as draining should empty these
        String currentCommitLogDirectory = configEditor.getCommitLogDirectory();
        boolean commitLogCopied = copyDataDirectoryIfChanged(currentCommitLogDirectory, newCommitLogDirectory);
        if (commitLogCopied) {
            originalDataDirectories.add(currentCommitLogDirectory);
            createdDataDirectories.add(newCommitLogDirectory);
            configEditor.setCommitLogDirectory(newCommitLogDirectory);
        }

        // Not so dangerous if we lose these, but lets try to keep them
        String currentSavedCachesDirectory = configEditor.getSavedCachesDirectory();
        boolean savedCachesCopied = copyDataDirectoryIfChanged(currentSavedCachesDirectory, newSavedCachesDirectory);
        if (savedCachesCopied) {
            originalDataDirectories.add(currentSavedCachesDirectory);
            createdDataDirectories.add(newSavedCachesDirectory);
            configEditor.setSavedCachesDirectory(newSavedCachesDirectory);
        }

        log.info(this + " datafiles have been moved. Restarting storage node...");
        // NOTE(review): the node is started before configEditor.save() is called. If the
        // setters above only take effect on save(), the node starts against the old
        // directories, which are purged right afterwards. Confirm against ConfigEditor.
        OperationResult startResult = startNode();
        if (startResult.getErrorMessage() != null) {
            log.error("Failed to restart storage node:\n" + startResult.getErrorMessage());
            result.setErrorMessage("Failed to restart storage node:\n" + startResult.getErrorMessage());
            // rollback here
            configEditor.restore();
            purgeDirectories(createdDataDirectories);
        } else {
            result.setSimpleResult("The storage node was succesfully updated.");
            // Commit changes, remove old directories
            configEditor.save(); // This can still throw an exception, in which case we need to rollback
            purgeDirectories(originalDataDirectories);
        }

        return result;
    } catch (ConfigEditorException e) {
        log.error("There was an error while trying to update " + configEditor.getConfigFile(), e);
        // A rollback is only attempted when the failure was caused by a YAMLException.
        if (e.getCause() instanceof YAMLException) {
            log.info("Attempting to restore " + configEditor.getConfigFile());
            try {
                configEditor.restore();
                purgeDirectories(createdDataDirectories);
                result.setErrorMessage("Failed to update configuration file [" + configEditor.getConfigFile() + "]: " +
                    ThrowableUtil.getAllMessages(e.getCause()));
            } catch (ConfigEditorException e1) {
                log.error("Failed to restore " + configEditor.getConfigFile() + ". A copy of the file prior to any modifications " +
                    "can be found at " + configEditor.getBackupFile());
                result.setErrorMessage("There was an error updating [" + configEditor.getConfigFile() + "] and undoing the changes " +
                    "Failed. A copy of the file can be found at " + configEditor.getBackupFile() + ". See the " +
                    "agent logs for more details");
            }
        }

        // NOTE(review): same check as in prepareForBootstrap, including its "Bootstrapping"
        // wording. When it sets an error message it replaces the one set above. Confirm intent.
        EmsConnection emsConnection = getEmsConnection();
        EmsBean storageService = emsConnection.getBean("org.apache.cassandra.db:type=StorageService");
        EmsAttribute attribute = storageService.getAttribute("OperationMode");
        String operationMode = (String) attribute.refresh();

        if (!operationMode.equals("NORMAL")) {
            result.setErrorMessage("Bootstrapping " + getHost() + " failed. The StorageService is reporting " +
                operationMode + " for its operation mode but it should be reporting NORMAL. The StorageService " +
                "operation mode is not to be confused with the Storage Node operation mode.");
        }
        return result;
    } catch (IOException e) {
        log.error("Moving datafiles failed", e);
        purgeDirectories(createdDataDirectories);
        configEditor.restore();
        result.setErrorMessage("Failed to move all the files to new destinations, " + e.getLocalizedMessage() + ". StorageService was left offline" +
            ", investigate before restarting the node");
        // The node is deliberately left offline: the IOException may mean the disk is full.
        return result;
    }
}
/**
 * Purges every directory named in the given list via {@link #purgeDir(File)}.
 *
 * @param directories paths of the directories to purge
 */
private void purgeDirectories(List<String> directories) {
    for (String path : directories) {
        purgeDir(new File(path));
    }
}
/**
 * Purges the commit log directory, every data file directory and the saved caches directory
 * named by the given (already loaded) configuration editor.
 *
 * @param configEditor source of the directory locations
 */
private void purgeDataDirs(ConfigEditor configEditor) {
    purgeDir(new File(configEditor.getCommitLogDirectory()));
    for (String path : configEditor.getDataFileDirectories()) {
        purgeDir(new File(path));
    }
    purgeDir(new File(configEditor.getSavedCachesDirectory()));
}
/**
 * Purges the given directory, including the directory itself. A relative path is resolved
 * against the bin directory before purging.
 *
 * @param dir the directory to purge
 */
private void purgeDir(File dir) {
    File target = dir.isAbsolute() ? dir : new File(getBinDir(), dir.getPath());
    log.info("Purging " + target);
    FileUtil.purge(target, true);
}
/**
 * Reads the authorized node addresses from the internode authentication file, one address per
 * line. Surrounding whitespace is stripped and blank lines are skipped, so an empty file
 * yields an empty set and files with CRLF line endings are read correctly.
 *
 * @return a sorted, modifiable set of the addresses found in the file
 * @throws InternodeAuthConfUpdateException if the file cannot be opened
 */
private Set<String> getAuthAddresses() throws InternodeAuthConfUpdateException {
    File authFile = getInternodeAuthConfFile();
    try {
        String contents = StreamUtil.slurp(new FileReader(authFile));
        Set<String> addresses = new TreeSet<String>();
        for (String line : contents.split("\\r?\\n")) {
            String address = line.trim();
            if (address.length() > 0) {
                addresses.add(address);
            }
        }
        return addresses;
    } catch (FileNotFoundException e) {
        throw new InternodeAuthConfUpdateException("Could not load internode authentication file " + authFile, e);
    }
}
/**
 * Overwrites the internode authentication file with the given addresses, one per line.
 *
 * @param addresses the complete set of authorized node addresses
 * @throws InternodeAuthConfUpdateException if anything goes wrong while locating or writing
 *                                          the file
 */
private void setAuthAddresses(Set<String> addresses) throws InternodeAuthConfUpdateException {
    File authFile = null;
    try {
        // Resolve the file before logging so the log names the file instead of "null".
        authFile = getInternodeAuthConfFile();
        log.info("Updating " + authFile);
        if (log.isDebugEnabled()) {
            log.debug("Updating authorized storage node addresses to " + addresses);
        }
        StreamUtil.copy(new StringReader(StringUtil.collectionToString(addresses, "\n")),
            new FileWriter(authFile), true);
    } catch (Exception e) {
        throw new InternodeAuthConfUpdateException("An error occurred while trying to update " + authFile, e);
    }
}
/**
 * Overwrites the internode authentication file with the given addresses, one per line, and
 * logs any failure before rethrowing it.
 *
 * @param ipAddresses the complete set of addresses to write
 * @throws InternodeAuthConfUpdateException if the file could not be written
 */
private void updateInternodeAuthConfFile(Set<String> ipAddresses) throws InternodeAuthConfUpdateException {
    File authFile = getInternodeAuthConfFile();
    log.info("Updating " + authFile);

    try {
        StringReader source = new StringReader(StringUtil.collectionToString(ipAddresses, "\n"));
        FileWriter sink = new FileWriter(authFile);
        StreamUtil.copy(source, sink, true);
    } catch (Exception e) {
        String failure = "An error occurred while trying to update " + authFile;
        log.error(failure, e);
        throw new InternodeAuthConfUpdateException(failure, e);
    }
}
/**
 * @return the freshly read <code>OperationMode</code> attribute of the StorageService MBean
 */
String getOperationMode() {
    EmsBean storageService = getEmsConnection().getBean("org.apache.cassandra.db:type=StorageService");
    Object mode = storageService.getAttribute("OperationMode").refresh();
    return (String) mode;
}
/**
 * Handler for the <code>addNodeMaintenance</code> operation; runs the shared topology change
 * maintenance.
 *
 * @param params the operation parameters
 * @return the maintenance result
 */
private OperationResult nodeAdded(Configuration params) {
    OperationResult maintenanceResult = performTopologyChangeMaintenance(params);
    return maintenanceResult;
}
/**
 * Handler for the <code>removeNodeMaintenance</code> operation; runs the shared topology change
 * maintenance.
 *
 * @param params the operation parameters
 * @return the maintenance result
 */
private OperationResult nodeRemoved(Configuration params) {
    OperationResult maintenanceResult = performTopologyChangeMaintenance(params);
    return maintenanceResult;
}
/**
 * Runs the maintenance tasks that follow a cluster topology change.
 * <p>
 * For the system_auth keyspace and then the rhq keyspace: an optional repair
 * (<code>runRepair</code>) followed by a cleanup. Afterwards the seeds list is optionally
 * updated (<code>updateSeedsList</code>, using the <code>seedsList</code> parameter). Every
 * task contributes one entry to the "results" list of the returned result, and a failure of
 * any task, including the seeds list update, sets the result's error message.
 *
 * @param params the operation parameters
 * @return the combined result of all tasks
 */
private OperationResult performTopologyChangeMaintenance(Configuration params) {
    boolean runRepair = params.getSimple("runRepair").getBooleanValue();
    boolean updateSeedsList = params.getSimple("updateSeedsList").getBooleanValue();

    KeyspaceService keyspaceService = new KeyspaceService(getEmsConnection());
    boolean hasErrors = false;
    OperationResult result = new OperationResult();
    Configuration resultConfig = result.getComplexResults();
    PropertyList resultsList = new PropertyList("results");

    String[] keyspaces = { SYSTEM_AUTH_KEYSPACE, RHQ_KEYSPACE };
    for (String keyspace : keyspaces) {
        if (runRepair) {
            OpResult repairResult = repairKeyspace(keyspaceService, keyspace);
            if (!repairResult.succeeded) {
                hasErrors = true;
            }
            resultsList.add(toPropertyMap(repairResult));
        }

        OpResult cleanupResult = cleanupKeyspace(keyspaceService, keyspace);
        if (!cleanupResult.succeeded) {
            hasErrors = true;
        }
        resultsList.add(toPropertyMap(cleanupResult));
    }

    if (updateSeedsList) {
        List<String> addresses = getAddresses(params.getList("seedsList"));
        OpResult seedsResult = new OpResult();
        seedsResult.operation = "Update seeds list";
        try {
            updateSeedsList(addresses);
            seedsResult.succeeded = true;
        } catch (Exception e) {
            log.error("An error occurred while updating the seeds lists for " + getResourceContext().getResourceKey(),
                e);
            seedsResult.succeeded = false;
            // A failed seeds update must fail the overall operation like any other task.
            hasErrors = true;
            Throwable rootCause = ThrowableUtil.getRootCause(e);
            seedsResult.details = "An error occurred while updating the seeds list: " +
                ThrowableUtil.getStackAsString(rootCause);
        }
        resultsList.add(toPropertyMap(seedsResult));
    }

    resultConfig.put(resultsList);

    if (hasErrors) {
        result.setErrorMessage("One or more tasks failed to complete successfully.");
    }
    return result;
}
/**
 * Repairs the RHQ keyspace and then the system auth keyspace, collecting one result entry per
 * keyspace.
 *
 * @return an operation result whose complex results hold a {@code results} list with one map per
 *         repaired keyspace
 */
private OperationResult repair() {
    KeyspaceService service = new KeyspaceService(getEmsConnection());
    OperationResult operationResult = new OperationResult();
    Configuration complexResults = operationResult.getComplexResults();
    PropertyList taskResults = new PropertyList("results");

    // The array order is the execution order.
    for (String keyspace : new String[] { RHQ_KEYSPACE, SYSTEM_AUTH_KEYSPACE }) {
        taskResults.add(toPropertyMap(repairKeyspace(service, keyspace)));
    }

    complexResults.put(taskResults);
    return operationResult;
}
/**
 * Runs a primary range repair on the given keyspace and reports the outcome. An exception raised
 * by the repair is logged and converted into a failed result instead of being propagated.
 *
 * @param keyspaceService service used to run the repair
 * @param keyspace name of the keyspace to repair
 * @return the task outcome; {@code details} holds the elapsed time on success or the stack trace
 *         of the root cause on failure
 */
private OpResult repairKeyspace(KeyspaceService keyspaceService, String keyspace) {
    OpResult result = new OpResult();
    result.operation = "repair " + keyspace + " keyspace";
    try {
        if (log.isDebugEnabled()) {
            log.debug("Running primary range repair on " + keyspace + " keyspace");
        }
        long start = System.currentTimeMillis();
        keyspaceService.repairPrimaryRange(keyspace);
        long elapsed = System.currentTimeMillis() - start;
        if (log.isDebugEnabled()) {
            log.debug("Finished primary range repair on " + keyspace + " keyspace in " + elapsed + " ms");
        }
        result.succeeded = true;
        result.details = "Completed repair operation in " + elapsed + " ms.";
    } catch (Exception e) {
        log.error("An error occurred while running repair on " + keyspace, e);
        Throwable rootCause = ThrowableUtil.getRootCause(e);
        result.succeeded = false;
        result.details = "An error occurred while running repair: " + ThrowableUtil.getStackAsString(rootCause);
    }
    return result;
}
/**
 * Runs a cleanup on the given keyspace and reports the outcome. An exception raised by the
 * cleanup is logged and converted into a failed result instead of being propagated.
 *
 * @param keyspaceService service used to run the cleanup
 * @param keyspace name of the keyspace to clean up
 * @return the task outcome; {@code details} holds the elapsed time on success or the stack trace
 *         of the root cause on failure
 */
private OpResult cleanupKeyspace(KeyspaceService keyspaceService, String keyspace) {
    OpResult result = new OpResult();
    result.operation = "cleanup " + keyspace + " keyspace";
    if (log.isDebugEnabled()) {
        log.debug("Running cleanup on " + keyspace + " keyspace");
    }
    long start = System.currentTimeMillis();
    try {
        keyspaceService.cleanup(keyspace);
        long elapsed = System.currentTimeMillis() - start;
        if (log.isDebugEnabled()) {
            log.debug("Finished cleanup on " + keyspace + " keyspace in " + elapsed + " ms");
        }
        result.succeeded = true;
        // Report the timing on success too, so a cleanup entry carries the same kind of
        // details as a repair entry produced by repairKeyspace.
        result.details = "Completed cleanup operation in " + elapsed + " ms.";
    } catch (Exception e) {
        log.error("An error occurred while running cleanup on " + keyspace + " keyspace", e);
        Throwable rootCause = ThrowableUtil.getRootCause(e);
        result.succeeded = false;
        result.details = "An error occurred while running cleanup: " + ThrowableUtil.getStackAsString(rootCause);
    }
    return result;
}
/**
 * Prepares the storage node for an upgrade by invoking, in order, the {@code stopNativeTransport},
 * {@code stopGossiping}, {@code takeSnapshot} and {@code drain} operations of the
 * {@code org.apache.cassandra.db:type=StorageService} MBean. Between the snapshot and the drain it
 * waits a bounded time for pending flush tasks.
 *
 * @param parameters operation parameters; the {@code snapshotName} simple value names the snapshot,
 *                   and the current time in milliseconds is used when it is missing or blank
 * @return a new, empty operation result
 * @throws Exception if any of the steps fails
 */
private OperationResult prepareForUpgrade(Configuration parameters) throws Exception {
    EmsBean storageServiceBean = getEmsConnection().getBean("org.apache.cassandra.db:type=StorageService");
    Class<?>[] noArgTypes = new Class<?>[0];

    if (log.isDebugEnabled()) {
        log.debug("Disabling native transport...");
    }
    EmsOperation stopNativeTransport = storageServiceBean.getOperation("stopNativeTransport", noArgTypes);
    stopNativeTransport.invoke((Object[]) noArgTypes);

    if (log.isDebugEnabled()) {
        log.debug("Disabling gossip...");
    }
    EmsOperation stopGossiping = storageServiceBean.getOperation("stopGossiping", noArgTypes);
    stopGossiping.invoke((Object[]) noArgTypes);

    if (log.isDebugEnabled()) {
        log.debug("Taking the snapshot...");
    }
    EmsOperation takeSnapshot = storageServiceBean.getOperation("takeSnapshot", String.class, String[].class);
    String requestedName = parameters.getSimpleValue("snapshotName");
    // Fall back to a timestamp when the caller did not supply a usable name.
    String snapshotName = (requestedName != null && !requestedName.trim().isEmpty())
        ? requestedName
        : String.valueOf(System.currentTimeMillis());
    takeSnapshot.invoke(snapshotName, new String[] {});

    // 500 ms up front plus at most 10 polls of 150 ms each: 2 seconds at most
    waitForTaskToComplete(500, 10, 150);

    if (log.isDebugEnabled()) {
        log.debug("Initiating drain...");
    }
    EmsOperation drain = storageServiceBean.getOperation("drain", noArgTypes);
    drain.invoke((Object[]) noArgTypes);

    return new OperationResult();
}
/**
 * Waits until the {@code PendingTasks} attribute of the
 * {@code org.apache.cassandra.internal:type=FlushWriter} MBean is no longer positive, giving up
 * after a bounded number of polls.
 *
 * @param initialWaiting time in milliseconds to sleep before the first check
 * @param maxTries maximum number of polls performed while tasks are still pending
 * @param sleepMillis time in milliseconds to sleep before each poll
 * @throws InterruptedException if the thread is interrupted while sleeping
 */
private void waitForTaskToComplete(int initialWaiting, int maxTries, int sleepMillis) throws InterruptedException {
    // initial waiting
    Thread.sleep(initialWaiting);

    EmsBean flushWriterBean = getEmsConnection().getBean("org.apache.cassandra.internal:type=FlushWriter");
    try {
        EmsAttribute attribute = flushWriterBean.getAttribute("PendingTasks");
        long pendingTasks = (Long) attribute.refresh();
        // wait until org.apache.cassandra.internal:type=FlushWriter / PendingTasks == 0
        while (pendingTasks > 0 && maxTries-- > 0) {
            Thread.sleep(sleepMillis);
            pendingTasks = (Long) attribute.refresh();
        }
    } finally {
        // Unload the bean even when the wait is interrupted or a refresh fails.
        flushWriterBean.unload();
    }
}
/**
 * Converts a task outcome into a {@code resultsMap} property map with the entries {@code task},
 * {@code succeeded} and {@code details}.
 *
 * @param opResult the task outcome to convert
 * @return a new property map describing the outcome
 */
private PropertyMap toPropertyMap(OpResult opResult) {
    PropertySimple task = new PropertySimple("task", opResult.operation);
    PropertySimple succeeded = new PropertySimple("succeeded", opResult.succeeded);
    PropertySimple details = new PropertySimple("details", opResult.details);

    PropertyMap resultsMap = new PropertyMap("resultsMap");
    resultsMap.put(task);
    resultsMap.put(succeeded);
    resultsMap.put(details);
    return resultsMap;
}
/**
 * Mutable holder for the outcome of a single maintenance task (for example a keyspace repair, a
 * keyspace cleanup or a seeds list update). Instances are converted to property maps by
 * toPropertyMap.
 */
private static class OpResult {
// Short description of the task, e.g. "repair <keyspace> keyspace" or "Update seeds list".
String operation;
// Whether the task completed without raising an exception.
boolean succeeded;
// Free-form details about the outcome, such as the elapsed time or the stack trace of the
// root cause of a failure; may be left unset.
String details;
}
/**
 * Describes this component as the simple class name followed by the resource key in brackets.
 *
 * @return a string of the form {@code StorageNodeComponent[resourceKey: <key>]}
 */
@Override
public String toString() {
    StringBuilder description = new StringBuilder(StorageNodeComponent.class.getSimpleName());
    description.append("[resourceKey: ");
    description.append(getResourceContext().getResourceKey());
    description.append("]");
    return description.toString();
}
/**
 * Creates a config editor for the file whose path is stored in the {@code yamlConfiguration}
 * plugin configuration property.
 *
 * @return a new config editor for that file
 */
private ConfigEditor getYamlConfigEditor() {
    String yamlPath = getResourceContext().getPluginConfiguration().getSimpleValue("yamlConfiguration");
    return new ConfigEditor(new File(yamlPath));
}
}