/*
 * RHQ Management Platform
 * Copyright (C) 2005-2015 Red Hat, Inc.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */
package org.rhq.enterprise.server.scheduler.jobs;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.ProtocolOptions;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.exceptions.NoHostAvailableException;
import com.datastax.driver.core.policies.DefaultRetryPolicy;
import com.datastax.driver.core.policies.LoggingRetryPolicy;
import com.datastax.driver.core.policies.RoundRobinPolicy;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.codehaus.jackson.map.ObjectMapper;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;

import org.rhq.cassandra.util.ClusterBuilder;
import org.rhq.core.domain.cloud.StorageNode;
import org.rhq.core.domain.common.composite.SystemSetting;
import org.rhq.core.domain.common.composite.SystemSettings;
import org.rhq.enterprise.server.cloud.StorageNodeManagerLocal;
import org.rhq.enterprise.server.util.LookupUtil;
import org.rhq.server.metrics.StorageSession;

/**
 * This job checks the replication factor of the storage cluster keyspaces listed in {@link #KEYSPACES}.
 * When a keyspace's factor does not match the value expected for the number of storage nodes
 * (see {@link #getHealthyReplicationFactor(String, int)}), a WARN message is logged and the keyspace
 * is altered to use the expected factor. Failures to connect to or query the cluster are logged
 * at ERROR level and are not propagated.
 *
 * @author lzoubek@redhat.com
 */
public class ReplicationFactorCheckJob extends AbstractStatefulJob {

    private static final Log log = LogFactory.getLog(ReplicationFactorCheckJob.class);

    /** Keyspaces whose replication factor is verified by this job. */
    private static final List<String> KEYSPACES = Arrays.asList("rhq", "system_auth");

    /**
     * Shared JSON mapper used to parse the strategy_options column. It is never reconfigured after
     * construction, so a single instance is reused instead of creating one per row.
     */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * Connects to the storage cluster, compares the replication factor of each keyspace in
     * {@link #KEYSPACES} with the expected one and updates the keyspaces that differ.
     *
     * @param context the Quartz execution context (not used by this job)
     * @throws JobExecutionException declared by the job contract; this implementation logs
     *         connection/query failures instead of throwing them
     */
    @Override
    public void executeJobCode(JobExecutionContext context) throws JobExecutionException {
        debug(getClass().getName() + " job starting");
        StorageNodeManagerLocal storageNodeManager = LookupUtil.getStorageNodeManager();
        SystemSettings settings = LookupUtil.getSystemManager().getObfuscatedSystemSettings(true);
        String username = settings.get(SystemSetting.STORAGE_USERNAME);
        String password = settings.get(SystemSetting.STORAGE_PASSWORD);

        List<StorageNode> storageNodes = storageNodeManager.getStorageNodes();
        List<String> hostNames = getClusteredHostNames(storageNodes);
        if (hostNames.isEmpty()) {
            log.error("There is not storage node in relational database to connect! Please re-install at least 1 storage node");
            return;
        }

        debug("Initiating connection to cluster hosts=" + Arrays.toString(hostNames.toArray()));
        // hostNames is non-empty here, hence storageNodes has at least one element
        int port = storageNodes.get(0).getCqlPort();
        Cluster cluster = new ClusterBuilder().addContactPoints(hostNames.toArray(new String[hostNames.size()]))
            .withCredentialsObfuscated(username, password).withPort(port)
            .withLoadBalancingPolicy(new RoundRobinPolicy())
            .withRetryPolicy(new LoggingRetryPolicy(DefaultRetryPolicy.INSTANCE))
            .withCompression(ProtocolOptions.Compression.NONE).build();

        // need to connect to system keyspace as it contains data about other keyspaces (and replication factor)
        Session session = null;
        try {
            session = cluster.connect("system");
            checkAndFixReplicationFactors(session, storageNodes.size());
        } catch (NoHostAvailableException ex) {
            log.error("Failed to connect to storage cluster", ex);
        } catch (Exception ex) {
            // anything else comes from querying/updating the keyspaces, not from connecting
            log.error("Failed to check replication factor of storage cluster", ex);
        } finally {
            try {
                if (session != null) {
                    session.shutdown();
                }
            } finally {
                // always release the cluster, even when shutting down the session fails
                cluster.shutdown();
            }
        }
    }

    /**
     * Collects addresses of the storage nodes this job tries to connect to.
     *
     * @param storageNodes all storage nodes known to the server
     * @return addresses of the nodes considered to be part of the cluster; empty when there are none
     */
    private static List<String> getClusteredHostNames(List<StorageNode> storageNodes) {
        List<String> hostNames = new ArrayList<String>();
        for (StorageNode storageNode : storageNodes) {
            // We only want clustered nodes here because we won't be able to connect to a
            // node that is not part of the cluster. The filtering here on the operation
            // mode is somewhat conservative because we could also include ADD_MAINTENANCE
            // and REMOVE_MAINTENANCE, but this errs on the side of being safe. Lastly,
            // if a storage node does not have a resource, then that means it was
            // deployed prior to installing the server.
            StorageNode.OperationMode mode = storageNode.getOperationMode();
            if (mode == StorageNode.OperationMode.NORMAL || mode == StorageNode.OperationMode.MAINTENANCE
                || storageNode.getResource() == null) {
                hostNames.add(storageNode.getAddress());
            }
        }
        return hostNames;
    }

    /**
     * Reads the current replication factors and alters every keyspace whose factor differs from the
     * expected one.
     *
     * @param session session connected to the "system" keyspace
     * @param storageNodeCount number of storage nodes known to the server; used to compute the expected factor
     */
    private void checkAndFixReplicationFactors(Session session, int storageNodeCount) {
        debug("Querying system keyspaces for strategy_options");
        Map<String, Integer> replicationFactors = getReplicationFactors(session);
        if (replicationFactors.size() != KEYSPACES.size()) {
            log.error("Failed to query storage cluster for keyspaces " + Arrays.toString(KEYSPACES.toArray())
                + " for replication_factor, expected to retrieve " + KEYSPACES.size() + " but got "
                + replicationFactors.size());
            return;
        }

        Map<String, Integer> factorsToSet = new HashMap<String, Integer>(replicationFactors.size());
        StringBuilder inconsistency = new StringBuilder();
        for (Entry<String, Integer> factorEntry : replicationFactors.entrySet()) {
            String keyspace = factorEntry.getKey();
            int healthy = getHealthyReplicationFactor(keyspace, storageNodeCount);
            int current = factorEntry.getValue().intValue();
            if (current != healthy) {
                if (inconsistency.length() > 0) {
                    inconsistency.append(", ");
                }
                inconsistency.append("keyspace [").append(keyspace).append("] has replication_factor=")
                    .append(current).append(" but it should be ").append(healthy);
                factorsToSet.put(keyspace, healthy);
            }
        }

        if (factorsToSet.isEmpty()) {
            debug("Storage Cluster replication_factor check finished, replication_factor is correct");
            return;
        }

        log.warn("Storage Cluster is not consistent! There are " + storageNodeCount + " StorageNodes in RDBMS and "
            + inconsistency + ". This can happen in case StorageNode deployment/undeployment fails. ");
        log.info("Updating replication_factor for keyspaces " + Arrays.toString(factorsToSet.keySet().toArray()));
        for (Entry<String, Integer> factor : factorsToSet.entrySet()) {
            updateReplicationFactor(session, factor.getKey(), factor.getValue().intValue());
        }
        log.info("Replication factor(s) have been fixed, data in cluster wil be made consistent the next time storage maintenance job"
            + " finishes or can be started manually via CLI using StorageNodeManager.runClusterMaintenance()");
    }

    /**
     * Queries schema_keyspaces and returns the replication factor of each keyspace from {@link #KEYSPACES}.
     * Keyspaces whose strategy_options cannot be parsed are left out of the result.
     *
     * @param session session connected to the "system" keyspace
     * @return map of keyspace name to its current replication factor
     */
    private Map<String, Integer> getReplicationFactors(Session session) {
        Map<String, Integer> factors = new HashMap<String, Integer>();
        List<Row> result = session.execute("select keyspace_name, strategy_options from schema_keyspaces").all();
        for (Row row : result) {
            String keyspace = row.getString("keyspace_name");
            // we're only interested in the keyspaces listed in KEYSPACES
            if (KEYSPACES.contains(keyspace)) {
                Integer replicationFactor = readReplicationFactor(row.getString("strategy_options"));
                if (replicationFactor == null) {
                    // the failure has been logged in readReplicationFactor
                    continue;
                }
                factors.put(keyspace, replicationFactor);
            }
        }
        return factors;
    }

    /**
     * Logs the message at DEBUG level when that level is enabled.
     *
     * @param message text to log
     */
    private static void debug(String message) {
        if (log.isDebugEnabled()) {
            log.debug(message);
        }
    }

    /**
     * Extracts replication_factor from the JSON text of a strategy_options column.
     *
     * @param text JSON object expected to contain a "replication_factor" entry
     * @return the parsed factor, or <code>null</code> when the text cannot be parsed (the failure is logged)
     */
    private Integer readReplicationFactor(String text) {
        try {
            @SuppressWarnings("unchecked")
            Map<String, Object> options = OBJECT_MAPPER.readValue(text, Map.class);
            // The value may be a JSON string ("2") or a JSON number (2); going through its string
            // form handles both. A missing entry yields "null", which fails to parse and is logged below.
            return Integer.valueOf(Integer.parseInt(String.valueOf(options.get("replication_factor"))));
        } catch (Exception e) {
            log.error("Unable to parse strategy_options column from " + text, e);
        }
        return null;
    }

    /**
     * Alters the keyspace to use SimpleStrategy with the given replication factor.
     *
     * @param session driver session used to execute the statement
     * @param keyspace name of the keyspace to alter
     * @param replicationFactor new replication factor
     * @return result of the executed statement
     */
    public static ResultSet updateReplicationFactor(Session session, String keyspace, int replicationFactor) {
        debug("Updating replication_factor=" + replicationFactor + " for keyspace " + keyspace);
        return session.execute(createUpdateReplicationFactorStatement(keyspace, replicationFactor));
    }

    /**
     * Alters the keyspace to use SimpleStrategy with the given replication factor.
     *
     * @param session storage session used to execute the statement
     * @param keyspace name of the keyspace to alter
     * @param replicationFactor new replication factor
     * @return result of the executed statement
     */
    public static ResultSet updateReplicationFactor(StorageSession session, String keyspace, int replicationFactor) {
        debug("Updating replication_factor=" + replicationFactor + " for keyspace " + keyspace);
        return session.execute(createUpdateReplicationFactorStatement(keyspace, replicationFactor));
    }

    /**
     * Builds the ALTER KEYSPACE statement. The keyspace name is concatenated into the statement
     * as-is, so callers must pass trusted keyspace names only.
     *
     * @param keyspace name of the keyspace to alter
     * @param replicationFactor new replication factor
     * @return CQL statement text
     */
    private static String createUpdateReplicationFactorStatement(String keyspace, int replicationFactor) {
        return "ALTER KEYSPACE " + keyspace + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': "
            + replicationFactor + "}";
    }

    /**
     * Computes the replication factor a keyspace is expected to have for the given cluster size.
     * For "system_auth" it equals the cluster size. For any other keyspace it is 3 for more than
     * 3 nodes, 2 for 2 or 3 nodes and 1 otherwise.
     *
     * @param keyspace name of the keyspace
     * @param clusterSize number of storage nodes
     * @return expected replication factor
     */
    static int getHealthyReplicationFactor(String keyspace, int clusterSize) {
        if ("system_auth".equals(keyspace)) {
            return clusterSize;
        }
        if (clusterSize > 3) {
            return 3;
        }
        if (clusterSize >= 2) {
            return 2;
        }
        return 1;
    }
}