package com.sungardas.enhancedsnapshots.cluster; import com.amazonaws.services.autoscaling.AmazonAutoScaling; import com.amazonaws.services.autoscaling.model.AutoScalingGroup; import com.amazonaws.services.autoscaling.model.DeletePolicyRequest; import com.amazonaws.services.autoscaling.model.PutScalingPolicyRequest; import com.amazonaws.services.autoscaling.model.UpdateAutoScalingGroupRequest; import com.amazonaws.services.cloudformation.AmazonCloudFormation; import com.amazonaws.services.cloudformation.model.DeleteStackRequest; import com.amazonaws.services.cloudwatch.AmazonCloudWatch; import com.amazonaws.services.cloudwatch.model.*; import com.amazonaws.services.sns.AmazonSNS; import com.amazonaws.services.sns.model.Topic; import com.amazonaws.services.sns.util.Topics; import com.amazonaws.services.sqs.AmazonSQS; import com.amazonaws.services.sqs.model.CreateQueueRequest; import com.amazonaws.services.sqs.model.DeleteQueueRequest; import com.sungardas.enhancedsnapshots.aws.dynamodb.model.NodeEntry; import com.sungardas.enhancedsnapshots.aws.dynamodb.repository.NodeRepository; import com.sungardas.enhancedsnapshots.components.ConfigurationMediator; import com.sungardas.enhancedsnapshots.exception.ConfigurationException; import com.sungardas.enhancedsnapshots.service.SDFSStateService; import com.sungardas.enhancedsnapshots.util.SystemUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.DependsOn; import org.springframework.stereotype.Service; import javax.annotation.PostConstruct; import java.util.Date; import java.util.List; import java.util.Optional; @Service("ClusterConfigurationService") @DependsOn({"ConfigurationMediator", "CreateAppConfiguration"}) public class ClusterConfigurationServiceImpl implements ClusterConfigurationService { private static final Logger LOG = LogManager.getLogger(ClusterConfigurationServiceImpl.class); private static final String SCALE_UP_POLICY = "ESS-ScaleUpPolicy-" + SystemUtils.getSystemId(); private static final String SCALE_DOWN_POLICY = "ESS-ScaleDownPolicy-" + SystemUtils.getSystemId(); private static final String METRIC_DATA_NAME = "ESS-Load-Metric-" + SystemUtils.getSystemId(); private static final String ESS_OVERLOAD_ALARM = "ESS-Overload-Alarm-" + SystemUtils.getSystemId(); private static final String ESS_IDLE_ALARM = "ESS-Idle-Alarm-" + SystemUtils.getSystemId(); private static final String ESS_TOPIC_NAME = "ESS-" + SystemUtils.getSystemId() + "-topic"; private static final String ESS_QUEUE_NAME = "ESS-" + SystemUtils.getSystemId() + "-queue"; private static final String ESS_LB_NAME = SystemUtils.getSystemId() + "-lb"; @Autowired private AmazonSNS amazonSNS; @Autowired private AmazonSQS amazonSQS; @Autowired private AmazonCloudWatch cloudWatch; @Autowired private AmazonAutoScaling autoScaling; @Autowired private NodeRepository nodeRepository; @Autowired private ConfigurationMediator configurationMediator; @Autowired(required = false) private ClusterEventPublisher clusterEventPublisher; @Autowired(required = false) private SDFSStateService sdfsStateService; @Autowired private AmazonCloudFormation cloudFormation; @Value("${enhancedsnapshots.default.backup.threadPool.size}") private int backupThreadPoolSize; @Value("${enhancedsnapshots.default.restore.threadPool.size}") private int restoreThreadPoolSize; private AutoScalingGroup autoScalingGroup; @PostConstruct private void init() { if (configurationMediator.isClusterMode()) { if (!clusterIsConfigured()) { configureClusterInfrastructure(); nodeRepository.save(getMasterNodeInfo()); } else if (nodeRepository.findOne(SystemUtils.getInstanceId()) == null) { joinCluster(); } else if (nodeRepository.findByMaster(true).isEmpty()) { NodeEntry node = nodeRepository.findOne(SystemUtils.getInstanceId()); if (node == null) { node = getMasterNodeInfo(); } nodeRepository.save(node); } } } private void joinCluster() { String instanceId = SystemUtils.getInstanceId(); if (nodeRepository.exists(instanceId)) { LOG.warn("Instance {} already present in cluster", instanceId); } else { LOG.info("Joining cluster {}", instanceId); NodeEntry newNode = new NodeEntry(instanceId, false, restoreThreadPoolSize, backupThreadPoolSize, sdfsStateService.getSDFSVolumeId()); nodeRepository.save(newNode); clusterEventPublisher.nodeLaunched(newNode.getNodeId(), sdfsStateService.getSDFSVolumeId(), null); } } protected NodeEntry getMasterNodeInfo() { return new NodeEntry(SystemUtils.getInstanceId(), true, restoreThreadPoolSize, backupThreadPoolSize, sdfsStateService.getSDFSVolumeId()); } public void configureClusterInfrastructure() { LOG.info("Configuration of cluster infrastructure started"); // update AutoScalingGroup with min and max node number autoScaling.updateAutoScalingGroup(new UpdateAutoScalingGroupRequest() .withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()) .withMaxSize(configurationMediator.getMaxNodeNumberInCluster()) .withMinSize(configurationMediator.getMinNodeNumberInCluster()) .withDesiredCapacity(configurationMediator.getMinNodeNumberInCluster())); LOG.info("AutoScalingGroup {} updated: {}", autoScalingGroup.getAutoScalingGroupName(), autoScalingGroup.toString()); // we create this infrustructure from JAVA since currently we can not get arn when we create policy from CFT //create AutoScaling Policies String scaleUpPolicyARN = autoScaling.putScalingPolicy(new PutScalingPolicyRequest().withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()) .withPolicyName(SCALE_UP_POLICY) .withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()) //Increase or decrease the current capacity of the group by the specified number of instances. .withAdjustmentType("ChangeInCapacity") .withPolicyType("SimpleScaling") .withScalingAdjustment(1)).getPolicyARN(); LOG.info("Scale up policy created: {}", SCALE_UP_POLICY); String scaleDownPolicyARN = autoScaling.putScalingPolicy(new PutScalingPolicyRequest().withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()) .withPolicyName(SCALE_DOWN_POLICY) .withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()) //Increase or decrease the current capacity of the group by the specified number of instances. .withAdjustmentType("ChangeInCapacity") .withPolicyType("SimpleScaling") .withScalingAdjustment(-1)).getPolicyARN(); LOG.info("Scale down policy created: {}", SCALE_DOWN_POLICY); // create custom metric MetricDatum metricDatum = new MetricDatum(); metricDatum.setValue(0.0); metricDatum.setUnit(StandardUnit.Count); metricDatum.setTimestamp(new Date()); metricDatum.setMetricName(METRIC_DATA_NAME); cloudWatch.putMetricData(new PutMetricDataRequest() .withNamespace("ESS/Tasks").withMetricData(metricDatum)); LOG.info("Custom metric added: {}", metricDatum.toString()); // create custom alarm cloudWatch.putMetricAlarm(new PutMetricAlarmRequest() .withAlarmName(ESS_OVERLOAD_ALARM) .withMetricName(METRIC_DATA_NAME) .withComparisonOperator(ComparisonOperator.GreaterThanOrEqualToThreshold) .withThreshold(80.0) .withPeriod(300) .withEvaluationPeriods(2) .withStatistic(Statistic.Average) .withNamespace("ESS/Tasks") .withAlarmActions(scaleUpPolicyARN)); LOG.info("Load alarm added: ", cloudWatch.describeAlarms().getMetricAlarms() .stream().filter(alarm -> alarm.getAlarmName().equals(ESS_OVERLOAD_ALARM)).findFirst().get().toString()); // create custom alarm cloudWatch.putMetricAlarm(new PutMetricAlarmRequest() .withAlarmName(ESS_IDLE_ALARM) .withMetricName(METRIC_DATA_NAME) .withComparisonOperator(ComparisonOperator.LessThanThreshold) .withThreshold(40.0) .withPeriod(300) .withEvaluationPeriods(2) .withStatistic(Statistic.Average) .withNamespace("ESS/Tasks") .withAlarmActions(scaleDownPolicyARN)); LOG.info("Alarm for idle resources added: ", cloudWatch.describeAlarms().getMetricAlarms() .stream().filter(alarm -> alarm.getAlarmName().equals(ESS_IDLE_ALARM)).findFirst().get().toString()); // subscribe to topic amazonSQS.createQueue(new CreateQueueRequest().withQueueName(ESS_QUEUE_NAME)); Topics.subscribeQueue(amazonSNS, amazonSQS, getEssTopicArn(), amazonSQS.getQueueUrl(ESS_QUEUE_NAME).getQueueUrl()); LOG.info("Cluster infrastructure successfully configured."); } @Override public void updateCloudWatchMetric() { MetricDatum metricDatum = new MetricDatum(); metricDatum.setValue(getSystemLoadLevel() * 100); metricDatum.setUnit(StandardUnit.Count); metricDatum.setTimestamp(new Date()); metricDatum.setMetricName(METRIC_DATA_NAME); cloudWatch.putMetricData(new PutMetricDataRequest() .withNamespace("ESS/Tasks").withMetricData(metricDatum)); LOG.info("Custom metric added: {}", metricDatum.toString()); } @Override public void updateAutoScalingSettings(int minNodeNumber, int maxNodeNumber) { autoScaling.updateAutoScalingGroup(new UpdateAutoScalingGroupRequest() .withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()) .withMaxSize(maxNodeNumber) .withMinSize(minNodeNumber) .withDesiredCapacity(minNodeNumber)); } /** * Returns ARN of topic where AutoScaling publishes system events * * @return */ private String getEssTopicArn() { Topic ess_topic = amazonSNS.listTopics().getTopics() .stream().filter(topic -> topic.getTopicArn().endsWith(ESS_TOPIC_NAME)).findFirst() .orElseThrow(() -> new ConfigurationException("Topic " + ESS_TOPIC_NAME + " does not exist.")); return ess_topic.getTopicArn(); } /** * Returns % of system load level * * @return */ private double getSystemLoadLevel() { List<NodeEntry> nodes = nodeRepository.findAll(); int freeBackupWorkers = 0; int freeRestoreWorkers = 0; for (NodeEntry n : nodes) { freeBackupWorkers += n.getFreeBackupWorkers(); freeRestoreWorkers += n.getFreeRestoreWorkers(); } int backupWorkers = backupThreadPoolSize * nodes.size(); int restoreWorkers = restoreThreadPoolSize * nodes.size(); double backupLoadLevel = (double) (backupWorkers - freeBackupWorkers) / backupWorkers; double restoreLoadLevel = (double) (restoreWorkers - freeRestoreWorkers) / restoreWorkers; return Math.max(backupLoadLevel, restoreLoadLevel); } private AutoScalingGroup getAutoScalingGroup() { //there is no possibility to set custom name for AutoScalingGroup from CFT //that's why we have to determine created group name in code base on stack name //CloudFormation service uses next schema for AutoScalingGroup name //$CUSTOM_STACK_NAME-AutoScalingGroup-<some random string> Optional<AutoScalingGroup> asg = null; if (autoScalingGroup == null) { asg = autoScaling.describeAutoScalingGroups().getAutoScalingGroups().stream() .filter(autoScalingGroup -> autoScalingGroup.getAutoScalingGroupName() .startsWith(SystemUtils.getCloudFormationStackName() + "-AutoScalingGroup-")).findFirst(); autoScalingGroup = asg.orElseThrow(() -> new ConfigurationException("No appropriate AutoScalingGroup was found")); } return autoScalingGroup; } // we assume that system is configured if configuration exists private boolean clusterIsConfigured() { return (nodeRepository.count() != 0) ? true : false; } @Override public void removeClusterInfrastructure() { autoScaling.deletePolicy(new DeletePolicyRequest().withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()).withPolicyName(SCALE_UP_POLICY)); autoScaling.deletePolicy(new DeletePolicyRequest().withAutoScalingGroupName(getAutoScalingGroup().getAutoScalingGroupName()).withPolicyName(SCALE_DOWN_POLICY)); cloudWatch.deleteAlarms(new DeleteAlarmsRequest().withAlarmNames(ESS_OVERLOAD_ALARM, ESS_IDLE_ALARM)); // CloudWatch metrics are stored for two weeks. Old data will be removed automatically. amazonSQS.deleteQueue(new DeleteQueueRequest().withQueueUrl(ESS_QUEUE_NAME)); cloudFormation.deleteStack(new DeleteStackRequest().withStackName(SystemUtils.getCloudFormationStackName())); } }