package com.sungardas.enhancedsnapshots.tasks.executors;
import com.amazonaws.services.ec2.model.*;
import com.sungardas.enhancedsnapshots.aws.dynamodb.model.BackupEntry;
import com.sungardas.enhancedsnapshots.aws.dynamodb.model.BackupState;
import com.sungardas.enhancedsnapshots.aws.dynamodb.model.TaskEntry;
import com.sungardas.enhancedsnapshots.aws.dynamodb.repository.BackupRepository;
import com.sungardas.enhancedsnapshots.aws.dynamodb.repository.TaskRepository;
import com.sungardas.enhancedsnapshots.components.ConfigurationMediator;
import com.sungardas.enhancedsnapshots.dto.CopyingTaskProgressDto;
import com.sungardas.enhancedsnapshots.dto.TaskProgressDto;
import com.sungardas.enhancedsnapshots.dto.converter.VolumeDtoConverter;
import com.sungardas.enhancedsnapshots.enumeration.TaskProgress;
import com.sungardas.enhancedsnapshots.exception.EnhancedSnapshotsException;
import com.sungardas.enhancedsnapshots.exception.EnhancedSnapshotsInterruptedException;
import com.sungardas.enhancedsnapshots.exception.EnhancedSnapshotsTaskInterruptedException;
import com.sungardas.enhancedsnapshots.service.*;
import com.sungardas.enhancedsnapshots.util.SystemUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.MessageFormat;
import java.util.concurrent.TimeUnit;
import static com.sungardas.enhancedsnapshots.aws.dynamodb.model.TaskEntry.TaskEntryStatus.*;
import static java.lang.String.format;
@Service("awsBackupVolumeTaskExecutor")
public class AWSBackupVolumeStrategyTaskExecutor extends AbstractAWSVolumeTaskExecutor {
private static final Logger LOG = LogManager.getLogger(AWSBackupVolumeStrategyTaskExecutor.class);
@Autowired
private TaskRepository taskRepository;
@Autowired
private StorageService storageService;
@Autowired
private BackupRepository backupRepository;
@Autowired
private SnapshotService snapshotService;
@Autowired
private AWSCommunicationService awsCommunication;
@Autowired
private ConfigurationMediator configurationMediator;
@Autowired
private RetentionService retentionService;
@Autowired
private NotificationService notificationService;
@Autowired
private TaskService taskService;
@Autowired
private VolumeService volumeService;
@Autowired
private MailService mailService;
private String instanceId = SystemUtils.getInstanceId();
@Override
public void execute(final TaskEntry taskEntry) {
Volume tempVolume = taskEntry.getTempVolumeId() != null ? awsCommunication.getVolume(taskEntry.getTempVolumeId()) : null;
try {
LOG.info("Starting backup process for volume {}", taskEntry.getVolume());
LOG.info("{} task state was changed to 'in progress'", taskEntry.getId());
// change task status
taskEntry.setStatus(RUNNING.getStatus());
taskRepository.save(taskEntry);
if (taskEntry.progress() != TaskProgress.NONE) {
switch (taskEntry.progress()) {
case CREATING_TEMP_VOLUME:
case WAITING_TEMP_VOLUME:
case ATTACHING_VOLUME:
case COPYING: {
try {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Detaching temp volume", 85);
detachingTempVolumeStep(taskEntry, tempVolume);
} catch (Exception e) {
//skip
}
try {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Deleting temp volume", 50);
deletingTempVolumeStep(taskEntry, tempVolume);
} catch (Exception e) {
//skip
}
try {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Deleting temp file", 10);
storageService.deleteFile(getBackupName(taskEntry));
} catch (Exception e) {
//skip
}
setProgress(taskEntry, TaskProgress.CREATING_TEMP_VOLUME);
break;
}
case CREATING_SNAPSHOT: {
if (taskEntry.getTempSnapshotId() != null) {
setProgress(taskEntry, TaskProgress.WAITING_SNAPSHOT);
}
break;
}
case DETACHING_TEMP_VOLUME: {
setProgress(taskEntry, TaskProgress.DELETING_TEMP_VOLUME);
}
}
}
switch (taskEntry.progress()) {
case INTERRUPTED_CLEANING: {
interruptedCleaningStep(taskEntry, tempVolume);
break;
}
case FAIL_CLEANING: {
failCleaningStep(taskEntry, tempVolume, null);
break;
}
case NONE: {
// check volume exists
if (!volumeService.volumeExists(taskEntry.getVolume())) {
LOG.info("Volume [{}] does not exist. Removing backup task [{}]");
taskRepository.delete(taskEntry);
return;
}
}
case STARTED: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Starting backup task", 0);
startedStep(taskEntry);
}
case CREATING_SNAPSHOT: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Creating snapshot", 5);
creatingSnapshotStep(taskEntry);
}
case WAITING_SNAPSHOT: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Waiting snapshot", 10);
waitingSnapshotStep(taskEntry);
}
case CREATING_TEMP_VOLUME: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Creating temp volume", 15);
tempVolume = creatingTempVolumeStep(taskEntry);
}
case WAITING_TEMP_VOLUME: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Waiting temp volume", 20);
waitingTempVolumeStep(taskEntry, tempVolume);
}
case ATTACHING_VOLUME: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Attaching volume", 25);
tempVolume = attachingVolumeStep(taskEntry, tempVolume);
}
case COPYING: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Coping ...", 30);
copyingStep(taskEntry, tempVolume);
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Backup complete", 80);
}
case DETACHING_TEMP_VOLUME: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Detaching temp volume", 85);
detachingTempVolumeStep(taskEntry, tempVolume);
}
case DELETING_TEMP_VOLUME: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Deleting temp volume", 90);
deletingTempVolumeStep(taskEntry, tempVolume);
}
case CLEANING_TEMP_RESOURCES: {
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Cleaning temp resources", 95);
cleaningTempResourcesStep(taskEntry);
}
}
notificationService.notifyAboutRunningTaskProgress(taskEntry.getId(), "Done", 100);
setProgress(taskEntry, TaskProgress.DONE);
} catch (EnhancedSnapshotsTaskInterruptedException e) {
interruptedCleaningStep(taskEntry, tempVolume);
} catch (EnhancedSnapshotsInterruptedException e) {
if (!configurationMediator.isClusterMode()) {
interruptedCleaningStep(taskEntry, tempVolume);
}
setProgress(taskEntry, TaskProgress.DONE);
} catch (Exception e) {
failCleaningStep(taskEntry, tempVolume, e);
setProgress(taskEntry, TaskProgress.DONE);
}
}
private void interruptedCleaningStep(TaskEntry taskEntry, Volume tempVolume) {
LOG.info("Backup process for volume {} canceled ", taskEntry.getVolume());
setProgress(taskEntry, TaskProgress.INTERRUPTED_CLEANING);
TaskProgressDto dto = new TaskProgressDto(taskEntry.getId(), "Kill initialization process", 20, CANCELED);
notificationService.notifyAboutTaskProgress(dto);
// kill initialization Process if it's alive, should be killed before volume detaching
try {
killInitializationVolumeProcessIfAlive(tempVolume);
} catch (Exception e) {
LOG.error("Killing initialization process failed", e);
}
dto = new TaskProgressDto(taskEntry.getId(), "Deleting temp resources", 50, CANCELED);
notificationService.notifyAboutTaskProgress(dto);
try {
deleteTempResources(tempVolume, getBackupName(taskEntry), dto);
} catch (Exception e) {
LOG.error("Deleting temp resources failed", e);
}
taskEntry.setProgress(TaskProgress.DONE.name());
taskRepository.delete(taskEntry);
dto.setMessage("Done");
dto.setProgress(100);
notificationService.notifyAboutTaskProgress(dto);
mailService.notifyAboutSystemStatus("Backup task for volume with id: " + taskEntry.getVolume() + " was canceled");
}
private void failCleaningStep(TaskEntry taskEntry, Volume tempVolume, Exception e) {
LOG.error("Backup process for volume {} failed ", taskEntry.getVolume(), e);
setProgress(taskEntry, TaskProgress.FAIL_CLEANING);
taskEntry.setStatus(ERROR.toString());
taskRepository.save(taskEntry);
TaskProgressDto dto = new TaskProgressDto(taskEntry.getId(), "Killing initialization process", 20, ERROR);
notificationService.notifyAboutTaskProgress(dto);
// kill initialization Process if it's alive, should be killed before volume detaching
try {
killInitializationVolumeProcessIfAlive(tempVolume);
} catch (Exception e1) {
LOG.error("Killing initialization process failed", e1);
}
try {
deleteTempResources(tempVolume, getBackupName(taskEntry), dto);
} catch (Exception e1) {
LOG.error("Delete temp resources failed", e1);
}
dto.setMessage("Done");
dto.setProgress(100);
notificationService.notifyAboutTaskProgress(dto);
mailService.notifyAboutError(taskEntry, e);
}
private void startedStep(TaskEntry taskEntry) {
setProgress(taskEntry, TaskProgress.STARTED);
checkThreadInterruption(taskEntry);
taskEntry.setStartTime(System.currentTimeMillis());
}
private void creatingSnapshotStep(TaskEntry taskEntry) {
setProgress(taskEntry, TaskProgress.CREATING_SNAPSHOT);
Volume volumeSrc = awsCommunication.getVolume(taskEntry.getVolume());
if (volumeSrc == null) {
LOG.error("Can't get access to {} volume", taskEntry.getVolume());
throw new AWSBackupVolumeException(MessageFormat.format("Can't get access to {} volume", taskEntry.getVolume()));
}
taskEntry.setTempSnapshotId(awsCommunication.createSnapshot(volumeSrc).getSnapshotId());
taskRepository.save(taskEntry);
}
private void waitingSnapshotStep(TaskEntry taskEntry) {
setProgress(taskEntry, TaskProgress.WAITING_SNAPSHOT);
Snapshot snapshot = awsCommunication.waitForCompleteState(awsCommunication.getSnapshot(taskEntry.getTempSnapshotId()));
LOG.info("SnapshotEntry created: {}", snapshot.toString());
}
private Volume creatingTempVolumeStep(TaskEntry taskEntry) {
// create volume
checkThreadInterruption(taskEntry);
setProgress(taskEntry, TaskProgress.CREATING_TEMP_VOLUME);
Instance instance = awsCommunication.getInstance(instanceId);
if (instance == null) {
LOG.error("Can't get access to {} instance" + instanceId);
throw new AWSBackupVolumeException(MessageFormat.format("Can't get access to {} instance", instanceId));
}
String instanceAvailabilityZone = instance.getPlacement().getAvailabilityZone();
Volume tempVolume = awsCommunication.createVolumeFromSnapshot(taskEntry.getTempSnapshotId(), instanceAvailabilityZone,
VolumeType.fromValue(taskEntry.getTempVolumeType()), taskEntry.getTempVolumeIopsPerGb());
taskEntry.setTempVolumeId(tempVolume.getVolumeId());
taskRepository.save(taskEntry);
// create temporary tag
awsCommunication.createTemporaryTag(taskEntry.getTempVolumeId(), taskEntry.getVolume());
return tempVolume;
}
private void waitingTempVolumeStep(TaskEntry taskEntry, Volume tempVolume) {
checkThreadInterruption(taskEntry);
setProgress(taskEntry, TaskProgress.WAITING_TEMP_VOLUME);
Volume volumeDest = awsCommunication.waitForVolumeState(tempVolume, VolumeState.Available);
LOG.info("Volume created: {}", volumeDest.toString());
}
private Volume attachingVolumeStep(TaskEntry taskEntry, Volume tempVolume) {
checkThreadInterruption(taskEntry);
setProgress(taskEntry, TaskProgress.ATTACHING_VOLUME);
// mount volume
awsCommunication.attachVolume(awsCommunication.getInstance(instanceId), tempVolume);
tempVolume = awsCommunication.syncVolume(tempVolume);
try {
TimeUnit.MINUTES.sleep(1);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
// storage blocks on volumes that were restored from snapshots must be initialized (pulled down from Amazon S3 and written to the volume) before they can be accesed.
// This preliminary action takes time and can cause a significant increase in the latency of an I/O operation the first time each block is accessed.
// To avoid this performance hit in a production environment all blocks on volumes can be read before volume usage; this process is called initialization (formerly known as pre-warming).
initializeVolume(tempVolume);
checkThreadInterruption(taskEntry);
return tempVolume;
}
private void copyingStep(TaskEntry taskEntry, Volume tempVolume) throws IOException, InterruptedException {
setProgress(taskEntry, TaskProgress.COPYING);
checkThreadInterruption(taskEntry);
Volume volumeToBackup = awsCommunication.getVolume(taskEntry.getVolume());
String volumeType = volumeToBackup.getVolumeType();
String iops = (volumeToBackup.getIops() != null) ? volumeToBackup
.getIops().toString() : "";
String sizeGib = tempVolume.getSize().toString();
if (volumeType.equals("")) {
volumeType = "gp2";
}
if (volumeType.equals("standard")) {
volumeType = "gp2";
}
String backupFileName = getBackupName(taskEntry);
BackupEntry backup = new BackupEntry(taskEntry.getVolume(),
VolumeDtoConverter.convert(awsCommunication.getVolume(taskEntry.getVolume())).getVolumeName(),
backupFileName, taskEntry.getStartTime() + "", "", BackupState.INPROGRESS, taskEntry.getTempSnapshotId(), volumeType, iops, sizeGib);
checkThreadInterruption(taskEntry);
String source = storageService.detectFsDevName(tempVolume);
LOG.info("Starting copying: " + source + " to:" + backupFileName);
CopyingTaskProgressDto dto = new CopyingTaskProgressDto(taskEntry.getId(), 30, 80, Long.parseLong(backup.getSizeGiB()));
storageService.copyData(source, configurationMediator.getSdfsMountPoint() + backupFileName, dto, taskEntry.getId());
checkThreadInterruption(taskEntry);
long backupSize = storageService.getSize(configurationMediator.getSdfsMountPoint() + backupFileName);
long backupCreationtime = storageService.getBackupCreationTime(configurationMediator.getSdfsMountPoint() + backupFileName);
LOG.info("Backup creation time: {}", backupCreationtime);
LOG.info("Backup size: {}", backupSize);
checkThreadInterruption(taskEntry);
LOG.info("Put backup entry to the Backup List: {}", backup.getFileName());
backup.setState(BackupState.COMPLETED.getState());
backup.setSize(String.valueOf(backupSize));
backupRepository.save(backup);
LOG.info(format("Backup process for volume %s finished successfully ", taskEntry.getVolume()));
LOG.info("Task " + taskEntry.getId() + ": Delete completed task:" + taskEntry.getId());
LOG.info("Cleaning up previously created snapshots");
LOG.info("Storing snapshot data: [{},{},{}]", taskEntry.getVolume(), taskEntry.getTempSnapshotId(), instanceId);
}
private void detachingTempVolumeStep(TaskEntry taskEntry, Volume tempVolume) {
setProgress(taskEntry, TaskProgress.DETACHING_TEMP_VOLUME);
checkThreadInterruption(taskEntry);
// kill initialization Process if it's alive, should be killed before volume detaching
killInitializationVolumeProcessIfAlive(tempVolume);
LOG.info("Detaching volume: {}", tempVolume.getVolumeId());
awsCommunication.detachVolume(tempVolume);
}
private void deletingTempVolumeStep(TaskEntry taskEntry, Volume tempVolume) {
setProgress(taskEntry, TaskProgress.DELETING_TEMP_VOLUME);
LOG.info("Deleting temporary volume: {}", tempVolume.getVolumeId());
awsCommunication.deleteVolume(tempVolume);
}
private void cleaningTempResourcesStep(TaskEntry taskEntry) {
setProgress(taskEntry, TaskProgress.CLEANING_TEMP_RESOURCES);
String previousSnapshot = snapshotService.getSnapshotIdByVolumeId(taskEntry.getVolume());
if (previousSnapshot != null) {
checkThreadInterruption(taskEntry);
LOG.info("Deleting previous snapshot {}", previousSnapshot);
snapshotService.deleteSnapshot(previousSnapshot);
}
if (configurationMediator.isStoreSnapshot()) {
snapshotService.saveSnapshot(taskEntry.getVolume(), taskEntry.getTempSnapshotId());
} else {
awsCommunication.deleteSnapshot(taskEntry.getTempSnapshotId());
}
taskService.complete(taskEntry);
LOG.info("Task completed.");
checkThreadInterruption(taskEntry);
retentionService.apply();
mailService.notifyAboutSuccess(taskEntry);
}
public class AWSBackupVolumeException extends EnhancedSnapshotsException {
public AWSBackupVolumeException(String message) {
super(message);
}
}
/**
* Clean up resources if exception appeared
*
* @param tempVolume
* @param backupFileName
* @param dto notification transfer object
*/
private void deleteTempResources(Volume tempVolume, String backupFileName, TaskProgressDto dto) {
deleteTempVolume(tempVolume, dto);
try {
dto.setMessage("Delete temp file");
dto.addProgress(10);
notificationService.notifyAboutTaskProgress(dto);
storageService.deleteFile(backupFileName);
} catch (Exception ex) {
//do nothing if file not found
}
}
private void killInitializationVolumeProcessIfAlive(Volume volume) {
String fioProcNamePrefix = "fio --filename=" + storageService.detectFsDevName(volume);
try {
// check fio processes are alive
Process checkFioProcessAlive = new ProcessBuilder("pgrep", "-f", fioProcNamePrefix).start();
checkFioProcessAlive.waitFor();
switch (checkFioProcessAlive.exitValue()) {
case 0:
// fio processes are alive
LOG.info("Fio processes for volume {} are alive", volume.getVolumeId());
Process process = new ProcessBuilder("pkill", "-f", fioProcNamePrefix).start();
process.waitFor(3, TimeUnit.MINUTES);
switch (process.exitValue()) {
case 0:
LOG.info("Fio processes for volume {} terminated", volume.getVolumeId());
break;
default: {
LOG.warn("Failed to terminate fio processes for volume {}", volume.getVolumeId());
}
}
break;
default: {
// no need to terminate, fio processes are already terminated
LOG.info("Fio processes for volume {} already terminated, no need to stop forcibly", volume.getVolumeId());
}
}
} catch (IOException | InterruptedException e) {
LOG.warn("Exception while termination of fio processes", e);
}
}
private String getBackupName(TaskEntry taskEntry) {
Volume volumeToBackup = awsCommunication.getVolume(taskEntry.getVolume());
String volumeType = volumeToBackup.getVolumeType();
String iops = (volumeToBackup.getIops() != null) ? volumeToBackup
.getIops().toString() : "";
if (volumeType.equals("")) {
volumeType = "gp2";
}
if (volumeType.equals("standard")) {
volumeType = "gp2";
}
return taskEntry.getVolume() + "." + taskEntry.getStartTime() + "." + volumeType + "." + iops + ".backup";
}
private void initializeVolume(Volume tempVolume) {
String fileNameParam = "--filename=" + storageService.detectFsDevName(tempVolume);
ProcessBuilder builder = new ProcessBuilder("fio", fileNameParam, "--rw=randread", "--bs=128k", "--iodepth=32", "--ioengine=libaio", "--direct=1", "--name=volume-initialize");
try {
LOG.info("Starting volume {} initialization", tempVolume.getVolumeId());
Process process = builder.start();
if (process.isAlive()) {
LOG.debug("Volume {} initialization started successfully", tempVolume.getVolumeId());
} else {
LOG.info("Volume {} initialization failed to start", tempVolume.getVolumeId());
try (BufferedReader input = new BufferedReader(new InputStreamReader(process.getErrorStream()))) {
StringBuilder errorMessage = new StringBuilder();
for (String line; (line = input.readLine()) != null; ) {
errorMessage.append(line);
}
LOG.warn(errorMessage);
}
}
} catch (Exception e) {
LOG.warn("Failed to initialize volume {}", tempVolume.getVolumeId());
LOG.warn(e);
}
}
}