/************************************************************************* * (c) Copyright 2017 Hewlett Packard Enterprise Development Company LP * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. ************************************************************************/ package com.eucalyptus.cluster; import java.net.URI; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReadWriteLock; import javax.annotation.Nonnull; import javax.annotation.Nullable; import org.apache.log4j.Logger; import com.eucalyptus.cluster.common.internal.Cluster; import com.eucalyptus.cluster.common.msgs.ClusterMigrateInstancesType; import com.eucalyptus.component.Partitions; import com.eucalyptus.component.ServiceConfiguration; import com.eucalyptus.compute.common.CloudMetadatas; import com.eucalyptus.compute.common.ImageMetadata; import com.eucalyptus.compute.common.internal.util.MetadataException; import com.eucalyptus.compute.common.internal.vm.MigrationState; import com.eucalyptus.compute.common.internal.vm.VmInstance; import com.eucalyptus.context.Contexts; import com.eucalyptus.context.ServiceStateException; import com.eucalyptus.entities.Entities; import com.eucalyptus.entities.TransactionResource; import com.eucalyptus.images.Emis; import com.eucalyptus.util.EucalyptusCloudException; import com.eucalyptus.util.Exceptions; import com.eucalyptus.util.async.AsyncRequests; import com.eucalyptus.vm.VmInstances; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.base.Predicate; import com.google.common.base.Predicates; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Maps; /** * */ public class Migrations { private static Logger LOG = Logger.getLogger( Migrations.class ); private final Cluster cluster; private final ReadWriteLock gateLock; private final Predicate<VmInstance> filterPartition = new Predicate<VmInstance>( ) { @Override public boolean apply( VmInstance input ) { return input.getPartition( ).equals( getPartition( ) ) && MigrationState.isMigrating( input ); } }; private Migrations( final Cluster cluster ) { this.cluster = cluster; this.gateLock = cluster.getGateLock( ); } public static Migrations using( @Nonnull final Cluster cluster ) { return new Migrations( cluster ); } private ServiceConfiguration getConfiguration( ) { return cluster.getConfiguration( ); } private String getPartition( ) { return cluster.getPartition( ); } /** * <ol> * <li> Mark this cluster as gated. * <li> Update node and resource information; describe resources. * <li> Find all VMs and update their migration state and volumes * <li> Send the MigrateInstances operation. * <li> Update node and resource information; describe resources. * <li> Unmark this cluster as gated. * </ol> * @param sourceHost * @param destHostsWhiteList -- the destination host list is a white list when true and a black list when false * @param destHosts -- list of hosts which are either a white list or black list based on {@code destHostsWhiteList} * @throws EucalyptusCloudException * @throws Exception */ public void migrateInstances( final String sourceHost, final Boolean destHostsWhiteList, final List<String> destHosts ) throws Exception { //#1 Mark this cluster as gated. if ( this.gateLock.writeLock( ).tryLock( 60, TimeUnit.SECONDS ) ) { try { //#2 Only one migration per cluster for now List<VmInstance> currentMigrations = this.lookupCurrentMigrations( ); if ( !currentMigrations.isEmpty( ) ) { throw Exceptions.toUndeclared( "Cannot start a new migration because the following are already ongoing: " + Joiner.on( ", " ).join( Iterables.transform( currentMigrations, CloudMetadatas.toDisplayName( ) ) ) ); } //#3 Update node and resource information this.retryCheck( ); //#4 Find all VMs and update their migration state and volumes List<String> instanceIds = this.prepareInstanceEvacuations( sourceHost ); //#5 Send the MigrateInstances operation. try { //Get updated download manifests for PV instances final Map<Boolean, Set<String>> updatedResources = getFreshBootrecords(instanceIds, true); AsyncRequests.sendSync( this.getConfiguration( ), new ClusterMigrateInstancesType( ) { { this.setCorrelationId( Contexts.lookup( ).getCorrelationId( ) ); this.setSourceHost( sourceHost ); this.setResourceLocations( Lists.newArrayList(updatedResources.get(true))); this.setAllowHosts( destHostsWhiteList ); this.getDestinationHosts( ).addAll( destHosts ); } } ); } catch ( Exception ex ) { //#5 On error go back and abort the migration status for every instance this.rollbackInstanceEvacuations( sourceHost ); throw ex; } //#6 Update node and resource information; describe resources. this.retryCheck( ); } catch ( Exception ex ) { LOG.error( ex ); throw ex; } finally { //#6 Unmark this cluster as gated. this.gateLock.writeLock( ).unlock( ); } } else { throw new ServiceStateException( "Failed to request migration in the zone " + this.getPartition( ) + ", it is currently locked for maintenance." ); } } /** * Given a list of instance IDs, return a list of VmTypeInfos with updated download manifest URLs * that are valid for the default timeout (hours) for the instances in the id list that are PV * instances. Thus, length of input and output lists may vary due to filtering. * @param instanceIdsToRefresh * @return map of true->Set of eki/eri=signedUrls and false->Set of instanceIds with some failure (e.g eri/eki not found) */ protected static Map<Boolean, Set<String>> getFreshBootrecords(List<String> instanceIdsToRefresh, boolean pvOnly) throws MetadataException { VmInstance vm; Map<Boolean, Set<String>> outputMap = Maps.newHashMap(); outputMap.put(true, new HashSet<String>()); outputMap.put(false, new HashSet<String>()); for(String id : instanceIdsToRefresh) { try ( final TransactionResource db = Entities.transactionFor(VmInstance.class) ) {//scope for transaction vm = VmInstances.lookup(id); //Only update PV images, because NC needs URLs for ramdisk and kernels if(pvOnly && ImageMetadata.VirtualizationType.paravirtualized.equals( ImageMetadata.VirtualizationType.fromString().apply(vm.getVirtualizationType()))) { Emis.BootableSet bs = Emis.recreateBootableSet(vm); if(bs.hasKernel() && !outputMap.get(true).contains(bs.getKernel().getDisplayName())) { try { outputMap.get(true).add(bs.getKernel().getDisplayName() + "=" + bs .getKernelDownloadManifest( Partitions.lookupByName(vm.getPartition()).getNodeCertificate() .getPublicKey(), vm.getReservationId())); } catch(MetadataException ex) { LOG.warn("Could not get kernel download manifest for migration of instance: " + id + ". Migration may fail for this instance", ex); throw ex; } } if(bs.hasRamdisk() && !outputMap.get(true).contains(bs.getRamdisk().getDisplayName())) { try { outputMap.get(true).add(bs.getRamdisk().getDisplayName() + "=" + bs .getRamdiskDownloadManifest( Partitions.lookupByName(vm.getPartition()).getNodeCertificate() .getPublicKey(), vm.getReservationId())); } catch(MetadataException ex) { LOG.warn("Could not get ramdisk download manifest for migration of instance: " + id + ". Migration may fail for this instance", ex); throw ex; } } } } catch (Exception e) { LOG.warn("Failure during update of download manifest while building new bootset. May not be able migrate this instance: " + id, e); outputMap.get(false).add(id); } } return outputMap; } /** * <ol> * <li> Mark this cluster as gated. * <li> Update node and resource information; describe resources. * <li> Find the VM and its volume attachments and authorize every node's IQN. * <li> Send the MigrateInstances operation. * <li> Update node and resource information; describe resources. * <li> Unmark this cluster as gated. * </ol> * @param destHostsWhiteList -- the destination host list is a white list when true and a black list when false * @param destHosts -- list of hosts which are either a white list or black list based on {@code destHostsWhiteList} * @throws EucalyptusCloudException * @throws Exception */ public void migrateInstance( final String instanceId, final Boolean destHostsWhiteList, final List<String> destHosts ) throws Exception { //#1 Mark this cluster as gated. if ( this.gateLock.writeLock( ).tryLock( 60, TimeUnit.SECONDS ) ) { try { //#2 Only one migration per cluster for now List<VmInstance> currentMigrations = this.lookupCurrentMigrations( ); if ( !currentMigrations.isEmpty( ) ) { throw Exceptions.toUndeclared( "Cannot start a new migration because the following are already ongoing: " + Joiner.on( ", " ).join( Iterables.transform( currentMigrations, CloudMetadatas.toDisplayName( ) ) ) ); } //#3 Update node and resource information this.retryCheck( ); //#4 Find all VMs and update their migration state and volumes this.prepareInstanceMigrations( instanceId ); try { //Get updated download manifests for PV instances final Map<Boolean, Set<String>> updatedResources = getFreshBootrecords( ImmutableList.of(instanceId), true); //#5 Send the MigrateInstances operation. AsyncRequests.sendSync( this.getConfiguration( ), new ClusterMigrateInstancesType( ) { { this.setCorrelationId( Contexts.lookup( ).getCorrelationId()); this.setInstanceId(instanceId); this.setResourceLocations(Lists.newArrayList(updatedResources.get(true))); this.setAllowHosts(destHostsWhiteList); this.getDestinationHosts( ).addAll( destHosts ); } } ); } catch ( Exception ex ) { //#5 On error go back and abort the migration status for every instance this.rollbackInstanceMigrations( instanceId ); throw ex; } //#6 Update node and resource information; describe resources. this.retryCheck( ); } catch ( Exception ex ) { LOG.error( ex ); throw ex; } finally { //#6 Unmark this cluster as gated. this.gateLock.writeLock( ).unlock( ); } } else { throw new ServiceStateException( "Failed to request migration in the zone " + this.getPartition( ) + ", it is currently locked for maintenance." ); } } private void rollbackInstanceEvacuations( final String sourceHost ) { Predicate<VmInstance> filterHost = new Predicate<VmInstance>( ) { @Override public boolean apply( @Nullable VmInstance input ) { String vmHost = URI.create( input.getServiceTag( ) ).getHost( ); return Strings.nullToEmpty( vmHost ).equals( sourceHost ); } }; Predicate<VmInstance> rollbackMigration = new Predicate<VmInstance>( ) { @Override public boolean apply( @Nullable VmInstance input ) { VmInstances.abortMigration( input ); return true; } }; Predicate<VmInstance> filterAndAbort = Predicates.and( this.filterPartition, rollbackMigration ); Predicate<VmInstance> rollbackMigrationTx = Entities.asTransaction( VmInstance.class, filterAndAbort ); VmInstances.list( rollbackMigrationTx ); } @SuppressWarnings( "unchecked" ) private List<String> prepareInstanceEvacuations( final String sourceHost ) { Predicate<VmInstance> filterHost = new Predicate<VmInstance>( ) { @Override public boolean apply( @Nullable VmInstance input ) { String vmHost = URI.create( input.getServiceTag( ) ).getHost( ); return Strings.nullToEmpty( vmHost ).equals( sourceHost ); } }; Predicate<VmInstance> startMigration = new Predicate<VmInstance>( ) { @Override public boolean apply( @Nullable VmInstance input ) { VmInstances.startMigration( input ); return true; } }; Predicate<VmInstance> filterAndAbort = Predicates.and( this.filterPartition, startMigration ); Predicate<VmInstance> startMigrationTx = Entities.asTransaction( VmInstance.class, filterAndAbort ); return Lists.transform(VmInstances.list(startMigrationTx), new Function<VmInstance, String>() { @Nullable @Override public String apply(@Nullable VmInstance vmInstance) { return vmInstance.getInstanceId(); } }); } private void rollbackInstanceMigrations( final String instanceId ) { Predicate<VmInstance> rollbackMigration = new Predicate<VmInstance>( ) { @Override public boolean apply( @Nullable VmInstance input ) { VmInstances.abortMigration( input ); return true; } }; Predicate<VmInstance> rollbackMigrationTx = Entities.asTransaction( VmInstance.class, rollbackMigration ); rollbackMigrationTx.apply( VmInstances.lookup( instanceId ) ); } @SuppressWarnings( "unchecked" ) private void prepareInstanceMigrations( final String instanceId ) { Predicate<VmInstance> startMigration = new Predicate<VmInstance>( ) { @Override public boolean apply( @Nullable VmInstance input ) { VmInstances.startMigration( input ); return true; } }; Predicate<VmInstance> startMigrationTx = Entities.asTransaction( VmInstance.class, startMigration ); startMigrationTx.apply( VmInstances.lookup( instanceId ) ); } private List<VmInstance> lookupCurrentMigrations( ) throws Exception { return VmInstances.list( this.filterPartition ); } private void retryCheck( ) throws Exception { Exception lastEx = null; for ( int i = 0; i < 5; i++ ) { try { this.cluster.check( ); return; } catch ( Exception ex ) { LOG.debug( "Retrying after failed attempt to refresh cluster state in check(): " + ex.getMessage( ) ); lastEx = ex; TimeUnit.SECONDS.sleep( 2 ); } } throw new ServiceStateException( "Failed to request migration in the zone " + this.getPartition( ) + " because updating resources returned an error: " + ( lastEx != null ? lastEx.getMessage( ) : "unknown error" ) ); } }