/************************************************************************* * Copyright 2009-2016 Eucalyptus Systems, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. * * Please contact Eucalyptus Systems, Inc., 6755 Hollister Ave., Goleta * CA 93117, USA or visit http://www.eucalyptus.com/licenses/ if you need * additional information or have any questions. * * This file may incorporate work covered under the following copyright * and permission notice: * * Software License Agreement (BSD License) * * Copyright (c) 2008, Regents of the University of California * All rights reserved. * * Redistribution and use of this software in source and binary forms, * with or without modification, are permitted provided that the * following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. USERS OF THIS SOFTWARE ACKNOWLEDGE * THE POSSIBLE PRESENCE OF OTHER OPEN SOURCE LICENSED MATERIAL, * COPYRIGHTED MATERIAL OR PATENTED MATERIAL IN THIS SOFTWARE, * AND IF ANY SUCH MATERIAL IS DISCOVERED THE PARTY DISCOVERING * IT MAY INFORM DR. RICH WOLSKI AT THE UNIVERSITY OF CALIFORNIA, * SANTA BARBARA WHO WILL THEN ASCERTAIN THE MOST APPROPRIATE REMEDY, * WHICH IN THE REGENTS' DISCRETION MAY INCLUDE, WITHOUT LIMITATION, * REPLACEMENT OF THE CODE SO IDENTIFIED, LICENSING OF THE CODE SO * IDENTIFIED, OR WITHDRAWAL OF THE CODE CAPABILITY TO THE EXTENT * NEEDED TO COMPLY WITH ANY SUCH LICENSES OR RIGHTS. ************************************************************************/ package com.eucalyptus.cloud.run; import static com.eucalyptus.cloud.VmInstanceLifecycleHelpers.NetworkResourceVmInstanceLifecycleHelper; import static com.eucalyptus.util.RestrictedTypes.BatchAllocator; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import javax.annotation.Nullable; import javax.persistence.EntityTransaction; import com.eucalyptus.auth.AuthException; import com.eucalyptus.cloud.VmInstanceToken; import com.eucalyptus.cluster.Clusters; import com.eucalyptus.compute.common.CloudMetadataLimitedType; import com.eucalyptus.compute.common.internal.vmtypes.VmType; import com.google.common.base.Function; import org.apache.log4j.Logger; import com.eucalyptus.blockstorage.Storage; import com.eucalyptus.cloud.VmInstanceLifecycleHelpers; import com.eucalyptus.cloud.VmInstanceLifecycleHelper; import com.eucalyptus.cloud.run.Allocations.Allocation; import com.eucalyptus.compute.common.internal.util.IllegalMetadataAccessException; import com.eucalyptus.compute.common.internal.util.NotEnoughResourcesException; import com.eucalyptus.cluster.common.internal.Cluster; import com.eucalyptus.cluster.common.internal.ResourceState; import com.eucalyptus.cluster.common.internal.ResourceState.VmTypeAvailability; import com.eucalyptus.component.Partition; import com.eucalyptus.component.Partitions; import com.eucalyptus.component.ServiceConfiguration; import com.eucalyptus.component.Topology; import com.eucalyptus.cluster.common.ClusterController; import com.eucalyptus.compute.common.CloudMetadata; import com.eucalyptus.compute.common.network.DnsHostNamesFeature; import com.eucalyptus.compute.common.network.NetworkFeature; import com.eucalyptus.compute.common.network.NetworkResource; import com.eucalyptus.compute.common.network.Networking; import com.eucalyptus.compute.common.network.PrepareNetworkResourcesResultType; import com.eucalyptus.compute.common.network.PrepareNetworkResourcesType; import com.eucalyptus.context.ServiceStateException; import com.eucalyptus.entities.Entities; import com.eucalyptus.compute.common.internal.images.BlockStorageImageInfo; import com.eucalyptus.compute.common.internal.network.NetworkGroup; import com.eucalyptus.records.EventRecord; import com.eucalyptus.records.EventType; import com.eucalyptus.records.Logs; import com.eucalyptus.scripting.ScriptExecutionFailedException; import com.eucalyptus.util.EucalyptusCloudException; import com.eucalyptus.util.Exceptions; import com.eucalyptus.util.HasName; import com.eucalyptus.util.LogUtil; import com.eucalyptus.util.RestrictedTypes; import com.eucalyptus.compute.common.internal.vm.VmInstance; import com.eucalyptus.vmtypes.VmTypes; import com.google.common.base.MoreObjects; import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; import com.google.common.collect.TreeMultimap; public class AdmissionControl { private static Logger LOG = Logger.getLogger( AdmissionControl.class ); public static Predicate<Allocation> run( ) { return RunAdmissionControl.INSTANCE; } public static Predicate<Allocation> restore( ) { return Restore.INSTANCE; } enum RunAdmissionControl implements Predicate<Allocation> { INSTANCE; @Override public boolean apply( Allocation allocInfo ) { if ( EventRecord.isTraceEnabled( AdmissionControl.class ) ) { EventRecord.here( AdmissionControl.class, EventType.VM_RESERVED, LogUtil.dumpObject( allocInfo ) ).trace( ); } List<ResourceAllocator> finished = Lists.newArrayList( ); EntityTransaction db = Entities.get( NetworkGroup.class ); try { for ( ResourceAllocator allocator : allocators ) { runAllocatorSafely( allocInfo, allocator ); finished.add( allocator ); } db.commit( ); return true; } catch ( Exception ex ) { Logs.exhaust( ).error( ex, ex ); rollbackAllocations( allocInfo, finished, ex ); db.rollback( ); throw Exceptions.toUndeclared( new NotEnoughResourcesException( Exceptions.getCauseMessage( ex ), ex ) ); } } } enum Restore implements Predicate<Allocation> { INSTANCE; @Override public boolean apply( Allocation allocInfo ) { List<ResourceAllocator> finished = Lists.newArrayList( ); EntityTransaction db = Entities.get( NetworkGroup.class ); try { for ( ResourceAllocator allocator : restorers ) { runAllocatorSafely( allocInfo, allocator ); finished.add( allocator ); } db.commit( ); return true; } catch ( Exception ex ) { Logs.exhaust( ).error( ex, ex ); rollbackAllocations( allocInfo, finished, ex ); db.rollback( ); throw Exceptions.toUndeclared( new NotEnoughResourcesException( ex.getMessage( ), ex ) ); } } } private static void rollbackAllocations( Allocation allocInfo, List<ResourceAllocator> finished, Exception e ) { for ( ResourceAllocator rollback : Lists.reverse( finished ) ) { try { rollback.fail( allocInfo, e ); } catch ( Exception e1 ) { LOG.debug( e1, e1 ); } } } private static void runAllocatorSafely( Allocation allocInfo, ResourceAllocator allocator ) throws Exception { try { allocator.allocate( allocInfo ); } catch ( ScriptExecutionFailedException e ) { if ( e.getCause( ) != null ) { throw new EucalyptusCloudException( e.getCause( ).getMessage( ), e.getCause( ) ); } else { throw new EucalyptusCloudException( e.getMessage( ), e ); } } catch ( Exception e ) { LOG.debug( e, e ); try { allocator.fail( allocInfo, e ); } catch ( Exception e1 ) { LOG.debug( e1, e1 ); } throw e; } } private interface ResourceAllocator { public void allocate( Allocation allocInfo ) throws Exception; public void fail( Allocation allocInfo, Throwable t ); } private static final List<ResourceAllocator> allocators = ImmutableList.<ResourceAllocator>of( NodeResourceAllocator.INSTANCE, NetworkingAllocator.INSTANCE ); private static final List<ResourceAllocator> restorers = ImmutableList.<ResourceAllocator>of( NetworkingAllocator.INSTANCE ); enum NodeResourceAllocator implements ResourceAllocator { INSTANCE; private List<VmInstanceToken> requestResourceToken( final Allocation allocInfo, final int tryAmount, final int maxAmount ) throws Exception { ServiceConfiguration config = Topology.lookup( ClusterController.class, allocInfo.getPartition( ) ); Cluster cluster = Clusters.lookupAny( config ); /** * TODO:GRZE: this is the call path which needs to trigger gating. * It shouldn't be handled directly here, but instead be handled in {@link ResourceState#requestResourceAllocation(). * */ if ( cluster.getGateLock( ).readLock( ).tryLock( 60, TimeUnit.SECONDS ) ) { try { final ResourceState state = cluster.getNodeState( ); /** * NOTE: If the defined instance type has an ordering conflict w/ some other type then it * isn't safe to service TWO requests which use differing types during the same resource refresh * duty cycle. * This determines whether or not an asynchronous allocation is safe to do for the * request instance type or whether a synchronous resource availability refresh is needed. * */ boolean unorderedType = VmTypes.isUnorderedType( allocInfo.getVmType( ) ); boolean forceResourceRefresh = state.hasUnorderedTokens( ) || unorderedType; /** * GRZE: if the vm type is not "nicely" ordered then we force a refresh of the actual * cluster state. Note: we already hold the cluster gating lock here so this update will * be mutual exclusive wrt both resource allocations and cluster state updates. */ if ( forceResourceRefresh ) { cluster.refreshResources( ); } final BatchAllocator<VmInstanceToken> allocator = new BatchAllocator<VmInstanceToken>( ) { @Override public List<VmInstanceToken> allocate( int min, int max ) { try { // do quotas for "active" instances RestrictedTypes.allocateMeasurableResource(Long.valueOf(1L*max), new Function<Long, CloudMetadataLimitedType.VmInstanceActiveMetadata>() { @Nullable @Override public CloudMetadataLimitedType.VmInstanceActiveMetadata apply(@Nullable Long amount) { return new CloudMetadataLimitedType.VmInstanceActiveMetadata() { }; // kind of a marker for active instances } }); // do quotas for instance specific items (cpu, memory, disk) RestrictedTypes.allocateMeasurableResource(max * Long.valueOf(allocInfo.getVmType().getCpu().longValue()), new Function<Long, CloudMetadataLimitedType.CpuMetadata>() { @Nullable @Override public CloudMetadataLimitedType.CpuMetadata apply(@Nullable Long amount) { return new CloudMetadataLimitedType.CpuMetadata() { }; // kind of a marker for cpu } }); RestrictedTypes.allocateMeasurableResource(max * Long.valueOf(allocInfo.getVmType().getMemory().longValue()), new Function<Long, CloudMetadataLimitedType.MemoryMetadata>() { @Nullable @Override public CloudMetadataLimitedType.MemoryMetadata apply(@Nullable Long amount) { return new CloudMetadataLimitedType.MemoryMetadata() { }; // kind of a marker for memory } }); RestrictedTypes.allocateMeasurableResource(max * Long.valueOf(allocInfo.getVmType().getDisk().longValue()), new Function<Long, CloudMetadataLimitedType.DiskMetadata>() { @Nullable @Override public CloudMetadataLimitedType.DiskMetadata apply(@Nullable Long amount) { return new CloudMetadataLimitedType.DiskMetadata() { }; // kind of a marker for disk } }); final List<VmInstanceToken> ret = state.requestResourceAllocation( allocInfo.getVmType( ), min, max, new Supplier<VmInstanceToken>( ) { private int count = 0; @Override public VmInstanceToken get( ) { return new VmInstanceToken( allocInfo, count++ ); } } ); allocInfo.getAllocationTokens().addAll( ret ); return ret; } catch ( final NotEnoughResourcesException | AuthException e ) { throw Exceptions.toUndeclared( e ); } } }; if ( allocInfo.getAllocationType( ) == Allocations.AllocationType.Start && maxAmount==1 && allocInfo.getInstanceIds( ).size( ) == 1 ) { RestrictedTypes.reallocateUnitlessResource( CloudMetadata.VmInstanceMetadata.class, allocator ); } else { RestrictedTypes.allocateUnitlessResources( CloudMetadata.VmInstanceMetadata.class, tryAmount, maxAmount, allocator, allocInfo.exampleInstanceResource( maxAmount==1 ) ); } return allocInfo.getAllocationTokens( ); } finally { cluster.getGateLock( ).readLock( ).unlock( ); } } else { throw new ServiceStateException( "Failed to allocate resources in the zone " + cluster.getPartition( ) + ", it is currently locked for maintenance." ); } } @Override public void allocate( Allocation allocInfo ) throws Exception { Partition reqPartition = allocInfo.getPartition(); String zoneName = reqPartition.getName( ); VmType vmType = allocInfo.getVmType( ); /* Validate min and max amount */ final int minAmount = allocInfo.getMinCount( ); final int maxAmount = allocInfo.getMaxCount( ); if(minAmount > maxAmount) throw new RuntimeException("Maximum instance count must not be smaller than minimum instance count"); /* Retrieve our context and list of clusters associated with this zone */ List<Cluster> authorizedClusters = this.doPrivilegedLookup( zoneName, vmType ); int remaining = maxAmount; int allocated = 0; int available; LOG.info( "Found authorized clusters: " + Iterables.transform( authorizedClusters, HasName.GET_NAME ) ); /* Do we have any VM available throughout our clusters? */ if ( ( available = checkAvailability( vmType, authorizedClusters ) ) < minAmount ) { throw new NotEnoughResourcesException( "Not enough resources (" + available + " in " + zoneName + " < " + minAmount + "): vm instances." ); } else { for ( Cluster cluster : authorizedClusters ) { if ( remaining <= 0 ) { break; } else { ResourceState state = cluster.getNodeState( ); Partition partition = cluster.getConfiguration( ).lookupPartition( ); /* Has a partition been set if the AZ was not specified? */ if( allocInfo.getPartition( ).equals( Partition.DEFAULT ) ) { /* * Ok, do we have enough slots in this partition to support our request? We should have at least * the minimum. The list is sorted in order of resource availability from the cluster with the most * available to the cluster with the least amount available. This is why we don't check against the * maxAmount value since its a best effort at this point. If we select the partition here and we * can't fit maxAmount, based on the sorting order, the next partition will not fit maxAmount anyway. */ int zoneAvailable = checkZoneAvailability( vmType, partition, authorizedClusters ); if( zoneAvailable < minAmount ) continue; /* Lets use this partition */ allocInfo.setPartition( partition ); } else if( !allocInfo.getPartition( ).equals( partition ) ) { /* We should only pick clusters that are part of the selected AZ */ continue; } if ( !RestrictedTypes.filterPrivileged( ).apply( allocInfo.exampleInstanceResource( maxAmount==1 )) ) { throw new IllegalMetadataAccessException( "Instance resource denied." ); } if ( allocInfo.getBootSet( ).getMachine( ) instanceof BlockStorageImageInfo ) { try { Topology.lookup( Storage.class, partition ); } catch ( Exception ex ) { allocInfo.abort( ); allocInfo.setPartition( reqPartition ); throw new NotEnoughResourcesException( "Not enough resources: Cannot run EBS instances in partition w/o a storage controller: " + ex.getMessage( ), ex ); } } try { int tryAmount = ( remaining > state.getAvailability( vmType ).getAvailable( ) ) ? state.getAvailability( vmType ).getAvailable( ) : remaining; List<VmInstanceToken> tokens = this.requestResourceToken( allocInfo, tryAmount, maxAmount ); remaining -= tokens.size( ); allocated += tokens.size( ); } catch ( Exception t ) { LOG.error( t ); Logs.extreme( ).error( t, t ); allocInfo.abort( ); allocInfo.setPartition( reqPartition ); /* if we still have some allocation remaining AND no more resources are available */ if ( ( ( available = checkZoneAvailability( vmType, partition, authorizedClusters ) ) < remaining ) && ( remaining > 0 ) ) { throw new NotEnoughResourcesException( "Not enough resources (" + available + " in " + zoneName + " < " + minAmount + "): vm instances.", t ); } else { throw new NotEnoughResourcesException( t.getMessage(), t ); } } } } /* Were we able to meet our minimum requirements? */ if ( ( allocated < minAmount) && ( remaining > 0 ) ) { allocInfo.abort( ); allocInfo.setPartition( reqPartition ); if( reqPartition.equals( Partition.DEFAULT ) ) { throw new NotEnoughResourcesException( "Not enough resources available in all zone for " + minAmount + "): vm instances." ); } else { available = checkZoneAvailability( vmType, reqPartition, authorizedClusters ); throw new NotEnoughResourcesException( "Not enough resources (" + available + " in " + zoneName + " < " + minAmount + "): vm instances." ); } } } } private int checkAvailability( VmType vmType, List<Cluster> authorizedClusters ) throws NotEnoughResourcesException { int available = 0; for ( Cluster authorizedCluster : authorizedClusters ) { VmTypeAvailability vmAvailability = authorizedCluster.getNodeState( ).getAvailability( vmType ); available += vmAvailability.getAvailable( ); LOG.info( "Availability: " + authorizedCluster.getName( ) + " -> " + vmAvailability.getAvailable( ) ); } return available; } private int checkZoneAvailability( VmType vmType, Partition partition, List<Cluster> authorizedClusters ) throws NotEnoughResourcesException { int available = 0; for ( Cluster authorizedCluster : authorizedClusters ) { if( !authorizedCluster.getConfiguration( ).lookupPartition( ).equals( partition ) ) continue; VmTypeAvailability vmAvailability = authorizedCluster.getNodeState( ).getAvailability( vmType ); available += vmAvailability.getAvailable( ); LOG.info( "Availability: " + authorizedCluster.getName( ) + " -> " + vmAvailability.getAvailable( ) ); } return available; } private List<Cluster> doPrivilegedLookup( String partitionName, VmType vmType ) throws NotEnoughResourcesException { if ( Partition.DEFAULT_NAME.equals( partitionName ) ) { Iterable<Cluster> authorizedClusters = Clusters.stream( ).filter( RestrictedTypes.filterPrivilegedWithoutOwner( ) ); Multimap<VmTypeAvailability, Cluster> sorted = TreeMultimap.create( ); for ( Cluster c : authorizedClusters ) { sorted.put( c.getNodeState( ).getAvailability( vmType ), c ); } if ( sorted.isEmpty( ) ) { throw new NotEnoughResourcesException( "Not enough resources: no availability zone is available in which you have permissions to run instances." ); } else { return Lists.newArrayList( sorted.values( ) ); } } else { ServiceConfiguration ccConfig = Topology.lookup( ClusterController.class, Partitions.lookupByName( partitionName ) ); Cluster cluster = Clusters.lookupAny( ccConfig ); if ( cluster == null ) { throw new NotEnoughResourcesException( "Can't find cluster " + partitionName ); } if ( ! RestrictedTypes.filterPrivilegedWithoutOwner( ).apply( cluster ) ) { throw new NotEnoughResourcesException( "Not authorized to use cluster " + partitionName ); } return Lists.newArrayList( cluster ); } } @Override public void fail( Allocation allocInfo, Throwable t ) { allocInfo.abort( ); } } enum NetworkingAllocator implements ResourceAllocator { INSTANCE; @Override public void allocate( Allocation allocInfo ) throws Exception { try { final VmInstanceLifecycleHelper helper = VmInstanceLifecycleHelpers.get( ); final PrepareNetworkResourcesType request = new PrepareNetworkResourcesType( ); request.setAvailabilityZone( allocInfo.getPartition( ).getName( ) ); request.setFeatures( Lists.<NetworkFeature>newArrayList( new DnsHostNamesFeature( ) ) ); helper.prepareNetworkAllocation( allocInfo, request ); final PrepareNetworkResourcesResultType result = Networking.getInstance().prepare( request ) ; for ( final VmInstanceToken token : allocInfo.getAllocationTokens( ) ) { for ( final NetworkResource networkResource : result.getResources( ) ) { if ( token.getInstanceId( ).equals( networkResource.getOwnerId( ) ) ) { token.getAttribute( NetworkResourceVmInstanceLifecycleHelper.NetworkResourcesKey ).add( networkResource ); } } } helper.verifyNetworkAllocation( allocInfo, result ); } catch ( Exception e ) { throw MoreObjects.firstNonNull( Exceptions.findCause( e, NotEnoughResourcesException.class ), e ); } } @Override public void fail( Allocation allocInfo, Throwable t ) { allocInfo.abort( ); } } }