/*
 * Copyright 2012 Netflix, Inc.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package com.netflix.eureka.registry;

import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;

import com.netflix.appinfo.AmazonInfo;
import com.netflix.appinfo.AmazonInfo.MetaDataKey;
import com.netflix.appinfo.ApplicationInfoManager;
import com.netflix.appinfo.DataCenterInfo;
import com.netflix.appinfo.DataCenterInfo.Name;
import com.netflix.appinfo.InstanceInfo;
import com.netflix.appinfo.InstanceInfo.InstanceStatus;
import com.netflix.appinfo.LeaseInfo;
import com.netflix.discovery.EurekaClient;
import com.netflix.discovery.EurekaClientConfig;
import com.netflix.discovery.shared.Application;
import com.netflix.discovery.shared.Applications;
import com.netflix.eureka.registry.rule.DownOrStartingRule;
import com.netflix.eureka.registry.rule.FirstMatchWinsCompositeRule;
import com.netflix.eureka.registry.rule.InstanceStatusOverrideRule;
import com.netflix.eureka.registry.rule.LeaseExistsRule;
import com.netflix.eureka.registry.rule.OverrideExistsRule;
import com.netflix.eureka.resources.CurrentRequestVersion;
import com.netflix.eureka.EurekaServerConfig;
import com.netflix.eureka.Version;
import com.netflix.eureka.cluster.PeerEurekaNode;
import com.netflix.eureka.cluster.PeerEurekaNodes;
import com.netflix.eureka.lease.Lease;
import com.netflix.eureka.resources.ASGResource.ASGStatus;
import com.netflix.eureka.resources.ServerCodecs;
import com.netflix.eureka.util.MeasuredRate;
import com.netflix.servo.DefaultMonitorRegistry;
import com.netflix.servo.annotations.DataSourceType;
import com.netflix.servo.monitor.Monitors;
import com.netflix.servo.monitor.Stopwatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import javax.inject.Singleton;

/**
 * Handles replication of all operations to {@link AbstractInstanceRegistry} to peer
 * <em>Eureka</em> nodes to keep them all in sync.
 *
 * <p>
 * Primary operations that are replicated are the
 * <em>Registers, Renewals, Cancels, Expirations and Status Changes</em>
 * </p>
 *
 * <p>
 * When the eureka server starts up it tries to fetch all the registry
 * information from the peer eureka nodes. If for some reason this operation
 * fails, the server does not allow the user to get the registry information for
 * a period specified in
 * {@link com.netflix.eureka.EurekaServerConfig#getWaitTimeInMsWhenSyncEmpty()}.
 * </p>
 *
 * <p>
 * One important thing to note about <em>renewals</em>. If the renewal drops more
 * than the specified threshold as specified in
 * {@link com.netflix.eureka.EurekaServerConfig#getRenewalPercentThreshold()} within a period of
 * {@link com.netflix.eureka.EurekaServerConfig#getRenewalThresholdUpdateIntervalMs()}, eureka
 * perceives this as a danger and stops expiring instances.
 * </p>
 *
 * @author Karthik Ranganathan, Greg Kim
 *
 */
@Singleton
public class PeerAwareInstanceRegistryImpl extends AbstractInstanceRegistry implements PeerAwareInstanceRegistry {
    private static final Logger logger = LoggerFactory.getLogger(PeerAwareInstanceRegistryImpl.class);

    private static final String US_EAST_1 = "us-east-1";

    /** Pause between two attempts to prime the connections to the peer nodes. */
    private static final int PRIME_PEER_NODES_RETRY_MS = 30000;

    /** Extra time added on top of a lease's end before priming skips the instance. */
    private static final long PRIME_EXPIRED_LEASE_GRACE_MS = 2 * 60 * 1000L;

    /** Time at which {@link #openForTraffic(ApplicationInfoManager, int)} ran; 0 until then. */
    private long startupTime = 0;

    /** Stays true until {@code openForTraffic} is called with a positive instance count. */
    private boolean peerInstancesTransferEmptyOnStartup = true;

    /**
     * The registry operations that are replicated to peer nodes. Every
     * constant owns a servo timer, named after the constant, that
     * {@code replicateToPeers} uses to time the replication.
     */
    public enum Action {
        Heartbeat, Register, Cancel, StatusUpdate, DeleteStatusOverride;

        private final com.netflix.servo.monitor.Timer timer = Monitors.newTimer(this.name());

        public com.netflix.servo.monitor.Timer getTimer() {
            return this.timer;
        }
    }

    /** Orders applications by {@link Application#getName()}. */
    private static final Comparator<Application> APP_COMPARATOR = new Comparator<Application>() {
        @Override
        public int compare(Application l, Application r) {
            return l.getName().compareTo(r.getName());
        }
    };

    private final MeasuredRate numberOfReplicationsLastMin;

    protected final EurekaClient eurekaClient;
    protected volatile PeerEurekaNodes peerEurekaNodes;

    private final InstanceStatusOverrideRule instanceStatusOverrideRule;

    /** Daemon timer that runs the periodic renewal threshold update. */
    private final Timer timer = new Timer(
            "ReplicaAwareInstanceRegistry - RenewalThresholdUpdater", true);

    /**
     * Creates the registry. Peer replication is not available until
     * {@link #init(PeerEurekaNodes)} has been called.
     */
    @Inject
    public PeerAwareInstanceRegistryImpl(
            EurekaServerConfig serverConfig,
            EurekaClientConfig clientConfig,
            ServerCodecs serverCodecs,
            EurekaClient eurekaClient
    ) {
        super(serverConfig, clientConfig, serverCodecs);
        this.eurekaClient = eurekaClient;
        this.numberOfReplicationsLastMin = new MeasuredRate(1000 * 60 * 1);
        // We first check if the instance is STARTING or DOWN, then we check explicit overrides,
        // then we check the status of a potentially existing lease.
        this.instanceStatusOverrideRule = new FirstMatchWinsCompositeRule(
                new DownOrStartingRule(),
                new OverrideExistsRule(overriddenInstanceStatusMap),
                new LeaseExistsRule());
    }

    @Override
    protected InstanceStatusOverrideRule getInstanceInfoOverrideRule() {
        return this.instanceStatusOverrideRule;
    }

    /**
     * Starts the replication rate counter, stores the peer nodes, initializes
     * the response cache and the remote region registry, schedules the renewal
     * threshold updater and registers this object with the servo monitors.
     * A failure to register the monitor is logged and otherwise ignored.
     *
     * @param peerEurekaNodes the peer nodes to replicate to.
     */
    @Override
    public void init(PeerEurekaNodes peerEurekaNodes) throws Exception {
        this.numberOfReplicationsLastMin.start();
        this.peerEurekaNodes = peerEurekaNodes;
        initializedResponseCache();
        scheduleRenewalThresholdUpdateTask();
        initRemoteRegionRegistry();

        try {
            Monitors.registerObject(this);
        } catch (Throwable e) {
            logger.warn("Cannot register the JMX monitor for the InstanceRegistry :", e);
        }
    }

    /**
     * Perform all cleanup and shutdown operations.
     */
    @Override
    public void shutdown() {
        try {
            DefaultMonitorRegistry.getInstance().unregister(Monitors.newObjectMonitor(this));
        } catch (Throwable t) {
            logger.error("Cannot shutdown monitor registry", t);
        }
        try {
            peerEurekaNodes.shutdown();
        } catch (Throwable t) {
            logger.error("Cannot shutdown ReplicaAwareInstanceRegistry", t);
        }
        numberOfReplicationsLastMin.stop();
        // Stop the renewal threshold updater so it does not keep firing after shutdown.
        timer.cancel();

        super.shutdown();
    }

    /**
     * Schedule the task that updates <em>renewal threshold</em> periodically.
     * The renewal threshold would be used to determine if the renewals drop
     * dramatically because of network partition and to protect expiring too
     * many instances at a time.
     *
     */
    private void scheduleRenewalThresholdUpdateTask() {
        timer.schedule(new TimerTask() {
                           @Override
                           public void run() {
                               updateRenewalThreshold();
                           }
                       }, serverConfig.getRenewalThresholdUpdateIntervalMs(),
                serverConfig.getRenewalThresholdUpdateIntervalMs());
    }

    /**
     * Populates the registry from the applications held by the eureka client.
     * The fetch is retried, up to
     * {@link EurekaServerConfig#getRegistrySyncRetries()} times with a pause of
     * {@link EurekaServerConfig#getRegistrySyncRetryWaitMs()} between attempts,
     * for as long as no instance has been registered.
     *
     * @return the number of instances that were registered locally.
     */
    @Override
    public int syncUp() {
        // Copy entire entry from neighboring DS node
        int count = 0;

        for (int i = 0; ((i < serverConfig.getRegistrySyncRetries()) && (count == 0)); i++) {
            if (i > 0) {
                try {
                    Thread.sleep(serverConfig.getRegistrySyncRetryWaitMs());
                } catch (InterruptedException e) {
                    logger.warn("Interrupted during registry transfer..");
                    // Keep the interrupt visible to the caller instead of swallowing it.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            Applications apps = eurekaClient.getApplications();
            for (Application app : apps.getRegisteredApplications()) {
                for (InstanceInfo instance : app.getInstances()) {
                    try {
                        if (isRegisterable(instance)) {
                            register(instance, instance.getLeaseInfo().getDurationInSecs(), true);
                            count++;
                        }
                    } catch (Throwable t) {
                        // One bad instance must not abort the whole transfer.
                        logger.error("During DS init copy", t);
                    }
                }
            }
        }
        return count;
    }

    /**
     * Initializes the renewal expectations from the number of instances
     * obtained at startup, optionally primes the connections to the peer nodes
     * when running in AWS, and then marks this server as
     * {@link InstanceStatus#UP}.
     *
     * @param applicationInfoManager the manager holding this server's own instance info.
     * @param count the number of instances obtained during the startup sync.
     */
    @Override
    public void openForTraffic(ApplicationInfoManager applicationInfoManager, int count) {
        // Renewals happen every 30 seconds and for a minute it should be a factor of 2.
        this.expectedNumberOfRenewsPerMin = count * 2;
        this.numberOfRenewsPerMinThreshold =
                (int) (this.expectedNumberOfRenewsPerMin * serverConfig.getRenewalPercentThreshold());
        logger.info("Got {} instances from neighboring DS node", count);
        logger.info("Renew threshold is: {}", numberOfRenewsPerMinThreshold);
        this.startupTime = System.currentTimeMillis();
        if (count > 0) {
            this.peerInstancesTransferEmptyOnStartup = false;
        }
        DataCenterInfo.Name selfName = applicationInfoManager.getInfo().getDataCenterInfo().getName();
        boolean isAws = Name.Amazon == selfName;
        if (isAws && serverConfig.shouldPrimeAwsReplicaConnections()) {
            logger.info("Priming AWS connections for all replicas..");
            primeAwsReplicas(applicationInfoManager);
        }
        logger.info("Changing status to UP");
        applicationInfoManager.setInstanceStatus(InstanceStatus.UP);
        super.postInit();
    }

    /**
     * Prime connections for Aws replicas.
     * <p>
     * Sometimes when the eureka servers comes up, AWS firewall may not allow
     * the network connections immediately. This will cause the outbound
     * connections to fail, but the inbound connections continue to work. What
     * this means is the clients would have switched to this node (after EIP
     * binding) and so the other eureka nodes will expire all instances that
     * have been switched because of the lack of outgoing heartbeats from this
     * instance.
     * </p>
     * <p>
     * The best protection in this scenario is to block and wait until we are
     * able to ping all eureka nodes successfully at least once. Until then we
     * won't open up the traffic.
     * </p>
     * <p>
     * Any failure is retried after {@link #PRIME_PEER_NODES_RETRY_MS}; an
     * interrupt ends the wait and leaves the thread's interrupt flag set.
     * </p>
     */
    private void primeAwsReplicas(ApplicationInfoManager applicationInfoManager) {
        boolean areAllPeerNodesPrimed = false;
        while (!areAllPeerNodesPrimed) {
            String peerHostName = null;
            try {
                Application eurekaApps = this.getApplication(applicationInfoManager.getInfo().getAppName(), false);
                if (eurekaApps == null) {
                    logger.info("No peers needed to prime.");
                    return;
                }
                for (PeerEurekaNode node : peerEurekaNodes.getPeerEurekaNodes()) {
                    for (InstanceInfo peerInstanceInfo : eurekaApps.getInstances()) {
                        // If the lease is expired - do not worry about priming
                        if (isLeasePastPrimingGrace(peerInstanceInfo.getLeaseInfo())) {
                            continue;
                        }
                        peerHostName = peerInstanceInfo.getHostName();
                        logger.info("Trying to send heartbeat for the eureka server at {} to make sure the "
                                + "network channels are open", peerHostName);
                        // Only try to contact the eureka nodes that are in this instance's registry - because
                        // the other instances may be legitimately down
                        if (peerHostName.equalsIgnoreCase(new URI(node.getServiceUrl()).getHost())) {
                            node.heartbeat(
                                    peerInstanceInfo.getAppName(),
                                    peerInstanceInfo.getId(),
                                    peerInstanceInfo,
                                    null,
                                    true);
                        }
                    }
                }
                areAllPeerNodesPrimed = true;
            } catch (Throwable e) {
                logger.error("Could not contact {}", peerHostName, e);
                try {
                    Thread.sleep(PRIME_PEER_NODES_RETRY_MS);
                } catch (InterruptedException e1) {
                    logger.warn("Interrupted while priming : ", e1);
                    // Stop waiting, but keep the interrupt visible to the caller.
                    Thread.currentThread().interrupt();
                    areAllPeerNodesPrimed = true;
                }
            }
        }
    }

    /**
     * Tells whether the lease ended more than
     * {@link #PRIME_EXPIRED_LEASE_GRACE_MS} ago. The duration is widened to
     * {@code long} before the multiplication so that large lease durations
     * cannot overflow.
     */
    private static boolean isLeasePastPrimingGrace(LeaseInfo leaseInfo) {
        long leaseEndMs = leaseInfo.getRenewalTimestamp() + (leaseInfo.getDurationInSecs() * 1000L);
        return System.currentTimeMillis() > leaseEndMs + PRIME_EXPIRED_LEASE_GRACE_MS;
    }

    /**
     * Checks to see if the registry access is allowed or the server is in a
     * situation where it does not allow getting registry information. The server
     * does not return registry information for a period specified in
     * {@link EurekaServerConfig#getWaitTimeInMsWhenSyncEmpty()}, if it cannot
     * get the registry information from the peer eureka nodes at start up.
     *
     * @param remoteRegionRequired if true, every remote region registry must
     *        also be ready for serving data.
     * @return false - if the instances count from a replica transfer returned
     *         zero and if the wait time has not elapsed, or if a required
     *         remote region registry is not ready; otherwise returns true
     */
    @Override
    public boolean shouldAllowAccess(boolean remoteRegionRequired) {
        if (this.peerInstancesTransferEmptyOnStartup) {
            long accessAllowedAfterMs = this.startupTime + serverConfig.getWaitTimeInMsWhenSyncEmpty();
            if (System.currentTimeMillis() <= accessAllowedAfterMs) {
                return false;
            }
        }
        if (remoteRegionRequired) {
            for (RemoteRegionRegistry remoteRegionRegistry : this.regionNameVSRemoteRegistry.values()) {
                if (!remoteRegionRegistry.isReadyForServingData()) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Same as {@link #shouldAllowAccess(boolean)} with the remote regions
     * required to be ready.
     */
    public boolean shouldAllowAccess() {
        return shouldAllowAccess(true);
    }

    /**
     * Gets the list of peer eureka nodes which is the list to replicate
     * information to.
     *
     * @return the list of replica nodes, as an unmodifiable view.
     * @deprecated use {@link com.netflix.eureka.cluster.PeerEurekaNodes#getPeerEurekaNodes()} directly.
     */
    @Deprecated
    public List<PeerEurekaNode> getReplicaNodes() {
        return Collections.unmodifiableList(peerEurekaNodes.getPeerEurekaNodes());
    }

    /**
     * Cancels the lease locally and, when that succeeds, replicates the cancel
     * to the peers and lowers the renewal expectations by one instance.
     *
     * @param appName the application name of the instance.
     * @param id the id of the instance.
     * @param isReplication true if this is a replication event from other replica nodes.
     * @return true if the local cancel succeeded, false otherwise.
     * @see com.netflix.eureka.registry.InstanceRegistry#cancel(java.lang.String, java.lang.String, boolean)
     */
    @Override
    public boolean cancel(final String appName, final String id,
                          final boolean isReplication) {
        if (super.cancel(appName, id, isReplication)) {
            replicateToPeers(Action.Cancel, appName, id, null, null, isReplication);
            synchronized (lock) {
                if (this.expectedNumberOfRenewsPerMin > 0) {
                    // Since the client wants to cancel it, reduce the threshold (1 for 30 seconds, 2 for a minute)
                    this.expectedNumberOfRenewsPerMin = this.expectedNumberOfRenewsPerMin - 2;
                    this.numberOfRenewsPerMinThreshold =
                            (int) (this.expectedNumberOfRenewsPerMin * serverConfig.getRenewalPercentThreshold());
                }
            }
            return true;
        }
        return false;
    }

    /**
     * Registers the information about the {@link InstanceInfo} and replicates
     * this information to all peer eureka nodes. If this is replication event
     * from other replica nodes then it is not replicated.
     *
     * <p>
     * The lease duration is taken from the instance's lease info when it is
     * present and positive, otherwise {@link Lease#DEFAULT_DURATION_IN_SECS}
     * is used.
     * </p>
     *
     * @param info
     *            the {@link InstanceInfo} to be registered and replicated.
     * @param isReplication
     *            true if this is a replication event from other replica nodes,
     *            false otherwise.
     */
    @Override
    public void register(final InstanceInfo info, final boolean isReplication) {
        int leaseDuration = Lease.DEFAULT_DURATION_IN_SECS;
        if (info.getLeaseInfo() != null && info.getLeaseInfo().getDurationInSecs() > 0) {
            leaseDuration = info.getLeaseInfo().getDurationInSecs();
        }
        super.register(info, leaseDuration, isReplication);
        replicateToPeers(Action.Register, info.getAppName(), info.getId(), info, null, isReplication);
    }

    /**
     * Renews the lease locally and, when that succeeds, replicates the
     * heartbeat to the peers.
     *
     * @param appName the application name of the instance.
     * @param id the id of the instance.
     * @param isReplication true if this is a replication event from other replica nodes.
     * @return true if the local renewal succeeded, false otherwise.
     * @see com.netflix.eureka.registry.InstanceRegistry#renew(java.lang.String, java.lang.String, boolean)
     */
    @Override
    public boolean renew(final String appName, final String id, final boolean isReplication) {
        if (super.renew(appName, id, isReplication)) {
            replicateToPeers(Action.Heartbeat, appName, id, null, null, isReplication);
            return true;
        }
        return false;
    }

    /**
     * Updates the instance status locally and, when that succeeds, replicates
     * the status update to the peers.
     *
     * @return true if the local status update succeeded, false otherwise.
     * @see com.netflix.eureka.registry.InstanceRegistry#statusUpdate(java.lang.String,
     * java.lang.String, com.netflix.appinfo.InstanceInfo.InstanceStatus,
     * java.lang.String, boolean)
     */
    @Override
    public boolean statusUpdate(final String appName, final String id,
                                final InstanceStatus newStatus, String lastDirtyTimestamp,
                                final boolean isReplication) {
        if (super.statusUpdate(appName, id, newStatus, lastDirtyTimestamp, isReplication)) {
            replicateToPeers(Action.StatusUpdate, appName, id, null, newStatus, isReplication);
            return true;
        }
        return false;
    }

    /**
     * Removes the status override locally and, when that succeeds, replicates
     * the removal to the peers.
     *
     * @return true if the local removal succeeded, false otherwise.
     */
    @Override
    public boolean deleteStatusOverride(String appName, String id,
                                        InstanceStatus newStatus,
                                        String lastDirtyTimestamp,
                                        boolean isReplication) {
        if (super.deleteStatusOverride(appName, id, newStatus, lastDirtyTimestamp, isReplication)) {
            replicateToPeers(Action.DeleteStatusOverride, appName, id, null, null, isReplication);
            return true;
        }
        return false;
    }

    /**
     * Replicate the <em>ASG status</em> updates to peer eureka nodes. If this
     * event is a replication from other nodes, then it is not replicated to
     * other nodes.
     *
     * @param asgName the asg name for which the status needs to be replicated.
     * @param newStatus the {@link ASGStatus} information that needs to be replicated.
     * @param isReplication true if this is a replication event from other nodes, false otherwise.
     */
    @Override
    public void statusUpdate(final String asgName, final ASGStatus newStatus, final boolean isReplication) {
        // If this is replicated from an other node, do not try to replicate again.
        if (isReplication) {
            return;
        }
        for (final PeerEurekaNode node : peerEurekaNodes.getPeerEurekaNodes()) {
            replicateASGInfoToReplicaNodes(asgName, newStatus, node);
        }
    }

    /**
     * Tells whether leases may be expired. Expiration is always allowed when
     * self preservation is disabled; otherwise it is allowed only while a
     * positive renewal threshold exists and the renewals received in the last
     * minute are above it.
     */
    @Override
    public boolean isLeaseExpirationEnabled() {
        if (!isSelfPreservationModeEnabled()) {
            // The self preservation mode is disabled, hence allowing the instances to expire.
            return true;
        }
        return numberOfRenewsPerMinThreshold > 0 && getNumOfRenewsInLastMin() > numberOfRenewsPerMinThreshold;
    }

    /**
     * Checks to see if the self-preservation mode is enabled.
     *
     * <p>
     * This only reports the configuration flag
     * {@link EurekaServerConfig#shouldEnableSelfPreservation()}. Whether the
     * protection is currently in effect, i.e. the renewals
     * {@link #getNumOfRenewsInLastMin()} are not above the threshold
     * {@link #getNumOfRenewsPerMinThreshold()}, is decided in
     * {@link #isLeaseExpirationEnabled()}.
     * </p>
     *
     * @return true if the self-preservation mode is enabled, false otherwise.
     */
    @Override
    public boolean isSelfPreservationModeEnabled() {
        return serverConfig.shouldEnableSelfPreservation();
    }

    /**
     * Not implemented.
     *
     * @return always {@code null}.
     */
    @Override
    public InstanceInfo getNextServerFromEureka(String virtualHostname, boolean secure) {
        // TODO Auto-generated method stub
        return null;
    }

    /**
     * Updates the <em>renewal threshold</em> based on the number of
     * registerable instances currently known to the eureka client. The
     * threshold is the percentage specified in
     * {@link EurekaServerConfig#getRenewalPercentThreshold()} of the expected
     * renewals per minute (two per instance).
     *
     * <p>
     * The expectation is only replaced when the new expected renewal count is
     * above the current threshold, or when self preservation is disabled. Any
     * failure is logged and the previous values are kept.
     * </p>
     */
    private void updateRenewalThreshold() {
        try {
            Applications apps = eurekaClient.getApplications();
            int count = 0;
            for (Application app : apps.getRegisteredApplications()) {
                for (InstanceInfo instance : app.getInstances()) {
                    if (this.isRegisterable(instance)) {
                        ++count;
                    }
                }
            }
            synchronized (lock) {
                // Update threshold only if the threshold is greater than the
                // current expected threshold or if the self preservation is disabled.
                // The percentage is applied to the expected renewals, not to the
                // threshold, which already has the percentage applied.
                if ((count * 2) > (serverConfig.getRenewalPercentThreshold() * expectedNumberOfRenewsPerMin)
                        || (!this.isSelfPreservationModeEnabled())) {
                    this.expectedNumberOfRenewsPerMin = count * 2;
                    this.numberOfRenewsPerMinThreshold =
                            (int) ((count * 2) * serverConfig.getRenewalPercentThreshold());
                }
            }
            logger.info("Current renewal threshold is : {}", numberOfRenewsPerMinThreshold);
        } catch (Throwable e) {
            logger.error("Cannot update renewal threshold", e);
        }
    }

    /**
     * Gets the list of all {@link Applications} from the registry in sorted
     * lexical order of {@link Application#getName()}.
     *
     * @return the list of {@link Applications} in lexical order.
     */
    @Override
    public List<Application> getSortedApplications() {
        List<Application> apps = new ArrayList<Application>(getApplications().getRegisteredApplications());
        Collections.sort(apps, APP_COMPARATOR);
        return apps;
    }

    /**
     * Gets the number of <em>replications</em> received in the last minute.
     *
     * @return a long value representing the number of <em>replications</em> in the last minute.
     */
    @com.netflix.servo.annotations.Monitor(name = "numOfReplicationsInLastMin",
            description = "Number of total replications received in the last minute",
            type = com.netflix.servo.annotations.DataSourceType.GAUGE)
    public long getNumOfReplicationsInLastMin() {
        return numberOfReplicationsLastMin.getCount();
    }

    /**
     * Checks if the number of renewals is less than or equal to the
     * threshold. The check only reports 1 once the server has been opened for
     * traffic and {@link EurekaServerConfig#getWaitTimeInMsWhenSyncEmpty()}
     * has elapsed since then.
     *
     * @return 0 if the renewals are greater than threshold, 1 otherwise.
     */
    @com.netflix.servo.annotations.Monitor(name = "isBelowRenewThreshold", description = "0 = false, 1 = true",
            type = com.netflix.servo.annotations.DataSourceType.GAUGE)
    @Override
    public int isBelowRenewThresold() {
        if ((getNumOfRenewsInLastMin() <= numberOfRenewsPerMinThreshold)
                &&
                ((this.startupTime > 0)
                        && (System.currentTimeMillis() > this.startupTime
                        + (serverConfig.getWaitTimeInMsWhenSyncEmpty())))) {
            return 1;
        } else {
            return 0;
        }
    }

    /**
     * Checks if an instance is registerable in this region.
     *
     * <p>
     * NOTE(review): although the Amazon branch looks at the availability zone
     * of the instance, every path of this method currently returns
     * {@code true}, so instances from other regions are not rejected here.
     * Confirm whether that is intended before tightening the check, since
     * callers may rely on it.
     * </p>
     *
     * @param instanceInfo the instance info information of the instance
     * @return true, if it can be registered in this server, false otherwise.
     */
    public boolean isRegisterable(InstanceInfo instanceInfo) {
        DataCenterInfo datacenterInfo = instanceInfo.getDataCenterInfo();
        String serverRegion = clientConfig.getRegion();
        if (AmazonInfo.class.isInstance(datacenterInfo)) {
            AmazonInfo info = AmazonInfo.class.cast(instanceInfo.getDataCenterInfo());
            String availabilityZone = info.get(MetaDataKey.availabilityZone);
            // Can be null for dev environments in non-AWS data center
            if (availabilityZone == null && US_EAST_1.equalsIgnoreCase(serverRegion)) {
                return true;
            } else if ((availabilityZone != null) && (availabilityZone.contains(serverRegion))) {
                // If in the same region as server, then consider it registerable
                return true;
            }
        }
        return true; // Everything non-amazon is registrable.
    }

    /**
     * Replicates all eureka actions to peer eureka nodes except for replication
     * traffic to this node.
     *
     * <p>
     * Incoming replication events are only counted, never forwarded. Nothing
     * is replicated either while no peer nodes have been set.
     * </p>
     */
    private void replicateToPeers(Action action, String appName, String id,
                                  InstanceInfo info /* optional */,
                                  InstanceStatus newStatus /* optional */, boolean isReplication) {
        Stopwatch tracer = action.getTimer().start();
        try {
            if (isReplication) {
                numberOfReplicationsLastMin.increment();
            }
            // If it is a replication already, do not replicate again as this will create a poison replication
            if (peerEurekaNodes == null || isReplication) {
                return;
            }

            for (final PeerEurekaNode node : peerEurekaNodes.getPeerEurekaNodes()) {
                // If the url represents this host, do not replicate to yourself.
                if (peerEurekaNodes.isThisMyUrl(node.getServiceUrl())) {
                    continue;
                }
                replicateInstanceActionsToPeers(action, appName, id, info, newStatus, node);
            }
        } finally {
            tracer.stop();
        }
    }

    /**
     * Replicates a single instance change to one peer eureka node. A failure
     * is logged and does not propagate to the caller.
     */
    private void replicateInstanceActionsToPeers(Action action, String appName,
                                                 String id, InstanceInfo info, InstanceStatus newStatus,
                                                 PeerEurekaNode node) {
        try {
            InstanceInfo infoFromRegistry = null;
            CurrentRequestVersion.set(Version.V2);
            switch (action) {
                case Cancel:
                    node.cancel(appName, id);
                    break;
                case Heartbeat:
                    InstanceStatus overriddenStatus = overriddenInstanceStatusMap.get(id);
                    infoFromRegistry = getInstanceByAppAndId(appName, id, false);
                    node.heartbeat(appName, id, infoFromRegistry, overriddenStatus, false);
                    break;
                case Register:
                    node.register(info);
                    break;
                case StatusUpdate:
                    infoFromRegistry = getInstanceByAppAndId(appName, id, false);
                    node.statusUpdate(appName, id, newStatus, infoFromRegistry);
                    break;
                case DeleteStatusOverride:
                    infoFromRegistry = getInstanceByAppAndId(appName, id, false);
                    node.deleteStatusOverride(appName, id, infoFromRegistry);
                    break;
            }
        } catch (Throwable t) {
            logger.error("Cannot replicate information to {} for action {}", node.getServiceUrl(), action.name(), t);
        }
    }

    /**
     * Replicates an ASG status change to one peer eureka node. A failure is
     * logged and does not propagate to the caller.
     */
    private void replicateASGInfoToReplicaNodes(final String asgName,
                                                final ASGStatus newStatus, final PeerEurekaNode node) {
        CurrentRequestVersion.set(Version.V2);
        try {
            node.statusUpdate(asgName, newStatus);
        } catch (Throwable e) {
            logger.error("Cannot replicate ASG status information to {}", node.getServiceUrl(), e);
        }
    }

    /**
     * Exposes the local registry size of the parent registry as a servo gauge.
     */
    @Override
    @com.netflix.servo.annotations.Monitor(name = "localRegistrySize",
            description = "Current registry size", type = DataSourceType.GAUGE)
    public long getLocalRegistrySize() {
        return super.getLocalRegistrySize();
    }
}