/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.spi.loadbalancing.adaptive;

import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteException;
import org.apache.ignite.IgniteLogger;
import org.apache.ignite.cluster.ClusterNode;
import org.apache.ignite.compute.ComputeJob;
import org.apache.ignite.compute.ComputeTaskSession;
import org.apache.ignite.events.DiscoveryEvent;
import org.apache.ignite.events.Event;
import org.apache.ignite.events.JobEvent;
import org.apache.ignite.events.TaskEvent;
import org.apache.ignite.internal.managers.eventstorage.GridLocalEventListener;
import org.apache.ignite.internal.util.typedef.F;
import org.apache.ignite.internal.util.typedef.internal.A;
import org.apache.ignite.internal.util.typedef.internal.S;
import org.apache.ignite.lang.IgniteBiTuple;
import org.apache.ignite.lang.IgniteUuid;
import org.apache.ignite.resources.LoggerResource;
import org.apache.ignite.spi.IgniteSpiAdapter;
import org.apache.ignite.spi.IgniteSpiConfiguration;
import org.apache.ignite.spi.IgniteSpiContext;
import org.apache.ignite.spi.IgniteSpiException;
import org.apache.ignite.spi.IgniteSpiMBeanAdapter;
import org.apache.ignite.spi.IgniteSpiMultipleInstancesSupport;
import org.apache.ignite.spi.loadbalancing.LoadBalancingSpi;
import org.jetbrains.annotations.Nullable;
import org.jsr166.ConcurrentHashMap8;

import static org.apache.ignite.events.EventType.EVT_JOB_MAPPED;
import static org.apache.ignite.events.EventType.EVT_NODE_FAILED;
import static org.apache.ignite.events.EventType.EVT_NODE_JOINED;
import static org.apache.ignite.events.EventType.EVT_NODE_LEFT;
import static org.apache.ignite.events.EventType.EVT_NODE_METRICS_UPDATED;
import static org.apache.ignite.events.EventType.EVT_TASK_FAILED;
import static org.apache.ignite.events.EventType.EVT_TASK_FINISHED;

/**
 * Load balancing SPI that adapts to overall node performance. It
 * proportionally distributes more jobs to more performant nodes based
 * on a pluggable and dynamic node load probing.
 * <p>
 * <h1 class="header">Adaptive Node Probe</h1>
 * This SPI comes with a pluggable algorithm to calculate a node load
 * at any given point of time. The algorithm is defined by the
 * {@link AdaptiveLoadProbe} interface and the user is
 * free to provide custom implementations. By default the
 * {@link AdaptiveCpuLoadProbe} implementation is used
 * which distributes jobs to nodes based on average CPU load
 * on every node.
 * <p>
 * The following load probes are available with the product:
 * <ul>
 * <li>{@link AdaptiveCpuLoadProbe} - default</li>
 * <li>{@link AdaptiveProcessingTimeLoadProbe}</li>
 * <li>{@link AdaptiveJobCountLoadProbe}</li>
 * </ul>
 * Note that if {@link AdaptiveLoadProbe#getLoad(org.apache.ignite.cluster.ClusterNode, int)} returns a value of {@code 0},
 * then the implementation will assume that the load value is simply not available and
 * will try to calculate an average of load values for other nodes. If such an
 * average cannot be obtained (all node load values are {@code 0}), then a value
 * of {@code 1} will be used.
 * <p>
 * When working with node metrics, take into account that all averages are
 * calculated over metrics history size defined by {@link org.apache.ignite.configuration.IgniteConfiguration#getMetricsExpireTime()}
 * and {@link org.apache.ignite.configuration.IgniteConfiguration#getMetricsHistorySize()} grid configuration parameters.
 * Generally the larger these configuration parameter values are, the more precise the metrics are.
 * You should tune these values based on the level of accuracy needed vs. the additional memory
 * that would be required for storing metrics.
 * <p>
 * You should also keep in mind that metrics for remote nodes are delayed (usually by the metrics
 * update frequency). So if it is acceptable in your environment, set the metrics update frequency
 * to be more in line with job execution time. Generally, the more often metrics updates
 * are exchanged between nodes, the more precise the metrics are. However, you should keep in mind that if
 * metrics updates are exchanged too often then it may create unnecessary traffic in the network.
 * Metrics update frequency can be configured via the underlying
 * {@link org.apache.ignite.configuration.IgniteConfiguration} used in your grid.
 * <p>
 * Here is an example of how probing can be implemented to use
 * number of active and waiting jobs as probing mechanism:
 * <pre name="code" class="java">
 * public class FooBarLoadProbe implements AdaptiveLoadProbe {
 *     // Flag indicating whether to use average value or current.
 *     private boolean useAvg = true;
 *
 *     public FooBarLoadProbe(boolean useAvg) {
 *         this.useAvg = useAvg;
 *     }
 *
 *     // Calculate load based on number of active and waiting jobs.
 *     public double getLoad(ClusterNode node, int jobsSentSinceLastUpdate) {
 *         GridNodeMetrics metrics = node.getMetrics();
 *
 *         if (useAvg) {
 *             double load = metrics.getAverageActiveJobs() + metrics.getAverageWaitingJobs();
 *
 *             if (load &gt; 0) {
 *                 return load;
 *             }
 *         }
 *
 *         return metrics.getCurrentActiveJobs() + metrics.getCurrentWaitingJobs();
 *     }
 * }
 * </pre>
 * <h1 class="header">Which Node Probe To Use</h1>
 * There is no correct answer here. Every single node probe will work better or worse in
 * different environments. CPU load probe (default option) is the safest approach to start
 * with as it simply attempts to utilize every CPU on the grid to the maximum. However, you should
 * experiment with other probes by executing load tests in your environment and observing
 * which probe gives you best performance and load balancing.
 * <p>
 * <h1 class="header">Task Coding Example</h1>
 * If you are using {@link org.apache.ignite.compute.ComputeTaskSplitAdapter} then load balancing logic
 * is transparent to your code and is handled automatically by the adapter.
 * Here is an example of how your task will look:
 * <pre name="code" class="java">
 * public class MyFooBarTask extends ComputeTaskSplitAdapter&lt;Object, Object&gt; {
 *     &#64;Override
 *     protected Collection&lt;? extends ComputeJob&gt; split(int gridSize, Object arg) throws IgniteCheckedException {
 *         List&lt;MyFooBarJob&gt; jobs = new ArrayList&lt;MyFooBarJob&gt;(gridSize);
 *
 *         for (int i = 0; i &lt; gridSize; i++) {
 *             jobs.add(new MyFooBarJob(arg));
 *         }
 *
 *         // Node assignment via load balancer
 *         // happens automatically.
 *         return jobs;
 *     }
 *     ...
 * }
 * </pre>
 * If you need more fine-grained control over how some jobs within task get mapped to a node
 * and use affinity load balancing for some other jobs within task, then you should use
 * {@link org.apache.ignite.compute.ComputeTaskAdapter}. Here is an example of how your task will look. Note that in this
 * case we manually inject load balancer and use it to pick the best node. Doing it in
 * such way would allow user to map some jobs manually and for others use load balancer.
 * <pre name="code" class="java">
 * public class MyFooBarTask extends ComputeTaskAdapter&lt;String, String&gt; {
 *     // Inject load balancer.
 *     &#64;LoadBalancerResource
 *     ComputeLoadBalancer balancer;
 *
 *     // Map jobs to grid nodes.
 *     public Map&lt;? extends ComputeJob, ClusterNode&gt; map(List&lt;ClusterNode&gt; subgrid, String arg) throws IgniteCheckedException {
 *         Map&lt;MyFooBarJob, ClusterNode&gt; jobs = new HashMap&lt;MyFooBarJob, ClusterNode&gt;(subgrid.size());
 *
 *         // In more complex cases, you can actually do
 *         // more complicated assignments of jobs to nodes.
 *         for (int i = 0; i &lt; subgrid.size(); i++) {
 *             // Pick the next best balanced node for the job.
 *             jobs.put(new MyFooBarJob(arg), balancer.getBalancedNode());
 *         }
 *
 *         return jobs;
 *     }
 *
 *     // Aggregate results into one compound result.
 *     public String reduce(List&lt;ComputeJobResult&gt; results) throws IgniteCheckedException {
 *         // For the purpose of this example we simply
 *         // concatenate string representation of every
 *         // job result
 *         StringBuilder buf = new StringBuilder();
 *
 *         for (ComputeJobResult res : results) {
 *             // Append string representation of result
 *             // returned by every job.
 *             buf.append(res.getData().toString());
 *         }
 *
 *         return buf.toString();
 *     }
 * }
 * </pre>
 * <p>
 * <h1 class="header">Configuration</h1>
 * In order to use this load balancer, you should configure your grid instance
 * to use {@code AdaptiveLoadBalancingSpi} either from Spring XML file or
 * directly. The following configuration parameters are supported:
 * <h2 class="header">Mandatory</h2>
 * This SPI has no mandatory configuration parameters.
 * <h2 class="header">Optional</h2>
 * This SPI has the following optional configuration parameters:
 * <ul>
 * <li>
 *      Adaptive node load probing implementation (see {@link #setLoadProbe(AdaptiveLoadProbe)}).
 *      This configuration parameter supplies a custom algorithm for probing a node's load.
 *      By default, {@link AdaptiveCpuLoadProbe} implementation is used which
 *      takes every node's CPU load and tries to send proportionally more jobs to less loaded nodes.
 * </li>
 * </ul>
 * <p>
 * Below is Java configuration example:
 * <pre name="code" class="java">
 * AdaptiveLoadBalancingSpi spi = new AdaptiveLoadBalancingSpi();
 *
 * // Configure probe to use latest job execution time vs. average.
 * AdaptiveProcessingTimeLoadProbe probe = new AdaptiveProcessingTimeLoadProbe(false);
 *
 * spi.setLoadProbe(probe);
 *
 * IgniteConfiguration cfg = new IgniteConfiguration();
 *
 * // Override default load balancing SPI.
 * cfg.setLoadBalancingSpi(spi);
 *
 * // Starts grid.
 * G.start(cfg);
 * </pre>
 * Here is how you can configure {@code AdaptiveLoadBalancingSpi} using Spring XML configuration:
 * <pre name="code" class="xml">
 * &lt;property name="loadBalancingSpi"&gt;
 *     &lt;bean class="org.apache.ignite.spi.loadbalancing.adaptive.AdaptiveLoadBalancingSpi"&gt;
 *         &lt;property name="loadProbe"&gt;
 *             &lt;bean class="org.apache.ignite.spi.loadbalancing.adaptive.AdaptiveProcessingTimeLoadProbe"&gt;
 *                 &lt;constructor-arg value="false"/&gt;
 *             &lt;/bean&gt;
 *         &lt;/property&gt;
 *     &lt;/bean&gt;
 * &lt;/property&gt;
 * </pre>
 * <p>
 * <img src="http://ignite.apache.org/images/spring-small.png">
 * <br>
 * For information about Spring framework visit <a href="http://www.springframework.org/">www.springframework.org</a>
 */
@IgniteSpiMultipleInstancesSupport(true)
public class AdaptiveLoadBalancingSpi extends IgniteSpiAdapter implements LoadBalancingSpi {
    /** Random number generator. */
    private static final Random RAND = new Random();

    /** Grid logger. */
    @LoggerResource
    private IgniteLogger log;

    /** Node load probe; the CPU load probe is used unless {@link #setLoadProbe(AdaptiveLoadProbe)} is called. */
    private AdaptiveLoadProbe probe = new AdaptiveCpuLoadProbe();

    /** Local event listener to listen to task completion events. */
    private GridLocalEventListener evtLsnr;

    /** Task topologies. First pair value indicates whether or not jobs have been mapped. */
    private final ConcurrentMap<IgniteUuid, IgniteBiTuple<Boolean, WeightedTopology>> taskTops =
        new ConcurrentHashMap8<>();

    /** Per-node counters of jobs picked since the counter was last reset. Accessed under {@link #rwLock}. */
    private final Map<UUID, AtomicInteger> nodeJobs = new HashMap<>();

    /** Lock protecting {@link #nodeJobs}. */
    private final ReadWriteLock rwLock = new ReentrantReadWriteLock();

    /**
     * Gets text description of current load probing implementation used.
     *
     * @return Text description of current load probing implementation used.
     */
    public String getLoadProbeFormatted() {
        return probe.toString();
    }

    /**
     * Sets implementation of node load probe. By default {@link AdaptiveCpuLoadProbe}
     * is used.
     *
     * @param probe Implementation of node load probe, must not be {@code null}.
     * @return {@code this} for chaining.
     */
    @IgniteSpiConfiguration(optional = true)
    public AdaptiveLoadBalancingSpi setLoadProbe(AdaptiveLoadProbe probe) {
        A.ensure(probe != null, "probe != null");

        this.probe = probe;

        return this;
    }

    /** {@inheritDoc} */
    @Override public void spiStart(@Nullable String igniteInstanceName) throws IgniteSpiException {
        startStopwatch();

        assertParameter(probe != null, "loadProbe != null");

        if (log.isDebugEnabled())
            log.debug(configInfo("loadProbe", probe));

        registerMBean(igniteInstanceName, new AdaptiveLoadBalancingSpiMBeanImpl(this),
            AdaptiveLoadBalancingSpiMBean.class);

        // Ack ok start.
        if (log.isDebugEnabled())
            log.debug(startInfo());
    }

    /** {@inheritDoc} */
    @Override public void spiStop() throws IgniteSpiException {
        rwLock.writeLock().lock();

        try {
            nodeJobs.clear();
        }
        finally {
            rwLock.writeLock().unlock();
        }

        unregisterMBean();

        // Ack ok stop.
        if (log.isDebugEnabled())
            log.debug(stopInfo());
    }

    /** {@inheritDoc} */
    @Override protected void onContextInitialized0(IgniteSpiContext spiCtx) throws IgniteSpiException {
        getSpiContext().addLocalEventListener(evtLsnr = new GridLocalEventListener() {
            @Override public void onEvent(Event evt) {
                switch (evt.type()) {
                    case EVT_TASK_FINISHED:
                    case EVT_TASK_FAILED: {
                        TaskEvent taskEvt = (TaskEvent)evt;

                        taskTops.remove(taskEvt.taskSessionId());

                        if (log.isDebugEnabled())
                            log.debug("Removed task topology from topology cache for session: " +
                                taskEvt.taskSessionId());

                        break;
                    }

                    case EVT_JOB_MAPPED: {
                        // We should keep topology and use cache in ComputeTask#map() method to
                        // avoid O(n*n/2) complexity, after that we can drop caches.
                        // Here we set mapped property and later cache will be ignored
                        JobEvent jobEvt = (JobEvent)evt;

                        IgniteBiTuple<Boolean, WeightedTopology> weightedTop = taskTops.get(jobEvt.taskSessionId());

                        if (weightedTop != null)
                            weightedTop.set1(true);

                        if (log.isDebugEnabled())
                            log.debug("Job has been mapped. Ignore cache for session: " + jobEvt.taskSessionId());

                        break;
                    }

                    case EVT_NODE_METRICS_UPDATED:
                    case EVT_NODE_FAILED:
                    case EVT_NODE_JOINED:
                    case EVT_NODE_LEFT: {
                        DiscoveryEvent discoEvt = (DiscoveryEvent)evt;

                        rwLock.writeLock().lock();

                        try {
                            switch (evt.type()) {
                                case EVT_NODE_JOINED: {
                                    nodeJobs.put(discoEvt.eventNode().id(), new AtomicInteger(0));

                                    break;
                                }

                                case EVT_NODE_LEFT:
                                case EVT_NODE_FAILED: {
                                    nodeJobs.remove(discoEvt.eventNode().id());

                                    break;
                                }

                                case EVT_NODE_METRICS_UPDATED: {
                                    // Reset counter.
                                    nodeJobs.put(discoEvt.eventNode().id(), new AtomicInteger(0));

                                    break;
                                }
                            }
                        }
                        finally {
                            rwLock.writeLock().unlock();
                        }
                    }
                }
            }
        },
            EVT_NODE_METRICS_UPDATED,
            EVT_NODE_FAILED,
            EVT_NODE_JOINED,
            EVT_NODE_LEFT,
            EVT_TASK_FINISHED,
            EVT_TASK_FAILED,
            EVT_JOB_MAPPED
        );

        // Put all known nodes.
        rwLock.writeLock().lock();

        try {
            for (ClusterNode node : getSpiContext().nodes())
                nodeJobs.put(node.id(), new AtomicInteger(0));
        }
        finally {
            rwLock.writeLock().unlock();
        }
    }

    /** {@inheritDoc} */
    @Override protected void onContextDestroyed0() {
        if (evtLsnr != null) {
            IgniteSpiContext ctx = getSpiContext();

            if (ctx != null)
                ctx.removeLocalEventListener(evtLsnr);
        }
    }

    /** {@inheritDoc} */
    @Override public ClusterNode getBalancedNode(ComputeTaskSession ses, List<ClusterNode> top, ComputeJob job) {
        A.notNull(ses, "ses");
        A.notNull(top, "top");
        A.notNull(job, "job");

        // Fail fast on an empty topology: otherwise an empty weighted topology would be cached
        // for the session and node picking would fail later on an empty map.
        A.ensure(!top.isEmpty(), "!top.isEmpty()");

        IgniteBiTuple<Boolean, WeightedTopology> weightedTop = taskTops.get(ses.getId());

        // Create new cached topology if there is no one. Do not
        // use cached topology after task has been mapped.
        if (weightedTop == null)
            // Called from ComputeTask#map(). Put new topology and false as not mapped yet.
            taskTops.put(ses.getId(), weightedTop = F.t(false, new WeightedTopology(top)));
        // We have topology - check if task has been mapped.
        else if (weightedTop.get1())
            // Do not use cache after ComputeTask#map().
            return new WeightedTopology(top).pickWeightedNode();

        return weightedTop.get2().pickWeightedNode();
    }

    /**
     * Calculates node load based on set probe.
     *
     * @param top List of all nodes.
     * @param node Node to get load for.
     * @return Node load.
     * @throws IgniteException If returned load is negative.
     */
    @SuppressWarnings({"TooBroadScope"})
    private double getLoad(Collection<ClusterNode> top, ClusterNode node) throws IgniteException {
        assert !F.isEmpty(top);

        int jobsSentSinceLastUpdate;

        rwLock.readLock().lock();

        try {
            AtomicInteger cnt = nodeJobs.get(node.id());

            // No counter is registered for the node - report zero jobs sent.
            jobsSentSinceLastUpdate = cnt == null ? 0 : cnt.get();
        }
        finally {
            rwLock.readLock().unlock();
        }

        double load = probe.getLoad(node, jobsSentSinceLastUpdate);

        if (load < 0)
            throw new IgniteException("Failed to obtain non-negative load from adaptive load probe: " + load);

        return load;
    }

    /**
     * Holder for weighted topology.
     */
    private class WeightedTopology {
        /** Topology sorted by weight. */
        private final SortedMap<Double, ClusterNode> circle = new TreeMap<>();

        /**
         * Builds the weighted circle: every node is keyed by its cumulative normalized weight,
         * the last node always being keyed by {@code 1.0}.
         *
         * @param top Task topology.
         * @throws IgniteException If any load was negative.
         */
        WeightedTopology(List<ClusterNode> top) throws IgniteException {
            assert !F.isEmpty(top);

            double totalLoad = 0;

            // We need to cache loads here to avoid calls later as load might be
            // changed between the calls.
            double[] nums = new double[top.size()];

            int zeroCnt = 0;

            // Compute loads.
            for (int i = 0; i < top.size(); i++) {
                double load = getLoad(top, top.get(i));

                nums[i] = load;

                if (load == 0)
                    zeroCnt++;

                totalLoad += load;
            }

            // Take care of zero loads.
            if (zeroCnt > 0) {
                double newTotal = totalLoad;

                int nonZeroCnt = top.size() - zeroCnt;

                for (int i = 0; i < nums.length; i++) {
                    double load = nums[i];

                    if (load == 0) {
                        // Substitute the average of the non-zero loads, if there are any.
                        if (nonZeroCnt > 0)
                            load = totalLoad / nonZeroCnt;

                        // All loads are zero - fall back to 1.
                        if (load == 0)
                            load = 1;

                        nums[i] = load;

                        newTotal += load;
                    }
                }

                totalLoad = newTotal;
            }

            double totalWeight = 0;

            // Calculate weights and total weight.
            for (int i = 0; i < nums.length; i++) {
                assert nums[i] > 0 : "Invalid load: " + nums[i];

                double weight = totalLoad / nums[i];

                // Convert to weight.
                nums[i] = weight;

                totalWeight += weight;
            }

            double weight = 0;

            // Enforce range from 0 to 1.
            for (int i = 0; i < nums.length; i++) {
                weight = i == nums.length - 1 ? 1.0d : weight + nums[i] / totalWeight;

                assert weight < 2 : "Invalid weight: " + weight;

                // Complexity of this put is O(logN).
                circle.put(weight, top.get(i));
            }
        }

        /**
         * Gets weighted node in random fashion and increments the job counter of the picked node,
         * if one is registered.
         *
         * @return Weighted node.
         */
        ClusterNode pickWeightedNode() {
            double weight = RAND.nextDouble();

            SortedMap<Double, ClusterNode> pick = circle.tailMap(weight);

            ClusterNode node = pick.get(pick.firstKey());

            rwLock.readLock().lock();

            try {
                AtomicInteger cnt = nodeJobs.get(node.id());

                if (cnt != null)
                    cnt.incrementAndGet();
            }
            finally {
                rwLock.readLock().unlock();
            }

            return node;
        }
    }

    /** {@inheritDoc} */
    @Override public AdaptiveLoadBalancingSpi setName(String name) {
        super.setName(name);

        return this;
    }

    /** {@inheritDoc} */
    @Override public String toString() {
        return S.toString(AdaptiveLoadBalancingSpi.class, this);
    }

    /**
     * MBean implementation for AdaptiveLoadBalancingSpi.
     */
    private class AdaptiveLoadBalancingSpiMBeanImpl extends IgniteSpiMBeanAdapter
        implements AdaptiveLoadBalancingSpiMBean {
        /**
         * @param spiAdapter SPI adapter passed to the parent MBean adapter.
         */
        AdaptiveLoadBalancingSpiMBeanImpl(IgniteSpiAdapter spiAdapter) {
            super(spiAdapter);
        }

        /** {@inheritDoc} */
        @Override public String getLoadProbeFormatted() {
            return AdaptiveLoadBalancingSpi.this.getLoadProbeFormatted();
        }
    }
}