/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.spi.failover.always;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteException;
import org.apache.ignite.IgniteLogger;
import org.apache.ignite.cluster.ClusterNode;
import org.apache.ignite.internal.IgniteEx;
import org.apache.ignite.internal.managers.failover.GridFailoverContextImpl;
import org.apache.ignite.internal.util.typedef.F;
import org.apache.ignite.internal.util.typedef.internal.S;
import org.apache.ignite.internal.util.typedef.internal.U;
import org.apache.ignite.resources.LoggerResource;
import org.apache.ignite.spi.IgniteSpiAdapter;
import org.apache.ignite.spi.IgniteSpiConfiguration;
import org.apache.ignite.spi.IgniteSpiConsistencyChecked;
import org.apache.ignite.spi.IgniteSpiException;
import org.apache.ignite.spi.IgniteSpiMBeanAdapter;
import org.apache.ignite.spi.IgniteSpiMultipleInstancesSupport;
import org.apache.ignite.spi.failover.FailoverContext;
import org.apache.ignite.spi.failover.FailoverSpi;
/**
 * Failover SPI that always reroutes a failed job to another node.
 * <p>
 * The IDs of the nodes a job has already failed on are tracked in the job context
 * (see {@link #FAILED_NODE_LIST_ATTR}) and those nodes are excluded from the topology
 * before the load balancer is asked for a new node. If the topology is empty, no
 * eligible node remains, the load balancer returns no node, or the maximum number of
 * failover attempts is reached, then the job will not be failed over and {@code null}
 * will be returned.
 * <p>
 * Jobs bound to a cache partition ({@link FailoverContext#partition()} is non-negative)
 * are not load balanced; they are remapped to the node the partition maps to, with the
 * number of attempts tracked under {@link #AFFINITY_CALL_ATTEMPT}.
 * <p>
 * <h1 class="header">Configuration</h1>
 * This SPI is default failover SPI and does not have to be explicitly
 * configured unless configuration parameters need to be changed.
 * <h2 class="header">Mandatory</h2>
 * This SPI has no mandatory configuration parameters.
 * <h2 class="header">Optional</h2>
 * This SPI has following optional configuration parameters:
 * <ul>
 *      <li>
 *          Maximum failover attempts for a single job (see {@link #setMaximumFailoverAttempts(int)}).
 *          If maximum failover attempts is reached, then job will not be failed-over and,
 *          hence, will fail.
 *      </li>
 * </ul>
 * Here is a Java example how to configure grid with {@link AlwaysFailoverSpi} failover SPI.
 * <pre name="code" class="java">
 * AlwaysFailoverSpi spi = new AlwaysFailoverSpi();
 *
 * // Override maximum failover attempts.
 * spi.setMaximumFailoverAttempts(5);
 *
 * IgniteConfiguration cfg = new IgniteConfiguration();
 *
 * // Override default failover SPI.
 * cfg.setFailoverSpi(spi);
 *
 * // Starts grid.
 * G.start(cfg);
 * </pre>
 * Here is an example of how to configure {@code AlwaysFailoverSpi} from Spring XML configuration file.
 * <pre name="code" class="xml">
 * &lt;property name="failoverSpi"&gt;
 *     &lt;bean class="org.apache.ignite.spi.failover.always.AlwaysFailoverSpi"&gt;
 *         &lt;property name="maximumFailoverAttempts" value="5"/&gt;
 *     &lt;/bean&gt;
 * &lt;/property&gt;
 * </pre>
 * <p>
 * <img src="http://ignite.apache.org/images/spring-small.png">
 * <br>
 * For information about Spring framework visit <a href="http://www.springframework.org/">www.springframework.org</a>
 * @see org.apache.ignite.spi.failover.FailoverSpi
 */
@IgniteSpiMultipleInstancesSupport(true)
@IgniteSpiConsistencyChecked(optional = true)
public class AlwaysFailoverSpi extends IgniteSpiAdapter implements FailoverSpi {
    /** Maximum number of attempts to execute a failed job on another node (default is {@code 5}). */
    public static final int DFLT_MAX_FAILOVER_ATTEMPTS = 5;

    /**
     * Name of job context attribute containing all nodes a job failed on.
     *
     * @see org.apache.ignite.compute.ComputeJobContext
     */
    public static final String FAILED_NODE_LIST_ATTR = "gg:failover:failednodelist";

    /**
     * Name of job context attribute containing number of affinity call attempts.
     */
    public static final String AFFINITY_CALL_ATTEMPT = "ignite:failover:affinitycallattempt";

    /** Maximum attempts attribute key should be the same on all nodes. */
    public static final String MAX_FAILOVER_ATTEMPT_ATTR = "gg:failover:maxattempts";

    /** Injected grid logger. */
    @LoggerResource
    private IgniteLogger log;

    /** Maximum number of attempts to execute a failed job on another node. */
    private int maxFailoverAttempts = DFLT_MAX_FAILOVER_ATTEMPTS;

    /**
     * Number of jobs that were failed over through the load balancer.
     * Updated by {@link #failover(FailoverContext, List)} with a plain, unsynchronized increment.
     */
    private int totalFailoverJobs;

    /**
     * See {@link #setMaximumFailoverAttempts(int)}.
     *
     * @return Maximum number of attempts to execute a failed job on another node.
     */
    public int getMaximumFailoverAttempts() {
        return maxFailoverAttempts;
    }

    /**
     * Sets maximum number of attempts to execute a failed job on another node.
     * If not specified, {@link #DFLT_MAX_FAILOVER_ATTEMPTS} value will be used.
     * <p>
     * The value is not validated here; {@link #spiStart(String)} asserts that it is non-negative.
     *
     * @param maxFailoverAttempts Maximum number of attempts to execute a failed job on another node.
     * @return {@code this} for chaining.
     */
    @IgniteSpiConfiguration(optional = true)
    public AlwaysFailoverSpi setMaximumFailoverAttempts(int maxFailoverAttempts) {
        this.maxFailoverAttempts = maxFailoverAttempts;

        return this;
    }

    /**
     * Get total number of jobs that were failed over.
     * <p>
     * Only failovers that went through the load balancer are counted; remaps of
     * partition-bound (affinity) jobs do not increment this counter.
     *
     * @return Total number of failed over jobs.
     */
    public int getTotalFailoverJobsCount() {
        return totalFailoverJobs;
    }

    /** {@inheritDoc} */
    @Override public Map<String, Object> getNodeAttributes() throws IgniteSpiException {
        // Publish the configured limit under the SPI-qualified name so it can be consistency-checked.
        return F.<String, Object>asMap(createSpiAttributeName(MAX_FAILOVER_ATTEMPT_ATTR), maxFailoverAttempts);
    }

    /** {@inheritDoc} */
    @Override public void spiStart(String igniteInstanceName) throws IgniteSpiException {
        // Start SPI start stopwatch.
        startStopwatch();

        assertParameter(maxFailoverAttempts >= 0, "maxFailoverAttempts >= 0");

        if (log.isDebugEnabled())
            log.debug(configInfo("maximumFailoverAttempts", maxFailoverAttempts));

        registerMBean(igniteInstanceName, new AlwaysFailoverSpiMBeanImpl(this), AlwaysFailoverSpiMBean.class);

        // Ack ok start.
        if (log.isDebugEnabled())
            log.debug(startInfo());
    }

    /** {@inheritDoc} */
    @Override public void spiStop() throws IgniteSpiException {
        unregisterMBean();

        // Ack ok stop.
        if (log.isDebugEnabled())
            log.debug(stopInfo());
    }

    /**
     * {@inheritDoc}
     * <p>
     * Returns {@code null} (job is not failed over) when the topology is empty, the
     * maximum number of attempts is reached, every topology node is one the job has
     * already failed on, the load balancer yields no node, or node selection throws.
     */
    @SuppressWarnings("unchecked")
    @Override public ClusterNode failover(FailoverContext ctx, List<ClusterNode> top) {
        assert ctx != null;
        assert top != null;

        if (log.isDebugEnabled())
            log.debug("Received failed job result: " + ctx.getJobResult());

        if (top.isEmpty()) {
            U.warn(log, "Received empty topology for failover and is forced to fail.");

            // Nowhere to failover to.
            return null;
        }

        // Partition-bound jobs are remapped by partition instead of being load balanced.
        if (ctx.partition() >= 0)
            return failoverAffinityCall(ctx);

        Collection<UUID> failedNodes = ctx.getJobResult().getJobContext().getAttribute(FAILED_NODE_LIST_ATTR);

        if (failedNodes == null)
            failedNodes = U.newHashSet(1);

        // Each previous failover recorded one node, so the collection size is the attempt count.
        int failoverCnt = failedNodes.size();

        if (failoverCnt >= maxFailoverAttempts) {
            U.warn(log, "Job failover failed because number of maximum failover attempts is exceeded [failedJob=" +
                ctx.getJobResult().getJob() + ", maxFailoverAttempts=" + maxFailoverAttempts + ']');

            return null;
        }

        failedNodes.add(ctx.getJobResult().getNode().id());

        // Copy of the topology without the nodes the job has already failed on.
        List<ClusterNode> newTop = new ArrayList<>(top.size());

        for (ClusterNode node : top) {
            if (!failedNodes.contains(node.id()))
                newTop.add(node);
        }

        if (newTop.isEmpty()) {
            U.warn(log, "Received topology with only nodes that job had failed on (forced to fail) [failedNodes=" +
                failedNodes + ']');

            // Nowhere to failover to.
            return null;
        }

        try {
            ClusterNode node = ctx.getBalancedNode(newTop);

            if (node == null)
                U.warn(log, "Load balancer returned null node for topology: " + newTop);
            else {
                // Store the updated failed-node list so the next failover of this job sees it.
                ctx.getJobResult().getJobContext().setAttribute(FAILED_NODE_LIST_ATTR, failedNodes);

                totalFailoverJobs++;

                U.warn(log, "Failed over job to a new node [newNode=" + node.id() +
                    ", oldNode=" + ctx.getJobResult().getNode().id() +
                    ", sesId=" + ctx.getTaskSession().getId() +
                    ", job=" + ctx.getJobResult().getJob() +
                    ", jobCtx=" + ctx.getJobResult().getJobContext() +
                    ", task=" + ctx.getTaskSession().getTaskName() + ']');
            }

            return node;
        }
        catch (IgniteException e) {
            U.error(log, "Failed to get next balanced node for failover: " + ctx, e);

            return null;
        }
    }

    /**
     * Fails over a job that is bound to a cache partition by remapping the partition to a node.
     * <p>
     * The attempt number is kept in the job context under {@link #AFFINITY_CALL_ATTEMPT} and is
     * treated as {@code 1} when absent.
     * <p>
     * NOTE(review): because the counter starts at {@code 1}, this path permits at most
     * {@code maxFailoverAttempts - 1} remaps, while the load-balanced path compares the number
     * of already failed nodes (starting at {@code 0}) against the same limit. Confirm the
     * asymmetry is intended.
     *
     * @param ctx Failover context whose {@link FailoverContext#partition()} is non-negative.
     * @return Node returned by the partition mapping, or {@code null} if the attempt limit is
     *      reached or the mapping fails with {@link IgniteCheckedException}.
     */
    private ClusterNode failoverAffinityCall(FailoverContext ctx) {
        Integer storedAttempt = ctx.getJobResult().getJobContext().getAttribute(AFFINITY_CALL_ATTEMPT);

        int affCallAttempt = storedAttempt == null ? 1 : storedAttempt;

        if (maxFailoverAttempts <= affCallAttempt) {
            U.warn(log, "Job failover failed because number of maximum failover attempts for affinity call" +
                " is exceeded [failedJob=" + ctx.getJobResult().getJob() + ", maxFailoverAttempts=" +
                maxFailoverAttempts + ']');

            return null;
        }

        ctx.getJobResult().getJobContext().setAttribute(AFFINITY_CALL_ATTEMPT, affCallAttempt + 1);

        try {
            return ((IgniteEx)ignite).context().affinity().mapPartitionToNode(ctx.affinityCacheName(), ctx.partition(),
                ((GridFailoverContextImpl)ctx).affinityTopologyVersion());
        }
        catch (IgniteCheckedException e) {
            U.error(log, "Failed to get map job to node on failover: " + ctx, e);

            return null;
        }
    }

    /** {@inheritDoc} */
    @Override protected List<String> getConsistentAttributeNames() {
        return Collections.singletonList(createSpiAttributeName(MAX_FAILOVER_ATTEMPT_ATTR));
    }

    /** {@inheritDoc} */
    @Override public AlwaysFailoverSpi setName(String name) {
        super.setName(name);

        return this;
    }

    /** {@inheritDoc} */
    @Override public String toString() {
        return S.toString(AlwaysFailoverSpi.class, this);
    }

    /**
     * MBean implementation for AlwaysFailoverSpi. Delegates every getter to the enclosing SPI instance.
     */
    private class AlwaysFailoverSpiMBeanImpl extends IgniteSpiMBeanAdapter implements AlwaysFailoverSpiMBean {
        /**
         * @param spiAdapter SPI adapter passed to the base MBean adapter.
         */
        AlwaysFailoverSpiMBeanImpl(IgniteSpiAdapter spiAdapter) {
            super(spiAdapter);
        }

        /** {@inheritDoc} */
        @Override public int getMaximumFailoverAttempts() {
            return AlwaysFailoverSpi.this.getMaximumFailoverAttempts();
        }

        /** {@inheritDoc} */
        @Override public int getTotalFailoverJobsCount() {
            return AlwaysFailoverSpi.this.getTotalFailoverJobsCount();
        }
    }
}