/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.ignite.spi.discovery.tcp; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.Socket; import java.net.SocketTimeoutException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.UUID; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import org.apache.ignite.Ignite; import org.apache.ignite.IgniteCheckedException; import org.apache.ignite.events.DiscoveryEvent; import org.apache.ignite.events.Event; import org.apache.ignite.internal.util.typedef.G; import org.apache.ignite.internal.util.typedef.X; import org.apache.ignite.internal.util.typedef.internal.U; import org.apache.ignite.lang.IgnitePredicate; import org.apache.ignite.spi.discovery.tcp.internal.TcpDiscoveryNode; import org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder; import org.apache.ignite.spi.discovery.tcp.messages.TcpDiscoveryAbstractMessage; import org.apache.ignite.spi.discovery.tcp.messages.TcpDiscoveryPingRequest; import org.jetbrains.annotations.Nullable; import static org.apache.ignite.events.EventType.EVT_NODE_FAILED; /** * Client-based discovery SPI test with failure detection timeout enabled. */ public class TcpClientDiscoverySpiFailureTimeoutSelfTest extends TcpClientDiscoverySpiSelfTest { /** */ private final static int FAILURE_AWAIT_TIME = 7_000; /** */ private final static long FAILURE_THRESHOLD = 10_000; /** */ private final static long CLIENT_FAILURE_THRESHOLD = 30_000; /** Failure detection timeout for nodes configuration. */ private static long failureThreshold = FAILURE_THRESHOLD; /** Client failure detection timeout for nodes configuration. */ private static long clientFailureThreshold = CLIENT_FAILURE_THRESHOLD; /** */ private static boolean useTestSpi; /** {@inheritDoc} */ @Override protected boolean useFailureDetectionTimeout() { return true; } /** {@inheritDoc} */ @Override protected long clientFailureDetectionTimeout() { return clientFailureThreshold; } /** {@inheritDoc} */ @Override protected long failureDetectionTimeout() { return failureThreshold; } /** {@inheritDoc} */ @Override protected long awaitTime() { return failureDetectionTimeout() + FAILURE_AWAIT_TIME; } /** {@inheritDoc} */ @Override protected long awaitClientTime() { return clientFailureDetectionTimeout() + FAILURE_AWAIT_TIME; } /** {@inheritDoc} */ @Override protected TcpDiscoverySpi getDiscoverySpi() { return useTestSpi ? new TestTcpDiscoverySpi2() : super.getDiscoverySpi(); } /** * @throws Exception in case of error. */ public void testFailureDetectionTimeoutEnabled() throws Exception { startServerNodes(1); startClientNodes(1); checkNodes(1, 1); assertTrue(((TcpDiscoverySpi)(G.ignite("server-0").configuration().getDiscoverySpi())). failureDetectionTimeoutEnabled()); assertEquals(failureDetectionTimeout(), ((TcpDiscoverySpi)(G.ignite("server-0").configuration().getDiscoverySpi())).failureDetectionTimeout()); assertTrue(((TcpDiscoverySpi)(G.ignite("client-0").configuration().getDiscoverySpi())). failureDetectionTimeoutEnabled()); assertEquals(failureDetectionTimeout(), ((TcpDiscoverySpi)(G.ignite("client-0").configuration().getDiscoverySpi())).failureDetectionTimeout()); } /** * @throws Exception in case of error. */ public void testFailureTimeoutWorkabilityAvgTimeout() throws Exception { failureThreshold = 3000; try { checkFailureThresholdWorkability(); } finally { failureThreshold = FAILURE_THRESHOLD; } } /** * @throws Exception in case of error. */ public void testFailureTimeoutWorkabilitySmallTimeout() throws Exception { failureThreshold = 500; try { checkFailureThresholdWorkability(); } finally { failureThreshold = FAILURE_THRESHOLD; } } /** * Test failure detection time between server and client if client fail with failure detection. * * @throws Exception in case of error. */ public void testFailureTimeoutServerClient() throws Exception { failureThreshold = 3000; clientFailureThreshold = 2000; try { startServerNodes(1); startClientNodes(1); checkNodes(1, 1); Ignite srvNode = G.ignite("server-0"); final TcpDiscoverySpi srvSpi = (TcpDiscoverySpi) srvNode.configuration().getDiscoverySpi(); Ignite clientNode = G.ignite("client-0"); final TcpDiscoverySpi clientSpi = (TcpDiscoverySpi)clientNode.configuration().getDiscoverySpi(); long failureTime = U.currentTimeMillis(); final long[] failureDetectTime = new long[1]; final CountDownLatch latch = new CountDownLatch(1); clientSpi.simulateNodeFailure(); srvNode.events().localListen(new IgnitePredicate<Event>() { @Override public boolean apply(Event evt) { failureDetectTime[0] = U.currentTimeMillis(); latch.countDown(); return true; } }, EVT_NODE_FAILED); assertTrue("Can't get node failure event", latch.await(15000, TimeUnit.MILLISECONDS)); long detectTime = failureDetectTime[0] - failureTime; assertTrue("Client node failure detected too fast: " + detectTime + "ms", detectTime > clientFailureThreshold - 200); assertTrue("Client node failure detected too slow: " + detectTime + "ms", detectTime < clientFailureThreshold + 5000); } finally { failureThreshold = FAILURE_THRESHOLD; clientFailureThreshold = CLIENT_FAILURE_THRESHOLD; } } /** * Test failure detection time between servers with failure detection. * * @throws Exception in case of error. */ public void testFailureTimeout3Server() throws Exception { failureThreshold = 1000; clientFailureThreshold = 10000; useTestSpi = true; try { startServerNodes(3); checkNodes(3, 0); Ignite srv0 = G.ignite("server-0"); final TestTcpDiscoverySpi2 spi0 = (TestTcpDiscoverySpi2)srv0.configuration().getDiscoverySpi(); final Ignite srv1 = G.ignite("server-1"); final TestTcpDiscoverySpi2 spi1 = (TestTcpDiscoverySpi2)srv1.configuration().getDiscoverySpi(); Ignite srv2 = G.ignite("server-2"); final TestTcpDiscoverySpi2 spi2 = (TestTcpDiscoverySpi2)srv2.configuration().getDiscoverySpi(); long failureTime = U.currentTimeMillis(); final AtomicLong failureDetectTime = new AtomicLong(); final CountDownLatch latch = new CountDownLatch(2); spi1.writeToSocketDelay = 2000; for (Ignite srv : new Ignite[]{srv0, srv2}) { srv.events().localListen(new IgnitePredicate<Event>() { @Override public boolean apply(Event evt) { DiscoveryEvent evt0 = (DiscoveryEvent)evt; assertEquals(srv1.cluster().localNode().id(), evt0.eventNode().id()); failureDetectTime.compareAndSet(0, U.currentTimeMillis()); latch.countDown(); return true; } }, EVT_NODE_FAILED); } assertTrue("Can't get node failure event", latch.await(15000, TimeUnit.MILLISECONDS)); long detectTime = failureDetectTime.get() - failureTime; assertTrue("Server node failure detected too fast: " + detectTime + "ms", detectTime > failureThreshold - 100); assertTrue("Server node failure detected too slow: " + detectTime + "ms", detectTime < clientFailureThreshold); } finally { failureThreshold = FAILURE_THRESHOLD; clientFailureThreshold = CLIENT_FAILURE_THRESHOLD; useTestSpi = false; } } /** * @throws Exception in case of error. */ private void checkFailureThresholdWorkability() throws Exception { useTestSpi = true; TestTcpDiscoverySpi2 firstSpi = null; TestTcpDiscoverySpi2 secondSpi = null; try { startServerNodes(2); checkNodes(2, 0); firstSpi = (TestTcpDiscoverySpi2)(G.ignite("server-0").configuration().getDiscoverySpi()); secondSpi = (TestTcpDiscoverySpi2)(G.ignite("server-1").configuration().getDiscoverySpi()); assert firstSpi.err == null; secondSpi.readDelay = failureDetectionTimeout() + 5000; assertFalse(firstSpi.pingNode(secondSpi.getLocalNodeId())); Thread.sleep(failureDetectionTimeout()); assertTrue(firstSpi.err != null && X.hasCause(firstSpi.err, SocketTimeoutException.class)); firstSpi.reset(); secondSpi.reset(); assertTrue(firstSpi.pingNode(secondSpi.getLocalNodeId())); assertTrue(firstSpi.err == null); } finally { useTestSpi = false; if (firstSpi != null) firstSpi.reset(); if (secondSpi != null) secondSpi.reset(); } } /** * @throws Exception If failed. */ public void testClientReconnectOnCoordinatorRouterFail1() throws Exception { clientReconnectOnCoordinatorRouterFail(1); } /** * @throws Exception If failed. */ public void testClientReconnectOnCoordinatorRouterFail2() throws Exception { clientReconnectOnCoordinatorRouterFail(2); } /** * Test tries to provoke scenario when client sends reconnect message before router failure detected. * * @param srvNodes Number of additional server nodes. * @throws Exception If failed. */ public void clientReconnectOnCoordinatorRouterFail(int srvNodes) throws Exception { startServerNodes(1); Ignite srv = G.ignite("server-0"); final TcpDiscoveryNode srvNode = (TcpDiscoveryNode)srv.cluster().localNode(); final UUID srvNodeId = srvNode.id(); clientIpFinder = new TcpDiscoveryVmIpFinder(); clientIpFinder.setAddresses( Collections.singleton("localhost:" + srvNode.discoveryPort() + ".." + (srvNode.discoveryPort() + 1))); failureThreshold = 1000L; netTimeout = 1000L; startClientNodes(1); // Client should connect to coordinator. failureThreshold = 10_000L; netTimeout = 5000L; List<String> nodes = new ArrayList<>(); for (int i = 0; i < srvNodes; i++) { Ignite g = startGrid("server-" + srvIdx.getAndIncrement()); nodes.add(g.name()); srvNodeIds.add(g.cluster().localNode().id()); } checkNodes(1 + srvNodes, 1); nodes.add("client-0"); final CountDownLatch latch = new CountDownLatch(nodes.size()); final AtomicBoolean err = new AtomicBoolean(); for (String node : nodes) { G.ignite(node).events().localListen(new IgnitePredicate<Event>() { @Override public boolean apply(Event evt) { DiscoveryEvent disoEvt = (DiscoveryEvent)evt; if (disoEvt.eventNode().id().equals(srvNodeId)) { info("Expected node failed event: " + ((DiscoveryEvent) evt).eventNode()); latch.countDown(); } else { log.info("Unexpected node failed event: " + evt); err.set(true); } return true; } }, EVT_NODE_FAILED); } Thread.sleep(5000); Ignite client = G.ignite("client-0"); UUID nodeId = client.cluster().localNode().id(); log.info("Fail coordinator: " + srvNodeId); TestTcpDiscoverySpi srvSpi = (TestTcpDiscoverySpi)srv.configuration().getDiscoverySpi(); srvSpi.pauseAll(false); try { Thread.sleep(2000); } finally { srvSpi.simulateNodeFailure(); srvSpi.resumeAll(); } try { assertTrue(latch.await(failureThreshold + 3000, TimeUnit.MILLISECONDS)); assertFalse("Unexpected event, see log for details.", err.get()); assertEquals(nodeId, client.cluster().localNode().id()); } finally { srvSpi.resumeAll(); } } /** * */ private static class TestTcpDiscoverySpi2 extends TcpDiscoverySpi { /** */ private volatile long readDelay; private volatile long writeToSocketDelay; /** */ private Exception err; /** */ @Override protected void writeToSocket( Socket sock, TcpDiscoveryAbstractMessage msg, byte[] data, long timeout ) throws IOException { if (writeToSocketDelay > 0) { try { U.dumpStack(log, "Before sleep [msg=" + msg + ", arrLen=" + (data != null ? data.length : "n/a") + ']'); Thread.sleep(writeToSocketDelay); } catch (InterruptedException e) { // Nothing to do. } } if (sock.getSoTimeout() >= writeToSocketDelay) super.writeToSocket(sock, msg, data, timeout); else throw new SocketTimeoutException("Write to socket delay timeout exception."); } /** */ @Override protected void writeToSocket(Socket sock, OutputStream out, TcpDiscoveryAbstractMessage msg, long timeout) throws IOException, IgniteCheckedException { if (writeToSocketDelay > 0) { try { U.dumpStack(log, "Before sleep [msg=" + msg + ']'); Thread.sleep(writeToSocketDelay); } catch (InterruptedException e) { // Nothing to do. } } if (sock.getSoTimeout() >= writeToSocketDelay) super.writeToSocket(sock, out, msg, timeout); else throw new SocketTimeoutException("Write to socket delay timeout exception."); } /** */ @Override protected void writeToSocket( Socket sock, TcpDiscoveryAbstractMessage msg, long timeout ) throws IOException, IgniteCheckedException { if (writeToSocketDelay > 0) { try { U.dumpStack(log, "Before sleep [msg=" + msg + ']'); Thread.sleep(writeToSocketDelay); } catch (InterruptedException e) { // Nothing to do. } } if (sock.getSoTimeout() >= writeToSocketDelay) super.writeToSocket(sock, msg, timeout); else throw new SocketTimeoutException("Write to socket delay timeout exception."); } /** */ @Override protected void writeToSocket( TcpDiscoveryAbstractMessage msg, Socket sock, int res, long timeout ) throws IOException { if (writeToSocketDelay > 0) { try { U.dumpStack(log, "Before sleep [msg=" + msg + ']'); Thread.sleep(writeToSocketDelay); } catch (InterruptedException e) { // Nothing to do. } } if (sock.getSoTimeout() >= writeToSocketDelay) super.writeToSocket(msg, sock, res, timeout); else throw new SocketTimeoutException("Write to socket delay timeout exception."); } /** {@inheritDoc} */ @Override protected <T> T readMessage(Socket sock, @Nullable InputStream in, long timeout) throws IOException, IgniteCheckedException { long currTimeout = getLocalNode().isClient() ? clientFailureDetectionTimeout() : failureDetectionTimeout(); if (readDelay < currTimeout) { try { return super.readMessage(sock, in, timeout); } catch (Exception e) { err = e; throw e; } } else { T msg = super.readMessage(sock, in, timeout); if (msg instanceof TcpDiscoveryPingRequest) { try { Thread.sleep(2000); } catch (InterruptedException ignored) { // No-op. } throw new SocketTimeoutException("Forced timeout"); } return msg; } } /** * Resets testing state. */ private void reset() { readDelay = 0; writeToSocketDelay = 0; err = null; } } }