/* * Copyright (C) 2012-2015 DataStax Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.datastax.driver.core; import com.datastax.driver.core.utils.CassandraVersion; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import java.util.concurrent.TimeUnit; import static com.datastax.driver.core.Assertions.assertThat; import static com.datastax.driver.core.Assertions.fail; import static com.datastax.driver.core.Host.State.DOWN; import static com.datastax.driver.core.Host.State.UP; import static com.datastax.driver.core.TestUtils.nonDebouncingQueryOptions; /** * Due to C* gossip bugs, system.peers may report nodes that are gone from the cluster. * <p/> * This class tests scenarios where these nodes have been recommissioned to another cluster and * come back up. The driver must detect that they are not part of the cluster anymore, and ignore them. */ public class RecommissionedNodeTest { private static final Logger logger = LoggerFactory.getLogger(RecommissionedNodeTest.class); CCMBridge.Builder mainCcmBuilder, otherCcmBuilder; CCMAccess mainCcm, otherCcm; Cluster mainCluster; @Test(groups = "long") public void should_ignore_recommissioned_node_on_reconnection_attempt() throws Exception { mainCcmBuilder = CCMBridge.builder().withNodes(3); mainCcm = CCMCache.get(mainCcmBuilder); // node1 will be our "recommissioned" node, for now we just stop it so that it stays in the peers table. mainCcm.stop(1); mainCcm.waitForDown(1); // Now start the driver that will connect to node2 and node3, and consider node1 down mainCluster = Cluster.builder() .addContactPoints(mainCcm.addressOfNode(2).getAddress()) .withPort(mainCcm.getBinaryPort()) .withQueryOptions(nonDebouncingQueryOptions()).build(); mainCluster.connect(); waitForCountUpHosts(mainCluster, 2); // From that point, reconnections to node1 have been scheduled. // Start another ccm that will reuse node1's address otherCcmBuilder = CCMBridge.builder() .withStoragePort(mainCcm.getStoragePort()) .withThriftPort(mainCcm.getThriftPort()) .withBinaryPort(mainCcm.getBinaryPort()) .withNodes(1); otherCcm = CCMCache.get(otherCcmBuilder); otherCcm.waitForUp(1); // Give the driver the time to notice the node is back up and try to connect to it. TimeUnit.SECONDS.sleep(32); assertThat(countUpHosts(mainCluster)).isEqualTo(2); } @Test(groups = "long") public void should_ignore_recommissioned_node_on_control_connection_reconnect() throws Exception { mainCcmBuilder = CCMBridge.builder().withNodes(2); mainCcm = CCMCache.get(mainCcmBuilder); mainCcm.stop(1); mainCcm.waitForDown(1); // Start the driver, the control connection will be on node2 mainCluster = Cluster.builder() .addContactPoints(mainCcm.addressOfNode(2).getAddress()) .withPort(mainCcm.getBinaryPort()) .withQueryOptions(nonDebouncingQueryOptions()).build(); mainCluster.connect(); waitForCountUpHosts(mainCluster, 1); // Start another ccm that will reuse node1's address otherCcmBuilder = CCMBridge.builder() .withStoragePort(mainCcm.getStoragePort()) .withThriftPort(mainCcm.getThriftPort()) .withBinaryPort(mainCcm.getBinaryPort()) .withNodes(1); otherCcm = CCMCache.get(otherCcmBuilder); otherCcm.waitForUp(1); // Stop node2, the control connection gets defunct mainCcm.stop(2); TimeUnit.SECONDS.sleep(32); // The driver should not try to reconnect the control connection to node1 assertThat(mainCluster).hasClosedControlConnection(); } @Test(groups = "long") public void should_ignore_recommissioned_node_on_session_init() throws Exception { // Simulate the bug before starting the cluster mainCcmBuilder = CCMBridge.builder().withNodes(2); mainCcm = CCMCache.get(mainCcmBuilder); mainCcm.stop(1); mainCcm.waitForDown(1); otherCcmBuilder = CCMBridge.builder() .withStoragePort(mainCcm.getStoragePort()) .withThriftPort(mainCcm.getThriftPort()) .withBinaryPort(mainCcm.getBinaryPort()) .withNodes(1); otherCcm = CCMCache.get(otherCcmBuilder); otherCcm.waitForUp(1); // Start the driver, it should only connect to node 2 mainCluster = Cluster.builder() .addContactPoints(mainCcm.addressOfNode(2).getAddress()) .withPort(mainCcm.getBinaryPort()) .withQueryOptions(nonDebouncingQueryOptions()).build(); // When we first initialize the Cluster, all hosts are marked UP assertThat(mainCluster).host(2).hasState(UP); assertThat(mainCluster).host(1).hasState(UP); // Create a session. This will try to open a pool to node 1 and find out that the cluster name doesn't match. mainCluster.connect(); // Node 1 should now be DOWN with no reconnection attempt assertThat(mainCluster).host(1) .goesDownWithin(10, TimeUnit.SECONDS) .hasState(DOWN) .isNotReconnectingFromDown(); } @Test(groups = "long") @CassandraVersion("2.0.0") public void should_ignore_node_that_does_not_support_protocol_version_on_session_init() throws Exception { // Simulate the bug before starting the cluster mainCcmBuilder = CCMBridge.builder().withNodes(2); mainCcm = CCMCache.get(mainCcmBuilder); mainCcm.stop(1); mainCcm.waitForDown(1); otherCcmBuilder = CCMBridge.builder().withNodes(1) .withStoragePort(mainCcm.getStoragePort()) .withThriftPort(mainCcm.getThriftPort()) .withBinaryPort(mainCcm.getBinaryPort()) .withVersion(VersionNumber.parse("1.2.19")); otherCcm = CCMCache.get(otherCcmBuilder); otherCcm.waitForUp(1); // Start the driver, it should only connect to node 2 mainCluster = Cluster.builder() .addContactPoints(mainCcm.addressOfNode(2).getAddress()) .withPort(mainCcm.getBinaryPort()) .withQueryOptions(nonDebouncingQueryOptions()).build(); // Create a session. This will try to open a pool to node 1 and find that it doesn't support protocol version. mainCluster.connect(); // Node 1 should now be DOWN with no reconnection attempt assertThat(mainCluster).host(1) .goesDownWithin(10, TimeUnit.SECONDS) .hasState(DOWN) .isNotReconnectingFromDown(); } @BeforeMethod(groups = "long") public void clearFields() { // Clear cluster and ccm instances between tests. mainCluster = null; mainCcmBuilder = null; otherCcmBuilder = null; mainCcm = null; otherCcm = null; } @AfterMethod(groups = "long", alwaysRun = true) public void teardown() { if (mainCluster != null) mainCluster.close(); if (mainCcmBuilder != null) CCMCache.remove(mainCcmBuilder); if (otherCcmBuilder != null) CCMCache.remove(otherCcmBuilder); if (mainCcm != null) mainCcm.close(); if (otherCcm != null) otherCcm.close(); } private static int countUpHosts(Cluster cluster) { int ups = 0; for (Host host : cluster.getMetadata().getAllHosts()) { if (host.isUp()) ups += 1; } return ups; } private static void waitForCountUpHosts(Cluster cluster, int expectedCount) throws InterruptedException { int maxRetries = 10; int interval = 6; for (int i = 0; i <= maxRetries; i++) { int actualCount = countUpHosts(cluster); if (actualCount == expectedCount) return; if (i == maxRetries) fail(String.format("Up host count didn't reach %d after %d seconds", expectedCount, i * interval)); else logger.debug("Counted {} up hosts after {} seconds", actualCount, i * interval); TimeUnit.SECONDS.sleep(interval); } } }