/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.resourcemanager; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.HashMap; import java.util.List; import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.DrainDispatcher; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse; import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Test; public class TestResourceTrackerService { private final static File TEMP_DIR = new File(System.getProperty( "test.build.data", "/tmp"), "decommision"); private File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt"); private MockRM rm; /** * Test RM read NM next heartBeat Interval correctly from Configuration file, * and NM get next heartBeat Interval from RM correctly */ @Test (timeout = 50000) public void testGetNextHeartBeatInterval() throws Exception { Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, "4000"); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals(4000, nodeHeartbeat.getNextHeartBeatInterval()); NodeHeartbeatResponse nodeHeartbeat2 = nm2.nodeHeartbeat(true); Assert.assertEquals(4000, nodeHeartbeat2.getNextHeartBeatInterval()); } /** * Decommissioning using a pre-configured include hosts file */ @Test public void testDecommissionWithIncludeHosts() throws Exception { writeToHostsFile("localhost", "host1", "host2"); Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile .getAbsolutePath()); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); MockNM nm3 = rm.registerNode("localhost:4433", 1024); ClusterMetrics metrics = ClusterMetrics.getMetrics(); assert(metrics != null); int metricCount = metrics.getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); // To test that IPs also work String ip = NetUtils.normalizeHostName("localhost"); writeToHostsFile("host1", ip); rm.getNodesListManager().refreshNodes(conf); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); Assert .assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue("Node is not decommisioned.", NodeAction.SHUTDOWN .equals(nodeHeartbeat.getNodeAction())); checkDecommissionedNMCount(rm, ++metricCount); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); Assert.assertEquals(metricCount, ClusterMetrics.getMetrics() .getNumDecommisionedNMs()); } /** * Decommissioning using a pre-configured exclude hosts file */ @Test public void testDecommissionWithExcludeHosts() throws Exception { Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile .getAbsolutePath()); writeToHostsFile(""); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); MockNM nm3 = rm.registerNode("localhost:4433", 1024); int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); // To test that IPs also work String ip = NetUtils.normalizeHostName("localhost"); writeToHostsFile("host2", ip); rm.getNodesListManager().refreshNodes(conf); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertTrue("The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); checkDecommissionedNMCount(rm, ++metricCount); nodeHeartbeat = nm3.nodeHeartbeat(true); Assert.assertTrue("The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction())); checkDecommissionedNMCount(rm, ++metricCount); } /** * Decommissioning using a post-configured include hosts file */ @Test public void testAddNewIncludePathToConfiguration() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); ClusterMetrics metrics = ClusterMetrics.getMetrics(); assert(metrics != null); int initialMetricCount = metrics.getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals( NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals( NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); writeToHostsFile("host1"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile .getAbsolutePath()); rm.getNodesListManager().refreshNodes(conf); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals( "Node should not have been decomissioned.", NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals("Node should have been decomissioned but is in state" + nodeHeartbeat.getNodeAction(), NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction()); checkDecommissionedNMCount(rm, ++initialMetricCount); } /** * Decommissioning using a post-configured exclude hosts file */ @Test public void testAddNewExcludePathToConfiguration() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 10240); ClusterMetrics metrics = ClusterMetrics.getMetrics(); assert(metrics != null); int initialMetricCount = metrics.getNumDecommisionedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals( NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals( NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); writeToHostsFile("host2"); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile .getAbsolutePath()); rm.getNodesListManager().refreshNodes(conf); nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertEquals( "Node should not have been decomissioned.", NodeAction.NORMAL, nodeHeartbeat.getNodeAction()); nodeHeartbeat = nm2.nodeHeartbeat(true); Assert.assertEquals("Node should have been decomissioned but is in state" + nodeHeartbeat.getNodeAction(), NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction()); checkDecommissionedNMCount(rm, ++initialMetricCount); } @Test public void testNodeRegistrationFailure() throws Exception { writeToHostsFile("host1"); Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile .getAbsolutePath()); rm = new MockRM(conf); rm.start(); ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService(); RegisterNodeManagerRequest req = Records.newRecord( RegisterNodeManagerRequest.class); NodeId nodeId = NodeId.newInstance("host2", 1234); req.setNodeId(nodeId); req.setHttpPort(1234); // trying to register a invalid node. RegisterNodeManagerResponse response = resourceTrackerService.registerNodeManager(req); Assert.assertEquals(NodeAction.SHUTDOWN,response.getNodeAction()); Assert .assertEquals( "Disallowed NodeManager from host2, Sending SHUTDOWN signal to the NodeManager.", response.getDiagnosticsMessage()); } @Test public void testSetRMIdentifierInRegistration() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm = new MockNM("host1:1234", 5120, rm.getResourceTrackerService()); RegisterNodeManagerResponse response = nm.registerNode(); // Verify the RMIdentifier is correctly set in RegisterNodeManagerResponse Assert.assertEquals(ResourceManager.clusterTimeStamp, response.getRMIdentifier()); } @Test public void testNodeRegistrationWithMinimumAllocations() throws Exception { Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, "2048"); conf.set(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, "4"); rm = new MockRM(conf); rm.start(); ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService(); RegisterNodeManagerRequest req = Records.newRecord( RegisterNodeManagerRequest.class); NodeId nodeId = BuilderUtils.newNodeId("host", 1234); req.setNodeId(nodeId); Resource capability = BuilderUtils.newResource(1024, 1); req.setResource(capability); RegisterNodeManagerResponse response1 = resourceTrackerService.registerNodeManager(req); Assert.assertEquals(NodeAction.SHUTDOWN,response1.getNodeAction()); capability.setMemory(2048); capability.setVirtualCores(1); req.setResource(capability); RegisterNodeManagerResponse response2 = resourceTrackerService.registerNodeManager(req); Assert.assertEquals(NodeAction.SHUTDOWN,response2.getNodeAction()); capability.setMemory(1024); capability.setVirtualCores(4); req.setResource(capability); RegisterNodeManagerResponse response3 = resourceTrackerService.registerNodeManager(req); Assert.assertEquals(NodeAction.SHUTDOWN,response3.getNodeAction()); capability.setMemory(2048); capability.setVirtualCores(4); req.setResource(capability); RegisterNodeManagerResponse response4 = resourceTrackerService.registerNodeManager(req); Assert.assertEquals(NodeAction.NORMAL,response4.getNodeAction()); } @Test public void testReboot() throws Exception { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:1234", 2048); int initialMetricCount = ClusterMetrics.getMetrics().getNumRebootedNMs(); NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); nodeHeartbeat = nm2.nodeHeartbeat( new HashMap<ApplicationId, List<ContainerStatus>>(), true, -100); Assert.assertTrue(NodeAction.RESYNC.equals(nodeHeartbeat.getNodeAction())); Assert.assertEquals("Too far behind rm response id:0 nm response id:-100", nodeHeartbeat.getDiagnosticsMessage()); checkRebootedNMCount(rm, ++initialMetricCount); } private void checkRebootedNMCount(MockRM rm2, int count) throws InterruptedException { int waitCount = 0; while (ClusterMetrics.getMetrics().getNumRebootedNMs() != count && waitCount++ < 20) { synchronized (this) { wait(100); } } Assert.assertEquals("The rebooted metrics are not updated", count, ClusterMetrics.getMetrics().getNumRebootedNMs()); } @Test public void testUnhealthyNodeStatus() throws Exception { Configuration conf = new Configuration(); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile .getAbsolutePath()); rm = new MockRM(conf); rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); Assert.assertEquals(0, ClusterMetrics.getMetrics().getUnhealthyNMs()); // node healthy nm1.nodeHeartbeat(true); // node unhealthy nm1.nodeHeartbeat(false); checkUnealthyNMCount(rm, nm1, true, 1); // node healthy again nm1.nodeHeartbeat(true); checkUnealthyNMCount(rm, nm1, false, 0); } private void checkUnealthyNMCount(MockRM rm, MockNM nm1, boolean health, int count) throws Exception { int waitCount = 0; while((rm.getRMContext().getRMNodes().get(nm1.getNodeId()) .getState() != NodeState.UNHEALTHY) == health && waitCount++ < 20) { synchronized (this) { wait(100); } } Assert.assertFalse((rm.getRMContext().getRMNodes().get(nm1.getNodeId()) .getState() != NodeState.UNHEALTHY) == health); Assert.assertEquals("Unhealthy metrics not incremented", count, ClusterMetrics.getMetrics().getUnhealthyNMs()); } @Test public void testReconnectNode() throws Exception { final DrainDispatcher dispatcher = new DrainDispatcher(); MockRM rm = new MockRM() { @Override protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() { return new SchedulerEventDispatcher(this.scheduler) { @Override public void handle(SchedulerEvent event) { scheduler.handle(event); } }; } @Override protected Dispatcher createDispatcher() { return dispatcher; } }; rm.start(); MockNM nm1 = rm.registerNode("host1:1234", 5120); MockNM nm2 = rm.registerNode("host2:5678", 5120); nm1.nodeHeartbeat(true); nm2.nodeHeartbeat(false); dispatcher.await(); checkUnealthyNMCount(rm, nm2, true, 1); final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs(); QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics(); // TODO Metrics incorrect in case of the FifoScheduler Assert.assertEquals(5120, metrics.getAvailableMB()); // reconnect of healthy node nm1 = rm.registerNode("host1:1234", 5120); NodeHeartbeatResponse response = nm1.nodeHeartbeat(true); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // reconnect of unhealthy node nm2 = rm.registerNode("host2:5678", 5120); response = nm2.nodeHeartbeat(false); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); dispatcher.await(); Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs()); checkUnealthyNMCount(rm, nm2, true, 1); // unhealthy node changed back to healthy nm2 = rm.registerNode("host2:5678", 5120); dispatcher.await(); response = nm2.nodeHeartbeat(true); response = nm2.nodeHeartbeat(true); dispatcher.await(); Assert.assertEquals(5120 + 5120, metrics.getAvailableMB()); // reconnect of node with changed capability nm1 = rm.registerNode("host2:5678", 10240); dispatcher.await(); response = nm2.nodeHeartbeat(true); dispatcher.await(); Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction())); Assert.assertEquals(5120 + 10240, metrics.getAvailableMB()); } private void writeToHostsFile(String... hosts) throws IOException { if (!hostFile.exists()) { TEMP_DIR.mkdirs(); hostFile.createNewFile(); } FileOutputStream fStream = null; try { fStream = new FileOutputStream(hostFile); for (int i = 0; i < hosts.length; i++) { fStream.write(hosts[i].getBytes()); fStream.write("\n".getBytes()); } } finally { if (fStream != null) { IOUtils.closeStream(fStream); fStream = null; } } } private void checkDecommissionedNMCount(MockRM rm, int count) throws InterruptedException { int waitCount = 0; while (ClusterMetrics.getMetrics().getNumDecommisionedNMs() != count && waitCount++ < 20) { synchronized (this) { wait(100); } } Assert.assertEquals(count, ClusterMetrics.getMetrics() .getNumDecommisionedNMs()); Assert.assertEquals("The decommisioned metrics are not updated", count, ClusterMetrics.getMetrics().getNumDecommisionedNMs()); } @After public void tearDown() { if (hostFile != null && hostFile.exists()) { hostFile.delete(); } ClusterMetrics.destroy(); if (rm != null) { rm.stop(); } } }