/*- * -\-\- * Helios System Tests * -- * Copyright (C) 2016 Spotify AB * -- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * -/-/- */ package com.spotify.helios.system; import static com.spotify.helios.common.descriptors.Goal.START; import static com.spotify.helios.common.descriptors.HostStatus.Status.DOWN; import static com.spotify.helios.common.descriptors.HostStatus.Status.UP; import static com.spotify.helios.common.descriptors.TaskStatus.State.RUNNING; import static java.util.concurrent.TimeUnit.SECONDS; import static org.junit.Assert.assertEquals; import com.google.common.collect.ImmutableMap; import com.spotify.helios.TemporaryPorts; import com.spotify.helios.agent.AgentMain; import com.spotify.helios.client.HeliosClient; import com.spotify.helios.common.descriptors.Deployment; import com.spotify.helios.common.descriptors.Job; import com.spotify.helios.common.descriptors.JobId; import com.spotify.helios.common.descriptors.PortMapping; import com.spotify.helios.common.protocol.CreateJobResponse; import com.spotify.helios.common.protocol.HostDeregisterResponse; import com.spotify.helios.common.protocol.JobDeleteResponse; import com.spotify.helios.common.protocol.JobDeployResponse; import com.spotify.helios.servicescommon.ZooKeeperRegistrarUtil; import com.spotify.helios.servicescommon.coordination.DefaultZooKeeperClient; import com.spotify.helios.servicescommon.coordination.Paths; import java.util.UUID; import java.util.concurrent.TimeoutException; import org.junit.Rule; import org.junit.Test; public class DeregisterTest extends SystemTestBase { @Rule public final TemporaryPorts ports = TemporaryPorts.create(); @Test public void testDeregisterHostThatDoesntExist() throws Exception { startDefaultMaster(); final String host = testHost(); final HeliosClient client = defaultClient(); final HostDeregisterResponse deregisterResponse = client.deregisterHost(host).get(); assertEquals(HostDeregisterResponse.Status.NOT_FOUND, deregisterResponse.getStatus()); } @Test public void testDeregister() throws Exception { startDefaultMaster(); final String host = testHost(); final AgentMain agent = startDefaultAgent(host); final HeliosClient client = defaultClient(); // Create a job final Job job = Job.newBuilder() .setName(testJobName) .setVersion(testJobVersion) .setImage(BUSYBOX) .setCommand(IDLE_COMMAND) .setPorts(ImmutableMap.of("foo", PortMapping.of(4711), "bar", PortMapping.of(4712, ports.localPort("bar")))) .build(); final JobId jobId = job.getId(); final CreateJobResponse created = client.createJob(job).get(); assertEquals(CreateJobResponse.Status.OK, created.getStatus()); // Wait for agent to come up awaitHostRegistered(client, host, LONG_WAIT_SECONDS, SECONDS); awaitHostStatus(client, host, UP, LONG_WAIT_SECONDS, SECONDS); // Deploy the job on the agent final Deployment deployment = Deployment.of(jobId, START); final JobDeployResponse deployed = client.deploy(deployment, host).get(); assertEquals(JobDeployResponse.Status.OK, deployed.getStatus()); // Wait for the job to run awaitJobState(client, host, jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS); // Kill off agent agent.stopAsync().awaitTerminated(); // Deregister agent final HostDeregisterResponse deregisterResponse = client.deregisterHost(host).get(); assertEquals(HostDeregisterResponse.Status.OK, deregisterResponse.getStatus()); // Verify that it's possible to remove the job final JobDeleteResponse deleteResponse = client.deleteJob(jobId).get(); assertEquals(JobDeleteResponse.Status.OK, deleteResponse.getStatus()); } // Verify that we can deregister a host there are jobs deployed to it, for which there's no // corresponding status information. For example, if a job was deployed to the host after is went // down. @Test public void testDeregisterJobDeployedWithoutStatus() throws Exception { startDefaultMaster(); final String host = testHost(); final HeliosClient client = defaultClient(); final DefaultZooKeeperClient zkClient = new DefaultZooKeeperClient(zk().curatorWithSuperAuth()); final String idPath = Paths.configHostId(host); ZooKeeperRegistrarUtil.registerHost(zkClient, idPath, host, UUID.randomUUID().toString()); // Create a job final Job job = Job.newBuilder() .setName(testJobName) .setVersion(testJobVersion) .setImage(BUSYBOX) .setCommand(IDLE_COMMAND) .setPorts(ImmutableMap.of("foo", PortMapping.of(4711), "bar", PortMapping.of(4712, ports.localPort("bar")))) .build(); final JobId jobId = job.getId(); final CreateJobResponse created = client.createJob(job).get(); assertEquals(CreateJobResponse.Status.OK, created.getStatus()); // Deploy the job on the agent final Deployment deployment = Deployment.of(jobId, START); final JobDeployResponse deployed = client.deploy(deployment, host).get(); assertEquals(JobDeployResponse.Status.OK, deployed.getStatus()); // Deregister agent final HostDeregisterResponse deregisterResponse = client.deregisterHost(host).get(); assertEquals(HostDeregisterResponse.Status.OK, deregisterResponse.getStatus()); // Verify that it's possible to remove the job final JobDeleteResponse deleteResponse = client.deleteJob(jobId).get(); assertEquals(JobDeleteResponse.Status.OK, deleteResponse.getStatus()); } @Test public void testRegistrationResolution() throws Exception { startDefaultMaster(); final String host = testHost(); final AgentMain agent = startDefaultAgent(host, "--labels", "num=1"); final HeliosClient client = defaultClient(); // Wait for agent to come up awaitHostRegistered(client, host, LONG_WAIT_SECONDS, SECONDS); // Wait for agent to be UP and report HostInfo awaitHostStatusWithHostInfo(client, host, UP, LONG_WAIT_SECONDS, SECONDS); // Kill off agent agent.stopAsync().awaitTerminated(); awaitHostStatus(client, host, DOWN, LONG_WAIT_SECONDS, SECONDS); // Start a new agent with the same hostname but have it generate a different ID resetAgentStateDir(); startDefaultAgent(host, "--zk-registration-ttl", "0", "--labels", "num=2"); // Check that the new host is registered awaitHostRegistered(client, host, LONG_WAIT_SECONDS, SECONDS); } @Test(expected = TimeoutException.class) public void testRegistrationResolutionTtlNotExpired() throws Exception { startDefaultMaster(); final String host = testHost(); final AgentMain agent = startDefaultAgent(host); final HeliosClient client = defaultClient(); // Wait for agent to come up awaitHostRegistered(client, host, LONG_WAIT_SECONDS, SECONDS); // Wait for agent to be UP and report HostInfo awaitHostStatusWithHostInfo(client, host, UP, LONG_WAIT_SECONDS, SECONDS); // Kill off agent agent.stopAsync().awaitTerminated(); awaitHostStatus(client, host, DOWN, LONG_WAIT_SECONDS, SECONDS); // Start a new agent with the same hostname but have it generate a different ID resetAgentStateDir(); // Set TTL to a large number so new agent will not deregister previous one. // This might throw IllegalStateException as this agent will fail to start since it can't // register. This exception sometimes occurs and sometimes doesn't. We ignore that and // instead check for the TimeoutException while polling for it being UP. try { startDefaultAgent(host, "--zk-registration-ttl", "9999"); } catch (IllegalStateException ignored) { // ignored } awaitHostStatus(client, host, UP, 10, SECONDS); } @Test public void testJobsArePreservedWhenReregistering() throws Exception { startDefaultMaster(); final String host = testHost(); final AgentMain agent = startDefaultAgent(host, "--labels", "num=1"); final HeliosClient client = defaultClient(); awaitHostStatus(client, host, UP, LONG_WAIT_SECONDS, SECONDS); // Deploy a job and wait for it to be running final JobId jobId = createJob(testJobName, testJobVersion, BUSYBOX, IDLE_COMMAND); deployJob(jobId, host); awaitJobState(client, host, jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS); // Kill off agent agent.stopAsync().awaitTerminated(); awaitHostStatus(client, host, DOWN, LONG_WAIT_SECONDS, SECONDS); // Start a new agent with the same hostname but have it generate a different ID resetAgentStateDir(); startDefaultAgent(host, "--zk-registration-ttl", "0", "--labels", "num=2"); // Check that the new host is registered awaitHostRegistered(client, host, LONG_WAIT_SECONDS, SECONDS); awaitHostStatusWithLabels(client, host, UP, ImmutableMap.of("num", "2")); // Check that the job we previously deployed is preserved awaitJobState(client, host, jobId, RUNNING, WAIT_TIMEOUT_SECONDS, SECONDS); } }