/*- * -\-\- * Helios System Tests * -- * Copyright (C) 2016 Spotify AB * -- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * -/-/- */ package com.spotify.helios.system; import static com.spotify.helios.common.descriptors.Goal.START; import static com.spotify.helios.common.descriptors.HostStatus.Status.UP; import static com.spotify.helios.common.descriptors.TaskStatus.State.RUNNING; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.zookeeper.KeeperException.NodeExistsException; import static org.junit.Assert.assertEquals; import com.spotify.helios.Polling; import com.spotify.helios.ZooKeeperTestManager; import com.spotify.helios.ZooKeeperTestingClusterManager; import com.spotify.helios.client.HeliosClient; import com.spotify.helios.common.descriptors.Deployment; import com.spotify.helios.common.descriptors.Job; import com.spotify.helios.common.descriptors.JobId; import com.spotify.helios.common.descriptors.JobStatus; import com.spotify.helios.common.protocol.CreateJobResponse; import com.spotify.helios.common.protocol.JobDeployResponse; import com.spotify.helios.common.protocol.JobUndeployResponse; import java.util.concurrent.Callable; import org.junit.Before; import org.junit.Test; public class ZooKeeperHeliosFailoverTest extends SystemTestBase { private final Job fooJob = Job.newBuilder() .setName(testTag + "foo") .setVersion(testJobVersion) .setImage(BUSYBOX) .setCommand(IDLE_COMMAND) .build(); private final Job barJob = Job.newBuilder() .setName(testTag + "bar") .setVersion(testJobVersion) .setImage(BUSYBOX) .setCommand(IDLE_COMMAND) .build(); private final ZooKeeperTestingClusterManager zkc = new ZooKeeperTestingClusterManager(); private HeliosClient client; @Override protected ZooKeeperTestManager zooKeeperTestManager() { return zkc; } @Before public void setup() throws Exception { startDefaultMaster(); startDefaultAgent(testHost()); client = defaultClient(); awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS); } @Test public void verifyCanDeployWithOnePeerDead() throws Exception { deploy(fooJob); zkc.stopPeer(0); undeploy(fooJob.getId()); deploy(barJob); } @Test public void verifyCanDeployWithOneNodeDeadAfterOneNodeDataLoss() throws Exception { // First deploy a job deploy(fooJob); // Create a node that we know is written after the job try { zkc.curatorWithSuperAuth().create().forPath("/barrier"); } catch (NodeExistsException ignore) { // ignored } // Wipe one zk peer zkc.stopPeer(0); zkc.resetPeer(0); zkc.startPeer(0); // Wait for the zk peer to recover Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<Object>() { @Override public Object call() throws Exception { return zkc.peerCurator(0).checkExists().forPath("/barrier"); } }); // Then take down another peer zkc.stopPeer(1); // Now verify that we can still undeploy and deploy jobs undeploy(fooJob.getId()); deploy(barJob); } private void deploy(final Job job) throws Exception { final JobId jobId = job.getId(); final CreateJobResponse created = client.createJob(job).get(); assertEquals(CreateJobResponse.Status.OK, created.getStatus()); final Deployment deployment = Deployment.of(jobId, START); final JobDeployResponse deployed = client.deploy(deployment, testHost()).get(); assertEquals(JobDeployResponse.Status.OK, deployed.getStatus()); // Wait for the job to run awaitJobState(client, testHost(), jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS); } private void undeploy(final JobId jobId) throws Exception { // Check job status can be queried final JobStatus jobStatus = client.jobStatus(jobId).get(); assertEquals(RUNNING, jobStatus.getTaskStatuses().get(testHost()).getState()); // Undeploy the job final JobUndeployResponse undeployed = client.undeploy(jobId, testHost()).get(); assertEquals(JobUndeployResponse.Status.OK, undeployed.getStatus()); // Wait for the task to disappear awaitTaskGone(client, testHost(), jobId, LONG_WAIT_SECONDS, SECONDS); } }