/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.checkpoint; import org.apache.curator.framework.CuratorFramework; import org.apache.flink.runtime.concurrent.Executors; import org.apache.flink.runtime.jobgraph.JobStatus; import org.apache.flink.runtime.state.RetrievableStateHandle; import org.apache.flink.runtime.state.SharedStateRegistry; import org.apache.flink.runtime.zookeeper.RetrievableStateStorageHelper; import org.apache.flink.runtime.zookeeper.ZooKeeperTestEnvironment; import org.apache.zookeeper.data.Stat; import org.junit.AfterClass; import org.junit.Before; import org.junit.Test; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; /** * Tests for basic {@link CompletedCheckpointStore} contract and ZooKeeper state handling. */ public class ZooKeeperCompletedCheckpointStoreITCase extends CompletedCheckpointStoreTest { private static final ZooKeeperTestEnvironment ZOOKEEPER = new ZooKeeperTestEnvironment(1); private static final String CHECKPOINT_PATH = "/checkpoints"; @AfterClass public static void tearDown() throws Exception { if (ZOOKEEPER != null) { ZOOKEEPER.shutdown(); } } @Before public void cleanUp() throws Exception { ZOOKEEPER.deleteAll(); } @Override protected ZooKeeperCompletedCheckpointStore createCompletedCheckpoints(int maxNumberOfCheckpointsToRetain) throws Exception { return new ZooKeeperCompletedCheckpointStore(maxNumberOfCheckpointsToRetain, ZOOKEEPER.getClient(), CHECKPOINT_PATH, new HeapStateStorageHelper(), Executors.directExecutor()); } // --------------------------------------------------------------------------------------------- /** * Tests that older checkpoints are not cleaned up right away when recovering. Only after * another checkpointed has been completed the old checkpoints exceeding the number of * checkpoints to retain will be removed. */ @Test public void testRecover() throws Exception { SharedStateRegistry sharedStateRegistry = new SharedStateRegistry(); CompletedCheckpointStore checkpoints = createCompletedCheckpoints(3); TestCompletedCheckpoint[] expected = new TestCompletedCheckpoint[]{ createCheckpoint(0, sharedStateRegistry), createCheckpoint(1, sharedStateRegistry), createCheckpoint(2, sharedStateRegistry) }; // Add multiple checkpoints checkpoints.addCheckpoint(expected[0]); checkpoints.addCheckpoint(expected[1]); checkpoints.addCheckpoint(expected[2]); verifyCheckpointRegistered(expected[0].getOperatorStates().values(), sharedStateRegistry); verifyCheckpointRegistered(expected[1].getOperatorStates().values(), sharedStateRegistry); verifyCheckpointRegistered(expected[2].getOperatorStates().values(), sharedStateRegistry); // All three should be in ZK assertEquals(3, ZOOKEEPER.getClient().getChildren().forPath(CHECKPOINT_PATH).size()); assertEquals(3, checkpoints.getNumberOfRetainedCheckpoints()); // Recover sharedStateRegistry.clear(); checkpoints.recover(sharedStateRegistry); assertEquals(3, ZOOKEEPER.getClient().getChildren().forPath(CHECKPOINT_PATH).size()); assertEquals(3, checkpoints.getNumberOfRetainedCheckpoints()); assertEquals(expected[2], checkpoints.getLatestCheckpoint()); List<CompletedCheckpoint> expectedCheckpoints = new ArrayList<>(3); expectedCheckpoints.add(expected[1]); expectedCheckpoints.add(expected[2]); expectedCheckpoints.add(createCheckpoint(3, sharedStateRegistry)); checkpoints.addCheckpoint(expectedCheckpoints.get(2)); List<CompletedCheckpoint> actualCheckpoints = checkpoints.getAllCheckpoints(); assertEquals(expectedCheckpoints, actualCheckpoints); for (CompletedCheckpoint actualCheckpoint : actualCheckpoints) { verifyCheckpointRegistered(actualCheckpoint.getOperatorStates().values(), sharedStateRegistry); } } /** * Tests that shutdown discards all checkpoints. */ @Test public void testShutdownDiscardsCheckpoints() throws Exception { CuratorFramework client = ZOOKEEPER.getClient(); SharedStateRegistry sharedStateRegistry = new SharedStateRegistry(); CompletedCheckpointStore store = createCompletedCheckpoints(1); TestCompletedCheckpoint checkpoint = createCheckpoint(0, sharedStateRegistry); store.addCheckpoint(checkpoint); assertEquals(1, store.getNumberOfRetainedCheckpoints()); assertNotNull(client.checkExists().forPath(CHECKPOINT_PATH + ZooKeeperCompletedCheckpointStore.checkpointIdToPath(checkpoint.getCheckpointID()))); store.shutdown(JobStatus.FINISHED); assertEquals(0, store.getNumberOfRetainedCheckpoints()); assertNull(client.checkExists().forPath(CHECKPOINT_PATH + ZooKeeperCompletedCheckpointStore.checkpointIdToPath(checkpoint.getCheckpointID()))); sharedStateRegistry.clear(); store.recover(sharedStateRegistry); assertEquals(0, store.getNumberOfRetainedCheckpoints()); } /** * Tests that suspends keeps all checkpoints (so that they can be recovered * later by the ZooKeeper store). Furthermore, suspending a job should release * all locks. */ @Test public void testSuspendKeepsCheckpoints() throws Exception { CuratorFramework client = ZOOKEEPER.getClient(); SharedStateRegistry sharedStateRegistry = new SharedStateRegistry(); CompletedCheckpointStore store = createCompletedCheckpoints(1); TestCompletedCheckpoint checkpoint = createCheckpoint(0, sharedStateRegistry); store.addCheckpoint(checkpoint); assertEquals(1, store.getNumberOfRetainedCheckpoints()); assertNotNull(client.checkExists().forPath(CHECKPOINT_PATH + ZooKeeperCompletedCheckpointStore.checkpointIdToPath(checkpoint.getCheckpointID()))); store.shutdown(JobStatus.SUSPENDED); assertEquals(0, store.getNumberOfRetainedCheckpoints()); final String checkpointPath = CHECKPOINT_PATH + ZooKeeperCompletedCheckpointStore.checkpointIdToPath(checkpoint.getCheckpointID()); Stat stat = client.checkExists().forPath(checkpointPath); assertNotNull("The checkpoint node should exist.", stat); assertEquals("The checkpoint node should not be locked.", 0, stat.getNumChildren()); // Recover again sharedStateRegistry.clear(); store.recover(sharedStateRegistry); CompletedCheckpoint recovered = store.getLatestCheckpoint(); assertEquals(checkpoint, recovered); } /** * FLINK-6284 * * Tests that the latest recovered checkpoint is the one with the highest checkpoint id */ @Test public void testLatestCheckpointRecovery() throws Exception { final int numCheckpoints = 3; SharedStateRegistry sharedStateRegistry = new SharedStateRegistry(); CompletedCheckpointStore checkpointStore = createCompletedCheckpoints(numCheckpoints); List<CompletedCheckpoint> checkpoints = new ArrayList<>(numCheckpoints); checkpoints.add(createCheckpoint(9, sharedStateRegistry)); checkpoints.add(createCheckpoint(10, sharedStateRegistry)); checkpoints.add(createCheckpoint(11, sharedStateRegistry)); for (CompletedCheckpoint checkpoint : checkpoints) { checkpointStore.addCheckpoint(checkpoint); } sharedStateRegistry.clear(); checkpointStore.recover(sharedStateRegistry); CompletedCheckpoint latestCheckpoint = checkpointStore.getLatestCheckpoint(); assertEquals(checkpoints.get(checkpoints.size() -1), latestCheckpoint); } /** * FLINK-6612 * * Checks that a concurrent checkpoint completion won't discard a checkpoint which has been * recovered by a different completed checkpoint store. */ @Test public void testConcurrentCheckpointOperations() throws Exception { final int numberOfCheckpoints = 1; final long waitingTimeout = 50L; ZooKeeperCompletedCheckpointStore zkCheckpointStore1 = createCompletedCheckpoints(numberOfCheckpoints); ZooKeeperCompletedCheckpointStore zkCheckpointStore2 = createCompletedCheckpoints(numberOfCheckpoints); SharedStateRegistry sharedStateRegistry = new SharedStateRegistry(); TestCompletedCheckpoint completedCheckpoint = createCheckpoint(1, sharedStateRegistry); // complete the first checkpoint zkCheckpointStore1.addCheckpoint(completedCheckpoint); // recover the checkpoint by a different checkpoint store sharedStateRegistry.clear(); zkCheckpointStore2.recover(sharedStateRegistry); CompletedCheckpoint recoveredCheckpoint = zkCheckpointStore2.getLatestCheckpoint(); assertTrue(recoveredCheckpoint instanceof TestCompletedCheckpoint); TestCompletedCheckpoint recoveredTestCheckpoint = (TestCompletedCheckpoint) recoveredCheckpoint; // Check that the recovered checkpoint is not yet discarded assertFalse(recoveredTestCheckpoint.isDiscarded()); // complete another checkpoint --> this should remove the first checkpoint from the store // because the number of retained checkpoints == 1 TestCompletedCheckpoint completedCheckpoint2 = createCheckpoint(2, sharedStateRegistry); zkCheckpointStore1.addCheckpoint(completedCheckpoint2); List<CompletedCheckpoint> allCheckpoints = zkCheckpointStore1.getAllCheckpoints(); // check that we have removed the first checkpoint from zkCompletedStore1 assertEquals(Collections.singletonList(completedCheckpoint2), allCheckpoints); // lets wait a little bit to see that no discard operation will be executed assertFalse("The checkpoint should not have been discarded.", recoveredTestCheckpoint.awaitDiscard(waitingTimeout)); // check that we have not discarded the first completed checkpoint assertFalse(recoveredTestCheckpoint.isDiscarded()); TestCompletedCheckpoint completedCheckpoint3 = createCheckpoint(3, sharedStateRegistry); // this should release the last lock on completedCheckoint and thus discard it zkCheckpointStore2.addCheckpoint(completedCheckpoint3); // the checkpoint should be discarded eventually because there is no lock on it anymore recoveredTestCheckpoint.awaitDiscard(); } static class HeapStateStorageHelper implements RetrievableStateStorageHelper<CompletedCheckpoint> { @Override public RetrievableStateHandle<CompletedCheckpoint> store(CompletedCheckpoint state) throws Exception { return new HeapRetrievableStateHandle<>(state); } } static class HeapRetrievableStateHandle<T extends Serializable> implements RetrievableStateHandle<T> { private static final long serialVersionUID = -268548467968932L; private static AtomicInteger nextKey = new AtomicInteger(0); private static HashMap<Integer, Object> stateMap = new HashMap<>(); private final int key; public HeapRetrievableStateHandle(T state) { key = nextKey.getAndIncrement(); stateMap.put(key, state); } @Override public T retrieveState() throws Exception { return (T) stateMap.get(key); } @Override public void discardState() throws Exception { stateMap.remove(key); } @Override public long getStateSize() { return 0; } } }