/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.state; import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.typeutils.TypeSerializer; import org.apache.flink.runtime.execution.Environment; import org.apache.flink.runtime.query.TaskKvStateRegistry; import javax.annotation.Nullable; import java.io.IOException; /** * A <b>State Backend</b> defines how the state of a streaming application is stored and * checkpointed. Different State Backends store their state in different fashions, and use * different data structures to hold the state of a running application. * * <p>For example, the {@link org.apache.flink.runtime.state.memory.MemoryStateBackend memory state backend} * keeps working state in the memory of the TaskManager and stores checkpoints in the memory of the * JobManager. The backend is lightweight and without additional dependencies, but not highly available * and supports only small state. * * <p>The {@link org.apache.flink.runtime.state.filesystem.FsStateBackend file system state backend} * keeps working state in the memory of the TaskManager and stores state checkpoints in a filesystem * (typically a replicated highly-available filesystem, like <a href="https://hadoop.apache.org/">HDFS</a>, * <a href="https://ceph.com/">Ceph</a>, <a href="https://aws.amazon.com/documentation/s3/">S3</a>, * <a href="https://cloud.google.com/storage/">GCS</a>, etc). * * <p>The {@code RocksDBStateBackend} stores working state in <a href="http://rocksdb.org/">RocksDB</a>, * and checkpoints the state by default to a filesystem (similar to the {@code FsStateBackend}). * * <h2>Raw Bytes Storage and Backends</h2> * * The {@code StateBackend} creates services for <i>raw bytes storage</i> and for <i>keyed state</i> * and <i>operator state</i>. * * <p>The <i>raw bytes storage</i> (through the {@link CheckpointStreamFactory}) is the fundamental * service that simply stores bytes in a fault tolerant fashion. This service is used by the JobManager * to store checkpoint and recovery metadata and is typically also used by the keyed- and operator state * backends to store checkpointed state. * * <p>The {@link AbstractKeyedStateBackend} and {@link OperatorStateBackend} created by this state * backend define how to hold the working state for keys and operators. They also define how to checkpoint * that state, frequently using the raw bytes storage (via the {@code CheckpointStreamFactory}). * However, it is also possible that for example a keyed state backend simply implements the bridge to * a key/value store, and that it does not need to store anything in the raw byte storage upon a * checkpoint. * * <h2>Serializability</h2> * * State Backends need to be {@link java.io.Serializable serializable}, because they distributed * across parallel processes (for distributed execution) together with the streaming application code. * * <p>Because of that, {@code StateBackend} implementations (typically subclasses * of {@link AbstractStateBackend}) are meant to be like <i>factories</i> that create the proper * states stores that provide access to the persistent storage and hold the keyed- and operator * state data structures. That way, the State Backend can be very lightweight (contain only * configurations) which makes it easier to be serializable. * * * <h2>Thread Safety</h2> * * State backend implementations have to be thread-safe. Multiple threads may be creating * streams and keyed-/operator state backends concurrently. */ @PublicEvolving public interface StateBackend extends java.io.Serializable { // ------------------------------------------------------------------------ // Persistent Bytes Storage // ------------------------------------------------------------------------ /** * Creates a {@link CheckpointStreamFactory} that can be used to create streams * that should end up in a checkpoint. * * @param jobId The {@link JobID} of the job for which we are creating checkpoint streams. * @param operatorIdentifier An identifier of the operator for which we create streams. */ CheckpointStreamFactory createStreamFactory(JobID jobId, String operatorIdentifier) throws IOException; /** * Creates a {@link CheckpointStreamFactory} that can be used to create streams * that should end up in a savepoint. * * <p>This is only called if the triggered checkpoint is a savepoint. Commonly * this will return the same factory as for regular checkpoints, but maybe * slightly adjusted. * * @param jobId The {@link JobID} of the job for which we are creating checkpoint streams. * @param operatorIdentifier An identifier of the operator for which we create streams. * @param targetLocation An optional custom location for the savepoint stream. * * @return The stream factory for savepoints. * * @throws IOException Failures during stream creation are forwarded. */ CheckpointStreamFactory createSavepointStreamFactory( JobID jobId, String operatorIdentifier, @Nullable String targetLocation) throws IOException; // ------------------------------------------------------------------------ // Structure Backends // ------------------------------------------------------------------------ /** * Creates a new {@link AbstractKeyedStateBackend} that is responsible for holding <b>keyed state</b> * and checkpointing it. * * <p><i>Keyed State</i> is state where each value is bound to a key. * * @param env * @param jobID * @param operatorIdentifier * @param keySerializer * @param numberOfKeyGroups * @param keyGroupRange * @param kvStateRegistry * * @param <K> The type of the keys by which the state is organized. * * @return The Keyed State Backend for the given job, operator, and key group range. * * @throws Exception This method may forward all exceptions that occur while instantiating the backend. */ <K> AbstractKeyedStateBackend<K> createKeyedStateBackend( Environment env, JobID jobID, String operatorIdentifier, TypeSerializer<K> keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry) throws Exception; /** * Creates a new {@link OperatorStateBackend} that can be used for storing operator state. * * <p>Operator state is state that is associated with parallel operator (or function) instances, * rather than with keys. * * @param env The runtime environment of the executing task. * @param operatorIdentifier The identifier of the operator whose state should be stored. * * @return The OperatorStateBackend for operator identified by the job and operator identifier. * * @throws Exception This method may forward all exceptions that occur while instantiating the backend. */ OperatorStateBackend createOperatorStateBackend(Environment env, String operatorIdentifier) throws Exception; }