/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.state.filesystem;
import org.apache.flink.api.common.JobID;
import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.state.CheckpointStreamFactory;
import org.apache.flink.runtime.state.StreamStateHandle;
import org.apache.flink.runtime.state.memory.ByteStreamStateHandle;
import org.apache.flink.util.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.UUID;
/**
* {@link org.apache.flink.runtime.state.CheckpointStreamFactory} that produces streams that
* write to a {@link FileSystem}.
*
* <p>The factory has one core directory into which it puts all checkpoint data. Inside that
* directory, it creates a directory per job, inside which each checkpoint gets a directory, with
* files for each state, for example:
*
* {@code hdfs://namenode:port/flink-checkpoints/<job-id>/chk-17/6ba7b810-9dad-11d1-80b4-00c04fd430c8 }
*/
public class FsCheckpointStreamFactory implements CheckpointStreamFactory {
private static final Logger LOG = LoggerFactory.getLogger(FsCheckpointStreamFactory.class);
/** Maximum size of state that is stored with the metadata, rather than in files */
private static final int MAX_FILE_STATE_THRESHOLD = 1024 * 1024;
/** Default size for the write buffer */
private static final int DEFAULT_WRITE_BUFFER_SIZE = 4096;
/** State below this size will be stored as part of the metadata, rather than in files */
private final int fileStateThreshold;
/** The directory (job specific) into which this initialized instance of the backend stores its data */
private final Path checkpointDirectory;
/** Cached handle to the file system for file operations */
private final FileSystem filesystem;
/**
 * Creates a new checkpoint stream factory that stores its checkpoint data in the file system
 * and location described by the given path. The data of this job goes into a sub-directory
 * named after the job ID, which is created as part of the construction.
 *
 * <p>The file system is resolved from the given path (see {@link Path#getFileSystem()}), so a
 * file system for the path's scheme (e.g., 'file://', 'hdfs://', or 'S3://') must be available.
 *
 * <p>For a location targeting HDFS, this means that the URI must either specify the authority
 * (host and port), or that the Hadoop configuration that describes that information must be in
 * the classpath.
 *
 * @param checkpointDataUri The path describing the file system (scheme and optionally
 *                          authority) and the base directory for checkpoint data.
 * @param jobId The ID of the job; its string form is used as the name of the job-specific
 *              directory.
 * @param fileStateSizeThreshold State up to this size will be stored as part of the metadata,
 *                               rather than in files. Must be between zero and
 *                               {@code MAX_FILE_STATE_THRESHOLD} (inclusive).
 *
 * @throws IllegalArgumentException Thrown, if the threshold is negative or exceeds the maximum.
 * @throws IOException Thrown, if the file system cannot be obtained or the job directory
 *                     cannot be set up.
 */
public FsCheckpointStreamFactory(
        Path checkpointDataUri,
        JobID jobId,
        int fileStateSizeThreshold) throws IOException {

    // validate the threshold before touching the file system
    if (fileStateSizeThreshold < 0) {
        throw new IllegalArgumentException("The threshold for file state size must be zero or larger.");
    }
    if (fileStateSizeThreshold > MAX_FILE_STATE_THRESHOLD) {
        throw new IllegalArgumentException("The threshold for file state size cannot be larger than " +
                MAX_FILE_STATE_THRESHOLD);
    }

    this.fileStateThreshold = fileStateSizeThreshold;
    this.filesystem = checkpointDataUri.getFileSystem();
    this.checkpointDirectory = createBasePath(this.filesystem, checkpointDataUri, jobId);

    // parameterized logging already skips the formatting when debug is disabled
    LOG.debug("Initialed file stream factory to URI {}.", checkpointDirectory);
}
/**
 * Intentionally does nothing.
 */
@Override
public void close() throws Exception {
    // no-op
}
/**
 * Creates an output stream that writes state for the given checkpoint into the checkpoint's
 * directory ({@code chk-<checkpointID>} below the job directory).
 *
 * <p>The stream's write buffer is at least as large as the file state threshold.
 *
 * @param checkpointID The ID of the checkpoint; determines the target directory.
 * @param timestamp The checkpoint timestamp; not used by this implementation.
 * @return A new, open checkpoint state output stream.
 * @throws IllegalStateException Thrown, if the file system or the checkpoint directory is not set.
 */
@Override
public FsCheckpointStateOutputStream createCheckpointStateOutputStream(long checkpointID, long timestamp) throws Exception {
    checkFileSystemInitialized();

    final int writeBufferSize = Math.max(DEFAULT_WRITE_BUFFER_SIZE, fileStateThreshold);
    final Path targetDirectory = createCheckpointDirPath(checkpointDirectory, checkpointID);

    return new FsCheckpointStateOutputStream(targetDirectory, filesystem, writeBufferSize, fileStateThreshold);
}
// ------------------------------------------------------------------------
// utilities
// ------------------------------------------------------------------------
/**
 * Verifies that both the file system handle and the checkpoint directory are set.
 *
 * @throws IllegalStateException Thrown, if either of the two is {@code null}.
 */
private void checkFileSystemInitialized() throws IllegalStateException {
    final boolean initialized = filesystem != null && checkpointDirectory != null;
    if (!initialized) {
        throw new IllegalStateException("filesystem has not been re-initialized after deserialization");
    }
}
/**
 * Builds the job-specific directory path (the job ID's string form below the given base
 * directory) and asks the file system to create it.
 *
 * @param fs The file system in which the directory is created.
 * @param checkpointDirectory The base directory for checkpoint data.
 * @param jobID The ID of the job whose directory is created.
 * @return The path of the job-specific directory.
 * @throws IOException Thrown, if the file system fails while creating the directory.
 */
protected Path createBasePath(FileSystem fs, Path checkpointDirectory, JobID jobID) throws IOException {
    final String jobDirectoryName = jobID.toString();
    final Path jobDirectory = new Path(checkpointDirectory, jobDirectoryName);
    fs.mkdirs(jobDirectory);
    return jobDirectory;
}
/**
 * Builds the path of the directory for a single checkpoint, named {@code chk-<checkpointID>}.
 * This only constructs the path; no directory is created here.
 *
 * @param checkpointDirectory The parent directory of the checkpoint directory.
 * @param checkpointID The ID of the checkpoint.
 * @return The path of the checkpoint's directory.
 */
protected Path createCheckpointDirPath(Path checkpointDirectory, long checkpointID) {
    final String directoryName = "chk-" + checkpointID;
    return new Path(checkpointDirectory, directoryName);
}
/**
 * Returns a short description of this factory that includes its checkpoint directory.
 */
@Override
public String toString() {
    final String location = String.valueOf(checkpointDirectory);
    return "File Stream Factory @ " + location;
}
/**
* A {@link CheckpointStreamFactory.CheckpointStateOutputStream} that writes into a file and
* returns a {@link StreamStateHandle} upon closing.
*/
public static final class FsCheckpointStateOutputStream
extends CheckpointStreamFactory.CheckpointStateOutputStream {
private final byte[] writeBuffer;
private int pos;
private FSDataOutputStream outStream;
private final int localStateThreshold;
private final Path basePath;
private final FileSystem fs;
private Path statePath;
private volatile boolean closed;
/**
 * Creates a new stream that buffers writes in memory and, once the buffer needs to be
 * flushed, writes into a file with a random name inside the given directory.
 *
 * @param basePath The directory in which the state file is created.
 * @param fs The file system used to create and delete the state file.
 * @param bufferSize The size of the in-memory write buffer, in bytes.
 * @param localStateThreshold State up to this size (in bytes) that was never flushed to a
 *                            file is returned as an in-memory handle by
 *                            {@link #closeAndGetHandle()}.
 *
 * @throws IllegalArgumentException Thrown, if the buffer is smaller than the local state
 *                                  threshold.
 */
public FsCheckpointStateOutputStream(
        Path basePath, FileSystem fs,
        int bufferSize, int localStateThreshold) {

    // state below the threshold must fit into the buffer without being flushed to a file
    if (bufferSize < localStateThreshold) {
        throw new IllegalArgumentException(
                "The write buffer size (" + bufferSize + ") must not be smaller than the " +
                "local state threshold (" + localStateThreshold + ").");
    }

    this.basePath = basePath;
    this.fs = fs;
    this.writeBuffer = new byte[bufferSize];
    this.localStateThreshold = localStateThreshold;
}
/**
 * Writes a single byte into the write buffer, flushing the buffer first if it is full.
 *
 * @param b The byte to write (only the low eight bits are used).
 * @throws IOException Thrown, if the buffer had to be flushed and the flush failed, which
 *                     includes the case that the stream has been closed.
 */
@Override
public void write(int b) throws IOException {
    // a full buffer is drained first; flush() rejects the call on a closed stream
    if (writeBuffer.length <= pos) {
        flush();
    }
    writeBuffer[pos++] = (byte) b;
}
/**
 * Writes {@code len} bytes from the given array, starting at {@code off}.
 *
 * <p>Chunks smaller than half the write buffer are copied into the buffer (flushing once if
 * they do not fit into the remaining space). Larger chunks bypass the buffer: the buffer is
 * flushed and the bytes are written directly to the underlying file stream.
 *
 * @param b The array holding the bytes to write.
 * @param off The offset of the first byte to write.
 * @param len The number of bytes to write.
 * @throws IOException Thrown, if flushing or writing to the underlying stream fails.
 */
@Override
public void write(byte[] b, int off, int len) throws IOException {
    // large chunk: drain what is buffered, then hand the bytes straight to the file stream
    if (len >= writeBuffer.length / 2) {
        flush();
        outStream.write(b, off, len);
        return;
    }

    // small chunk: goes through the write buffer
    int sourceOffset = off;
    int bytesLeft = len;

    final int freeSpace = writeBuffer.length - pos;
    if (bytesLeft > freeSpace) {
        // fill the buffer up completely, then flush it to make room for the rest
        System.arraycopy(b, sourceOffset, writeBuffer, pos, freeSpace);
        sourceOffset += freeSpace;
        bytesLeft -= freeSpace;
        pos += freeSpace;
        flush();
    }

    // whatever is left now fits into the buffer
    System.arraycopy(b, sourceOffset, writeBuffer, pos, bytesLeft);
    pos += bytesLeft;
}
/**
 * Returns the current position: the number of buffered bytes plus the position reported by
 * the underlying file stream, if one has been opened already.
 *
 * @return The sum of the buffer position and the file stream position.
 * @throws IOException Thrown, if the underlying stream fails to report its position.
 */
@Override
public long getPos() throws IOException {
    long positionInFile = 0;
    if (outStream != null) {
        positionInFile = outStream.getPos();
    }
    return pos + positionInFile;
}
/**
 * Writes the buffered bytes to the underlying file stream. The file stream is created on the
 * first call, even when the buffer is empty.
 *
 * @throws IOException Thrown, if the stream has been closed, if the file stream cannot be
 *                     created, or if writing the buffered bytes fails.
 */
@Override
public void flush() throws IOException {
    if (closed) {
        throw new IOException("closed");
    }

    // the file is opened lazily, on the first flush
    if (outStream == null) {
        createStream();
    }

    // push out the buffered bytes and reset the buffer
    if (pos > 0) {
        outStream.write(writeBuffer, 0, pos);
        pos = 0;
    }
}
/**
 * Writes all buffered bytes to the underlying file stream and then syncs that stream.
 *
 * <p>The buffer is flushed first for two reasons: bytes that still sit in the write buffer
 * would otherwise not be covered by the sync, and the underlying file stream only exists
 * after the first flush (syncing before that would fail with a {@code NullPointerException}).
 *
 * @throws IOException Thrown, if the stream has been closed, or if flushing or syncing the
 *                     underlying stream fails.
 */
@Override
public void sync() throws IOException {
    // creates the file stream if necessary and drains the write buffer into it
    flush();
    outStream.sync();
}
/**
 * Tells whether this stream has been closed.
 *
 * @return {@code true} once the stream was closed, {@code false} while it is still open.
 */
public boolean isClosed() {
    return closed;
}
/**
 * Closes the stream and discards its data. If a file was already created, the file stream is
 * closed, the file is deleted, and the parent directory is removed if it is empty (cleanup
 * through the auto close feature, for example).
 *
 * <p>This method throws no exception if closing or deleting fails; failures are only logged.
 * Calling it on an already closed stream has no effect.
 */
@Override
public void close() {
    if (closed) {
        return;
    }
    closed = true;

    // park the position at the end of the buffer, so that write requests go through
    // 'flush()', where they notice that the stream is closed
    pos = writeBuffer.length;

    // nothing was written to a file yet, so there is nothing to clean up
    if (outStream == null) {
        return;
    }

    try {
        outStream.close();
    } catch (Throwable closeFailure) {
        LOG.warn("Could not close the state stream for {}.", statePath, closeFailure);
    } finally {
        // the data is discarded, so remove the file and (if empty) its directory
        try {
            fs.delete(statePath, false);
            try {
                FileUtils.deletePathIfEmpty(fs, basePath);
            } catch (Exception parentDeletionFailure) {
                LOG.debug("Could not delete the parent directory {}.", basePath, parentDeletionFailure);
            }
        } catch (Exception deletionFailure) {
            LOG.warn("Cannot delete closed and discarded state stream for {}.", statePath, deletionFailure);
        }
    }
}
/**
 * Closes the stream and returns a handle to the written state.
 *
 * <ul>
 *   <li>If nothing was ever written, {@code null} is returned.</li>
 *   <li>If no file was created yet and the buffered data does not exceed the local state
 *       threshold, the bytes are returned in a {@link ByteStreamStateHandle}.</li>
 *   <li>Otherwise the buffer is flushed, the file is closed, and a {@link FileStateHandle}
 *       pointing to the file is returned.</li>
 * </ul>
 *
 * <p>If flushing or closing the file fails, the file stream is closed (best effort), the file
 * and its (empty) parent directory are removed (best effort), and the stream is marked closed.
 *
 * @return The handle to the state, or {@code null} if no data was written.
 * @throws IOException Thrown, if the stream was already closed, or if the file could not be
 *                     flushed and closed.
 */
@Override
public StreamStateHandle closeAndGetHandle() throws IOException {
    // check if there was nothing ever written
    if (outStream == null && pos == 0) {
        return null;
    }

    synchronized (this) {
        if (closed) {
            throw new IOException("Stream has already been closed and discarded.");
        }

        // small state that never hit the file system is handed back as bytes
        if (outStream == null && pos <= localStateThreshold) {
            closed = true;
            byte[] bytes = Arrays.copyOf(writeBuffer, pos);
            pos = writeBuffer.length;
            return new ByteStreamStateHandle(createStatePath().toString(), bytes);
        }

        try {
            flush();
            pos = writeBuffer.length;

            long size = -1L;
            // make a best effort attempt to figure out the size
            try {
                size = outStream.getPos();
            } catch (Exception ignored) {
                // the size stays unknown (-1)
            }

            outStream.close();
            return new FileStateHandle(statePath, size);
        } catch (Exception exception) {
            // do not leak the file stream if it was opened before the failure
            if (outStream != null) {
                try {
                    outStream.close();
                } catch (Exception closeFailure) {
                    LOG.debug("Could not close the state stream for {}.", statePath, closeFailure);
                }
            }

            // statePath is null if no file could be created at all; nothing to delete then
            if (statePath != null) {
                try {
                    fs.delete(statePath, false);
                    try {
                        FileUtils.deletePathIfEmpty(fs, basePath);
                    } catch (Exception parentDirDeletionFailure) {
                        LOG.debug("Could not delete the parent directory {}.", basePath, parentDirDeletionFailure);
                    }
                } catch (Exception deleteException) {
                    LOG.warn("Could not delete the checkpoint stream file {}.",
                            statePath, deleteException);
                }
            }

            throw new IOException("Could not flush and close the file system " +
                    "output stream to " + statePath + " in order to obtain the " +
                    "stream state handle", exception);
        } finally {
            closed = true;
            // as in close(): make later write requests go through 'flush()', where they
            // notice that the stream is closed (also after a failed flush)
            pos = writeBuffer.length;
        }
    }
}
/**
 * Creates a path inside the base directory with a random UUID as the file name.
 *
 * @return A new, randomly named path below the base directory.
 */
private Path createStatePath() {
    final String fileName = UUID.randomUUID().toString();
    return new Path(basePath, fileName);
}
/**
 * Opens the file stream for this state: makes sure the checkpoint directory exists and then
 * tries up to ten times to create a file under a freshly generated random name (without
 * overwriting an existing file).
 *
 * @throws IOException Thrown, if the directory cannot be set up, or if no stream could be
 *                     opened; in the latter case the cause is the failure of the last attempt.
 */
private void createStream() throws IOException {
    // make sure the directory for that specific checkpoint exists
    fs.mkdirs(basePath);

    Exception lastFailure = null;
    int attemptsMade = 0;

    while (attemptsMade < 10) {
        attemptsMade++;
        try {
            // every attempt uses a new random file name
            statePath = createStatePath();
            outStream = fs.create(statePath, false);
            break;
        } catch (Exception e) {
            lastFailure = e;
        }
    }

    if (outStream != null) {
        return;
    }
    throw new IOException("Could not open output stream for state backend", lastFailure);
}
}
}