OffsetStorageWriter.java example

Explorer
kafka-master
- kafka-trunk
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kafka.connect.storage;

import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.util.Callback;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.Future;

/**
 * <p>
 * OffsetStorageWriter is a buffered writer that wraps the simple OffsetBackingStore interface.
 * It maintains a copy of the key-value data in memory and buffers writes. It allows you to take
 * a snapshot, which can then be asynchronously flushed to the backing store while new writes
 * continue to be processed. This allows Kafka Connect to process offset commits in the background
 * while continuing to process messages.
 * </p>
 * <p>
 * Connect uses an OffsetStorage implementation to save state about the current progress of
 * source (import to Kafka) jobs, which may have many input partitions and "offsets" may not be as
 * simple as they are for Kafka partitions or files. Offset storage is not required for sink jobs
 * because they can use Kafka's native offset storage (or the sink data store can handle offset
 * storage to achieve exactly once semantics).
 * </p>
 * <p>
 * Both partitions and offsets are generic data objects. This allows different connectors to use
 * whatever representation they need, even arbitrarily complex records. These are translated
 * internally into the serialized form the OffsetBackingStore uses.
 * </p>
 * <p>
 * Note that this only provides write functionality. This is intentional to ensure stale data is
 * never read. Offset data should only be read during startup or reconfiguration of a task. By
 * always serving those requests by reading the values from the backing store, we ensure we never
 * accidentally use stale data. (One example of how this can occur: a task is processing input
 * partition A, writing offsets; a reconfiguration causes partition A to be reassigned elsewhere;
 * a later reconfiguration causes partition A to be reassigned back to this node, but by now the
 * offset data held here is out of date.) Since these offsets are created and managed by the
 * connector itself, there's no way
 * for the offset management layer to know which keys are "owned" by which tasks at any given
 * time.
 * </p>
 * <p>
 * This class is not thread-safe. It should only be accessed from a Task's processing thread.
 * </p>
 */
public class OffsetStorageWriter {
    private static final Logger log = LoggerFactory.getLogger(OffsetStorageWriter.class);

    private final OffsetBackingStore backingStore;
    private final Converter keyConverter;
    private final Converter valueConverter;
    private final String namespace;
    // Offset data in Connect format; accumulates writes until the next beginFlush()
    private Map<Map<String, Object>, Map<String, Object>> data = new HashMap<>();

    // Snapshot taken by beginFlush(); non-null exactly while a flush is in progress.
    // Not synchronized, should only be accessed by flush thread
    private Map<Map<String, Object>, Map<String, Object>> toFlush = null;
    // Unique ID for each flush request to handle callbacks after timeouts
    private long currentFlushId = 0;

    /**
     * Create a writer that buffers offsets and flushes them to the given backing store.
     *
     * @param backingStore the store that serialized offsets are written to
     * @param namespace the namespace that is prepended to every serialized key
     * @param keyConverter converter used to serialize the [namespace, partition] key
     * @param valueConverter converter used to serialize the offset value
     */
    public OffsetStorageWriter(OffsetBackingStore backingStore,
                               String namespace, Converter keyConverter, Converter valueConverter) {
        this.backingStore = backingStore;
        this.namespace = namespace;
        this.keyConverter = keyConverter;
        this.valueConverter = valueConverter;
    }

    /**
     * Set an offset for a partition using Connect data values. The value is only buffered in
     * memory; it replaces any offset previously buffered for the same partition.
     *
     * @param partition the partition to store an offset for
     * @param offset the offset
     */
    // The casts only widen the wildcard value type for internal storage; the maps are never
    // written to through these references, and their contents are validated in doFlush().
    @SuppressWarnings("unchecked")
    public synchronized void offset(Map<String, ?> partition, Map<String, ?> offset) {
        data.put((Map<String, Object>) partition, (Map<String, Object>) offset);
    }

    /**
     * @return true while a snapshot taken by {@link #beginFlush} has been neither completed nor
     *         cancelled
     */
    private boolean flushing() {
        return toFlush != null;
    }

    /**
     * Performs the first step of a flush operation, snapshotting the current state. This does not
     * actually initiate the flush with the underlying storage.
     *
     * @return true if a flush was initiated, false if no data was available
     * @throws ConnectException if a previous flush is still in progress
     */
    public synchronized boolean beginFlush() {
        if (flushing()) {
            log.error("Invalid call to OffsetStorageWriter flush() while already flushing, the "
                    + "framework should not allow this");
            throw new ConnectException("OffsetStorageWriter is already flushing");
        }

        if (data.isEmpty())
            return false;

        // Hand the buffered offsets over to the flush and start a fresh buffer so that new
        // writes can continue while the snapshot is being written.
        toFlush = data;
        data = new HashMap<>();
        return true;
    }

    /**
     * Flush the snapshot taken by {@link #beginFlush}. This is non-blocking: it serializes the
     * snapshotted offsets and asynchronously writes them to the backing store.
     * <p>
     * If serialization fails, the failure is logged, the callback (if any) is invoked with the
     * error, and null is returned instead of a Future. In that case the writer is still in the
     * flushing state; nothing in this method clears it.
     * </p>
     *
     * @param callback invoked when the write completes or when serialization fails; may be null
     * @return the Future returned by the backing store, or null if the offsets could not be
     *         serialized
     */
    public Future<Void> doFlush(final Callback<Void> callback) {
        // Remember which flush this is so that a late completion of an already-cancelled flush
        // can be recognized and ignored in handleFinishWrite().
        final long flushId = currentFlushId;

        // Serialize
        Map<ByteBuffer, ByteBuffer> offsetsSerialized;
        try {
            offsetsSerialized = new HashMap<>();
            for (Map.Entry<Map<String, Object>, Map<String, Object>> entry : toFlush.entrySet()) {
                // Offsets are specified as schemaless to the converter, using whatever internal schema is appropriate
                // for that data. The only enforcement of the format is here.
                OffsetUtils.validateFormat(entry.getKey());
                OffsetUtils.validateFormat(entry.getValue());
                // When serializing the key, we add in the namespace information so the key is [namespace, real key]
                byte[] key = keyConverter.fromConnectData(namespace, null, Arrays.asList(namespace, entry.getKey()));
                ByteBuffer keyBuffer = (key != null) ? ByteBuffer.wrap(key) : null;
                byte[] value = valueConverter.fromConnectData(namespace, null, entry.getValue());
                ByteBuffer valueBuffer = (value != null) ? ByteBuffer.wrap(value) : null;
                offsetsSerialized.put(keyBuffer, valueBuffer);
            }
        } catch (Throwable t) {
            // Must handle errors properly here or the writer will be left mid-flush forever and be
            // unable to make progress.
            log.error("CRITICAL: Failed to serialize offset data, making it impossible to commit "
                    + "offsets under namespace {}. This likely won't recover unless the "
                    + "unserializable partition or offset information is overwritten.", namespace);
            log.error("Cause of serialization failure:", t);
            // The callback is optional (the success path below also tolerates null), so guard
            // here too rather than throwing a NullPointerException out of the error handler.
            if (callback != null)
                callback.onCompletion(t, null);
            return null;
        }

        // And submit the data
        log.debug("Submitting {} entries to backing store", offsetsSerialized.size());
        // Parameterized so the map is only rendered to a string when debug logging is enabled.
        log.debug("The offsets are: {}", toFlush);
        return backingStore.set(offsetsSerialized, new Callback<Void>() {
            @Override
            public void onCompletion(Throwable error, Void result) {
                // Only report completion for the flush that is still current; completions of
                // flushes that were cancelled in the meantime are dropped.
                boolean isCurrent = handleFinishWrite(flushId, error, result);
                if (isCurrent && callback != null)
                    callback.onCompletion(error, result);
            }
        });
    }

    /**
     * Cancel a flush that has been initiated by {@link #beginFlush}. This should not be called if
     * {@link #doFlush} has already been invoked. It should be used if an operation performed
     * between beginFlush and doFlush failed. Calling this when no flush is in progress is a no-op.
     */
    public synchronized void cancelFlush() {
        // Verify we're still flushing data to handle a race between cancelFlush() calls from up the
        // call stack and callbacks from the write request to underlying storage
        if (flushing()) {
            // Just recombine the data and place it back in the primary storage. Entries written
            // since the snapshot are applied on top, so the newer offset wins for each partition.
            toFlush.putAll(data);
            data = toFlush;
            // Bump the ID so that a late callback for the cancelled flush is ignored.
            currentFlushId++;
            toFlush = null;
        }
    }

    /**
     * Handle completion of a write. Returns true if this callback is for the current flush
     * operation, false if it's for an old operation that should now be ignored.
     *
     * @param flushId the flush ID captured when the write was submitted
     * @param error the error reported by the backing store, or null on success
     * @param result unused; always a Void value
     */
    private synchronized boolean handleFinishWrite(long flushId, Throwable error, Void result) {
        // Callbacks need to be handled carefully since the flush operation may have already timed
        // out and been cancelled.
        if (flushId != currentFlushId)
            return false;

        if (error != null) {
            // Failed write: put the snapshot back so the offsets are retried on the next flush.
            cancelFlush();
        } else {
            // Successful write: the snapshot is no longer needed.
            currentFlushId++;
            toFlush = null;
        }
        return true;
    }
}