/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.connectors.kafka.internals;

import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.functions.AssignerWithPunctuatedWatermarks;
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext;
import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
import org.apache.flink.streaming.connectors.kafka.config.StartupMode;
import org.apache.flink.streaming.util.serialization.KeyedDeserializationSchema;
import org.apache.flink.util.InstantiationUtil;
import org.apache.flink.util.SerializedValue;

import kafka.api.OffsetRequest;
import kafka.common.TopicAndPartition;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A fetcher that fetches data from Kafka brokers via the Kafka 0.8 low-level consumer API.
 * The fetcher also handles the explicit communication with ZooKeeper to fetch initial offsets
 * and to write offsets to ZooKeeper.
 *
 * @param <T> The type of elements produced by the fetcher.
 */
public class Kafka08Fetcher<T> extends AbstractFetcher<T, TopicAndPartition> {

	/** Marker element used to wake up the main work loop when consumer threads have shut down. */
	static final KafkaTopicPartitionState<TopicAndPartition> MARKER =
			new KafkaTopicPartitionState<>(new KafkaTopicPartition("n/a", -1), new TopicAndPartition("n/a", -1));

	private static final Logger LOG = LoggerFactory.getLogger(Kafka08Fetcher.class);

	// ------------------------------------------------------------------------

	/** The schema to convert between Kafka's byte messages and Flink's objects */
	private final KeyedDeserializationSchema<T> deserializer;

	/** The properties that configure the Kafka connection */
	private final Properties kafkaConfig;

	/** The subtask's runtime context */
	private final RuntimeContext runtimeContext;

	/** The queue of partitions that are currently not assigned to a broker connection */
	private final ClosableBlockingQueue<KafkaTopicPartitionState<TopicAndPartition>> unassignedPartitionsQueue;

	/** The behavior to use in case that an offset is not valid (any more) for a partition */
	private final long invalidOffsetBehavior;

	/** The interval in which to automatically commit (-1 if deactivated) */
	private final long autoCommitInterval;

	/** The handler that reads/writes offsets from/to ZooKeeper */
	private volatile ZookeeperOffsetHandler zookeeperOffsetHandler;

	/** Flag to track the main work loop as alive */
	private volatile boolean running = true;

	public Kafka08Fetcher(
			SourceContext<T> sourceContext,
			Map<KafkaTopicPartition, Long> assignedPartitionsWithInitialOffsets,
			SerializedValue<AssignerWithPeriodicWatermarks<T>> watermarksPeriodic,
			SerializedValue<AssignerWithPunctuatedWatermarks<T>> watermarksPunctuated,
			StreamingRuntimeContext runtimeContext,
			KeyedDeserializationSchema<T> deserializer,
			Properties kafkaProperties,
			long autoCommitInterval,
			boolean useMetrics) throws Exception {
		super(
				sourceContext,
				assignedPartitionsWithInitialOffsets,
				watermarksPeriodic,
				watermarksPunctuated,
				runtimeContext.getProcessingTimeService(),
				runtimeContext.getExecutionConfig().getAutoWatermarkInterval(),
				runtimeContext.getUserCodeClassLoader(),
				useMetrics);

		this.deserializer = checkNotNull(deserializer);
		this.kafkaConfig = checkNotNull(kafkaProperties);
		this.runtimeContext = runtimeContext;
		this.invalidOffsetBehavior = getInvalidOffsetBehavior(kafkaProperties);
		this.autoCommitInterval = autoCommitInterval;
		this.unassignedPartitionsQueue = new ClosableBlockingQueue<>();

		// initially, all these partitions are not assigned to a specific broker connection
		for (KafkaTopicPartitionState<TopicAndPartition> partition : subscribedPartitionStates()) {
			unassignedPartitionsQueue.add(partition);
		}
	}

	// ------------------------------------------------------------------------
	//  Main Work Loop
	// ------------------------------------------------------------------------

	@Override
	public void runFetchLoop() throws Exception {
		// the map from broker to the thread that is connected to that broker
		final Map<Node, SimpleConsumerThread<T>> brokerToThread = new HashMap<>();

		// this holds the possible exceptions from the concurrent broker connection threads
		final ExceptionProxy errorHandler = new ExceptionProxy(Thread.currentThread());

		// the offset handler handles the communication with ZooKeeper, to commit externally visible offsets
		final ZookeeperOffsetHandler zookeeperOffsetHandler = new ZookeeperOffsetHandler(kafkaConfig);
		this.zookeeperOffsetHandler = zookeeperOffsetHandler;

		PeriodicOffsetCommitter periodicCommitter = null;
		try {
			// offsets in the state may still be placeholder sentinel values if we are starting
			// fresh, or the checkpoint / savepoint state we were restored with had not completely
			// been replaced with actual offset values yet; replace those with actual offsets,
			// according to what the sentinel values represent.
			for (KafkaTopicPartitionState<TopicAndPartition> partition : subscribedPartitionStates()) {
				if (partition.getOffset() == KafkaTopicPartitionStateSentinel.EARLIEST_OFFSET) {
					// this will be replaced by an actual offset in SimpleConsumerThread
					partition.setOffset(OffsetRequest.EarliestTime());
				} else if (partition.getOffset() == KafkaTopicPartitionStateSentinel.LATEST_OFFSET) {
					// this will be replaced by an actual offset in SimpleConsumerThread
					partition.setOffset(OffsetRequest.LatestTime());
				} else if (partition.getOffset() == KafkaTopicPartitionStateSentinel.GROUP_OFFSET) {
					Long committedOffset = zookeeperOffsetHandler.getCommittedOffset(partition.getKafkaTopicPartition());
					if (committedOffset != null) {
						// the committed offset in ZK represents the next record to process,
						// so we subtract 1 to correctly represent our internal state
						partition.setOffset(committedOffset - 1);
					} else {
						// if we can't find an offset for a partition in ZK when using GROUP_OFFSETS,
						// we default to "auto.offset.reset" like the Kafka high-level consumer
						LOG.warn("No group offset can be found for partition {} in Zookeeper;" +
								" resetting starting offset to 'auto.offset.reset'", partition);

						partition.setOffset(invalidOffsetBehavior);
					}
				} else {
					// the partition already has a specific start offset and is ready to be consumed
				}
			}

			// start the periodic offset committer thread, if necessary
			if (autoCommitInterval > 0) {
				LOG.info("Starting periodic offset committer, with commit interval of {}ms", autoCommitInterval);

				periodicCommitter = new PeriodicOffsetCommitter(zookeeperOffsetHandler,
						subscribedPartitionStates(), errorHandler, autoCommitInterval);
				periodicCommitter.setName("Periodic Kafka partition offset committer");
				periodicCommitter.setDaemon(true);
				periodicCommitter.start();
			}

			// register offset metrics
			if (useMetrics) {
				final MetricGroup kafkaMetricGroup = runtimeContext.getMetricGroup().addGroup("KafkaConsumer");
				addOffsetStateGauge(kafkaMetricGroup);
			}

			// Main loop polling elements from the unassignedPartitions queue to the threads
			while (running) {
				// re-throw any exception from the concurrent fetcher threads
				errorHandler.checkAndThrowException();

				// wait for max 5 seconds trying to get partitions to assign
				// if threads shut down, this poll returns earlier, because the threads inject the
				// special marker into the queue
				List<KafkaTopicPartitionState<TopicAndPartition>> partitionsToAssign =
						unassignedPartitionsQueue.getBatchBlocking(5000);
				partitionsToAssign.remove(MARKER);

				if (!partitionsToAssign.isEmpty()) {
					LOG.info("Assigning {} partitions to broker threads", partitionsToAssign.size());
					Map<Node, List<KafkaTopicPartitionState<TopicAndPartition>>> partitionsWithLeaders =
							findLeaderForPartitions(partitionsToAssign, kafkaConfig);

					// assign the partitions to the leaders (maybe start the threads)
					for (Map.Entry<Node, List<KafkaTopicPartitionState<TopicAndPartition>>> partitionsWithLeader :
							partitionsWithLeaders.entrySet()) {
						final Node leader = partitionsWithLeader.getKey();
						final List<KafkaTopicPartitionState<TopicAndPartition>> partitions = partitionsWithLeader.getValue();
						SimpleConsumerThread<T> brokerThread = brokerToThread.get(leader);

						if (!running) {
							break;
						}

						if (brokerThread == null || !brokerThread.getNewPartitionsQueue().isOpen()) {
							// start new thread
							brokerThread =
									createAndStartSimpleConsumerThread(partitions, leader, errorHandler);
							brokerToThread.put(leader, brokerThread);
						} else {
							// put elements into queue of thread
							ClosableBlockingQueue<KafkaTopicPartitionState<TopicAndPartition>> newPartitionsQueue =
									brokerThread.getNewPartitionsQueue();

							for (KafkaTopicPartitionState<TopicAndPartition> fp : partitions) {
								if (!newPartitionsQueue.addIfOpen(fp)) {
									// we were unable to add the partition to the broker's queue
									// the broker has closed in the meantime (the thread will shut down)
									// create a new thread for connecting to this broker
									List<KafkaTopicPartitionState<TopicAndPartition>> seedPartitions = new ArrayList<>();
									seedPartitions.add(fp);
									brokerThread = createAndStartSimpleConsumerThread(seedPartitions, leader, errorHandler);
									brokerToThread.put(leader, brokerThread);
									newPartitionsQueue = brokerThread.getNewPartitionsQueue(); // update queue for the subsequent partitions
								}
							}
						}
					}
				} else {
					// there were no partitions to assign. Check if any broker threads shut down.
					// we get into this section of the code, if either the poll timed out, or the
					// blocking poll was woken up by the marker element
					Iterator<SimpleConsumerThread<T>> bttIterator = brokerToThread.values().iterator();
					while (bttIterator.hasNext()) {
						SimpleConsumerThread<T> thread = bttIterator.next();
						if (!thread.getNewPartitionsQueue().isOpen()) {
							LOG.info("Removing stopped consumer thread {}", thread.getName());
							bttIterator.remove();
						}
					}
				}

				if (brokerToThread.size() == 0 && unassignedPartitionsQueue.isEmpty()) {
					if (unassignedPartitionsQueue.close()) {
						LOG.info("All consumer threads are finished, there are no more unassigned partitions. Stopping fetcher");
						break;
					}
					// we end up here if somebody added something to the queue in the meantime --> continue to poll queue again
				}
			}
		} catch (InterruptedException e) {
			// this may be thrown because an exception on one of the concurrent fetcher threads
			// woke this thread up.
			// make sure we throw the root exception instead in that case
			errorHandler.checkAndThrowException();

			// no other root exception, throw the interrupted exception
			throw e;
		} finally {
			this.running = false;
			this.zookeeperOffsetHandler = null;

			// if we run a periodic committer thread, shut that down
			if (periodicCommitter != null) {
				periodicCommitter.shutdown();
			}

			// clear the interruption flag
			// this allows the joining on consumer threads (on a best-effort basis) to happen
			// even if this thread was already interrupted
			Thread.interrupted();

			// make sure that in any case (completion, abort, error), all spawned threads are stopped
			try {
				int runningThreads;
				do {
					// check whether threads are alive and cancel them
					runningThreads = 0;

					Iterator<SimpleConsumerThread<T>> threads = brokerToThread.values().iterator();
					while (threads.hasNext()) {
						SimpleConsumerThread<?> t = threads.next();
						if (t.isAlive()) {
							t.cancel();
							runningThreads++;
						} else {
							threads.remove();
						}
					}

					// wait for the threads to finish, before issuing a cancel call again
					if (runningThreads > 0) {
						for (SimpleConsumerThread<?> t : brokerToThread.values()) {
							t.join(500 / runningThreads + 1);
						}
					}
				} while (runningThreads > 0);
			} catch (InterruptedException ignored) {
				// waiting for the thread shutdown apparently got interrupted
				// restore interrupted state and continue
				Thread.currentThread().interrupt();
			} catch (Throwable t) {
				// we catch all here to preserve the original exception
				LOG.error("Exception while shutting down consumer threads", t);
			}

			try {
				zookeeperOffsetHandler.close();
			} catch (Throwable t) {
				// we catch all here to preserve the original exception
				LOG.error("Exception while shutting down ZookeeperOffsetHandler", t);
			}
		}
	}

	@Override
	public void cancel() {
		// signal the main thread to exit
		this.running = false;

		// make sure the main thread wakes up soon
		this.unassignedPartitionsQueue.addIfOpen(MARKER);
	}

	// ------------------------------------------------------------------------
	//  Kafka 0.8 specific class instantiation
	// ------------------------------------------------------------------------

	@Override
	public TopicAndPartition createKafkaPartitionHandle(KafkaTopicPartition partition) {
		return new TopicAndPartition(partition.getTopic(), partition.getPartition());
	}

	// ------------------------------------------------------------------------
	//  Offset handling
	// ------------------------------------------------------------------------

	@Override
	public void commitInternalOffsetsToKafka(Map<KafkaTopicPartition, Long> offsets) throws Exception {
		ZookeeperOffsetHandler zkHandler = this.zookeeperOffsetHandler;
		if (zkHandler != null) {
			try {
				// the ZK handler takes care of incrementing the offsets by 1 before committing
				zkHandler.prepareAndCommitOffsets(offsets);
			} catch (Exception e) {
				if (running) {
					throw e;
				} else {
					return;
				}
			}
		}

		// Set committed offsets in topic partition state
		KafkaTopicPartitionState<TopicAndPartition>[] partitions = subscribedPartitionStates();
		for (KafkaTopicPartitionState<TopicAndPartition> partition : partitions) {
			Long offset = offsets.get(partition.getKafkaTopicPartition());
			if (offset != null) {
				partition.setCommittedOffset(offset);
			}
		}
	}

	// ------------------------------------------------------------------------
	//  Utilities
	// ------------------------------------------------------------------------

	private SimpleConsumerThread<T> createAndStartSimpleConsumerThread(
			List<KafkaTopicPartitionState<TopicAndPartition>> seedPartitions,
			Node leader,
			ExceptionProxy errorHandler) throws IOException, ClassNotFoundException {
		// each thread needs its own copy of the deserializer, because the deserializer is
		// not necessarily thread safe
		final KeyedDeserializationSchema<T> clonedDeserializer =
				InstantiationUtil.clone(deserializer, runtimeContext.getUserCodeClassLoader());

		// seed thread with list of fetch partitions (otherwise it would shut down immediately again)
		SimpleConsumerThread<T> brokerThread = new SimpleConsumerThread<>(
				this, errorHandler, kafkaConfig,
				leader, seedPartitions, unassignedPartitionsQueue,
				clonedDeserializer, invalidOffsetBehavior);

		brokerThread.setName(String.format("SimpleConsumer - %s - broker-%s (%s:%d)",
				runtimeContext.getTaskName(), leader.id(), leader.host(), leader.port()));
		brokerThread.setDaemon(true);
		brokerThread.start();

		LOG.info("Starting thread {}", brokerThread.getName());
		return brokerThread;
	}

	/**
	 * Returns a list of unique topics for the given partitions.
	 *
	 * @param partitions The partitions
	 * @return A list of unique topics
	 */
	private static List<String> getTopics(List<KafkaTopicPartitionState<TopicAndPartition>> partitions) {
		HashSet<String> uniqueTopics = new HashSet<>();
		for (KafkaTopicPartitionState<TopicAndPartition> fp : partitions) {
			uniqueTopics.add(fp.getTopic());
		}
		return new ArrayList<>(uniqueTopics);
	}

	/**
	 * Finds the leaders for the given partitions.
	 *
	 * From a high level, the method does the following:
	 *   - Get a list of FetchPartitions (usually only a few partitions)
	 *   - Get the list of topics from the FetchPartitions list and request the partitions for the topics.
	 *     (Kafka doesn't support getting leaders for a set of partitions)
	 *   - Build a Map<Leader, List<FetchPartition>> where only the requested partitions are contained.
	 *
	 * @param partitionsToAssign fetch partitions list
	 * @param kafkaProperties properties for the Kafka consumer
	 * @return leader to partitions map
	 */
	private static Map<Node, List<KafkaTopicPartitionState<TopicAndPartition>>> findLeaderForPartitions(
			List<KafkaTopicPartitionState<TopicAndPartition>> partitionsToAssign,
			Properties kafkaProperties) throws Exception {

		if (partitionsToAssign.isEmpty()) {
			throw new IllegalArgumentException("Leader request for empty partitions list");
		}

		LOG.info("Refreshing leader information for partitions {}", partitionsToAssign);

		// this request is based on the topic names
		PartitionInfoFetcher infoFetcher = new PartitionInfoFetcher(getTopics(partitionsToAssign), kafkaProperties);
		infoFetcher.start();

		// NOTE: The kafka client apparently locks itself up sometimes
		// when it is interrupted, so we run it only in a separate thread.
		// since it sometimes refuses to shut down, we resort to the admittedly harsh
		// means of killing the thread after a timeout.
		KillerWatchDog watchDog = new KillerWatchDog(infoFetcher, 60000);
		watchDog.start();

		// this list contains ALL partitions of the requested topics
		List<KafkaTopicPartitionLeader> topicPartitionWithLeaderList = infoFetcher.getPartitions();

		// copy list to track unassigned partitions
		List<KafkaTopicPartitionState<TopicAndPartition>> unassignedPartitions = new ArrayList<>(partitionsToAssign);

		// final mapping from leader -> list(fetchPartition)
		Map<Node, List<KafkaTopicPartitionState<TopicAndPartition>>> leaderToPartitions = new HashMap<>();

		for (KafkaTopicPartitionLeader partitionLeader : topicPartitionWithLeaderList) {
			if (unassignedPartitions.size() == 0) {
				// we are done: all partitions are assigned
				break;
			}

			Iterator<KafkaTopicPartitionState<TopicAndPartition>> unassignedPartitionsIterator = unassignedPartitions.iterator();
			while (unassignedPartitionsIterator.hasNext()) {
				KafkaTopicPartitionState<TopicAndPartition> unassignedPartition = unassignedPartitionsIterator.next();
				if (unassignedPartition.getKafkaTopicPartition().equals(partitionLeader.getTopicPartition())) {
					// we found the leader for one of the fetch partitions
					Node leader = partitionLeader.getLeader();

					List<KafkaTopicPartitionState<TopicAndPartition>> partitionsOfLeader = leaderToPartitions.get(leader);
					if (partitionsOfLeader == null) {
						partitionsOfLeader = new ArrayList<>();
						leaderToPartitions.put(leader, partitionsOfLeader);
					}
					partitionsOfLeader.add(unassignedPartition);
					unassignedPartitionsIterator.remove(); // partition has been assigned
					break;
				}
			}
		}

		if (unassignedPartitions.size() > 0) {
			throw new RuntimeException("Unable to find a leader for partitions: " + unassignedPartitions);
		}

		LOG.debug("Partitions with assigned leaders {}", leaderToPartitions);
		return leaderToPartitions;
	}

	/**
	 * Retrieves the behavior of "auto.offset.reset" from the config properties.
	 * A partition needs to fall back to "auto.offset.reset" as the default offset when
	 * we can't find offsets in ZK to start from in {@link StartupMode#GROUP_OFFSETS} startup mode.
	 *
	 * @param config kafka consumer properties
	 * @return either OffsetRequest.LatestTime() or OffsetRequest.EarliestTime()
	 */
	private static long getInvalidOffsetBehavior(Properties config) {
		final String val = config.getProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "largest");
		if (val.equals("largest") || val.equals("latest")) {
			// largest is kafka 0.8, latest is kafka 0.9
			return OffsetRequest.LatestTime();
		} else {
			return OffsetRequest.EarliestTime();
		}
	}
}