/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.connectors.kafka;

import java.util.Properties;

import org.apache.flink.api.common.functions.IterationRuntimeContext;
import org.apache.flink.api.common.functions.RichFunction;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.java.typeutils.GenericTypeInfo;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.operators.StreamSink;
import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkFixedPartitioner;
import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaDelegatePartitioner;
import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner;
import org.apache.flink.streaming.connectors.kafka.partitioner.KafkaPartitioner;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchema;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper;
import org.apache.flink.streaming.util.serialization.SerializationSchema;

import org.apache.kafka.clients.producer.ProducerRecord;

import static org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducerBase.getPartitionsByTopic;
import static org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducerBase.getPropertiesFromBrokerList;

/**
 * Flink Sink to produce data into a Kafka topic. This producer is compatible with Kafka 0.10.x.
 *
 * <p>Implementation note: This producer is a hybrid between a regular sink function (a)
 * and a custom operator (b).
 *
 * <p>For (a), the class implements the SinkFunction and RichFunction interfaces.
 * For (b), it extends the StreamSink class.
 *
 * <p>Details about approach (a):
 * Producers for Kafka versions before 0.10 only follow approach (a), allowing users to add the
 * producer via the DataStream.addSink() method. Since the APIs exposed in that variant do not
 * allow accessing the timestamp attached to the record, the Kafka 0.10 producer offers a second
 * invocation option, approach (b).
 *
 * <p>Details about approach (b):
 * Kafka 0.10 supports writing the timestamp attached to a record to Kafka. When adding the
 * FlinkKafkaProducer010 using the FlinkKafkaProducer010.writeToKafkaWithTimestamps() method, the
 * Kafka producer can access the internal record timestamp of the record and write it to Kafka.
 *
 * <p>All methods and constructors in this class are marked with the approach they are needed for.
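 *
 * <p>A minimal usage sketch of both approaches; the topic name, broker address, and the use of
 * SimpleStringSchema are illustrative placeholders, not prescribed by this class:
 * <pre>{@code
 * DataStream<String> stream = ...;
 * Properties props = new Properties();
 * props.setProperty("bootstrap.servers", "localhost:9092");
 *
 * // Approach (a): plain sink, record timestamps are not written to Kafka
 * stream.addSink(new FlinkKafkaProducer010<>("my-topic", new SimpleStringSchema(), props));
 *
 * // Approach (b): custom operator that can additionally write record timestamps
 * FlinkKafkaProducer010.FlinkKafkaProducer010Configuration<String> config =
 *     FlinkKafkaProducer010.writeToKafkaWithTimestamps(
 *         stream, "my-topic", new SimpleStringSchema(), props);
 * config.setWriteTimestampToKafka(true);
 * }</pre>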
 */
public class FlinkKafkaProducer010<T> extends StreamSink<T> implements SinkFunction<T>, RichFunction {

	/**
	 * Flag controlling whether we are writing the Flink record's timestamp into Kafka.
	 */
	private boolean writeTimestampToKafka = false;

	// ---------------------- "Constructors" for timestamp writing ------------------

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * <p>This method allows writing timestamps to Kafka; it follows approach (b) (see above).
	 *
	 * @param inStream The stream to write to Kafka
	 * @param topicId ID of the Kafka topic.
	 * @param serializationSchema User defined serialization schema supporting key/value messages
	 * @param producerConfig Properties with the producer configuration.
	 */
	public static <T> FlinkKafkaProducer010Configuration<T> writeToKafkaWithTimestamps(DataStream<T> inStream,
			String topicId, KeyedSerializationSchema<T> serializationSchema, Properties producerConfig) {
		return writeToKafkaWithTimestamps(inStream, topicId, serializationSchema, producerConfig,
				new FlinkFixedPartitioner<T>());
	}

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * <p>This method allows writing timestamps to Kafka; it follows approach (b) (see above).
	 *
	 * @param inStream The stream to write to Kafka
	 * @param topicId ID of the Kafka topic.
	 * @param serializationSchema User defined (keyless) serialization schema.
	 * @param producerConfig Properties with the producer configuration.
	 */
	public static <T> FlinkKafkaProducer010Configuration<T> writeToKafkaWithTimestamps(DataStream<T> inStream,
			String topicId, SerializationSchema<T> serializationSchema, Properties producerConfig) {
		return writeToKafkaWithTimestamps(inStream, topicId,
				new KeyedSerializationSchemaWrapper<>(serializationSchema), producerConfig,
				new FlinkFixedPartitioner<T>());
	}

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * <p>This method allows writing timestamps to Kafka; it follows approach (b) (see above).
	 *
	 * @param inStream The stream to write to Kafka
	 * @param topicId The name of the target topic
	 * @param serializationSchema A serializable serialization schema for turning user objects into a kafka-consumable byte[] supporting key/value messages
	 * @param producerConfig Configuration properties for the KafkaProducer. 'bootstrap.servers' is the only required argument.
	 * @param customPartitioner A serializable partitioner for assigning messages to Kafka partitions.
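	 *
	 * <p>A sketch of supplying a custom partitioner; stream, schema (a KeyedSerializationSchema),
	 * props, and MyType are placeholders, and the modulo-on-hash logic is purely illustrative:
	 * <pre>{@code
	 * FlinkKafkaProducer010.writeToKafkaWithTimestamps(stream, "my-topic", schema, props,
	 *     new FlinkKafkaPartitioner<MyType>() {
	 *         public int partition(MyType record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
	 *             // Mask the sign bit so the index is never negative.
	 *             return partitions[(record.hashCode() & 0x7fffffff) % partitions.length];
	 *         }
	 *     });
	 * }</pre>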
	 */
	public static <T> FlinkKafkaProducer010Configuration<T> writeToKafkaWithTimestamps(DataStream<T> inStream,
			String topicId, KeyedSerializationSchema<T> serializationSchema, Properties producerConfig,
			FlinkKafkaPartitioner<T> customPartitioner) {
		GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
		FlinkKafkaProducer010<T> kafkaProducer =
				new FlinkKafkaProducer010<>(topicId, serializationSchema, producerConfig, customPartitioner);
		SingleOutputStreamOperator<Object> transformation =
				inStream.transform("FlinkKafkaProducer 0.10.x", objectTypeInfo, kafkaProducer);
		return new FlinkKafkaProducer010Configuration<>(transformation, kafkaProducer);
	}

	// ---------------------- Regular constructors w/o timestamp support ------------------

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * @param brokerList Comma separated addresses of the brokers
	 * @param topicId ID of the Kafka topic.
	 * @param serializationSchema User defined (keyless) serialization schema.
	 */
	public FlinkKafkaProducer010(String brokerList, String topicId, SerializationSchema<T> serializationSchema) {
		this(topicId, new KeyedSerializationSchemaWrapper<>(serializationSchema),
				getPropertiesFromBrokerList(brokerList), new FlinkFixedPartitioner<T>());
	}

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * @param topicId ID of the Kafka topic.
	 * @param serializationSchema User defined (keyless) serialization schema.
	 * @param producerConfig Properties with the producer configuration.
	 */
	public FlinkKafkaProducer010(String topicId, SerializationSchema<T> serializationSchema, Properties producerConfig) {
		this(topicId, new KeyedSerializationSchemaWrapper<>(serializationSchema), producerConfig,
				new FlinkFixedPartitioner<T>());
	}

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * @param topicId The topic to write data to
	 * @param serializationSchema A (keyless) serializable serialization schema for turning user objects into a kafka-consumable byte[]
	 * @param producerConfig Configuration properties for the KafkaProducer. 'bootstrap.servers' is the only required argument.
	 * @param customPartitioner A serializable partitioner for assigning messages to Kafka partitions (when passing null, we'll use Kafka's partitioner)
	 */
	public FlinkKafkaProducer010(String topicId, SerializationSchema<T> serializationSchema,
			Properties producerConfig, FlinkKafkaPartitioner<T> customPartitioner) {
		this(topicId, new KeyedSerializationSchemaWrapper<>(serializationSchema), producerConfig, customPartitioner);
	}

	// ------------------- Key/Value serialization schema constructors ----------------------

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * @param brokerList Comma separated addresses of the brokers
	 * @param topicId ID of the Kafka topic.
	 * @param serializationSchema User defined serialization schema supporting key/value messages
	 */
	public FlinkKafkaProducer010(String brokerList, String topicId, KeyedSerializationSchema<T> serializationSchema) {
		this(topicId, serializationSchema, getPropertiesFromBrokerList(brokerList), new FlinkFixedPartitioner<T>());
	}

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * @param topicId ID of the Kafka topic.
	 * @param serializationSchema User defined serialization schema supporting key/value messages
	 * @param producerConfig Properties with the producer configuration.
	 */
	public FlinkKafkaProducer010(String topicId, KeyedSerializationSchema<T> serializationSchema, Properties producerConfig) {
		this(topicId, serializationSchema, producerConfig, new FlinkFixedPartitioner<T>());
	}

	/**
	 * Creates a Kafka producer.
	 *
	 * <p>This constructor does not allow writing timestamps to Kafka; it follows approach (a) (see above).
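	 *
	 * <p>A sketch of the approach (a) path with the optional reliability settings; stream,
	 * schema (a KeyedSerializationSchema), props, and MyType are placeholders:
	 * <pre>{@code
	 * FlinkKafkaProducer010<MyType> producer =
	 *     new FlinkKafkaProducer010<>("my-topic", schema, props, new FlinkFixedPartitioner<MyType>());
	 * producer.setLogFailuresOnly(false);  // fail the job on asynchronous send errors
	 * producer.setFlushOnCheckpoint(true); // wait for in-flight records on checkpoints
	 * stream.addSink(producer);
	 * }</pre>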
	 */
	public FlinkKafkaProducer010(String topicId, KeyedSerializationSchema<T> serializationSchema,
			Properties producerConfig, FlinkKafkaPartitioner<T> customPartitioner) {
		// We create a Kafka 09 producer instance here and only "override" (by intercepting) the
		// invoke call.
		super(new FlinkKafkaProducer09<>(topicId, serializationSchema, producerConfig, customPartitioner));
	}

	// ----------------------------- Deprecated constructors / factory methods ---------------------------

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * <p>This method allows writing timestamps to Kafka; it follows approach (b) (see above).
	 *
	 * @param inStream The stream to write to Kafka
	 * @param topicId The name of the target topic
	 * @param serializationSchema A serializable serialization schema for turning user objects into a kafka-consumable byte[] supporting key/value messages
	 * @param producerConfig Configuration properties for the KafkaProducer. 'bootstrap.servers' is the only required argument.
	 * @param customPartitioner A serializable partitioner for assigning messages to Kafka partitions.
	 *
	 * @deprecated This method is deprecated since it does not correctly handle partitioning when
	 *             producing to multiple topics. Use
	 *             {@link FlinkKafkaProducer010#writeToKafkaWithTimestamps(DataStream, String, KeyedSerializationSchema, Properties, FlinkKafkaPartitioner)} instead.
	 */
	@Deprecated
	public static <T> FlinkKafkaProducer010Configuration<T> writeToKafkaWithTimestamps(DataStream<T> inStream,
			String topicId, KeyedSerializationSchema<T> serializationSchema, Properties producerConfig,
			KafkaPartitioner<T> customPartitioner) {
		GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
		FlinkKafkaProducer010<T> kafkaProducer = new FlinkKafkaProducer010<>(topicId, serializationSchema,
				producerConfig, new FlinkKafkaDelegatePartitioner<>(customPartitioner));
		SingleOutputStreamOperator<Object> transformation =
				inStream.transform("FlinkKafkaProducer 0.10.x", objectTypeInfo, kafkaProducer);
		return new FlinkKafkaProducer010Configuration<>(transformation, kafkaProducer);
	}

	/**
	 * Creates a FlinkKafkaProducer for a given topic. The sink writes a DataStream to
	 * the topic.
	 *
	 * @param topicId The topic to write data to
	 * @param serializationSchema A (keyless) serializable serialization schema for turning user objects into a kafka-consumable byte[]
	 * @param producerConfig Configuration properties for the KafkaProducer. 'bootstrap.servers' is the only required argument.
	 * @param customPartitioner A serializable partitioner for assigning messages to Kafka partitions (when passing null, we'll use Kafka's partitioner)
	 *
	 * @deprecated This constructor is deprecated since it does not correctly handle partitioning when
	 *             producing to multiple topics. Use
	 *             {@link FlinkKafkaProducer010#FlinkKafkaProducer010(String, SerializationSchema, Properties, FlinkKafkaPartitioner)} instead.
	 */
	@Deprecated
	public FlinkKafkaProducer010(String topicId, SerializationSchema<T> serializationSchema,
			Properties producerConfig, KafkaPartitioner<T> customPartitioner) {
		this(topicId, new KeyedSerializationSchemaWrapper<>(serializationSchema), producerConfig, customPartitioner);
	}

	/**
	 * Creates a Kafka producer.
	 *
	 * <p>This constructor does not allow writing timestamps to Kafka; it follows approach (a) (see above).
	 *
	 * @deprecated This is a deprecated constructor that does not correctly handle partitioning when
	 *             producing to multiple topics. Use
	 *             {@link FlinkKafkaProducer010#FlinkKafkaProducer010(String, KeyedSerializationSchema, Properties, FlinkKafkaPartitioner)} instead.
	 */
	@Deprecated
	public FlinkKafkaProducer010(String topicId, KeyedSerializationSchema<T> serializationSchema,
			Properties producerConfig, KafkaPartitioner<T> customPartitioner) {
		// We create a Kafka 09 producer instance here and only "override" (by intercepting) the
		// invoke call.
		super(new FlinkKafkaProducer09<>(topicId, serializationSchema, producerConfig,
				new FlinkKafkaDelegatePartitioner<>(customPartitioner)));
	}

	// ----------------------------- Generic element processing ---------------------------

	private void invokeInternal(T next, long elementTimestamp) throws Exception {
		final FlinkKafkaProducerBase<T> internalProducer = (FlinkKafkaProducerBase<T>) userFunction;

		// Rethrow any asynchronous send error recorded by the producer callback.
		internalProducer.checkErroneous();

		byte[] serializedKey = internalProducer.schema.serializeKey(next);
		byte[] serializedValue = internalProducer.schema.serializeValue(next);
		String targetTopic = internalProducer.schema.getTargetTopic(next);
		if (targetTopic == null) {
			targetTopic = internalProducer.defaultTopicId;
		}

		// Only attach a timestamp when timestamp writing was enabled (approach (b)).
		Long timestamp = null;
		if (this.writeTimestampToKafka) {
			timestamp = elementTimestamp;
		}

		ProducerRecord<byte[], byte[]> record;
		// Lazily fetch and cache the partition list for the target topic.
		int[] partitions = internalProducer.topicPartitionsMap.get(targetTopic);
		if (null == partitions) {
			partitions = getPartitionsByTopic(targetTopic, internalProducer.producer);
			internalProducer.topicPartitionsMap.put(targetTopic, partitions);
		}
		if (internalProducer.flinkKafkaPartitioner == null) {
			// No custom partitioner: let Kafka's own partitioner decide.
			record = new ProducerRecord<>(targetTopic, null, timestamp, serializedKey, serializedValue);
		} else {
			record = new ProducerRecord<>(
					targetTopic,
					internalProducer.flinkKafkaPartitioner.partition(next, serializedKey, serializedValue, targetTopic, partitions),
					timestamp,
					serializedKey,
					serializedValue);
		}
		if (internalProducer.flushOnCheckpoint) {
			synchronized (internalProducer.pendingRecordsLock) {
				internalProducer.pendingRecords++;
			}
		}
		internalProducer.producer.send(record, internalProducer.callback);
	}

	// ----------------- Helper methods implementing methods from SinkFunction and RichFunction (Approach (a)) ----

	// ---- Configuration setters

	/**
	 * Defines whether the producer should fail on errors, or only log them.
	 * If this is set to true, then exceptions will be only logged; if set to false,
	 * exceptions will be eventually thrown and cause the streaming program to
	 * fail (and enter recovery).
	 *
	 * <p>Method is only accessible for approach (a) (see above).
	 *
	 * @param logFailuresOnly The flag to indicate logging-only on exceptions.
	 */
	public void setLogFailuresOnly(boolean logFailuresOnly) {
		final FlinkKafkaProducerBase<T> internalProducer = (FlinkKafkaProducerBase<T>) userFunction;
		internalProducer.setLogFailuresOnly(logFailuresOnly);
	}
	/**
	 * If set to true, the Flink producer will wait for all outstanding messages in the Kafka buffers
	 * to be acknowledged by the Kafka producer on a checkpoint.
	 * This way, the producer can guarantee that messages in the Kafka buffers are part of the checkpoint.
	 *
	 * <p>Method is only accessible for approach (a) (see above).
	 *
	 * @param flush Flag indicating the flushing mode (true = flush on checkpoint)
	 */
	public void setFlushOnCheckpoint(boolean flush) {
		final FlinkKafkaProducerBase<T> internalProducer = (FlinkKafkaProducerBase<T>) userFunction;
		internalProducer.setFlushOnCheckpoint(flush);
	}

	/**
	 * This method is used for approach (a) (see above).
	 */
	@Override
	public void open(Configuration parameters) throws Exception {
		final FlinkKafkaProducerBase<T> internalProducer = (FlinkKafkaProducerBase<T>) userFunction;
		internalProducer.open(parameters);
	}

	/**
	 * This method is used for approach (a) (see above).
	 */
	@Override
	public IterationRuntimeContext getIterationRuntimeContext() {
		final FlinkKafkaProducerBase<T> internalProducer = (FlinkKafkaProducerBase<T>) userFunction;
		return internalProducer.getIterationRuntimeContext();
	}

	/**
	 * This method is used for approach (a) (see above).
	 */
	@Override
	public void setRuntimeContext(RuntimeContext t) {
		final FlinkKafkaProducerBase<T> internalProducer = (FlinkKafkaProducerBase<T>) userFunction;
		internalProducer.setRuntimeContext(t);
	}

	/**
	 * Invoke method for using the Sink as DataStream.addSink() sink.
	 *
	 * <p>This method is used for approach (a) (see above).
	 *
	 * @param value The input record.
	 */
	@Override
	public void invoke(T value) throws Exception {
		// The timestamp argument is irrelevant here: writeTimestampToKafka can only be
		// enabled through approach (b), so it is never read on this code path.
		invokeInternal(value, Long.MAX_VALUE);
	}

	// ----------------- Helper methods and classes implementing methods from StreamSink (Approach (b)) ----

	/**
	 * Process method for using the sink with timestamp support.
	 *
	 * <p>This method is used for approach (b) (see above).
	 */
	@Override
	public void processElement(StreamRecord<T> element) throws Exception {
		invokeInternal(element.getValue(), element.getTimestamp());
	}

	/**
	 * Configuration object returned by the writeToKafkaWithTimestamps() call.
	 */
	public static class FlinkKafkaProducer010Configuration<T> extends DataStreamSink<T> {

		private final FlinkKafkaProducerBase wrappedProducerBase;
		private final FlinkKafkaProducer010 producer;

		private FlinkKafkaProducer010Configuration(DataStream stream, FlinkKafkaProducer010<T> producer) {
			//noinspection unchecked
			super(stream, producer);
			this.producer = producer;
			this.wrappedProducerBase = (FlinkKafkaProducerBase) producer.userFunction;
		}

		/**
		 * Defines whether the producer should fail on errors, or only log them.
		 * If this is set to true, then exceptions will be only logged; if set to false,
		 * exceptions will be eventually thrown and cause the streaming program to
		 * fail (and enter recovery).
		 *
		 * @param logFailuresOnly The flag to indicate logging-only on exceptions.
		 */
		public void setLogFailuresOnly(boolean logFailuresOnly) {
			this.wrappedProducerBase.setLogFailuresOnly(logFailuresOnly);
		}

		/**
		 * If set to true, the Flink producer will wait for all outstanding messages in the Kafka buffers
		 * to be acknowledged by the Kafka producer on a checkpoint.
		 * This way, the producer can guarantee that messages in the Kafka buffers are part of the checkpoint.
		 *
		 * @param flush Flag indicating the flushing mode (true = flush on checkpoint)
		 */
		public void setFlushOnCheckpoint(boolean flush) {
			this.wrappedProducerBase.setFlushOnCheckpoint(flush);
		}

		/**
		 * If set to true, Flink will write the (event time) timestamp attached to each record into Kafka.
		 * Timestamps must be positive for Kafka to accept them.
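		 *
		 * <p>A short sketch; {@code config} stands for the object returned by
		 * writeToKafkaWithTimestamps():
		 * <pre>{@code
		 * config.setFlushOnCheckpoint(true);     // buffered records become part of checkpoints
		 * config.setWriteTimestampToKafka(true); // attach Flink's record timestamps
		 * }</pre>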
		 *
		 * @param writeTimestampToKafka Flag indicating if Flink's internal timestamps are written to Kafka.
		 */
		public void setWriteTimestampToKafka(boolean writeTimestampToKafka) {
			this.producer.writeTimestampToKafka = writeTimestampToKafka;
		}
	}
}