/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.processors.kafka.pubsub;

import static org.apache.nifi.processors.kafka.pubsub.ConsumeKafkaRecord_0_10.REL_PARSE_FAILURE;
import static org.apache.nifi.processors.kafka.pubsub.ConsumeKafkaRecord_0_10.REL_SUCCESS;
import static org.apache.nifi.processors.kafka.pubsub.KafkaProcessorUtils.HEX_ENCODING;
import static org.apache.nifi.processors.kafka.pubsub.KafkaProcessorUtils.UTF8_ENCODING;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

import javax.xml.bind.DatatypeConverter;

import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRebalanceListener;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.TopicPartition;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.serialization.RecordReader;
import org.apache.nifi.serialization.RecordReaderFactory;
import org.apache.nifi.serialization.RecordSetWriter;
import org.apache.nifi.serialization.RecordSetWriterFactory;
import org.apache.nifi.serialization.SimpleRecordSchema;
import org.apache.nifi.serialization.WriteResult;
import org.apache.nifi.serialization.record.Record;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.nifi.serialization.record.RecordSet;

/**
 * This class represents a lease to access a Kafka Consumer object. The lease is
 * intended to be obtained from a ConsumerPool. The lease is closeable to allow
 * for the clean model of a try w/resources whereby non-exceptional cases mean
 * the lease will be returned to the pool for future use by others. A given
 * lease may only belong to a single thread at a time.
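 * <p>
 * A rough usage sketch (illustrative only; {@code pool} stands for the ConsumerPool
 * that created the lease, and the exact {@code obtainConsumer} signature is an
 * assumption rather than part of this class):
 * <pre>{@code
 * try (final ConsumerLease lease = pool.obtainConsumer(session)) {
 *     while (lease.continuePolling()) {
 *         lease.poll();              // pull records, write flowfiles
 *     }
 *     lease.commit();                // commit session, then Kafka offsets
 * }
 * }</pre>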
 */
public abstract class ConsumerLease implements Closeable, ConsumerRebalanceListener {

    private final long maxWaitMillis;
    private final Consumer<byte[], byte[]> kafkaConsumer;
    private final ComponentLog logger;
    private final byte[] demarcatorBytes;
    private final String keyEncoding;
    private final String securityProtocol;
    private final String bootstrapServers;
    private final RecordSetWriterFactory writerFactory;
    private final RecordReaderFactory readerFactory;
    private boolean poisoned = false;
    //used for tracking demarcated flowfiles to their TopicPartition so we can append
    //to them on subsequent poll calls
    private final Map<TopicPartition, BundleTracker> bundleMap = new HashMap<>();
    private final Map<TopicPartition, OffsetAndMetadata> uncommittedOffsetsMap = new HashMap<>();
    private long leaseStartNanos = -1;
    private boolean lastPollEmpty = false;
    private int totalFlowFiles = 0;

    ConsumerLease(
            final long maxWaitMillis,
            final Consumer<byte[], byte[]> kafkaConsumer,
            final byte[] demarcatorBytes,
            final String keyEncoding,
            final String securityProtocol,
            final String bootstrapServers,
            final RecordReaderFactory readerFactory,
            final RecordSetWriterFactory writerFactory,
            final ComponentLog logger) {
        this.maxWaitMillis = maxWaitMillis;
        this.kafkaConsumer = kafkaConsumer;
        this.demarcatorBytes = demarcatorBytes;
        this.keyEncoding = keyEncoding;
        this.securityProtocol = securityProtocol;
        this.bootstrapServers = bootstrapServers;
        this.readerFactory = readerFactory;
        this.writerFactory = writerFactory;
        this.logger = logger;
    }

    /**
     * Clears out internal state elements, excluding the session and consumer, as
     * those are managed by the pool itself.
     */
    private void resetInternalState() {
        bundleMap.clear();
        uncommittedOffsetsMap.clear();
        leaseStartNanos = -1;
        lastPollEmpty = false;
        totalFlowFiles = 0;
    }

    /**
     * Kafka will call this method whenever it is about to rebalance the
     * consumers for the given partitions. We'll simply take this to mean that
     * we need to quickly commit what we've got and will return the consumer to
     * the pool. This method will be called during the poll() method call of
     * this class and will be called by the same thread calling poll, according
     * to the Kafka API docs. After this method executes, the session and Kafka
     * offsets are committed and this lease is closed.
     *
     * @param partitions partitions being reassigned
     */
    @Override
    public void onPartitionsRevoked(final Collection<TopicPartition> partitions) {
        logger.debug("Rebalance Alert: Partitions '{}' revoked for lease '{}' with consumer '{}'", new Object[]{partitions, this, kafkaConsumer});
        //force a commit here. Can reuse the session and consumer after this but must commit now to avoid duplicates if kafka reassigns partition
        commit();
    }

    /**
     * This will be called by Kafka when the rebalance has completed. We don't
     * need to do anything with this information other than optionally log it, as
     * by this point we've committed what we've got and moved on.
     *
     * @param partitions topic partition set being reassigned
     */
    @Override
    public void onPartitionsAssigned(final Collection<TopicPartition> partitions) {
        logger.debug("Rebalance Alert: Partitions '{}' assigned for lease '{}' with consumer '{}'", new Object[]{partitions, this, kafkaConsumer});
    }

    /**
     * Executes a poll on the underlying Kafka Consumer and creates any new
     * flowfiles necessary, or appends to existing ones if in demarcation mode.
     */
    void poll() {
        /**
         * Implementation note:
         * Even if ConsumeKafka is not scheduled to poll for longer than session.timeout.ms (defaults to 10 secs),
         * for example because downstream connection back-pressure is engaged, the Kafka consumer sends heartbeats
         * from a background thread. If this situation lasts longer than max.poll.interval.ms (defaults to 5 mins),
         * the Kafka consumer sends a Leave Group request to the Group Coordinator. When the ConsumeKafka processor
         * is scheduled again, the Kafka client checks whether this client instance is still a part of the consumer
         * group. If not, it rejoins before polling messages.
         * This behavior has been fixed via Kafka KIP-62 and is available from Kafka client 0.10.1.0.
         */
        try {
            final ConsumerRecords<byte[], byte[]> records = kafkaConsumer.poll(10);
            lastPollEmpty = records.count() == 0;
            processRecords(records);
        } catch (final ProcessException pe) {
            throw pe;
        } catch (final Throwable t) {
            this.poison();
            throw t;
        }
    }

    /**
     * Notifies Kafka to commit the offsets for the specified topic/partition
     * pairs to the specified offsets w/the given metadata. This can offer
     * higher performance than the other commitOffsets call as it allows the
     * kafka client to collect more data from Kafka before committing the
     * offsets.
     *
     * @return false if we didn't do anything and the caller should probably yield;
     * true if we committed new data
     */
    boolean commit() {
        if (uncommittedOffsetsMap.isEmpty()) {
            resetInternalState();
            return false;
        }
        try {
            /**
             * Committing the nifi session then the offsets means we have an at
             * least once guarantee here. If we reversed the order we'd have at
             * most once.
             */
            final Collection<FlowFile> bundledFlowFiles = getBundles();
            if (!bundledFlowFiles.isEmpty()) {
                getProcessSession().transfer(bundledFlowFiles, REL_SUCCESS);
            }
            getProcessSession().commit();

            final Map<TopicPartition, OffsetAndMetadata> offsetsMap = uncommittedOffsetsMap;
            kafkaConsumer.commitSync(offsetsMap);
            resetInternalState();
            return true;
        } catch (final KafkaException kex) {
            poison();
            logger.warn("Duplicates are likely as we were able to commit the process"
                    + " session but received an exception from Kafka while committing"
                    + " offsets.");
            throw kex;
        } catch (final Throwable t) {
            poison();
            throw t;
        }
    }

    /**
     * Indicates whether we should continue polling for data. If we are not
     * writing data with a demarcator then we're writing individual flow files
     * per Kafka message, so we must be very mindful of memory usage for the
     * flow file objects (not their content) being held in memory. The content
     * of Kafka messages will be written to the content repository immediately
     * upon each poll call, but we must still be mindful of how much memory can
     * be used in each poll call. We will indicate that we should stop polling
     * if our last poll call produced no new results, if we've been polling and
     * processing data longer than the specified maximum polling time, if we
     * have reached our specified max flow file limit, or if a rebalance has
     * been initiated for one of the partitions we're watching.
     *
     * @return true if we should keep polling; false otherwise
     */
    boolean continuePolling() {
        //stop if the last poll produced no new data
        if (lastPollEmpty) {
            return false;
        }

        //stop if we've gone past our desired max uncommitted wait time
        if (leaseStartNanos < 0) {
            leaseStartNanos = System.nanoTime();
        }
        final long durationMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - leaseStartNanos);
        if (durationMillis > maxWaitMillis) {
            return false;
        }

        //stop if we've generated enough flowfiles that we need to be concerned about memory usage for the objects
        if (bundleMap.size() > 200) { //a magic number - the number of simultaneous bundles to track
            return false;
        } else {
            return totalFlowFiles < 15000; //admittedly a magic number - good candidate for a processor property
        }
    }

    /**
     * Indicates that the underlying session and consumer should be immediately
     * considered invalid. Once closed, the session will be rolled back and the
     * pool should destroy the underlying consumer. This is useful if, due to
     * external reasons such as the processor no longer being scheduled, this
     * lease should be terminated immediately.
     */
    private void poison() {
        poisoned = true;
    }

    /**
     * @return true if this lease has been poisoned; false otherwise
     */
    boolean isPoisoned() {
        return poisoned;
    }

    /**
     * Trigger the consumer's {@link KafkaConsumer#wakeup() wakeup()} method.
     */
    public void wakeup() {
        kafkaConsumer.wakeup();
    }

    /**
     * Intended to be overridden by the pool that created this ConsumerLease
     * object. The override should ensure that the session used to create this
     * lease is rolled back and that the underlying Kafka consumer is either
     * returned to the pool for continued use or destroyed if this lease has
     * been poisoned. It can only be called once. Calling it more than once can
     * result in undefined and non-thread-safe behavior.
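     * <p>
     * A pool implementation will typically override this method roughly along
     * the following lines (an illustrative sketch only; {@code returnToPool}
     * and {@code destroy} are hypothetical pool helpers, not part of this class):
     * <pre>{@code
     * public void close() {
     *     if (isPoisoned()) {
     *         getProcessSession().rollback();
     *         destroy(consumer);      // poisoned: tear the consumer down
     *     } else {
     *         returnToPool(this);     // healthy: make the lease available again
     *     }
     *     super.close();              // clears this lease's internal state
     * }
     * }</pre>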
     */
    @Override
    public void close() {
        resetInternalState();
    }

    public abstract ProcessSession getProcessSession();

    public abstract void yield();

    private void processRecords(final ConsumerRecords<byte[], byte[]> records) {
        records.partitions().stream().forEach(partition -> {
            List<ConsumerRecord<byte[], byte[]>> messages = records.records(partition);
            if (!messages.isEmpty()) {
                //update maximum offset map for this topic partition
                long maxOffset = messages.stream()
                        .mapToLong(record -> record.offset())
                        .max()
                        .getAsLong();

                //write records to content repository and session
                if (demarcatorBytes != null) {
                    writeDemarcatedData(getProcessSession(), messages, partition);
                } else if (readerFactory != null && writerFactory != null) {
                    writeRecordData(getProcessSession(), messages, partition);
                } else {
                    totalFlowFiles += messages.size();
                    messages.stream().forEach(message -> {
                        writeData(getProcessSession(), message, partition);
                    });
                }

                uncommittedOffsetsMap.put(partition, new OffsetAndMetadata(maxOffset + 1L));
            }
        });
    }

    private static String encodeKafkaKey(final byte[] key, final String encoding) {
        if (key == null) {
            return null;
        }

        if (HEX_ENCODING.getValue().equals(encoding)) {
            return DatatypeConverter.printHexBinary(key);
        } else if (UTF8_ENCODING.getValue().equals(encoding)) {
            return new String(key, StandardCharsets.UTF_8);
        } else {
            return null; // won't happen because it is guaranteed by the Allowable Values
        }
    }

    private Collection<FlowFile> getBundles() {
        final List<FlowFile> flowFiles = new ArrayList<>();
        for (final BundleTracker tracker : bundleMap.values()) {
            populateAttributes(tracker);
            flowFiles.add(tracker.flowFile);
        }
        return flowFiles;
    }

    private void writeData(final ProcessSession session, ConsumerRecord<byte[], byte[]> record, final TopicPartition topicPartition) {
        FlowFile flowFile = session.create();
        final BundleTracker tracker = new BundleTracker(record, topicPartition, keyEncoding);
        tracker.incrementRecordCount(1);
        flowFile = session.write(flowFile, out -> {
            out.write(record.value());
        });
        tracker.updateFlowFile(flowFile);
        populateAttributes(tracker);
        session.transfer(tracker.flowFile, REL_SUCCESS);
    }

    private void writeDemarcatedData(final ProcessSession session, final List<ConsumerRecord<byte[], byte[]>> records, final TopicPartition topicPartition) {
        final ConsumerRecord<byte[], byte[]> firstRecord = records.get(0);
        final boolean demarcateFirstRecord;
        BundleTracker tracker = bundleMap.get(topicPartition);
        FlowFile flowFile;
        if (tracker == null) {
            tracker = new BundleTracker(firstRecord, topicPartition, keyEncoding);
            flowFile = session.create();
            tracker.updateFlowFile(flowFile);
            demarcateFirstRecord = false; //have not yet written records for this topic/partition in this lease
        } else {
            demarcateFirstRecord = true; //have already been writing records for this topic/partition in this lease
        }
        flowFile = tracker.flowFile;
        tracker.incrementRecordCount(records.size());
        flowFile = session.append(flowFile, out -> {
            boolean useDemarcator = demarcateFirstRecord;
            for (final ConsumerRecord<byte[], byte[]> record : records) {
                if (useDemarcator) {
                    out.write(demarcatorBytes);
                }
                out.write(record.value());
                useDemarcator = true;
            }
        });
        tracker.updateFlowFile(flowFile);
        bundleMap.put(topicPartition, tracker);
    }

    private void rollback(final TopicPartition topicPartition) {
        OffsetAndMetadata offsetAndMetadata = uncommittedOffsetsMap.get(topicPartition);
        if (offsetAndMetadata == null) {
            offsetAndMetadata = kafkaConsumer.committed(topicPartition);
        }

        final long offset = offsetAndMetadata.offset();
        kafkaConsumer.seek(topicPartition, offset);
    }

    private void writeRecordData(final ProcessSession session, final List<ConsumerRecord<byte[], byte[]>> records, final TopicPartition topicPartition) {
        if (records.isEmpty()) {
            return;
        }

        FlowFile flowFile = session.create();
        try {
            final RecordSchema schema;
            try {
                schema = writerFactory.getSchema(flowFile, new ByteArrayInputStream(records.get(0).value()));
            } catch (final Exception e) {
                logger.error("Failed to obtain Schema for FlowFile. Will roll back the Kafka message offsets.", e);

                try {
                    rollback(topicPartition);
                } catch (final Exception rollbackException) {
                    logger.warn("Attempted to rollback Kafka message offset but was unable to do so", rollbackException);
                }

                yield();
                throw new ProcessException(e);
            }

            final FlowFile ff = flowFile;
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();
            final AtomicReference<String> mimeTypeRef = new AtomicReference<>();
            flowFile = session.write(flowFile, rawOut -> {
                final Iterator<ConsumerRecord<byte[], byte[]>> itr = records.iterator();

                final RecordSchema emptySchema = new SimpleRecordSchema(Collections.emptyList());
                final RecordSet recordSet = new RecordSet() {
                    @Override
                    public RecordSchema getSchema() throws IOException {
                        return emptySchema;
                    }

                    @Override
                    public Record next() throws IOException {
                        while (itr.hasNext()) {
                            final ConsumerRecord<byte[], byte[]> consumerRecord = itr.next();

                            final InputStream in = new ByteArrayInputStream(consumerRecord.value());
                            try {
                                final RecordReader reader = readerFactory.createRecordReader(ff, in, logger);
                                final Record record = reader.nextRecord();
                                return record;
                            } catch (final Exception e) {
                                final Map<String, String> attributes = new HashMap<>();
                                attributes.put(KafkaProcessorUtils.KAFKA_OFFSET, String.valueOf(consumerRecord.offset()));
                                attributes.put(KafkaProcessorUtils.KAFKA_PARTITION, String.valueOf(topicPartition.partition()));
                                attributes.put(KafkaProcessorUtils.KAFKA_TOPIC, topicPartition.topic());

                                FlowFile failureFlowFile = session.create();
                                failureFlowFile = session.write(failureFlowFile, out -> out.write(consumerRecord.value()));
                                failureFlowFile = session.putAllAttributes(failureFlowFile, attributes);
                                session.transfer(failureFlowFile, REL_PARSE_FAILURE);
                                logger.error("Failed to parse message from Kafka using the configured Record Reader. "
                                        + "Will route message as its own FlowFile to the 'parse.failure' relationship", e);

                                session.adjustCounter("Parse Failures", 1, false);
                            }
                        }

                        return null;
                    }
                };

                try (final OutputStream out = new BufferedOutputStream(rawOut);
                        final RecordSetWriter writer = writerFactory.createWriter(logger, schema, ff, out)) {
                    writeResult.set(writer.write(recordSet));
                    mimeTypeRef.set(writer.getMimeType());
                } catch (final Exception e) {
                    logger.error("Failed to write records to FlowFile. Will roll back the Kafka message offsets.", e);
                    try {
                        rollback(topicPartition);
                    } catch (final Exception rollbackException) {
                        logger.warn("Attempted to rollback Kafka message offset but was unable to do so", rollbackException);
                    }

                    yield();
                    throw new ProcessException(e);
                }
            });

            final WriteResult result = writeResult.get();
            if (result.getRecordCount() > 0) {
                final Map<String, String> attributes = new HashMap<>(result.getAttributes());
                attributes.put(CoreAttributes.MIME_TYPE.key(), mimeTypeRef.get());
                attributes.put("record.count", String.valueOf(result.getRecordCount()));
                attributes.put(KafkaProcessorUtils.KAFKA_PARTITION, String.valueOf(topicPartition.partition()));
                attributes.put(KafkaProcessorUtils.KAFKA_TOPIC, topicPartition.topic());

                flowFile = session.putAllAttributes(flowFile, attributes);

                final long executionDurationMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - leaseStartNanos);
                final String transitUri = KafkaProcessorUtils.buildTransitURI(securityProtocol, bootstrapServers, topicPartition.topic());
                session.getProvenanceReporter().receive(flowFile, transitUri, executionDurationMillis);

                session.adjustCounter("Records Received", result.getRecordCount(), false);
                session.transfer(flowFile, REL_SUCCESS);
            } else {
                session.remove(flowFile);
            }
        } catch (final Exception e) {
            session.remove(flowFile);
            throw e;
        }
    }

    private void populateAttributes(final BundleTracker tracker) {
        final Map<String, String> kafkaAttrs = new HashMap<>();
        kafkaAttrs.put(KafkaProcessorUtils.KAFKA_OFFSET, String.valueOf(tracker.initialOffset));
        if (tracker.key != null && tracker.totalRecords == 1) {
            kafkaAttrs.put(KafkaProcessorUtils.KAFKA_KEY, tracker.key);
        }
        kafkaAttrs.put(KafkaProcessorUtils.KAFKA_PARTITION, String.valueOf(tracker.partition));
        kafkaAttrs.put(KafkaProcessorUtils.KAFKA_TOPIC, tracker.topic);
        if (tracker.totalRecords > 1) {
            kafkaAttrs.put(KafkaProcessorUtils.KAFKA_COUNT, String.valueOf(tracker.totalRecords));
        }
        final FlowFile newFlowFile = getProcessSession().putAllAttributes(tracker.flowFile, kafkaAttrs);
        final long executionDurationMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - leaseStartNanos);
        final String transitUri = KafkaProcessorUtils.buildTransitURI(securityProtocol, bootstrapServers, tracker.topic);
        getProcessSession().getProvenanceReporter().receive(newFlowFile, transitUri, executionDurationMillis);
        tracker.updateFlowFile(newFlowFile);
    }

    private static class BundleTracker {

        final long initialOffset;
        final int partition;
        final String topic;
        final String key;
        FlowFile flowFile;
        long totalRecords = 0;

        private BundleTracker(final ConsumerRecord<byte[], byte[]> initialRecord, final TopicPartition topicPartition, final String keyEncoding) {
            this.initialOffset = initialRecord.offset();
            this.partition = topicPartition.partition();
            this.topic = topicPartition.topic();
            this.key = encodeKafkaKey(initialRecord.key(), keyEncoding);
        }

        private void incrementRecordCount(final long count) {
            totalRecords += count;
        }

        private void updateFlowFile(final FlowFile flowFile) {
            this.flowFile = flowFile;
        }
    }
}