/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.realtime.impl.kafka; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.tuple.Pair; import org.apache.kafka.common.errors.TimeoutException; import org.apache.kafka.common.protocol.Errors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.util.concurrent.Uninterruptibles; import javax.annotation.Nullable; import kafka.api.FetchRequestBuilder; import kafka.api.PartitionOffsetRequestInfo; import kafka.common.TopicAndPartition; import kafka.javaapi.FetchResponse; import kafka.javaapi.OffsetRequest; import kafka.javaapi.OffsetResponse; import kafka.javaapi.PartitionMetadata; import kafka.javaapi.TopicMetadata; import kafka.javaapi.TopicMetadataRequest; import kafka.javaapi.TopicMetadataResponse; import kafka.javaapi.consumer.SimpleConsumer; import kafka.javaapi.message.ByteBufferMessageSet; import kafka.message.MessageAndOffset; /** * Wrapper for Kafka's SimpleConsumer which ensures that we're connected to the appropriate broker for consumption. */ public class SimpleConsumerWrapper implements Closeable { private static final Logger LOGGER = LoggerFactory.getLogger(SimpleConsumerWrapper.class); private static final int SOCKET_TIMEOUT_MILLIS = 10000; private static final int SOCKET_BUFFER_SIZE = 512000; private enum ConsumerState { CONNECTING_TO_BOOTSTRAP_NODE, CONNECTED_TO_BOOTSTRAP_NODE, FETCHING_LEADER_INFORMATION, CONNECTING_TO_PARTITION_LEADER, CONNECTED_TO_PARTITION_LEADER } private State _currentState; private final String _clientId; private final boolean _metadataOnlyConsumer; private final String _topic; private final int _partition; private final long _connectTimeoutMillis; private final KafkaSimpleConsumerFactory _simpleConsumerFactory; private String[] _bootstrapHosts; private int[] _bootstrapPorts; private SimpleConsumer _simpleConsumer; private final Random _random = new Random(); private KafkaBrokerWrapper _leader; private String _currentHost; private int _currentPort; /** * A Kafka protocol error that indicates a situation that is not likely to clear up by retrying the request (for * example, no such topic or offset out of range). */ public static class PermanentConsumerException extends RuntimeException { public PermanentConsumerException(Errors error) { super(error.exception()); } } /** * A Kafka protocol error that indicates a situation that is likely to be transient (for example, network error or * broker not available). */ public static class TransientConsumerException extends RuntimeException { public TransientConsumerException(Errors error) { super(error.exception()); } } private SimpleConsumerWrapper(KafkaSimpleConsumerFactory simpleConsumerFactory, String bootstrapNodes, String clientId, long connectTimeoutMillis) { _simpleConsumerFactory = simpleConsumerFactory; _clientId = clientId; _connectTimeoutMillis = connectTimeoutMillis; _metadataOnlyConsumer = true; _simpleConsumer = null; // Topic and partition are ignored for metadata-only consumers _topic = null; _partition = Integer.MIN_VALUE; initializeBootstrapNodeList(bootstrapNodes); setCurrentState(new ConnectingToBootstrapNode()); } private SimpleConsumerWrapper(KafkaSimpleConsumerFactory simpleConsumerFactory, String bootstrapNodes, String clientId, String topic, int partition, long connectTimeoutMillis) { _simpleConsumerFactory = simpleConsumerFactory; _clientId = clientId; _topic = topic; _partition = partition; _connectTimeoutMillis = connectTimeoutMillis; _metadataOnlyConsumer = false; _simpleConsumer = null; initializeBootstrapNodeList(bootstrapNodes); setCurrentState(new ConnectingToBootstrapNode()); } private void initializeBootstrapNodeList(String bootstrapNodes) { ArrayList<String> hostsAndPorts = Lists.newArrayList(Splitter.on(',').trimResults().omitEmptyStrings().split(bootstrapNodes)); final int bootstrapHostCount = hostsAndPorts.size(); if (bootstrapHostCount < 1) { throw new IllegalArgumentException("Need at least one bootstrap host"); } _bootstrapHosts = new String[bootstrapHostCount]; _bootstrapPorts = new int[bootstrapHostCount]; for (int i = 0; i < bootstrapHostCount; i++) { String hostAndPort = hostsAndPorts.get(i); String[] splittedHostAndPort = hostAndPort.split(":"); if (splittedHostAndPort.length != 2) { throw new IllegalArgumentException("Unable to parse host:port combination for " + hostAndPort); } _bootstrapHosts[i] = splittedHostAndPort[0]; try { _bootstrapPorts[i] = Integer.parseInt(splittedHostAndPort[1]); } catch (NumberFormatException e) { throw new IllegalArgumentException("Could not parse port number " + splittedHostAndPort[1] + " for host:port combination " + hostAndPort); } } } private abstract class State { private ConsumerState stateValue; protected State(ConsumerState stateValue) { this.stateValue = stateValue; } abstract void process(); abstract boolean isConnectedToKafkaBroker(); void handleConsumerException(Exception e) { // By default, just log the exception and switch back to CONNECTING_TO_BOOTSTRAP_NODE (which will take care of // closing the connection if it exists) LOGGER.warn("Caught Kafka consumer exception while in state {}, disconnecting and trying again", _currentState.getStateValue(), e); setCurrentState(new ConnectingToBootstrapNode()); } ConsumerState getStateValue() { return stateValue; } } private class ConnectingToBootstrapNode extends State { public ConnectingToBootstrapNode() { super(ConsumerState.CONNECTING_TO_BOOTSTRAP_NODE); } @Override public void process() { // Connect to a random bootstrap node if (_simpleConsumer != null) { try { _simpleConsumer.close(); } catch (Exception e) { LOGGER.warn("Caught exception while closing consumer, ignoring", e); } } int randomHostIndex = _random.nextInt(_bootstrapHosts.length); _currentHost = _bootstrapHosts[randomHostIndex]; _currentPort = _bootstrapPorts[randomHostIndex]; try { _simpleConsumer = _simpleConsumerFactory.buildSimpleConsumer(_currentHost, _currentPort, SOCKET_TIMEOUT_MILLIS, SOCKET_BUFFER_SIZE, _clientId); setCurrentState(new ConnectedToBootstrapNode()); } catch (Exception e) { handleConsumerException(e); } } @Override boolean isConnectedToKafkaBroker() { return false; } } private class ConnectedToBootstrapNode extends State { protected ConnectedToBootstrapNode() { super(ConsumerState.CONNECTED_TO_BOOTSTRAP_NODE); } @Override void process() { if (_metadataOnlyConsumer) { // Nothing to do } else { // If we're consuming from a partition, we need to find the leader so that we can consume from it. By design, // Kafka only allows consumption from the leader and not one of the in-sync replicas. setCurrentState(new FetchingLeaderInformation()); } } @Override boolean isConnectedToKafkaBroker() { return true; } } private class FetchingLeaderInformation extends State { public FetchingLeaderInformation() { super(ConsumerState.FETCHING_LEADER_INFORMATION); } @Override void process() { // Fetch leader information try { TopicMetadataResponse response = _simpleConsumer.send(new TopicMetadataRequest(Collections.singletonList(_topic))); try { _leader = null; List<PartitionMetadata> pMetaList = response.topicsMetadata().get(0).partitionsMetadata(); for (PartitionMetadata pMeta : pMetaList) { if (pMeta.partitionId() == _partition) { _leader = new KafkaBrokerWrapper(pMeta.leader()); break; } } // If we've located a broker if (_leader != null) { LOGGER.info("Located leader broker {}, connecting to it.", _leader); setCurrentState(new ConnectingToPartitionLeader()); } else { // Failed to get the leader broker. There could be a leader election at the moment, so retry after a little // bit. LOGGER.warn("Leader broker is null, retrying leader fetch in 100ms"); Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS); } } catch (Exception e) { // Failed to get the leader broker. There could be a leader election at the moment, so retry after a little // bit. LOGGER.warn("Failed to get the leader broker due to exception, retrying in 100ms", e); Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS); } } catch (Exception e) { handleConsumerException(e); } } @Override boolean isConnectedToKafkaBroker() { return true; } } private class ConnectingToPartitionLeader extends State { public ConnectingToPartitionLeader() { super(ConsumerState.CONNECTING_TO_PARTITION_LEADER); } @Override void process() { // If we're already connected to the leader broker, don't disconnect and reconnect LOGGER.info("Trying to fetch leader host and port: {}:{}", _leader.host(), _leader.port()); if (_leader.host().equals(_currentHost) && _leader.port() == _currentPort) { setCurrentState(new ConnectedToPartitionLeader()); return; } // Disconnect from current broker if(_simpleConsumer != null) { try { _simpleConsumer.close(); _simpleConsumer = null; } catch (Exception e) { handleConsumerException(e); return; } } // Connect to the partition leader try { _simpleConsumer = _simpleConsumerFactory.buildSimpleConsumer(_leader.host(), _leader.port(), SOCKET_TIMEOUT_MILLIS, SOCKET_BUFFER_SIZE, _clientId); setCurrentState(new ConnectedToPartitionLeader()); } catch (Exception e) { handleConsumerException(e); } } @Override boolean isConnectedToKafkaBroker() { return false; } } private class ConnectedToPartitionLeader extends State { public ConnectedToPartitionLeader() { super(ConsumerState.CONNECTED_TO_PARTITION_LEADER); } @Override void process() { // Nothing to do } @Override boolean isConnectedToKafkaBroker() { return true; } } private void setCurrentState(State newState) { if (_currentState != null) { LOGGER.info("Switching from state {} to state {}", _currentState.getStateValue(), newState.getStateValue()); } _currentState = newState; } public synchronized int getPartitionCount(String topic, long timeoutMillis) { int unknownTopicReplyCount = 0; final int MAX_UNKNOWN_TOPIC_REPLY_COUNT = 10; int kafkaErrorCount = 0; final int MAX_KAFKA_ERROR_COUNT = 10; final long endTime = System.currentTimeMillis() + timeoutMillis; while(System.currentTimeMillis() < endTime) { // Try to get into a state where we're connected to Kafka while (!_currentState.isConnectedToKafkaBroker() && System.currentTimeMillis() < endTime) { _currentState.process(); } if (endTime <= System.currentTimeMillis() && !_currentState.isConnectedToKafkaBroker()) { throw new TimeoutException("Failed to get the partition count for topic " + topic + " within " + timeoutMillis + " ms"); } // Send the metadata request to Kafka TopicMetadataResponse topicMetadataResponse = null; try { topicMetadataResponse = _simpleConsumer.send(new TopicMetadataRequest(Collections.singletonList(topic))); } catch (Exception e) { _currentState.handleConsumerException(e); continue; } final TopicMetadata topicMetadata = topicMetadataResponse.topicsMetadata().get(0); final short errorCode = topicMetadata.errorCode(); if (errorCode == Errors.NONE.code()) { return topicMetadata.partitionsMetadata().size(); } else if (errorCode == Errors.LEADER_NOT_AVAILABLE.code()) { // If there is no leader, it'll take some time for a new leader to be elected, wait 100 ms before retrying Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS); } else if (errorCode == Errors.INVALID_TOPIC_EXCEPTION.code()) { throw new RuntimeException("Invalid topic name " + topic); } else if (errorCode == Errors.UNKNOWN_TOPIC_OR_PARTITION.code()) { if (MAX_UNKNOWN_TOPIC_REPLY_COUNT < unknownTopicReplyCount) { throw new RuntimeException("Topic " + topic + " does not exist"); } else { // Kafka topic creation can sometimes take some time, so we'll retry after a little bit unknownTopicReplyCount++; Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS); } } else { // Retry after a short delay kafkaErrorCount++; if (MAX_KAFKA_ERROR_COUNT < kafkaErrorCount) { throw exceptionForKafkaErrorCode(errorCode); } Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS); } } throw new TimeoutException(); } /** * Fetch messages and the per-partition high watermark from Kafka between the specified offsets. * * @param startOffset The offset of the first message desired, inclusive * @param endOffset The offset of the last message desired, exclusive, or {@link Long#MAX_VALUE} for no end offset. * @param timeoutMillis Timeout in milliseconds * @throws java.util.concurrent.TimeoutException If the operation could not be completed within {@code timeoutMillis} * milliseconds * @return An iterable containing messages fetched from Kafka and their offsets, as well as the high watermark for * this partition. */ public synchronized Pair<Iterable<MessageAndOffset>, Long> fetchMessagesAndHighWatermark(long startOffset, long endOffset, int timeoutMillis) throws java.util.concurrent.TimeoutException { Preconditions.checkState(!_metadataOnlyConsumer, "Cannot fetch messages from a metadata-only SimpleConsumerWrapper"); // Ensure that we're connected to the leader // TODO Improve error handling final long connectEndTime = System.currentTimeMillis() + _connectTimeoutMillis; while(_currentState.getStateValue() != ConsumerState.CONNECTED_TO_PARTITION_LEADER && System.currentTimeMillis() < connectEndTime) { _currentState.process(); } if (_currentState.getStateValue() != ConsumerState.CONNECTED_TO_PARTITION_LEADER && connectEndTime <= System.currentTimeMillis()) { throw new java.util.concurrent.TimeoutException(); } FetchResponse fetchResponse = _simpleConsumer.fetch(new FetchRequestBuilder() .minBytes(100000) .maxWait(timeoutMillis) .addFetch(_topic, _partition, startOffset, 500000) .build()); if (!fetchResponse.hasError()) { final Iterable<MessageAndOffset> messageAndOffsetIterable = buildOffsetFilteringIterable(fetchResponse.messageSet(_topic, _partition), startOffset, endOffset); return Pair.of(messageAndOffsetIterable, fetchResponse.highWatermark(_topic, _partition)); } else { throw exceptionForKafkaErrorCode(fetchResponse.errorCode(_topic, _partition)); } } private RuntimeException exceptionForKafkaErrorCode(short kafkaErrorCode) { final Errors kafkaError = Errors.forCode(kafkaErrorCode); switch (kafkaError) { case UNKNOWN: case OFFSET_OUT_OF_RANGE: case CORRUPT_MESSAGE: case MESSAGE_TOO_LARGE: case OFFSET_METADATA_TOO_LARGE: case INVALID_TOPIC_EXCEPTION: case RECORD_LIST_TOO_LARGE: case INVALID_REQUIRED_ACKS: case ILLEGAL_GENERATION: case INCONSISTENT_GROUP_PROTOCOL: case INVALID_GROUP_ID: case UNKNOWN_MEMBER_ID: case INVALID_SESSION_TIMEOUT: case INVALID_COMMIT_OFFSET_SIZE: return new PermanentConsumerException(kafkaError); case UNKNOWN_TOPIC_OR_PARTITION: case LEADER_NOT_AVAILABLE: case NOT_LEADER_FOR_PARTITION: case REQUEST_TIMED_OUT: case BROKER_NOT_AVAILABLE: case REPLICA_NOT_AVAILABLE: case STALE_CONTROLLER_EPOCH: case NETWORK_EXCEPTION: case GROUP_LOAD_IN_PROGRESS: case GROUP_COORDINATOR_NOT_AVAILABLE: case NOT_COORDINATOR_FOR_GROUP: case NOT_ENOUGH_REPLICAS: case NOT_ENOUGH_REPLICAS_AFTER_APPEND: case REBALANCE_IN_PROGRESS: case TOPIC_AUTHORIZATION_FAILED: case GROUP_AUTHORIZATION_FAILED: case CLUSTER_AUTHORIZATION_FAILED: return new TransientConsumerException(kafkaError); case NONE: default: return new RuntimeException("Unhandled error " + kafkaError); } } /** * Fetch messages from Kafka between the specified offsets. * * @param startOffset The offset of the first message desired, inclusive * @param endOffset The offset of the last message desired, exclusive, or {@link Long#MAX_VALUE} for no end offset. * @param timeoutMillis Timeout in milliseconds * @throws java.util.concurrent.TimeoutException If the operation could not be completed within {@code timeoutMillis} * milliseconds * @return An iterable containing messages fetched from Kafka and their offsets. */ public synchronized Iterable<MessageAndOffset> fetchMessages(long startOffset, long endOffset, int timeoutMillis) throws java.util.concurrent.TimeoutException { return fetchMessagesAndHighWatermark(startOffset, endOffset, timeoutMillis).getLeft(); } /** * Fetches the numeric Kafka offset for this partition for a symbolic name ("largest" or "smallest"). * * @param requestedOffset Either "largest" or "smallest" * @param timeoutMillis Timeout in milliseconds * @throws java.util.concurrent.TimeoutException If the operation could not be completed within {@code timeoutMillis} * milliseconds * @return An offset */ public synchronized long fetchPartitionOffset(String requestedOffset, int timeoutMillis) throws java.util.concurrent.TimeoutException { Preconditions.checkNotNull(requestedOffset); final long offsetRequestTime; if (requestedOffset.equalsIgnoreCase("largest")) { offsetRequestTime = kafka.api.OffsetRequest.LatestTime(); } else if (requestedOffset.equalsIgnoreCase("smallest")) { offsetRequestTime = kafka.api.OffsetRequest.EarliestTime(); } else if (requestedOffset.equalsIgnoreCase("testDummy")) { return -1L; } else { throw new IllegalArgumentException("Unknown initial offset value " + requestedOffset); } int kafkaErrorCount = 0; final int MAX_KAFKA_ERROR_COUNT = 10; final long endTime = System.currentTimeMillis() + timeoutMillis; while(System.currentTimeMillis() < endTime) { // Try to get into a state where we're connected to Kafka while (_currentState.getStateValue() != ConsumerState.CONNECTED_TO_PARTITION_LEADER && System.currentTimeMillis() < endTime) { _currentState.process(); } if (_currentState.getStateValue() != ConsumerState.CONNECTED_TO_PARTITION_LEADER && endTime <= System.currentTimeMillis()) { throw new TimeoutException(); } // Send the offset request to Kafka OffsetRequest request = new OffsetRequest(Collections.singletonMap(new TopicAndPartition(_topic, _partition), new PartitionOffsetRequestInfo(offsetRequestTime, 1)), kafka.api.OffsetRequest.CurrentVersion(), _clientId); OffsetResponse offsetResponse; try { offsetResponse = _simpleConsumer.getOffsetsBefore(request); } catch (Exception e) { _currentState.handleConsumerException(e); continue; } final short errorCode = offsetResponse.errorCode(_topic, _partition); if (errorCode == Errors.NONE.code()) { long offset = offsetResponse.offsets(_topic, _partition)[0]; if (offset == 0L) { LOGGER.warn("Fetched offset of 0 for topic {} and partition {}, is this a newly created topic?", _topic, _partition); } return offset; } else if (errorCode == Errors.LEADER_NOT_AVAILABLE.code()) { // If there is no leader, it'll take some time for a new leader to be elected, wait 100 ms before retrying Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS); } else { // Retry after a short delay kafkaErrorCount++; if (MAX_KAFKA_ERROR_COUNT < kafkaErrorCount) { throw exceptionForKafkaErrorCode(errorCode); } Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS); } } throw new TimeoutException(); } private Iterable<MessageAndOffset> buildOffsetFilteringIterable(final ByteBufferMessageSet messageAndOffsets, final long startOffset, final long endOffset) { return Iterables.filter(messageAndOffsets, new Predicate<MessageAndOffset>() { @Override public boolean apply(@Nullable MessageAndOffset input) { // Filter messages that are either null or have an offset ∉ [startOffset; endOffset[ if(input == null || input.offset() < startOffset || (endOffset <= input.offset() && endOffset != -1)) { return false; } // Check the message's checksum // TODO We might want to have better handling of this situation, maybe try to fetch the message again? if(!input.message().isValid()) { LOGGER.warn("Discarded message with invalid checksum in partition {} of topic {}", _partition, _topic); return false; } return true; } }); } /** * Creates a simple consumer wrapper that connects to a random Kafka broker, which allows for fetching topic and * partition metadata. It does not allow to consume from a partition, since Kafka requires connecting to the * leader of that partition for consumption. * * @param simpleConsumerFactory The SimpleConsumer factory to use * @param bootstrapNodes A comma separated list of Kafka broker nodes * @param clientId The Kafka client identifier, to be used to uniquely identify the client when tracing calls * @param connectTimeoutMillis The timeout for connecting or re-establishing a connection to the Kafka cluster * @return A consumer wrapper */ public static SimpleConsumerWrapper forMetadataConsumption(KafkaSimpleConsumerFactory simpleConsumerFactory, String bootstrapNodes, String clientId, long connectTimeoutMillis) { return new SimpleConsumerWrapper(simpleConsumerFactory, bootstrapNodes, clientId, connectTimeoutMillis); } /** * Creates a simple consumer wrapper that automatically connects to the leader broker for the given topic and * partition. This consumer wrapper can also fetch topic and partition metadata. * * @param simpleConsumerFactory The SimpleConsumer factory to use * @param bootstrapNodes A comma separated list of Kafka broker nodes * @param clientId The Kafka client identifier, to be used to uniquely identify the client when tracing calls * @param topic The Kafka topic to consume from * @param partition The partition id to consume from * @param connectTimeoutMillis The timeout for connecting or re-establishing a connection to the Kafka cluster * @return A consumer wrapper */ public static SimpleConsumerWrapper forPartitionConsumption(KafkaSimpleConsumerFactory simpleConsumerFactory, String bootstrapNodes, String clientId, String topic, int partition, long connectTimeoutMillis) { return new SimpleConsumerWrapper(simpleConsumerFactory, bootstrapNodes, clientId, topic, partition, connectTimeoutMillis); } @Override /** * Closes this consumer. */ public void close() throws IOException { boolean needToCloseConsumer = _currentState.isConnectedToKafkaBroker() && _simpleConsumer != null; // Reset the state machine setCurrentState(new ConnectingToBootstrapNode()); // Close the consumer if needed if (needToCloseConsumer) { _simpleConsumer.close(); _simpleConsumer = null; } } }