/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.routing.builder;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.InstanceConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.linkedin.pinot.common.utils.CommonConstants;
import com.linkedin.pinot.common.utils.LLCSegmentName;
import com.linkedin.pinot.common.utils.SegmentName;
import com.linkedin.pinot.routing.ServerToSegmentSetMap;
/**
* Routing table builder for the Kafka low level consumer.
*/
public class KafkaLowLevelConsumerRoutingTableBuilder extends GeneratorBasedRoutingTableBuilder {
private static final Logger LOGGER = LoggerFactory.getLogger(KafkaLowLevelConsumerRoutingTableBuilder.class);
private final Random _random = new Random();
private int TARGET_SERVER_COUNT_PER_QUERY = 8;
@Override
public void init(Configuration configuration) {
// TODO jfim This is a broker-level configuration for now, until we refactor the configuration of the routing table to allow per-table routing settings
if (configuration.containsKey("realtimeTargetServerCountPerQuery")) {
final String targetServerCountPerQuery = configuration.getString("realtimeTargetServerCountPerQuery");
try {
TARGET_SERVER_COUNT_PER_QUERY = Integer.parseInt(targetServerCountPerQuery);
LOGGER.info("Using realtime target server count of {}", TARGET_SERVER_COUNT_PER_QUERY);
} catch (Exception e) {
LOGGER.warn(
"Could not get the realtime target server count per query from configuration value {}, keeping default value {}",
targetServerCountPerQuery, TARGET_SERVER_COUNT_PER_QUERY, e);
}
} else {
LOGGER.info("Using default value for realtime target server count of {}", TARGET_SERVER_COUNT_PER_QUERY);
}
}
@Override
protected RoutingTableGenerator buildRoutingTableGenerator() {
return new KafkaLowLevelConsumerRoutingTableGenerator();
}
private class KafkaLowLevelConsumerRoutingTableGenerator extends BaseRoutingTableGenerator {
// We build the routing table based off the external view here. What we want to do is to make sure that we uphold
// the guarantees clients expect (no duplicate records, eventual consistency) and spreading the load as equally as
// possible between the servers.
//
// Each Kafka partition contains a fraction of the data, so we need to make sure that we query all partitions.
// Because in certain unlikely degenerate scenarios, we can consume overlapping data until segments are flushed (at
// which point the overlapping data is discarded during the reconciliation process with the controller), we need to
// ensure that the query that is sent has only one partition in CONSUMING state in order to avoid duplicate records.
//
// Because we also want to want to spread the load as equally as possible between servers, we use a weighted random
// replica selection that favors picking replicas with fewer segments assigned to them, thus having an approximately
// equal distribution of load between servers.
//
// For example, given three replicas with 1, 2 and 3 segments assigned to each, the replica with one segment should
// have a weight of 2, which is the maximum segment count minus the segment count for that replica. Thus, each
// replica other than the replica(s) with the maximum segment count should have a chance of getting a segment
// assigned to it. This corresponds to alternative three below:
//
// Alternative 1 (weight is sum of segment counts - segment count in that replica):
// (6 - 1) = 5 -> P(0.4166)
// (6 - 2) = 4 -> P(0.3333)
// (6 - 3) = 3 -> P(0.2500)
//
// Alternative 2 (weight is max of segment counts - segment count in that replica + 1):
// (3 - 1) + 1 = 3 -> P(0.5000)
// (3 - 2) + 1 = 2 -> P(0.3333)
// (3 - 3) + 1 = 1 -> P(0.1666)
//
// Alternative 3 (weight is max of segment counts - segment count in that replica):
// (3 - 1) = 2 -> P(0.6666)
// (3 - 2) = 1 -> P(0.3333)
// (3 - 3) = 0 -> P(0.0000)
//
// Of those three weighting alternatives, the third one has the smallest standard deviation of the number of
// segments assigned per replica, so it corresponds to the weighting strategy used for segment assignment. Empirical
// testing shows that for 20 segments and three replicas, the standard deviation of each alternative is respectively
// 2.112, 1.496 and 0.853.
//
// This algorithm works as follows:
// 1. Gather all segments and group them by Kafka partition, sorted by sequence number
// 2. Ensure that for each partition, we have at most one partition in consuming state
// 3. Sort all the segments to be used during assignment in ascending order of replicas
// 4. For each segment to be used during assignment, pick a random replica, weighted by the number of segments
// assigned to each replica.
//
// The upstream code in BaseRoutingTableGenerator will generate routing tables based on taking a subset of servers
// if the cluster is large enough as well as ensure that the best routing tables are used for routing.
private Set<String> segmentSet = new HashSet<>();
private Set<String> instanceSet = new HashSet<>();
private Map<String, Set<String>> segmentToInstanceMap = new HashMap<>();
private Map<String, String[]> segmentToInstanceArrayMap = new HashMap<>();
private Map<String, Set<String>> instanceToSegmentMap = new HashMap<>();
private String[] instanceArray;
protected KafkaLowLevelConsumerRoutingTableGenerator() {
super(TARGET_SERVER_COUNT_PER_QUERY, _random);
}
@Override
public void init(ExternalView externalView, List<InstanceConfig> instanceConfigList) {
// 1. Gather all segments and group them by Kafka partition, sorted by sequence number
Map<String, SortedSet<SegmentName>> sortedSegmentsByKafkaPartition = new HashMap<String, SortedSet<SegmentName>>();
for (String helixPartitionName : externalView.getPartitionSet()) {
// Ignore segments that are not low level consumer segments
if (!SegmentName.isLowLevelConsumerSegmentName(helixPartitionName)) {
continue;
}
final LLCSegmentName segmentName = new LLCSegmentName(helixPartitionName);
String kafkaPartitionName = segmentName.getPartitionRange();
SortedSet<SegmentName> segmentsForPartition = sortedSegmentsByKafkaPartition.get(kafkaPartitionName);
// Create sorted set if necessary
if (segmentsForPartition == null) {
segmentsForPartition = new TreeSet<SegmentName>();
sortedSegmentsByKafkaPartition.put(kafkaPartitionName, segmentsForPartition);
}
segmentsForPartition.add(segmentName);
}
// 2. Ensure that for each Kafka partition, we have at most one Helix partition (Pinot segment) in consuming state
Map<String, SegmentName> allowedSegmentInConsumingStateByKafkaPartition = new HashMap<String, SegmentName>();
for (String kafkaPartition : sortedSegmentsByKafkaPartition.keySet()) {
SortedSet<SegmentName> sortedSegmentsForKafkaPartition = sortedSegmentsByKafkaPartition.get(kafkaPartition);
SegmentName lastAllowedSegmentInConsumingState = null;
for (SegmentName segmentName : sortedSegmentsForKafkaPartition) {
Map<String, String> helixPartitionState = externalView.getStateMap(segmentName.getSegmentName());
boolean allInConsumingState = true;
int replicasInConsumingState = 0;
// Only keep the segment if all replicas have it in CONSUMING state
for (String externalViewState : helixPartitionState.values()) {
// Ignore ERROR state
if (externalViewState.equalsIgnoreCase(
CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.ERROR)) {
continue;
}
// Not all segments are in CONSUMING state, therefore don't consider the last segment assignable to CONSUMING
// replicas
if (externalViewState.equalsIgnoreCase(
CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.ONLINE)) {
allInConsumingState = false;
break;
}
// Otherwise count the replica as being in CONSUMING state
if (externalViewState.equalsIgnoreCase(
CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.CONSUMING)) {
replicasInConsumingState++;
}
}
// If all replicas have this segment in consuming state (and not all of them are in ERROR state), then pick this
// segment to be the last allowed segment to be in CONSUMING state
if (allInConsumingState && 0 < replicasInConsumingState) {
lastAllowedSegmentInConsumingState = segmentName;
break;
}
}
if (lastAllowedSegmentInConsumingState != null) {
allowedSegmentInConsumingStateByKafkaPartition.put(kafkaPartition, lastAllowedSegmentInConsumingState);
}
}
// 3. Sort all the segments to be used during assignment in ascending order of replicas
// PriorityQueue throws IllegalArgumentException when given a size of zero
RoutingTableInstancePruner instancePruner = new RoutingTableInstancePruner(instanceConfigList);
for (Map.Entry<String, SortedSet<SegmentName>> entry : sortedSegmentsByKafkaPartition.entrySet()) {
String kafkaPartition = entry.getKey();
SortedSet<SegmentName> segmentNames = entry.getValue();
// The only segment name which is allowed to be in CONSUMING state or null
SegmentName validConsumingSegment = allowedSegmentInConsumingStateByKafkaPartition.get(kafkaPartition);
for (SegmentName segmentName : segmentNames) {
Set<String> validReplicas = new HashSet<String>();
String segmentNameStr = segmentName.getSegmentName();
Map<String, String> externalViewState = externalView.getStateMap(segmentNameStr);
for (Map.Entry<String, String> instanceAndStateEntry : externalViewState.entrySet()) {
String instance = instanceAndStateEntry.getKey();
String state = instanceAndStateEntry.getValue();
// Skip pruned replicas (shutting down or otherwise disabled)
if (instancePruner.isInactive(instance)) {
continue;
}
// Replicas in ONLINE state are always allowed
if (state.equalsIgnoreCase(CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.ONLINE)) {
validReplicas.add(instance);
continue;
}
// Replicas in CONSUMING state are only allowed on the last segment
if (state.equalsIgnoreCase(CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.CONSUMING)
&& segmentName.equals(validConsumingSegment)) {
validReplicas.add(instance);
}
}
if (!validReplicas.isEmpty()) {
segmentSet.add(segmentNameStr);
instanceSet.addAll(validReplicas);
segmentToInstanceMap.put(segmentNameStr, validReplicas);
segmentToInstanceArrayMap.put(segmentNameStr, validReplicas.toArray(new String[validReplicas.size()]));
for (String validReplica : validReplicas) {
Set<String> segmentsForReplica = instanceToSegmentMap.get(validReplica);
if (segmentsForReplica == null) {
segmentsForReplica = new HashSet<>();
instanceToSegmentMap.put(validReplica, segmentsForReplica);
}
segmentsForReplica.add(segmentNameStr);
}
}
// If this segment is the segment allowed in CONSUMING state, don't process segments after it in that Kafka
// partition
if (segmentName.equals(validConsumingSegment)) {
break;
}
}
}
instanceArray = instanceSet.toArray(new String[instanceSet.size()]);
}
@Override
protected Set<String> getSegmentSet() {
return segmentSet;
}
@Override
protected String[] getInstanceArray() {
return instanceArray;
}
@Override
protected Set<String> getInstanceSet() {
return instanceSet;
}
@Override
protected Map<String, Set<String>> getInstanceToSegmentMap() {
return instanceToSegmentMap;
}
@Override
protected Map<String, String[]> getSegmentToInstanceArrayMap() {
return segmentToInstanceArrayMap;
}
@Override
protected Map<String, Set<String>> getSegmentToInstanceMap() {
return segmentToInstanceMap;
}
}
}