/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.routing.builder;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.commons.configuration.Configuration;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.InstanceConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.linkedin.pinot.common.utils.CommonConstants;
import com.linkedin.pinot.common.utils.LLCSegmentName;
import com.linkedin.pinot.common.utils.SegmentName;


/**
 * Routing table builder for the Kafka low level consumer.
 */
public class KafkaLowLevelConsumerRoutingTableBuilder extends GeneratorBasedRoutingTableBuilder {
  private static final Logger LOGGER = LoggerFactory.getLogger(KafkaLowLevelConsumerRoutingTableBuilder.class);
  private final Random _random = new Random();
  private int TARGET_SERVER_COUNT_PER_QUERY = 8;

  @Override
  public void init(Configuration configuration) {
    // TODO jfim This is a broker-level configuration for now, until we refactor the configuration of the routing
    // table to allow per-table routing settings
    if (configuration.containsKey("realtimeTargetServerCountPerQuery")) {
      final String targetServerCountPerQuery = configuration.getString("realtimeTargetServerCountPerQuery");
      try {
        TARGET_SERVER_COUNT_PER_QUERY = Integer.parseInt(targetServerCountPerQuery);
        LOGGER.info("Using realtime target server count of {}", TARGET_SERVER_COUNT_PER_QUERY);
      } catch (Exception e) {
        LOGGER.warn(
            "Could not get the realtime target server count per query from configuration value {}, keeping default value {}",
            targetServerCountPerQuery, TARGET_SERVER_COUNT_PER_QUERY, e);
      }
    } else {
      LOGGER.info("Using default value for realtime target server count of {}", TARGET_SERVER_COUNT_PER_QUERY);
    }
  }
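
  // For illustration only: assuming the broker configuration is loaded from a standard properties file (the exact
  // source of the Configuration object depends on the broker setup), the target server count above would be set
  // with a line such as:
  //
  //   realtimeTargetServerCountPerQuery=12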

  @Override
  protected RoutingTableGenerator buildRoutingTableGenerator() {
    return new KafkaLowLevelConsumerRoutingTableGenerator();
  }

  private class KafkaLowLevelConsumerRoutingTableGenerator extends BaseRoutingTableGenerator {
    // We build the routing table based on the external view here. What we want to do is to make sure that we uphold
    // the guarantees clients expect (no duplicate records, eventual consistency) while spreading the load as evenly
    // as possible between the servers.
    //
    // Each Kafka partition contains a fraction of the data, so we need to make sure that we query all partitions.
    // Because in certain unlikely degenerate scenarios, we can consume overlapping data until segments are flushed
    // (at which point the overlapping data is discarded during the reconciliation process with the controller), we
    // need to ensure that any query sent out routes to at most one segment in CONSUMING state per Kafka partition,
    // in order to avoid duplicate records.
    //
    // Because we also want to spread the load as evenly as possible between servers, we use a weighted random
    // replica selection that favors picking replicas with fewer segments assigned to them, thus giving an
    // approximately equal distribution of load between servers.
    //
    // For example, given three replicas with 1, 2 and 3 segments assigned to each, the replica with one segment
    // should have a weight of 2, which is the maximum segment count minus the segment count for that replica. Thus,
    // each replica other than the replica(s) with the maximum segment count should have a chance of getting a
    // segment assigned to it. This corresponds to alternative three below:
    //
    // Alternative 1 (weight is sum of segment counts - segment count in that replica):
    // (6 - 1) = 5 -> P(0.4166)
    // (6 - 2) = 4 -> P(0.3333)
    // (6 - 3) = 3 -> P(0.2500)
    //
    // Alternative 2 (weight is max of segment counts - segment count in that replica + 1):
    // (3 - 1) + 1 = 3 -> P(0.5000)
    // (3 - 2) + 1 = 2 -> P(0.3333)
    // (3 - 3) + 1 = 1 -> P(0.1666)
    //
    // Alternative 3 (weight is max of segment counts - segment count in that replica):
    // (3 - 1) = 2 -> P(0.6666)
    // (3 - 2) = 1 -> P(0.3333)
    // (3 - 3) = 0 -> P(0.0000)
    //
    // Of those three weighting alternatives, the third one gives the smallest standard deviation of the number of
    // segments assigned per replica, so it is the weighting strategy used for segment assignment. Empirical testing
    // shows that for 20 segments and three replicas, the standard deviations of the three alternatives are
    // respectively 2.112, 1.496 and 0.853.
    //
    // This algorithm works as follows:
    // 1. Gather all segments and group them by Kafka partition, sorted by sequence number
    // 2. Ensure that for each Kafka partition, at most one segment is in CONSUMING state
    // 3. Sort all the segments to be used during assignment in ascending order of replicas
    // 4. For each segment to be used during assignment, pick a random replica, weighted so that replicas with fewer
    //    segments assigned to them are favored (illustrated by the sketch method below)
    //
    // The upstream code in BaseRoutingTableGenerator will generate routing tables by taking a subset of servers if
    // the cluster is large enough, and will ensure that the best routing tables are used for routing.
    private Set<String> segmentSet = new HashSet<>();
    private Set<String> instanceSet = new HashSet<>();
    private Map<String, Set<String>> segmentToInstanceMap = new HashMap<>();
    private Map<String, String[]> segmentToInstanceArrayMap = new HashMap<>();
    private Map<String, Set<String>> instanceToSegmentMap = new HashMap<>();
    private String[] instanceArray;

    protected KafkaLowLevelConsumerRoutingTableGenerator() {
      super(TARGET_SERVER_COUNT_PER_QUERY, _random);
    }
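
    // The sketch below illustrates the alternative-3 weighting described in the class comment. It is a hypothetical
    // helper for illustration only: it is not called by the routing logic (the actual weighted selection happens
    // upstream), and its name and signature are not part of the original class. Each replica is weighted by
    // (maximum segment count - its own segment count), so the most loaded replica(s) get a weight of zero and are
    // never picked unless all replicas are equally loaded.
    private String pickReplicaWeightedByLoad(List<String> replicas, Map<String, Integer> segmentCounts,
        Random random) {
      // Find the maximum number of segments assigned to any replica
      int maxSegmentCount = 0;
      for (String replica : replicas) {
        maxSegmentCount = Math.max(maxSegmentCount, segmentCounts.get(replica));
      }

      // The total weight is the sum of (max - count) over all replicas
      int totalWeight = 0;
      for (String replica : replicas) {
        totalWeight += maxSegmentCount - segmentCounts.get(replica);
      }

      // All replicas equally loaded: every weight is zero, so fall back to a uniform pick
      if (totalWeight == 0) {
        return replicas.get(random.nextInt(replicas.size()));
      }

      // Walk the cumulative weights until the random draw is exhausted; a replica with weight w is picked with
      // probability w / totalWeight, e.g. weights 2, 1 and 0 give probabilities 2/3, 1/3 and 0
      int draw = random.nextInt(totalWeight);
      for (String replica : replicas) {
        int weight = maxSegmentCount - segmentCounts.get(replica);
        if (draw < weight) {
          return replica;
        }
        draw -= weight;
      }

      // Unreachable, since the individual weights sum to totalWeight
      throw new IllegalStateException("Weighted draw did not select a replica");
    }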

    @Override
    public void init(ExternalView externalView, List<InstanceConfig> instanceConfigList) {
      // 1. Gather all segments and group them by Kafka partition, sorted by sequence number
      Map<String, SortedSet<SegmentName>> sortedSegmentsByKafkaPartition =
          new HashMap<String, SortedSet<SegmentName>>();

      for (String helixPartitionName : externalView.getPartitionSet()) {
        // Ignore segments that are not low level consumer segments
        if (!SegmentName.isLowLevelConsumerSegmentName(helixPartitionName)) {
          continue;
        }

        final LLCSegmentName segmentName = new LLCSegmentName(helixPartitionName);
        String kafkaPartitionName = segmentName.getPartitionRange();
        SortedSet<SegmentName> segmentsForPartition = sortedSegmentsByKafkaPartition.get(kafkaPartitionName);

        // Create the sorted set if necessary
        if (segmentsForPartition == null) {
          segmentsForPartition = new TreeSet<SegmentName>();
          sortedSegmentsByKafkaPartition.put(kafkaPartitionName, segmentsForPartition);
        }

        segmentsForPartition.add(segmentName);
      }

      // 2. Ensure that for each Kafka partition, we have at most one Helix partition (Pinot segment) in consuming
      // state
      Map<String, SegmentName> allowedSegmentInConsumingStateByKafkaPartition = new HashMap<String, SegmentName>();
      for (String kafkaPartition : sortedSegmentsByKafkaPartition.keySet()) {
        SortedSet<SegmentName> sortedSegmentsForKafkaPartition = sortedSegmentsByKafkaPartition.get(kafkaPartition);
        SegmentName lastAllowedSegmentInConsumingState = null;

        for (SegmentName segmentName : sortedSegmentsForKafkaPartition) {
          Map<String, String> helixPartitionState = externalView.getStateMap(segmentName.getSegmentName());
          boolean allInConsumingState = true;
          int replicasInConsumingState = 0;

          // Only keep the segment if all replicas have it in CONSUMING state
          for (String externalViewState : helixPartitionState.values()) {
            // Ignore ERROR state
            if (externalViewState.equalsIgnoreCase(
                CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.ERROR)) {
              continue;
            }

            // Not all replicas are in CONSUMING state, therefore don't consider this segment assignable to
            // CONSUMING replicas
            if (externalViewState.equalsIgnoreCase(
                CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.ONLINE)) {
              allInConsumingState = false;
              break;
            }

            // Otherwise count the replica as being in CONSUMING state
            if (externalViewState.equalsIgnoreCase(
                CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.CONSUMING)) {
              replicasInConsumingState++;
            }
          }

          // If all replicas have this segment in consuming state (and not all of them are in ERROR state), then pick
          // this segment to be the last allowed segment to be in CONSUMING state
          if (allInConsumingState && 0 < replicasInConsumingState) {
            lastAllowedSegmentInConsumingState = segmentName;
            break;
          }
        }

        if (lastAllowedSegmentInConsumingState != null) {
          allowedSegmentInConsumingStateByKafkaPartition.put(kafkaPartition, lastAllowedSegmentInConsumingState);
        }
      }
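
      // As a hypothetical example of the state at this point (segment names here are illustrative): for a table
      // "myTable" whose Kafka partition 3 holds segments myTable__3__0__(timestamp) and myTable__3__1__(timestamp)
      // with all replicas ONLINE, plus myTable__3__2__(timestamp) with all replicas CONSUMING,
      // allowedSegmentInConsumingStateByKafkaPartition maps partition "3" to myTable__3__2__(timestamp), so the
      // routing table serves the completed segments plus exactly one in-progress segment for that partition.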

      // 3. Compute the valid replicas for each segment to be used during assignment (the sorting and the weighted
      // replica selection described above happen upstream in BaseRoutingTableGenerator)
      RoutingTableInstancePruner instancePruner = new RoutingTableInstancePruner(instanceConfigList);

      for (Map.Entry<String, SortedSet<SegmentName>> entry : sortedSegmentsByKafkaPartition.entrySet()) {
        String kafkaPartition = entry.getKey();
        SortedSet<SegmentName> segmentNames = entry.getValue();

        // The only segment name which is allowed to be in CONSUMING state, or null if there is none
        SegmentName validConsumingSegment = allowedSegmentInConsumingStateByKafkaPartition.get(kafkaPartition);

        for (SegmentName segmentName : segmentNames) {
          Set<String> validReplicas = new HashSet<String>();
          String segmentNameStr = segmentName.getSegmentName();
          Map<String, String> externalViewState = externalView.getStateMap(segmentNameStr);

          for (Map.Entry<String, String> instanceAndStateEntry : externalViewState.entrySet()) {
            String instance = instanceAndStateEntry.getKey();
            String state = instanceAndStateEntry.getValue();

            // Skip pruned replicas (shutting down or otherwise disabled)
            if (instancePruner.isInactive(instance)) {
              continue;
            }

            // Replicas in ONLINE state are always allowed
            if (state.equalsIgnoreCase(
                CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.ONLINE)) {
              validReplicas.add(instance);
              continue;
            }

            // Replicas in CONSUMING state are only allowed on the last segment
            if (state.equalsIgnoreCase(
                CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel.CONSUMING)
                && segmentName.equals(validConsumingSegment)) {
              validReplicas.add(instance);
            }
          }

          if (!validReplicas.isEmpty()) {
            segmentSet.add(segmentNameStr);
            instanceSet.addAll(validReplicas);
            segmentToInstanceMap.put(segmentNameStr, validReplicas);
            segmentToInstanceArrayMap.put(segmentNameStr, validReplicas.toArray(new String[validReplicas.size()]));

            for (String validReplica : validReplicas) {
              Set<String> segmentsForReplica = instanceToSegmentMap.get(validReplica);
              if (segmentsForReplica == null) {
                segmentsForReplica = new HashSet<>();
                instanceToSegmentMap.put(validReplica, segmentsForReplica);
              }
              segmentsForReplica.add(segmentNameStr);
            }
          }

          // If this segment is the segment allowed in CONSUMING state, don't process segments after it in that Kafka
          // partition
          if (segmentName.equals(validConsumingSegment)) {
            break;
          }
        }
      }

      instanceArray = instanceSet.toArray(new String[instanceSet.size()]);
    }

    @Override
    protected Set<String> getSegmentSet() {
      return segmentSet;
    }

    @Override
    protected String[] getInstanceArray() {
      return instanceArray;
    }

    @Override
    protected Set<String> getInstanceSet() {
      return instanceSet;
    }

    @Override
    protected Map<String, Set<String>> getInstanceToSegmentMap() {
      return instanceToSegmentMap;
    }

    @Override
    protected Map<String, String[]> getSegmentToInstanceArrayMap() {
      return segmentToInstanceArrayMap;
    }

    @Override
    protected Map<String, Set<String>> getSegmentToInstanceMap() {
      return segmentToInstanceMap;
    }
  }
}