/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.controller.helix.core; import com.linkedin.pinot.common.config.AbstractTableConfig; import com.linkedin.pinot.common.metadata.ZKMetadataProvider; import com.linkedin.pinot.common.metadata.instance.InstanceZKMetadata; import com.linkedin.pinot.common.metadata.stream.KafkaStreamMetadata; import com.linkedin.pinot.common.utils.CommonConstants; import com.linkedin.pinot.common.utils.CommonConstants.Helix; import com.linkedin.pinot.common.utils.ControllerTenantNameBuilder; import com.linkedin.pinot.common.utils.StringUtil; import com.linkedin.pinot.common.utils.retry.RetryPolicies; import com.linkedin.pinot.common.utils.retry.RetryPolicy; import com.linkedin.pinot.controller.helix.core.realtime.PinotLLCRealtimeSegmentManager; import com.linkedin.pinot.core.realtime.impl.kafka.KafkaSimpleConsumerFactoryImpl; import com.linkedin.pinot.core.realtime.impl.kafka.SimpleConsumerWrapper; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.Callable; import org.apache.commons.compress.utils.IOUtils; import org.apache.helix.HelixAdmin; import org.apache.helix.ZNRecord; import org.apache.helix.model.IdealState; import org.apache.helix.model.builder.CustomModeISBuilder; import org.apache.helix.store.zk.ZkHelixPropertyStore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Pinot data server layer IdealState builder. * * */ public class PinotTableIdealStateBuilder { private static final Logger LOGGER = LoggerFactory.getLogger(PinotTableIdealStateBuilder.class); public static final String ONLINE = "ONLINE"; public static final String OFFLINE = "OFFLINE"; public static final String DROPPED = "DROPPED"; private static final long KAFKA_CONNECTION_TIMEOUT_MILLIS = 10000L; /** * * Building an empty idealState for a given table. * Used when creating a new table. * * @param tableName resource name * @param numCopies is the number of replicas * @return */ public static IdealState buildEmptyIdealStateFor(String tableName, int numCopies) { final CustomModeISBuilder customModeIdealStateBuilder = new CustomModeISBuilder(tableName); final int replicas = numCopies; customModeIdealStateBuilder .setStateModel(PinotHelixSegmentOnlineOfflineStateModelGenerator.PINOT_SEGMENT_ONLINE_OFFLINE_STATE_MODEL) .setNumPartitions(0).setNumReplica(replicas).setMaxPartitionsPerNode(1); final IdealState idealState = customModeIdealStateBuilder.build(); idealState.setInstanceGroupTag(tableName); return idealState; } /** * * Building an empty idealState for a given table. * Used when creating a new table. * * @param helixAdmin * @param helixClusterName * @return */ public static IdealState buildEmptyIdealStateForBrokerResource(HelixAdmin helixAdmin, String helixClusterName) { final CustomModeISBuilder customModeIdealStateBuilder = new CustomModeISBuilder(CommonConstants.Helix.BROKER_RESOURCE_INSTANCE); customModeIdealStateBuilder .setStateModel( PinotHelixBrokerResourceOnlineOfflineStateModelGenerator.PINOT_BROKER_RESOURCE_ONLINE_OFFLINE_STATE_MODEL) .setMaxPartitionsPerNode(Integer.MAX_VALUE).setNumReplica(Integer.MAX_VALUE) .setNumPartitions(Integer.MAX_VALUE); final IdealState idealState = customModeIdealStateBuilder.build(); return idealState; } public static IdealState addNewRealtimeSegmentToIdealState(String segmentId, IdealState state, String instanceName) { state.setPartitionState(segmentId, instanceName, ONLINE); state.setNumPartitions(state.getNumPartitions() + 1); return state; } /** * Remove a segment is also required to recompute the ideal state. * * @param tableName * @param segmentId * @param helixAdmin * @param helixClusterName * @return */ public synchronized static IdealState dropSegmentFromIdealStateFor(String tableName, String segmentId, HelixAdmin helixAdmin, String helixClusterName) { final IdealState currentIdealState = helixAdmin.getResourceIdealState(helixClusterName, tableName); final Set<String> currentInstanceSet = currentIdealState.getInstanceSet(segmentId); if (!currentInstanceSet.isEmpty() && currentIdealState.getPartitionSet().contains(segmentId)) { for (String instanceName : currentIdealState.getInstanceSet(segmentId)) { currentIdealState.setPartitionState(segmentId, instanceName, "DROPPED"); } } else { throw new RuntimeException("Cannot found segmentId - " + segmentId + " in table - " + tableName); } return currentIdealState; } /** * Remove a segment is also required to recompute the ideal state. * * @param tableName * @param segmentId * @param helixAdmin * @param helixClusterName * @return */ public synchronized static IdealState removeSegmentFromIdealStateFor(String tableName, String segmentId, HelixAdmin helixAdmin, String helixClusterName) { final IdealState currentIdealState = helixAdmin.getResourceIdealState(helixClusterName, tableName); if (currentIdealState != null && currentIdealState.getPartitionSet() != null && currentIdealState.getPartitionSet().contains(segmentId)) { currentIdealState.getPartitionSet().remove(segmentId); } else { throw new RuntimeException("Cannot found segmentId - " + segmentId + " in table - " + tableName); } return currentIdealState; } /** * * @param brokerResourceName * @param helixAdmin * @param helixClusterName * @return */ public static IdealState removeBrokerResourceFromIdealStateFor(String brokerResourceName, HelixAdmin helixAdmin, String helixClusterName) { final IdealState currentIdealState = helixAdmin.getResourceIdealState(helixClusterName, CommonConstants.Helix.BROKER_RESOURCE_INSTANCE); final Set<String> currentInstanceSet = currentIdealState.getInstanceSet(brokerResourceName); if (!currentInstanceSet.isEmpty() && currentIdealState.getPartitionSet().contains(brokerResourceName)) { currentIdealState.getPartitionSet().remove(brokerResourceName); } else { throw new RuntimeException("Cannot found broker resource - " + brokerResourceName + " in broker resource "); } return currentIdealState; } public static IdealState buildInitialHighLevelRealtimeIdealStateFor(String realtimeTableName, AbstractTableConfig realtimeTableConfig, HelixAdmin helixAdmin, String helixClusterName, ZkHelixPropertyStore<ZNRecord> zkHelixPropertyStore) { String realtimeServerTenant = ControllerTenantNameBuilder.getRealtimeTenantNameForTenant(realtimeTableConfig.getTenantConfig().getServer()); final List<String> realtimeInstances = helixAdmin.getInstancesInClusterWithTag(helixClusterName, realtimeServerTenant); IdealState idealState = buildEmptyKafkaConsumerRealtimeIdealStateFor(realtimeTableName, 1); if (realtimeInstances.size() % Integer.parseInt(realtimeTableConfig.getValidationConfig().getReplication()) != 0) { throw new RuntimeException( "Number of instance in current tenant should be an integer multiples of the number of replications"); } setupInstanceConfigForKafkaHighLevelConsumer(realtimeTableName, realtimeInstances.size(), Integer.parseInt(realtimeTableConfig.getValidationConfig().getReplication()), realtimeTableConfig.getIndexingConfig().getStreamConfigs(), zkHelixPropertyStore, realtimeInstances); return idealState; } public static void buildLowLevelRealtimeIdealStateFor(String realtimeTableName, AbstractTableConfig realtimeTableConfig, HelixAdmin helixAdmin, String helixClusterName, IdealState idealState) { String realtimeServerTenant = ControllerTenantNameBuilder.getRealtimeTenantNameForTenant(realtimeTableConfig.getTenantConfig().getServer()); final List<String> realtimeInstances = helixAdmin.getInstancesInClusterWithTag(helixClusterName, realtimeServerTenant); boolean create = false; final String replicasPerPartitionStr = realtimeTableConfig.getValidationConfig().getReplicasPerPartition(); if (replicasPerPartitionStr == null || replicasPerPartitionStr.isEmpty()) { throw new RuntimeException("Null or empty value for replicasPerPartition, expected a number"); } final int nReplicas; try { nReplicas = Integer.valueOf(replicasPerPartitionStr); } catch (NumberFormatException e) { throw new PinotHelixResourceManager.InvalidTableConfigException( "Invalid value for replicasPerPartition, expected a number: " + replicasPerPartitionStr, e); } if (idealState == null) { idealState = buildEmptyKafkaConsumerRealtimeIdealStateFor(realtimeTableName, nReplicas); create = true; } LOGGER.info("Assigning partitions to instances for simple consumer for table {}", realtimeTableName); final KafkaStreamMetadata kafkaMetadata = new KafkaStreamMetadata(realtimeTableConfig.getIndexingConfig().getStreamConfigs()); final String topicName = kafkaMetadata.getKafkaTopicName(); final PinotLLCRealtimeSegmentManager segmentManager = PinotLLCRealtimeSegmentManager.getInstance(); final int nPartitions = getPartitionCount(kafkaMetadata); LOGGER.info("Assigning {} partitions to instances for simple consumer for table {}", nPartitions, realtimeTableName); segmentManager.setupHelixEntries(topicName, realtimeTableName, nPartitions, realtimeInstances, nReplicas, kafkaMetadata.getKafkaConsumerProperties().get(Helix.DataSource.Realtime.Kafka.AUTO_OFFSET_RESET), kafkaMetadata.getBootstrapHosts(), idealState, create, PinotLLCRealtimeSegmentManager.getRealtimeTableFlushSize(realtimeTableConfig)); } public static int getPartitionCount(KafkaStreamMetadata kafkaMetadata) { KafkaPartitionsCountFetcher fetcher = new KafkaPartitionsCountFetcher(kafkaMetadata); RetryPolicy policy = RetryPolicies.noDelayRetryPolicy(3); boolean successful = policy.attempt(fetcher); if (successful) { return fetcher.getPartitionCount(); } else { Exception e = fetcher.getException(); LOGGER.error("Could not get partition count for {}", kafkaMetadata.getKafkaTopicName(), e); throw new RuntimeException(e); } } public static IdealState buildEmptyKafkaConsumerRealtimeIdealStateFor(String realtimeTableName, int replicaCount) { final CustomModeISBuilder customModeIdealStateBuilder = new CustomModeISBuilder(realtimeTableName); customModeIdealStateBuilder .setStateModel(PinotHelixSegmentOnlineOfflineStateModelGenerator.PINOT_SEGMENT_ONLINE_OFFLINE_STATE_MODEL) .setNumPartitions(0).setNumReplica(replicaCount).setMaxPartitionsPerNode(1); final IdealState idealState = customModeIdealStateBuilder.build(); idealState.setInstanceGroupTag(realtimeTableName); return idealState; } private static void setupInstanceConfigForKafkaHighLevelConsumer(String realtimeTableName, int numDataInstances, int numDataReplicas, Map<String, String> streamProviderConfig, ZkHelixPropertyStore<ZNRecord> zkHelixPropertyStore, List<String> instanceList) { int numInstancesPerReplica = numDataInstances / numDataReplicas; int partitionId = 0; int replicaId = 0; String groupId = getGroupIdFromRealtimeDataTable(realtimeTableName, streamProviderConfig); for (int i = 0; i < numInstancesPerReplica * numDataReplicas; ++i) { String instance = instanceList.get(i); InstanceZKMetadata instanceZKMetadata = ZKMetadataProvider.getInstanceZKMetadata(zkHelixPropertyStore, instance); if (instanceZKMetadata == null) { instanceZKMetadata = new InstanceZKMetadata(); String[] instanceConfigs = instance.split("_"); assert (instanceConfigs.length == 3); instanceZKMetadata.setInstanceType(instanceConfigs[0]); instanceZKMetadata.setInstanceName(instanceConfigs[1]); instanceZKMetadata.setInstancePort(Integer.parseInt(instanceConfigs[2])); } instanceZKMetadata.setGroupId(realtimeTableName, groupId + "_" + replicaId); instanceZKMetadata.setPartition(realtimeTableName, Integer.toString(partitionId)); partitionId = (partitionId + 1) % numInstancesPerReplica; if (partitionId == 0) { replicaId++; } ZKMetadataProvider.setInstanceZKMetadata(zkHelixPropertyStore, instanceZKMetadata); } } private static String getGroupIdFromRealtimeDataTable(String realtimeTableName, Map<String, String> streamProviderConfig) { String keyOfGroupId = StringUtil .join(".", Helix.DataSource.STREAM_PREFIX, Helix.DataSource.Realtime.Kafka.HighLevelConsumer.GROUP_ID); String groupId = StringUtil.join("_", realtimeTableName, System.currentTimeMillis() + ""); if (streamProviderConfig.containsKey(keyOfGroupId) && !streamProviderConfig.get(keyOfGroupId).isEmpty()) { groupId = streamProviderConfig.get(keyOfGroupId); } return groupId; } private static class KafkaPartitionsCountFetcher implements Callable<Boolean> { private int _partitionCount = -1; private final KafkaStreamMetadata _kafkaStreamMetadata; private Exception _exception; private KafkaPartitionsCountFetcher(KafkaStreamMetadata kafkaStreamMetadata) { _kafkaStreamMetadata = kafkaStreamMetadata; } private int getPartitionCount() { return _partitionCount; } private Exception getException() { return _exception; } @Override public Boolean call() throws Exception { final String bootstrapHosts = _kafkaStreamMetadata.getBootstrapHosts(); final String kafkaTopicName = _kafkaStreamMetadata.getKafkaTopicName(); if (bootstrapHosts == null || bootstrapHosts.isEmpty()) { throw new RuntimeException("Invalid value for " + Helix.DataSource.Realtime.Kafka.KAFKA_BROKER_LIST); } SimpleConsumerWrapper consumerWrapper = SimpleConsumerWrapper.forMetadataConsumption( new KafkaSimpleConsumerFactoryImpl(), bootstrapHosts, PinotTableIdealStateBuilder.class.getSimpleName() + "-" + kafkaTopicName, KAFKA_CONNECTION_TIMEOUT_MILLIS); try { _partitionCount = consumerWrapper.getPartitionCount(kafkaTopicName, /*maxWaitTimeMs=*/5000L); if (_exception != null) { // We had at least one failure, but succeeded now. Log an info LOGGER.info("Successfully retrieved partition count as {} for {}", _partitionCount, kafkaTopicName); } return Boolean.TRUE; } catch (SimpleConsumerWrapper.TransientConsumerException e) { LOGGER.warn("Could not get Kafka partition count for {}:{}", kafkaTopicName, e.getMessage()); _exception = e; return Boolean.FALSE; } catch (Exception e) { _exception = e; throw e; } finally { IOUtils.closeQuietly(consumerWrapper); } } } }