/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.tools;
import com.google.common.collect.Lists;
import com.linkedin.pinot.common.config.AbstractTableConfig;
import com.linkedin.pinot.common.config.TableNameBuilder;
import com.linkedin.pinot.common.metadata.ZKMetadataProvider;
import com.linkedin.pinot.common.utils.CommonConstants.Helix.TableType;
import com.linkedin.pinot.common.utils.EqualityUtils;
import com.linkedin.pinot.common.utils.helix.HelixHelper;
import com.linkedin.pinot.common.utils.retry.RetryPolicies;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.helix.ZNRecord;
import org.apache.helix.controller.rebalancer.strategy.AutoRebalanceStrategy;
import org.apache.helix.controller.stages.ClusterDataCache;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.IdealState;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command-line tool that recomputes the segment-to-server assignment of offline tables with
 * Helix's {@link AutoRebalanceStrategy} and, unless running in dry-run mode, writes the new
 * assignment into the table's ideal state.
 *
 * <p>Two sub-commands are supported by {@link #main(String[])}: rebalancing one table, and
 * rebalancing every table whose server tenant matches a given tenant name.
 */
public class PinotSegmentRebalancer extends PinotZKChanger {
  private static final Logger LOGGER = LoggerFactory.getLogger(PinotSegmentRebalancer.class);

  static final String rebalanceTableCmd = "rebalanceTable";
  static final String rebalanceTenantCmd = "rebalanceTenant";

  /** When true, the computed assignment is only logged and the ideal state is left untouched. */
  private final boolean dryRun;

  /**
   * @param zkAddress ZooKeeper address, forwarded to {@link PinotZKChanger}
   * @param clusterName Helix cluster name, forwarded to {@link PinotZKChanger}
   * @param dryRun if true, only print the current and computed assignments
   */
  public PinotSegmentRebalancer(String zkAddress, String clusterName, boolean dryRun) {
    super(zkAddress, clusterName);
    this.dryRun = dryRun;
  }

  /**
   * Counts the (segment, server) pairs of the ideal state whose state is not reflected in the
   * external view.
   *
   * @param tableName resource name to compare
   * @return number of mismatching replicas; 0 means the external view matches the ideal state
   */
  public int isStable(String tableName) {
    IdealState idealState = helixAdmin.getResourceIdealState(clusterName, tableName);
    ExternalView externalView = helixAdmin.getResourceExternalView(clusterName, tableName);
    Map<String, Map<String, String>> mapFieldsIS = idealState.getRecord().getMapFields();
    // A missing external view (null) is treated like an empty one: every replica in the ideal
    // state is reported as a mismatch instead of failing with a NullPointerException.
    Map<String, Map<String, String>> mapFieldsEV =
        (externalView == null) ? null : externalView.getRecord().getMapFields();

    int numDiff = 0;
    for (Map.Entry<String, Map<String, String>> segmentEntry : mapFieldsIS.entrySet()) {
      String segment = segmentEntry.getKey();
      Map<String, String> mapEV = (mapFieldsEV == null) ? null : mapFieldsEV.get(segment);
      for (Map.Entry<String, String> serverEntry : segmentEntry.getValue().entrySet()) {
        String server = serverEntry.getKey();
        String state = serverEntry.getValue();
        String stateInEV = (mapEV == null) ? null : mapEV.get(server);
        if (stateInEV == null || !stateInEV.equals(state)) {
          LOGGER.info("Mismatch: segment{} server:{} state:{}", segment, server, state);
          numDiff++;
        }
      }
    }
    return numDiff;
  }

  /**
   * Rebalances all tables whose server tenant matches the given tenant.
   *
   * <p>Table configs that cannot be parsed are logged and skipped.
   *
   * @param tenantName tenant name; "_OFFLINE" / "_REALTIME" markers are removed before matching
   * @throws Exception if rebalancing one of the tables fails
   */
  public void rebalanceTenantTables(String tenantName) throws Exception {
    String tableConfigPath = "/CONFIGS/TABLE";
    List<Stat> stats = new ArrayList<>();
    List<ZNRecord> tableConfigs = propertyStore.getChildren(tableConfigPath, stats, 0);
    String rawTenantName = stripTenantTypeSuffix(tenantName);

    int nRebalances = 0;
    for (ZNRecord znRecord : tableConfigs) {
      AbstractTableConfig tableConfig;
      try {
        tableConfig = AbstractTableConfig.fromZnRecord(znRecord);
      } catch (Exception e) {
        // Keep going with the remaining tables, but do not lose the reason for the failure.
        LOGGER.warn("Failed to parse table configuration for ZnRecord id: {}. Skipping", znRecord.getId(), e);
        continue;
      }
      String serverTenant = tableConfig.getTenantConfig().getServer();
      // Comparison is written this way round so a config without a server tenant is skipped.
      if (rawTenantName.equals(serverTenant)) {
        LOGGER.info("{}:{}", tableConfig.getTableName(), serverTenant);
        nRebalances++;
        // Pass the raw name: the two-argument overload appends the table-type suffix itself,
        // so forwarding a name that already carries a type marker could build a wrong tag.
        rebalanceTable(tableConfig.getTableName(), rawTenantName);
      }
    }
    if (nRebalances == 0) {
      LOGGER.info("No tables found for tenant {}", tenantName);
    }
  }

  /**
   * Rebalances a table, looking up its server tenant from the table config stored under
   * "/CONFIGS/TABLE/&lt;tableName&gt;".
   *
   * @param tableName table name, used both as config node name and as Helix resource name
   * @throws IllegalArgumentException if no table config exists for the table
   * @throws Exception if the table config cannot be parsed or rebalancing fails
   */
  public void rebalanceTable(String tableName) throws Exception {
    String tableConfigPath = "/CONFIGS/TABLE/" + tableName;
    Stat stat = new Stat();
    ZNRecord znRecord = propertyStore.get(tableConfigPath, stat, 0);
    if (znRecord == null) {
      throw new IllegalArgumentException("No table config found at " + tableConfigPath);
    }
    AbstractTableConfig tableConfig = AbstractTableConfig.fromZnRecord(znRecord);
    // The original code removed the OFFLINE marker twice and never the REALTIME one, and it
    // dropped "OFFLINE" without its leading underscore. Use the same normalisation as
    // rebalanceTenantTables instead.
    String tenantName = stripTenantTypeSuffix(tableConfig.getTenantConfig().getServer());
    rebalanceTable(tableName, tenantName);
  }

  /**
   * Rebalances a table within a tenant.
   *
   * <p>Only offline tables are handled; for any other table a warning is logged and nothing is
   * done. If the table config asks for fewer replicas than the ideal state currently has, the
   * method logs a hint to use the ChangeNumReplicas command and returns. In dry-run mode the
   * current and computed assignments are logged but the ideal state is not modified.
   *
   * @param tableName Helix resource name of the table
   * @param tenantName server tenant; the table-type suffix is added to obtain the instance tag
   * @throws Exception if updating the ideal state or waiting for stability fails
   */
  public void rebalanceTable(String tableName, String tenantName) throws Exception {
    final TableType tableType = TableNameBuilder.getTableTypeFromTableName(tableName);
    // Constant-first comparison: also covers a table name from which no type could be derived.
    if (!TableType.OFFLINE.equals(tableType)) {
      // Rebalancing works for offline tables, not any other.
      LOGGER.warn("Don't know how to rebalance table {}", tableName);
      return;
    }

    IdealState currentIdealState = helixAdmin.getResourceIdealState(clusterName, tableName);
    if (currentIdealState == null) {
      LOGGER.warn("No ideal state found for table {}, skipping", tableName);
      return;
    }
    final AbstractTableConfig offlineTableConfig =
        ZKMetadataProvider.getOfflineTableConfig(propertyStore, tableName);
    if (offlineTableConfig == null) {
      LOGGER.warn("No offline table config found for table {}, skipping", tableName);
      return;
    }

    int numReplicasInIdealState = Integer.parseInt(currentIdealState.getReplicas());
    final int numReplicasInTableConfig =
        Integer.parseInt(offlineTableConfig.getValidationConfig().getReplication());
    final int targetNumReplicas = numReplicasInTableConfig;
    if (numReplicasInTableConfig < numReplicasInIdealState) {
      // AutoRebalanceStrategy.computePartitionAssignment works correctly if we increase the number of partitions,
      // but not if we decrease it. We need to use the PinotNumReplicaChanger to reduce the number of replicas.
      LOGGER.info("You first need to reduce the number of replicas from {} to {} for table {}. Use the ChangeNumReplicas command",
          numReplicasInIdealState, numReplicasInTableConfig, tableName);
      return;
    }

    // Insertion order is kept as in the original (OFFLINE first, then ONLINE).
    LinkedHashMap<String, Integer> states = new LinkedHashMap<>();
    states.put("OFFLINE", 0);
    states.put("ONLINE", targetNumReplicas);

    final Map<String, Map<String, String>> currentMapping = currentIdealState.getRecord().getMapFields();
    Set<String> currentHosts = new HashSet<>();
    for (Map<String, String> instanceStateMap : currentMapping.values()) {
      currentHosts.addAll(instanceStateMap.keySet());
    }

    List<String> partitions = Lists.newArrayList(currentIdealState.getPartitionSet());
    AutoRebalanceStrategy rebalanceStrategy = new AutoRebalanceStrategy(tableName, partitions, states);

    String serverTenant = TableNameBuilder.forType(tableType).tableNameWithType(tenantName);
    List<String> instancesInClusterWithTag = helixAdmin.getInstancesInClusterWithTag(clusterName, serverTenant);
    List<String> enabledInstancesWithTag =
        HelixHelper.getEnabledInstancesWithTag(helixAdmin, clusterName, serverTenant);
    LOGGER.info("Current nodes: {}", currentHosts);
    LOGGER.info("New nodes: {}", instancesInClusterWithTag);
    LOGGER.info("Enabled nodes: {}", enabledInstancesWithTag);

    ZNRecord newZnRecord = rebalanceStrategy.computePartitionAssignment(instancesInClusterWithTag,
        enabledInstancesWithTag, currentMapping, new ClusterDataCache());
    final Map<String, Map<String, String>> newMapping = newZnRecord.getMapFields();

    LOGGER.info("Current segment Assignment:");
    printSegmentAssignment(currentMapping);
    LOGGER.info("Final segment Assignment:");
    printSegmentAssignment(newMapping);

    if (dryRun) {
      return;
    }
    if (EqualityUtils.isEqual(newMapping, currentMapping)) {
      LOGGER.info("Skipping rebalancing for table:{} since its already balanced", tableName);
      return;
    }

    HelixHelper.updateIdealState(helixManager, tableName,
        new com.google.common.base.Function<IdealState, IdealState>() {
          @Nullable
          @Override
          public IdealState apply(@Nullable IdealState idealState) {
            // NOTE(review): only the instances present in the new mapping are written here.
            // Presumably instances that appear solely in the previous ideal state are left
            // as they are - confirm whether they are meant to be removed as well.
            for (Map.Entry<String, Map<String, String>> segmentEntry : newMapping.entrySet()) {
              String segmentId = segmentEntry.getKey();
              for (Map.Entry<String, String> instanceEntry : segmentEntry.getValue().entrySet()) {
                idealState.setPartitionState(segmentId, instanceEntry.getKey(), instanceEntry.getValue());
              }
            }
            return idealState;
          }
        }, RetryPolicies.exponentialBackoffRetryPolicy(5, 500L, 2.0f));
    waitForStable(tableName);
    LOGGER.info("Successfully rebalanced table:{}", tableName);
  }

  /**
   * Removes the "_OFFLINE" and "_REALTIME" type markers from a tenant name so that it can be
   * compared with, or used as, a raw tenant name.
   */
  private static String stripTenantTypeSuffix(String tenantName) {
    return tenantName.replace("_OFFLINE", "").replace("_REALTIME", "");
  }

  /** Prints the command-line usage and terminates the JVM with exit code 1. */
  private static void usage() {
    System.out.println(
        "Usage: PinotRebalancer [" + rebalanceTableCmd + "|" + rebalanceTenantCmd + "] <zkAddress> <clusterName> <tableName|tenantName>");
    System.out.println("Example: " + rebalanceTableCmd + " localhost:2181 PinotCluster myTable_OFFLINE");
    System.out.println(" " + rebalanceTenantCmd + " localhost:2181 PinotCluster beanCounter");
    System.exit(1);
  }

  /**
   * Entry point. Expects exactly four arguments:
   * {@code <rebalanceTable|rebalanceTenant> <zkAddress> <clusterName> <tableName|tenantName>}.
   *
   * @param args command-line arguments
   * @throws Exception if the selected rebalance operation fails
   */
  public static void main(String[] args) throws Exception {
    // Dry-run is hard-coded here, so the command line only ever prints the computed assignment.
    final boolean dryRun = true;
    if (args.length != 4) {
      usage(); // does not return: usage() exits the JVM
    }
    final String subCmd = args[0];
    final String zkAddress = args[1];
    final String clusterName = args[2];
    final String tableOrTenant = args[3];

    PinotSegmentRebalancer rebalancer = new PinotSegmentRebalancer(zkAddress, clusterName, dryRun);
    if (subCmd.equals(rebalanceTenantCmd)) {
      rebalancer.rebalanceTenantTables(tableOrTenant);
    } else if (subCmd.equals(rebalanceTableCmd)) {
      rebalancer.rebalanceTable(tableOrTenant);
    } else {
      usage();
    }
    if (dryRun) {
      System.out.println("That was a dryrun");
    }
  }
}