/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.controller.helix;
import com.linkedin.pinot.common.config.TableNameBuilder;
import com.linkedin.pinot.common.metadata.ZKMetadataProvider;
import com.linkedin.pinot.common.metadata.segment.OfflineSegmentZKMetadata;
import com.linkedin.pinot.common.metrics.ControllerGauge;
import com.linkedin.pinot.common.metrics.ControllerMetrics;
import com.linkedin.pinot.common.utils.CommonConstants;
import com.linkedin.pinot.common.utils.CommonConstants.Helix.TableType;
import com.linkedin.pinot.controller.ControllerConf;
import com.linkedin.pinot.controller.helix.core.PinotHelixResourceManager;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import org.apache.helix.ControllerChangeListener;
import org.apache.helix.HelixAdmin;
import org.apache.helix.NotificationContext;
import org.apache.helix.ZNRecord;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.IdealState;
import org.apache.helix.store.zk.ZkHelixPropertyStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Manages the segment status metrics, regarding tables with fewer replicas than requested
* and segments in error state
*
* May 15, 2016
*/
public class SegmentStatusChecker {
private static final Logger LOGGER = LoggerFactory.getLogger(SegmentStatusChecker.class);
private static final int SegmentCheckerDefaultIntervalSeconds = 120;
private static final int SegmentCheckerDefaultWaitForPushTimeSeconds = 600 * 1000;
private static final int MaxOfflineSegmentsToLog = 5;
private static final String CONTROLLER_LEADER_CHANGE = "CONTROLLER LEADER CHANGE";
public static final String ONLINE = "ONLINE";
public static final String ERROR = "ERROR";
private ScheduledExecutorService _executorService;
ControllerMetrics _metricsRegistry;
private ControllerConf _config;
private final PinotHelixResourceManager _pinotHelixResourceManager;
private final HelixAdmin _helixAdmin;
private final long _segmentStatusIntervalSeconds;
private final int _waitForPushTimeSeconds;
/**
* Constructs the segment status checker.
* @param pinotHelixResourceManager The resource checker used to interact with Helix
* @param config The controller configuration object
*/
public SegmentStatusChecker(PinotHelixResourceManager pinotHelixResourceManager, ControllerConf config) {
_pinotHelixResourceManager = pinotHelixResourceManager;
_helixAdmin = pinotHelixResourceManager.getHelixAdmin();
_segmentStatusIntervalSeconds = config.getStatusCheckerFrequencyInSeconds();
_waitForPushTimeSeconds = config.getStatusCheckerWaitForPushTimeInSeconds();
}
/**
* Starts the segment status checker.
*/
public void start(ControllerMetrics metricsRegistry) {
if (_segmentStatusIntervalSeconds == -1) {
return;
}
_metricsRegistry = metricsRegistry;
setStatusToDefault();
// Subscribe to leadership changes
_pinotHelixResourceManager.getHelixZkManager().addControllerListener(new ControllerChangeListener() {
@Override
public void onControllerChange(NotificationContext changeContext) {
processLeaderChange(CONTROLLER_LEADER_CHANGE);
}
});
}
private void startThread() {
LOGGER.info("Starting segment status checker");
if (_executorService == null) {
_executorService = Executors.newSingleThreadScheduledExecutor(new ThreadFactory() {
@Override
public Thread newThread(Runnable runnable) {
Thread thread = new Thread(runnable);
thread.setName("SegStatChecker");
return thread;
}
});
// Set up an executor that executes segment status tasks periodically
_executorService.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
try {
runSegmentMetrics();
} catch (Exception e) {
LOGGER.warn("Caught exception while running segment status checker", e);
}
}
}, SegmentCheckerDefaultIntervalSeconds, _segmentStatusIntervalSeconds, TimeUnit.SECONDS);
} else {
LOGGER.warn("SegmentStatusChecker already running. Attempt to start a duplicate thread");
}
}
/**
* Stops the segment status checker.
*/
public void stop() {
if (_executorService == null) {
return;
}
stopThread();
}
private void stopThread() {
// Shut down the executor
_executorService.shutdown();
try {
_executorService.awaitTermination(SegmentCheckerDefaultIntervalSeconds, TimeUnit.SECONDS);
} catch (InterruptedException e) {
}
_executorService = null;
}
/**
* Runs a segment status pass over the currently loaded tables.
*/
public void runSegmentMetrics() {
if (!_pinotHelixResourceManager.isLeader()) {
LOGGER.info("Skipping Segment Status check, not leader!");
setStatusToDefault();
stop();
return;
}
long startTime = System.nanoTime();
LOGGER.info("Starting Segment Status check for metrics");
// Fetch the list of tables
List<String> allTableNames = _pinotHelixResourceManager.getAllTables();
String helixClusterName = _pinotHelixResourceManager.getHelixClusterName();
HelixAdmin helixAdmin = _pinotHelixResourceManager.getHelixAdmin();
int realTimeTableCount = 0;
int offlineTableCount = 0;
ZkHelixPropertyStore<ZNRecord> propertyStore= _pinotHelixResourceManager.getPropertyStore();
for (String tableName : allTableNames) {
if (TableNameBuilder.getTableTypeFromTableName(tableName).equals(CommonConstants.Helix.TableType.OFFLINE)) {
offlineTableCount++;
} else {
realTimeTableCount++;
}
IdealState idealState = helixAdmin.getResourceIdealState(helixClusterName, tableName);
if ((idealState == null) || (idealState.getPartitionSet().isEmpty())) {
int nReplicasFromIdealState = 1;
try {
if (idealState != null) {
nReplicasFromIdealState = Integer.valueOf(idealState.getReplicas());
}
} catch (NumberFormatException e) {
// Ignore
}
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.NUMBER_OF_REPLICAS, nReplicasFromIdealState);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_OF_REPLICAS, 100);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE, 100);
continue;
}
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.IDEALSTATE_ZNODE_SIZE, idealState.toString().length());
ExternalView externalView = helixAdmin.getResourceExternalView(helixClusterName, tableName);
int nReplicasIdealMax = 0; // Keeps track of maximum number of replicas in ideal state
int nReplicasExternal = -1; // Keeps track of minimum number of replicas in external view
int nErrors = 0; // Keeps track of number of segments in error state
int nOffline = 0; // Keeeps track of number segments with no online replicas
int nSegments = 0; // Counts number of segments
for (String partitionName : idealState.getPartitionSet()) {
int nReplicas = 0;
int nIdeal = 0;
nSegments++;
// Skip segments not online in ideal state
for (Map.Entry<String, String> serverAndState : idealState.getInstanceStateMap(partitionName).entrySet()) {
if (serverAndState == null) {
break;
}
if (serverAndState.getValue().equals(ONLINE)){
nIdeal++;
break;
}
}
if (nIdeal == 0) {
// No online segments in ideal state
continue;
}
nReplicasIdealMax = (idealState.getInstanceStateMap(partitionName).size() > nReplicasIdealMax)
? idealState.getInstanceStateMap(partitionName).size() : nReplicasIdealMax;
if ((externalView == null) || (externalView.getStateMap(partitionName) == null)) {
// No replicas for this segment
TableType tableType = TableNameBuilder.getTableTypeFromTableName(tableName);
if ((tableType != null) && (tableType.equals(TableType.OFFLINE))) {
OfflineSegmentZKMetadata segmentZKMetadata =
ZKMetadataProvider.getOfflineSegmentZKMetadata(propertyStore, tableName, partitionName);
if (segmentZKMetadata != null && segmentZKMetadata.getPushTime() > System.currentTimeMillis() - _waitForPushTimeSeconds * 1000) {
// push not yet finished, skip
continue;
}
}
nOffline++;
if (nOffline < MaxOfflineSegmentsToLog) {
LOGGER.warn("Segment {} of table {} has no replicas", partitionName, tableName);
}
nReplicasExternal = 0;
continue;
}
for (Map.Entry<String, String> serverAndState : externalView.getStateMap(partitionName).entrySet()) {
// Count number of online replicas
if (serverAndState.getValue().equals(ONLINE)) {
nReplicas++;
}
if (serverAndState.getValue().equals(ERROR)) {
nErrors++;
}
}
if (nReplicas == 0) {
if (nOffline < MaxOfflineSegmentsToLog) {
LOGGER.warn("Segment {} of table {} has no online replicas", partitionName, tableName);
}
nOffline++;
}
nReplicasExternal = ((nReplicasExternal > nReplicas) || (nReplicasExternal == -1)) ? nReplicas : nReplicasExternal;
}
if (nReplicasExternal == -1){
nReplicasExternal = (nReplicasIdealMax == 0) ? 1 : 0;
}
// Synchronization provided by Controller Gauge to make sure that only one thread updates the gauge
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.NUMBER_OF_REPLICAS,
nReplicasExternal);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_OF_REPLICAS,
(nReplicasIdealMax > 0) ? (nReplicasExternal * 100 / nReplicasIdealMax) : 100);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE,
nErrors);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE,
(nSegments > 0) ? (100 - (nOffline * 100 / nSegments)) : 100);
if (nOffline > 0) {
LOGGER.warn("Table {} has {} segments with no online replicas", tableName, nOffline);
}
if (nReplicasExternal < nReplicasIdealMax) {
LOGGER.warn("Table {} has {} replicas, below replication threshold :{}", tableName,
nReplicasExternal, nReplicasIdealMax);
}
}
_metricsRegistry.setValueOfGlobalGauge(ControllerGauge.REALTIME_TABLE_COUNT, realTimeTableCount);
_metricsRegistry.setValueOfGlobalGauge(ControllerGauge.OFFLINE_TABLE_COUNT, offlineTableCount);
long totalNanos = System.nanoTime() - startTime;
LOGGER.info("Segment status metrics completed in {}ms",
TimeUnit.MILLISECONDS.convert(totalNanos, TimeUnit.NANOSECONDS));
}
private void processLeaderChange(String path) {
try {
LOGGER.info("Processing change notification for path: {}", path);
if (_pinotHelixResourceManager.isLeader()) {
if (path.equals(CONTROLLER_LEADER_CHANGE)) {
startThread();
}
} else {
LOGGER.info("Not the leader of this cluster, stopping Status Checker.");
}
} catch (Exception e) {
LOGGER.error("Caught exception {} while processing leader change for path {}", e, path);
}
}
public void setMetricsRegistry(ControllerMetrics metricsRegistry) {
_metricsRegistry = metricsRegistry;
}
void setStatusToDefault() {
// Fetch the list of tables
List<String> allTableNames = _pinotHelixResourceManager.getAllTables();
// Synchronization provided by Controller Gauge to make sure that only one thread updates the gauge
for (String tableName : allTableNames) {
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.NUMBER_OF_REPLICAS, 0);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_OF_REPLICAS, 0);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE, 0);
_metricsRegistry.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE, 0);
}
}
}