package com.linkedin.databus.cluster;
/*
*
* Copyright 2013 LinkedIn Corp. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
import java.net.InetAddress;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Vector;
import org.apache.helix.ExternalViewChangeListener;
import org.apache.helix.HelixException;
import org.apache.helix.HelixManager;
import org.apache.helix.HelixManagerFactory;
import org.apache.helix.InstanceType;
import org.apache.helix.LiveInstanceChangeListener;
import org.apache.helix.NotificationContext;
import org.apache.helix.controller.HelixControllerMain;
import org.apache.helix.manager.zk.ZKHelixAdmin;
import org.apache.helix.manager.zk.ZNRecordSerializer;
import org.apache.helix.manager.zk.ZkClient;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.IdealState;
import org.apache.helix.model.IdealState.IdealStateModeProperty;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.model.LiveInstance;
import org.apache.helix.model.StateModelDefinition;
import org.apache.helix.participant.StateMachineEngine;
import org.apache.helix.tools.StateModelConfigGenerator;
import org.apache.log4j.Logger;
import com.linkedin.databus.client.registration.ClusterRegistrationStaticConfig;
/**
 * Wrapper around an Apache Helix cluster used by Databus. On construction it
 * creates (or joins) a Helix cluster containing a single auto-rebalanced
 * "OnlineOffline" resource, and it exposes:
 * <ul>
 *   <li>{@link DatabusClusterMember} — a participant that can join/leave the cluster,</li>
 *   <li>{@link DatabusHelixWatcher} — a spectator that tracks live instances and the
 *       partition-to-owner mapping, and pauses/resumes the Helix controller based on
 *       whether the configured quorum of live instances is met,</li>
 *   <li>{@link DatabusClusterDataNotifier} callbacks for instance/partition changes.</li>
 * </ul>
 */
public class DatabusCluster
{
  public static final String MODULE = DatabusCluster.class.getName();
  public static final Logger LOG = Logger.getLogger(MODULE);

  public static final String DEFAULT_STATE_MODEL = "OnlineOffline";
  public static final String DEFAULT_RESOURCE_NAME = "default-resource";
  /** Default time in ms that we wait for the cluster to be set up. */
  public static final int DEFAULT_CLUSTER_CREATE_WAIT_MS = 1000;
  /** System property consulted by the Helix manager for its ZK session timeout. */
  private static final String HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY = "zk.session.timeout";

  protected final ZkClient _zkClient;
  protected final ZKHelixAdmin _admin;
  protected final String _clusterName;
  protected final String _zkAddr;
  protected final int _quorum;
  protected final int _numPartitions;
  protected final HashSet<DatabusClusterDataNotifier> _dataNotifiers;
  // populated by external watcher
  protected final DatabusHelixWatcher _watcher;
  private final int _zkConnectionTimeoutMs;
  private final int _zkSessionTimeoutMs;

  /**
   * Creates (or joins) the Helix cluster described by {@code config} and starts
   * tracking it with a {@link DatabusHelixWatcher}.
   *
   * @param config static configuration (ZK address, cluster name, quorum,
   *               number of partitions, ZK timeouts)
   * @throws DatabusClusterException if the cluster already exists with a
   *         different partition count, or could not be accessed at all
   */
  public DatabusCluster(ClusterRegistrationStaticConfig config) throws Exception
  {
    _zkAddr = config.getZkAddr();
    _clusterName = config.getClusterName();
    _quorum = (int) (config.getQuorum());
    _numPartitions = (int) (config.getNumPartitions());
    _zkSessionTimeoutMs = config.getZkSessionTimeoutMs();
    _zkConnectionTimeoutMs = config.getZkConnectionTimeoutMs();
    // Helix managers read the session timeout from a system property, not from
    // our config object, so propagate it before any manager is created.
    updateHelixManagerZkSessionTimeout(_zkSessionTimeoutMs);
    _zkClient = new ZkClient(_zkAddr, _zkSessionTimeoutMs, _zkConnectionTimeoutMs, new ZNRecordSerializer());
    _admin = new ZKHelixAdmin(_zkClient);
    _dataNotifiers = new HashSet<DatabusClusterDataNotifier>(5);
    // attempt to create a cluster
    int part = create(_admin, _zkClient, _clusterName, _numPartitions);
    // at this stage a cluster and resources should have been created, either by
    // this instance or someone else
    if (part >= 0)
    {
      if (_numPartitions != part)
      {
        String msg = "Cannot create DatabusCluster! Cluster exists with num partitions="
            + part
            + ". Tried to join with "
            + _numPartitions
            + " partitions";
        throw new DatabusClusterException(msg);
      }
    }
    else
    {
      throw new DatabusClusterException("Cluster " + _clusterName + " could not be accessed. Num partitions returned -1");
    }
    // initialize watcher after creating a cluster
    _watcher = new DatabusHelixWatcher();
  }

  /**
   * Updates the zk.session.timeout system property for ZK connections made by
   * the Helix manager. An existing value is kept when it is already at least
   * {@code timeoutMs} (or overwritten when unparseable).
   */
  private static void updateHelixManagerZkSessionTimeout(int timeoutMs)
  {
    String timeoutStr = System.getProperty(HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY);
    if (null != timeoutStr)
    {
      try
      {
        int envTimeoutMs = Integer.parseInt(timeoutStr);
        if (envTimeoutMs >= timeoutMs)
        {
          // the existing timeout is larger than ours, so keep as it is
          return;
        }
      }
      catch (NumberFormatException e)
      {
        LOG.warn("invalid existing value for " + HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY + ": " + timeoutStr);
      }
    }
    System.setProperty(HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY, Integer.toString(timeoutMs));
  }

  /**
   * @return number of partitions in this resource; called mainly to do sanity
   *         check - if requested number of partitions is same as existing
   *         partition. Returns 0 if the resource cannot (yet) be found, -1 if
   *         {@code admin} is null. To resize, use a new cluster name.
   */
  static protected int getNumPartitionsInResource(ZKHelixAdmin admin, String clusterName, String resourceName)
  {
    if (admin != null)
    {
      try
      {
        IdealState idealState = admin.getResourceIdealState(clusterName, resourceName);
        if (idealState != null)
        {
          return idealState.getNumPartitions();
        }
        else
        {
          return 0;
        }
      }
      catch (Exception e)
      {
        LOG.warn("Resource " + resourceName + " not found in " + clusterName);
        return 0;
      }
    }
    return -1;
  }

  /**
   * create a cluster with a partitioned resource .
   * If successful, a non-zero number indicating the number of partitions created in the cluster is returned ;
   * This is meant to be atomic. If more than one thread/instance attempts to create the same cluster with diff number of partitions
   * only one of them will win . If a cluster exists, the number returned however will be the same in both those instances.
   * If the cluster could not be reached 0 is returned (retry possible) , and if there are other errors -1 is returned (retry not possible)
   */
  static public int create(ZKHelixAdmin admin, ZkClient zkClient, String clusterName,
                           int numPartitions)
  {
    boolean clusterAdded = true;
    // add cluster
    try
    {
      /**
       * TODO : HACK !! : Copying this logic from OLD Helix library to mimic similar old "behavior".
       *
       * Please see DDSDBUS-2579/HELIX-137
       * The helix addCluster() ( in 0.6.2.3) API has a new problem where callers could not differentiate
       * between the case when new cluster is created and the case where it was created by some other client. This was needed
       * so that the follow-up steps of adding state-model (non-idempotent operation) can be done only by the client creating the cluster.
       * Both old (0.6.1.3) and new Helix (0.6.2.3 ) library has the following issue:
       * (a) "No Atomicity in the face of the ZK client disconnects which results in cluster only partly
       * initialized and unusable. This is noticed in PCL/PCS environment"
       *
       * In order to workaround the backwards incompatibility issue between the 2 helix versions, we are reproducing part
       * of the old addCluster() implementation below to get the same behavior as that of using 0.6.1.3 . The problem referred
       * as (a) still exists.
       *
       */
      if (zkClient.exists("/" + clusterName))
      {
        throw new Exception("Cluster already exists !!");
      }
      clusterAdded = admin.addCluster(clusterName, false);
      if (!clusterAdded)
      {
        LOG.error("Problem creating cluster (" + clusterName + ")");
      }
    }
    catch (Exception e)
    {
      LOG.warn("Warn! Cluster might already exist! " + clusterName + " Exception=" + e.getMessage());
      clusterAdded = false;
    }
    if (clusterAdded)
    {
      // This instance won the race to create the cluster; it is responsible for
      // adding the state model definition and the default resource.
      try
      {
        admin.addStateModelDef(
            clusterName,
            DEFAULT_STATE_MODEL,
            new StateModelDefinition(StateModelConfigGenerator
                .generateConfigForOnlineOffline()));
        admin.addResource(clusterName, DEFAULT_RESOURCE_NAME,
            numPartitions, DEFAULT_STATE_MODEL,
            IdealStateModeProperty.AUTO_REBALANCE.toString());
        admin.rebalance(clusterName, DEFAULT_RESOURCE_NAME, 1);
      }
      catch (Exception e)
      {
        LOG.warn("Resource addition incomplete. May have been completed by another instance: " + e.getMessage());
        clusterAdded = false;
      }
    }
    // Ensure that cluster is setup fully: poll until the resource shows up or
    // DEFAULT_CLUSTER_CREATE_WAIT_MS elapses (another instance may still be
    // in the middle of creating it).
    int part = getNumPartitionsInResource(admin, clusterName, DEFAULT_RESOURCE_NAME);
    if (part == 0)
    {
      long startTimeMs = System.currentTimeMillis();
      try
      {
        do
        {
          Thread.sleep(100);
          part = getNumPartitionsInResource(admin, clusterName, DEFAULT_RESOURCE_NAME);
        }
        while (part == 0 && ((System.currentTimeMillis() - startTimeMs) < DEFAULT_CLUSTER_CREATE_WAIT_MS));
      }
      catch (InterruptedException e)
      {
        // FIX: restore the interrupt status so callers can observe the interrupt
        Thread.currentThread().interrupt();
        LOG.warn("Cluster create wait interrupted for cluster=" + clusterName + " exception= " + e.getMessage());
      }
    }
    return part;
  }

  /** Starts the spectator watcher (connects it to ZK and registers listeners). */
  public void start()
  {
    if (_watcher != null)
    {
      _watcher.start();
    }
  }

  /** Disconnects the watcher, clears notifiers and closes the ZK client. */
  public void shutdown()
  {
    if (_watcher != null)
    {
      _watcher.stop();
    }
    if (_dataNotifiers != null)
    {
      _dataNotifiers.clear();
    }
    if (_zkClient != null)
    {
      _zkClient.close();
    }
  }

  /** Adds a member with no {@link DatabusClusterNotifier} callback. */
  public DatabusClusterMember addMember(String id)
  {
    return addMember(id, null);
  }

  /**
   * Registers an instance config for {@code id} (dropping any pre-existing one)
   * and returns a {@link DatabusClusterMember} that can join the cluster.
   *
   * @return the member, or null if registration failed
   */
  public DatabusClusterMember addMember(String id,
                                        DatabusClusterNotifier notifier)
  {
    try
    {
      if (_admin != null)
      {
        InstanceConfig config = null;
        try
        {
          config = _admin.getInstanceConfig(_clusterName, id);
        }
        catch (HelixException e)
        {
          // the instance doesn't exist , so adding a new config
        }
        if (config != null)
        {
          LOG.warn("Member id already exists! Overwriting instance for id="
              + id);
          _admin.dropInstance(_clusterName, config);
          config = null;
        }
        config = new InstanceConfig(id);
        config.setHostName(InetAddress.getLocalHost()
            .getCanonicalHostName());
        config.setInstanceEnabled(true);
        _admin.addInstance(_clusterName, config);
        return new DatabusClusterMember(id, notifier);
      }
    }
    catch (Exception e)
    {
      // FIX: pass the throwable so the stack trace is logged
      LOG.error("Error creating databus cluster member " + id, e);
    }
    return null;
  }

  /**
   * Add a cluster data notifier to the cluster for notifications on cluster
   * metadata
   */
  synchronized public void addDataNotifier(DatabusClusterDataNotifier notifier)
  {
    if (notifier != null)
    {
      _dataNotifiers.add(notifier);
    }
    else
    {
      LOG.warn("Add failed. Attempting to add null DatabusClusterDataNotifier!");
    }
  }

  /** Removes a previously registered data notifier (null is ignored). */
  synchronized public void removeDataNotifier(
      DatabusClusterDataNotifier notifier)
  {
    if (notifier != null)
    {
      _dataNotifiers.remove(notifier);
    }
  }

  /** @return the configured number of partitions for the default resource */
  public int getNumPartitions()
  {
    return _numPartitions;
  }

  /** @return number of live instances as last seen by the watcher (-1 before any update) */
  public int getNumActiveMembers()
  {
    return _watcher.getNumActiveInstances();
  }

  /** @return number of partitions currently in ONLINE state */
  public int getNumActivePartitions()
  {
    return _watcher.getNumActivePartitions();
  }

  /** @return snapshot of partition number to owner-instance-name mapping */
  public HashMap<Integer, String> getActivePartitions()
  {
    return _watcher.getPartitions();
  }

  /** @return the instance that owns {@code partition}, or null if unassigned */
  public String getPartitionOwner(int partition)
  {
    return _watcher.getOwnerId(partition);
  }

  public String getClusterName()
  {
    return _clusterName;
  }

  /** @return minimum number of live instances required before Helix assignment runs */
  public int getQuorum()
  {
    return _quorum;
  }

  /**
   * A cluster member who has the ability to join and leave the cluster
   **/
  public class DatabusClusterMember
  {
    private final HelixManager _manager;
    private final String _id;
    private DatabusClusterNotifier _notifier = null;

    DatabusClusterMember(String id, DatabusClusterNotifier notifier)
        throws Exception
    {
      _id = id;
      _manager = HelixManagerFactory.getZKHelixManager(_clusterName, _id,
          InstanceType.PARTICIPANT, _zkAddr);
      _notifier = notifier;
      registerNotifier();
    }

    /** Hooks the notifier into Helix state transitions for the OnlineOffline model. */
    void registerNotifier()
    {
      StateMachineEngine stateMach = _manager.getStateMachineEngine();
      DatabusClusterNotifierFactory modelFactory = new DatabusClusterNotifierFactory(
          _notifier);
      stateMach.registerStateModelFactory(DEFAULT_STATE_MODEL,
          modelFactory);
    }

    /**
     * Connects this participant to the cluster.
     *
     * @return true if connected (or already connected), false on failure
     */
    public boolean join()
    {
      if (_manager != null)
      {
        if (!_manager.isConnected())
        {
          try
          {
            _manager.connect();
            return true;
          }
          catch (Exception e)
          {
            // FIX: pass the throwable so the stack trace is logged
            LOG.error("Member " + _id + " could not connect! ", e);
          }
        }
        else
        {
          LOG.warn("Member " + _id + " cannot join. Already joined! ");
          return true;
        }
      }
      return false;
    }

    /**
     * Disconnects from the cluster and drops this member's instance config.
     *
     * @return always true; failures are logged
     */
    public boolean leave()
    {
      if (_manager != null)
      {
        try
        {
          _manager.disconnect();
        }
        catch (Exception e)
        {
          // FIX: pass the throwable so the stack trace is logged
          LOG.error("Member " + _id + " could not disconnect! ", e);
        }
      }
      // ensure that whitelisted instance configs go away
      if (_admin != null)
      {
        try
        {
          _admin.dropInstance(_clusterName,
              _admin.getInstanceConfig(_clusterName, _id));
        }
        catch (HelixException e)
        {
          LOG.warn("Drop instance failed for id= " + _id, e);
        }
      }
      return true;
    }

    /** Used for unit-testing **/
    protected DatabusClusterMember()
    {
      _manager = null;
      _id = null;
    }
  }

  @SuppressWarnings("serial")
  static public class DatabusClusterException extends Exception
  {
    public DatabusClusterException(String msg)
    {
      super(msg);
    }
  }

  /**
   * Helix spectator that tracks live instances and the external view, notifies
   * registered {@link DatabusClusterDataNotifier}s, and starts/pauses/resumes a
   * standalone Helix controller depending on whether the quorum is met.
   */
  private class DatabusHelixWatcher implements LiveInstanceChangeListener,
      ExternalViewChangeListener
  {
    final private HelixManager _manager;
    private int _numActiveInstances = -1;
    final private Random _random = new Random(System.currentTimeMillis());
    // partition number -> owning instance name, for ONLINE partitions only
    final private HashMap<Integer, String> _partitionMap;
    // State to help control helix assignment - enable/disable cluster
    private HelixManager _helixManager = null;
    // has Cluster been paused?
    private boolean _paused = false;
    private final int _id;

    public DatabusHelixWatcher() throws Exception
    {
      // random id keeps concurrently-created watchers from colliding in ZK
      _id = _random.nextInt();
      _manager = HelixManagerFactory.getZKHelixManager(_clusterName,
          "watcher_" + _clusterName + "_" + _id,
          InstanceType.SPECTATOR, _zkAddr);
      _partitionMap = new HashMap<Integer, String>(_numPartitions);
    }

    /** Connects the spectator and registers for live-instance and external-view changes. */
    public void start()
    {
      if (_manager != null)
      {
        try
        {
          if (!_manager.isConnected())
          {
            _manager.connect();
            _manager.addLiveInstanceChangeListener(this);
            _manager.addExternalViewChangeListener(this);
          }
        }
        catch (Exception e)
        {
          // FIX: pass the throwable so the stack trace is logged
          LOG.error("Cannot start HelixWatcher! ", e);
        }
      }
    }

    /** Disconnects the spectator and shuts down the embedded controller, if any. */
    public void stop()
    {
      if (_manager != null)
      {
        _manager.disconnect();
      }
      stopHelixController();
    }

    @Override
    public synchronized void onLiveInstanceChange(
        List<LiveInstance> liveInstances,
        NotificationContext changeContext)
    {
      _numActiveInstances = liveInstances.size();
      boolean quorumReached = (_numActiveInstances >= _quorum);
      if (quorumReached)
      {
        if (_helixManager == null)
        {
          LOG.warn("Quorum Reached! numNodes=" + _numActiveInstances
              + " quorum=" + _quorum);
          // controller needs to be started
          startHelixController();
        }
        else if (_paused)
        {
          resumeHelixController();
          _paused = false;
        }
      }
      else
      {
        LOG.warn("Number of nodes inadequate=" + _numActiveInstances
            + " Need at least:" + _quorum);
        if (_helixManager != null && !_paused)
        {
          // controller has started; but pauseCluster
          pauseHelixController();
          _paused = true;
        }
      }
      // perform user-specified callback
      if (!_dataNotifiers.isEmpty())
      {
        Vector<String> nodeList = new Vector<String>(
            liveInstances.size());
        for (LiveInstance i : liveInstances)
        {
          nodeList.add(i.getInstanceName());
        }
        for (DatabusClusterDataNotifier notifier : _dataNotifiers)
        {
          notifier.onInstanceChange(nodeList);
        }
      }
    }

    /**
     * Extracts the numeric partition id from a Helix partition name of the form
     * "resource_N". Returns -1 when the name has no "_" separator or the suffix
     * is not a valid integer.
     */
    private Integer getPartition(String partition)
    {
      String[] ps = partition.split("_");
      if (ps.length >= 2)
      {
        try
        {
          return Integer.parseInt(ps[ps.length - 1]);
        }
        catch (NumberFormatException e)
        {
          // FIX: a malformed suffix used to throw out of the Helix callback;
          // fall back to the method's existing -1 "unknown" convention instead
          LOG.warn("Cannot parse partition number from partition name: " + partition);
          return -1;
        }
      }
      return -1;
    }

    @Override
    public synchronized void onExternalViewChange(
        List<ExternalView> externalViewList,
        NotificationContext changeContext)
    {
      // rebuild the ONLINE partition ownership map from scratch
      _partitionMap.clear();
      for (ExternalView v : externalViewList)
      {
        if (v.getResourceName().equals(DEFAULT_RESOURCE_NAME))
        {
          for (String k : v.getPartitionSet())
          {
            Map<String, String> map = v.getStateMap(k);
            if (map != null)
            {
              for (Map.Entry<String, String> mkPair : map
                  .entrySet())
              {
                String value = mkPair.getValue();
                if (value != null)
                {
                  Integer partition = getPartition(k);
                  if (value.equals("ONLINE"))
                  {
                    _partitionMap.put(partition,
                        mkPair.getKey());
                  }
                }
              }
            }
          }
        }
      }
      // external call
      if (!_dataNotifiers.isEmpty())
      {
        HashMap<Integer, String> pmap = getPartitions();
        for (DatabusClusterDataNotifier notifier : _dataNotifiers)
        {
          notifier.onPartitionMappingChange(pmap);
        }
      }
    }

    synchronized public int getNumActiveInstances()
    {
      return _numActiveInstances;
    }

    synchronized public int getNumActivePartitions()
    {
      return _partitionMap.size();
    }

    synchronized public String getOwnerId(int numPartition)
    {
      return _partitionMap.get(numPartition);
    }

    /** @return a defensive copy of the current partition-to-owner map */
    synchronized public HashMap<Integer, String> getPartitions()
    {
      HashMap<Integer, String> map = new HashMap<Integer, String>(
          _partitionMap.size());
      map.putAll(_partitionMap);
      return map;
    }

    /** methods to control helix's assignment of partitions **/
    void stopHelixController()
    {
      if (_helixManager != null)
      {
        LOG.warn("Shutting down cluster : "
            + _helixManager.getClusterName() + " instance:"
            + _helixManager.getInstanceName());
        _helixManager.disconnect();
      }
    }

    void pauseHelixController()
    {
      if (_admin != null)
      {
        LOG.warn("Pausing cluster : " + _clusterName);
        _admin.enableCluster(_clusterName, false);
      }
    }

    void resumeHelixController()
    {
      if (_admin != null)
      {
        LOG.warn("Resuming cluster : " + _clusterName);
        _admin.enableCluster(_clusterName, true);
      }
    }

    void startHelixController()
    {
      try
      {
        String controllerId = "controller_" + _id;
        LOG.info("Starting cluster controller for cluster="
            + _clusterName + " with id = " + controllerId);
        _helixManager = HelixControllerMain.startHelixController(
            _zkAddr, _clusterName, controllerId,
            HelixControllerMain.STANDALONE);
        if (_admin != null)
        {
          _admin.enableCluster(_helixManager.getClusterName(), true);
        }
      }
      catch (Exception e)
      {
        // FIX: pass the throwable so the stack trace is logged
        LOG.error("Cannot start cluster controller for cluster="
            + _clusterName, e);
      }
    }
  }

  /**
   * Used only for unit-testing
   */
  protected DatabusCluster()
  {
    _admin = null;
    _zkAddr = null;
    _watcher = null;
    _zkClient = null;
    _clusterName = null;
    _quorum = 0;
    _numPartitions = 0;
    _dataNotifiers = null;
    _zkConnectionTimeoutMs = ZkClient.DEFAULT_CONNECTION_TIMEOUT;
    _zkSessionTimeoutMs = ZkClient.DEFAULT_SESSION_TIMEOUT;
  }

  @Override
  public String toString() {
    return "DatabusCluster [_zkClient=" + _zkClient + ", _admin=" + _admin
        + ", _clusterName=" + _clusterName + ", _zkAddr=" + _zkAddr
        + ", _quorum=" + _quorum + ", _numPartitions=" + _numPartitions
        + ", _dataNotifiers=" + _dataNotifiers + ", _watcher="
        + _watcher + ", _zkConnectionTimeoutMs="
        + _zkConnectionTimeoutMs + ", _zkSessionTimeoutMs="
        + _zkSessionTimeoutMs + "]";
  }
}