/* * Copyright 2014 Alexey Plotnik * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.stem.domain; import com.google.common.base.Throwables; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.stem.ClusterManagerDaemon; import org.stem.MetaStoreInitializer; import org.stem.RestUtils; import org.stem.api.REST; import org.stem.api.request.ClusterConfiguration; import org.stem.api.request.MetaStoreConfiguration; import org.stem.coordination.*; import org.stem.domain.topology.DataMapping; import org.stem.domain.topology.Partitioner; import org.stem.domain.topology.TopologyChangesListener; import org.stem.domain.topology.TopologyEventListener; import org.stem.exceptions.StemException; import org.stem.exceptions.TopologyException; import org.stem.policies.AutoPlacementPolicy; import org.stem.streaming.StreamSession; import org.stem.utils.TopologyUtils; import java.util.Collection; import java.util.List; import java.util.UUID; import java.util.concurrent.atomic.AtomicReference; // TODO: Init zookeeper client when cluster has been started? public class Cluster { private static final Logger logger = LoggerFactory.getLogger(Cluster.class); private static final String CURRENT_MAPPING = ZookeeperPaths.CURRENT_MAPPING; private static final String PREVIOUS_MAPPING = ZookeeperPaths.PREVIOUS_MAPPING; public static enum State { UNINITIALIZED, INITIALIZING, INITIALIZED } private AtomicReference<State> state = new AtomicReference<>(State.UNINITIALIZED); public static final Cluster instance = new Cluster(); final Manager manager; Descriptor descriptor; private MetaStoreConfiguration metaStoreConfiguration; @Deprecated Topology topology; // TODO: get rid of entirely org.stem.domain.topology.Topology topology2; // TODO: load topology from Zookeeper Partitioner partitioner; private DataMapping mapping = DataMapping.EMPTY; private DataDistributionManager distributionManager; private Unauthorized freshNodesPool = new Unauthorized(this); private Cluster() { try { String zookeeperEndpoint = ClusterManagerDaemon.zookeeperEndpoint(); this.manager = new Manager(zookeeperEndpoint); this.topology2 = org.stem.domain.topology.Topology.Factory.create(this); } catch (ZooException e) { throw new StemException("Can't initialize Cluster.Manager instance", e); } } // TODO: 1. Handle the situation when storage already exists but new disk were added // TODO: 2. Handle the situation when storage is new but its disks are already attached to another storage // TODO: (maybe disk was moved) public void approve(UUID eventId, UUID nodeId) { ensureInitialized(); freshNodesPool.approveExisting(eventId); manager.createStateListener(nodeId); } public Event.Join approve(UUID nodeId, String datacenter, String rack) { ensureInitialized(); Event.Join result = freshNodesPool.approveNew(nodeId, datacenter, rack); manager.createStateListener(nodeId); return result; } public EventFuture tryJoinAsync(org.stem.domain.topology.Topology.StorageNode node) throws Exception { EventFuture future = EventManager.instance.createSubscription(Event.Type.JOIN); org.stem.domain.topology.Topology.StorageNode existing = topology().findStorageNode(node.getId()); // If node is in cluster auto approve it if (null != existing) { // TODO: check node status approve(future.eventId(), node.getId()); // If autApproval is turned on add and approve immediately } else if (configuration().isAutoApproval()) { logger.info("Auto approve node {}", node); freshNodesPool.add(node, future); AutoPlacementPolicy policy = freshNodesPool.getPlacementPolicy(); approve(node.getId(), policy.getDatacenterNode(node), policy.getRackForNode(node)); // If node is really new - add it to the queue for manual approval } else { logger.info("Add node {} to the queue for manual approval", node); freshNodesPool.add(node, future); } return future; } void addStorageNode(org.stem.domain.topology.Topology.StorageNode node, String dcName, String rackName) { if (null != topology2.findStorageNode(node.id)) throw new TopologyException(String.format("Node with id=%s already exist in cluster", node.id)); org.stem.domain.topology.Topology.Datacenter datacenter = topology2.findDatacenter(dcName); if (null == datacenter) { if (topology2.dataCenters().isEmpty()) { datacenter = new org.stem.domain.topology.Topology.Datacenter(dcName); topology2.addDatacenter(datacenter); } else throw new TopologyException(String.format("Datacenter '%s' can not be found", dcName)); } org.stem.domain.topology.Topology.Rack rack = topology2.findRack(datacenter, rackName); if (null == rack) { org.stem.domain.topology.Topology.Rack newRack = new org.stem.domain.topology.Topology.Rack(rackName); datacenter.addRack(newRack); rack = newRack; } rack.addStorageNode(node); // Recompute mappings computeMapping(); // TODO: move this occurrence to topology listener callback // TODO: I'm 100% sure there should be much more arbitrary checks and validations } public Unauthorized unauthorizedPool() { return freshNodesPool; } public static Cluster instance() { return instance; } /** * Initialize cluster by loading it's configuration from Zookeeper database * * @return boolean whether the cluster initialized */ public boolean load() { try { if (!manager.loadCluster()) { return false; // Already initialized } // TODO: manager.loadUnauthorized(); } catch (Exception e) { state.set(State.UNINITIALIZED); throw new StemException("Error while loading cluster configuration", e); } return true; } public void initialize(String name, int vBuckets, int rf, String partitioner, MetaStoreConfiguration meta, ClusterConfiguration configuration) { // TODO: validate(configuration); //this.configuration = configuration; // TODO: validate(meta); this.metaStoreConfiguration = meta; try { manager.ensureUninitialized(); // TODO: seal into the manager instance Descriptor desc = new Descriptor(name, vBuckets, rf, manager.endpoint, Partitioner.Type.byName(partitioner), this.metaStoreConfiguration.getContactPoints(), configuration); manager.newCluster(desc); save(); } catch (Exception e) { state.compareAndSet(State.INITIALIZING, State.UNINITIALIZED); throw new StemException(String.format("Error while initializing a new cluster: %s", e.getMessage()), e); } } public ClusterConfiguration configuration() { manager.ensureInitialized(); return descriptor().configuration; } public void save() { try { manager.save(); } catch (Exception e) { throw new StemException("Can not save cluster configuration", e); } } public void destroy() { ensureInitialized(); manager.zookeeper().close(); // TODO: commented to meake it easy to restart embedded cluster manager in tests this.descriptor = null; this.topology = null; this.topology2 = org.stem.domain.topology.Topology.Factory.create(this); partitioner = null; mapping = null; state.set(State.UNINITIALIZED); } public boolean initialized() { return state.get() == State.INITIALIZED; } public Cluster ensureInitialized() { manager.ensureInitialized(); return this; } public State state() { return state.get(); } public Descriptor descriptor() { manager.ensureInitialized(); return descriptor; } public org.stem.domain.topology.Topology topology() { return topology2; } private static StemException produceInitError(State state) { switch (state) { case INITIALIZING: return new StemException("Cluster is initializing"); case INITIALIZED: return new StemException("Cluster has already been initialized"); default: return new StemException("Unknown race error while initializing cluster"); // We should not get here } } public Collection<org.stem.domain.topology.Topology.StorageNode> getStorageNodes() { return topology2.getStorageNodes(); } public long getUsedBytes() { long sum = 0; for (org.stem.domain.topology.Topology.StorageNode node : getStorageNodes()) { sum += node.getUsedBytes(); } return sum; } public long getTotalBytes() { long sum = 0; for (org.stem.domain.topology.Topology.StorageNode node : getStorageNodes()) { sum += node.getTotalBytes(); } return sum; } public synchronized void computeMapping() // TODO: synchronized is BAD { try { manager.recalculateDataMapping(); } catch (Exception e) { if (topology().getStorageNodes().size() < descriptor().getRf()) { logger.debug("{} nodes is not applicable for RF={}. Mapping can not be calculated", topology().getStorageNodes().size(), descriptor().getRf()); } else { logger.error("Compute mapping failed"); throw new StemException("Compute mapping failed", e); } } } // Update only numbers, not entities public void updateStat(REST.StorageNode stat) { org.stem.domain.topology.Topology.StorageNode node = topology2.findStorageNode(stat.getId()); if (null != node) { for (REST.Disk diskStat : stat.getDisks()) { org.stem.domain.topology.Topology.Disk disk = topology2.findDisk(diskStat.getId()); if (null != disk) { disk.setUsedBytes(diskStat.getUsed()); disk.setTotalBytes(diskStat.getUsed()); } } } // TODO: Check disks existence } public TopologyEventListener topologyAutoSaver() { return manager.topologyPersistingListener; } /** * */ class Manager { final String endpoint; private ZookeeperClient client; final TopologyEventListener topologyPersistingListener; public Manager(String endpoint) throws ZooException { this.endpoint = endpoint; client = ZookeeperFactoryCached.newClient(endpoint); topologyPersistingListener = new TopologyChangesListener() { @Override public void onTopologyUpdated(org.stem.domain.topology.Topology.Node node) { try { saveTopology(); // TODO: Should be invoked one when node added } catch (Exception e) { logger.error("Failed to save topology to Zookeeper"); } } }; } void saveTopology() throws Exception { saveTopology(topology2); logger.info("Topology saved"); } private void saveTopology(org.stem.domain.topology.Topology topology) throws Exception { client.saveNode(ZookeeperPaths.CLUSTER_TOPOLOGY_PATH, RestUtils.packTopology(topology)); } synchronized void newCluster(Descriptor newDescriptor) throws Exception { if (!state.compareAndSet(State.UNINITIALIZED, State.INITIALIZING)) { // We race with another initialization, it's ok throw produceInitError(state.get()); } ensureInitializing(); tryStartZookeeperClient(); logger.info("Initialize new cluster"); logClusterConfiguration(newDescriptor); validate(newDescriptor); descriptor = newDescriptor; topology = new Topology(descriptor.name, descriptor.rf); partitioner = descriptor.partitioner.builder.build(); distributionManager = new DataDistributionManager(Cluster.this, partitioner); mapping = distributionManager.getCurrentMappings(); initZookeeperPaths(); tryInitializeMetaStore(); logger.info("Cluster initialized successfully"); state.set(State.INITIALIZED); startListenForStats(); } private void tryInitializeMetaStore() { logger.info("Initialize meta store."); MetaStoreInitializer configurator = new MetaStoreInitializer(metaStoreConfiguration); configurator.createSchema(); configurator.stop(); logger.info("Meta store schema created"); } synchronized boolean loadCluster() throws Exception { if (!state.compareAndSet(State.UNINITIALIZED, State.INITIALIZING)) { // We race with another initialization, it's ok throw produceInitError(state.get()); } ensureInitializing(); tryStartZookeeperClient(); Descriptor persisted = readDescriptor(); if (null == persisted) { logger.info("No cluster descriptor found. Cluster is switched to uninitialized state"); state.set(State.UNINITIALIZED); return false; } logger.info("Cluster descriptor loaded"); validate(persisted); logClusterConfiguration(persisted); descriptor = persisted; topology = new Topology(descriptor.name, descriptor.rf); partitioner = descriptor.partitioner.builder.build(); distributionManager = new DataDistributionManager(Cluster.this, partitioner, loadMapping(ZookeeperPaths.currentMappingPath()), loadMapping(ZookeeperPaths.previousMappingPath())); mapping = distributionManager.getCurrentMappings(); org.stem.domain.topology.Topology persistedTopo = readTopology(); if (null != persistedTopo) { register(persistedTopo); } initZookeeperPaths(); state.set(State.INITIALIZED); startListenForStats(); return true; } private DataMapping loadMapping(String path) throws Exception { REST.Mapping raw = zookeeper().readZNodeData(path, REST.Mapping.class, REST.Mapping.CODEC); if (null == raw) return DataMapping.EMPTY; return RestUtils.extractMapping(raw); } private void saveMappings() throws Exception { saveMapping(CURRENT_MAPPING, mapping); saveMapping(CURRENT_MAPPING, distributionManager.getPreviousMapping()); } private void saveMapping(String kind, DataMapping entity) throws Exception { // TODO: string kind to enum type REST.Mapping raw = RestUtils.packMapping(entity); raw.setName(kind); zookeeper().saveNode(ZookeeperPaths.CLUSTER_TOPOLOGY_PATH, raw); } private void register(org.stem.domain.topology.Topology persistedTopo) { topology2 = persistedTopo; topology2.setOwner(Cluster.this); } // TODO: persist and restore nodes and disks states (Topology.NodeState, Topology.DiskState enums) // TODO: turn off listeners until we load topology ??? private org.stem.domain.topology.Topology readTopology() throws Exception { REST.Topology topologyTransient = client.readZNodeData(ZookeeperPaths.topologyPath(), REST.Topology.class); if (null == topologyTransient) return null; org.stem.domain.topology.Topology result = org.stem.domain.topology.Topology.Factory.create(); for (org.stem.domain.topology.Topology.Datacenter datacenter : RestUtils.extractDataCenters(topologyTransient)) { result.addDatacenter(datacenter); } return result; // return standalone topology, need to register it on cluster } private void startListenForStats() throws Exception { client.listenForChildren(ZookeeperPaths.CLUSTER, new StorageStatListener()); } synchronized public void save() throws Exception { ensureInitialized(); tryStartZookeeperClient(); client.createNode(ZookeeperPaths.CLUSTER, descriptor); saveTopology(); saveMappings(); saveTopologySnapshot(); } @Deprecated private void computeMapping() throws Exception { ensureInitialized(); topology.computeMappings(descriptor.vBuckets); TopoMapping topoMap = TopologyUtils.buildTopoMap(topology); client.updateNode(ZookeeperPaths.MAPPING, topoMap); List<StreamSession> sessions = topology.computeStreamingSessions(); // TODO: Anything below is not a part og this method, it should be passed to somewhere like SessionManager for (StreamSession s : sessions) { client.createNodeIfNotExists(ZookeeperPaths.OUT_SESSIONS, s); } } private void recalculateDataMapping() throws Exception { ensureInitialized(); DataMapping current = distributionManager.computeMappingNonMutable(); mapping = current; saveMapping(CURRENT_MAPPING, current); DataMapping previous = distributionManager.getPreviousMapping(); if (null != previous) { saveMapping(PREVIOUS_MAPPING, previous); } //DataMapping.Difference difference = distributionManager.computeMappingDifference(); saveTopologySnapshot(); // TODO: check the difference make sense (is there actual delta between mapping); // TODO: compute streaming sessions !!!!!!!!!!!!!!!!!!! // TODO: distributionManager.computeStreamingSessions() } private void saveTopologySnapshot() throws Exception { REST.TopologySnapshot snapshot = RestUtils.packTopologySnapshot(topology2, mapping); zookeeper().saveNode(ZookeeperPaths.CLUSTER_TOPOLOGY_PATH, snapshot); } private void ensureUninitialized() { if (Cluster.this.state.get() != State.UNINITIALIZED) throw new StemException("Cluster has already been initialized"); } private void ensureInitializing() { if (Cluster.this.state.get() != State.INITIALIZING) throw new StemException("Cluster has already been initialized"); } private void ensureInitialized() { if (Cluster.this.state.get() != State.INITIALIZED) throw new StemException("Cluster has not been initialized yet"); } private Descriptor readDescriptor() throws Exception { Descriptor descriptor = client.readZNodeData(ZookeeperPaths.CLUSTER_DESCRIPTOR_PATH, Descriptor.class); return descriptor; } private void tryStartZookeeperClient() throws ZooException { if (null == client || !client.isRunning()) { client = ZookeeperFactoryCached.newClient(endpoint); } } private void initZookeeperPaths() throws Exception { //client.createIfNotExists(ZooConstants.MAPPING); client.createNodeIfNotExists(ZookeeperPaths.MAPPING, new TopoMapping()); client.createIfNotExists(ZookeeperPaths.OUT_SESSIONS); } private void validate(Descriptor desc) { validate(desc.name, desc.vBuckets, desc.rf, desc.zookeeperEndpoint); } private void validate(String name, int vBuckets, int rf, String zookeeperEndpoint) { if (null == name) throw new StemException("Cluster name is null"); // TODO: use ValidationException if (name.length() > 100) throw new StemException("Cluster name must be less than 100 symbols"); if (vBuckets <= 0) throw new StemException("Number of virtual buckets must be greater than zero"); if (rf <= 0) throw new StemException("Replication factor must be greater than zero"); if (null == zookeeperEndpoint || zookeeperEndpoint.isEmpty()) throw new StemException("Replication factor must be greater than zero"); } public ZookeeperClient zookeeper() { return client; } private void createStateListener(UUID nodeId) { try { org.stem.domain.topology.Topology.StorageNode node = topology2.findStorageNode(nodeId); if (null != node) client.createNodeIfNotExists(ZookeeperPaths.STAT, RestUtils.packNode(node)); } catch (Exception e) { Throwables.propagate(e); } } } private void logClusterConfiguration(Descriptor d) { logger.info("Cluster name: {}", d.getName()); logger.info("Replication factor: {}", d.getRf()); logger.info("Partitioner: {}, number of partitions: {}", d.getPartitioner(), d.getvBuckets()); logger.info("Zookeeper endpoint: {}", d.getZookeeperEndpoint()); logger.info("Meta store cluster address: {}", d.getMetaStoreContactPoints()); logger.info("New nodes auto approval: {}", d.getConfiguration().isAutoApproval()); } /** * */ public static class Descriptor extends ZNodeAbstract { String name; int vBuckets; // TODO: rename to partitions int rf; Partitioner.Type partitioner = Partitioner.Type.CRUSH; String zookeeperEndpoint; String[] metaStoreContactPoints; private ClusterConfiguration configuration; public Descriptor() { } public String getName() { return name; } public int getvBuckets() { return vBuckets; } public int getRf() { return rf; } public Partitioner.Type getPartitioner() { return partitioner; } public String getZookeeperEndpoint() { return zookeeperEndpoint; } public String[] getMetaStoreContactPoints() { return metaStoreContactPoints; } public ClusterConfiguration getConfiguration() { return configuration; } public Descriptor(String name, int vBuckets, int rf, String zookeeperEndpoint, Partitioner.Type partitioner, String[] contactPoints, ClusterConfiguration configuration) { this.name = name; this.vBuckets = vBuckets; this.rf = rf; this.zookeeperEndpoint = zookeeperEndpoint; this.partitioner = partitioner; this.metaStoreContactPoints = contactPoints; this.configuration = configuration; } @Override public String name() { return ZookeeperPaths.CLUSTER_DESCRIPTOR_NAME; } } @Override public String toString() { return "Cluster \"" + descriptor.name + "\""; } }