package com.alibaba.jstorm.daemon.nimbus.metric.refresh;

import com.alibaba.jstorm.daemon.nimbus.metric.CheckMetricEvent;
import com.alibaba.jstorm.utils.Pair;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.jstorm.client.ConfigExtension;
import com.alibaba.jstorm.cluster.Cluster;
import com.alibaba.jstorm.common.metric.MetricMeta;
import com.alibaba.jstorm.daemon.nimbus.NimbusUtils;
import com.alibaba.jstorm.daemon.nimbus.metric.MetricEvent;
import com.alibaba.jstorm.daemon.nimbus.metric.assignment.RemoveTopologyEvent;
import com.alibaba.jstorm.metric.JStormMetrics;
import com.alibaba.jstorm.metric.MetaType;
import com.alibaba.jstorm.metric.TimeTicker;
import com.alibaba.jstorm.metric.TopologyMetricContext;
import com.alibaba.jstorm.schedule.Assignment;
import com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot;
import com.google.common.collect.Sets;

/**
 * Sync meta <String, Long> from cache and remote
 *
 * @author Cody (weiyue.wy@alibaba-inc.com)
 */
public class RefreshEvent extends MetricEvent {
    private static final Logger LOG = LoggerFactory.getLogger(RefreshEvent.class);

    /**
     * we sync meta from remote when nimbus uptime < 600 sec;
     * in the future when nimbus can be dynamically updated,
     * we should automatically send a RefreshEvent on update to
     * trigger the sync
     */
    private static final int SYNC_REMOTE_META_TIME_SEC = 10 * 60;

    @Override
    public void run() {
        refreshTopologies();
    }

    /**
     * refresh metric settings of topologies & metric meta.
     * followers sync the full topology meta; a recently-started leader
     * (uptime below {@link #SYNC_REMOTE_META_TIME_SEC}) only syncs sys meta
     * from remote.
     */
    public void refreshTopologies() {
        TimeTicker ticker = new TimeTicker(TimeUnit.MILLISECONDS, true);
        try {
            doRefreshTopologies();
            LOG.debug("Refresh topologies, cost:{}", ticker.stopAndRestart());

            if (!context.getNimbusData().isLeader()) {
                syncTopologyMeta();
                LOG.debug("Sync topology meta, cost:{}", ticker.stop());
            } else if (context.getNimbusData().uptime() < SYNC_REMOTE_META_TIME_SEC) {
                syncSysMetaFromRemote();
            }
        } catch (Exception ex) {
            LOG.error("handleRefreshEvent error:", ex);
        }
    }

    /**
     * refresh metric settings of topologies and sync metric meta from local cache.
     * registers any missing sys-topology contexts, reconciles contexts with the
     * current cluster assignments, and queues removal events for topologies that
     * no longer have an assignment.
     */
    @SuppressWarnings("unchecked")
    private void doRefreshTopologies() {
        for (String topology : JStormMetrics.SYS_TOPOLOGIES) {
            if (!context.getTopologyMetricContexts().containsKey(topology)) {
                LOG.info("adding {} to metric context.", topology);
                Map conf = new HashMap();
                if (topology.equals(JStormMetrics.CLUSTER_METRIC_KEY)) {
                    // there's no need to consider sample rate when cluster metrics merge
                    conf.put(ConfigExtension.TOPOLOGY_METRIC_SAMPLE_RATE, 1.0);
                }
                Set<ResourceWorkerSlot> workerSlot = Sets.newHashSet(new ResourceWorkerSlot());
                TopologyMetricContext metricContext = new TopologyMetricContext(topology, workerSlot, conf);
                context.getTopologyMetricContexts().putIfAbsent(topology, metricContext);
                syncMetaFromCache(topology, context.getTopologyMetricContexts().get(topology));
                syncMetaFromRemote(topology, context.getTopologyMetricContexts().get(topology));
            }
        }

        Map<String, Assignment> assignMap;
        try {
            assignMap = Cluster.get_all_assignment(context.getStormClusterState(), null);
            for (Entry<String, Assignment> entry : assignMap.entrySet()) {
                String topologyId = entry.getKey();
                Assignment assignment = entry.getValue();
                TopologyMetricContext metricContext = context.getTopologyMetricContexts().get(topologyId);
                if (metricContext == null) {
                    metricContext = new TopologyMetricContext(assignment.getWorkers());
                    metricContext.setTaskNum(NimbusUtils.getTopologyTaskNum(assignment));
                    syncMetaFromCache(topologyId, metricContext);

                    LOG.info("adding {} to metric context.", topologyId);
                    context.getTopologyMetricContexts().put(topologyId, metricContext);
                } else {
                    boolean modify = false;
                    // compute once instead of twice (was called in both the check and the setter)
                    int taskNum = NimbusUtils.getTopologyTaskNum(assignment);
                    if (metricContext.getTaskNum() != taskNum) {
                        modify = true;
                        metricContext.setTaskNum(taskNum);
                    }
                    if (!assignment.getWorkers().equals(metricContext.getWorkerSet())) {
                        modify = true;
                        metricContext.setWorkerSet(assignment.getWorkers());
                    }
                    // we may need to sync meta when task num/workers change
                    metricContext.setSyncMeta(!modify);
                }
            }
        } catch (Exception e1) {
            // fix: include the exception — the cause was previously swallowed
            LOG.warn("Failed to get assignments", e1);
            return;
        }

        List<String> removing = new ArrayList<>();
        for (String topologyId : context.getTopologyMetricContexts().keySet()) {
            if (!JStormMetrics.SYS_TOPOLOGY_SET.contains(topologyId) && !assignMap.containsKey(topologyId)) {
                removing.add(topologyId);
            }
        }

        for (String topologyId : removing) {
            LOG.info("removing topology:{}", topologyId);
            RemoveTopologyEvent.pushEvent(topologyId);
        }
    }

    /**
     * sync topology metric meta from external storage like TDDL/OTS.
     * nimbus server will skip syncing, only followers do this
     */
    public void syncTopologyMeta() {
        // sys meta, use remote only
        syncSysMetaFromRemote();

        // normal topology meta, local + remote
        for (Entry<String, TopologyMetricContext> entry : context.getTopologyMetricContexts().entrySet()) {
            String topologyId = entry.getKey();
            TopologyMetricContext metricContext = entry.getValue();
            if (!JStormMetrics.SYS_TOPOLOGY_SET.contains(topologyId)) {
                try {
                    syncMetaFromCache(topologyId, metricContext);
                    syncMetaFromRemote(topologyId, metricContext);
                } catch (Exception e1) {
                    // fix: include the exception — the cause was previously swallowed
                    LOG.warn("failed to sync meta for topology:{}", topologyId, e1);
                }
            }
        }
    }

    /**
     * sync metric meta from rocks db into mem cache on startup.
     * no-op once the context is flagged as synced.
     */
    private void syncMetaFromCache(String topologyId, TopologyMetricContext tmContext) {
        if (!tmContext.syncMeta()) {
            Map<String, Long> meta = context.getMetricCache().getMeta(topologyId);
            if (meta != null) {
                tmContext.getMemMeta().putAll(meta);
            }
            tmContext.setSyncMeta(true);
        }
    }

    /**
     * sync sys topologies from remote because we want to keep all historic metric data
     * thus metric id cannot be changed.
     */
    private void syncSysMetaFromRemote() {
        for (String topology : JStormMetrics.SYS_TOPOLOGIES) {
            if (context.getTopologyMetricContexts().containsKey(topology)) {
                syncMetaFromRemote(topology, context.getTopologyMetricContexts().get(topology));
            }
        }
    }

    /**
     * pulls metric meta for all meta types from remote storage and lets it
     * overwrite the local in-memory meta. duplicate FQNs whose remote id
     * disagrees with the local id are queued for a double-check via
     * {@link CheckMetricEvent}.
     */
    private void syncMetaFromRemote(String topologyId, TopologyMetricContext tmContext) {
        try {
            int memSize = tmContext.getMemMeta().size();
            //Integer zkSize = (Integer) context.getStormClusterState().get_topology_metric(topologyId);

            Set<String> added = new HashSet<>();
            List<Pair<MetricMeta, Long>> pairsToCheck = new ArrayList<>();

            ConcurrentMap<String, Long> memMeta = tmContext.getMemMeta();
            for (MetaType metaType : MetaType.values()) {
                List<MetricMeta> metaList =
                        context.getMetricQueryClient().getMetricMeta(context.getClusterName(), topologyId, metaType);
                if (metaList != null) {
                    LOG.debug("get remote metric meta, topology:{}, metaType:{}, local mem:{}, remote:{}",
                            topologyId, metaType, memSize, metaList.size());
                    for (MetricMeta meta : metaList) {
                        String fqn = meta.getFQN();
                        if (added.contains(fqn)) {
                            Long existingId = memMeta.get(fqn);
                            // fix: compare numeric values, not (possibly boxed) references —
                            // `existingId != meta.getId()` is a reference comparison if getId()
                            // returns Long, which would miss duplicate metric ids
                            if (existingId != null && existingId.longValue() != meta.getId()) {
                                LOG.warn("duplicate remote metric meta:{}, will double-check...", fqn);
                                pairsToCheck.add(new Pair<>(meta, existingId));
                            }
                        } else {
                            // force remote to overwrite local meta
                            LOG.debug("overwrite local from remote:{}", fqn);
                            added.add(fqn);
                            memMeta.put(fqn, meta.getId());
                        }
                    }
                }
            }
            context.getMetricCache().putMeta(topologyId, memMeta);

            if (pairsToCheck.size() > 0) {
                CheckMetricEvent.pushEvent(topologyId, tmContext, pairsToCheck);
            }
        } catch (Exception ex) {
            LOG.error("failed to sync remote meta", ex);
        }
    }
}