/** * Copyright 2014 Lockheed Martin Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package streamflow.engine; import backtype.storm.LocalCluster; import backtype.storm.generated.KillOptions; import backtype.storm.generated.Nimbus; import backtype.storm.generated.NotAliveException; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import streamflow.engine.topology.TopologySubmitter; import streamflow.model.Cluster; import streamflow.model.Topology; import streamflow.model.config.StreamflowConfig; import streamflow.model.storm.BoltStats; import streamflow.model.storm.ClusterSummary; import streamflow.model.storm.ErrorInfo; import streamflow.model.storm.ExecutorInfo; import streamflow.model.storm.ExecutorSpecificStats; import streamflow.model.storm.ExecutorStats; import streamflow.model.storm.ExecutorSummary; import streamflow.model.storm.SpoutStats; import streamflow.model.storm.SupervisorSummary; import streamflow.model.storm.TopologyInfo; import streamflow.model.storm.TopologySummary; import org.apache.thrift7.TException; import org.apache.thrift7.protocol.TBinaryProtocol; import org.apache.thrift7.protocol.TProtocol; import org.apache.thrift7.transport.TFramedTransport; import org.apache.thrift7.transport.TSocket; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @Singleton public class StormEngine { protected static final Logger LOG = LoggerFactory.getLogger(StormEngine.class); private LocalCluster localCluster; private final StreamflowConfig streamflowConfig; private final HashMap<String, Cluster> clusters = new HashMap<>(); private static final int KILL_BUFFER_SECS = 60; @Inject public StormEngine(StreamflowConfig streamflowConfig) { this.streamflowConfig = streamflowConfig; // Add each of the clusters from the application configuration if (streamflowConfig.getClusters() != null) { for (Cluster cluster : streamflowConfig.getClusters()) { clusters.put(cluster.getId(), cluster); } } } @Inject(optional=true) public void setLocalCluster(@Named("LocalCluster") LocalCluster localCluster) { this.localCluster = localCluster; // Manually add the local cluster and add it to the cluster map Cluster localClusterEntry = new Cluster( Cluster.LOCAL, "Local", "localhost", 6627, "localhost", 9300, null); clusters.put(localClusterEntry.getId(), localClusterEntry); } public Topology submitTopology(Topology topology, Cluster cluster) { // Execute topology submission in a thread to maintain separate context class loader for each topology TopologySubmitter submitter = new TopologySubmitter( topology, cluster, localCluster, streamflowConfig); submitter.start(); try { // Wait for the topology to be fully submitted before continuing submitter.join(); } catch (InterruptedException ex) { LOG.error("Topology submission aborted: {}", ex.getMessage()); topology = null; } return topology; } public boolean killTopology(Topology topology, int waitTimeSecs, boolean async) { boolean killed = true; if (isDeployed(topology)) { try { KillOptions killOptions = new KillOptions(); killOptions.set_wait_secs(waitTimeSecs); if (isLocal(topology.getClusterId())) { // Kill the topology on the local cluster localCluster.killTopologyWithOpts(topology.getId(), killOptions); } else { Cluster cluster = clusters.get(topology.getClusterId()); TSocket tsocket = new TSocket(cluster.getNimbusHost(), cluster.getNimbusPort()); TFramedTransport tTransport = new TFramedTransport(tsocket); TProtocol tBinaryProtocol = new TBinaryProtocol(tTransport); Nimbus.Client client = new Nimbus.Client(tBinaryProtocol); tTransport.open(); client.killTopologyWithOpts(topology.getId(), killOptions); } if (!async) { // Check for final removal of topology waitTime plus 60 second buffer killed = waitForTopologyRemoval(topology, waitTimeSecs + KILL_BUFFER_SECS); } } catch (NotAliveException ex) { // Topology is not running on the cluster so just ignore killed = true; } catch (Exception ex) { LOG.error("Exception occurred while killing the remote topology: ID = " + topology.getId() + ", Reason = " + ex.getMessage()); ex.printStackTrace(); killed = false; } } return killed; } public ClusterSummary getClusterSummary(Cluster cluster) { backtype.storm.generated.ClusterSummary summary = null; String nimbusConf = null; if (cluster != null) { if (isLocal(cluster.getId())) { summary = localCluster.getClusterInfo(); } else { TSocket tsocket = new TSocket(cluster.getNimbusHost(), cluster.getNimbusPort()); TFramedTransport tTransport = new TFramedTransport(tsocket); try { TProtocol tBinaryProtocol = new TBinaryProtocol(tTransport); Nimbus.Client client = new Nimbus.Client(tBinaryProtocol); tTransport.open(); summary = client.getClusterInfo(); nimbusConf = client.getNimbusConf(); } catch (Exception ex) { LOG.error("Exception while retrieving cluster summary: {}", ex.getMessage()); } finally { tTransport.close(); } } } ClusterSummary clusterSummary = null; if (summary != null) { clusterSummary = new ClusterSummary(); clusterSummary.setNimbusUptimeSecs(summary.get_nimbus_uptime_secs()); clusterSummary.setNimbusConf(nimbusConf); List<SupervisorSummary> supervisors = new ArrayList<>(); for (backtype.storm.generated.SupervisorSummary ss : summary.get_supervisors()) { SupervisorSummary supervisor = new SupervisorSummary(); supervisor.setHost(ss.get_host()); supervisor.setSupervisorId(ss.get_supervisor_id()); supervisor.setNumUsedWorkers(ss.get_num_used_workers()); supervisor.setNumWorkers(ss.get_num_workers()); supervisor.setUptimeSecs(ss.get_uptime_secs()); supervisors.add(supervisor); } clusterSummary.setSupervisors(supervisors); List<TopologySummary> topologies = new ArrayList<>(); for (backtype.storm.generated.TopologySummary ts : summary.get_topologies()) { TopologySummary topology = new TopologySummary(); topology.setId(ts.get_id()); topology.setName(ts.get_name()); topology.setStatus(ts.get_status()); topology.setUptimeSecs(ts.get_uptime_secs()); topology.setNumExecutors(ts.get_num_executors()); topology.setNumWorkers(ts.get_num_workers()); topology.setNumTasks(ts.get_num_tasks()); topologies.add(topology); } clusterSummary.setTopologies(topologies); } return clusterSummary; } public TopologyInfo getTopologyInfo(Topology topology) { backtype.storm.generated.TopologyInfo info = null; String topologyConf = null; // If the topology shouldn't be deployed, no need to query the cluster if (!isDeployed(topology)) { TopologyInfo topologyInfo = new TopologyInfo(); topologyInfo.setName(topology.getName()); topologyInfo.setStatus("IDLE"); return topologyInfo; } // Convert the topology ID of the streamflow topology to the id recognized by Storm String stormTopologyId = resolveStormTopologyId(topology); // The topology should be running, but found no matching name. Topology must have been killed if (stormTopologyId == null) { TopologyInfo topologyInfo = new TopologyInfo(); topologyInfo.setName(topology.getName()); topologyInfo.setStatus("KILLED"); return topologyInfo; } if (isLocal(topology.getClusterId())) { info = localCluster.getTopologyInfo(stormTopologyId); topologyConf = localCluster.getTopologyConf(stormTopologyId); } else { Cluster cluster = clusters.get(topology.getClusterId()); TSocket tsocket = new TSocket(cluster.getNimbusHost(), cluster.getNimbusPort()); TFramedTransport tTransport = new TFramedTransport(tsocket); try { TProtocol tBinaryProtocol = new TBinaryProtocol(tTransport); Nimbus.Client client = new Nimbus.Client(tBinaryProtocol); tTransport.open(); info = client.getTopologyInfo(stormTopologyId); topologyConf = client.getTopologyConf(stormTopologyId); } catch (NotAliveException ex) { LOG.error("The requested topology was not found in the cluster: ID = " + stormTopologyId); } catch (TException ex) { LOG.error("Exception while retrieving the remote topology info: ", ex.getMessage()); } finally { tTransport.close(); } } /* // Make sure the specified topology was found on the storm cluster if (info == null) { LOG.error("Unable to retrieve topology info from the storm cluster"); return null; } */ TopologyInfo topologyInfo = new TopologyInfo(); if (info != null) { topologyInfo.setId(info.get_id()); topologyInfo.setName(info.get_name()); topologyInfo.setStatus(info.get_status()); topologyInfo.setUptimeSecs(info.get_uptime_secs()); topologyInfo.setTopologyConf(topologyConf); for (Map.Entry<String, List<backtype.storm.generated.ErrorInfo>> error : info.get_errors().entrySet()) { List<ErrorInfo> errorInfoList = new ArrayList<>(); for (backtype.storm.generated.ErrorInfo ei : error.getValue()) { ErrorInfo errorInfo = new ErrorInfo(); errorInfo.setError(ei.get_error()); errorInfo.setErrorTimeSecs(ei.get_error_time_secs()); errorInfo.setHost(ei.get_host()); errorInfo.setPort(ei.get_port()); errorInfoList.add(errorInfo); } topologyInfo.getErrors().put(error.getKey(), errorInfoList); } List<ExecutorSummary> executorSummaries = new ArrayList<>(); for (backtype.storm.generated.ExecutorSummary es : info.get_executors()) { ExecutorSummary executor = new ExecutorSummary(); executor.setComponentId(es.get_component_id()); executor.setHost(es.get_host()); executor.setPort(es.get_port()); executor.setUptimeSecs(es.get_uptime_secs()); backtype.storm.generated.ExecutorInfo ei = es.get_executor_info(); if (ei != null) { ExecutorInfo executorInfo = new ExecutorInfo(); executorInfo.setTaskStart(ei.get_task_start()); executorInfo.setTaskEnd(ei.get_task_end()); executor.setExecutorInfo(executorInfo); } backtype.storm.generated.ExecutorStats eStats = es.get_stats(); if (eStats != null) { ExecutorStats stats = new ExecutorStats(); stats.setEmitted(eStats.get_emitted()); stats.setTransferred(eStats.get_transferred()); backtype.storm.generated.ExecutorSpecificStats ess = eStats.get_specific(); if (ess != null) { ExecutorSpecificStats specific = new ExecutorSpecificStats(); if (ess.is_set_bolt()) { backtype.storm.generated.BoltStats bs = ess.get_bolt(); if (bs != null) { BoltStats boltStats = new BoltStats(); for (Map.Entry<String, Map<backtype.storm.generated.GlobalStreamId, Long>> ae : bs.get_acked().entrySet()) { Map<String, Long> ackedMap = new HashMap<>(); for (Map.Entry<backtype.storm.generated.GlobalStreamId, Long> aem : ae.getValue().entrySet()) { backtype.storm.generated.GlobalStreamId gsi = aem.getKey(); String globalStreamId = gsi.get_componentId() + ":" + gsi.get_streamId(); ackedMap.put(globalStreamId, aem.getValue()); } boltStats.getAcked().put(ae.getKey(), ackedMap); } for (Map.Entry<String, Map<backtype.storm.generated.GlobalStreamId, Long>> fe : bs.get_failed().entrySet()) { Map<String, Long> failedMap = new HashMap<>(); for (Map.Entry<backtype.storm.generated.GlobalStreamId, Long> fem : fe.getValue().entrySet()) { backtype.storm.generated.GlobalStreamId gsi = fem.getKey(); String globalStreamId = gsi.get_componentId() + ":" + gsi.get_streamId(); failedMap.put(globalStreamId, fem.getValue()); } boltStats.getFailed().put(fe.getKey(), failedMap); } for (Map.Entry<String, Map<backtype.storm.generated.GlobalStreamId, Long>> ee : bs.get_executed().entrySet()) { Map<String, Long> executedMap = new HashMap<>(); for (Map.Entry<backtype.storm.generated.GlobalStreamId, Long> eem : ee.getValue().entrySet()) { backtype.storm.generated.GlobalStreamId gsi = eem.getKey(); String globalStreamId = gsi.get_componentId() + ":" + gsi.get_streamId(); executedMap.put(globalStreamId, eem.getValue()); } boltStats.getExecuted().put(ee.getKey(), executedMap); } for (Map.Entry<String, Map<backtype.storm.generated.GlobalStreamId, Double>> ema : bs.get_execute_ms_avg().entrySet()) { Map<String, Double> executedMap = new HashMap<>(); for (Map.Entry<backtype.storm.generated.GlobalStreamId, Double> emam : ema.getValue().entrySet()) { backtype.storm.generated.GlobalStreamId gsi = emam.getKey(); String globalStreamId = gsi.get_componentId() + ":" + gsi.get_streamId(); executedMap.put(globalStreamId, emam.getValue()); } boltStats.getExecuteMsAvg().put(ema.getKey(), executedMap); } for (Map.Entry<String, Map<backtype.storm.generated.GlobalStreamId, Double>> pma : bs.get_process_ms_avg().entrySet()) { Map<String, Double> processMap = new HashMap<>(); for (Map.Entry<backtype.storm.generated.GlobalStreamId, Double> pmam : pma.getValue().entrySet()) { backtype.storm.generated.GlobalStreamId gsi = pmam.getKey(); String globalStreamId = gsi.get_componentId() + ":" + gsi.get_streamId(); processMap.put(globalStreamId, pmam.getValue()); } boltStats.getProcessMsAvg().put(pma.getKey(), processMap); } specific.setBolt(boltStats); } } if (ess.is_set_spout()) { backtype.storm.generated.SpoutStats ss = ess.get_spout(); if (ss != null) { SpoutStats spoutStats = new SpoutStats(); spoutStats.setAcked(ss.get_acked()); spoutStats.setFailed(ss.get_failed()); spoutStats.setCompleteMsAvg(ss.get_complete_ms_avg()); specific.setSpout(spoutStats); } } stats.setSpecific(specific); } executor.setStats(stats); } executorSummaries.add(executor); } topologyInfo.setExecutors(executorSummaries); } return topologyInfo; } private String resolveStormTopologyId(Topology topology) { String stormTopologyId = null; // Get the cluster summary for the cluster where the topology is running ClusterSummary clusterSummary = getClusterSummary(clusters.get(topology.getClusterId())); if (clusterSummary != null) { // Iterate over all of the topologies in the cluster to match up the name to the streamflow id for (TopologySummary topologySummary : clusterSummary.getTopologies()) { // The topology ID should match the storm topology name (check for condition) if (topology.getId().equalsIgnoreCase(topologySummary.getName())) { stormTopologyId = topologySummary.getId(); break; } } } return stormTopologyId; } private boolean waitForTopologyRemoval(Topology topology, int maxNumRetries) throws InterruptedException { int numTries = 0; // Check every second to see if the topology was finally removed from the cluster while(resolveStormTopologyId(topology) != null) { Thread.sleep(1000); // If the max number of retries were hit, then just break out regardless if (++numTries == maxNumRetries) break; } return resolveStormTopologyId(topology) == null; } private boolean isLocal(String clusterId) { return clusterId.equalsIgnoreCase(Cluster.LOCAL); } private boolean isDeployed(Topology topology) { return !topology.getStatus().equals("IDLE"); } }