/*********************************************************************************************************************** * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. **********************************************************************************************************************/ package eu.stratosphere.nephele.jobmanager; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.UnknownHostException; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import eu.stratosphere.nephele.managementgraph.ManagementVertexID; import eu.stratosphere.nephele.taskmanager.TaskKillResult; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.log4j.ConsoleAppender; import 
org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.PatternLayout; import eu.stratosphere.configuration.ConfigConstants; import eu.stratosphere.configuration.Configuration; import eu.stratosphere.configuration.GlobalConfiguration; import eu.stratosphere.core.io.StringRecord; import eu.stratosphere.nephele.client.AbstractJobResult; import eu.stratosphere.nephele.client.AbstractJobResult.ReturnCode; import eu.stratosphere.nephele.client.JobCancelResult; import eu.stratosphere.nephele.client.JobProgressResult; import eu.stratosphere.nephele.client.JobSubmissionResult; import eu.stratosphere.nephele.deployment.TaskDeploymentDescriptor; import eu.stratosphere.nephele.event.job.AbstractEvent; import eu.stratosphere.nephele.event.job.RecentJobEvent; import eu.stratosphere.nephele.execution.ExecutionState; import eu.stratosphere.nephele.execution.librarycache.LibraryCacheManager; import eu.stratosphere.nephele.executiongraph.ExecutionEdge; import eu.stratosphere.nephele.executiongraph.ExecutionGraph; import eu.stratosphere.nephele.executiongraph.ExecutionGraphIterator; import eu.stratosphere.nephele.executiongraph.ExecutionVertex; import eu.stratosphere.nephele.executiongraph.ExecutionVertexID; import eu.stratosphere.nephele.executiongraph.GraphConversionException; import eu.stratosphere.nephele.executiongraph.InternalJobStatus; import eu.stratosphere.nephele.executiongraph.JobStatusListener; import eu.stratosphere.nephele.instance.AbstractInstance; import eu.stratosphere.nephele.instance.DummyInstance; import eu.stratosphere.nephele.instance.HardwareDescription; import eu.stratosphere.nephele.instance.InstanceConnectionInfo; import eu.stratosphere.nephele.instance.InstanceManager; import eu.stratosphere.nephele.instance.InstanceType; import eu.stratosphere.nephele.instance.InstanceTypeDescription; import eu.stratosphere.nephele.instance.local.LocalInstanceManager; import eu.stratosphere.runtime.io.channels.ChannelID; import 
eu.stratosphere.nephele.ipc.RPC; import eu.stratosphere.nephele.ipc.Server; import eu.stratosphere.nephele.jobgraph.AbstractJobVertex; import eu.stratosphere.nephele.jobgraph.JobGraph; import eu.stratosphere.nephele.jobgraph.JobID; import eu.stratosphere.nephele.jobmanager.accumulators.AccumulatorManager; import eu.stratosphere.nephele.jobmanager.archive.ArchiveListener; import eu.stratosphere.nephele.jobmanager.archive.MemoryArchivist; import eu.stratosphere.nephele.jobmanager.scheduler.AbstractScheduler; import eu.stratosphere.nephele.jobmanager.scheduler.SchedulingException; import eu.stratosphere.nephele.jobmanager.splitassigner.InputSplitManager; import eu.stratosphere.nephele.jobmanager.splitassigner.InputSplitWrapper; import eu.stratosphere.nephele.jobmanager.web.WebInfoServer; import eu.stratosphere.nephele.managementgraph.ManagementGraph; import eu.stratosphere.nephele.profiling.JobManagerProfiler; import eu.stratosphere.nephele.profiling.ProfilingUtils; import eu.stratosphere.nephele.protocols.AccumulatorProtocol; import eu.stratosphere.nephele.protocols.ChannelLookupProtocol; import eu.stratosphere.nephele.protocols.ExtendedManagementProtocol; import eu.stratosphere.nephele.protocols.InputSplitProviderProtocol; import eu.stratosphere.nephele.protocols.JobManagerProtocol; import eu.stratosphere.nephele.services.accumulators.AccumulatorEvent; import eu.stratosphere.nephele.taskmanager.AbstractTaskResult; import eu.stratosphere.nephele.taskmanager.TaskCancelResult; import eu.stratosphere.nephele.taskmanager.TaskExecutionState; import eu.stratosphere.nephele.taskmanager.TaskSubmissionResult; import eu.stratosphere.runtime.io.network.ConnectionInfoLookupResponse; import eu.stratosphere.runtime.io.network.RemoteReceiver; import eu.stratosphere.nephele.taskmanager.ExecutorThreadFactory; import eu.stratosphere.nephele.topology.NetworkTopology; import eu.stratosphere.nephele.types.IntegerRecord; import eu.stratosphere.nephele.util.SerializableArrayList; import 
eu.stratosphere.util.StringUtils; /** * In Nephele the job manager is the central component for communication with clients, creating * schedules for incoming jobs and supervise their execution. A job manager may only exist once in * the system and its address must be known the clients. * Task managers can discover the job manager by means of an UDP broadcast and afterwards advertise * themselves as new workers for tasks. * */ public class JobManager implements DeploymentManager, ExtendedManagementProtocol, InputSplitProviderProtocol, JobManagerProtocol, ChannelLookupProtocol, JobStatusListener, AccumulatorProtocol { public static enum ExecutionMode { LOCAL, CLUSTER } // -------------------------------------------------------------------------------------------- private static final Log LOG = LogFactory.getLog(JobManager.class); private final Server jobManagerServer; private final JobManagerProfiler profiler; private final EventCollector eventCollector; private final ArchiveListener archive; private final InputSplitManager inputSplitManager; private final AbstractScheduler scheduler; private AccumulatorManager accumulatorManager; private InstanceManager instanceManager; private final int recommendedClientPollingInterval; private final ExecutorService executorService = Executors.newCachedThreadPool(ExecutorThreadFactory.INSTANCE); private final static int FAILURE_RETURN_CODE = 1; private final AtomicBoolean isShutdownInProgress = new AtomicBoolean(false); private volatile boolean isShutDown; private WebInfoServer server; public JobManager(ExecutionMode executionMode) throws Exception { final String ipcAddressString = GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null); InetAddress ipcAddress = null; if (ipcAddressString != null) { try { ipcAddress = InetAddress.getByName(ipcAddressString); } catch (UnknownHostException e) { throw new Exception("Cannot convert " + ipcAddressString + " to an IP address: " + e.getMessage(), e); } } final int 
ipcPort = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT); // Read the suggested client polling interval this.recommendedClientPollingInterval = GlobalConfiguration.getInteger( ConfigConstants.JOBCLIENT_POLLING_INTERVAL_KEY, ConfigConstants.DEFAULT_JOBCLIENT_POLLING_INTERVAL); // Load the job progress collector this.eventCollector = new EventCollector(this.recommendedClientPollingInterval); // Register simple job archive int archived_items = GlobalConfiguration.getInteger( ConfigConstants.JOB_MANAGER_WEB_ARCHIVE_COUNT, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_ARCHIVE_COUNT); if (archived_items > 0) { this.archive = new MemoryArchivist(archived_items); this.eventCollector.registerArchivist(archive); } else { this.archive = null; } // Create the accumulator manager, with same archiving limit as web // interface. We need to store the accumulators for at least one job. // Otherwise they might be deleted before the client requested the // accumulator results. 
this.accumulatorManager = new AccumulatorManager(Math.min(1, archived_items)); // Load the input split manager this.inputSplitManager = new InputSplitManager(); // Determine own RPC address final InetSocketAddress rpcServerAddress = new InetSocketAddress(ipcAddress, ipcPort); // Start job manager's IPC server try { final int handlerCount = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_HANDLERS_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_HANDLERS); this.jobManagerServer = RPC.getServer(this, rpcServerAddress.getHostName(), rpcServerAddress.getPort(), handlerCount); this.jobManagerServer.start(); } catch (IOException e) { throw new Exception("Cannot start RPC server: " + e.getMessage(), e); } LOG.info("Starting job manager in " + executionMode + " mode"); // Try to load the instance manager for the given execution mode // Try to load the scheduler for the given execution mode if (executionMode == ExecutionMode.LOCAL) { try { this.instanceManager = new LocalInstanceManager(); } catch (Throwable t) { throw new Exception("Cannot instantiate local instance manager: " + t.getMessage(), t); } } else { final String instanceManagerClassName = JobManagerUtils.getInstanceManagerClassName(executionMode); LOG.info("Trying to load " + instanceManagerClassName + " as instance manager"); this.instanceManager = JobManagerUtils.loadInstanceManager(instanceManagerClassName); if (this.instanceManager == null) { throw new Exception("Unable to load instance manager " + instanceManagerClassName); } } // Try to load the scheduler for the given execution mode final String schedulerClassName = JobManagerUtils.getSchedulerClassName(executionMode); LOG.info("Trying to load " + schedulerClassName + " as scheduler"); // Try to get the instance manager class name this.scheduler = JobManagerUtils.loadScheduler(schedulerClassName, this, this.instanceManager); if (this.scheduler == null) { throw new Exception("Unable to load scheduler " + schedulerClassName); } // Load profiler if 
it should be used if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) { final String profilerClassName = GlobalConfiguration.getString(ProfilingUtils.JOBMANAGER_CLASSNAME_KEY, "eu.stratosphere.nephele.profiling.impl.JobManagerProfilerImpl"); this.profiler = ProfilingUtils.loadJobManagerProfiler(profilerClassName, ipcAddress); if (this.profiler == null) { throw new Exception("Cannot load profiler"); } } else { this.profiler = null; LOG.debug("Profiler disabled"); } } public void shutdown() { if (!this.isShutdownInProgress.compareAndSet(false, true)) { return; } // Stop instance manager if (this.instanceManager != null) { this.instanceManager.shutdown(); } // Stop profiling if enabled if (this.profiler != null) { this.profiler.shutdown(); } // Stop RPC server if (this.jobManagerServer != null) { this.jobManagerServer.stop(); } // Stop the executor service if (this.executorService != null) { this.executorService.shutdown(); try { this.executorService.awaitTermination(5000L, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { LOG.debug(e); } } // Stop and clean up the job progress collector if (this.eventCollector != null) { this.eventCollector.shutdown(); } // Finally, shut down the scheduler if (this.scheduler != null) { this.scheduler.shutdown(); } this.isShutDown = true; LOG.debug("Shutdown of job manager completed"); } /** * Log Stratosphere version information. 
*/ private static void logVersionInformation() { String version = JobManager.class.getPackage().getImplementationVersion(); // if version == null, then the JobManager runs from inside the IDE (or somehow not from the maven build jar) String revision = "<unknown>"; try { Properties properties = new Properties(); InputStream propFile = JobManager.class.getClassLoader().getResourceAsStream(".version.properties"); if (propFile != null) { properties.load(propFile); revision = properties.getProperty("git.commit.id.abbrev"); } } catch (IOException e) { LOG.info("Cannot determine code revision. Unable ro read version property file."); } LOG.info("Starting Stratosphere JobManager (Version: " + version + ", Rev:" + revision + ")"); } /** * Entry point for the program * * @param args * arguments from the command line */ public static void main(String[] args) { // determine if a valid log4j config exists and initialize a default logger if not if (System.getProperty("log4j.configuration") == null) { Logger root = Logger.getRootLogger(); root.removeAllAppenders(); PatternLayout layout = new PatternLayout("%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n"); ConsoleAppender appender = new ConsoleAppender(layout, "System.err"); root.addAppender(appender); root.setLevel(Level.INFO); } JobManager jobManager; try { jobManager = initialize(args); // Start info server for jobmanager jobManager.startInfoServer(); } catch (Exception e) { LOG.fatal(e.getMessage(), e); System.exit(FAILURE_RETURN_CODE); } // Clean up is triggered through a shutdown hook // freeze this thread to keep the JVM alive (the job manager threads are daemon threads) Object w = new Object(); synchronized (w) { try { w.wait(); } catch (InterruptedException e) {} } } @SuppressWarnings("static-access") public static JobManager initialize(String[] args) throws Exception { // output the version and revision information to the log logVersionInformation(); final Option configDirOpt = OptionBuilder.withArgName("config 
directory").hasArg() .withDescription("Specify configuration directory.").create("configDir"); final Option executionModeOpt = OptionBuilder.withArgName("execution mode").hasArg() .withDescription("Specify execution mode.").create("executionMode"); final Options options = new Options(); options.addOption(configDirOpt); options.addOption(executionModeOpt); CommandLineParser parser = new GnuParser(); CommandLine line = null; try { line = parser.parse(options, args); } catch (ParseException e) { LOG.error("CLI Parsing failed. Reason: " + e.getMessage()); System.exit(FAILURE_RETURN_CODE); } final String configDir = line.getOptionValue(configDirOpt.getOpt(), null); final String executionModeName = line.getOptionValue(executionModeOpt.getOpt(), "local"); ExecutionMode executionMode = null; if ("local".equals(executionModeName)) { executionMode = ExecutionMode.LOCAL; } else if ("cluster".equals(executionModeName)) { executionMode = ExecutionMode.CLUSTER; } else { System.err.println("Unrecognized execution mode: " + executionModeName); System.exit(FAILURE_RETURN_CODE); } // First, try to load global configuration GlobalConfiguration.loadConfiguration(configDir); // Create a new job manager object JobManager jobManager = new JobManager(executionMode); // Set base dir for info server Configuration infoserverConfig = GlobalConfiguration.getConfiguration(); if (configDir != null && new File(configDir).isDirectory()) { infoserverConfig.setString(ConfigConstants.STRATOSPHERE_BASE_DIR_PATH_KEY, configDir+"/.."); } GlobalConfiguration.includeConfiguration(infoserverConfig); return jobManager; } @Override public JobSubmissionResult submitJob(JobGraph job) throws IOException { try { // First check if job is null if (job == null) { JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, "Submitted job is null!"); return result; } if (LOG.isDebugEnabled()) { LOG.debug("Submitted job " + job.getName() + " is not null"); } // Check if any vertex of the 
// graph has null edges
			AbstractJobVertex jv = job.findVertexWithNullEdges();
			if (jv != null) {
				JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
					"Vertex " + jv.getName() + " has at least one null edge");
				return result;
			}

			if (LOG.isDebugEnabled()) {
				LOG.debug("Submitted job " + job.getName() + " has no null edges");
			}

			// Next, check if the graph is weakly connected
			if (!job.isWeaklyConnected()) {
				JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
					"Job graph is not weakly connected");
				return result;
			}

			if (LOG.isDebugEnabled()) {
				LOG.debug("The graph of job " + job.getName() + " is weakly connected");
			}

			// Check if job graph has cycles
			if (!job.isAcyclic()) {
				JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
					"Job graph is not a DAG");
				return result;
			}

			if (LOG.isDebugEnabled()) {
				LOG.debug("The graph of job " + job.getName() + " is acyclic");
			}

			// Check constrains on degree
			jv = job.areVertexDegreesCorrect();
			if (jv != null) {
				JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
					"Degree of vertex " + jv.getName() + " is incorrect");
				return result;
			}

			if (LOG.isDebugEnabled()) {
				LOG.debug("All vertices of job " + job.getName() + " have the correct degree");
			}

			// Vertices sharing instances must not form a cyclic dependency
			if (!job.isInstanceDependencyChainAcyclic()) {
				JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
					"The dependency chain for instance sharing contains a cycle");
				return result;
			}

			if (LOG.isDebugEnabled()) {
				LOG.debug("The dependency chain for instance sharing is acyclic");
			}

			// Check if the job will be executed with profiling enabled
			// (requires both the profiler component and the per-job flag, which defaults to true)
			boolean jobRunsWithProfiling = false;
			if (this.profiler != null && job.getJobConfiguration().getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) {
				jobRunsWithProfiling = true;
			}

			// Try to create initial execution graph from job graph
			LOG.info("Creating initial execution graph from job graph " +
				job.getName());
			ExecutionGraph eg;

			try {
				eg = new ExecutionGraph(job, this.instanceManager);
			} catch (GraphConversionException e) {
				// Report the root cause to the client when one is available;
				// a FileNotFoundException is reported by message only.
				if (e.getCause() == null) {
					return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(e));
				} else {
					Throwable t = e.getCause();
					if (t instanceof FileNotFoundException) {
						return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, t.getMessage());
					} else {
						return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(t));
					}
				}
			}

			// Register job with the progress collector
			if (this.eventCollector != null) {
				this.eventCollector.registerJob(eg, jobRunsWithProfiling, System.currentTimeMillis());
			}

			// Check if profiling should be enabled for this job
			if (jobRunsWithProfiling) {
				this.profiler.registerProfilingJob(eg);

				if (this.eventCollector != null) {
					this.profiler.registerForProfilingData(eg.getJobID(), this.eventCollector);
				}
			}

			// Register job with the dynamic input split assigner
			this.inputSplitManager.registerJob(eg);

			// Register for updates on the job status
			eg.registerJobStatusListener(this);

			// Schedule job
			if (LOG.isInfoEnabled()) {
				LOG.info("Scheduling job " + job.getName());
			}

			try {
				// NOTE: "schedulJob" (sic) is the scheduler's actual API method name
				this.scheduler.schedulJob(eg);
			} catch (SchedulingException e) {
				// roll back the registrations performed above before reporting the error
				unregisterJob(eg);
				JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
					StringUtils.stringifyException(e));
				return result;
			}

			// Return on success
			return new JobSubmissionResult(AbstractJobResult.ReturnCode.SUCCESS, null);

		} catch (Throwable t) {
			LOG.error("Job submission failed.", t);
			return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(t));
		}
	}

	/**
	 * Returns the instance manager used by this job manager.
	 *
	 * @return the instance manager
	 */
	public InstanceManager getInstanceManager() {
		return this.instanceManager;
	}

	/**
	 * This method is a convenience method to unregister a job from all of
	 * Nephele's monitoring, profiling and optimization components at once.
 * Currently, it is only being used to unregister from profiling (if activated).
	 *
	 * @param executionGraph
	 *        the execution graph to remove from the job manager
	 */
	private void unregisterJob(final ExecutionGraph executionGraph) {

		// Remove job from profiler (if activated)
		if (this.profiler != null
			&& executionGraph.getJobConfiguration().getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) {
			this.profiler.unregisterProfilingJob(executionGraph);

			if (this.eventCollector != null) {
				this.profiler.unregisterFromProfilingData(executionGraph.getJobID(), this.eventCollector);
			}
		}

		// Cancel all pending requests for instances
		this.instanceManager.cancelPendingRequests(executionGraph.getJobID()); // getJobID is final member, no
																				// synchronization necessary

		// Remove job from input split manager
		if (this.inputSplitManager != null) {
			this.inputSplitManager.unregisterJob(executionGraph);
		}

		// Unregister job with library cache manager
		try {
			LibraryCacheManager.unregister(executionGraph.getJobID());
		} catch (IOException ioe) {
			// best-effort cleanup: log the problem but do not fail the unregistration
			if (LOG.isWarnEnabled()) {
				LOG.warn(ioe);
			}
		}
	}

	/**
	 * Accepts a heartbeat from a task manager and forwards it to the instance
	 * manager asynchronously on the executor service.
	 */
	@Override
	public void sendHeartbeat(final InstanceConnectionInfo instanceConnectionInfo,
			final HardwareDescription hardwareDescription) {

		// Delegate call to instance manager
		if (this.instanceManager != null) {

			final Runnable heartBeatRunnable = new Runnable() {

				@Override
				public void run() {
					instanceManager.reportHeartBeat(instanceConnectionInfo, hardwareDescription);
				}
			};

			this.executorService.execute(heartBeatRunnable);
		}
	}

	/**
	 * Propagates a task state change reported by a task manager to the
	 * corresponding vertex of the execution graph.
	 */
	@Override
	public void updateTaskExecutionState(final TaskExecutionState executionState) throws IOException {

		// Ignore calls with executionResult == null
		if (executionState == null) {
			LOG.error("Received call to updateTaskExecutionState with executionState == null");
			return;
		}

		if (executionState.getExecutionState() == ExecutionState.FAILED) {
			LOG.error(executionState.getDescription());
		}

		final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(executionState.getJobID());
		if (eg == null) {
			LOG.error("Cannot find execution graph for ID " + executionState.getJobID() + " to change state to "
				+ executionState.getExecutionState());
			return;
		}

		final ExecutionVertex vertex = eg.getVertexByID(executionState.getID());
		if (vertex == null) {
			LOG.error("Cannot find vertex with ID " + executionState.getID() + " of job " + eg.getJobID()
				+ " to change state to " + executionState.getExecutionState());
			return;
		}

		// Asynchronously update execute state of vertex
		vertex.updateExecutionStateAsynchronously(executionState.getExecutionState(), executionState.getDescription());
	}

	/**
	 * Cancels the job with the given ID. The actual cancellation is executed
	 * asynchronously inside the execution graph's command queue.
	 *
	 * @param jobID
	 *        the ID of the job to cancel
	 * @return the result of the cancel request
	 */
	@Override
	public JobCancelResult cancelJob(final JobID jobID) throws IOException {

		LOG.info("Trying to cancel job with ID " + jobID);

		final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
		if (eg == null) {
			return new JobCancelResult(ReturnCode.ERROR, "Cannot find job with ID " + jobID);
		}

		final Runnable cancelJobRunnable = new Runnable() {

			@Override
			public void run() {
				eg.updateJobStatus(InternalJobStatus.CANCELING, "Job canceled by user");
				final TaskCancelResult cancelResult = cancelJob(eg);
				if (cancelResult != null) {
					LOG.error(cancelResult.getDescription());
				}
			}
		};

		eg.executeCommand(cancelJobRunnable);

		LOG.info("Cancel of job " + jobID + " successfully triggered");

		return new JobCancelResult(AbstractJobResult.ReturnCode.SUCCESS, null);
	}

	/**
	 * Cancels all the tasks in the current and upper stages of the
	 * given execution graph.
	 *
	 * @param eg
	 *        the execution graph representing the job to cancel.
	 * @return <code>null</code> if no error occurred during the cancel attempt,
	 *         otherwise the returned object will describe the error
	 */
	private TaskCancelResult cancelJob(final ExecutionGraph eg) {

		TaskCancelResult errorResult = null;

		/**
		 * Cancel all nodes in the current and upper execution stages.
 */
		final Iterator<ExecutionVertex> it = new ExecutionGraphIterator(eg, eg.getIndexOfCurrentExecutionStage(),
			false, true);
		while (it.hasNext()) {

			final ExecutionVertex vertex = it.next();
			final TaskCancelResult result = vertex.cancelTask();
			// remember the last failure but keep canceling the remaining vertices
			if (result.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
				errorResult = result;
			}
		}

		return errorResult;
	}

	/**
	 * Returns the progress events collected for the given job.
	 *
	 * @param jobID
	 *        the ID of the job to report progress for
	 * @return the progress result including all collected events
	 */
	@Override
	public JobProgressResult getJobProgress(final JobID jobID) throws IOException {

		if (this.eventCollector == null) {
			return new JobProgressResult(ReturnCode.ERROR, "JobManager does not support progress reports for jobs",
				null);
		}

		final SerializableArrayList<AbstractEvent> eventList = new SerializableArrayList<AbstractEvent>();
		this.eventCollector.getEventsForJob(jobID, eventList, false);

		return new JobProgressResult(ReturnCode.SUCCESS, null, eventList);
	}

	/**
	 * Resolves, for a task manager asking on behalf of one end of a channel, where the
	 * other end of the channel currently runs (same task manager, remote receiver, or
	 * not yet ready/deployed).
	 */
	@Override
	public ConnectionInfoLookupResponse lookupConnectionInfo(InstanceConnectionInfo caller, JobID jobID,
			ChannelID sourceChannelID) {

		final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
		if (eg == null) {
			LOG.error("Cannot find execution graph to job ID " + jobID);
			return ConnectionInfoLookupResponse.createReceiverNotFound();
		}

		final InternalJobStatus jobStatus = eg.getJobStatus();
		if (jobStatus == InternalJobStatus.FAILING || jobStatus == InternalJobStatus.CANCELING) {
			return ConnectionInfoLookupResponse.createJobIsAborting();
		}

		final ExecutionEdge edge = eg.getEdgeByID(sourceChannelID);
		if (edge == null) {
			LOG.error("Cannot find execution edge associated with ID " + sourceChannelID);
			return ConnectionInfoLookupResponse.createReceiverNotFound();
		}

		if (sourceChannelID.equals(edge.getInputChannelID())) {
			// Request was sent from an input channel
			final ExecutionVertex connectedVertex = edge.getOutputGate().getVertex();

			final AbstractInstance assignedInstance = connectedVertex.getAllocatedResource().getInstance();
			if (assignedInstance == null) {
				// NOTE(review): message says "channel ID" but logs the output gate index — confirm intent
				LOG.error("Cannot resolve lookup: vertex found for channel ID " +
					edge.getOutputGateIndex() + " but no instance assigned");
				// LOG.info("Created receiverNotReady for " + connectedVertex + " 1");
				return ConnectionInfoLookupResponse.createReceiverNotReady();
			}

			// Check execution state
			final ExecutionState executionState = connectedVertex.getExecutionState();
			if (executionState == ExecutionState.FINISHED) {
				// that should not happen. if there is data pending, the receiver cannot be ready
				return ConnectionInfoLookupResponse.createReceiverNotFound();
			}

			// running is common, finishing is happens when the lookup is for the close event
			if (executionState != ExecutionState.RUNNING && executionState != ExecutionState.FINISHING) {
				// LOG.info("Created receiverNotReady for " + connectedVertex + " in state " + executionState + " 2");
				return ConnectionInfoLookupResponse.createReceiverNotReady();
			}

			if (assignedInstance.getInstanceConnectionInfo().equals(caller)) {
				// Receiver runs on the same task manager
				return ConnectionInfoLookupResponse.createReceiverFoundAndReady(edge.getOutputChannelID());
			} else {
				// Receiver runs on a different task manager
				final InstanceConnectionInfo ici = assignedInstance.getInstanceConnectionInfo();
				final InetSocketAddress isa = new InetSocketAddress(ici.address(), ici.dataPort());

				return ConnectionInfoLookupResponse.createReceiverFoundAndReady(new RemoteReceiver(isa,
					edge.getConnectionID()));
			}
		}
		// else, the request is for an output channel

		// Find vertex of connected input channel
		final ExecutionVertex targetVertex = edge.getInputGate().getVertex();

		// Check execution state
		final ExecutionState executionState = targetVertex.getExecutionState();

		// check whether the task needs to be deployed
		if (executionState != ExecutionState.RUNNING && executionState != ExecutionState.FINISHING
			&& executionState != ExecutionState.FINISHED) {

			if (executionState == ExecutionState.ASSIGNED) {
				// trigger deployment of the receiver through the graph's command queue
				final Runnable command = new Runnable() {

					@Override
					public void run() {
						scheduler.deployAssignedVertices(targetVertex);
					}
				};
				eg.executeCommand(command);
			}

			// LOG.info("Created receiverNotReady for " + targetVertex + " in state " + executionState + " 3");
			return ConnectionInfoLookupResponse.createReceiverNotReady();
		}

		final AbstractInstance assignedInstance = targetVertex.getAllocatedResource().getInstance();
		if (assignedInstance == null) {
			LOG.error("Cannot resolve lookup: vertex found for channel ID " + edge.getInputChannelID()
				+ " but no instance assigned");
			// LOG.info("Created receiverNotReady for " + targetVertex + " in state " + executionState + " 4");
			return ConnectionInfoLookupResponse.createReceiverNotReady();
		}

		if (assignedInstance.getInstanceConnectionInfo().equals(caller)) {
			// Receiver runs on the same task manager
			return ConnectionInfoLookupResponse.createReceiverFoundAndReady(edge.getInputChannelID());
		} else {
			// Receiver runs on a different task manager
			final InstanceConnectionInfo ici = assignedInstance.getInstanceConnectionInfo();
			final InetSocketAddress isa = new InetSocketAddress(ici.address(), ici.dataPort());

			return ConnectionInfoLookupResponse.createReceiverFoundAndReady(new RemoteReceiver(isa,
				edge.getConnectionID()));
		}
	}

	/**
	 * Returns current ManagementGraph from eventCollector and, if not current, from archive
	 *
	 * {@inheritDoc}
	 */
	@Override
	public ManagementGraph getManagementGraph(final JobID jobID) throws IOException {

		ManagementGraph mg = this.eventCollector.getManagementGraph(jobID);
		if (mg == null) {
			if (this.archive != null) {
				mg = this.archive.getManagementGraph(jobID);
			}

			if (mg == null) {
				throw new IOException("Cannot find job with ID " + jobID);
			}
		}

		return mg;
	}

	/**
	 * Returns the network topology for the given job, delegating to the instance manager.
	 */
	@Override
	public NetworkTopology getNetworkTopology(final JobID jobID) throws IOException {

		if (this.instanceManager != null) {
			return this.instanceManager.getNetworkTopology(jobID);
		}

		return null;
	}

	/**
	 * Returns the polling interval suggested to job clients.
	 */
	@Override
	public IntegerRecord getRecommendedPollingInterval() throws IOException {

		return new IntegerRecord(this.recommendedClientPollingInterval);
	}

	@Override
	public List<RecentJobEvent>
getRecentJobs() throws IOException {

	// Use a serializable list implementation so the result can be shipped over RPC.
	final List<RecentJobEvent> eventList = new SerializableArrayList<RecentJobEvent>();

	if (this.eventCollector == null) {
		throw new IOException("No instance of the event collector found");
	}

	this.eventCollector.getRecentJobs(eventList);

	return eventList;
}

/**
 * Returns all events which have been collected for the job with the given ID.
 *
 * @param jobID
 *        the ID of the job to return the collected events for
 * @return a (serializable) list of the collected events
 * @throws IOException
 *         thrown if no event collector is available
 */
@Override
public List<AbstractEvent> getEvents(final JobID jobID) throws IOException {

	final List<AbstractEvent> eventList = new SerializableArrayList<AbstractEvent>();

	if (this.eventCollector == null) {
		throw new IOException("No instance of the event collector found");
	}

	// NOTE(review): third argument presumably requests management events as well — confirm flag semantics
	this.eventCollector.getEventsForJob(jobID, eventList, true);

	return eventList;
}

/**
 * Kills the task identified by the given management vertex ID. The kill request itself is
 * executed asynchronously via the execution graph's command executor; lookup failures are
 * only logged, not reported to the caller.
 *
 * @param jobID
 *        the ID of the job the task belongs to
 * @param id
 *        the management vertex ID of the task to kill
 * @throws IOException
 *         declared for RPC compatibility
 */
@Override
public void killTask(final JobID jobID, final ManagementVertexID id) throws IOException {

	final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
	if (eg == null) {
		LOG.error("Cannot find execution graph for job " + jobID);
		return;
	}

	final ExecutionVertex vertex = eg.getVertexByID(ExecutionVertexID.fromManagementVertexID(id));
	if (vertex == null) {
		LOG.error("Cannot find execution vertex with ID " + id);
		return;
	}

	LOG.info("Killing task " + vertex + " of job " + jobID);

	final Runnable runnable = new Runnable() {

		@Override
		public void run() {

			final TaskKillResult result = vertex.killTask();
			if (result.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
				LOG.error(result.getDescription());
			}
		}
	};

	// Execute asynchronously on the graph's command executor
	eg.executeCommand(runnable);
}

/**
 * Kills the task manager running on the instance with the given name. The kill request is
 * sent from a separate thread so the RPC call returns immediately.
 *
 * @param instanceName
 *        the name of the instance whose task manager shall be killed
 * @throws IOException
 *         declared for RPC compatibility
 */
@Override
public void killInstance(final StringRecord instanceName) throws IOException {

	final AbstractInstance instance = this.instanceManager.getInstanceByName(instanceName.toString());
	if (instance == null) {
		LOG.error("Cannot find instance with name " + instanceName + " to kill it");
		return;
	}

	LOG.info("Killing task manager on instance " + instance);

	final Runnable runnable = new Runnable() {

		@Override
		public void run() {
			try {
				instance.killTaskManager();
			} catch (IOException ioe) {
				LOG.error(ioe);
			}
		}
	};

	// Hand it over to the executor service
	this.executorService.execute(runnable);
}

/**
 * Tests whether the job manager has been shut down completely.
 *
 * @return <code>true</code> if the job manager has been shut down completely, <code>false</code> otherwise
 */
public boolean isShutDown() {

	return this.isShutDown;
}

/**
 * Returns the map of instance types that are currently available to the job manager,
 * or <code>null</code> if no instance manager has been set.
 *
 * @return the map of available instance types or <code>null</code>
 */
public Map<InstanceType, InstanceTypeDescription> getMapOfAvailableInstanceTypes() {

	// Delegate call to the instance manager
	if (this.instanceManager != null) {
		return this.instanceManager.getMapOfAvailableInstanceTypes();
	}

	return null;
}

/**
 * Called when the status of a job has changed. Triggers cancellation of the remaining
 * tasks when the job enters FAILING and unregisters the job from monitoring once it
 * reaches a terminal state (CANCELED, FAILED, or FINISHED).
 *
 * @param executionGraph
 *        the execution graph whose status changed
 * @param newJobStatus
 *        the new status of the job
 * @param optionalMessage
 *        an optional message describing the status change (unused here)
 */
@Override
public void jobStatusHasChanged(final ExecutionGraph executionGraph, final InternalJobStatus newJobStatus,
		final String optionalMessage) {

	LOG.info("Status of job " + executionGraph.getJobName() + "(" + executionGraph.getJobID() + ")"
		+ " changed to " + newJobStatus);

	if (newJobStatus == InternalJobStatus.FAILING) {

		// Cancel all remaining tasks
		cancelJob(executionGraph);
	}

	if (newJobStatus == InternalJobStatus.CANCELED || newJobStatus == InternalJobStatus.FAILED
		|| newJobStatus == InternalJobStatus.FINISHED) {
		// Unregister job for Nephele's monitoring, optimization components, and dynamic input split assignment
		unregisterJob(executionGraph);
	}
}

/**
 * Asks every task manager that currently runs a RUNNING or FINISHING vertex of the
 * given job to log its buffer utilization. The requests are sent from a separate
 * thread so the RPC call returns immediately.
 *
 * @param jobID
 *        the ID of the job whose task managers shall log their buffer utilization
 * @throws IOException
 *         declared for RPC compatibility
 */
@Override
public void logBufferUtilization(final JobID jobID) throws IOException {

	final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
	if (eg == null) {
		return;
	}

	// Collect the distinct set of instances that currently execute tasks of this job
	final Set<AbstractInstance> allocatedInstance = new HashSet<AbstractInstance>();

	final Iterator<ExecutionVertex> it = new ExecutionGraphIterator(eg, true);
	while (it.hasNext()) {

		final ExecutionVertex vertex = it.next();
		final ExecutionState state = vertex.getExecutionState();
		if (state == ExecutionState.RUNNING || state == ExecutionState.FINISHING) {
			final AbstractInstance instance = vertex.getAllocatedResource().getInstance();

			// A running vertex must never be assigned to a placeholder instance
			if (instance instanceof DummyInstance) {
				LOG.error("Found instance of type DummyInstance for vertex " + vertex.getName() + " (state "
					+ state + ")");
				continue;
			}

			allocatedInstance.add(instance);
		}
	}

	// Send requests to task managers from separate thread
	final Runnable requestRunnable = new Runnable() {

		@Override
		public void run() {

			final Iterator<AbstractInstance> it2 = allocatedInstance.iterator();

			try {
				while (it2.hasNext()) {
					it2.next().logBufferUtilization();
				}
			} catch (IOException ioe) {
				LOG.error(ioe);
			}
		}
	};

	// Hand over to the executor service
	this.executorService.execute(requestRunnable);
}

/**
 * Deploys the given list of vertices on the given instance. The vertices are switched to
 * STARTING synchronously; the actual task submission happens asynchronously on the
 * executor service. Vertices whose submission fails are switched to FAILED and left to
 * the scheduler.
 *
 * @param jobID
 *        the ID of the job the vertices to be deployed belong to
 * @param instance
 *        the instance the vertices shall be deployed on
 * @param verticesToBeDeployed
 *        the list of vertices to deploy
 */
@Override
public void deploy(final JobID jobID, final AbstractInstance instance,
		final List<ExecutionVertex> verticesToBeDeployed) {

	if (verticesToBeDeployed.isEmpty()) {
		LOG.error("Method 'deploy' called but list of vertices to be deployed is empty");
		return;
	}

	for (final ExecutionVertex vertex : verticesToBeDeployed) {

		// Check vertex state
		if (vertex.getExecutionState() != ExecutionState.READY) {
			LOG.error("Expected vertex " + vertex + " to be in state READY but it is in state "
				+ vertex.getExecutionState());
		}

		vertex.updateExecutionState(ExecutionState.STARTING, null);
	}

	// Create a new runnable and pass it the executor service
	final Runnable deploymentRunnable = new Runnable() {

		/**
		 * {@inheritDoc}
		 */
		@Override
		public void run() {

			// Check if all required libraries are available on the instance
			try {
				instance.checkLibraryAvailability(jobID);
			} catch (IOException ioe) {
				LOG.error("Cannot check library availability: " + StringUtils.stringifyException(ioe));
			}

			final List<TaskDeploymentDescriptor> submissionList = new SerializableArrayList<TaskDeploymentDescriptor>();

			// Check the consistency of the call
			for (final ExecutionVertex vertex : verticesToBeDeployed) {

				submissionList.add(vertex.constructDeploymentDescriptor());

				LOG.info("Starting task " + vertex + " on " + vertex.getAllocatedResource().getInstance());
			}

			List<TaskSubmissionResult> submissionResultList = null;

			try {
				submissionResultList = instance.submitTasks(submissionList);
			} catch (final IOException ioe) {
				final String errorMsg = StringUtils.stringifyException(ioe);
				for (final ExecutionVertex vertex : verticesToBeDeployed) {
					vertex.updateExecutionStateAsynchronously(ExecutionState.FAILED, errorMsg);
				}
				// BUGFIX: submissionResultList is still null here; continuing would throw a
				// NullPointerException below. All vertices have been marked FAILED, so stop.
				return;
			}

			if (verticesToBeDeployed.size() != submissionResultList.size()) {
				LOG.error("size of submission result list does not match size of list with vertices to be deployed");
			}

			int count = 0;
			for (final TaskSubmissionResult tsr : submissionResultList) {

				ExecutionVertex vertex = verticesToBeDeployed.get(count++);
				if (!vertex.getID().equals(tsr.getVertexID())) {
					LOG.error("Expected different order of objects in task result list");
					// Fall back to a linear search for the matching vertex
					vertex = null;
					for (final ExecutionVertex candVertex : verticesToBeDeployed) {
						if (tsr.getVertexID().equals(candVertex.getID())) {
							vertex = candVertex;
							break;
						}
					}

					if (vertex == null) {
						LOG.error("Cannot find execution vertex for vertex ID " + tsr.getVertexID());
						continue;
					}
				}

				if (tsr.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
					// Change the execution state to failed and let the scheduler deal with the rest
					vertex.updateExecutionStateAsynchronously(ExecutionState.FAILED, tsr.getDescription());
				}
			}
		}
	};

	this.executorService.execute(deploymentRunnable);
}

/**
 * Returns the next input split for the given vertex, wrapped together with the job ID.
 *
 * @param jobID
 *        the ID of the job the requesting vertex belongs to
 * @param vertexID
 *        the ID of the vertex requesting the input split
 * @param sequenceNumber
 *        the sequence number of the request
 * @return the wrapped input split, or <code>null</code> if the job or vertex cannot be found
 * @throws IOException
 *         declared for RPC compatibility
 */
@Override
public InputSplitWrapper requestNextInputSplit(final JobID jobID, final ExecutionVertexID vertexID,
		final IntegerRecord sequenceNumber) throws IOException {

	final ExecutionGraph graph = this.scheduler.getExecutionGraphByID(jobID);
	if (graph == null) {
		LOG.error("Cannot find execution graph to job ID " + jobID);
		return null;
	}

	final ExecutionVertex vertex = graph.getVertexByID(vertexID);
	if (vertex == null) {
		LOG.error("Cannot find execution vertex for vertex ID " + vertexID);
		return null;
	}

	return new InputSplitWrapper(jobID, this.inputSplitManager.getNextInputSplit(vertex,
		sequenceNumber.getValue()));
}

/**
 * Starts the Jetty Infoserver for the Jobmanager. The port is taken from the global
 * configuration; any failure to start the server is logged but not propagated.
 */
public void startInfoServer() {
	final Configuration config = GlobalConfiguration.getConfiguration();
	// Start InfoServer
	try {
		int port = config.getInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY,
			ConfigConstants.DEFAULT_JOB_MANAGER_WEB_FRONTEND_PORT);
		server = new WebInfoServer(config, port, this);
		server.start();
	} catch (FileNotFoundException e) {
		LOG.error(e.getMessage(), e);
	} catch (Exception e) {
		LOG.error("Cannot instantiate info server: " + e.getMessage(), e);
	}
}

// TODO Add to RPC?
/**
 * Returns the list of archived (old) jobs.
 *
 * @return the list of recent job events kept by the archive
 * @throws IOException
 *         thrown if no job archive is available
 */
public List<RecentJobEvent> getOldJobs() throws IOException {

	if (this.archive == null) {
		// BUGFIX: message previously referred to the event collector, but the missing
		// component here is the job archive
		throw new IOException("No instance of the job archive found");
	}

	return this.archive.getJobs();
}

/**
 * Returns the archive listener of this job manager.
 *
 * @return the archive listener
 */
public ArchiveListener getArchive() {
	return this.archive;
}

/**
 * Returns the number of task trackers registered with the instance manager.
 *
 * @return the number of registered task trackers
 */
public int getNumberOfTaskTrackers() {
	return this.instanceManager.getNumberOfTaskTrackers();
}

/**
 * Forwards incoming accumulator results to the accumulator manager.
 *
 * @param accumulatorEvent
 *        the event carrying the job ID and the reported accumulators
 * @throws IOException
 *         declared for RPC compatibility
 */
@Override
public void reportAccumulatorResult(AccumulatorEvent accumulatorEvent) throws IOException {
	this.accumulatorManager.processIncomingAccumulators(accumulatorEvent.getJobID(),
		accumulatorEvent.getAccumulators());
}

/**
 * Returns the current accumulator results for the given job.
 *
 * @param jobID
 *        the ID of the job to return the accumulators for
 * @return an accumulator event holding the job's accumulators
 * @throws IOException
 *         declared for RPC compatibility
 */
@Override
public AccumulatorEvent getAccumulatorResults(JobID jobID) throws IOException {
	return new AccumulatorEvent(jobID, this.accumulatorManager.getJobAccumulators(jobID), false);
}
}