/*********************************************************************************************************************** * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. **********************************************************************************************************************/ package eu.stratosphere.nephele.taskmanager; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.NetworkInterface; import java.net.ServerSocket; import java.net.Socket; import java.net.SocketAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.security.UserGroupInformation; import eu.stratosphere.api.common.cache.DistributedCache; import eu.stratosphere.configuration.ConfigConstants; import eu.stratosphere.configuration.Configuration; import eu.stratosphere.configuration.GlobalConfiguration; import eu.stratosphere.core.fs.Path; import eu.stratosphere.nephele.deployment.TaskDeploymentDescriptor; import eu.stratosphere.nephele.execution.ExecutionState; import eu.stratosphere.nephele.execution.RuntimeEnvironment; import eu.stratosphere.nephele.execution.librarycache.LibraryCacheManager; import eu.stratosphere.nephele.execution.librarycache.LibraryCacheProfileRequest; import eu.stratosphere.nephele.execution.librarycache.LibraryCacheProfileResponse; import eu.stratosphere.nephele.execution.librarycache.LibraryCacheUpdate; import eu.stratosphere.nephele.executiongraph.ExecutionVertexID; import eu.stratosphere.nephele.instance.HardwareDescription; import eu.stratosphere.nephele.instance.HardwareDescriptionFactory; import eu.stratosphere.nephele.instance.InstanceConnectionInfo; import eu.stratosphere.nephele.ipc.RPC; import eu.stratosphere.nephele.ipc.Server; import eu.stratosphere.nephele.jobgraph.JobID; import eu.stratosphere.nephele.net.NetUtils; import eu.stratosphere.nephele.profiling.ProfilingUtils; import eu.stratosphere.nephele.profiling.TaskManagerProfiler; import eu.stratosphere.nephele.protocols.AccumulatorProtocol; import eu.stratosphere.nephele.protocols.ChannelLookupProtocol; import eu.stratosphere.nephele.protocols.InputSplitProviderProtocol; import eu.stratosphere.nephele.protocols.JobManagerProtocol; import eu.stratosphere.nephele.protocols.TaskOperationProtocol; import eu.stratosphere.nephele.services.iomanager.IOManager; import eu.stratosphere.nephele.services.memorymanager.MemoryManager; import eu.stratosphere.nephele.services.memorymanager.spi.DefaultMemoryManager; import eu.stratosphere.nephele.util.SerializableArrayList; import eu.stratosphere.pact.runtime.cache.FileCache; import eu.stratosphere.runtime.io.channels.ChannelID; import eu.stratosphere.runtime.io.network.ChannelManager; import eu.stratosphere.runtime.io.network.InsufficientResourcesException; import eu.stratosphere.util.StringUtils; /** * A task manager receives tasks from the job manager and executes them. After having executed them * (or in case of an execution error) it reports the execution result back to the job manager. * Task managers are able to automatically discover the job manager and receive its configuration from it * as long as the job manager is running on the same local network * */ public class TaskManager implements TaskOperationProtocol { private static final Log LOG = LogFactory.getLog(TaskManager.class); private final static int FAILURE_RETURN_CODE = -1; private static final int IPC_HANDLER_COUNT = 1; public final static String ARG_CONF_DIR = "tempDir"; private final JobManagerProtocol jobManager; private final InputSplitProviderProtocol globalInputSplitProvider; private final ChannelLookupProtocol lookupService; private final ExecutorService executorService = Executors.newCachedThreadPool(ExecutorThreadFactory.INSTANCE); private final AccumulatorProtocol accumulatorProtocolProxy; private final Server taskManagerServer; private final FileCache fileCache = new FileCache(); /** * This map contains all the tasks whose threads are in a state other than TERMINATED. If any task * is stored inside this map and its thread status is TERMINATED, this indicates a virtual machine error. * As a result, task status will switch to FAILED and reported to the {@link eu.stratosphere.nephele.jobmanager.JobManager}. */ private final Map<ExecutionVertexID, Task> runningTasks = new ConcurrentHashMap<ExecutionVertexID, Task>(); private final InstanceConnectionInfo localInstanceConnectionInfo; /** * The instance of the {@link ChannelManager} which is responsible for * setting up and cleaning up the byte buffered channels of the tasks. */ private final ChannelManager channelManager; /** * Instance of the task manager profile if profiling is enabled. */ private final TaskManagerProfiler profiler; private final MemoryManager memoryManager; private final IOManager ioManager; private static HardwareDescription hardwareDescription = null; private final Thread heartbeatThread; private final AtomicBoolean shutdownStarted = new AtomicBoolean(false); /** Stores whether the task manager has already been shut down. */ private volatile boolean shutdownComplete; /** * Constructs a new task manager, starts its IPC service and attempts to discover the job manager to * receive an initial configuration. All parameters are obtained from the * {@link GlobalConfiguration}, which must be loaded prior to instantiating the task manager. */ public TaskManager() throws Exception { LOG.info("TaskManager started as user " + UserGroupInformation.getCurrentUser().getShortUserName()); LOG.info("User system property: " + System.getProperty("user.name")); // IMPORTANT! At this point, the GlobalConfiguration must have been read! final InetSocketAddress jobManagerAddress; { LOG.info("Reading location of job manager from configuration"); final String address = GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null); final int port = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT); if (address == null) { throw new Exception("Job manager address not configured in the GlobalConfiguration."); } // Try to convert configured address to {@link InetAddress} try { final InetAddress tmpAddress = InetAddress.getByName(address); jobManagerAddress = new InetSocketAddress(tmpAddress, port); } catch (UnknownHostException e) { LOG.fatal("Could not resolve JobManager host name."); throw new Exception("Could not resolve JobManager host name: " + e.getMessage(), e); } LOG.info("Connecting to JobManager at: " + jobManagerAddress); } // Create RPC connection to the JobManager try { this.jobManager = RPC.getProxy(JobManagerProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal("Could not connect to the JobManager: " + e.getMessage(), e); throw new Exception("Failed to initialize connection to JobManager: " + e.getMessage(), e); } int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, -1); int dataPort = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, -1); if (ipcPort == -1) { ipcPort = getAvailablePort(); } if (dataPort == -1) { dataPort = getAvailablePort(); } // Determine our own public facing address and start the server { final InetAddress taskManagerAddress; try { taskManagerAddress = getTaskManagerAddress(jobManagerAddress); } catch (Exception e) { throw new RuntimeException("The TaskManager failed to determine its own network address.", e); } this.localInstanceConnectionInfo = new InstanceConnectionInfo(taskManagerAddress, ipcPort, dataPort); LOG.info("TaskManager connection information:" + this.localInstanceConnectionInfo); // Start local RPC server try { this.taskManagerServer = RPC.getServer(this, taskManagerAddress.getHostAddress(), ipcPort, IPC_HANDLER_COUNT); this.taskManagerServer.start(); } catch (IOException e) { LOG.fatal("Failed to start TaskManager server. " + e.getMessage(), e); throw new Exception("Failed to start taskmanager server. " + e.getMessage(), e); } } // Try to create local stub of the global input split provider try { this.globalInputSplitProvider = RPC.getProxy(InputSplitProviderProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal(e.getMessage(), e); throw new Exception("Failed to initialize connection to global input split provider: " + e.getMessage(), e); } // Try to create local stub for the lookup service try { this.lookupService = RPC.getProxy(ChannelLookupProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal(e.getMessage(), e); throw new Exception("Failed to initialize channel lookup protocol. " + e.getMessage(), e); } // Try to create local stub for the accumulators try { this.accumulatorProtocolProxy = RPC.getProxy(AccumulatorProtocol.class, jobManagerAddress, NetUtils.getSocketFactory()); } catch (IOException e) { LOG.fatal("Failed to initialize accumulator protocol: " + e.getMessage(), e); throw new Exception("Failed to initialize accumulator protocol: " + e.getMessage(), e); } // Load profiler if it should be used if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) { final String profilerClassName = GlobalConfiguration.getString(ProfilingUtils.TASKMANAGER_CLASSNAME_KEY, "eu.stratosphere.nephele.profiling.impl.TaskManagerProfilerImpl"); this.profiler = ProfilingUtils.loadTaskManagerProfiler(profilerClassName, jobManagerAddress.getAddress(), this.localInstanceConnectionInfo); if (this.profiler == null) { LOG.error("Cannot find class name for the profiler."); } else { LOG.info("Profiling of jobs is enabled."); } } else { this.profiler = null; LOG.info("Profiling of jobs is disabled."); } // Get the directory for storing temporary files final String[] tmpDirPaths = GlobalConfiguration.getString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH).split(",|"+File.pathSeparator); checkTempDirs(tmpDirPaths); final int pageSize = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE); // Initialize network buffer pool int numBuffers = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_NUM_BUFFERS); int bufferSize = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETWORK_BUFFER_SIZE_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_BUFFER_SIZE); int numInThreads = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETTY_NUM_IN_THREADS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETTY_NUM_IN_THREADS); int numOutThreads = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETTY_NUM_OUT_THREADS_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_NETTY_NUM_OUT_THREADS); int lowWaterMark = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETTY_LOW_WATER_MARK, ConfigConstants.DEFAULT_TASK_MANAGER_NETTY_LOW_WATER_MARK); int highWaterMark = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_NETTY_HIGH_WATER_MARK, ConfigConstants.DEFAULT_TASK_MANAGER_NETTY_HIGH_WATER_MARK); // Initialize the channel manager try { this.channelManager = new ChannelManager( this.lookupService, this.localInstanceConnectionInfo, numBuffers, bufferSize, numInThreads, numOutThreads, lowWaterMark, highWaterMark); } catch (IOException ioe) { LOG.error(StringUtils.stringifyException(ioe)); throw new Exception("Failed to instantiate Byte-buffered channel manager. " + ioe.getMessage(), ioe); } { HardwareDescription resources = HardwareDescriptionFactory.extractFromSystem(); // Check whether the memory size has been explicitly configured. if so that overrides the default mechanism // of taking as much as is mentioned in the hardware description long memorySize = GlobalConfiguration.getInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1); if (memorySize > 0) { // manually configured memory size. override the value in the hardware config resources = HardwareDescriptionFactory.construct(resources.getNumberOfCPUCores(), resources.getSizeOfPhysicalMemory(), memorySize * 1024L * 1024L); } this.hardwareDescription = resources; // Initialize the memory manager LOG.info("Initializing memory manager with " + (resources.getSizeOfFreeMemory() >>> 20) + " megabytes of memory. " + "Page size is " + pageSize + " bytes."); try { @SuppressWarnings("unused") final boolean lazyAllocation = GlobalConfiguration.getBoolean(ConfigConstants.TASK_MANAGER_MEMORY_LAZY_ALLOCATION_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_MEMORY_LAZY_ALLOCATION); this.memoryManager = new DefaultMemoryManager(resources.getSizeOfFreeMemory(), pageSize); } catch (Throwable t) { LOG.fatal("Unable to initialize memory manager with " + (resources.getSizeOfFreeMemory() >>> 20) + " megabytes of memory.", t); throw new Exception("Unable to initialize memory manager.", t); } } this.ioManager = new IOManager(tmpDirPaths); this.heartbeatThread = new Thread() { @Override public void run() { runHeartbeatLoop(); } }; this.heartbeatThread.setName("Heartbeat Thread"); this.heartbeatThread.start(); } private int getAvailablePort() { ServerSocket serverSocket = null; int port = 0; for (int i = 0; i < 50; i++){ try { serverSocket = new ServerSocket(0); port = serverSocket.getLocalPort(); if (port != 0) { serverSocket.close(); break; } } catch (IOException e) { LOG.debug("Unable to allocate port " + e.getMessage(), e); } } if (!serverSocket.isClosed()) { try { serverSocket.close(); } catch (IOException e) { LOG.debug("error closing port",e); } } return port; } /** * Entry point for the program. * * @param args * arguments from the command line * @throws IOException */ @SuppressWarnings("static-access") public static void main(String[] args) throws IOException { Option configDirOpt = OptionBuilder.withArgName("config directory").hasArg().withDescription( "Specify configuration directory.").create("configDir"); // tempDir option is used by the YARN client. Option tempDir = OptionBuilder.withArgName("temporary directory (overwrites configured option)") .hasArg().withDescription( "Specify temporary directory.").create(ARG_CONF_DIR); configDirOpt.setRequired(true); tempDir.setRequired(false); Options options = new Options(); options.addOption(configDirOpt); options.addOption(tempDir); CommandLineParser parser = new GnuParser(); CommandLine line = null; try { line = parser.parse(options, args); } catch (ParseException e) { System.err.println("CLI Parsing failed. Reason: " + e.getMessage()); System.exit(FAILURE_RETURN_CODE); } String configDir = line.getOptionValue(configDirOpt.getOpt(), null); String tempDirVal = line.getOptionValue(tempDir.getOpt(), null); // First, try to load global configuration GlobalConfiguration.loadConfiguration(configDir); if(tempDirVal != null // the YARN TM runner has set a value for the temp dir // the configuration does not contain a temp direcory && GlobalConfiguration.getString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, null) == null) { Configuration c = GlobalConfiguration.getConfiguration(); c.setString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, tempDirVal); LOG.info("Setting temporary directory to "+tempDirVal); GlobalConfiguration.includeConfiguration(c); } System.err.println("Configuration "+GlobalConfiguration.getConfiguration()); LOG.info("Current user "+UserGroupInformation.getCurrentUser().getShortUserName()); { // log the available JVM memory long maxMemoryMiBytes = Runtime.getRuntime().maxMemory() >>> 20; LOG.info("Starting TaskManager in a JVM with " + maxMemoryMiBytes + " MiBytes maximum heap size."); } // Create a new task manager object try { new TaskManager(); } catch (Exception e) { LOG.fatal("Taskmanager startup failed: " + e.getMessage(), e); System.exit(FAILURE_RETURN_CODE); } // park the main thread to keep the JVM alive (all other threads may be daemon threads) Object mon = new Object(); synchronized (mon) { try { mon.wait(); } catch (InterruptedException ex) {} } } /** * This method send the periodic heartbeats. */ private void runHeartbeatLoop() { final long interval = GlobalConfiguration.getInteger( ConfigConstants.TASK_MANAGER_HEARTBEAT_INTERVAL_KEY, ConfigConstants.DEFAULT_TASK_MANAGER_HEARTBEAT_INTERVAL); while (!shutdownStarted.get()) { // send heart beat try { LOG.debug("heartbeat"); this.jobManager.sendHeartbeat(this.localInstanceConnectionInfo, this.hardwareDescription); } catch (IOException e) { if (shutdownStarted.get()) { break; } else { LOG.error("Sending the heart beat caused an exception: " + e.getMessage(), e); } } // sleep until the next heart beat try { Thread.sleep(interval); } catch (InterruptedException e) { if (!shutdownStarted.get()) { LOG.error("TaskManager heart beat loop was interrupted without shutdown."); } } } } /** * The states of address detection mechanism. * There is only a state transition if the current state failed to determine the address. */ private enum AddressDetectionState { ADDRESS(50), //detect own IP based on the JobManagers IP address. Look for common prefix FAST_CONNECT(50), //try to connect to the JobManager on all Interfaces and all their addresses. //this state uses a low timeout (say 50 ms) for fast detection. SLOW_CONNECT(1000); //same as FAST_CONNECT, but with a timeout of 1000 ms (1s). private int timeout; AddressDetectionState(int timeout) { this.timeout = timeout; } public int getTimeout() { return timeout; } } /** * Find out the TaskManager's own IP address. */ private InetAddress getTaskManagerAddress(InetSocketAddress jobManagerAddress) throws IOException { AddressDetectionState strategy = AddressDetectionState.ADDRESS; while (true) { Enumeration<NetworkInterface> e = NetworkInterface.getNetworkInterfaces(); while (e.hasMoreElements()) { NetworkInterface n = e.nextElement(); Enumeration<InetAddress> ee = n.getInetAddresses(); while (ee.hasMoreElements()) { InetAddress i = ee.nextElement(); switch (strategy) { case ADDRESS: if (hasCommonPrefix(jobManagerAddress.getAddress().getAddress(), i.getAddress())) { if (tryToConnect(i, jobManagerAddress, strategy.getTimeout())) { LOG.info("Determined " + i + " as the TaskTracker's own IP address"); return i; } } break; case FAST_CONNECT: case SLOW_CONNECT: boolean correct = tryToConnect(i, jobManagerAddress, strategy.getTimeout()); if (correct) { LOG.info("Determined " + i + " as the TaskTracker's own IP address"); return i; } break; default: throw new RuntimeException("Unkown address detection strategy: " + strategy); } } } // state control switch (strategy) { case ADDRESS: strategy = AddressDetectionState.FAST_CONNECT; break; case FAST_CONNECT: strategy = AddressDetectionState.SLOW_CONNECT; break; case SLOW_CONNECT: throw new RuntimeException("The TaskManager failed to detect its own IP address"); } if (LOG.isDebugEnabled()) { LOG.debug("Defaulting to detection strategy " + strategy); } } } /** * Checks if two addresses have a common prefix (first 2 bytes). * Example: 192.168.???.??? * Works also with ipv6, but accepts probably too many addresses */ private static boolean hasCommonPrefix(byte[] address, byte[] address2) { return address[0] == address2[0] && address[1] == address2[1]; } public static boolean tryToConnect(InetAddress fromAddress, SocketAddress toSocket, int timeout) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug("Trying to connect to JobManager (" + toSocket + ") from local address " + fromAddress + " with timeout " + timeout); } boolean connectable = true; Socket socket = null; try { socket = new Socket(); SocketAddress bindP = new InetSocketAddress(fromAddress, 0); // 0 = let the OS choose the port on this // machine socket.bind(bindP); socket.connect(toSocket, timeout); } catch (Exception ex) { LOG.info("Failed to determine own IP address from '" + fromAddress + "': " + ex.getMessage()); if (LOG.isDebugEnabled()) { LOG.debug("Failed with exception", ex); } connectable = false; } finally { if (socket != null) { socket.close(); } } return connectable; } @Override public TaskCancelResult cancelTask(final ExecutionVertexID id) throws IOException { final Task task = this.runningTasks.get(id); if (task == null) { final TaskCancelResult taskCancelResult = new TaskCancelResult(id, AbstractTaskResult.ReturnCode.TASK_NOT_FOUND); taskCancelResult.setDescription("No task with ID " + id + " is currently running"); return taskCancelResult; } // Pass call to executor service so IPC thread can return immediately final Runnable r = new Runnable() { @Override public void run() { // Finally, request user code to cancel task.cancelExecution(); } }; this.executorService.execute(r); return new TaskCancelResult(id, AbstractTaskResult.ReturnCode.SUCCESS); } @Override public TaskKillResult killTask(final ExecutionVertexID id) throws IOException { final Task task = this.runningTasks.get(id); if (task == null) { final TaskKillResult taskKillResult = new TaskKillResult(id, AbstractTaskResult.ReturnCode.TASK_NOT_FOUND); taskKillResult.setDescription("No task with ID + " + id + " is currently running"); return taskKillResult; } // Pass call to executor service so IPC thread can return immediately final Runnable r = new Runnable() { @Override public void run() { // Finally, request user code to cancel task.killExecution(); } }; this.executorService.execute(r); return new TaskKillResult(id, AbstractTaskResult.ReturnCode.SUCCESS); } @Override public List<TaskSubmissionResult> submitTasks(final List<TaskDeploymentDescriptor> tasks) throws IOException { final List<TaskSubmissionResult> submissionResultList = new SerializableArrayList<TaskSubmissionResult>(); final List<Task> tasksToStart = new ArrayList<Task>(); // Make sure all tasks are fully registered before they are started for (final TaskDeploymentDescriptor tdd : tasks) { final JobID jobID = tdd.getJobID(); final ExecutionVertexID vertexID = tdd.getVertexID(); RuntimeEnvironment re; // retrieve the registered cache files from job configuration and create the local tmp file. Map<String, FutureTask<Path>> cpTasks = new HashMap<String, FutureTask<Path>>(); for (Entry<String, String> e: DistributedCache.getCachedFile(tdd.getJobConfiguration())) { FutureTask<Path> cp = this.fileCache.createTmpFile(e.getKey(), e.getValue(), jobID); cpTasks.put(e.getKey(), cp); } try { re = new RuntimeEnvironment(tdd, this.memoryManager, this.ioManager, new TaskInputSplitProvider(jobID, vertexID, this.globalInputSplitProvider), this.accumulatorProtocolProxy, cpTasks); } catch (Throwable t) { final TaskSubmissionResult result = new TaskSubmissionResult(vertexID, AbstractTaskResult.ReturnCode.DEPLOYMENT_ERROR); result.setDescription(StringUtils.stringifyException(t)); LOG.error(result.getDescription(), t); submissionResultList.add(result); continue; } final Configuration jobConfiguration = tdd.getJobConfiguration(); // Register the task Task task; try { task = createAndRegisterTask(vertexID, jobConfiguration, re); } catch (InsufficientResourcesException e) { final TaskSubmissionResult result = new TaskSubmissionResult(vertexID, AbstractTaskResult.ReturnCode.INSUFFICIENT_RESOURCES); result.setDescription(e.getMessage()); LOG.error(result.getDescription(), e); submissionResultList.add(result); continue; } if (task == null) { final TaskSubmissionResult result = new TaskSubmissionResult(vertexID, AbstractTaskResult.ReturnCode.TASK_NOT_FOUND); result.setDescription("Task " + re.getTaskNameWithIndex() + " (" + vertexID + ") was already running"); LOG.error(result.getDescription()); submissionResultList.add(result); continue; } submissionResultList.add(new TaskSubmissionResult(vertexID, AbstractTaskResult.ReturnCode.SUCCESS)); tasksToStart.add(task); } // Now start the tasks for (final Task task : tasksToStart) { task.startExecution(); } return submissionResultList; } /** * Registers an newly incoming runtime task with the task manager. * * @param id * the ID of the task to register * @param jobConfiguration * the job configuration that has been attached to the original job graph * @param environment * the environment of the task to be registered * @return the task to be started or <code>null</code> if a task with the same ID was already running */ private Task createAndRegisterTask(final ExecutionVertexID id, final Configuration jobConfiguration, final RuntimeEnvironment environment) throws InsufficientResourcesException, IOException { if (id == null) { throw new IllegalArgumentException("Argument id is null"); } if (environment == null) { throw new IllegalArgumentException("Argument environment is null"); } // Task creation and registration must be atomic Task task; synchronized (this) { final Task runningTask = this.runningTasks.get(id); boolean registerTask = true; if (runningTask == null) { task = new Task(id, environment, this); } else { if (runningTask instanceof Task) { // Task is already running return null; } else { // There is already a replay task running, we will simply restart it task = runningTask; registerTask = false; } } if (registerTask) { // Register the task with the byte buffered channel manager this.channelManager.register(task); boolean enableProfiling = false; if (this.profiler != null && jobConfiguration.getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) { enableProfiling = true; } // Register environment, input, and output gates for profiling if (enableProfiling) { task.registerProfiler(this.profiler, jobConfiguration); } this.runningTasks.put(id, task); } } return task; } /** * Unregisters a finished or aborted task. * * @param id * the ID of the task to be unregistered */ private void unregisterTask(final ExecutionVertexID id) { // Task de-registration must be atomic synchronized (this) { final Task task = this.runningTasks.remove(id); if (task == null) { LOG.error("Cannot find task with ID " + id + " to unregister"); return; } // remove the local tmp file for unregistered tasks. for (Entry<String, String> e: DistributedCache.getCachedFile(task.getEnvironment().getJobConfiguration())) { this.fileCache.deleteTmpFile(e.getKey(), task.getJobID()); } // Unregister task from the byte buffered channel manager this.channelManager.unregister(id, task); // Unregister task from profiling task.unregisterProfiler(this.profiler); // Unregister task from memory manager task.unregisterMemoryManager(this.memoryManager); // Unregister task from library cache manager try { LibraryCacheManager.unregister(task.getJobID()); } catch (IOException e) { if (LOG.isDebugEnabled()) { LOG.debug("Unregistering the job vertex ID " + id + " caused an IOException"); } } } } @Override public LibraryCacheProfileResponse getLibraryCacheProfile(LibraryCacheProfileRequest request) throws IOException { LibraryCacheProfileResponse response = new LibraryCacheProfileResponse(request); String[] requiredLibraries = request.getRequiredLibraries(); for (int i = 0; i < requiredLibraries.length; i++) { if (LibraryCacheManager.contains(requiredLibraries[i]) == null) { response.setCached(i, false); } else { response.setCached(i, true); } } return response; } @Override public void updateLibraryCache(LibraryCacheUpdate update) throws IOException { // Nothing to to here } public void executionStateChanged(final JobID jobID, final ExecutionVertexID id, final ExecutionState newExecutionState, final String optionalDescription) { // Don't propagate state CANCELING back to the job manager if (newExecutionState == ExecutionState.CANCELING) { return; } if (newExecutionState == ExecutionState.FINISHED || newExecutionState == ExecutionState.CANCELED || newExecutionState == ExecutionState.FAILED) { // Unregister the task (free all buffers, remove all channels, task-specific class loaders, etc...) unregisterTask(id); } // Get lock on the jobManager object and propagate the state change synchronized (this.jobManager) { try { this.jobManager.updateTaskExecutionState(new TaskExecutionState(jobID, id, newExecutionState, optionalDescription)); } catch (IOException e) { LOG.error(e); } } } /** * Shuts the task manager down. */ public void shutdown() { if (!this.shutdownStarted.compareAndSet(false, true)) { return; } LOG.info("Shutting down TaskManager"); // first, stop the heartbeat thread and wait for it to terminate this.heartbeatThread.interrupt(); try { this.heartbeatThread.join(1000); } catch (InterruptedException e) {} // Stop RPC proxy for the task manager RPC.stopProxy(this.jobManager); // Stop RPC proxy for the global input split assigner RPC.stopProxy(this.globalInputSplitProvider); // Stop RPC proxy for the lookup service RPC.stopProxy(this.lookupService); // Stop RPC proxy for accumulator reports RPC.stopProxy(this.accumulatorProtocolProxy); // Shut down the own RPC server this.taskManagerServer.stop(); // Stop profiling if enabled if (this.profiler != null) { this.profiler.shutdown(); } // Shut down the network channel manager this.channelManager.shutdown(); // Shut down the memory manager if (this.ioManager != null) { this.ioManager.shutdown(); } if (this.memoryManager != null) { this.memoryManager.shutdown(); } this.fileCache.shutdown(); // Shut down the executor service if (this.executorService != null) { this.executorService.shutdown(); try { this.executorService.awaitTermination(5000L, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { if (LOG.isDebugEnabled()) { LOG.debug(e); } } } this.shutdownComplete = true; } /** * Checks whether the task manager has already been shut down. * * @return <code>true</code> if the task manager has already been shut down, <code>false</code> otherwise */ public boolean isShutDown() { return this.shutdownComplete; } @Override public void logBufferUtilization() { this.channelManager.logBufferUtilization(); } @Override public void killTaskManager() throws IOException { // Kill the entire JVM after a delay of 10ms, so this RPC will finish properly before final Timer timer = new Timer(); final TimerTask timerTask = new TimerTask() { @Override public void run() { System.exit(0); } }; timer.schedule(timerTask, 10L); } @Override public void invalidateLookupCacheEntries(final Set<ChannelID> channelIDs) throws IOException { this.channelManager.invalidateLookupCacheEntries(channelIDs); } /** * Checks, whether the given strings describe existing directories that are writable. If that is not * the case, an exception is raised. * * @param tempDirs * An array of strings which are checked to be paths to writable directories. * @throws Exception * Thrown, if any of the mentioned checks fails. */ private static final void checkTempDirs(final String[] tempDirs) throws Exception { for (int i = 0; i < tempDirs.length; ++i) { final String dir = tempDirs[i]; if (dir == null) { throw new Exception("Temporary file directory #" + (i + 1) + " is null."); } final File f = new File(dir); if (!f.exists()) { throw new Exception("Temporary file directory '" + f.getAbsolutePath() + "' does not exist."); } if (!f.isDirectory()) { throw new Exception("Temporary file directory '" + f.getAbsolutePath() + "' is not a directory."); } if (!f.canWrite()) { throw new Exception("Temporary file directory '" + f.getAbsolutePath() + "' is not writable."); } } } }