/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.corona.InetAddress;
import org.apache.hadoop.corona.ResourceGrant;
import org.apache.hadoop.corona.ResourceRequest;
import org.apache.hadoop.corona.SessionDriver;
import org.apache.hadoop.corona.SessionDriverService;
import org.apache.hadoop.corona.SessionStatus;
import org.apache.hadoop.corona.Utilities;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.VersionInfo;

/**
 * Tracker for a single job. This can run jobs one-at-a-time.
 */
@SuppressWarnings("deprecation")
public class CoronaJobTracker extends JobTrackerTraits
  implements JobSubmissionProtocol,
             SessionDriverService.Iface,
             InterTrackerProtocol,
             ResourceTracker.ResourceProcessor {

  static {
    Thread.setDefaultUncaughtExceptionHandler(
      new Thread.UncaughtExceptionHandler() {
        @Override
        public void uncaughtException(Thread t, Throwable e) {
          LOG.error("UNCAUGHT: Thread " + t.getName() +
              " got an uncaught exception", e);
          System.exit(1);
        }
      });
  }

  public static final Log LOG = LogFactory.getLog(CoronaJobTracker.class);

  static long TASKTRACKER_EXPIRY_INTERVAL = 10 * 60 * 1000;

  public static final String TT_CONNECT_TIMEOUT_MSEC_KEY =
    "corona.tasktracker.connect.timeout.msec";
  public static final String HEART_BEAT_INTERVAL_KEY =
    "corona.jobtracker.heartbeat.interval";

  JobConf conf; // JT conf.
  FileSystem fs;

  // Handles the session with the cluster manager.
  SessionDriver sessionDriver;
  String sessionId;

  // Variable to atomically check if more than one job is attempted to be
  // launched via this jobtracker.
  AtomicInteger jobCounter = new AtomicInteger();

  // Identifier for the current job.
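  // There is exactly one job per CoronaJobTracker instance; its id is
  // fixed when the tracker is created, based on the Corona session id
  // (see the constructor and getNewJobId()).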
  final JobID jobId;
  CoronaJobInProgress job;
  ResourceTracker resourceTracker;
  List<ResourceGrant> grantsToRevoke = new ArrayList<ResourceGrant>();

  volatile boolean running = true;
  Thread assignTasksThread;

  InetSocketAddress jobTrackerAddress;
  Server interTrackerServer;
  HttpServer infoServer;

  long startTime = System.currentTimeMillis();

  TaskLookupTable taskLookupTable = new TaskLookupTable();

  Map<String, TaskTrackerStatus> taskTrackerStatus =
    new ConcurrentHashMap<String, TaskTrackerStatus>();

  ExpireLaunchingTasks expireLaunchingTasks = new ExpireLaunchingTasks();
  Thread expireLaunchingTasksThread;

  /**
   * An attempt and its corresponding TaskInProgress.
   * There is a unique TIP per attempt, so the attempt can be used as the
   * unique key to identify this tuple (in a Collection, for example).
   */
  public static final class TaskAttemptIDWithTip
    implements Comparable<TaskAttemptIDWithTip> {
    public final TaskAttemptID attemptId;
    public final TaskInProgress tip;

    public TaskAttemptIDWithTip(TaskAttemptID attemptId, TaskInProgress tip) {
      this.attemptId = attemptId;
      this.tip = tip;
    }

    public boolean equals(Object o) {
      if (!(o instanceof TaskAttemptIDWithTip)) {
        return false;
      }
      TaskAttemptIDWithTip that = (TaskAttemptIDWithTip) o;
      return this.attemptId.equals(that.attemptId);
    }

    public int hashCode() {
      return attemptId.hashCode();
    }

    public int compareTo(TaskAttemptIDWithTip that) {
      return this.attemptId.compareTo(that.attemptId);
    }
  }

  class TaskLookupTable {
    Map<TaskAttemptID, String> taskIdToTrackerMap =
      new HashMap<TaskAttemptID, String>();
    Map<TaskAttemptID, TaskInProgress> taskIdToTIPMap =
      new HashMap<TaskAttemptID, TaskInProgress>();
    Map<String, Set<TaskAttemptIDWithTip>> trackerToTaskMap =
      new HashMap<String, Set<TaskAttemptIDWithTip>>();
    Map<TaskAttemptID, Integer> taskIdToGrantMap =
      new HashMap<TaskAttemptID, Integer>();

    public void createTaskEntry(
        TaskAttemptID taskId, String taskTracker, TaskInProgress tip,
        Integer grant) {
      LOG.info("Adding task (" + tip.getAttemptType(taskId) + ") " +
          "'" + taskId + "' to tip " + tip.getTIPId() +
          ", for tracker '" + taskTracker + "' grant:" + grant);

      synchronized (lockObject) {
        // taskId --> tracker
        taskIdToTrackerMap.put(taskId, taskTracker);

        // tracker --> taskId
        Set<TaskAttemptIDWithTip> taskset = trackerToTaskMap.get(taskTracker);
        if (taskset == null) {
          taskset = new HashSet<TaskAttemptIDWithTip>();
          trackerToTaskMap.put(taskTracker, taskset);
        }
        taskset.add(new TaskAttemptIDWithTip(taskId, tip));

        // taskId --> TIP
        // We never remove this entry.
        taskIdToTIPMap.put(taskId, tip);

        taskIdToGrantMap.put(taskId, grant);
      }
    }

    public void removeTaskEntry(TaskAttemptID taskId) {
      LOG.info("Removing task '" + taskId + "'");
      synchronized (lockObject) {
        // taskId --> tracker
        String tracker = taskIdToTrackerMap.get(taskId);

        // tracker --> taskId
        if (tracker != null) {
          Set<TaskAttemptIDWithTip> taskset = trackerToTaskMap.get(tracker);
          if (taskset != null) {
            // TaskAttemptIDWithTip.equals() uses attemptId equality.
            taskset.remove(new TaskAttemptIDWithTip(taskId, null));
          }
        }

        taskIdToGrantMap.remove(taskId);
      }
    }

    public TaskInProgress getTIP(TaskAttemptID taskId) {
      synchronized (lockObject) {
        return taskIdToTIPMap.get(taskId);
      }
    }

    public TaskAttemptID taskForGrant(ResourceGrant grant) {
      synchronized (lockObject) {
        for (Map.Entry<TaskAttemptID, Integer> entry :
            taskIdToGrantMap.entrySet()) {
          if (entry.getValue().equals(grant.getId())) {
            return entry.getKey();
          }
        }
      }
      return null;
    }

    public Set<Integer> grantsInUseOnTracker(String trackerName) {
      synchronized (lockObject) {
        Set<Integer> grants = new HashSet<Integer>();
        for (TaskAttemptIDWithTip tip : trackerToTaskMap.get(trackerName)) {
          grants.add(taskIdToGrantMap.get(tip.attemptId));
        }
        return grants;
      }
    }

    List<TaskTrackerAction> getTasksToKill(String taskTracker) {
      synchronized (lockObject) {
        Set<TaskAttemptIDWithTip> taskset = trackerToTaskMap.get(taskTracker);
        List<TaskTrackerAction> killList = new ArrayList<TaskTrackerAction>();
        if (taskset != null) {
          for (TaskAttemptIDWithTip onetask : taskset) {
            TaskAttemptID killTaskId = onetask.attemptId;
            TaskInProgress tip = onetask.tip;
            if (tip == null) {
              continue;
            }
            if (tip.shouldClose(killTaskId)) {
              //
              // This is how the JobTracker ends a task at the TaskTracker.
              // It may be successfully completed, or may be killed in
              // mid-execution.
              //
              if (job != null && !job.getStatus().isJobComplete()) {
                killList.add(new KillTaskAction(killTaskId));
                LOG.debug(taskTracker + " -> KillTaskAction: " + killTaskId);
              }
            }
          }
        }
        return killList;
      }
    }

    public Integer getGrantIdForTask(TaskAttemptID taskId) {
      synchronized (lockObject) {
        return taskIdToGrantMap.get(taskId);
      }
    }

    public String getAssignedTracker(TaskAttemptID attempt) {
      synchronized (lockObject) {
        return taskIdToTrackerMap.get(attempt);
      }
    }
  }

  class ExpireLaunchingTasks implements Runnable {
    /**
     * This is a map of the tasks that have been assigned to task trackers,
     * but that have not yet been seen in a status report.
     * map: task-id -> time-assigned
     */
    Map<TaskAttemptID, Long> launchingTasks =
      new LinkedHashMap<TaskAttemptID, Long>();

    public void run() {
      while (running) {
        try {
          expireLaunchingTasks();
          // Check a few times per expiry interval for overdue tasks.
          Thread.sleep(TASKTRACKER_EXPIRY_INTERVAL / 3);
        } catch (InterruptedException ie) {
          // Ignore. If shutting down, the while condition will catch it.
        } catch (Exception e) {
          LOG.error("Expire Launching Task Thread got exception: ", e);
        }
      }
    }

    void expireLaunchingTasks() {
      if (job == null) {
        return;
      }
      long now = JobTracker.getClock().getTime();
      LOG.debug("Starting launching task sweep");
      synchronized (lockObject) {
        Iterator<Map.Entry<TaskAttemptID, Long>> itr =
          launchingTasks.entrySet().iterator();
        while (itr.hasNext()) {
          Map.Entry<TaskAttemptID, Long> pair = itr.next();
          TaskAttemptID taskId = pair.getKey();
          long age = now - (pair.getValue()).longValue();
          if (age > TASKTRACKER_EXPIRY_INTERVAL) {
            LOG.info("Launching task " + taskId + " timed out.");
            failTask(taskId, "Error launching task", false);
            itr.remove();
          }
        }
      }
    }

    public void failedLaunch(TaskAttemptID attempt) {
      synchronized (lockObject) {
        // Check if the attempt exists in the map.
        // It might have expired already.
        if (launchingTasks.containsKey(attempt)) {
          // Set the launch time to a very old value.
          launchingTasks.put(attempt, (long) 0);
          // Make the expire task logic run immediately.
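          // Interrupting the expiry thread wakes it out of its sleep so
          // that expireLaunchingTasks() runs right away and fails this
          // attempt, instead of waiting for the next periodic sweep.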
          expireLaunchingTasksThread.interrupt();
          lockObject.notify();
        }
      }
    }

    public void addNewTask(TaskAttemptID taskName) {
      synchronized (lockObject) {
        launchingTasks.put(taskName, JobTracker.getClock().getTime());
      }
    }

    public void removeTask(TaskAttemptID taskName) {
      synchronized (lockObject) {
        launchingTasks.remove(taskName);
      }
    }
  }

  private void failTask(TaskAttemptID taskId, String reason,
      boolean isFailed) {
    TaskInProgress tip = taskLookupTable.getTIP(taskId);
    Integer grantId = taskLookupTable.getGrantIdForTask(taskId);
    ResourceGrant grant = resourceTracker.getGrant(grantId);
    assert grant != null : "Task " + taskId +
      " is running but has no associated resource";
    String trackerName = grant.getNodeName();
    TaskTrackerStatus trackerStatus = getTaskTrackerStatus(trackerName);

    TaskStatus.Phase phase =
      tip.isMapTask() ? TaskStatus.Phase.MAP : TaskStatus.Phase.STARTING;
    CoronaJobTracker.this.job.failedTask(
      tip, taskId, reason, phase, isFailed, trackerName, trackerStatus);
  }

  static class ActionToSend {
    String trackerHost;
    int port;
    TaskTrackerAction action;

    ActionToSend(String trackerHost, int port, TaskTrackerAction action) {
      this.trackerHost = trackerHost;
      this.action = action;
      this.port = port;
    }
  }

  List<ActionToSend> actionsToSend = new LinkedList<ActionToSend>();
  Thread taskLauncherThread;

  TrackerClientCache trackerClientCache;

  ResourceUpdater resourceUpdater = new ResourceUpdater();
  Thread resourceUpdaterThread;

  private int infoPort;

  private Object lockObject = new Object();
  private Object closeLock = new Object();

  CoronaJobHistory jobHistory;

  private int heartbeatInterval;

  // For testing.
  CoronaJobTracker(JobConf conf, String sessionId, TrackerClientCache cache)
      throws IOException {
    this.conf = conf;
    fs = FileSystem.get(conf);
    this.sessionId = sessionId;
    this.trackerClientCache = cache;
    this.resourceTracker = new ResourceTracker(lockObject);
    this.taskLookupTable = new TaskLookupTable();
    this.jobId = new JobID(sessionId, 1);
    this.jobHistory = new CoronaJobHistory(conf, jobId);
    this.heartbeatInterval = conf.getInt(HEART_BEAT_INTERVAL_KEY, 100);
  }

  public CoronaJobTracker(JobConf conf) throws IOException {
    this.conf = conf;
    fs = FileSystem.get(conf);

    this.resourceTracker = new ResourceTracker(lockObject);
    this.taskLookupTable = new TaskLookupTable();

    // Use the DNS hostname so that Task Trackers can connect to JT.
    jobTrackerAddress = NetUtils.createSocketAddr(
      java.net.InetAddress.getLocalHost().getCanonicalHostName(), 0);
    int handlerCount = conf.getInt("mapred.job.tracker.handler.count", 10);
    this.heartbeatInterval = conf.getInt(HEART_BEAT_INTERVAL_KEY, 3000);

    interTrackerServer = RPC.getServer((InterTrackerProtocol) this,
      jobTrackerAddress.getHostName(), jobTrackerAddress.getPort(),
      handlerCount, false, conf);
    interTrackerServer.start();
    jobTrackerAddress = new InetSocketAddress(
      jobTrackerAddress.getHostName(),
      interTrackerServer.getListenerAddress().getPort());
    LOG.info("CoronaJobTracker up at " + jobTrackerAddress);

    String infoAddr =
      NetUtils.getServerAddress(conf, "mapred.job.tracker.info.bindAddress",
        "mapred.job.tracker.info.port", "mapred.job.tracker.http.address");
    InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
    String infoBindAddress = infoSocAddr.getHostName();
    int tmpInfoPort = infoSocAddr.getPort();
    infoServer = new HttpServer("jt", infoBindAddress, tmpInfoPort,
      tmpInfoPort == 0, conf);
    infoServer.setAttribute("job.tracker", this);
    infoServer.start();
    this.infoPort = this.infoServer.getPort();

    // TODO: we may want to bind the jobtracker to a specific interface?
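    // Publish the web UI host and port back into the configuration so
    // that getProxyUrl(), which reads these keys, can point at this
    // job tracker's web server.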
    String hostname =
      java.net.InetAddress.getLocalHost().getCanonicalHostName();
    this.conf.set("mapred.job.tracker.http.address",
      hostname + ":" + this.infoPort);
    this.conf.setInt("mapred.job.tracker.info.port", this.infoPort);
    this.conf.set("mapred.job.tracker.info.bindAddress", hostname);
    LOG.info("JobTracker webserver: " + this.infoPort);

    assignTasksThread = new Thread(new AssignTasksThread());
    assignTasksThread.setName("assignTasks Thread");
    assignTasksThread.setDaemon(true);
    assignTasksThread.start();

    taskLauncherThread = new Thread(new TaskLauncherThread());
    taskLauncherThread.setName("Task Launcher Thread");
    taskLauncherThread.setDaemon(true);
    taskLauncherThread.start();

    resourceUpdaterThread = new Thread(resourceUpdater);
    resourceUpdaterThread.setName("Resource Updater");
    resourceUpdaterThread.setDaemon(true);
    resourceUpdaterThread.start();

    expireLaunchingTasksThread = new Thread(expireLaunchingTasks);
    expireLaunchingTasksThread.setName("Expire launching tasks");
    expireLaunchingTasksThread.setDaemon(true);
    expireLaunchingTasksThread.start();

    // Create the session driver. This will contact the cluster manager.
    sessionDriver = new SessionDriver(conf, this);
    sessionId = sessionDriver.getSessionId();

    // The job tracker can run only a single job. Its job id is fixed,
    // based on the session id.
    jobId = new JobID(sessionId, 1);

    jobHistory = new CoronaJobHistory(conf, jobId);

    // Initialize history DONE folder.
    if (!jobHistory.isDisabled()) {
      String historyLogDir =
        jobHistory.getCompletedJobHistoryLocation().toString();
      infoServer.setAttribute("historyLogDir", historyLogDir);
      infoServer.setAttribute("conf", conf);
    }

    sessionDriver.setUrl(getUrl());

    this.trackerClientCache = new TrackerClientCache(conf);
  }

  public String getJobTrackerMachine() {
    return jobTrackerAddress.getHostName();
  }

  public String getUrl() throws IOException {
    String jobHistoryFileLocation = jobHistory.getCompletedJobHistoryPath();
    String encodedJobHistoryFileLocation =
      URLEncoder.encode(jobHistoryFileLocation, "UTF-8");
    String url = getProxyUrl(conf,
      "coronajobdetails.jsp?jobid=" + jobId +
      "&jobhistoryfileloc=" + encodedJobHistoryFileLocation);
    return url;
  }

  public static SessionStatus jobToSessionStatus(JobStatus jobStatus) {
    switch (jobStatus.getRunState()) {
    case JobStatus.PREP:
    case JobStatus.RUNNING:
      return SessionStatus.RUNNING;
    case JobStatus.SUCCEEDED:
      return SessionStatus.SUCCESSFUL;
    case JobStatus.FAILED:
      return SessionStatus.FAILED;
    case JobStatus.KILLED:
      return SessionStatus.KILLED;
    default:
      throw new RuntimeException("Unknown job status: " + jobStatus);
    }
  }

  protected void closeIfComplete(boolean closeFromWebUI) throws IOException {
    // Prevent multiple simultaneous executions of this function. We could
    // have the Web UI and JobSubmissionProtocol.killJob() call this, for
    // example.
    synchronized (closeLock) {
      if (this.job.getStatus().isJobComplete()) {
        if (running) {
          running = false;
          close(closeFromWebUI);
        }
      }
    }
  }

  void close(boolean closeFromWebUI) throws IOException {
    try {
      jobHistory.markCompleted();
    } catch (IOException ioe) {
      LOG.warn("Failed to mark job " + jobId + " as completed!", ioe);
    }
    jobHistory.shutdown();

    if (sessionDriver != null) {
      sessionDriver.stop(jobToSessionStatus(job.getStatus()));
    }

    if (interTrackerServer != null) {
      interTrackerServer.stop();
    }

    if (expireLaunchingTasksThread != null) {
      expireLaunchingTasksThread.interrupt();
      try {
        expireLaunchingTasksThread.join();
      } catch (InterruptedException e) {}
      expireLaunchingTasksThread = null;
    }

    if (resourceUpdaterThread != null) {
      resourceUpdaterThread.interrupt();
      try {
        resourceUpdaterThread.join();
      } catch (InterruptedException e) {}
      resourceUpdaterThread = null;
    }

    if (assignTasksThread != null) {
      assignTasksThread.interrupt();
      try {
        assignTasksThread.join();
      } catch (InterruptedException e) {}
      assignTasksThread = null;
    }

    if (sessionDriver != null) {
      try {
        sessionDriver.join();
      } catch (InterruptedException e) {}
      sessionDriver = null;
    }

    // To stop the taskLauncher thread, queue some additional actions and
    // wake it up: the taskLauncher will wake up, dispatch those actions and
    // then terminate (because running == false). If there are no actions,
    // interrupt the launcher thread instead.
    synchronized (actionsToSend) {
      for (org.apache.hadoop.corona.InetAddress addr :
          resourceTracker.allTrackers()) {
        LOG.info("Sending kill job to " + addr.host + ":" + addr.port);
        ActionToSend action = new ActionToSend(addr.host, addr.port,
          new KillJobAction(this.jobId));
        actionsToSend.add(action);
      }
      if (actionsToSend.size() > 0) {
        actionsToSend.notify();
      } else {
        if (taskLauncherThread != null) {
          taskLauncherThread.interrupt();
        }
      }
    }

    if (infoServer != null) {
      if (closeFromWebUI) {
        // If we are being called from the web UI, this function is
        // executing in a web-server thread. Give some time to the
        // web-server to clean up.
        infoServer.setGracefulShutdown(1000);
      }
      try {
        infoServer.stop();
      } catch (Exception ex) {
        LOG.warn("Exception shutting down web server ", ex);
      }
    }
  }

  class AssignTasksThread implements Runnable {
    public void run() {
      while (running) {
        try {
          assignTasks();
        } catch (InterruptedException e) {
          // ignore and let loop check running flag
        } catch (Throwable t) {
          LOG.fatal("assignTasks thread dying because of " +
              StringUtils.stringifyException(t));
          return;
        }
      }
      LOG.info("Terminating AssignTasksThread");
    }
  }

  @Override
  public boolean processAvailableResource(ResourceGrant grant) {
    org.apache.hadoop.corona.InetAddress addr =
      Utilities.appInfoToAddress(grant.appInfo);
    String trackerName = grant.getNodeName();
    boolean isMapGrant =
      grant.getType().equals(ResourceTracker.RESOURCE_TYPE_MAP);
    Task task = getSetupAndCleanupTasks(trackerName, addr.host, isMapGrant);
    if (task == null) {
      TaskInProgress tip = resourceTracker.findTipForGrant(grant);
      if (tip.isMapTask()) {
        task = job.obtainNewMapTaskForTip(trackerName, addr.host, tip);
      } else {
        task = job.obtainNewReduceTaskForTip(trackerName, addr.host, tip);
      }
    }
    if (task != null) {
      TaskAttemptID taskId = task.getTaskID();
      taskLookupTable.createTaskEntry(taskId, trackerName,
        job.getTaskInProgress(taskId.getTaskID()), grant.getId());
      expireLaunchingTasks.addNewTask(task.getTaskID());
      queueTaskForLaunch(task, trackerName, addr);
      return true;
    }
    return false;
  }

  @Override
  public boolean isBadResource(ResourceGrant grant, TaskInProgress tip) {
    org.apache.hadoop.corona.InetAddress addr = grant.address;
    String trackerName = grant.getNodeName();
    return !job.canTrackerBeUsed(trackerName, addr.host, tip);
  }

  /**
   * One iteration of core logic.
   */
  void assignTasks() throws InterruptedException {
    resourceTracker.processAvailableGrants(this);
  }

  void processGrantsToRevoke() {
    Map<ResourceGrant, TaskAttemptID> processed =
      new HashMap<ResourceGrant, TaskAttemptID>();
    Set<String> nodesOfGrants = new HashSet<String>();
    synchronized (lockObject) {
      for (ResourceGrant grant : grantsToRevoke) {
        TaskAttemptID attemptId = taskLookupTable.taskForGrant(grant);
        if (attemptId != null) {
          if (removeFromTasksForLaunch(attemptId)) {
            // Kill the task in the job since it never got launched.
            expireLaunchingTasks.failedLaunch(attemptId);
            continue;
          }
          boolean shouldFail = false;
          boolean killed = killTaskUnprotected(attemptId, shouldFail);
          processed.put(grant, attemptId);
          nodesOfGrants.add(grant.getNodeName());
          // Grant will get removed from the resource tracker
          // when the kill takes effect and we get a response from TT.
        }
      }
      for (String ttNode : nodesOfGrants) {
        queueKillActions(ttNode);
      }
    }
    for (Map.Entry<ResourceGrant, TaskAttemptID> entry :
        processed.entrySet()) {
      LOG.info("Revoking resource " + entry.getKey().getId() +
          " task: " + entry.getValue());
      grantsToRevoke.remove(entry.getKey());
    }
  }

  boolean removeFromTasksForLaunch(TaskAttemptID attempt) {
    synchronized (actionsToSend) {
      Iterator<ActionToSend> actionIter = actionsToSend.iterator();
      while (actionIter.hasNext()) {
        ActionToSend action = actionIter.next();
        if (action.action instanceof LaunchTaskAction &&
            ((LaunchTaskAction) action.action).getTask().getTaskID().
              equals(attempt)) {
          actionIter.remove();
          return true;
        }
      }
    }
    return false;
  }

  void queueTaskForLaunch(Task task, String trackerName,
      org.apache.hadoop.corona.InetAddress addr) {
    CoronaSessionInfo info =
      new CoronaSessionInfo(sessionId, jobTrackerAddress);
    TaskTrackerAction action = new LaunchTaskAction(task, info);
    ActionToSend actionToSend =
      new ActionToSend(addr.host, addr.port, action);
    LOG.info("Queueing a launch task action for " + trackerName +
        " (" + addr.host + ":" + addr.port + ")");
    synchronized (actionsToSend) {
      actionsToSend.add(actionToSend);
      actionsToSend.notify();
    }
  }

  private void queueKillActions(String trackerName) {
    List<TaskTrackerAction> killActions =
      taskLookupTable.getTasksToKill(trackerName);
    org.apache.hadoop.corona.InetAddress addr =
      resourceTracker.getTrackerAddr(trackerName);
    synchronized (actionsToSend) {
      for (TaskTrackerAction killAction : killActions) {
        ActionToSend actionToSend =
          new ActionToSend(addr.host, addr.port, killAction);
        actionsToSend.add(actionToSend);
      }
      actionsToSend.notify();
    }
  }

  static class TrackerClientCache {
    Map<InetSocketAddress, CoronaTaskTrackerProtocol> trackerClients =
      new HashMap<InetSocketAddress, CoronaTaskTrackerProtocol>();
    Configuration conf;

    TrackerClientCache(Configuration conf) {
      this.conf = conf;
    }

    public synchronized CoronaTaskTrackerProtocol getClient(
        InetSocketAddress s) throws IOException {
      CoronaTaskTrackerProtocol client = trackerClients.get(s);
      if (client == null) {
        client = createClient(s);
        trackerClients.put(s, client);
      }
      return client;
    }

    public synchronized void resetClient(InetSocketAddress s) {
      trackerClients.remove(s);
    }

    protected CoronaTaskTrackerProtocol createClient(InetSocketAddress s)
        throws IOException {
      LOG.info("Creating client to " + s.getHostName() + ":" + s.getPort());
      long connectTimeout = conf.getLong(TT_CONNECT_TIMEOUT_MSEC_KEY, 10000L);
      return (CoronaTaskTrackerProtocol) RPC.waitForProxy(
        CoronaTaskTrackerProtocol.class, CoronaTaskTrackerProtocol.versionID,
        s, conf, connectTimeout);
    }

    public synchronized void clearClient(InetSocketAddress s) {
      CoronaTaskTrackerProtocol client = trackerClients.get(s);
      if (client != null) {
        trackerClients.remove(s);
      }
    }
  }

  class TaskLauncherThread implements Runnable {
    public void run() {
      while (running) {
        try {
          launchTasks();
        } catch (InterruptedException e) {
          // ignore, check running flag for termination
        } catch (Throwable t) {
          LOG.fatal("LaunchTaskThread dying because of " +
              StringUtils.stringifyException(t));
          return;
        }
      }
      LOG.info("Terminating TaskLauncher thread");
    }
  }

  void launchTasks() throws InterruptedException {
    List<ActionToSend> actions = new ArrayList<ActionToSend>();
    synchronized (actionsToSend) {
      while (actionsToSend.isEmpty()) {
        actionsToSend.wait();
      }
      actions.addAll(actionsToSend);
      actionsToSend.clear();
    }
    Set<InetSocketAddress> badTrackers = new HashSet<InetSocketAddress>();
    for (ActionToSend actionToSend : actions) {
      // Get the tracker address.
      InetSocketAddress trackerAddress =
        new InetSocketAddress(actionToSend.trackerHost, actionToSend.port);

      if (badTrackers.contains(trackerAddress)) {
        LOG.info("Not sending " + actionToSend.action.getClass() + " to " +
            actionToSend.trackerHost + " since previous communication " +
            "in this run failed");
        processSendingError(actionToSend);
        continue;
      }

      // Fill in the job tracker information.
      CoronaSessionInfo info =
        new CoronaSessionInfo(sessionId, jobTrackerAddress);
      actionToSend.action.setExtensible(info);

      try {
        CoronaTaskTrackerProtocol client =
          trackerClientCache.getClient(trackerAddress);
        client.submitActions(new TaskTrackerAction[]{actionToSend.action});
      } catch (IOException e) {
        LOG.error("Could not send " + actionToSend.action.getClass() +
            " action to " + actionToSend.trackerHost, e);
        trackerClientCache.resetClient(trackerAddress);
        badTrackers.add(trackerAddress);
        processSendingError(actionToSend);
      }
    }
  }

  void processSendingError(ActionToSend actionToSend) {
    if (actionToSend.action instanceof LaunchTaskAction) {
      LaunchTaskAction launchTaskAction =
        (LaunchTaskAction) actionToSend.action;
      TaskAttemptID attempt = launchTaskAction.getTask().getTaskID();
      expireLaunchingTasks.failedLaunch(attempt);
    } else if (actionToSend.action instanceof KillTaskAction) {
      KillTaskAction killTaskAction = (KillTaskAction) actionToSend.action;
      TaskAttemptID attempt = killTaskAction.getTaskID();
      failTask(attempt, "TaskTracker is dead", true);
    }
  }

  /**
   * A thread to update resource requests/releases.
   */
  protected class ResourceUpdater implements Runnable {
    void notifyThread() {
      synchronized (this) {
        this.notify();
      }
    }

    void waitToBeNotified() throws InterruptedException {
      synchronized (this) {
        this.wait(1000L);
      }
    }

    public void run() {
      while (running) {
        try {
          waitToBeNotified();
          updateResources();
        } catch (InterruptedException ie) {
          // Ignore. If shutting down, the while condition will catch it.
        } catch (Exception e) {
          LOG.error("Resource Updater Thread got exception: ", e);
        }
      }
    }

    public void updateResources() throws IOException {
      if (job == null) {
        return;
      }

      processGrantsToRevoke();

      // Update resource requests based on speculation.
      if (job.getStatus().getRunState() == JobStatus.RUNNING) {
        job.updateSpeculationRequests();
      }

      if (sessionDriver != null) {
        List<ResourceRequest> newRequests =
          resourceTracker.getWantedResources();
        if (!newRequests.isEmpty()) {
          sessionDriver.requestResources(newRequests);
        }
        List<ResourceRequest> toRelease =
          resourceTracker.getResourcesToRelease();
        if (!toRelease.isEmpty()) {
          sessionDriver.releaseResources(toRelease);
        }
      }
    }
  }

  Task getSetupAndCleanupTasks(String taskTrackerName, String hostName,
      boolean isMapGrant) {
    Task t = null;
    t = job.obtainJobCleanupTask(taskTrackerName, hostName, isMapGrant);

    if (t == null) {
      t = job.obtainTaskCleanupTask(taskTrackerName, isMapGrant);
    }

    if (t == null) {
      t = job.obtainJobSetupTask(taskTrackerName, hostName, isMapGrant);
    }
    return t;
  }

  void updateTaskStatuses(TaskTrackerStatus status) {
    String trackerName = status.getTrackerName();
    for (TaskStatus report : status.getTaskReports()) {
      report.setTaskTracker(trackerName);
      TaskAttemptID taskId = report.getTaskID();

      // Remove it from the expired task list.
      if (report.getRunState() != TaskStatus.State.UNASSIGNED) {
        expireLaunchingTasks.removeTask(taskId);
      }

      if (!this.jobId.equals(taskId.getJobID())) {
        LOG.warn("Task " + taskId + " belongs to unknown job " +
            taskId.getJobID());
        continue;
      }

      TaskInProgress tip = taskLookupTable.getTIP(taskId);
      if (tip == null) {
        continue;
      }
      // Clone TaskStatus object here, because CoronaJobInProgress
      // or TaskInProgress can modify this object and
      // the changes should not get reflected in TaskTrackerStatus.
      // An old TaskTrackerStatus is used later in countMapTasks, etc.
      job.updateTaskStatus(tip, (TaskStatus) report.clone(), status);

      processFetchFailures(report);
    }
  }

  private void processFetchFailures(TaskStatus taskStatus) {
    List<TaskAttemptID> failedFetchMaps = taskStatus.getFetchFailedMaps();
    if (failedFetchMaps != null) {
      TaskAttemptID reportingAttempt = taskStatus.getTaskID();
      for (TaskAttemptID mapTaskId : failedFetchMaps) {
        TaskInProgress failedFetchMap = taskLookupTable.getTIP(mapTaskId);

        if (failedFetchMap != null) {
          // Gather information about the map which has to be failed,
          // if need be.
          String failedFetchTrackerName =
            taskLookupTable.getAssignedTracker(mapTaskId);
          if (failedFetchTrackerName == null) {
            failedFetchTrackerName = "Lost task tracker";
          }
          ((CoronaJobInProgress) failedFetchMap.getJob()).
            fetchFailureNotification(reportingAttempt, failedFetchMap,
              mapTaskId, failedFetchTrackerName);
        } else {
          LOG.warn("Could not find TIP for " + mapTaskId);
        }
      }
    }
  }

  /**
   * A tracker wants to know if any of its Tasks can be committed.
   */
  List<ActionToSend> getCommitActionsToSend(TaskTrackerStatus tts) {
    synchronized (lockObject) {
      List<ActionToSend> saveList = new ArrayList<ActionToSend>();
      List<TaskStatus> taskStatuses = tts.getTaskReports();
      if (taskStatuses != null) {
        for (TaskStatus taskStatus : taskStatuses) {
          if (taskStatus.getRunState() == TaskStatus.State.COMMIT_PENDING) {
            TaskAttemptID taskId = taskStatus.getTaskID();
            TaskInProgress tip = taskLookupTable.getTIP(taskId);
            if (tip == null) {
              continue;
            }
            if (tip.shouldCommit(taskId)) {
              Integer grant = taskLookupTable.getGrantIdForTask(taskId);
              if (grant != null) {
                InetAddress addr = Utilities.appInfoToAddress(
                  resourceTracker.getGrant(grant).getAppInfo());
                TaskTrackerAction commitAction = new CommitTaskAction(taskId);
                ActionToSend commitActionToSend = new ActionToSend(
                  addr.getHost(), addr.getPort(), commitAction);
                saveList.add(commitActionToSend);
                LOG.debug(tts.getTrackerName() + " -> CommitTaskAction: " +
                    taskId);
              }
            }
          }
        }
      }
      return saveList;
    }
  }
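
  // The methods below create and drive the single job that this tracker
  // runs. The supplied JobID must match the fixed id derived from the
  // Corona session id.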
  CoronaJobInProgress createJob(JobID jobId, JobConf defaultConf)
      throws IOException {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    return new CoronaJobInProgress(lockObject, jobId,
      new Path(getSystemDir()), defaultConf,
      taskLookupTable, resourceTracker, jobHistory, getUrl());
  }

  JobStatus startJob(CoronaJobInProgress jip, SessionDriver driver)
      throws IOException {
    synchronized (lockObject) {
      this.job = jip;
    }
    if (job.isJobEmpty()) {
      job.completeEmptyJob();
    } else if (!job.isSetupCleanupRequired()) {
      job.completeSetup();
    }

    resourceUpdater.notifyThread();

    return job.getStatus();
  }

  CoronaJobInProgress getJob() {
    return job;
  }

  public JobInProgressTraits getJobInProgress(JobID jobId) {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    return (JobInProgressTraits) this.job;
  }

  public long getProtocolVersion(String protocol, long clientVersion)
      throws IOException {
    if (protocol.equals(JobSubmissionProtocol.class.getName())) {
      return JobSubmissionProtocol.versionID;
    } else if (protocol.equals(InterTrackerProtocol.class.getName())) {
      return InterTrackerProtocol.versionID;
    } else {
      throw new IOException("Unknown protocol " + protocol);
    }
  }

  public void killJobFromWebUI(JobID jobId) throws IOException {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    LOG.info("Killing job from Web UI " + jobId);
    job.kill();
    closeIfComplete(true);
  }

  //////////////////////////////////////////////////////////////////////////
  // JobSubmissionProtocol
  //////////////////////////////////////////////////////////////////////////

  /**
   * Returns a unique JobID for a new job.
   * CoronaJobTracker can only run a single job and its id is fixed a priori.
   */
  @Override
  public JobID getNewJobId() throws IOException {
    int value = jobCounter.incrementAndGet();
    if (value > 1) {
      throw new RuntimeException(
        "CoronaJobTracker can only run one job! (value=" + value + ")");
    }
    return jobId;
  }

  @Override
  public JobStatus submitJob(JobID jobId) throws IOException {
    JobConf jobConf = new JobConf(conf);
    CoronaJobInProgress jip = createJob(jobId, jobConf);
    if (sessionDriver != null) {
      sessionDriver.setName(jobConf.getJobName());
    }
    jip.initTasks();
    return startJob(jip, sessionDriver);
  }

  @Override
  public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
    // TODO
    return null;
  }

  @Override
  public void killJob(JobID jobId) throws IOException {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    LOG.info("Killing job " + jobId);
    job.kill();
    closeIfComplete(false);
  }

  @Override
  public void setJobPriority(JobID jobId, String priority)
      throws IOException {
    if (!this.jobId.equals(jobId)) {
      throw new IOException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    throw new UnsupportedOperationException(
      "Changing job priority in CoronaJobTracker is not supported");
  }

  @Override
  public boolean killTask(TaskAttemptID taskId, boolean shouldFail)
      throws IOException {
    synchronized (lockObject) {
      return killTaskUnprotected(taskId, shouldFail);
    }
  }

  private boolean killTaskUnprotected(TaskAttemptID taskId,
      boolean shouldFail) {
    TaskInProgress tip = taskLookupTable.getTIP(taskId);
    return tip.killTask(taskId, shouldFail,
      "Request received to " + (shouldFail ?
"fail" : "kill") + " task '" + taskId + "' by user" ); } @Override public JobProfile getJobProfile(JobID jobId) throws IOException { if (!this.jobId.equals(jobId)) { return null; } else { return this.job.getProfile(); } } @Override public JobStatus getJobStatus(JobID jobId) throws IOException { if (!this.jobId.equals(jobId)) { return null; } else { return this.job.getStatus(); } } @Override public Counters getJobCounters(JobID jobId) throws IOException { if (!this.jobId.equals(jobId)) { return null; } else { return this.job.getCounters(); } } @Override public String getFilesystemName() throws IOException { // TODO: return null; } @Override public JobStatus[] jobsToComplete() { return null; } @Override public JobStatus[] getAllJobs() { return null; } @Override public TaskCompletionEvent[] getTaskCompletionEvents(JobID jobid , int fromEventId, int maxEvents) { if (!this.jobId.equals(jobId)) { return TaskCompletionEvent.EMPTY_ARRAY; } else { return job.getTaskCompletionEvents(fromEventId, maxEvents); } } @Override public String getSystemDir() { Path sysDir = new Path(conf.get("mapred.system.dir", "/tmp/hadoop/mapred/system")); java.net.URI uri = sysDir.toUri(); if (uri.getScheme() != null && uri.getAuthority() != null) { return sysDir.toString(); } else { return fs.makeQualified(sysDir).toString(); } } @Override public JobQueueInfo[] getQueues() { return null; } @Override public JobQueueInfo getQueueInfo(String queue) { return null; } @Override public JobStatus[] getJobsFromQueue(String queue) { return null; } public QueueAclsInfo[] getQueueAclsForCurrentUser() throws IOException { return null; } ////////////////////////////////////////////////////////////////////////////// // SessionDriverService.Iface ////////////////////////////////////////////////////////////////////////////// @Override public void grantResource(String handle, List<ResourceGrant> granted) { LOG.info("Received " + granted.size() + " new grants:" + granted.toString()); resourceTracker.addNewGrants(granted); } @Override public void revokeResource(String handle, List<ResourceGrant> revoked, boolean force) { synchronized(lockObject) { grantsToRevoke.addAll(revoked); } LOG.info("Giving up " + revoked.size() + " grants: " + revoked.toString()); } ///////////////////////////////////////////////////////////////////////////// // InterTrackerProtocol ///////////////////////////////////////////////////////////////////////////// @Override public String getBuildVersion() throws IOException { return VersionInfo.getBuildVersion(); } @Override public HeartbeatResponse heartbeat(TaskTrackerStatus status, boolean restarted, boolean initialContact, boolean acceptNewTasks, short responseId) throws IOException { updateTaskStatuses(status); String trackerName = status.getTrackerName(); // remember the last known status of this task tracker // This is a ConcurrentHashMap, so no lock required. taskTrackerStatus.put(trackerName, status); // Check for tasks whose outputs can be saved List<ActionToSend> commitActionsToSend = getCommitActionsToSend(status); if (commitActionsToSend.size() > 0) { synchronized(actionsToSend) { actionsToSend.addAll(commitActionsToSend); actionsToSend.notify(); } } // Return an empty response since the actions are sent separately. 
    short newResponseId = (short) (responseId + 1);
    HeartbeatResponse response =
      new HeartbeatResponse(newResponseId, new TaskTrackerAction[0]);
    response.setHeartbeatInterval(getNextHeartbeatInterval());

    queueKillActions(trackerName);

    closeIfComplete(false);

    return response;
  }

  private int getNextHeartbeatInterval() {
    return heartbeatInterval;
  }

  @Override
  public void reportTaskTrackerError(String taskTrackerName,
      String errorClass, String errorMessage) throws IOException {
    LOG.warn("reportTaskTrackerError is not implemented in Corona JT, " +
        "params are " + taskTrackerName + "," + errorClass + "," +
        errorMessage);
  }

  @Override
  public ProtocolSignature getProtocolSignature(String protocol,
      long clientVersion, int clientMethodsHash) throws IOException {
    return ProtocolSignature.getProtocolSignature(
      this, protocol, clientVersion, clientMethodsHash);
  }

  public int getInfoPort() {
    return infoPort;
  }

  public TaskTrackerStatus getTaskTrackerStatus(String trackerID) {
    synchronized (lockObject) {
      return taskTrackerStatus.get(trackerID);
    }
  }

  public TaskReport[] getMapTaskReports(JobID jobId) {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    synchronized (lockObject) {
      return super.getMapTaskReportsImpl(jobId);
    }
  }

  public TaskReport[] getReduceTaskReports(JobID jobId) {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    synchronized (lockObject) {
      return super.getReduceTaskReportsImpl(jobId);
    }
  }

  public TaskReport[] getCleanupTaskReports(JobID jobId) {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    synchronized (lockObject) {
      return super.getCleanupTaskReportsImpl(jobId);
    }
  }

  public TaskReport[] getSetupTaskReports(JobID jobId) {
    if (!this.jobId.equals(jobId)) {
      throw new RuntimeException("JobId " + jobId +
          " does not match the expected id of: " + this.jobId);
    }
    synchronized (lockObject) {
      return super.getSetupTaskReportsImpl(jobId);
    }
  }

  public String[] getTaskDiagnostics(TaskAttemptID taskId)
      throws IOException {
    synchronized (lockObject) {
      return super.getTaskDiagnosticsImpl(taskId);
    }
  }

  public String getProxyUrl(String relativeUrl) {
    return getProxyUrl(conf, relativeUrl);
  }

  public String getProxyJTAddr() {
    return getProxyJTAddr(conf);
  }

  public static String getProxyJTAddr(Configuration conf) {
    return conf.get("mapred.job.tracker.corona.proxyaddr", null);
  }

  public static String getProxyUrl(Configuration conf, String relativeUrl) {
    String proxyJtAddr = getProxyJTAddr(conf);
    if ((proxyJtAddr != null) && (proxyJtAddr.length() > 0)) {
      String ret = "http://" + proxyJtAddr + "/proxy?host=" +
        conf.get("mapred.job.tracker.info.bindAddress") + "&port=" +
        conf.get("mapred.job.tracker.info.port") + "&path=";
      int qIndex = relativeUrl.indexOf('?');
      String path = (qIndex == -1) ?
        relativeUrl : relativeUrl.substring(0, qIndex);
      String params = (qIndex == -1) ? null :
        ((qIndex == (relativeUrl.length() - 1)) ?
          null : relativeUrl.substring(qIndex + 1));
      return ret + path + ((params == null) ? "" : ("&" + params));
    } else {
      return relativeUrl;
    }
  }

  public String getClusterManagerUrl() {
    String httpConf = conf.get("cm.server.http.address");
    if (httpConf != null) {
      return "http://" + httpConf;
    } else {
      return "NONE";
    }
  }
}