package org.apache.hadoop.mapred; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.corona.ResourceGrant; import org.apache.hadoop.corona.ResourceRequest; import org.apache.hadoop.corona.SessionDriver; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapred.CoronaStateUpdate.TaskLaunch; import org.apache.hadoop.mapred.CoronaStateUpdate.TaskStatusUpdate; import org.apache.hadoop.mapred.CoronaStateUpdate.TaskTimeout; /** * Holds update of remote CoronaJobTracker sent to local one. Used by remote JT * to restore its state after failure */ @SuppressWarnings("deprecation") public class CoronaJTState implements Writable { /** Logger */ public static final Log LOG = LogFactory.getLog(CoronaJTState.class); /** Updates in the same order as received */ List<CoronaStateUpdate> updates = new ArrayList<CoronaStateUpdate>(); /** Session id */ private String sessionId = ""; /** The number of remote job tracker failover executed*/ int restartNum = 0; public void setRestartNum(int restartNum) { this.restartNum = restartNum; } public void setSessionId(String sessionId) { this.sessionId = sessionId; } @Override public void write(DataOutput out) throws IOException { WritableUtils.writeVInt(out, updates.size()); for (CoronaStateUpdate update : updates) { update.write(out); } Text.writeString(out, sessionId); WritableUtils.writeVInt(out, restartNum); } @Override public void readFields(DataInput in) throws IOException { updates.clear(); int size = WritableUtils.readVInt(in); for (int i = 0; i < size; ++i) { CoronaStateUpdate update = new CoronaStateUpdate(); update.readFields(in); updates.add(update); } sessionId = Text.readString(in); restartNum = WritableUtils.readVInt(in); } /** * Add state update to state * @param update update to add */ public void add(CoronaStateUpdate update) { updates.add(update); } /** * Prepares saved state for new JT * @return CoronaJTState prepared to be consumed by restarting JT */ public CoronaJTState prepare() { Collections.sort(updates); return this; } /** * Creates pretty report of saved state * @return string with report */ public String getPrettyReport(JobID jobId) { Map<TaskAttemptID, TaskLaunch> lastLaunch = new HashMap<TaskAttemptID, CoronaStateUpdate.TaskLaunch>(); Map<TaskAttemptID, TaskStatus.State> lastKnownStatus = new HashMap<TaskAttemptID, TaskStatus.State>(); JTFailoverMetrics jtFailoverMetrics = new JTFailoverMetrics(); for (CoronaStateUpdate update : updates) { if (update.getTaskLaunch() != null) { TaskLaunch launch = update.getTaskLaunch(); lastLaunch.put(launch.getTaskId(), launch); } else if (update.getTaskStatus() != null) { TaskStatus status = update.getTaskStatus(); lastKnownStatus.put(status.getTaskID(), status.getRunState()); jtFailoverMetrics.update(status); } } StringBuilder result = new StringBuilder(); result.append("CoronaJTState report"); if (jobId != null) { result.append(" for job ").append(jobId); } for (CoronaStateUpdate update : updates) { TaskLaunch launch = update.getTaskLaunch(); if (launch != null) { result.append("\n").append(launch).append(" last known "); result.append(lastKnownStatus.get(launch.getTaskId())); } } if (sessionId != null && !sessionId.isEmpty()) { result.append("\n Session id ").append(sessionId); } result.append("\nThis remoteJobTracker failover totally saved: "); result.append("\nmappers ").append(jtFailoverMetrics.savedMappers). append(" map cpu ").append(jtFailoverMetrics.savedMapCPU). append(" map wallclock ").append(jtFailoverMetrics.savedMapWallclock); result.append("\nreducers ").append(jtFailoverMetrics.savedReducers). append(" reduce cpu ").append(jtFailoverMetrics.savedReduceCPU). append(" reduce wallclock ").append(jtFailoverMetrics.savedReduceWallclock); return result.toString(); } @Override public String toString() { return getPrettyReport(null); } public static class JTFailoverMetrics { int savedMappers = 0; int savedReducers = 0; long savedMapCPU = 0L; long savedReduceCPU = 0L; long savedMapWallclock = 0L; long savedReduceWallclock = 0L; int restartNum = 0; long fetchStateCost = 0L; public void update(TaskStatus status) { if (status.getRunState() != TaskStatus.State.COMMIT_PENDING && status.getRunState() != TaskStatus.State.SUCCEEDED) { return; } if (status.getIsMap()) { savedMappers += 1; savedMapCPU += status.getCounters().getCounter(Task.Counter.CPU_MILLISECONDS); savedMapWallclock += status.getCounters().getCounter(Task.Counter.MAP_TASK_WALLCLOCK); } else { savedReducers += 1; savedReduceCPU += status.getCounters().getCounter(Task.Counter.CPU_MILLISECONDS); savedReduceWallclock += status.getCounters().getCounter(Task.Counter.REDUCE_TASK_WALLCLOCK); } } } /** * This class defines how state updates are sent to local JT from remote one. */ public static class Submitter { /** Attempt id of this task tracker */ private TaskAttemptID jtAttemptId; /** Destination where status updates will be saved */ InterCoronaJobTrackerProtocol localJT; /** Information pending processing and sending */ private LinkedBlockingQueue<CoronaStateUpdate> pendingProcessing; /** Indicates whether submitting thread is running */ private volatile boolean running = true; /** Submitting thread */ private Thread submitterThread; /** * Creates submitter that discards all state updates */ public Submitter() { } /** * Creates submitter of status updates to given destination. * @param localJT destination of submits * @param jtAttemptId attempt id of job tracker running this submitter */ public Submitter(InterCoronaJobTrackerProtocol localJT, TaskAttemptID jtAttemptId, JobConf conf) { pendingProcessing = new LinkedBlockingQueue<CoronaStateUpdate>(); this.localJT = localJT; this.jtAttemptId = jtAttemptId; submitterThread = new Thread(new AsyncSubmitter(conf)); submitterThread.start(); } /** * Determines whether submitter can send updates to it's destination * @return true iff updates can be sent */ public boolean canSubmit() { return (localJT != null); } /** * Submits state update to destination. This call can delay sending of * update depending on its type. * @param launch task launch event * @throws IOException */ public void submit(TaskLaunch launch) throws IOException { if (localJT == null || launch == null) return; try { // We're sending TaskLaunch updates synchronously localJT.pushCoronaJobTrackerStateUpdate(jtAttemptId, new CoronaStateUpdate[] { new CoronaStateUpdate(launch) }); // pendingProcessing.offer(launch); } catch (IOException e) { LOG.error("Failed to push update, failing submitter", e); close(); } } /** * Submits tracker status update. This call can delay sending of * update depending on its type. * @param tracker task tracker status to generate update from * @throws IOException */ public void submit(TaskTrackerStatus tracker) throws IOException { if (localJT == null || tracker == null) return; pendingProcessing.offer(new CoronaStateUpdate(tracker)); } /** * Submits task status update. This call can delay sending of update * depending on its type. * @param status task status to generate update from * @throws IOException */ public void submit(TaskStatus status) throws IOException { if (localJT == null || status == null) return; if (TaskStatus.TERMINATING_STATES.contains(status.getRunState()) || TaskStatus.State.COMMIT_PENDING.equals(status.getRunState())) { pendingProcessing.offer(new CoronaStateUpdate(status)); } } /** * Submits task status update. This call can delay sending of update * depending on its type. * @param timeout TaskTimout update to save */ public void submit(TaskTimeout timeout) { if (localJT == null || timeout == null) return; pendingProcessing.offer(new CoronaStateUpdate(timeout)); } /** * Closes submitter */ public void close() { running = false; if (submitterThread != null) { submitterThread.interrupt(); try { submitterThread.join(); } catch (InterruptedException e) { } } jtAttemptId = null; localJT = null; pendingProcessing = null; } /** * Thread that asynchronously process and submits state updates */ private class AsyncSubmitter implements Runnable { /** Max processed pending updates per batch */ private static final int MAX_BATCH_UPDATES_DEFAULT= 1000; /** Keeps track of the most recent tracker info */ private Map<String, TaskTrackerInfo> trackerToInfo = new HashMap<String, TaskTrackerInfo>(); /** The configure key for RJT to update the state to * local job tracker*/ public static final String MAX_BATCH_UPDATES_SIZE = "corona.jt.state.max.batch.updates.size"; /** The configure key for the wait timeout value when RJT updating * the state before getting the max batch update size in * millis*/ public static final String MAX_BATCH_UPDATES_WAITTIME = "corona.jt.state.batch.update.waittime"; private static final long MAX_BATCH_UPDATES_WAITTIME_DEFAULT = 1L; private long batchUpdateTimeout; private int maxBatchUpdateSize; public AsyncSubmitter(JobConf conf) { maxBatchUpdateSize = conf.getInt( MAX_BATCH_UPDATES_SIZE, MAX_BATCH_UPDATES_DEFAULT); batchUpdateTimeout = conf.getLong( MAX_BATCH_UPDATES_WAITTIME, MAX_BATCH_UPDATES_WAITTIME_DEFAULT); } @Override public void run() { List<CoronaStateUpdate> toSend = new ArrayList<CoronaStateUpdate>( maxBatchUpdateSize); while (running) { for (int updates = 0; updates < maxBatchUpdateSize; updates++) { CoronaStateUpdate update; try { if (toSend.isEmpty()) { // We're waiting for anything to send update = pendingProcessing.take(); } else { // We have things to send, but lets wait for a short time // Pushing every update will introduce bigger lag in this thread // than this wait, and more updates can get lost (are sync // pending) update = pendingProcessing.poll(batchUpdateTimeout, TimeUnit.MILLISECONDS); } } catch (InterruptedException e) { // Check running flag, we don't want to loose updates, so this // goes through sending code break; } if (update == null) { break; } Object obj = update.get(); // Classify different objects if (obj instanceof TaskLaunch) { // Launching task, no preprocessing toSend.add(update); } else if (obj instanceof TaskTrackerStatus) { TaskTrackerStatus tracker = (TaskTrackerStatus) obj; String trackerName = tracker.getTrackerName(); // Send new TaskTrackerInfo update only if has changed TaskTrackerInfo info = TaskTrackerInfo.fromStatus(tracker); TaskTrackerInfo savedInfo = trackerToInfo.get(trackerName); if (savedInfo == null || !savedInfo.equals(info)) { trackerToInfo.put(trackerName, info); update.set(info); toSend.add(update); } } else if (obj instanceof TaskStatus) { TaskStatus report = (TaskStatus) obj; // Encapsulate to provide tracker name update.set(new TaskStatusUpdate(report)); toSend.add(update); } else if (obj instanceof TaskTimeout) { // Timed out running or launching task toSend.add(update); } else { LOG.error("Unknown type of update"); } } // Send batch if (!toSend.isEmpty()) { try { localJT.pushCoronaJobTrackerStateUpdate(jtAttemptId, toSend .toArray(CoronaStateUpdate.EMPTY_ARRAY)); LOG.info("Batch of " + toSend.size() + " updates sent."); toSend.clear(); } catch (IOException e) { LOG.error("Failed to push updates", e); close(); } } } LOG.info("AsyncSubmitter exiting."); } } } /** * Fetches and serves queries for saved state, not designed for concurrent * access */ public static class Fetcher { /** Id of session saved with this state */ private String sessionId; /** List of updates in the same order as submitted to local JT */ private List<CoronaStateUpdate> updates; /** Maps tracker name to TaskTrackerInfo */ private Map<String, TaskTrackerInfo> trackerToInfo = new HashMap<String, TaskTrackerInfo>(); /** Clock used for restoring proper timestamps in JT */ private RestoringClock clock; /** The metrics to record the impact of RJT failover*/ JTFailoverMetrics jtFailoverMetrics = new JTFailoverMetrics(); /** The known trackers **/ private Set<String> taskLaunchTrackers = new HashSet<String>(); /** * Creates empty fetcher (which state can't be filled) */ public Fetcher() { } /** * When restoring JT status after restarting, it's possible that we have * several task attempts that were using the same grant. Only the most * recent task attempt is still using this grant, All finished restored * attempts should declare null grant. The last launched attempt for each * given grant is the attempt assumed to be running using this grant, rest * attempts must declare null grant. * @param parent local JT to fetch state from * @param jtAttemptId task attempt id of job tracker running this fetcher */ public Fetcher(InterCoronaJobTrackerProtocol parent, TaskAttemptID jtAttemptId) { CoronaJTState state; long startFetchingTime = System.currentTimeMillis(); try { state = parent.getCoronaJobTrackerState(jtAttemptId); } catch (IOException e) { LOG.error("Error when fetching state from parent JT. Proceeding with" + " cleared state. ", e); close(); return; } // State parts this.sessionId = state.sessionId; this.updates = Collections.unmodifiableList(state.updates); this.jtFailoverMetrics.restartNum = state.restartNum; for (Iterator<CoronaStateUpdate> iter = updates.iterator(); iter.hasNext();) { CoronaStateUpdate update = iter.next(); // Process task status updates for queries, // preserve order for each tracker TaskStatus status = update.getTaskStatus(); if (status != null) { jtFailoverMetrics.update(status); continue; } // Set non-existing grants in every attempt, prepare mapping from // grant to last attempt that was using this grant TaskLaunch launch = update.getTaskLaunch(); if (launch != null) { Integer grant = launch.getGrantId(); // assign non-existing grant, we will kill all the unfinished tasks launch.setGrantId(ResourceTracker.getNoneGrantId()); taskLaunchTrackers.add(launch.getTrackerName()); continue; } // Save tracker info for replaying task status TaskTrackerInfo info = update.getTrackerInfo(); if (info != null) { trackerToInfo.put(info.getTrackerName(), info); continue; } } trackerToInfo = Collections.unmodifiableMap(trackerToInfo); taskLaunchTrackers = Collections.unmodifiableSet(taskLaunchTrackers); jtFailoverMetrics.fetchStateCost = System.currentTimeMillis() - startFetchingTime; LOG.info(jtFailoverMetrics.fetchStateCost + " milliseconds used to do state fetching"); } /** * Returns saved session if any * @return saved session id or null */ public String getSessionId() { if (sessionId == null || sessionId.isEmpty()) { return null; } return sessionId; } /** * Determines whether tasks state has been restored * @return true iff tasks state has been restored */ public boolean hasTasksState() { return (sessionId != null && updates != null); } /** * Wipes out all state */ public void close() { sessionId = null; updates = null; taskLaunchTrackers = null; trackerToInfo = null; clock = null; } /** * Restores fetched state updates in the same order that they were saved * @param remoteJT JobTracekr to restore state */ public void restoreState(StateRestorer remoteJT) { if (!hasTasksState()) return; Clock oldClock = remoteJT.getClock(); clock = new RestoringClock(); remoteJT.setClock(clock); LOG.info("Begin to restoreState"); long restoreTime = oldClock.getTime(); for (Iterator<CoronaStateUpdate> iter = updates.iterator(); iter.hasNext();) { CoronaStateUpdate update = iter.next(); clock.setTimestamp(update.getTimestamp()); LOG.info("Current timestamp " + update.getTimestamp()); TaskStatus status = update.getTaskStatus(); if (status != null) { TaskTrackerInfo info = trackerToInfo.get(status.getTaskTracker()); if (info != null) { remoteJT.restoreTaskStatus(status, info); LOG.info("Restoring status " + status + " @ " + info); } else { // it is safe for us to kill more uncertain tasks LOG.error("Skipping status " + status + " because of null TaskTracker info"); } continue; } TaskLaunch launch = update.getTaskLaunch(); if (launch != null) { LOG.info("Restoring launch " + launch); remoteJT.restoreTaskLaunch(launch); continue; } TaskTimeout timeout = update.getTaskTimeout(); if (timeout != null) { String trackerName = timeout.getTrackerName(); LOG.info("Restoring timeout on " + trackerName); remoteJT.restoreTaskTimeout(trackerName); continue; } } long restoreCost = oldClock.getTime() - restoreTime; LOG.info("End the restoreState, totally " + restoreCost + "milliseconds used."); remoteJT.setClock(oldClock); } /** * Returns a set of task trackers that was in use during previous remote JT * life * @return set of task tracker's names */ public Set<String> getTaskLaunchTrackers() { return taskLaunchTrackers; } } /** * Clock that allows us to restore time as saved with status updates during * restarting process */ public static class RestoringClock extends Clock { /** Current manually set timestamp */ private volatile long timestamp; /** Determines whether we're using real or manually set timesamps */ private volatile boolean useRealTimestamps = false; /** * Switches to using real timestamps */ public void useRealTimestamps() { useRealTimestamps = true; } /** * Sets current timestamp * @param timestamp time to set */ public void setTimestamp(long timestamp) { useRealTimestamps = false; this.timestamp = timestamp; } @Override public long getTime() { if (useRealTimestamps) { return super.getTime(); } else { return timestamp; } } } /** * Contract between remote JT and Fetcher defining functions for restoring * state */ public interface StateRestorer { /** * Set clock in object that restores it's state * @param clock clock to use */ public void setClock(Clock clock); /** * Get clock being used in object that restores it's state * @return clock used currently by state restorer */ public Clock getClock(); /** * Restores task timeout event for provided task tracker * @param trackerName */ public void restoreTaskTimeout(String trackerName); /** * Restore task launch * @param launch a TaskLaunch */ public void restoreTaskLaunch(TaskLaunch launch); /** * Restore task status update saved from heartbeat report * @param status a TaskStatus * @param tracker a TaskStatusInfo of tracker that sent this update */ public void restoreTaskStatus(TaskStatus status, TaskTrackerInfo tracker); } }