/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.corona.ResourceGrant;
import org.apache.hadoop.corona.ResourceRequest;
import org.apache.hadoop.corona.SessionDriver;
import org.apache.hadoop.corona.Utilities;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;

/**
 * The Proxy used by the CoronaJobTracker in the client to communicate
 * with the CoronaJobTracker running on the TaskTracker in case of a
 * remote CoronaJobTracker.
 *
 * <p>The RPC client to the remote job tracker is created while holding this
 * object's monitor, either when the remote job tracker reports its address
 * via {@link #reportRemoteCoronaJobTracker(String, String, int, String)}.
 * Threads that issue {@link JobSubmissionProtocol} calls before that point
 * block on the same monitor until they are notified.
 */
@SuppressWarnings("deprecation")
public class RemoteJTProxy implements InterCoronaJobTrackerProtocol,
    JobSubmissionProtocol {
  /**
   * Logger.
   * NOTE(review): the logger is obtained for CoronaJobTracker.class, so
   * messages from this class are emitted under that category - confirm
   * whether this is intentional.
   */
  public static final Log LOG = LogFactory.getLog(CoronaJobTracker.class);

  /** Amount of time to wait for remote JT to launch. */
  public static final String REMOTE_JT_TIMEOUT_SEC_CONF =
    "mapred.coronajobtracker.remotejobtracker.wait";
  /** Default amount of time to wait for remote JT to launch. */
  public static final int REMOTE_JT_TIMEOUT_SEC_DEFAULT = 60;

  /** The proxy object to the CoronaJobTracker running in the cluster */
  private JobSubmissionProtocol client;
  /** The task id for the current attempt of running CJT */
  private TaskAttemptID currentAttemptId;
  /** The number of the current attempt */
  private int attempt;
  /** Job configuration */
  private final JobConf conf;
  /** Parent JobTracker */
  private final CoronaJobTracker jt;
  /** The id of the job */
  private final JobID jobId;
  /** The session id for the job tracker running in the cluster */
  private String remoteSessionId;

  /**
   * Construct a proxy for the remote job tracker
   * @param jt parent job tracker
   * @param jobId id of the job the proxy is created for
   * @param conf job configuration
   */
  @SuppressWarnings("deprecation")
  RemoteJTProxy(CoronaJobTracker jt, JobID jobId, JobConf conf) {
    this.conf = conf;
    this.jt = jt;
    this.jobId = jobId;
    attempt = 0;
    // The task that hosts the remote JT gets an id well past the number of
    // map tasks; presumably to avoid clashing with real map task ids
    // (TODO confirm).
    int partitionId = conf.getNumMapTasks() + 100000;
    currentAttemptId = new TaskAttemptID(
        new TaskID(jobId, true, partitionId), attempt);
  }

  /**
   * @return The session id recorded when the RPC client to the remote job
   *         tracker was created, or null if no client has been created yet.
   */
  public String getRemoteSessionId() {
    return remoteSessionId;
  }

  // ///////////////////////////////////////////////////////////////////////////
  // InterCoronaJobTrackerProtocol
  // ///////////////////////////////////////////////////////////////////////////

  /**
   * Called by the remote corona job tracker to report where it is running.
   * Creates the RPC client and wakes up all threads waiting for it.
   * @param attempt The string form of the task attempt id of the reporter.
   * @param host The host running the remote corona job tracker.
   * @param port The port of the remote corona job tracker.
   * @param sessionId The session for the remote corona job tracker.
   * @throws IOException If the attempt is not the current attempt, or the
   *                     RPC client could not be created.
   */
  @Override
  public void reportRemoteCoronaJobTracker(
      String attempt,
      String host,
      int port,
      String sessionId) throws IOException {
    TaskAttemptID attemptId = TaskAttemptID.forName(attempt);
    synchronized (this) {
      // Reject reports from attempts that were already given up on.
      if (!attemptId.equals(currentAttemptId)) {
        throw new IOException("Attempt " + attempt +
          " does not match current attempt " + currentAttemptId);
      }
      initializeClientUnprotected(host, port, sessionId);
      this.notifyAll();
    }
  }

  @Override
  public ProtocolSignature getProtocolSignature(String protocol,
      long clientVersion, int clientMethodsHash) throws IOException {
    return ProtocolSignature.getProtocolSignature(
        this, protocol, clientVersion, clientMethodsHash);
  }

  /**
   * Returns the version of {@link InterCoronaJobTrackerProtocol}.
   * @param protocol The fully qualified name of the requested protocol.
   * @param clientVersion The version of the client (unused).
   * @return The version id of {@link InterCoronaJobTrackerProtocol}.
   * @throws IOException If the protocol is anything else, including null.
   */
  @Override
  public long getProtocolVersion(String protocol, long clientVersion)
    throws IOException {
    // Constant-first comparison so that a null protocol name results in the
    // IOException below rather than a NullPointerException.
    if (InterCoronaJobTrackerProtocol.class.getName().equals(protocol)) {
      return InterCoronaJobTrackerProtocol.versionID;
    } else {
      throw new IOException("Unknown protocol " + protocol);
    }
  }

  /**
   * Increment the attempt number for launching a remote corona job tracker.
   * Must be called only when holding the object lock.
   */
  private void incrementAttemptUnprotected() {
    attempt++;
    currentAttemptId = new TaskAttemptID(
        currentAttemptId.getTaskID(), attempt);
  }

  /**
   * Create the RPC client to the remote corona job tracker.
   * Does nothing if the client already exists. As the name indicates, this
   * method takes no lock itself.
   * @param host The host running the remote corona job tracker.
   * @param port The port of the remote corona job tracker.
   * @param sessionId The session for the remote corona job tracker.
   * @throws IOException
   */
  void initializeClientUnprotected(String host, int port, String sessionId)
    throws IOException {
    if (client != null) {
      return;
    }
    LOG.info("Creating JT client to " + host + ":" + port);
    client = RPC.waitForProxy(JobSubmissionProtocol.class,
        JobSubmissionProtocol.versionID,
        new InetSocketAddress(host, port), conf);
    remoteSessionId = sessionId;
  }

  /**
   * Wait for the remote corona job tracker to be ready.
   * This involves
   *  - getting a JOBTRACKER resource from the cluster manager.
   *  - starting the remote job tracker by connecting to the corona task
   *    tracker on the machine.
   *  - waiting for the remote job tracker to report its port back to this
   *    process.
   * A grant on which the remote job tracker could not be started is released
   * and its host is excluded from subsequent requests.
   * @param jobConf The job configuration to use.
   * @throws IOException If the remote job tracker could not be started
   *                     within the configured number of attempts, or the
   *                     thread was interrupted while waiting.
   */
  public void waitForJTStart(JobConf jobConf) throws IOException {
    int maxJTAttempts = jobConf.getInt(
        "mapred.coronajobtracker.remotejobtracker.attempts", 4);
    ResourceTracker resourceTracker = jt.getResourceTracker();
    SessionDriver sessionDriver = jt.getSessionDriver();
    List<ResourceGrant> excludeGrants = new ArrayList<ResourceGrant>();
    for (int i = 0; i < maxJTAttempts; i++) {
      try {
        ResourceGrant jtGrant = waitForJTGrant(resourceTracker, sessionDriver,
            excludeGrants);
        boolean success = startRemoteJT(jobConf, jtGrant);
        if (success) {
          return;
        } else {
          // Give the unusable grant back and remember it so that the next
          // request excludes its host.
          excludeGrants.add(jtGrant);
          resourceTracker.releaseResource(jtGrant.getId());
          List<ResourceRequest> released =
            resourceTracker.getResourcesToRelease();
          sessionDriver.releaseResources(released);
        }
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
    }
    throw new IOException("Could not start remote JT after " + maxJTAttempts +
      " attempts");
  }

  /**
   * Wait for a JOBTRACKER grant.
   * @param resourceTracker The resource tracker object for getting the grant
   * @param sessionDriver The session driver for getting the grant
   * @param previousGrants Previous grants that could not be used successfully.
   * @return A new JOBTRACKER grant.
   * @throws IOException If the session driver reports a failure.
   * @throws InterruptedException
   */
  private ResourceGrant waitForJTGrant(
      ResourceTracker resourceTracker,
      SessionDriver sessionDriver,
      List<ResourceGrant> previousGrants)
    throws IOException, InterruptedException {
    LOG.info("Waiting for JT grant for " + jobId);
    ResourceRequest req = resourceTracker.newJobTrackerRequest();
    for (ResourceGrant prev: previousGrants) {
      LOG.info("Adding " + prev.getNodeName() + " to excluded hosts");
      req.addToExcludeHosts(prev.getAddress().getHost());
    }
    resourceTracker.recordRequest(req);
    List<ResourceRequest> newRequests = resourceTracker.getWantedResources();
    sessionDriver.requestResources(newRequests);
    final List<ResourceGrant> grants = new ArrayList<ResourceGrant>();
    ResourceTracker.ResourceProcessor proc =
      new ResourceTracker.ResourceProcessor() {
        @Override
        public boolean processAvailableResource(ResourceGrant resource) {
          grants.add(resource);
          // Report the resource as consumed.
          return true;
        }
      };
    while (true) {
      // Try to get JT grant while periodically checking for session driver
      // exceptions.
      long timeout = 60 * 1000; // 1 min.
      resourceTracker.processAvailableGrants(proc, 1, timeout);
      IOException e = sessionDriver.getFailed();
      if (e != null) {
        throw e;
      }
      if (!grants.isEmpty()) {
        return grants.get(0);
      }
    }
  }

  /**
   * Start corona job tracker on the machine provided by using the corona
   * task tracker API, then wait for it to report its address.
   * @param jobConf The job configuration.
   * @param grant The grant that specifies the remote machine.
   * @return A boolean indicating success.
   * @throws InterruptedException
   */
  private boolean startRemoteJT(
      JobConf jobConf,
      ResourceGrant grant) throws InterruptedException {
    org.apache.hadoop.corona.InetAddress ttAddr =
      Utilities.appInfoToAddress(grant.appInfo);
    CoronaTaskTrackerProtocol coronaTT = null;
    try {
      coronaTT = jt.getTaskTrackerClient(ttAddr.getHost(), ttAddr.getPort());
    } catch (IOException e) {
      LOG.error("Error while trying to connect to TT at " + ttAddr.getHost() +
        ":" + ttAddr.getPort(), e);
      return false;
    }
    LOG.info("Starting remote JT for " + jobId + " on " + ttAddr.getHost());

    // Get a special map id for the JT task.
    Path systemDir = new Path(jt.getSystemDir());
    String jobFile = CoronaJobInProgress.getJobFile(systemDir, jobId)
        .toString();
    String splitClass = JobClient.RawSplit.class.getName();
    BytesWritable split = new BytesWritable();
    Task jobTask = new MapTask(
        jobFile, currentAttemptId, currentAttemptId.getTaskID().getId(),
        splitClass, split, 1, jobConf.getUser());
    CoronaSessionInfo info = new CoronaSessionInfo(jt.getSessionId(),
        jt.getJobTrackerAddress());
    synchronized (this) {
      try {
        coronaTT.startCoronaJobTracker(jobTask, info);
      } catch (IOException e) {
        // Increment the attempt so that the older attempt will get an error
        // in reportRemoteCoronaJobTracker().
        incrementAttemptUnprotected();
        LOG.error("Error while performing RPC to TT at " + ttAddr.getHost() +
          ":" + ttAddr.getPort(), e);
        return false;
      }
    }

    // Now wait for the remote CJT to report its address.
    final long waitStart = System.currentTimeMillis();
    final long timeout = RemoteJTProxy.getRemotJTTimeout(jobConf);
    synchronized (this) {
      while (client == null) {
        LOG.info("Waiting for remote JT to start on " + ttAddr.getHost());
        this.wait(1000);
        // Only give up if the client is still missing: a report that arrived
        // during the last wait must not be discarded as a timeout, since the
        // client would then already be connected to that job tracker.
        if (client == null &&
            System.currentTimeMillis() - waitStart > timeout) {
          // Increment the attempt so that the older attempt will get an error
          // in reportRemoteCoronaJobTracker().
          incrementAttemptUnprotected();
          LOG.warn("Could not start remote JT on " + ttAddr.getHost());
          return false;
        }
      }
    }
    return true;
  }

  // ///////////////////////////////////////////////////////////////////////////
  // JobSubmissionProtocol
  // ///////////////////////////////////////////////////////////////////////////

  /** Not supported by this proxy. */
  @Override
  public JobID getNewJobId() throws IOException {
    throw new UnsupportedOperationException(
        "getNewJobId not supported by proxy");
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public JobStatus submitJob(final JobID jobId) throws IOException {
    return (new Caller<JobStatus>() {
      @Override
      JobStatus call() throws IOException {
        return client.submitJob(jobId);
      }
    }).makeCall();
  }

  /** Not supported by this proxy. */
  @Override
  public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
    throw new UnsupportedOperationException(
        "getClusterStatus is not supported by proxy");
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public void killJob(final JobID jobId) throws IOException {
    (new Caller<JobID>() {
      @Override
      JobID call() throws IOException {
        client.killJob(jobId);
        // The remote call returns nothing; the job id is returned only to
        // satisfy the generic Caller contract.
        return jobId;
      }
    }).makeCall();
  }

  /** Not supported by this proxy. */
  @Override
  public void setJobPriority(JobID jobId, String priority) throws IOException {
    throw new UnsupportedOperationException(
        "setJobPriority is not supported by proxy");
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public boolean killTask(final TaskAttemptID taskId, final boolean shouldFail)
    throws IOException {
    return (new Caller<Boolean>() {
      @Override
      Boolean call() throws IOException {
        return client.killTask(taskId, shouldFail);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public JobProfile getJobProfile(final JobID jobId) throws IOException {
    return (new Caller<JobProfile>() {
      @Override
      JobProfile call() throws IOException {
        return client.getJobProfile(jobId);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public JobStatus getJobStatus(final JobID jobId) throws IOException {
    return (new Caller<JobStatus>() {
      @Override
      JobStatus call() throws IOException {
        return client.getJobStatus(jobId);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public Counters getJobCounters(final JobID jobId) throws IOException {
    return (new Caller<Counters>() {
      @Override
      Counters call() throws IOException {
        return client.getJobCounters(jobId);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public TaskReport[] getMapTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getMapTaskReports(jobId);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public TaskReport[] getReduceTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getReduceTaskReports(jobId);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public TaskReport[] getCleanupTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getCleanupTaskReports(jobId);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public TaskReport[] getSetupTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call() throws IOException {
        return client.getSetupTaskReports(jobId);
      }
    }).makeCall();
  }

  /** Not supported by this proxy. */
  @Override
  public String getFilesystemName() throws IOException {
    throw new UnsupportedOperationException(
        "getFilesystemName is not supported by proxy");
  }

  /** Not supported by this proxy. */
  @Override
  public JobStatus[] jobsToComplete() {
    throw new UnsupportedOperationException(
        "jobsToComplete is not supported by proxy");
  }

  /** Not supported by this proxy. */
  @Override
  public JobStatus[] getAllJobs() {
    throw new UnsupportedOperationException(
        "getAllJobs is not supported by proxy");
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public TaskCompletionEvent[] getTaskCompletionEvents(final JobID jobid,
      final int fromEventId, final int maxEvents) throws IOException {
    return (new Caller<TaskCompletionEvent[]>() {
      @Override
      TaskCompletionEvent[] call() throws IOException {
        return client.getTaskCompletionEvents(jobid, fromEventId, maxEvents);
      }
    }).makeCall();
  }

  /** Forwards the call to the remote job tracker. */
  @Override
  public String[] getTaskDiagnostics(final TaskAttemptID taskId)
    throws IOException {
    return (new Caller<String[]>() {
      @Override
      String[] call() throws IOException {
        return client.getTaskDiagnostics(taskId);
      }
    }).makeCall();
  }

  /** Not supported by this proxy. */
  @Override
  public String getSystemDir() {
    throw new UnsupportedOperationException(
        "getSystemDir not supported by proxy.");
  }

  /** Not supported by this proxy. */
  @Override
  public JobQueueInfo[] getQueues() {
    throw new UnsupportedOperationException("getQueues method is " +
        "not supported by proxy.");
  }

  /** Not supported by this proxy. */
  @Override
  public JobQueueInfo getQueueInfo(String queue) {
    throw new UnsupportedOperationException(
        "getQueueInfo not supported by proxy.");
  }

  /** Not supported by this proxy. */
  @Override
  public JobStatus[] getJobsFromQueue(String queue) {
    throw new UnsupportedOperationException(
        "getJobsFromQueue not supported by proxy.");
  }

  /** Not supported by this proxy. */
  @Override
  public QueueAclsInfo[] getQueueAclsForCurrentUser() throws IOException {
    throw new UnsupportedOperationException(
        "getQueueAclsForCurrentUser not supported by proxy.");
  }

  /**
   * Stop the RPC proxy to the remote job tracker, if one was created.
   * The client reference itself is left in place.
   */
  public void close() {
    synchronized (this) {
      if (client != null) {
        RPC.stopProxy(client);
      }
    }
  }

  /**
   * Generic caller interface.
   * @param <T> The type of the value returned by the call.
   */
  private abstract class Caller<T> {
    /**
     * Perform the call. Must be overridden by a sub-class.
     * @return The generic return value.
     * @throws IOException
     */
    abstract T call() throws IOException;

    /**
     * Template function to make the call. Waits for the RPC client to be
     * available, then performs the call. On an IOException from either step
     * the failure is logged, {@link #handleCallFailure()} is invoked and the
     * exception is rethrown.
     * @return The generic return value.
     * @throws IOException
     */
    public T makeCall() throws IOException {
      try {
        checkClient();
        return call();
      } catch (IOException e) {
        LOG.error("Error on remote call ", e);
        handleCallFailure();
        throw e;
      }
    }
  }

  /**
   * Handle failures while making calls to the remote corona job tracker.
   * We need to close the local job tracker.
   * @throws IOException If closing the job tracker is interrupted.
   */
  private void handleCallFailure() throws IOException {
    try {
      jt.close(false, true);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  /**
   * Check if the RPC client to the remote job tracker is ready, and wait if
   * not. Returns only once the client has been created.
   * @throws IOException If the thread is interrupted while waiting.
   */
  private void checkClient() throws IOException {
    synchronized (this) {
      // Wait in a loop: Object.wait() may return spuriously, and proceeding
      // with a null client would fail with a NullPointerException.
      while (client == null) {
        try {
          this.wait();
        } catch (InterruptedException e) {
          throw new IOException(e);
        }
      }
    }
  }

  /**
   * Returns the timeout in milliseconds after which we timeout the remote job
   * tracker. The value is configured in seconds via
   * {@link #REMOTE_JT_TIMEOUT_SEC_CONF} and defaults to
   * {@link #REMOTE_JT_TIMEOUT_SEC_DEFAULT}.
   *
   * @param conf
   *          The configuration
   * @return The timeout in milliseconds.
   */
  public static long getRemotJTTimeout(Configuration conf) {
    // Multiply as long: int arithmetic would overflow for large settings
    // before the result is widened to the long return type.
    return conf.getInt(RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_CONF,
        RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_DEFAULT) * 1000L;
  }
}