/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.InetSocketAddress;
import java.security.PrivilegedExceptionAction;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.v2.LogParams;
import org.apache.hadoop.mapreduce.v2.api.MRClientProtocol;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.FailTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillJobRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
import org.apache.hadoop.mapreduce.v2.api.records.Counters;
import org.apache.hadoop.mapreduce.v2.api.records.JobReport;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptReport;
import org.apache.hadoop.mapreduce.v2.util.MRApps;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.client.ClientToAMTokenIdentifier;
import org.apache.hadoop.yarn.util.ConverterUtils;

import com.google.common.annotations.VisibleForTesting;
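
/**
 * Routes MapReduce client calls for a single job to whichever endpoint can
 * answer them: the job's ApplicationMaster while the application is running,
 * the JobHistoryServer once it has completed, or a {@link NotRunningJob}
 * placeholder when neither is reachable.
 */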
public class ClientServiceDelegate {
  private static final Log LOG = LogFactory.getLog(ClientServiceDelegate.class);
  private static final String UNAVAILABLE = "N/A";

  // Caches for per-user NotRunningJobs
  private HashMap<JobState, HashMap<String, NotRunningJob>> notRunningJobs;

  private final Configuration conf;
  private final JobID jobId;
  private final ApplicationId appId;
  private final ResourceMgrDelegate rm;
  private final MRClientProtocol historyServerProxy;
  private MRClientProtocol realProxy = null;
  private RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null);
  private static String UNKNOWN_USER = "Unknown User";
  private String trackingUrl;
  private AtomicBoolean usingAMProxy = new AtomicBoolean(false);
  private int maxClientRetry;
  private boolean amAclDisabledStatusLogged = false;

  public ClientServiceDelegate(Configuration conf, ResourceMgrDelegate rm,
      JobID jobId, MRClientProtocol historyServerProxy) {
    this.conf = new Configuration(conf); // Cloning for modifying.
    // For faster redirects from AM to HS.
    this.conf.setInt(
        CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY,
        this.conf.getInt(MRJobConfig.MR_CLIENT_TO_AM_IPC_MAX_RETRIES,
            MRJobConfig.DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES));
    this.rm = rm;
    this.jobId = jobId;
    this.historyServerProxy = historyServerProxy;
    this.appId = TypeConverter.toYarn(jobId).getAppId();
    notRunningJobs = new HashMap<JobState, HashMap<String, NotRunningJob>>();
  }

  // Get the instance of the NotRunningJob corresponding to the specified
  // user and state
  private NotRunningJob getNotRunningJob(ApplicationReport applicationReport,
      JobState state) {
    synchronized (notRunningJobs) {
      HashMap<String, NotRunningJob> map = notRunningJobs.get(state);
      if (map == null) {
        map = new HashMap<String, NotRunningJob>();
        notRunningJobs.put(state, map);
      }
      String user =
          (applicationReport == null) ?
              UNKNOWN_USER : applicationReport.getUser();
      NotRunningJob notRunningJob = map.get(user);
      if (notRunningJob == null) {
        notRunningJob = new NotRunningJob(applicationReport, state);
        map.put(user, notRunningJob);
      }
      return notRunningJob;
    }
  }
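
  /**
   * Resolve and cache an {@link MRClientProtocol} proxy for this job. While
   * the application is RUNNING, this polls the RM until the AM address is
   * known and then connects to the AM; for finished applications it falls
   * back to the history server; for not-yet-running, failed, or killed
   * applications it returns a NotRunningJob stub instead of blocking.
   */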
  private MRClientProtocol getProxy() throws IOException {
    if (realProxy != null) {
      return realProxy;
    }

    // Possibly allow nulls through the PB tunnel, otherwise deal with an exception
    // and redirect to the history server.
    ApplicationReport application = null;
    try {
      application = rm.getApplicationReport(appId);
    } catch (YarnException e2) {
      throw new IOException(e2);
    }
    if (application != null) {
      trackingUrl = application.getTrackingUrl();
    }
    InetSocketAddress serviceAddr = null;
    while (application == null
        || YarnApplicationState.RUNNING == application.getYarnApplicationState()) {
      if (application == null) {
        LOG.info("Could not get Job info from RM for job " + jobId
            + ". Redirecting to job history server.");
        return checkAndGetHSProxy(null, JobState.NEW);
      }
      try {
        if (application.getHost() == null || "".equals(application.getHost())) {
          LOG.debug("AM not assigned to Job. Waiting to get the AM ...");
          Thread.sleep(2000);

          LOG.debug("Application state is " + application.getYarnApplicationState());
          application = rm.getApplicationReport(appId);
          continue;
        } else if (UNAVAILABLE.equals(application.getHost())) {
          if (!amAclDisabledStatusLogged) {
            LOG.info("Job " + jobId + " is running, but the host is unknown."
                + " Verify user has VIEW_JOB access.");
            amAclDisabledStatusLogged = true;
          }
          return getNotRunningJob(application, JobState.RUNNING);
        }
        if (!conf.getBoolean(MRJobConfig.JOB_AM_ACCESS_DISABLED, false)) {
          UserGroupInformation newUgi = UserGroupInformation.createRemoteUser(
              UserGroupInformation.getCurrentUser().getUserName());
          serviceAddr = NetUtils.createSocketAddrForHost(
              application.getHost(), application.getRpcPort());
          if (UserGroupInformation.isSecurityEnabled()) {
            org.apache.hadoop.yarn.api.records.Token clientToAMToken =
                application.getClientToAMToken();
            Token<ClientToAMTokenIdentifier> token =
                ConverterUtils.convertFromYarn(clientToAMToken, serviceAddr);
            newUgi.addToken(token);
          }
          LOG.debug("Connecting to " + serviceAddr);
          final InetSocketAddress finalServiceAddr = serviceAddr;
          realProxy = newUgi.doAs(new PrivilegedExceptionAction<MRClientProtocol>() {
            @Override
            public MRClientProtocol run() throws IOException {
              return instantiateAMProxy(finalServiceAddr);
            }
          });
        } else {
          if (!amAclDisabledStatusLogged) {
            LOG.info("Network ACL closed to AM for job " + jobId
                + ". Not going to try to reach the AM.");
            amAclDisabledStatusLogged = true;
          }
          return getNotRunningJob(null, JobState.RUNNING);
        }
        return realProxy;
      } catch (IOException e) {
        //possibly the AM has crashed
        //there may be some time before AM is restarted
        //keep retrying by getting the address from RM
        LOG.info("Could not connect to " + serviceAddr
            + ". Waiting for getting the latest AM address...");
        try {
          Thread.sleep(2000);
        } catch (InterruptedException e1) {
          LOG.warn("getProxy() call interrupted", e1);
          throw new YarnRuntimeException(e1);
        }
        try {
          application = rm.getApplicationReport(appId);
        } catch (YarnException e1) {
          throw new IOException(e1);
        }
        if (application == null) {
          LOG.info("Could not get Job info from RM for job " + jobId
              + ". Redirecting to job history server.");
          return checkAndGetHSProxy(null, JobState.RUNNING);
        }
      } catch (InterruptedException e) {
        LOG.warn("getProxy() call interrupted", e);
        throw new YarnRuntimeException(e);
      } catch (YarnException e) {
        throw new IOException(e);
      }
    }

    /** we just want to return if it's allocating, so that we don't
     * block on it. This is to be able to return job status
     * on an allocating Application.
     */
    String user = application.getUser();
    if (user == null) {
      throw new IOException("User is not set in the application report");
    }
    if (application.getYarnApplicationState() == YarnApplicationState.NEW
        || application.getYarnApplicationState() == YarnApplicationState.NEW_SAVING
        || application.getYarnApplicationState() == YarnApplicationState.SUBMITTED
        || application.getYarnApplicationState() == YarnApplicationState.ACCEPTED) {
      realProxy = null;
      return getNotRunningJob(application, JobState.NEW);
    }

    if (application.getYarnApplicationState() == YarnApplicationState.FAILED) {
      realProxy = null;
      return getNotRunningJob(application, JobState.FAILED);
    }

    if (application.getYarnApplicationState() == YarnApplicationState.KILLED) {
      realProxy = null;
      return getNotRunningJob(application, JobState.KILLED);
    }

    //History server can serve a job only if application
    //succeeded.
    if (application.getYarnApplicationState() == YarnApplicationState.FINISHED) {
      LOG.info("Application state is completed. FinalApplicationStatus="
          + application.getFinalApplicationStatus().toString()
          + ". Redirecting to job history server");
      realProxy = checkAndGetHSProxy(application, JobState.SUCCEEDED);
    }
    return realProxy;
  }
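
  /**
   * Return the history server proxy if one is configured; otherwise fall
   * back to a NotRunningJob stub for the given report and state.
   */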
  private MRClientProtocol checkAndGetHSProxy(
      ApplicationReport applicationReport, JobState state) {
    if (null == historyServerProxy) {
      LOG.warn("Job History Server is not configured.");
      return getNotRunningJob(applicationReport, state);
    }
    return historyServerProxy;
  }

  MRClientProtocol instantiateAMProxy(final InetSocketAddress serviceAddr)
      throws IOException {
    LOG.trace("Connecting to ApplicationMaster at: " + serviceAddr);
    YarnRPC rpc = YarnRPC.create(conf);
    MRClientProtocol proxy =
        (MRClientProtocol) rpc.getProxy(MRClientProtocol.class, serviceAddr, conf);
    usingAMProxy.set(true);
    LOG.trace("Connected to ApplicationMaster at: " + serviceAddr);
    return proxy;
  }
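
  /**
   * Reflectively invoke the named MRClientProtocol method, re-resolving the
   * proxy and retrying on failure. A failure while talking to the AM does
   * not consume a retry, since the AM may simply be restarting; other
   * failures decrement maxClientRetry until the last IOException is
   * rethrown.
   */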
  private synchronized Object invoke(String method, Class argClass,
      Object args) throws IOException {
    Method methodOb = null;
    try {
      methodOb = MRClientProtocol.class.getMethod(method, argClass);
    } catch (SecurityException e) {
      throw new YarnRuntimeException(e);
    } catch (NoSuchMethodException e) {
      throw new YarnRuntimeException("Method name mismatch", e);
    }
    maxClientRetry = this.conf.getInt(
        MRJobConfig.MR_CLIENT_MAX_RETRIES,
        MRJobConfig.DEFAULT_MR_CLIENT_MAX_RETRIES);
    IOException lastException = null;
    while (maxClientRetry > 0) {
      MRClientProtocol MRClientProxy = null;
      try {
        MRClientProxy = getProxy();
        return methodOb.invoke(MRClientProxy, args);
      } catch (InvocationTargetException e) {
        // Will not throw out YarnException anymore
        LOG.debug("Failed to contact AM/History for job " + jobId
            + " retrying..", e.getTargetException());
        // Force reconnection by setting the proxy to null.
        realProxy = null;
        // HS/AMS shut down
        // if it's AM shut down, do not decrement maxClientRetry as we wait for
        // AM to be restarted.
        if (!usingAMProxy.get()) {
          maxClientRetry--;
        }
        usingAMProxy.set(false);
        lastException = new IOException(e.getTargetException());
        try {
          Thread.sleep(100);
        } catch (InterruptedException ie) {
          LOG.warn("ClientServiceDelegate invoke call interrupted", ie);
          throw new YarnRuntimeException(ie);
        }
      } catch (Exception e) {
        LOG.debug("Failed to contact AM/History for job " + jobId
            + " Will retry..", e);
        // Force reconnection by setting the proxy to null.
        realProxy = null;
        // RM shutdown
        maxClientRetry--;
        lastException = new IOException(e.getMessage());
        try {
          Thread.sleep(100);
        } catch (InterruptedException ie) {
          LOG.warn("ClientServiceDelegate invoke call interrupted", ie);
          throw new YarnRuntimeException(ie);
        }
      }
    }
    throw lastException;
  }

  // Only for testing
  @VisibleForTesting
  public int getMaxClientRetry() {
    return this.maxClientRetry;
  }

  public org.apache.hadoop.mapreduce.Counters getJobCounters(JobID arg0)
      throws IOException, InterruptedException {
    org.apache.hadoop.mapreduce.v2.api.records.JobId jobID =
        TypeConverter.toYarn(arg0);
    GetCountersRequest request =
        recordFactory.newRecordInstance(GetCountersRequest.class);
    request.setJobId(jobID);
    Counters cnt = ((GetCountersResponse)
        invoke("getCounters", GetCountersRequest.class, request)).getCounters();
    return TypeConverter.fromYarn(cnt);
  }

  public TaskCompletionEvent[] getTaskCompletionEvents(JobID arg0,
      int arg1, int arg2) throws IOException, InterruptedException {
    org.apache.hadoop.mapreduce.v2.api.records.JobId jobID =
        TypeConverter.toYarn(arg0);
    GetTaskAttemptCompletionEventsRequest request = recordFactory
        .newRecordInstance(GetTaskAttemptCompletionEventsRequest.class);
    request.setJobId(jobID);
    request.setFromEventId(arg1);
    request.setMaxEvents(arg2);
    List<org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent>
        list = ((GetTaskAttemptCompletionEventsResponse) invoke(
            "getTaskAttemptCompletionEvents",
            GetTaskAttemptCompletionEventsRequest.class, request))
            .getCompletionEventList();
    return TypeConverter.fromYarn(list.toArray(
        new org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent[0]));
  }

  public String[] getTaskDiagnostics(org.apache.hadoop.mapreduce.TaskAttemptID arg0)
      throws IOException, InterruptedException {
    org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID =
        TypeConverter.toYarn(arg0);
    GetDiagnosticsRequest request = recordFactory
        .newRecordInstance(GetDiagnosticsRequest.class);
    request.setTaskAttemptId(attemptID);
    List<String> list = ((GetDiagnosticsResponse) invoke("getDiagnostics",
        GetDiagnosticsRequest.class, request)).getDiagnosticsList();
    String[] result = new String[list.size()];
    int i = 0;
    for (String c : list) {
      result[i++] = c.toString();
    }
    return result;
  }
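
  /**
   * Fetch the current status of the job, filling in the job file and
   * tracking URL when the report leaves them unset.
   */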
  public JobStatus getJobStatus(JobID oldJobID) throws IOException {
    org.apache.hadoop.mapreduce.v2.api.records.JobId jobId =
        TypeConverter.toYarn(oldJobID);
    GetJobReportRequest request =
        recordFactory.newRecordInstance(GetJobReportRequest.class);
    request.setJobId(jobId);
    JobReport report = ((GetJobReportResponse) invoke("getJobReport",
        GetJobReportRequest.class, request)).getJobReport();
    JobStatus jobStatus = null;
    if (report != null) {
      if (StringUtils.isEmpty(report.getJobFile())) {
        String jobFile = MRApps.getJobFile(conf, report.getUser(), oldJobID);
        report.setJobFile(jobFile);
      }
      String historyTrackingUrl = report.getTrackingUrl();
      String url = StringUtils.isNotEmpty(historyTrackingUrl)
          ? historyTrackingUrl : trackingUrl;
      jobStatus = TypeConverter.fromYarn(report, url);
    }
    return jobStatus;
  }

  public org.apache.hadoop.mapreduce.TaskReport[] getTaskReports(JobID oldJobID,
      TaskType taskType) throws IOException {
    org.apache.hadoop.mapreduce.v2.api.records.JobId jobId =
        TypeConverter.toYarn(oldJobID);
    GetTaskReportsRequest request =
        recordFactory.newRecordInstance(GetTaskReportsRequest.class);
    request.setJobId(jobId);
    request.setTaskType(TypeConverter.toYarn(taskType));
    List<org.apache.hadoop.mapreduce.v2.api.records.TaskReport> taskReports =
        ((GetTaskReportsResponse) invoke("getTaskReports",
            GetTaskReportsRequest.class, request)).getTaskReportList();
    return TypeConverter.fromYarn(taskReports)
        .toArray(new org.apache.hadoop.mapreduce.TaskReport[0]);
  }

  public boolean killTask(TaskAttemptID taskAttemptID, boolean fail)
      throws IOException {
    org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID =
        TypeConverter.toYarn(taskAttemptID);
    if (fail) {
      FailTaskAttemptRequest failRequest =
          recordFactory.newRecordInstance(FailTaskAttemptRequest.class);
      failRequest.setTaskAttemptId(attemptID);
      invoke("failTaskAttempt", FailTaskAttemptRequest.class, failRequest);
    } else {
      KillTaskAttemptRequest killRequest =
          recordFactory.newRecordInstance(KillTaskAttemptRequest.class);
      killRequest.setTaskAttemptId(attemptID);
      invoke("killTaskAttempt", KillTaskAttemptRequest.class, killRequest);
    }
    return true;
  }

  public boolean killJob(JobID oldJobID) throws IOException {
    org.apache.hadoop.mapreduce.v2.api.records.JobId jobId =
        TypeConverter.toYarn(oldJobID);
    KillJobRequest killRequest =
        recordFactory.newRecordInstance(KillJobRequest.class);
    killRequest.setJobId(jobId);
    invoke("killJob", KillJobRequest.class, killRequest);
    return true;
  }
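
  /**
   * Build the log location (container, application and node) for a completed
   * job, either for a specific task attempt or for the last AM attempt when
   * no task attempt is given. Fails for jobs that are still in progress.
   */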
  public LogParams getLogFilePath(JobID oldJobID,
      TaskAttemptID oldTaskAttemptID) throws IOException {
    org.apache.hadoop.mapreduce.v2.api.records.JobId jobId =
        TypeConverter.toYarn(oldJobID);
    GetJobReportRequest request =
        recordFactory.newRecordInstance(GetJobReportRequest.class);
    request.setJobId(jobId);
    JobReport report = ((GetJobReportResponse) invoke("getJobReport",
        GetJobReportRequest.class, request)).getJobReport();
    if (EnumSet.of(JobState.SUCCEEDED, JobState.FAILED, JobState.KILLED,
        JobState.ERROR).contains(report.getJobState())) {
      if (oldTaskAttemptID != null) {
        GetTaskAttemptReportRequest taRequest =
            recordFactory.newRecordInstance(GetTaskAttemptReportRequest.class);
        taRequest.setTaskAttemptId(TypeConverter.toYarn(oldTaskAttemptID));
        TaskAttemptReport taReport =
            ((GetTaskAttemptReportResponse) invoke("getTaskAttemptReport",
                GetTaskAttemptReportRequest.class, taRequest))
                .getTaskAttemptReport();
        if (taReport.getContainerId() == null
            || taReport.getNodeManagerHost() == null) {
          throw new IOException("Unable to get log information for task: "
              + oldTaskAttemptID);
        }
        return new LogParams(
            taReport.getContainerId().toString(),
            taReport.getContainerId().getApplicationAttemptId()
                .getApplicationId().toString(),
            NodeId.newInstance(taReport.getNodeManagerHost(),
                taReport.getNodeManagerPort()).toString(),
            report.getUser());
      } else {
        if (report.getAMInfos() == null || report.getAMInfos().size() == 0) {
          throw new IOException("Unable to get log information for job: "
              + oldJobID);
        }
        AMInfo amInfo = report.getAMInfos().get(report.getAMInfos().size() - 1);
        return new LogParams(
            amInfo.getContainerId().toString(),
            amInfo.getAppAttemptId().getApplicationId().toString(),
            NodeId.newInstance(amInfo.getNodeManagerHost(),
                amInfo.getNodeManagerPort()).toString(),
            report.getUser());
      }
    } else {
      throw new IOException("Cannot get log path for an in-progress job");
    }
  }
}