/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.net.SocketException;
import java.net.ServerSocket;
import java.net.URLEncoder;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.corona.CoronaConf;
import org.apache.hadoop.corona.SessionHistoryManager;
import org.apache.hadoop.corona.TFactoryBasedThreadPoolServer;
import org.apache.hadoop.corona.Utilities;
import org.apache.hadoop.corona.CoronaProxyJobTrackerService;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.thrift.server.TServer;
/**
* This is used to proxy HTTP requests to individual Corona Job Tracker web
* UIs.
* Also used for aggregating information about jobs such as job counters.
*/
public class ProxyJobTracker implements
JobHistoryObserver, CoronaJobAggregator, Updater,
CoronaProxyJobTrackerService.Iface {
/** Logger. */
private static final Log LOG = LogFactory.getLog(ProxyJobTracker.class);
static {
// NOTE(review): presumably installs a handler that terminates the process on
// an uncaught exception, going by the helper's name - confirm in Utilities.
Utilities.makeProcessExitOnUncaughtException(LOG);
}
/**
 * Local machine name. Set by the constructor to the host part of the
 * "mapred.job.tracker.corona.proxyaddr" setting.
 */
private static String LOCALMACHINE;
/** Http server port. Set by the constructor after the HTTP server starts. */
private static int LOCALPORT;
/**
 * Configuration. Static so that the static servlet class and
 * urlInJobHistory() can read it; assigned by the constructor.
 */
private static CoronaConf conf;
/**
 * Session History Manager. Created by the constructor and used by the
 * servlet to resolve a session's log path from a job id.
 */
private static SessionHistoryManager sessionHistoryManager;
/** Default clock, used by getClock() when no clock has been set. */
private static final Clock DEFAULTCLOCK = new Clock();
/** Clock. Null unless set; getClock() then falls back to DEFAULTCLOCK. */
private Clock clock = null;
/** Filesystem obtained from the configuration in the constructor. */
private FileSystem fs = null;
/** The HTTP server. */
private HttpServer infoServer;
/** The RPC server. */
private Server rpcServer;
/** Cache expiry. */
private ExpireUnusedFilesInCache expireUnusedFilesInCache;
/** Start time of the server, read from getClock() in the constructor. */
private long startTime;
/**
 * Aggregate job counters. Accumulated by reportJobStats() and flushed by
 * doUpdates(), both while synchronized on aggregateJobStats.
 */
private Counters aggregateCounters = new Counters();
/**
 * Aggregate job stats. Also serves as the monitor that reportJobStats() and
 * doUpdates() synchronize on.
 */
private JobStats aggregateJobStats = new JobStats();
/** Job Counters aggregated by pool. */
private Map<String, Counters> poolToJobCounters =
new HashMap<String, Counters>();
/** Job Stats aggregated by pool. */
private Map<String, JobStats> poolToJobStats =
new HashMap<String, JobStats>();
/** Metrics context. */
private MetricsContext context;
/** Metrics record for the cluster-wide aggregates. */
private MetricsRecord metricsRecord;
/** Is the Cluster Manager in Safe Mode? */
private volatile boolean clusterManagerSafeMode;
/**
 * Metrics records for pools, keyed by pool name. An entry is created the
 * first time reportJobStats() sees a pool.
 */
private Map<String, MetricsRecord> poolToMetricsRecord =
new HashMap<String, MetricsRecord>();
/*
 * This is the thrift server thread. It stays null when the constructor fails
 * to start the Thrift server, because that IOException is caught and logged.
 */
private TServerThread server;
/**
 * Thread whose only job is to run the serve loop of the Thrift server it
 * wraps. Any exception escaping the loop is logged and ends the thread.
 */
public class TServerThread extends Thread {
  /** The Thrift server driven by this thread. */
  private TServer thriftServer;

  /**
   * @param server the Thrift server to run when this thread is started
   */
  public TServerThread(TServer server) {
    thriftServer = server;
  }

  @Override
  public void run() {
    try {
      thriftServer.serve();
    } catch (Exception e) {
      LOG.info("Got an exception: ", e);
    }
  }
}
/**
 * Metrics callback: pushes the cluster-wide and per-pool job stats and
 * counters into their metrics records and resets the aggregates.
 *
 * @param unused the metrics context supplied by the caller; not read here
 */
@Override
public void doUpdates(MetricsContext unused) {
  // Same monitor as reportJobStats(), so a report cannot interleave with a
  // flush.
  synchronized (aggregateJobStats) {
    // Cluster-wide aggregates first.
    aggregateJobStats.incrementMetricsAndReset(metricsRecord);
    incrementMetricsAndReset(metricsRecord, aggregateCounters);

    // Then every pool that has a metrics record.
    for (String pool : poolToMetricsRecord.keySet()) {
      MetricsRecord poolRecord = poolToMetricsRecord.get(pool);
      poolToJobStats.get(pool).incrementMetricsAndReset(poolRecord);
      incrementMetricsAndReset(poolRecord, poolToJobCounters.get(pool));
    }
  }
}
/**
 * Adds every counter value to the metrics record, zeroes the counters and
 * then calls update() on the record.
 *
 * The metric name is "group_counter" with every character outside
 * [a-zA-Z_] replaced by '_' and the result lower-cased.
 *
 * @param record the metrics record to increment
 * @param counters the counters to publish and reset
 */
private static void incrementMetricsAndReset(
    MetricsRecord record, Counters counters) {
  // First pass: publish the current values.
  for (Counters.Group counterGroup : counters) {
    final String prefix = counterGroup.getName() + "_";
    for (Counter current : counterGroup) {
      String rawName = prefix + current.getName();
      String metricName =
        rawName.replaceAll("[^a-zA-Z_]", "_").toLowerCase();
      record.incrMetric(metricName, current.getValue());
    }
  }
  // Second pass: clear the aggregates now that they have been published.
  for (Counters.Group counterGroup : counters) {
    for (Counter current : counterGroup) {
      current.setValue(0);
    }
  }
  record.update();
}
/**
* Servlet to handle requests.
*/
public static class ProxyJobTrackerServlet extends HttpServlet {
/**
 * Logs that the servlet has been initialized, then delegates to the
 * superclass initialization.
 */
@Override
public void init() throws ServletException {
  final String servletName = this.getClass().getName();
  LOG.info("Initialized " + servletName);
  super.init();
}
/**
 * Forwards a POST request to http://host:port/path, where host, port and
 * path come from the request parameters. All other request parameters are
 * passed on as POST parameters, and the upstream status and body are copied
 * into the response.
 *
 * A ConnectException, or a SocketException that looks like a dropped
 * connection, produces a "job tracker is gone" message instead of an error.
 *
 * @param request the incoming request; must carry host, port and path
 * @param response the response the upstream reply is copied into
 * @throws IOException if forwarding fails for any other I/O reason
 */
@Override
protected void doPost(
    HttpServletRequest request, HttpServletResponse response)
  throws IOException {
  String destination = "";
  String host = request.getParameter("host");
  String port = request.getParameter("port");
  String path = request.getParameter("path");
  if (host == null || port == null || path == null) {
    response.sendError(HttpServletResponse.SC_BAD_REQUEST,
      "Missing mandatory host, port and/or path parameters");
    return;
  }
  HttpClient httpclient = null;
  PostMethod method = null;
  try {
    destination = "http://" + host + ":" + port + "/" + path;
    method = new PostMethod(destination);
    for (Enumeration<?> e = request.getParameterNames();
         e.hasMoreElements();) {
      String key = (String) e.nextElement();
      // The routing parameters are consumed here, not forwarded.
      if (key.equals("host") || key.equals("port") || key.equals("path")) {
        continue;
      }
      method.addParameter(key, request.getParameter(key));
    }
    httpclient = new HttpClient();
    int statusCode = httpclient.executeMethod(method);
    response.setStatus(statusCode);
    response.setContentType("text/html");
    // The stream is null when the upstream reply has no body.
    InputStream is = method.getResponseBodyAsStream();
    if (is != null) {
      int len = 0;
      int bufferSize = 4096;
      byte[] buf = new byte[bufferSize];
      while ((len = is.read(buf)) >= 0) {
        response.getOutputStream().write(buf, 0, len);
      }
    }
    if (statusCode != HttpServletResponse.SC_OK) {
      LOG.warn("Status " + statusCode + " forwarding request to: " +
        destination);
    }
  } catch (ConnectException ce) {
    handleDeadJobTracker(response, host, port);
  } catch (SocketException se) {
    checkDeadJobTracker(se, response, host, port);
  } catch (IOException e) {
    LOG.warn("Exception forwarding request to: " + destination);
    throw e;
  } finally {
    // Hand the connection back, then close it: the client is used for this
    // one request only, so a kept-alive socket would just linger until GC.
    if (method != null) {
      method.releaseConnection();
    }
    if (httpclient != null) {
      httpclient.getHttpConnectionManager().closeIdleConnections(0);
    }
  }
}
/**
 * Serves a job page, either from job history or by proxying to the job
 * tracker.
 *
 * The history file location is taken from "jobhistoryfileloc" if present.
 * Otherwise it is derived as [history dir]/done/[jobid], where the history
 * dir comes from "historydir" (relative to the sessions log dir) or from the
 * session encoded in "jobid". If that file exists, the request is served
 * from the local job history page. Otherwise it is forwarded to
 * http://host:port/path with the remaining parameters as the query string.
 *
 * A ConnectException, or a SocketException that looks like a dropped
 * connection, produces a "job tracker is gone" message instead of an error.
 *
 * @param request the incoming request
 * @param response the response the upstream reply is copied into
 * @throws ServletException declared by the servlet contract
 * @throws IOException if forwarding fails for any other I/O reason
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response)
  throws ServletException, IOException {
  String methodString = null;
  String host = request.getParameter("host");
  String port = request.getParameter("port");
  String path = request.getParameter("path");
  HttpClient httpclient = null;
  HttpMethod method = null;
  try {
    String jobIDParam = request.getParameter("jobid");
    // Check if the full path is given.
    String jobHistoryFileLocParam =
      request.getParameter("jobhistoryfileloc");
    Path jobHistoryFileLocation = jobHistoryFileLocParam == null ?
      null : new Path(jobHistoryFileLocParam);
    if (jobHistoryFileLocation == null) {
      // If the full path is not given, check the history directory.
      String historyDirParam = request.getParameter("historydir");
      Path historyDir = null;
      if (historyDirParam != null) {
        historyDir = new Path(conf.getSessionsLogDir(), historyDirParam);
      } else if (jobIDParam != null) {
        // Infer the history location from the job id.
        JobID jobID = JobID.forName(jobIDParam);
        String sessionId = jobID.getJtIdentifier();
        historyDir =
          new Path(sessionHistoryManager.getLogPath(sessionId));
      }
      // The file name is the job id, so without one there is nothing to
      // look up; fall through to proxying rather than failing.
      if (historyDir != null && jobIDParam != null) {
        Path doneDir = new Path(historyDir, "done");
        jobHistoryFileLocation = new Path(doneDir, jobIDParam);
      }
    }
    // Check if the job is in the job history.
    methodString = (jobHistoryFileLocation == null) ?
      null :
      urlInJobHistory(jobHistoryFileLocation, jobIDParam);
    // If it is not in the job history, go directly to the job tracker to
    // retrieve the job information; otherwise load the job history page.
    if (methodString == null) {
      if (host == null || port == null || path == null) {
        response.sendError(HttpServletResponse.SC_BAD_REQUEST,
          "Missing mandatory host, port and/or path parameters");
        return;
      }
      LOG.info("history file: " + jobHistoryFileLocation +
        " is not in jobhistory");
      StringBuilder sb = new StringBuilder("http://");
      sb.append(host).append(":").append(port).append("/").append(path);
      Map<String, String []> m = request.getParameterMap();
      boolean firstArg = true;
      for (Map.Entry<String, String []> e: m.entrySet()) {
        String key = e.getKey();
        // Also ignore jobhistoryfileloc: it is only used when the job is
        // done and its log is in the job history.
        if (key.equals("host") || key.equals("path") ||
            key.equals("port") || key.equals("jobhistoryfileloc")) {
          continue;
        }
        sb.append(firstArg ? '?' : '&');
        firstArg = false;
        // The container has already decoded the parameters, so they must be
        // re-encoded to form a valid query string.
        sb.append(URLEncoder.encode(key, "UTF-8"))
          .append('=')
          .append(URLEncoder.encode(e.getValue()[0], "UTF-8"));
      }
      methodString = sb.toString();
    }
    httpclient = new HttpClient();
    method = new GetMethod(methodString);
    int sc = httpclient.executeMethod(method);
    response.setStatus(sc);
    response.setContentType("text/html");
    // The stream is null when the upstream reply has no body.
    InputStream is = method.getResponseBodyAsStream();
    if (is != null) {
      int len = 0;
      int bufferSize = 4096;
      byte[] buf = new byte[bufferSize];
      while ((len = is.read(buf)) >= 0) {
        response.getOutputStream().write(buf, 0, len);
      }
    }
    if (sc != HttpServletResponse.SC_OK) {
      LOG.warn("Status " + sc + " forwarding request to: " + methodString);
    }
  } catch (ConnectException ce) {
    handleDeadJobTracker(response, host, port);
  } catch (SocketException se) {
    checkDeadJobTracker(se, response, host, port);
  } catch (IOException e) {
    LOG.warn("Exception forwarding request to: " + methodString);
    throw e;
  } finally {
    // Hand the connection back, then close it: the client is used for this
    // one request only, so a kept-alive socket would just linger until GC.
    if (method != null) {
      method.releaseConnection();
    }
    if (httpclient != null) {
      httpclient.getHttpConnectionManager().closeIdleConnections(0);
    }
  }
}
/**
 * Check if the job tracker could be dead based on the exception
 * encountered and provide a helpful message in the response.
 * The job tracker is treated as dead when the exception message contains
 * "Broken pipe" or "Connection reset"; any other exception, including one
 * without a message, is rethrown unchanged.
 * @param e the exception.
 * @param response the HTTP response
 * @param host The host of the job tracker.
 * @param port the port of the job tracker.
 * @throws IOException the original exception if it does not indicate a dead
 *         job tracker, or a failure while writing the response
 */
private void checkDeadJobTracker(
    IOException e,
    HttpServletResponse response,
    String host,
    String port) throws IOException {
  // getMessage() may be null; without this guard an NPE would replace the
  // real exception.
  String message = e.getMessage();
  boolean connectionDropped = message != null &&
    (message.contains("Broken pipe") ||
     message.contains("Connection reset"));
  if (connectionDropped) {
    handleDeadJobTracker(response, host, port);
  } else {
    throw e;
  }
}
/**
 * Provide a helpful message in the response after the job tracker has
 * been determined to be dead.
 * The message is written as UTF-8 plain text, because host and port come
 * straight from request parameters and must not be interpreted as markup.
 * @param response the HTTP response
 * @param host The host of the job tracker.
 * @param port the port of the job tracker.
 * @throws IOException if the message cannot be written to the response
 */
private void handleDeadJobTracker(
    HttpServletResponse response,
    String host,
    String port) throws IOException {
  String msg =
    "Could not connect to Job Tracker at " + host + ":" + port +
    ". The job may have completed or been killed. Please go to the " +
    "Cluster Manager UI and click on your job again, or retry the " +
    "tracking URL if available.";
  // Declare the content type and charset explicitly so the bytes written
  // below match what the client is told to expect.
  response.setContentType("text/plain; charset=UTF-8");
  byte[] msgBytes = msg.getBytes("UTF-8");
  response.getOutputStream().write(msgBytes, 0, msgBytes.length);
}
}
/**
 * @return the port reported by the HTTP info server
 */
public int getPort() {
  final int httpPort = infoServer.getPort();
  return httpPort;
}
/**
 * @return the port of the RPC server's listener address
 */
public int getRpcPort() {
  InetSocketAddress listenerAddress = rpcServer.getListenerAddress();
  return listenerAddress.getPort();
}
/**
 * @return the local machine name recorded when the proxy job tracker was
 *         constructed
 */
public String getProxyJobTrackerMachine() {
  final String machine = LOCALMACHINE;
  return machine;
}
/**
 * @return the clock of this instance, or the shared default clock when
 *         none has been set
 */
Clock getClock() {
  if (clock != null) {
    return clock;
  }
  return DEFAULTCLOCK;
}
/**
 * Intentionally does nothing in this class.
 * NOTE(review): presumably the JobHistoryObserver callback - confirm against
 * that interface.
 * @param jobid the job whose history file was copied; unused
 * @param historyFile the history file location; unused
 */
public void historyFileCopied(JobID jobid, String historyFile) {
}
/**
 * Creates the proxy job tracker and starts its servers: the RPC server, the
 * HTTP info server with the /proxy servlet, and the Thrift server.
 *
 * Everything a request handler reads is initialized before the server that
 * can invoke that handler is started: the metrics context and record before
 * the RPC server, and the session history manager before the HTTP server.
 *
 * Failure to start the Thrift server is logged and otherwise ignored; the
 * Thrift server thread is then left unset.
 *
 * @param conf the Corona configuration; also stored in the static
 *        configuration field shared with the servlet
 * @throws IOException if the filesystem, the RPC server or the HTTP server
 *         cannot be set up
 */
public ProxyJobTracker(CoronaConf conf) throws IOException {
  // conf is a static field; assign it through the class, not "this".
  ProxyJobTracker.conf = conf;
  fs = FileSystem.get(conf);
  String infoAddr =
    conf.get("mapred.job.tracker.corona.proxyaddr", "0.0.0.0:0");
  InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
  String infoBindAddress = infoSocAddr.getHostName();
  int port = infoSocAddr.getPort();
  LOCALMACHINE = infoBindAddress;
  startTime = getClock().getTime();

  // reportJobStats() dereferences the metrics context, so it must exist
  // before the RPC server can deliver a call.
  context = MetricsUtil.getContext("mapred");
  metricsRecord = MetricsUtil.createRecord(context, "proxyjobtracker");

  CoronaConf coronaConf = new CoronaConf(conf);
  InetSocketAddress rpcSockAddr = NetUtils.createSocketAddr(
    coronaConf.getProxyJobTrackerAddress());
  rpcServer = RPC.getServer(
    this,
    rpcSockAddr.getHostName(),
    rpcSockAddr.getPort(),
    conf.getInt("corona.proxy.job.tracker.handler.count", 10),
    false,
    conf);
  rpcServer.start();
  LOG.info("ProxyJobTracker RPC Server up at " +
    rpcServer.getListenerAddress());

  infoServer = new HttpServer("proxyjt", infoBindAddress, port,
    port == 0, conf);
  infoServer.setAttribute("proxy.job.tracker", this);
  infoServer.setAttribute("conf", conf);
  infoServer.addServlet("proxy", "/proxy",
    ProxyJobTrackerServlet.class);

  // initialize history parameters.
  JobConf jobConf = new JobConf(conf);
  boolean historyInitialized = JobHistory.init(
    this, jobConf, LOCALMACHINE, this.startTime);
  if (historyInitialized) {
    JobHistory.initDone(jobConf, fs);
    String historyLogDir =
      JobHistory.getCompletedJobHistoryLocation().toString();
    FileSystem historyFS = new Path(historyLogDir).getFileSystem(conf);
    infoServer.setAttribute("historyLogDir", historyLogDir);
    infoServer.setAttribute("fileSys", historyFS);
  }

  // The servlet dereferences sessionHistoryManager, so it must exist before
  // the HTTP server accepts requests.
  sessionHistoryManager = new SessionHistoryManager();
  sessionHistoryManager.setConf(conf);

  infoServer.start();
  LOCALPORT = infoServer.getPort();

  context.registerUpdater(this);

  expireUnusedFilesInCache = new ExpireUnusedFilesInCache(
    conf, getClock(), new Path(getSystemDir()), fs);

  ServerSocket serverSocket = null;
  try {
    String target = conf.getProxyJobTrackerThriftAddress();
    InetSocketAddress addr = NetUtils.createSocketAddr(target);
    LOG.info("Trying to start the Thrift Server at: " + target);
    serverSocket = new ServerSocket(addr.getPort());
    server = new TServerThread(
      TFactoryBasedThreadPoolServer.createNewServer(
        new CoronaProxyJobTrackerService.Processor(this),
        serverSocket,
        5000));
    server.start();
    LOG.info("Thrift server started on: " + target);
  } catch (IOException e) {
    LOG.info("Exception while starting the Thrift Server on CPJT: ", e);
    // If the socket was bound but never handed to a running server thread,
    // close it so the port is not held by a server that will never serve.
    if (server == null && serverSocket != null) {
      try {
        serverSocket.close();
      } catch (IOException closeFailure) {
        LOG.info("Could not close the Thrift server socket: ", closeFailure);
      }
    }
  }
}
/**
 * Folds the stats and counters of one job into the cluster-wide aggregates
 * and into the aggregates of the job's pool. The pool's entries, including
 * its metrics record, are created the first time the pool is seen.
 *
 * @param jobId the reporting job; not read here
 * @param pool the pool the job ran in
 * @param stats the job's stats
 * @param counters the job's counters
 */
@Override
public void reportJobStats(
    String jobId, String pool, JobStats stats, Counters counters) {
  // Same monitor as doUpdates(), so a flush cannot interleave with a report.
  synchronized (aggregateJobStats) {
    // Stats: cluster-wide first, then the pool.
    aggregateJobStats.accumulate(stats);
    JobStats statsForPool = poolToJobStats.get(pool);
    if (statsForPool == null) {
      statsForPool = new JobStats();
      poolToJobStats.put(pool, statsForPool);
    }
    statsForPool.accumulate(stats);

    // Counters: cluster-wide first, then the pool.
    accumulateCounters(aggregateCounters, counters);
    Counters countersForPool = poolToJobCounters.get(pool);
    if (countersForPool == null) {
      countersForPool = new Counters();
      poolToJobCounters.put(pool, countersForPool);
    }
    accumulateCounters(countersForPool, counters);

    // Nothing more to do when the pool already has a metrics record.
    if (poolToMetricsRecord.containsKey(pool)) {
      return;
    }
    poolToMetricsRecord.put(pool, context.createRecord("pool-" + pool));
  }
}
/**
 * Adds the values of the job-level counters, the task-level counters and
 * the filesystem counter group of one counter set onto another.
 *
 * @param aggregate the counters being added to
 * @param increment the counters whose values are added
 */
private static void accumulateCounters(
    Counters aggregate, Counters increment) {
  // Job-level counters.
  for (JobInProgress.Counter jobKey : JobInProgress.Counter.values()) {
    Counter source = increment.findCounter(jobKey);
    if (source == null) {
      continue;
    }
    aggregate.findCounter(jobKey).increment(source.getValue());
  }
  // Task-level counters.
  for (Task.Counter taskKey : Task.Counter.values()) {
    Counter source = increment.findCounter(taskKey);
    if (source == null) {
      continue;
    }
    aggregate.findCounter(taskKey).increment(source.getValue());
  }
  // Filesystem counters are looked up by group and counter name.
  final String fsGroup = Task.FILESYSTEM_COUNTER_GROUP;
  for (Counters.Counter fsCounter : increment.getGroup(fsGroup)) {
    aggregate.incrCounter(
      fsGroup, fsCounter.getName(), fsCounter.getValue());
  }
}
/**
 * Returns the version of the CoronaJobAggregator protocol.
 *
 * @param protocol the fully qualified protocol class name
 * @param clientVersion the client's protocol version; not read here
 * @return the CoronaJobAggregator version id
 * @throws IOException if the protocol is not CoronaJobAggregator
 */
@Override
public long getProtocolVersion(String protocol, long clientVersion)
  throws IOException {
  final String supported = CoronaJobAggregator.class.getName();
  if (!protocol.equals(supported)) {
    throw new IOException("Unknown protocol " + protocol);
  }
  return CoronaJobAggregator.versionID;
}
/**
 * Delegates to ProtocolSignature to build the signature for this server.
 *
 * @param protocol the protocol class name
 * @param clientVersion the client's protocol version
 * @param clientMethodsHash the hash of the client's methods
 * @return the protocol signature computed by ProtocolSignature
 * @throws IOException if the signature cannot be computed
 */
@Override
public ProtocolSignature getProtocolSignature(
    String protocol,
    long clientVersion,
    int clientMethodsHash) throws IOException {
  ProtocolSignature signature = ProtocolSignature.getProtocolSignature(
    this, protocol, clientVersion, clientMethodsHash);
  return signature;
}
/**
 * Used by the CM to tell the CPJT if it's in Safe Mode. The stored flag is
 * logged after it has been set.
 *
 * @param safeMode true if the Cluster Manager is in Safe Mode
 */
@Override
public void setClusterManagerSafeModeFlag(boolean safeMode) {
  this.clusterManagerSafeMode = safeMode;
  String message = "On ProxyJobTracker, clusterManagerSafeModeFlag: " +
    this.clusterManagerSafeMode;
  LOG.info(message);
}
/**
 * Has the CM gone into Safe Mode and told the CPJT about it?
 *
 * @return the flag most recently stored by setClusterManagerSafeModeFlag
 */
@Override
public boolean getClusterManagerSafeModeFlag() {
  final boolean inSafeMode = this.clusterManagerSafeMode;
  return inSafeMode;
}
/**
 * Blocks until the HTTP server, the RPC server and, if it was started, the
 * Thrift server thread have finished.
 *
 * @throws InterruptedException if the calling thread is interrupted while
 *         waiting
 */
public void join() throws InterruptedException {
  infoServer.join();
  rpcServer.join();
  // The Thrift thread is absent when its startup failed in the constructor,
  // where that failure is only logged.
  if (server != null) {
    server.join();
  }
}
/**
 * Creates a proxy job tracker; its constructor starts the servers.
 *
 * @param conf the Corona configuration
 * @return the newly constructed proxy job tracker
 * @throws IOException if construction fails
 */
public static ProxyJobTracker startProxyTracker(CoronaConf conf)
  throws IOException {
  return new ProxyJobTracker(conf);
}
/**
 * Given the path to the job history file, check if the file already exists.
 * 1. If a FileNotFoundException is caught, the history file is not at that
 *    location (for example because the job is not yet finished), and null
 *    is returned.
 * 2. Otherwise the history file was found, and the URL of the
 *    coronajobdetailshistory page on this server is returned.
 * Both query values are URL-encoded as UTF-8.
 * @param jobHistoryFileLocation the expected location of the history file
 * @param jobId the job id to put into the URL
 * @return the URL if the history file exists, null if it cannot be found
 * @throws IOException if the filesystem lookup fails for any reason other
 *         than the file being absent
 */
public static String urlInJobHistory(
    Path jobHistoryFileLocation, String jobId)
  throws IOException {
  try {
    FileSystem fs = jobHistoryFileLocation.getFileSystem(conf);
    // Only the existence check matters; the returned status is not used.
    fs.getFileStatus(jobHistoryFileLocation);
  } catch (FileNotFoundException e) {
    return null;
  }
  // String.valueOf keeps the previous "null" rendering for a missing job id.
  String encodedJobId = URLEncoder.encode(String.valueOf(jobId), "UTF-8");
  String encodedLogFile =
    URLEncoder.encode(jobHistoryFileLocation.toString(), "UTF-8");
  return "http://" + LOCALMACHINE + ":" + LOCALPORT +
    "/coronajobdetailshistory.jsp?jobid=" + encodedJobId +
    "&logFile=" + encodedLogFile;
}
/**
 * @return the system directory as computed by CoronaJobTracker for this
 *         instance's filesystem and the shared configuration
 */
public String getSystemDir() {
  final String systemDir = CoronaJobTracker.getSystemDir(fs, conf);
  return systemDir;
}
/**
 * Entry point: logs the startup message, starts the proxy job tracker with
 * a default configuration and waits for it to finish. Interruptions while
 * waiting are logged and the wait is resumed.
 *
 * @param argv command line arguments, passed to the startup message
 * @throws IOException if the proxy job tracker cannot be started
 */
public static void main(String[] argv) throws IOException {
  StringUtils.startupShutdownMessage(ProxyJobTracker.class, argv, LOG);
  CoronaConf coronaConf = new CoronaConf(new Configuration());
  ProxyJobTracker tracker = startProxyTracker(coronaConf);
  // Keep waiting until join() returns normally.
  while (true) {
    try {
      tracker.join();
      break;
    } catch (InterruptedException e) {
      LOG.warn("Ignoring InterruptedException");
    }
  }
}
}