/**
* CopyRight by Chinamobile
*
* BSPJobClient.java
*/
package com.chinamobile.bcbsp.client;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import javax.security.auth.login.LoginException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UnixUserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.chinamobile.bcbsp.Constants;
import com.chinamobile.bcbsp.BSPConfiguration;
import com.chinamobile.bcbsp.bspcontroller.BSPController;
import com.chinamobile.bcbsp.bspcontroller.ClusterStatus;
import com.chinamobile.bcbsp.fault.storage.Fault;
import com.chinamobile.bcbsp.rpc.JobSubmissionProtocol;
import com.chinamobile.bcbsp.util.BSPJobID;
import com.chinamobile.bcbsp.util.BSPJob;
import com.chinamobile.bcbsp.util.JobProfile;
import com.chinamobile.bcbsp.util.JobStatus;
import com.chinamobile.bcbsp.util.StaffAttemptID;
import com.chinamobile.bcbsp.util.StaffStatus;
/**
* BSPJobClient
*
* BSPJobClient is the primary interface for the user-job to interact with the
* BSPController.
*
* BSPJobClient provides facilities to submit jobs, track their progress, access
* component-staffs' reports/logs, get the BC-BSP cluster status information
* etc.
*
* @author
* @version
*/
public class BSPJobClient extends Configured implements Tool {
/** Logger used by this client and its nested classes. */
private static final Log LOG = LogFactory.getLog(BSPJobClient.class);

/** Staff status filter values. */
public enum StaffStatusFilter {
  NONE,
  KILLED,
  FAILED,
  SUCCEEDED,
  ALL
}

/**
 * Maximum age, in milliseconds, of a cached job status before
 * {@code NetworkedJob.ensureFreshStatus()} fetches a new one.
 */
private static final long MAX_JOBPROFILE_AGE = 2 * 1000L;
/**
 * Handle to a job that lives on the BSPController.
 *
 * The job profile is fetched once at construction; the job status is cached
 * and re-fetched over RPC either on demand ({@link #updateStatus()}) or when
 * it is older than {@code MAX_JOBPROFILE_AGE} milliseconds
 * ({@link #ensureFreshStatus()}). All accesses to the cached status go
 * through the instance lock so that readers see the value written by the
 * most recent refresh.
 */
public class NetworkedJob implements RunningJob {
  JobProfile profile;
  JobStatus status;
  long statustime;

  /**
   * Wraps an already-fetched status and loads the matching job profile
   * from the controller.
   *
   * @param job the status of the job to track
   * @throws IOException if the profile cannot be fetched
   */
  public NetworkedJob(JobStatus job) throws IOException {
    this.status = job;
    this.profile = jobSubmitClient.getJobProfile(job.getJobID());
    this.statustime = System.currentTimeMillis();
  }

  /**
   * Refreshes the cached status if it is older than
   * {@code MAX_JOBPROFILE_AGE} milliseconds.
   *
   * @throws IOException if the refresh fails
   */
  synchronized void ensureFreshStatus() throws IOException {
    if (System.currentTimeMillis() - statustime > MAX_JOBPROFILE_AGE) {
      updateStatus();
    }
  }

  /**
   * Refreshes the cached status unconditionally.
   *
   * @throws IOException if the refresh fails
   */
  synchronized void updateStatus() throws IOException {
    this.status = jobSubmitClient.getJobStatus(profile.getJobID());
    this.statustime = System.currentTimeMillis();
  }

  /** @see RunningJob#getID() */
  @Override
  public BSPJobID getID() {
    return profile.getJobID();
  }

  /** @see RunningJob#getJobName() */
  @Override
  public String getJobName() {
    return profile.getJobName();
  }

  /** @see RunningJob#getJobFile() */
  @Override
  public String getJobFile() {
    return profile.getJobFile();
  }

  /** Returns the progress value of a reasonably fresh status. */
  @Override
  public synchronized long progress() throws IOException {
    ensureFreshStatus();
    return status.progress();
  }

  /**
   * Fetches the current status and reports whether the job has reached
   * one of the states SUCCEEDED, FAILED or KILLED.
   */
  @Override
  public synchronized boolean isComplete() throws IOException {
    updateStatus();
    // Read the run state once so all three comparisons see the same value.
    int runState = status.getRunState();
    return runState == JobStatus.SUCCEEDED
        || runState == JobStatus.FAILED
        || runState == JobStatus.KILLED;
  }

  /** Checks the cached status (no refresh) for SUCCEEDED. */
  @Override
  public synchronized boolean isSuccessful() throws IOException {
    return status.getRunState() == JobStatus.SUCCEEDED;
  }

  /** Checks the cached status (no refresh) for KILLED. */
  @Override
  public synchronized boolean isKilled() throws IOException {
    return status.getRunState() == JobStatus.KILLED;
  }

  /** Checks the cached status (no refresh) for FAILED. */
  @Override
  public synchronized boolean isFailed() throws IOException {
    return status.getRunState() == JobStatus.FAILED;
  }

  /** Checks the cached status (no refresh) for RECOVERY. */
  @Override
  public synchronized boolean isRecovery() throws IOException {
    return status.getRunState() == JobStatus.RECOVERY;
  }

  /** Returns the superstep count of a reasonably fresh status. */
  public synchronized long getSuperstepCount() throws IOException {
    ensureFreshStatus();
    return status.getSuperstepCount();
  }

  /**
   * Blocks until the job is finished, polling every five seconds.
   *
   * @throws IOException if a status refresh fails, or (as
   *         {@link java.io.InterruptedIOException}) if the waiting thread
   *         is interrupted; the interrupt flag is restored in that case
   */
  public void waitForCompletion() throws IOException {
    while (!isComplete()) {
      try {
        Thread.sleep(5000);
      } catch (InterruptedException ie) {
        // Do not swallow the interrupt: that would make this wait
        // impossible to cancel.
        Thread.currentThread().interrupt();
        java.io.InterruptedIOException interrupted =
            new java.io.InterruptedIOException(
                "Interrupted while waiting for job " + getID()
                    + " to complete");
        interrupted.initCause(ie);
        throw interrupted;
      }
    }
  }

  /**
   * Fetches the current status from the controller and returns its run
   * state.
   */
  public synchronized int getJobState() throws IOException {
    updateStatus();
    return status.getRunState();
  }

  /**
   * Tells the service to terminate the current job.
   */
  public synchronized void killJob() throws IOException {
    jobSubmitClient.killJob(getID());
  }

  /** Forwards the kill request for a single staff attempt to the service. */
  @Override
  public void killStaff(StaffAttemptID staffId, boolean shouldFail)
      throws IOException {
    jobSubmitClient.killStaff(staffId, shouldFail);
  }
}
/**
 * Orders input splits by descending length, so that the longest split
 * sorts first. A failure to read a split length is rethrown as a
 * {@link RuntimeException} carrying the original cause.
 */
private static class NewSplitComparator implements
    Comparator<org.apache.hadoop.mapreduce.InputSplit> {
  @Override
  public int compare(org.apache.hadoop.mapreduce.InputSplit o1,
      org.apache.hadoop.mapreduce.InputSplit o2) {
    long firstLength;
    long secondLength;
    try {
      firstLength = o1.getLength();
      secondLength = o2.getLength();
    } catch (IOException ie) {
      throw new RuntimeException("exception in compare", ie);
    } catch (InterruptedException ie) {
      throw new RuntimeException("exception in compare", ie);
    }
    if (firstLength > secondLength) {
      return -1;
    }
    if (firstLength < secondLength) {
      return 1;
    }
    return 0;
  }
}
/**
 * Serialized form of one input split: the split's class name, its data
 * length, the serialized split bytes and the split's locations.
 *
 * Wire format (see {@link #write(DataOutput)}): class name as a Text
 * string, data length as a long, the bytes as a {@link BytesWritable}, a
 * vint location count, then each location as a Text string.
 */
public static class RawSplit implements Writable {
  private String splitClass;
  private BytesWritable bytes = new BytesWritable();
  private String[] locations;
  long dataLength;

  /** Copies {@code length} bytes of {@code data} starting at {@code offset}. */
  public void setBytes(byte[] data, int offset, int length) {
    if (bytes == null) {
      // clearBytes() dropped the buffer; start a new one.
      bytes = new BytesWritable();
    }
    bytes.set(data, offset, length);
  }

  public void setClassName(String className) {
    splitClass = className;
  }

  public String getClassName() {
    return splitClass;
  }

  /** Returns the serialized split, or {@code null} after {@link #clearBytes()}. */
  public BytesWritable getBytes() {
    return bytes;
  }

  /** Drops the reference to the serialized split bytes. */
  public void clearBytes() {
    bytes = null;
  }

  public void setLocations(String[] locations) {
    this.locations = locations;
  }

  public String[] getLocations() {
    return locations;
  }

  /**
   * Reads all fields in the order written by {@link #write(DataOutput)}.
   *
   * @throws IOException if the underlying input fails
   */
  public void readFields(DataInput in) throws IOException {
    splitClass = Text.readString(in);
    dataLength = in.readLong();
    if (bytes == null) {
      // Allow reuse of an instance whose bytes were cleared earlier.
      bytes = new BytesWritable();
    }
    bytes.readFields(in);
    int len = WritableUtils.readVInt(in);
    locations = new String[len];
    for (int i = 0; i < len; ++i) {
      locations[i] = Text.readString(in);
    }
  }

  /**
   * Writes all fields. Unset locations are written as an empty list.
   *
   * @throws IOException if the bytes were cleared with
   *         {@link #clearBytes()}, or if the underlying output fails
   */
  public void write(DataOutput out) throws IOException {
    if (bytes == null) {
      throw new IOException(
          "Cannot serialize a RawSplit whose bytes have been cleared");
    }
    Text.writeString(out, splitClass);
    out.writeLong(dataLength);
    bytes.write(out);
    int locationCount = (locations == null) ? 0 : locations.length;
    WritableUtils.writeVInt(out, locationCount);
    for (int i = 0; i < locationCount; i++) {
      Text.writeString(out, locations[i]);
    }
  }

  public long getDataLength() {
    return dataLength;
  }

  public void setDataLength(long l) {
    dataLength = l;
  }
}
/** RPC proxy to the BSPController; stays null until init(Configuration) runs. */
private JobSubmissionProtocol jobSubmitClient = null;

/** System directory reported by the controller; cached by getSystemDir(). */
private Path sysDir = null;

/** File system of the system directory; cached by getFs(). */
private FileSystem fs = null;

/** Job files are world readable and owner writable (rw-r--r--). */
private static final FsPermission JOB_FILE_PERMISSION =
    FsPermission.createImmutable((short) 0644);

/** Job submission directory is world readable/writable/executable (rwxrwxrwx). */
static final FsPermission JOB_DIR_PERMISSION =
    FsPermission.createImmutable((short) 0777);
/**
 * Creates a client that stores {@code conf} and immediately opens the RPC
 * proxy to the BSPController through {@link #init(Configuration)}.
 *
 * @param conf configuration used to locate the controller
 * @throws IOException if the proxy cannot be created
 */
public BSPJobClient(Configuration conf) throws IOException {
  this.setConf(conf);
  this.init(conf);
}

/**
 * Creates a client without a configuration and without an RPC proxy;
 * {@link #init(Configuration)} has to be called before the client is used.
 */
public BSPJobClient() {
}
/**
 * Opens the {@link JobSubmissionProtocol} RPC proxy to the controller
 * address obtained from {@code conf} and stores it in this client.
 *
 * @param conf configuration used to resolve the controller address and the
 *        socket factory
 * @throws IOException if the proxy cannot be created
 */
public void init(Configuration conf) throws IOException {
  final Class<JobSubmissionProtocol> protocol = JobSubmissionProtocol.class;
  this.jobSubmitClient = (JobSubmissionProtocol) RPC.getProxy(
      protocol,
      JobSubmissionProtocol.versionID,
      BSPController.getAddress(conf),
      conf,
      NetUtils.getSocketFactory(conf, protocol));
}
/**
 * Closes this client by stopping its RPC proxy to the controller.
 *
 * @throws IOException declared for callers; see {@code RPC.stopProxy}
 */
public synchronized void close() throws IOException {
  RPC.stopProxy(this.jobSubmitClient);
}
/**
 * Returns the file system that holds the controller's system directory.
 * The handle is resolved on first use from {@link #getSystemDir()} and this
 * client's configuration, then cached.
 *
 * @return the filesystem handle.
 * @throws IOException if the file system cannot be resolved
 */
public synchronized FileSystem getFs() throws IOException {
  if (this.fs != null) {
    return this.fs;
  }
  this.fs = getSystemDir().getFileSystem(getConf());
  return this.fs;
}
/**
 * Asks the controller for all submitted jobs.
 *
 * @return array of {@link JobStatus} as returned by the controller.
 * @throws IOException if the RPC call fails
 */
public JobStatus[] getAllJobs() throws IOException {
  final JobStatus[] allJobs = jobSubmitClient.getAllJobs();
  return allJobs;
}
/**
 * Returns the RPC proxy used to talk to the controller.
 *
 * @return the proxy, or {@code null} if {@code init} has not been called
 */
public JobSubmissionProtocol getJobSubmitClient() {
  return this.jobSubmitClient;
}
/**
 * Asks the controller for the jobs that are not completed and not failed.
 *
 * @return array of {@link JobStatus} for the running/to-be-run jobs.
 * @throws IOException if the RPC call fails
 */
public JobStatus[] jobsToComplete() throws IOException {
  final JobStatus[] pendingJobs = jobSubmitClient.jobsToComplete();
  return pendingJobs;
}
/**
 * Logs in through {@code UnixUserGroupInformation.login(conf, true)} and
 * returns the resulting user/group information.
 *
 * @param conf configuration passed to the login call
 * @return the user/group information of the login
 * @throws IOException wrapping the {@link LoginException} if login fails
 */
private UnixUserGroupInformation getUGI(Configuration conf)
    throws IOException {
  try {
    return UnixUserGroupInformation.login(conf, true);
  } catch (LoginException e) {
    IOException wrapped = new IOException(
        "Failed to get the current user's information.");
    wrapped.initCause(e);
    throw wrapped;
  }
}
/**
 * Submits a job to the BC-BSP system by delegating to
 * {@link #submitJobInternal(BSPJob)}.
 *
 * @param job
 *            the job configuration.
 * @return a handle to the {@link RunningJob} which can be used to track the
 *         running-job, or {@code null} when the submission failed (failures
 *         are logged by {@code submitJobInternal} rather than thrown).
 * @throws FileNotFoundException declared for callers
 * @throws ClassNotFoundException declared for callers
 * @throws InterruptedException declared for callers
 * @throws IOException declared for callers
 */
public RunningJob submitJob(BSPJob job) throws FileNotFoundException,
    ClassNotFoundException, InterruptedException, IOException {
  final RunningJob handle = submitJobInternal(job);
  return handle;
}
/**
 * Submits a new job to run.
 *
 * Steps: obtain a new job id from the controller, create the directory
 * {@code submit_<jobId>} under the system directory, copy the job jar
 * there, fix the number of partitions/staffs against the cluster maximum,
 * write the split file and {@code job.xml}, then hand the job to the
 * controller.
 *
 * Any exception is logged and reported to the controller as a
 * {@link Fault}; the submit directory is then removed and {@code null} is
 * returned. No exception is propagated to the caller.
 *
 * Review history:
 * (1) 2011-11-30, reviewer Hongxu Zhang: a submit directory chosen by the
 *     client could collide between clients. Fixed 2011-12-04 by Zhigang
 *     Wang: the directory name is derived from the unique job id.
 * (2) 2011-12-04, reviewer Hongxu Zhang: a failure while submitting files
 *     must prevent execution and clean up the submit directory. Fixed
 *     2011-12-04 by Zhigang Wang: the submission is wrapped in try-catch
 *     and the directory is removed in the catch handlers.
 *
 * @param job the job to submit
 * @return a handle to the submitted job, or {@code null} if the submission
 *         failed
 */
public RunningJob submitJobInternal(BSPJob job) {
  BSPJobID jobId = null;
  Path submitJobDir = null;
  try {
    jobId = jobSubmitClient.getNewJobId();
    submitJobDir = new Path(getSystemDir(), "submit_"
        + jobId.toString());
    Path submitJarFile = new Path(submitJobDir, "job.jar");
    Path submitJobFile = new Path(submitJobDir, "job.xml");
    Path submitSplitFile = new Path(submitJobDir, "job.split");
    // set this user's id in job configuration, so later job files can
    // be accessed using this user's id
    UnixUserGroupInformation ugi = getUGI(job.getConf());
    // Create a number of filenames in the BSPController's fs namespace
    FileSystem fs = getFs();
    fs.delete(submitJobDir, true);
    submitJobDir = fs.makeQualified(submitJobDir);
    submitJobDir = new Path(submitJobDir.toUri().getPath());
    FsPermission bspSysPerms = new FsPermission(JOB_DIR_PERMISSION);
    FileSystem.mkdirs(fs, submitJobDir, bspSysPerms);
    fs.mkdirs(submitJobDir);
    short replication = (short) job.getInt("bsp.submit.replication", 10);
    String originalJarPath = job.getJar();
    if (originalJarPath != null) {
      // copy jar to BSPController's fs
      // use jar name if job is not named.
      if ("".equals(job.getJobName())) {
        job.setJobName(new Path(originalJarPath).getName());
      }
      job.setJar(submitJarFile.toString());
      fs.copyFromLocalFile(new Path(originalJarPath), submitJarFile);
      fs.setReplication(submitJarFile, replication);
      fs.setPermission(submitJarFile, new FsPermission(
          JOB_FILE_PERMISSION));
    } else {
      LOG.warn("No job jar file set. User classes may not be found. "
          + "See BSPJob#setJar(String) or check Your jar file.");
    }
    // Set the user's name and working directory
    job.setUser(ugi.getUserName());
    if (ugi.getGroupNames().length > 0) {
      job.set("group.name", ugi.getGroupNames()[0]);
    }
    if (job.getWorkingDirectory() == null) {
      job.setWorkingDirectory(fs.getWorkingDirectory());
    }
    // The partition count defaults to, and is capped at, the cluster maximum.
    int maxClusterStaffs = jobSubmitClient.getClusterStatus(false).getMaxClusterStaffs();
    if (job.getNumPartition() == 0) {
      job.setNumPartition(maxClusterStaffs);
    }
    if (job.getNumPartition() > maxClusterStaffs) {
      job.setNumPartition(maxClusterStaffs);
    }
    job.setNumBspStaff(job.getNumPartition());
    int splitNum = writeSplits(job, submitSplitFile);
    // More splits than partitions: grow the partition count if the cluster
    // can host that many staffs, otherwise give up.
    if (splitNum > job.getNumPartition() && splitNum <= maxClusterStaffs) {
      job.setNumPartition(splitNum);
      job.setNumBspStaff(job.getNumPartition());
    }
    if (splitNum > maxClusterStaffs) {
      LOG.error("Sorry, the number of files is more than maxClusterStaffs:" +
          maxClusterStaffs);
      throw new IOException("Could not launch job");
    }
    job.set(Constants.USER_BC_BSP_JOB_SPLIT_FILE,
        submitSplitFile.toString());
    LOG.info("[Max Staff Number] " + maxClusterStaffs);
    LOG.info("The number of splits for the job is: " + splitNum);
    LOG.info("The number of staffs for the job is: " + job.getNumBspStaff());
    // Write job file to BSPController's fs
    FSDataOutputStream out = FileSystem.create(fs, submitJobFile,
        new FsPermission(JOB_FILE_PERMISSION));
    try {
      job.writeXml(out);
    } finally {
      out.close();
    }
    // Now, actually submit the job (using the submit name)
    JobStatus status = jobSubmitClient.submitJob(jobId,
        submitJobFile.toString());
    if (status != null) {
      return new NetworkedJob(status);
    } else {
      throw new IOException("Could not launch job");
    }
  } catch (FileNotFoundException fnfE) {
    abortSubmission(fnfE, new Fault(Fault.Type.SYSTEMSERVICE,
        Fault.Level.INDETERMINATE, "null", fnfE.toString()),
        jobId, submitJobDir);
    return null;
  } catch (ClassNotFoundException cnfE) {
    abortSubmission(cnfE, new Fault(Fault.Type.SYSTEMSERVICE,
        Fault.Level.WARNING, "null", cnfE.toString()),
        jobId, submitJobDir);
    return null;
  } catch (InterruptedException iE) {
    abortSubmission(iE, new Fault(Fault.Type.SYSTEMSERVICE,
        Fault.Level.CRITICAL, "null", iE.toString()),
        jobId, submitJobDir);
    // Restore the flag only after cleanup so the cleanup calls above are
    // not disturbed by a pending interrupt.
    Thread.currentThread().interrupt();
    return null;
  } catch (Exception ioE) {
    abortSubmission(ioE, new Fault(Fault.Type.DISK,
        Fault.Level.CRITICAL, "null", ioE.toString()),
        jobId, submitJobDir);
    return null;
  }
}

/**
 * Shared failure path of {@link #submitJobInternal(BSPJob)}: logs the
 * cause, records the fault with the controller, requests recovery for the
 * job and removes the submit directory.
 *
 * @param cause the exception that aborted the submission
 * @param fault the fault to record with the controller
 * @param jobId the job id, or {@code null} if none was obtained yet
 * @param submitJobDir the submit directory, or {@code null} if the failure
 *        happened before it was determined
 */
private void abortSubmission(Exception cause, Fault fault, BSPJobID jobId,
    Path submitJobDir) {
  LOG.error("Exception has been catched in BSPJobClient--submitJobInternal !", cause);
  jobSubmitClient.recordFault(fault);
  if (jobId != null) {
    // Without a job id there is no job on the controller to recover.
    jobSubmitClient.recovery(jobId);
  }
  if (submitJobDir == null) {
    // Nothing was created yet, so there is nothing to clean up.
    return;
  }
  try {
    getFs().delete(submitJobDir, true);
  } catch (Exception e) {
    LOG.error("Failed to cleanup the submitJobDir:" + submitJobDir, e);
  }
}
/**
 * Computes the input splits of {@code job}, sorts them with
 * {@code NewSplitComparator} (largest first) and writes them as
 * {@code RawSplit} records to {@code submitSplitFile}.
 *
 * When there are more splits than partitions, the rounded-up ratio is
 * stored under {@code Constants.USER_BC_BSP_JOB_SPLIT_FACTOR} and the
 * splits are computed a second time.
 *
 * @param job the job whose input is split
 * @param submitSplitFile destination of the split file
 * @return the number of splits written
 * @throws IOException if computing or writing the splits fails
 * @throws InterruptedException declared by the split and serialization calls
 * @throws ClassNotFoundException declared by the input-format lookup
 */
@SuppressWarnings("unchecked")
private <T extends org.apache.hadoop.mapreduce.InputSplit> int writeSplits(
    BSPJob job, Path submitSplitFile) throws IOException,
    InterruptedException, ClassNotFoundException {
  Configuration conf = job.getConf();
  com.chinamobile.bcbsp.io.InputFormat<?, ?> inputFormat = ReflectionUtils
      .newInstance(job.getInputFormatClass(), conf);
  inputFormat.initialize(job.getConf());
  List<org.apache.hadoop.mapreduce.InputSplit> splitList = inputFormat
      .getSplits(job);
  int partitionNum = job.getNumPartition();
  int splitNum = splitList.size();
  double factor = splitNum / (float) partitionNum;
  if (factor > 1.0) {
    int adjustFactor = (int) Math.ceil(factor);
    job.setInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, adjustFactor);
    LOG.info("[Split Adjust Factor] " + adjustFactor);
    LOG.info("[Partition Num] " + partitionNum);
    splitList = inputFormat.getSplits(job);
    splitNum = splitList.size();
  }
  T[] sortedSplits = (T[]) splitList
      .toArray(new org.apache.hadoop.mapreduce.InputSplit[splitList
          .size()]);
  // Biggest splits go first.
  Arrays.sort(sortedSplits, new NewSplitComparator());
  DataOutputStream out = writeSplitsFileHeader(conf, submitSplitFile,
      sortedSplits.length);
  try {
    if (sortedSplits.length != 0) {
      DataOutputBuffer buffer = new DataOutputBuffer();
      RawSplit rawSplit = new RawSplit();
      SerializationFactory factory = new SerializationFactory(conf);
      Serializer<T> serializer = factory
          .getSerializer((Class<T>) sortedSplits[0].getClass());
      serializer.open(buffer);
      for (int i = 0; i < sortedSplits.length; i++) {
        T split = sortedSplits[i];
        rawSplit.setClassName(split.getClass().getName());
        // The buffer and the RawSplit are reused for every split.
        buffer.reset();
        serializer.serialize(split);
        rawSplit.setDataLength(split.getLength());
        rawSplit.setBytes(buffer.getData(), 0, buffer.getLength());
        rawSplit.setLocations(split.getLocations());
        rawSplit.write(out);
      }
      serializer.close();
    }
  } finally {
    out.close();
  }
  return splitNum;
}
/** Version number written to and expected from the split file. */
private static final int CURRENT_SPLIT_FILE_VERSION = 0;

/**
 * Magic bytes at the start of the split file. Encoded with an explicit
 * charset so writer and reader agree regardless of the platform default.
 */
private static final byte[] SPLIT_FILE_HEADER =
    "SPL".getBytes(java.nio.charset.Charset.forName("US-ASCII"));

/**
 * Creates the split file and writes its header: the magic bytes, the
 * format version and the number of splits that follow.
 *
 * @param conf configuration used to resolve the file system of {@code filename}
 * @param filename path of the split file to create
 * @param length number of splits that will be written after the header
 * @return the open stream, positioned after the header; the caller closes it
 * @throws IOException if the file cannot be created or the header cannot be
 *         written; the stream is closed before the exception propagates
 */
private DataOutputStream writeSplitsFileHeader(Configuration conf,
    Path filename, int length) throws IOException {
  // write the splits to a file for the job tracker
  FileSystem fs = filename.getFileSystem(conf);
  FSDataOutputStream out = FileSystem.create(fs, filename,
      new FsPermission(JOB_FILE_PERMISSION));
  boolean headerWritten = false;
  try {
    out.write(SPLIT_FILE_HEADER);
    WritableUtils.writeVInt(out, CURRENT_SPLIT_FILE_VERSION);
    WritableUtils.writeVInt(out, length);
    headerWritten = true;
  } finally {
    if (!headerWritten) {
      // The caller never receives the stream on failure, so close it here.
      try {
        out.close();
      } catch (IOException closeFailure) {
        LOG.warn("Failed to close split file " + filename, closeFailure);
      }
    }
  }
  return out;
}
/**
 * Reads a splits file into an array of raw splits.
 *
 * Expected layout: the magic bytes {@code SPLIT_FILE_HEADER}, the version
 * as a vint (must equal {@code CURRENT_SPLIT_FILE_VERSION}), the number of
 * splits as a vint, then that many {@link RawSplit} records.
 *
 * Note (NEU, version 0.2.3): new function, borrowed directly from Hadoop.
 *
 * @param in
 *            the stream to read from
 * @return the complete list of splits
 * @throws IOException
 *             if the header, version or split count is invalid, or if
 *             reading fails
 */
public static RawSplit[] readSplitFile(DataInput in) throws IOException {
  byte[] header = new byte[SPLIT_FILE_HEADER.length];
  in.readFully(header);
  if (!Arrays.equals(SPLIT_FILE_HEADER, header)) {
    throw new IOException("Invalid header on split file");
  }
  int vers = WritableUtils.readVInt(in);
  if (vers != CURRENT_SPLIT_FILE_VERSION) {
    throw new IOException("Unsupported split version " + vers);
  }
  int len = WritableUtils.readVInt(in);
  if (len < 0) {
    // A corrupt file must surface as an IOException, not as a
    // NegativeArraySizeException from the allocation below.
    throw new IOException("Invalid split count " + len + " in split file");
  }
  RawSplit[] result = new RawSplit[len];
  for (int i = 0; i < len; ++i) {
    result[i] = new RawSplit();
    result[i].readFields(in);
  }
  return result;
}
/**
 * Monitors a job and logs the superstep number whenever it changes, then
 * logs a summary once the job has completed.
 *
 * Any exception raised while monitoring is caught: the job is then reported
 * as killed by the system and {@code false} is returned. If that exception
 * is an {@link InterruptedException}, the thread's interrupt flag is
 * restored before returning.
 *
 * @param job the submitted job; its {@code isSuccessful()} supplies the
 *        return value on normal completion
 * @param info handle used to poll the job
 * @return true, if job is successful
 * @throws IOException declared for callers
 * @throws InterruptedException declared for callers
 */
public boolean monitorAndPrintJob(BSPJob job, RunningJob info)
    throws IOException, InterruptedException {
  String lastReport = null;
  // Last superstep number seen, kept separately from the report sentence
  // so the failure summary can print just the number.
  String lastStep = "unknown";
  LOG.info("Running job : " + info.getID());
  StringBuilder sb = new StringBuilder("JOB FINISHED");
  sb.append("\n*************************************************************");
  long startTime = System.currentTimeMillis();
  try {
    while (!info.isComplete()) {
      Thread.sleep(3000);
      long step = info.progress();
      lastStep = String.valueOf(step);
      String report = "the current supersteps number : " + step;
      if (!report.equals(lastReport)) {
        LOG.info(report);
        lastReport = report;
      }
    }
    if (info.isSuccessful()) {
      sb.append("\n INFO : The job is finished successfully");
    }
    if (info.isKilled()) {
      sb.append("\n WARN : The job is killed by user");
    }
    if (info.isFailed()) {
      sb.append("\n ERROR : The job is failed");
    }
    double totalTime = (System.currentTimeMillis() - startTime) / 1000.0;
    sb.append("\n STATISTICS : Total supersteps : "
        + info.progress());
    sb.append("\n Total time(seconds): " + totalTime);
    sb.append("\n*************************************************************");
    LOG.info(sb.toString());
    // NOTE(review): the result is taken from job, not from info - confirm
    // that BSPJob.isSuccessful() reflects the final state of this run.
    return job.isSuccessful();
  } catch (Exception e) {
    if (e instanceof InterruptedException) {
      // Keep the interrupt visible to the caller.
      Thread.currentThread().interrupt();
    }
    sb.append("\n ERROR : " + e.getMessage());
    sb.append("\n ERROR : The job is viewed as killed by system");
    double totalTime = (System.currentTimeMillis() - startTime) / 1000.0;
    sb.append("\n STATISTICS : Total supersteps : " + lastStep);
    sb.append("\n Total time(seconds) : " + totalTime);
    sb.append("\n*************************************************************");
    LOG.info(sb.toString());
    return false;
  }
}
/**
 * Returns the controller's system directory, where job-specific files are
 * to be placed. The path is requested from the controller on first use and
 * cached afterwards.
 *
 * @return the system directory where job-specific files are to be placed.
 */
public Path getSystemDir() {
  if (sysDir != null) {
    return sysDir;
  }
  sysDir = new Path(jobSubmitClient.getSystemDir());
  return sysDir;
}
/**
 * Submits {@code job} with a new client, polls once per second until the
 * job is complete, logs the total number of supersteps and closes the
 * client.
 *
 * @param job the job to run
 * @throws FileNotFoundException declared for callers
 * @throws ClassNotFoundException declared for callers
 * @throws InterruptedException if the calling thread is interrupted while
 *         waiting for the job
 * @throws IOException if the job could not be launched, if the controller
 *         no longer knows the job, or if an RPC call fails
 */
public static void runJob(BSPJob job) throws FileNotFoundException,
    ClassNotFoundException, InterruptedException, IOException {
  BSPJobClient jc = new BSPJobClient(job.getConf());
  try {
    RunningJob running = jc.submitJobInternal(job);
    if (running == null) {
      // submitJobInternal logs the cause and returns null on failure.
      throw new IOException("Could not launch job");
    }
    BSPJobID jobId = running.getID();
    LOG.info("Running job: " + jobId.toString());
    while (true) {
      // An interrupt ends the wait instead of being swallowed.
      Thread.sleep(1000);
      if (running.isComplete()) {
        break;
      }
      RunningJob refreshed = jc.getJob(jobId);
      if (refreshed == null) {
        throw new IOException("Job " + jobId
            + " is no longer known to the BSPController");
      }
      running = refreshed;
    }
    LOG.info("Job complete: " + jobId);
    LOG.info("The total number of supersteps: " + running.getSuperstepCount());
  } finally {
    // Release the RPC proxy on every exit path.
    jc.close();
  }
}
/**
 * Looks up the status of {@code jobId} on the controller and wraps it in a
 * {@link RunningJob} handle.
 *
 * @param jobId id of the job to look up
 * @return the handle, or {@code null} if the controller returns no status
 *         for the id
 * @throws IOException if an RPC call fails
 */
private RunningJob getJob(BSPJobID jobId) throws IOException {
  JobStatus found = jobSubmitClient.getJobStatus(jobId);
  return (found == null) ? null : new NetworkedJob(found);
}
/**
 * Asks the controller for status information about the BSP cluster.
 *
 * @param detailed
 *            if true then get a detailed status including the groomserver
 *            names
 * @return the status information about the BSP cluster as an object of
 *         {@link ClusterStatus}.
 * @throws IOException if the RPC call fails
 */
public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
  final ClusterStatus clusterStatus = jobSubmitClient.getClusterStatus(detailed);
  return clusterStatus;
}
/**
 * Command-line entry point of the tool.
 *
 * Supported commands: {@code -list [all]}, {@code -workers},
 * {@code -submit <job-file>}, {@code -kill <job-id>},
 * {@code -status <job-id>}, {@code -list-staffs <job-id>},
 * {@code -setcheckpoint <job-id> <next|frequency>} and {@code -master}.
 * {@code -kill-staff} and {@code -fail-staff} are recognized but not
 * implemented.
 *
 * @param args the command followed by its arguments
 * @return 0 when the command was carried out, -1 on a usage error, an
 *         unknown or unimplemented command, a failed submission, or a
 *         {@code -status} request for an unknown job
 * @throws Exception if an RPC or file-system call fails
 */
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
  int exitCode = -1;
  if (args.length < 1) {
    displayUsage("");
    return exitCode;
  }
  // process arguments
  String cmd = args[0];
  boolean listJobs = false;
  boolean listAllJobs = false;
  boolean listActiveWorkerManagers = false;
  boolean killJob = false;
  boolean submitJob = false;
  boolean getStatus = false;
  boolean listJobTasks = false;
  boolean listBspController = false;
  boolean setCheckPoint = false;
  String submitJobFile = null;
  String jobid = null;
  String checkpointCmd = null;
  BSPConfiguration conf = new BSPConfiguration(getConf());
  init(conf);
  if ("-list".equals(cmd)) {
    boolean all = args.length == 2 && "all".equals(args[1]);
    if (args.length != 1 && !all) {
      displayUsage(cmd);
      return exitCode;
    }
    if (all) {
      listAllJobs = true;
    } else {
      listJobs = true;
    }
  } else if ("-workers".equals(cmd)) {
    if (args.length != 1) {
      displayUsage(cmd);
      return exitCode;
    }
    listActiveWorkerManagers = true;
  } else if ("-submit".equals(cmd)) {
    if (args.length == 1) {
      displayUsage(cmd);
      return exitCode;
    }
    submitJob = true;
    submitJobFile = args[1];
  } else if ("-kill".equals(cmd)) {
    if (args.length != 2) {
      displayUsage(cmd);
      return exitCode;
    }
    killJob = true;
    jobid = args[1];
  } else if ("-status".equals(cmd)) {
    if (args.length != 2) {
      displayUsage(cmd);
      return exitCode;
    }
    jobid = args[1];
    getStatus = true;
    // TODO Later, below functions should be implemented
    // with the Fault Tolerant mechanism.
  } else if ("-list-staffs".equals(cmd)) {
    if (args.length != 2) {
      displayUsage(cmd);
      return exitCode;
    }
    jobid = args[1];
    listJobTasks = true;
  } else if ("-setcheckpoint".equals(cmd)) {
    if (args.length != 3) {
      displayUsage(cmd);
      return exitCode;
    }
    jobid = args[1];
    checkpointCmd = args[2];
    setCheckPoint = true;
  } else if ("-master".equals(cmd)) {
    if (args.length != 1) {
      displayUsage(cmd);
      return exitCode;
    }
    listBspController = true;
  } else if ("-kill-staff".equals(cmd)) {
    System.out.println("This function is not implemented yet.");
    return exitCode;
  } else if ("-fail-staff".equals(cmd)) {
    System.out.println("This function is not implemented yet.");
    return exitCode;
  } else {
    // Unknown command: report it instead of silently succeeding.
    displayUsage(cmd);
    return exitCode;
  }
  BSPJobClient jc = new BSPJobClient(new BSPConfiguration());
  try {
    if (listJobs) {
      listJobs();
      exitCode = 0;
    } else if (listAllJobs) {
      listAllJobs();
      exitCode = 0;
    } else if (listActiveWorkerManagers) {
      listActiveWorkerManagers();
      exitCode = 0;
    } else if (submitJob) {
      BSPConfiguration tConf = new BSPConfiguration(new Path(
          submitJobFile));
      RunningJob job = jc.submitJob(new BSPJob(tConf));
      if (job == null) {
        // submitJob returns null when the submission failed.
        System.out.println("Could not submit job " + submitJobFile);
      } else {
        System.out.println("Created job " + job.getID().toString());
        exitCode = 0;
      }
    } else if (killJob) {
      RunningJob job = jc.getJob(new BSPJobID().forName(jobid));
      if (job == null) {
        System.out.println("Could not find job " + jobid);
      } else {
        job.killJob();
        System.out.println("Killed job " + jobid);
      }
      exitCode = 0;
    } else if (getStatus) {
      RunningJob job = jc.getJob(new BSPJobID().forName(jobid));
      if (job == null) {
        System.out.println("Could not find job " + jobid);
      } else {
        JobStatus jobStatus = jobSubmitClient.getJobStatus(job.getID());
        String start = "NONE", finish = "NONE";
        if (jobStatus.getStartTime() != 0) {
          start = new Date(jobStatus.getStartTime()).toLocaleString();
        }
        if (jobStatus.getFinishTime() != 0) {
          finish = new Date(jobStatus.getFinishTime()).toLocaleString();
        }
        System.out.printf("States are:\n\tRunning : 1\tSucceded : 2"
            + "\tFailed : 3\tPrep : 4\n");
        System.out.printf("Job name: %s\tUserName: %s\n", job.getJobName(), jobStatus.getUsername());
        System.out.printf("ID: %s\tState: %d\tSuperStep: %d\tStartTime: %s\tEndTime: %s\n", jobStatus.getJobID(),
            jobStatus.getRunState(), jobStatus.progress(), start, finish);
        exitCode = 0;
      }
    } else if (listJobTasks) {
      StaffAttemptID[] attemptIds = jobSubmitClient.getStaffStatus(new BSPJobID()
          .forName(jobid));
      for (StaffAttemptID attemptId : attemptIds) {
        System.out.println(attemptId);
      }
      StaffStatus[] staffDetails = jobSubmitClient.getStaffDetail(new BSPJobID()
          .forName(jobid));
      System.out.println("array list size is" + staffDetails.length);
      exitCode = 0;
    } else if (setCheckPoint) {
      if (checkpointCmd.equals("next")) {
        jobSubmitClient.setCheckFrequencyNext(new BSPJobID().forName(jobid));
      } else {
        jobSubmitClient.setCheckFrequency(new BSPJobID().forName(jobid), Integer.valueOf(checkpointCmd));
      }
      exitCode = 0;
    } else if (listBspController) {
      listBspController();
      exitCode = 0;
    }
  } finally {
    // Release the RPC proxy of the helper client.
    jc.close();
  }
  return exitCode;
}
/**
 * Prints the controller address taken from this client's configuration and
 * the controller state reported by the detailed cluster status.
 *
 * @throws IOException if the cluster status cannot be fetched
 */
private void listBspController() throws IOException {
  ClusterStatus clusterStatus = jobSubmitClient.getClusterStatus(true);
  System.out.println("Controller:" + BSPController.getAddress(getConf()));
  System.out.println("Controller state is :"
      + clusterStatus.getBSPControllerState());
}
/**
 * Prints the usage of the command-line tool to standard error. A known
 * command gets its own usage line; anything else gets the full command
 * list. The method only prints; it does not terminate execution.
 *
 * @param cmd the command the user typed, or an empty string
 */
private void displayUsage(String cmd) {
  String prefix = "Usage: bcbsp job ";
  if ("-submit".equals(cmd)) {
    System.err.println(prefix + "[" + cmd + " <job-file>]");
  } else if ("-status".equals(cmd) || "-kill".equals(cmd)
      || "-list-staffs".equals(cmd)) {
    // run() accepts exactly one <job-id> argument for these commands.
    System.err.println(prefix + "[" + cmd + " <job-id>]");
  } else if ("-list".equals(cmd)) {
    System.err.println(prefix + "[" + cmd + " [all]]");
  } else if ("-setcheckpoint".equals(cmd)) {
    System.err.println(prefix + "[" + cmd + " <job-id> <next|frequency>]");
  } else if ("-kill-staff".equals(cmd) || "-fail-staff".equals(cmd)) {
    System.err.println(prefix + "[" + cmd + " <staff-id>]");
  } else if ("-workers".equals(cmd) || "-master".equals(cmd)
      || "-list-active-workermanagers".equals(cmd)) {
    System.err.println(prefix + "[" + cmd + "]");
  } else {
    System.err.println(prefix + "<command> <args>");
    System.err.println("\t[-submit <job-file>]");
    System.err.println("\t[-status <job-id>]");
    System.err.println("\t[-kill <job-id>]");
    System.err.println("\t[-list [all]]");
    System.err.println("\t[-workers]");
    System.err.println("\t[-master]");
    System.err.println("\t[-list-staffs <job-id>]");
    System.err.println("\t[-setcheckpoint <job-id> <next|frequency>]");
    System.err.println("\t[-kill-staff <staff-id>]");
    System.err.println("\t[-fail-staff <staff-id>]");
    System.err.println();
  }
}
/**
 * Prints the jobs returned by {@link #jobsToComplete()}; a {@code null}
 * result is treated as an empty list.
 *
 * @throws IOException if the jobs cannot be fetched
 */
private void listJobs() throws IOException {
  JobStatus[] fetched = jobsToComplete();
  JobStatus[] jobs = (fetched == null) ? new JobStatus[0] : fetched;
  System.out.printf("%d jobs currently running\n", jobs.length);
  displayJobList(jobs);
}
/**
 * Prints all jobs returned by {@link #getAllJobs()} together with a legend
 * of the numeric states; a {@code null} result is treated as an empty list.
 *
 * @throws IOException if the jobs cannot be fetched
 */
private void listAllJobs() throws IOException {
  JobStatus[] fetched = getAllJobs();
  JobStatus[] jobs = (fetched == null) ? new JobStatus[0] : fetched;
  System.out.printf("%d jobs submitted\n", jobs.length);
  System.out.printf("States are:\n\tRunning : 1\tSucceded : 2"
      + "\tFailed : 3\tPrep : 4\n");
  displayJobList(jobs);
}
/**
 * Prints a tab-separated table with one row per job: job id, numeric run
 * state, start time (as the raw long value) and user name.
 *
 * @param jobs the jobs to print; must not be {@code null}
 */
public void displayJobList(JobStatus[] jobs) {
  System.out.printf("JobId\tState\tStartTime\tUserName\n");
  for (int i = 0; i < jobs.length; i++) {
    JobStatus row = jobs[i];
    System.out.printf("%s\t%d\t%d\t%s\n", row.getJobID(),
        row.getRunState(), row.getStartTime(), row.getUsername());
  }
}
/**
 * Prints the number of running cluster staffs followed by one line per
 * active worker manager, both taken from the detailed cluster status.
 *
 * @throws IOException if the cluster status cannot be fetched
 */
private void listActiveWorkerManagers() throws IOException {
  ClusterStatus clusterStatus = jobSubmitClient.getClusterStatus(true);
  int runningStaffs = clusterStatus.getRunningClusterStaffs();
  String[] workerNames = clusterStatus.getActiveWorkerManagersName();
  System.out.println("running ClusterStaffs is : " + runningStaffs);
  for (int i = 0; i < workerNames.length; i++) {
    System.out.println(workerNames[i] + " active");
  }
}
/**
 * Runs a new client through {@link ToolRunner} and exits the JVM with the
 * value it returns.
 *
 * @param args command-line arguments
 * @throws Exception if the tool fails
 */
public static void main(String[] args) throws Exception {
  System.exit(ToolRunner.run(new BSPJobClient(), args));
}
}