/**
* Copyright 2010 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package datameer.awstasks.aws.emr;
import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import org.apache.log4j.Logger;
import awstasks.com.amazonaws.auth.BasicAWSCredentials;
import awstasks.com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce;
import awstasks.com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsRequest;
import awstasks.com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsRequest;
import awstasks.com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsResult;
import awstasks.com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig;
import awstasks.com.amazonaws.services.elasticmapreduce.model.JobFlowDetail;
import awstasks.com.amazonaws.services.elasticmapreduce.model.JobFlowInstancesConfig;
import awstasks.com.amazonaws.services.elasticmapreduce.model.KeyValue;
import awstasks.com.amazonaws.services.elasticmapreduce.model.RunJobFlowRequest;
import awstasks.com.amazonaws.services.elasticmapreduce.model.RunJobFlowResult;
import awstasks.com.amazonaws.services.elasticmapreduce.model.StepConfig;
import awstasks.com.amazonaws.services.elasticmapreduce.model.StepDetail;
import awstasks.com.amazonaws.services.elasticmapreduce.model.TerminateJobFlowsRequest;
import awstasks.com.amazonaws.services.s3.AmazonS3;
import awstasks.com.amazonaws.services.s3.AmazonS3Client;
import awstasks.com.amazonaws.services.simpledb.AmazonSimpleDB;
import awstasks.com.amazonaws.services.simpledb.AmazonSimpleDBClient;
import awstasks.com.amazonaws.services.simpledb.model.Attribute;
import awstasks.com.amazonaws.services.simpledb.model.Item;
import awstasks.com.amazonaws.services.simpledb.model.SelectRequest;
import datameer.awstasks.aws.concurrent.ObjectLock;
import datameer.awstasks.aws.emr.JobFlowState.StateCategory;
import datameer.awstasks.util.S3Util;
import datameer.com.google.common.base.Preconditions;
import datameer.com.google.common.base.Predicate;
import datameer.com.google.common.collect.Collections2;
/**
* Allows access and management of amazons elastic map-reduce. One emr cluster maps to one job flow.
*/
public class EmrCluster {
/** Pre-built step that installs hadoop debugging support; added to a job flow when debugging is enabled. */
private static final StepConfig DEBUG_STEP = createDebugStep();

private static StepConfig createDebugStep() {
    HadoopJarStepConfig debugJarStep = new HadoopJarStepConfig();
    debugJarStep.setJar("s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar");
    debugJarStep.getArgs().add("s3://us-east-1.elasticmapreduce/libs/state-pusher/0.1/fetch");
    StepConfig step = new StepConfig();
    step.setName("Setup Hadoop Debugging");
    step.setActionOnFailure("TERMINATE_JOB_FLOW");
    step.setHadoopJarStep(debugJarStep);
    return step;
}
// Class-wide logger, shared with subclasses.
protected static final Logger LOG = Logger.getLogger(EmrCluster.class);
// Configuration for this cluster (name, instance types, s3 paths, ...).
private final EmrSettings _settings;
// AWS secret key matching the access key from the settings.
private final String _accessSecret;
// Client for the elastic map-reduce web service (custom client with request interval and describe-cache, see setRequestInterval()/clearDescribeJobFlowCache()).
protected AmazonElasticMapReduceCustomClient _emrWebService;
// Lazily created in JobStepBuilder.uploadingJobJar() for job-jar uploads.
private AmazonS3 _s3Service;
// Only set when debugging is enabled in the settings; used to read step metadata from SimpleDB.
protected AmazonSimpleDB _simpleDB;
// Start time (epoch millis) of the connected job flow; 0 while unconnected.
protected long _startTime;
// volatile: written by the polling callable in waitUntilClusterStateChange(), read by callers.
protected volatile String _masterHost;
protected volatile int _instanceCount;
// Id of the job flow this instance is connected to; null while unconnected.
protected String _jobFlowId;
// Local lifecycle state of this handle (see ClusterState).
protected ClusterState _clusterState = ClusterState.UNCONNECTED;
// Serializes concurrent uploads of the same job-jar path (see JobStepBuilder.uploadingJobJar()).
private ObjectLock<String> _uploadLock = ObjectLock.create();
// TODO jz: rethrow interrupted exceptions
/**
 * Creates a handle for the cluster described by the given settings. A SimpleDB client is created
 * as well when hadoop debugging is enabled (needed to read step metadata later on).
 */
public EmrCluster(EmrSettings settings, String accessSecret) {
    _settings = settings;
    _accessSecret = accessSecret;
    _emrWebService = new AmazonElasticMapReduceCustomClient(settings.getAccessKey(), accessSecret);
    if (settings.isDebugEnabled()) {
        _simpleDB = new AmazonSimpleDBClient(new BasicAWSCredentials(settings.getAccessKey(), accessSecret));
    }
}
/** @return the configured cluster name (which doubles as the job flow name). */
public String getName() {
    return _settings.getClusterName();
}
/** @return the settings this cluster handle was constructed with. */
public EmrSettings getSettings() {
return _settings;
}
/** @return the underlying elastic map-reduce web service client. */
public AmazonElasticMapReduce getEmrService() {
return _emrWebService;
}
/** Sets the polling interval (millis) used while waiting on cluster/step state changes. */
public void setRequestInterval(long requestInterval) {
_emrWebService.setRequestInterval(requestInterval);
}
/** @return the polling interval (millis) used while waiting on cluster/step state changes. */
public long getRequestInterval() {
return _emrWebService.getRequestInterval();
}
/**
 * @return start time (epoch millis) of the connected job flow
 * @throws IllegalStateException if not connected to a job flow
 */
public long getStartTime() {
checkConnection(true);
return _startTime;
}
/**
 * @return public DNS name of the job flow's master node
 * @throws IllegalStateException if not connected to a job flow
 */
public String getMasterHost() {
checkConnection(true);
return _masterHost;
}
/**
 * @return instance count of the connected job flow
 * @throws IllegalStateException if not connected to a job flow
 */
public int getInstanceCount() {
checkConnection(true);
return _instanceCount;
}
/**
 * Starts a new job flow for this cluster and blocks until it reaches an operational state.
 * On success the handle is CONNECTED; on any failure the state is rolled back to UNCONNECTED
 * and the job flow id is cleared so a later attempt is possible.
 *
 * @throws NullPointerException if no private key name is configured
 * @throws IllegalStateException if a job flow with this name is already running (or this handle is already connected)
 * @throws InterruptedException if interrupted while waiting for the flow to become operational
 */
public synchronized void startup() throws InterruptedException {
    checkConnection(false);
    _clusterState = ClusterState.STARTING;
    boolean successful = false;
    try {
        EmrSettings settings = getSettings();
        // fail fast; throws the same NullPointerException (with message) as the previous manual check
        Preconditions.checkNotNull(settings.getPrivateKeyName(), "privateKeyName must not be null please configure settings properly");
        LOG.info("Starting job flow '" + getName() + "' with ami version '" + settings.getAmiVersion() + "' hadoop version '" + settings.getHadoopVersion() + "' and "
                + settings.getInstanceCount() + " instances ...");
        if (getRunningJobFlowDetails(false) != null) {
            throw new IllegalStateException("Job flow with name '" + getName() + "' already running.");
        }
        JobFlowInstancesConfig jobConfig = new JobFlowInstancesConfig();
        jobConfig.setHadoopVersion(settings.getHadoopVersion());
        jobConfig.setMasterInstanceType(settings.getMasterInstanceType().toString());
        jobConfig.setSlaveInstanceType(settings.getNodeInstanceType().toString());
        jobConfig.setInstanceCount(settings.getInstanceCount());
        jobConfig.setEc2KeyName(settings.getPrivateKeyName());
        // keep the cluster alive between steps so multiple job steps can be submitted
        jobConfig.setKeepJobFlowAliveWhenNoSteps(true);
        jobConfig.setEc2SubnetId(settings.getSubnetId());
        final RunJobFlowRequest startRequest = new RunJobFlowRequest();
        startRequest.setAmiVersion(settings.getAmiVersion());
        startRequest.setLogUri("s3n://" + settings.getS3Bucket() + settings.getS3LogPath());
        startRequest.setInstances(jobConfig);
        startRequest.setName(getName());
        startRequest.setAdditionalInfo(settings.getAdditionalStartInfo());
        startRequest.setBootstrapActions(settings.getBootstrapActions());
        if (settings.isDebugEnabled()) {
            startRequest.withSteps(DEBUG_STEP);
        }
        RunJobFlowResult startResponse = _emrWebService.runJobFlow(startRequest);
        _jobFlowId = startResponse.getJobFlowId();
        waitUntilClusterStateChange(_jobFlowId, StateCategory.OPERATIONAL);
        LOG.info("elastic cluster '" + getName() + "/" + _jobFlowId + "' started, master-host is " + _masterHost);
        successful = true;
    } finally {
        if (successful) {
            _clusterState = ClusterState.CONNECTED;
        } else {
            // roll back so the handle can be reused after a failed start
            _clusterState = ClusterState.UNCONNECTED;
            _jobFlowId = null;
        }
    }
}
/**
 * Disconnects this class instance from the cluster without shutting it down: clears the job flow
 * id and start time and moves the local state back to UNCONNECTED.
 */
public void disconnect() {
_jobFlowId = null;
_startTime = 0;
_clusterState = ClusterState.UNCONNECTED;
// shutdownS3Service();
}
/**
 * Terminates the connected job flow, waits until it reached a dead state and disconnects.
 *
 * @throws IllegalStateException if not connected to a job flow
 * @throws InterruptedException if interrupted while waiting for termination
 */
public synchronized void shutdown() throws InterruptedException {
checkConnection(true);
_clusterState = ClusterState.STOPPING;
_emrWebService.terminateJobFlows(new TerminateJobFlowsRequest().withJobFlowIds(_jobFlowId));
waitUntilClusterStateChange(_jobFlowId, StateCategory.DEAD);
disconnect();
}
/**
 * Connects to the running job flow whose name equals {@link #getName()}.
 *
 * @throws IllegalStateException if already connected, or if no job flow with that name is running
 * @throws InterruptedException if interrupted while waiting for the operational state
 */
public void connectByName() throws InterruptedException {
checkConnection(false);
JobFlowDetail jobFlow = getRunningJobFlowDetails(true);
connectById(jobFlow.getJobFlowId());
}
/**
 * Connects to the cluster/jobFlow with the given id and waits until it is operational.
 *
 * @param jobFlowId id of the job flow to attach this handle to
 * @throws IllegalStateException if already connected
 * @throws InterruptedException if interrupted while waiting for the operational state
 */
public void connectById(String jobFlowId) throws InterruptedException {
checkConnection(false);
_jobFlowId = jobFlowId;
waitUntilClusterStateChange(_jobFlowId, StateCategory.OPERATIONAL);
LOG.info("connected to elastic cluster '" + getName() + "/" + _jobFlowId + "', master-host is " + _masterHost);
_clusterState = ClusterState.CONNECTED;
}
/**
 * Connects to the EMR cluster and equilibrates the local state with the remote state: an
 * unconnected handle tries to attach to a running flow with this cluster's name (best effort);
 * a connected handle disconnects if the remote flow is no longer operational.
 *
 * @throws InterruptedException if interrupted while connecting
 */
public void synchronizeState() throws InterruptedException {
    if (_clusterState != ClusterState.UNCONNECTED) {
        JobFlowState remoteState = JobFlowState.valueOf(getJobFlowDetail(_jobFlowId).getExecutionStatusDetail().getState());
        if (_clusterState == ClusterState.CONNECTED && !remoteState.isOperational()) {
            disconnect();
        }
        return;
    }
    try {
        connectByName(); // establishes fresh state from the running flow
    } catch (InterruptedException e) {
        throw e;
    } catch (Exception e) {
        // best effort: no cluster with our name is up - stay unconnected
    }
}
/** @return the local lifecycle state of this handle (not the remote job flow state). */
public ClusterState getState() {
return _clusterState;
}
/** @return true when connected and the remote job flow reports an idle state, false otherwise. */
public boolean isIdle() {
    if (_clusterState != ClusterState.CONNECTED) {
        return false;
    }
    String remoteState = getJobFlowDetail(_jobFlowId).getExecutionStatusDetail().getState();
    return JobFlowState.valueOf(remoteState).isIdle();
}
/** @return the id of the connected job flow, or null while unconnected. */
public String getJobFlowId() {
return _jobFlowId;
}
/**
 * Validates the local connection state.
 *
 * @param shouldRun true to require an established connection, false to require none
 * @throws IllegalStateException when the expectation is not met
 */
protected void checkConnection(boolean shouldRun) {
    if (shouldRun) {
        if (_clusterState == ClusterState.UNCONNECTED || _clusterState == ClusterState.STOPPING) {
            throw new IllegalStateException("not connected to cluster/jobFlow");
        }
    } else if (_clusterState == ClusterState.CONNECTED) {
        throw new IllegalStateException("already connected to cluster/jobFlow");
    }
}
/** Creates a builder for a new job step that runs the given jar on the connected job flow. */
public JobStepBuilder createJobStep(String name, File jobJar) {
return new JobStepBuilder(name, jobJar);
}
/** Submits a job step that runs the given jar with the given main arguments. */
public StepFuture executeJobStep(String name, File jobJar, String... args) {
    JobStepBuilder jobStep = createJobStep(name, jobJar);
    jobStep.setMainArgs(args);
    return jobStep.submit();
}
/** Submits a job step that runs the given jar with the given main class and arguments. */
public StepFuture executeJobStep(String name, File jobJar, Class<?> mainClass, String... args) {
    JobStepBuilder jobStep = createJobStep(name, jobJar);
    jobStep.setMainClass(mainClass);
    jobStep.setMainArgs(args);
    return jobStep.submit();
}
/** Submits a job step, uploading the jar under the given s3 name, with the given main class and arguments. */
public StepFuture executeJobStep(String name, File jobJar, String s3JobJarName, Class<?> mainClass, String... args) {
    JobStepBuilder jobStep = createJobStep(name, jobJar);
    jobStep.setS3JobJarName(s3JobJarName);
    jobStep.setMainClass(mainClass);
    jobStep.setMainArgs(args);
    return jobStep.submit();
}
/**
 * Polls the job flow until it leaves its transitional state, then verifies it settled in the
 * given target category. When the flow is operational, the cached fields _masterHost,
 * _instanceCount and _startTime are updated from the flow details; otherwise they are reset.
 *
 * @throws IllegalStateException if the flow settled outside the target category
 * @throws InterruptedException if interrupted while polling
 */
private void waitUntilClusterStateChange(final String jobFlowId, final StateCategory targetState) throws InterruptedException {
doWhileNot(new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
JobFlowDetail jobFlowDetail = getJobFlowDetail(jobFlowId);
String lastStateChangeReason = jobFlowDetail.getExecutionStatusDetail().getLastStateChangeReason();
JobFlowState state = JobFlowState.valueOf(jobFlowDetail.getExecutionStatusDetail().getState());
LOG.info("elastic cluster '" + jobFlowDetail.getName() + "/" + jobFlowId + "' in state '" + state + "'");
// keep polling while the flow is still transitioning
boolean finished = !state.isChangingState();
if (finished) {
// fail loudly when the flow settled in a state outside the requested category
Preconditions.checkState(state.isIn(targetState), "State transition to %s of job flow '%s' failed with state '%s' and reason '%s'", targetState, jobFlowId, state,
lastStateChangeReason);
if (state.isOperational()) {
_masterHost = jobFlowDetail.getInstances().getMasterPublicDnsName();
_instanceCount = jobFlowDetail.getInstances().getInstanceCount();
_startTime = jobFlowDetail.getExecutionStatusDetail().getStartDateTime().getTime();
} else {
// flow is not operational - clear the cached operational info
_masterHost = null;
_instanceCount = 0;
_startTime = 0;
}
}
return finished;
}
}, getRequestInterval());
}
/**
 * Polls the named step until it reaches a finished state; throws when the step did not finish
 * successfully.
 *
 * @param stepIndex 1-based index of the step (used for logging)
 * @throws RuntimeException if the step finished unsuccessfully
 * @throws InterruptedException if interrupted while polling
 */
protected void waitUntilStepFinished(final String jobFlowId, final String stepName, final int stepIndex) throws InterruptedException {
    doWhileNot(new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            StepState stepState = getStepState(jobFlowId, stepName);
            LOG.info("job step " + stepIndex + "/" + stepName + " in state '" + stepState + "'");
            boolean finished = stepState.isFinished();
            if (finished) {
                if (!stepState.isSuccessful()) {
                    // re-resolve the index from the current flow details; renamed so it no
                    // longer shadows the captured stepIndex parameter
                    int currentStepIndex = getStepIndex(getJobFlowDetail(jobFlowId), stepName);
                    throw new RuntimeException("job step '" + stepName + "' (" + jobFlowId + "/" + currentStepIndex + ") failed with state '" + stepState + "'");
                }
            }
            return finished;
        }
    }, getRequestInterval());
}
/** Looks up the current remote state of the named step within the given job flow. */
protected StepState getStepState(final String jobFlowId, final String stepName) {
    StepDetail stepDetail = getStepDetail(getJobFlowDetail(jobFlowId), stepName);
    return StepState.valueOf(stepDetail.getExecutionStatusDetail().getState());
}
/**
 * Sleeps for the request interval and then invokes the callable, repeating until the callable
 * reports true (note: the sleep happens before the first invocation). InterruptedException and
 * RuntimeException propagate as-is; any other exception is wrapped in a RuntimeException.
 */
protected static void doWhileNot(Callable<Boolean> callable, long requestInterval) throws InterruptedException {
    while (true) {
        Thread.sleep(requestInterval);
        boolean done;
        try {
            done = callable.call();
        } catch (InterruptedException e) {
            throw e;
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        if (done) {
            return;
        }
    }
}
/**
 * Fetches the details of exactly one job flow.
 *
 * @throws IllegalArgumentException if no flow with the given id exists
 * @throws IllegalStateException if more than one flow matched
 */
protected JobFlowDetail getJobFlowDetail(String jobFlowId) {
    DescribeJobFlowsRequest request = new DescribeJobFlowsRequest().withJobFlowIds(jobFlowId);
    List<JobFlowDetail> jobFlows = _emrWebService.describeJobFlows(request).getJobFlows();
    Preconditions.checkArgument(!jobFlows.isEmpty(), "No job flow with id '%s' found", jobFlowId);
    Preconditions.checkState(jobFlows.size() < 2, "More then one job flow with id '%s' found", jobFlowId);
    return jobFlows.get(0);
}
/** @return the number of steps of the given job flow, or 0 when this handle is not connected. */
public int getCurrentStepCount(String jobFlowId) {
    if (_clusterState != ClusterState.CONNECTED) {
        return 0;
    }
    return getJobFlowDetail(jobFlowId).getSteps().size();
}
/**
 * Looks up the active (starting/bootstrapping/waiting/running) job flow with this cluster's name.
 *
 * @param hasToExist when false, null is returned if no such flow is active; when true, its absence is an error
 * @throws IllegalStateException if more than one match, or none while hasToExist is true
 */
protected JobFlowDetail getRunningJobFlowDetails(boolean hasToExist) {
    DescribeJobFlowsRequest activeFlowsRequest = new DescribeJobFlowsRequest().withJobFlowStates(JobFlowState.STARTING.name(), JobFlowState.BOOTSTRAPPING.name(),
            JobFlowState.WAITING.name(), JobFlowState.RUNNING.name());
    DescribeJobFlowsResult describeResult = _emrWebService.describeJobFlows(activeFlowsRequest);
    final String jobFlowName = getName();
    Predicate<JobFlowDetail> hasSameName = new Predicate<JobFlowDetail>() {
        @Override
        public boolean apply(JobFlowDetail input) {
            return input.getName().equals(jobFlowName);
        }
    };
    Collection<JobFlowDetail> matchingJobFlows = Collections2.filter(describeResult.getJobFlows(), hasSameName);
    if (matchingJobFlows.isEmpty() && !hasToExist) {
        return null;
    }
    Preconditions.checkState(matchingJobFlows.size() <= 1, "More than one job flow with name '%s' running.", jobFlowName);
    Preconditions.checkState(matchingJobFlows.size() > 0, "No job flow with name '%s' running.", jobFlowName);
    return matchingJobFlows.iterator().next();
}
/**
 * Finds the detail record of the step with the given name.
 *
 * @throws IllegalStateException if the flow contains no step with that name
 */
protected StepDetail getStepDetail(JobFlowDetail flowDetail, String stepName) {
    List<StepDetail> steps = flowDetail.getSteps();
    for (int i = 0; i < steps.size(); i++) {
        StepDetail candidate = steps.get(i);
        if (stepName.equals(candidate.getStepConfig().getName())) {
            return candidate;
        }
    }
    throw new IllegalStateException("no step detail with name '" + stepName + "' found in " + flowDetail.getJobFlowId());
}
/**
 * Returns the 1-based position of the step with the given name within the job flow.
 *
 * @throws IllegalStateException if the flow contains no step with that name
 */
protected int getStepIndex(JobFlowDetail flowDetail, String stepName) {
    int oneBasedIndex = 1;
    for (StepDetail stepDetail : flowDetail.getSteps()) {
        if (stepName.equals(stepDetail.getStepConfig().getName())) {
            return oneBasedIndex;
        }
        oneBasedIndex++;
    }
    throw new IllegalStateException("no step detail with name '" + stepName + "' found in " + flowDetail.getJobFlowId());
}
/**
 * Unchecked carrier for an {@link InterruptedException} so it can cross APIs that do not declare
 * the checked exception; {@link #getCause()} narrows the cause back to InterruptedException.
 */
static class InterruptedRuntimeException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    public InterruptedRuntimeException(InterruptedException cause) {
        super(cause);
    }

    public InterruptedRuntimeException(String message, InterruptedException cause) {
        super(message, cause);
    }

    @Override
    public InterruptedException getCause() {
        return (InterruptedException) super.getCause();
    }
}
/**
 * Handle for a submitted job step: allows polling its state, reading hadoop-debugging metadata
 * from SimpleDB and blocking until the step finished.
 */
public class StepFuture {
private final String _stepName;
// 1-based index of the step within the job flow (see getStepIndex())
private final int _stepIndex;
// SimpleDB domain holding the hadoop-debugging data; resolved lazily in getDomain()
private String _domain;
public StepFuture(String stepName, int stepIndex) {
_stepName = stepName;
_stepIndex = stepIndex;
}
public int getStepIndex() {
return _stepIndex;
}
public String getStepName() {
return _stepName;
}
/** @return the current remote state of this step. */
public StepState getStepState() {
return EmrCluster.this.getStepState(_jobFlowId, _stepName);
}
/**
 * Reads this step's metadata from the hadoop-debugging SimpleDB domain. Returns an empty
 * StepMetadata while the (asynchronously written) debugging data is not available yet.
 *
 * @throws IllegalStateException if hadoop debugging is not enabled, or multiple items matched
 */
public StepMetadata getStepMetaData() {
if (_simpleDB == null) {
throw new IllegalStateException("can retrieve step metadata only when hadoop debugging enabled");
}
if (_domain == null) {
_domain = getDomain();
}
// query values are produced by this class (job flow id / step index), not user input
String query = "SELECT * FROM `" + _domain + "` WHERE " + StepMetadata.JOB_FLOW_ID + " = '" + _jobFlowId + "' AND " + StepMetadata.STEP_ID + " = '" + _stepIndex + "' AND "
+ StepMetadata.TYPE + " = 'job'";
List<Item> items = _simpleDB.select(new SelectRequest(query)).getItems();
if (items.size() > 1) {
throw new IllegalStateException("found more then one (" + items.size() + ") item for query '" + query + "'");
}
StepMetadata stepMetadata = new StepMetadata();
if (items.isEmpty()) {
// debugging data gets written with some delay - return an empty result instead of failing
LOG.debug("found no items for query '" + query + "' yet...");
return stepMetadata;
// throw new IllegalStateException("found no items for query '" + query + "'");
}
for (Attribute attr : items.get(0).getAttributes()) {
stepMetadata.add(attr.getName(), attr.getValue());
}
return stepMetadata;
}
/**
 * Picks the SimpleDB domain written by hadoop debugging: only domains starting with
 * 'ElasticMapReduce-' are considered and the last one in lexicographic order is returned
 * (presumably the newest - TODO confirm the naming scheme guarantees that).
 */
private String getDomain() {
List<String> domains = _simpleDB.listDomains().getDomainNames();
// NOTE(review): mutates the list returned by the SDK - assumes it is modifiable
for (Iterator<String> iterator = domains.iterator(); iterator.hasNext();) {
String domain = iterator.next();
if (!domain.startsWith("ElasticMapReduce-")) {
iterator.remove();
}
}
Collections.sort(domains);
Collections.reverse(domains);
if (domains.isEmpty()) {
throw new IllegalStateException("found no hadoop-debugging domains");
}
return domains.get(0);
}
/**
 * Blocks until the step finished, unwrapping InterruptedRuntimeException back into the
 * checked InterruptedException.
 *
 * @throws RuntimeException if the step finished unsuccessfully
 * @throws InterruptedException if interrupted while waiting
 */
public void join() throws InterruptedException {
try {
waitUntilStepFinished(_jobFlowId, _stepName, _stepIndex);
} catch (InterruptedRuntimeException e) {
throw e.getCause();
}
}
}
/**
 * Key/value view on the metadata hadoop debugging records for a job step, plus the attribute-name
 * constants used when querying the debugging SimpleDB domain.
 */
public class StepMetadata {
    public final static String JOB_ID = "jobId";
    public final static String JOB_FLOW_ID = "jobFlowId";
    public final static String JOB_INDEX = "jobIndex";
    public final static String JOB_STATE = "jobState";
    public final static String TYPE = "type";
    public final static String STEP_ID = "stepId";
    public final static String USERNAME = "username";
    public final static String START_TIME = "startTime";
    public final static String NUM_TASKS = "numTasks";
    public final static String NUM_PENDING_TASKS = "numPendingTasks";
    public final static String NUM_FAILED_TASKS = "numFailedTasks";
    public final static String NUM_RUNNING_TASKS = "numRunningTasks";
    public final static String NUM_CANCELLED_TASKS = "numCancelledTasks";
    public final static String NUM_COMPLETED_TASKS = "numCompletedTasks";

    // backing store for the attribute values
    private final Map<String, String> _mdMap = new HashMap<String, String>();

    /** Records (or overwrites) the value for the given attribute name. */
    public void add(String key, String value) {
        _mdMap.put(key, value);
    }

    /** @return the recorded value, or null if the attribute is absent. */
    public String get(String key) {
        return _mdMap.get(key);
    }

    /** @return the recorded value parsed as a Long, or null if the attribute is absent. */
    public Long getAsLong(String key) {
        String raw = _mdMap.get(key);
        return raw == null ? null : Long.valueOf(raw);
    }

    @Override
    public String toString() {
        return _mdMap.toString();
    }
}
/**
 * Local lifecycle state of an {@link EmrCluster} handle (not the remote job flow state, see
 * {@link JobFlowState}): UNCONNECTED -> STARTING -> CONNECTED -> STOPPING -> UNCONNECTED.
 */
public static enum ClusterState {
CONNECTED, UNCONNECTED, STARTING, STOPPING;
}
/**
 * Fluent builder that uploads the job jar to s3 (unless already cached there) and submits a
 * corresponding step to the connected job flow.
 */
public class JobStepBuilder {
private final StepConfig _stepConfig = new StepConfig().withHadoopJarStep(new HadoopJarStepConfig());
private final File _jobJar;
// name of the jar below the s3 job-jar base path; defaults to the local file name
private String _s3jobJarName;
JobStepBuilder(String name, File jobJar) {
_stepConfig.setName(name);
_jobJar = jobJar;
_s3jobJarName = jobJar.getName();
// a failing step should not terminate the kept-alive job flow
_stepConfig.setActionOnFailure("CONTINUE");
}
/** Overrides the jar name used below the s3 job-jar base path. */
public JobStepBuilder setS3JobJarName(String s3JobJarName) {
_s3jobJarName = s3JobJarName;
return this;
}
public JobStepBuilder setMainClass(Class<?> mainClass) {
_stepConfig.getHadoopJarStep().setMainClass(mainClass.getName());
return this;
}
public JobStepBuilder setMainArgs(String... args) {
_stepConfig.getHadoopJarStep().setArgs(Arrays.asList(args));
return this;
}
public JobStepBuilder addJvmArg(String key, String value) {
_stepConfig.getHadoopJarStep().getProperties().add(new KeyValue(key, value));
return this;
}
/**
 * Uploads the jar (unless cached in s3), adds the step to the connected job flow and returns
 * a future for tracking it.
 *
 * @throws IllegalStateException if not connected to a job flow
 */
public StepFuture submit() {
checkConnection(true);
String s3JobJarUri = uploadingJobJar(_jobJar, _s3jobJarName);
_stepConfig.getHadoopJarStep().setJar(s3JobJarUri);
_emrWebService.addJobFlowSteps(new AddJobFlowStepsRequest().withJobFlowId(_jobFlowId).withSteps(_stepConfig));
// invalidate the cached describe result so the new step is visible for the index lookup
_emrWebService.clearDescribeJobFlowCache();
return new StepFuture(_stepConfig.getName(), getStepIndex(getJobFlowDetail(_jobFlowId), _stepConfig.getName()));
}
/**
 * Ensures the job jar exists below the s3 job-jar base path (uploading it when missing) and
 * returns the s3n URI used for the step config.
 */
private String uploadingJobJar(File jobJar, String s3JobJarName) {
if (_s3Service == null) {
// NOTE(review): lazy init is not synchronized - concurrent first submits could race; confirm single-threaded use
_s3Service = new AmazonS3Client(new BasicAWSCredentials(getSettings().getAccessKey(), _accessSecret));
}
// serialize concurrent uploads of the same jar path
_uploadLock.lock(jobJar.getAbsolutePath());
try {
String s3JobJarPath = new File(getSettings().getS3JobJarBasePath(), s3JobJarName).getPath();
String s3Bucket = getSettings().getS3Bucket();
if (!_s3Service.doesBucketExist(s3Bucket)) {
throw new IllegalStateException("s3 bucket '" + s3Bucket + "' does not exists");
}
if (!S3Util.existsFile(_s3Service, s3Bucket, s3JobJarPath)) {
LOG.info("uploading " + jobJar + " to " + s3JobJarPath);
S3Util.uploadFile(_s3Service, s3Bucket, jobJar, s3JobJarPath);
} else {
// a jar with the same s3 path was uploaded earlier - reuse it
LOG.info("using cached job-jar: " + s3JobJarPath);
}
return "s3n://" + getSettings().getAccessKey() + "@" + s3Bucket + s3JobJarPath;
} finally {
_uploadLock.unlock(jobJar.getAbsolutePath());
}
}
}
}