/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.client.api.async.impl;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.AMCommand;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.impl.AMRMClientImpl;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import com.google.common.annotations.VisibleForTesting;
@Private
@Unstable
public class AMRMClientAsyncImpl<T extends ContainerRequest>
extends AMRMClientAsync<T> {
private static final Log LOG = LogFactory.getLog(AMRMClientAsyncImpl.class);
private final HeartbeatThread heartbeatThread;
private final CallbackHandlerThread handlerThread;
private final BlockingQueue<AllocateResponse> responseQueue;
private final Object unregisterHeartbeatLock = new Object();
private volatile boolean keepRunning;
private volatile float progress;
private volatile Throwable savedException;
public AMRMClientAsyncImpl(int intervalMs, CallbackHandler callbackHandler) {
this(new AMRMClientImpl<T>(), intervalMs, callbackHandler);
}
@Private
@VisibleForTesting
public AMRMClientAsyncImpl(AMRMClient<T> client, int intervalMs,
CallbackHandler callbackHandler) {
super(client, intervalMs, callbackHandler);
heartbeatThread = new HeartbeatThread();
handlerThread = new CallbackHandlerThread();
responseQueue = new LinkedBlockingQueue<AllocateResponse>();
keepRunning = true;
savedException = null;
}
@Override
protected void serviceInit(Configuration conf) throws Exception {
super.serviceInit(conf);
client.init(conf);
}
@Override
protected void serviceStart() throws Exception {
handlerThread.setDaemon(true);
handlerThread.start();
client.start();
super.serviceStart();
}
/**
* Tells the heartbeat and handler threads to stop and waits for them to
* terminate.
*/
@Override
protected void serviceStop() throws Exception {
keepRunning = false;
heartbeatThread.interrupt();
try {
heartbeatThread.join();
} catch (InterruptedException ex) {
LOG.error("Error joining with heartbeat thread", ex);
}
client.stop();
handlerThread.interrupt();
super.serviceStop();
}
public void setHeartbeatInterval(int interval) {
heartbeatIntervalMs.set(interval);
}
public List<? extends Collection<T>> getMatchingRequests(
Priority priority,
String resourceName,
Resource capability) {
return client.getMatchingRequests(priority, resourceName, capability);
}
/**
* Registers this application master with the resource manager. On successful
* registration, starts the heartbeating thread.
* @throws YarnException
* @throws IOException
*/
public RegisterApplicationMasterResponse registerApplicationMaster(
String appHostName, int appHostPort, String appTrackingUrl)
throws YarnException, IOException {
RegisterApplicationMasterResponse response = client
.registerApplicationMaster(appHostName, appHostPort, appTrackingUrl);
heartbeatThread.start();
return response;
}
/**
* Unregister the application master. This must be called in the end.
* @param appStatus Success/Failure status of the master
* @param appMessage Diagnostics message on failure
* @param appTrackingUrl New URL to get master info
* @throws YarnException
* @throws IOException
*/
public void unregisterApplicationMaster(FinalApplicationStatus appStatus,
String appMessage, String appTrackingUrl) throws YarnException,
IOException {
synchronized (unregisterHeartbeatLock) {
keepRunning = false;
client.unregisterApplicationMaster(appStatus, appMessage, appTrackingUrl);
}
}
/**
* Request containers for resources before calling <code>allocate</code>
* @param req Resource request
*/
public void addContainerRequest(T req) {
client.addContainerRequest(req);
}
/**
* Remove previous container request. The previous container request may have
* already been sent to the ResourceManager. So even after the remove request
* the app must be prepared to receive an allocation for the previous request
* even after the remove request
* @param req Resource request
*/
public void removeContainerRequest(T req) {
client.removeContainerRequest(req);
}
/**
* Release containers assigned by the Resource Manager. If the app cannot use
* the container or wants to give up the container then it can release them.
* The app needs to make new requests for the released resource capability if
* it still needs it. eg. it released non-local resources
* @param containerId
*/
public void releaseAssignedContainer(ContainerId containerId) {
client.releaseAssignedContainer(containerId);
}
/**
* Get the currently available resources in the cluster.
* A valid value is available after a call to allocate has been made
* @return Currently available resources
*/
public Resource getAvailableResources() {
return client.getAvailableResources();
}
/**
* Get the current number of nodes in the cluster.
* A valid values is available after a call to allocate has been made
* @return Current number of nodes in the cluster
*/
public int getClusterNodeCount() {
return client.getClusterNodeCount();
}
private class HeartbeatThread extends Thread {
public HeartbeatThread() {
super("AMRM Heartbeater thread");
}
public void run() {
while (true) {
AllocateResponse response = null;
// synchronization ensures we don't send heartbeats after unregistering
synchronized (unregisterHeartbeatLock) {
if (!keepRunning) {
return;
}
try {
response = client.allocate(progress);
} catch (Throwable ex) {
LOG.error("Exception on heartbeat", ex);
savedException = ex;
// interrupt handler thread in case it waiting on the queue
handlerThread.interrupt();
return;
}
}
if (response != null) {
while (true) {
try {
responseQueue.put(response);
if (response.getAMCommand() == AMCommand.AM_RESYNC
|| response.getAMCommand() == AMCommand.AM_SHUTDOWN) {
return;
}
break;
} catch (InterruptedException ex) {
LOG.info("Interrupted while waiting to put on response queue", ex);
}
}
}
try {
Thread.sleep(heartbeatIntervalMs.get());
} catch (InterruptedException ex) {
LOG.info("Heartbeater interrupted", ex);
}
}
}
}
private class CallbackHandlerThread extends Thread {
public CallbackHandlerThread() {
super("AMRM Callback Handler Thread");
}
public void run() {
while (true) {
if (!keepRunning) {
return;
}
try {
AllocateResponse response;
if(savedException != null) {
LOG.error("Stopping callback due to: ", savedException);
handler.onError(savedException);
return;
}
try {
response = responseQueue.take();
} catch (InterruptedException ex) {
LOG.info("Interrupted while waiting for queue", ex);
continue;
}
if (response.getAMCommand() != null) {
switch(response.getAMCommand()) {
case AM_RESYNC:
case AM_SHUTDOWN:
handler.onShutdownRequest();
LOG.info("Shutdown requested. Stopping callback.");
return;
default:
String msg =
"Unhandled value of RM AMCommand: " + response.getAMCommand();
LOG.error(msg);
throw new YarnRuntimeException(msg);
}
}
List<NodeReport> updatedNodes = response.getUpdatedNodes();
if (!updatedNodes.isEmpty()) {
handler.onNodesUpdated(updatedNodes);
}
List<ContainerStatus> completed =
response.getCompletedContainersStatuses();
if (!completed.isEmpty()) {
handler.onContainersCompleted(completed);
}
List<Container> allocated = response.getAllocatedContainers();
if (!allocated.isEmpty()) {
handler.onContainersAllocated(allocated);
}
progress = handler.getProgress();
} catch (Throwable ex) {
handler.onError(ex);
// re-throw exception to end the thread
throw new YarnRuntimeException(ex);
}
}
}
}
}