package com.neverwinterdp.scribengin.yarn;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.NMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.Records;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.mortbay.log.Log;

import com.beust.jcommander.Parameter;
import com.neverwinterdp.scribengin.constants.Constants;
import com.neverwinterdp.scribengin.scribeconsumer.ScribeConsumerConfig;
import com.neverwinterdp.scribengin.utilities.Util;

/**
 * Base class for a YARN ApplicationMaster that asks the ResourceManager for
 * containers and runs one shell command per allocated container.
 *
 * <p>Subclasses supply the commands through {@link #buildCommandList(ScribeConsumerConfig)}
 * and are expected to populate {@link #totalContainerCount} and
 * {@link #scribeConsumerConfig} before {@link #run()} is called. Commands whose
 * container did not exit successfully are queued in {@link #failedCommandList}
 * and handed to the next containers that get allocated.
 */
public abstract class AbstractApplicationMaster {
  /** Interval argument passed to {@code AMRMClientAsync.createAMRMClientAsync}. */
  private static final int RM_CLIENT_INTERVAL_MS = 1000;
  /** How long {@link #run()} sleeps between checks of the {@code done} flag. */
  private static final long DONE_POLL_INTERVAL_MS = 200;

  private AMRMClientAsync<ContainerRequest> resourceManager;
  private NMClient nodeManager;
  private final Configuration conf;

  protected static final Logger LOG =
      Logger.getLogger(AbstractApplicationMaster.class.getName());

  @Parameter(names = {"-" + Constants.OPT_CONTAINER_MEM, "--" + Constants.OPT_CONTAINER_MEM})
  private int containerMem;

  protected int totalContainerCount;
  private final AtomicInteger completedContainerCount;
  private final AtomicInteger allocatedContainerCount;
  private final AtomicInteger failedContainerCount;
  private final AtomicInteger requestedContainerCount;

  private String appMasterHostname = ""; // TODO: What should this really be?
  private int appMasterRpcPort = 0; // TODO: What should this really be?
  private String appMasterTrackingUrl = ""; // TODO: What should this really be?

  /**
   * Set by the RM callbacks and polled by {@link #run()} on a different
   * thread, hence volatile: without it the polling loop is not guaranteed to
   * ever observe the update.
   */
  private volatile boolean done;

  /** Command launched in each container, keyed by container id. */
  protected Map<ContainerId, String> containerIdCommandMap;
  /** Commands whose container did not succeed and that still have to be re-run. */
  protected List<String> failedCommandList;
  protected ScribeConsumerConfig scribeConsumerConfig;

  /** Creates an application master backed by a default {@link YarnConfiguration}. */
  public AbstractApplicationMaster() {
    conf = new YarnConfiguration();
    completedContainerCount = new AtomicInteger();
    allocatedContainerCount = new AtomicInteger();
    failedContainerCount = new AtomicInteger();
    requestedContainerCount = new AtomicInteger();
    containerIdCommandMap = new HashMap<ContainerId, String>();
    failedCommandList = new ArrayList<String>();
  }

  /**
   * Creates an application master whose configuration additionally loads the
   * given YARN site file.
   *
   * @param yarnSiteXml path of the site file, for example
   *        {@code "/etc/hadoop/conf/yarn-site.xml"}
   */
  public AbstractApplicationMaster(String yarnSiteXml) {
    this();
    this.conf.addResource(new Path(yarnSiteXml));
  }

  /** Sets the class logger to INFO and clears the {@code done} flag. */
  public void init() {
    LOG.setLevel(Level.INFO);
    done = false;
  }

  /**
   * Starts the RM and NM clients, registers with the ResourceManager, requests
   * a container, blocks until a callback marks the application as done and then
   * unregisters with {@link FinalApplicationStatus#SUCCEEDED}.
   *
   * <p>An interrupt received while waiting does not end the wait (as before),
   * but the thread's interrupt status is restored before this method returns so
   * that callers can still observe it.
   *
   * @return always {@code true}
   * @throws IOException if registering or unregistering with the RM fails
   * @throws YarnException if registering or unregistering with the RM fails
   */
  public boolean run() throws IOException, YarnException {
    LOG.info("ApplicationMaster::run");

    // Initialize clients to RM and NMs.
    AMRMClientAsync.CallbackHandler rmListener = new RMCallbackHandler();
    resourceManager = AMRMClientAsync.createAMRMClientAsync(RM_CLIENT_INTERVAL_MS, rmListener);
    resourceManager.init(conf);
    resourceManager.start();

    nodeManager = NMClient.createNMClient();
    nodeManager.init(conf);
    nodeManager.start();

    // Register with RM
    resourceManager.registerApplicationMaster(
        appMasterHostname, appMasterRpcPort, appMasterTrackingUrl);

    LOG.info("total container count: " + Integer.toString(totalContainerCount));

    // NOTE(review): exactly one container request is issued here although
    // requestedContainerCount is advanced by totalContainerCount; the
    // per-container request loop was disabled in the original code. Confirm
    // this is intended before changing it.
    ContainerRequest containerReq = setupContainerReqForRM();
    resourceManager.addContainerRequest(containerReq);
    requestedContainerCount.addAndGet(totalContainerCount);

    boolean interrupted = false;
    try {
      while (!done) {
        try {
          Thread.sleep(DONE_POLL_INTERVAL_MS);
        } catch (InterruptedException ex) {
          // Keep waiting for the callbacks, but remember the interrupt.
          interrupted = true;
        }
      }

      // Un-register with ResourceManager
      resourceManager.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, "", "");
    } finally {
      if (interrupted) {
        Thread.currentThread().interrupt();
      }
    }
    return true;
  }

  /**
   * Builds a request for one worker container with priority 0, no host or rack
   * constraint and {@code containerMem} as its memory setting.
   */
  private ContainerRequest setupContainerReqForRM() {
    // Priority for worker containers - priorities are intra-application
    Priority priority = Records.newRecord(Priority.class);
    priority.setPriority(0);

    // Resource requirements for worker containers
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(containerMem);

    return new ContainerRequest(
        capability,
        null /* hosts String[] */,
        null /* racks String [] */,
        priority);
  }

  /**
   * Moves the command that was running in {@code cid} onto the retry queue.
   * Containers for which no command is known are only logged; queueing
   * {@code null} would later be launched as the literal command "null".
   */
  private synchronized void recordFailedCommand(ContainerId cid) {
    String failedCmd = containerIdCommandMap.remove(cid);
    if (failedCmd == null) {
      LOG.warn("No command recorded for container " + cid + "; nothing to retry");
      return;
    }
    failedCommandList.add(failedCmd);
  }

  /** Remembers which command is associated with the given container. */
  private synchronized void trackCommand(ContainerId cid, String cmd) {
    containerIdCommandMap.put(cid, cmd);
  }

  /** Removes and returns up to {@code max} commands from the head of the retry queue. */
  private synchronized List<String> takeFailedCommands(int max) {
    int count = Math.min(max, failedCommandList.size());
    List<String> head = failedCommandList.subList(0, count);
    List<String> taken = new ArrayList<String>(head);
    head.clear();
    return taken;
  }

  /**
   * Collects the commands for a batch of {@code containerCnt} containers:
   * queued retries first, then commands freshly built by the subclass when the
   * retries do not cover every container. The result may hold fewer or more
   * commands than containers; callers must cope with both.
   */
  private List<String> assembleCommands(int containerCnt) {
    List<String> commands = takeFailedCommands(containerCnt);
    int freshCnt = containerCnt - commands.size();
    if (freshCnt > 0) {
      // Only containers that are not retries count as newly allocated.
      allocatedContainerCount.addAndGet(freshCnt);
      List<String> fresh = buildCommandList(scribeConsumerConfig);
      if (fresh != null) {
        commands.addAll(fresh);
      }
    }
    return commands;
  }

  /**
   * Returns the shell commands to run, one per container.
   *
   * @param conf consumer configuration the commands are derived from
   * @return the commands to launch
   */
  abstract protected List<String> buildCommandList(ScribeConsumerConfig conf);

  /**
   * CallbackHandler for RM.
   * Execute a program when the container is allocated.
   * Reallocate upon failure.
   */
  private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
    /**
     * Containers released by this handler because no command was left for
     * them. Their completion is neither a success nor a failure and must not
     * trigger a replacement request.
     */
    private final Set<ContainerId> releasedSurplusContainers =
        Collections.synchronizedSet(new HashSet<ContainerId>());

    /**
     * Updates the counters for every finished container, queues the commands
     * of unsuccessful ones for retry, requests replacement containers and marks
     * the application done once {@code totalContainerCount} containers have
     * succeeded.
     */
    public void onContainersCompleted(List<ContainerStatus> statuses) {
      LOG.info("onContainersCompleted");
      for (ContainerStatus status : statuses) {
        assert (status.getState() == ContainerState.COMPLETE);

        if (releasedSurplusContainers.remove(status.getContainerId())) {
          continue;
        }

        int exitStatus = status.getExitStatus();
        if (exitStatus == ContainerExitStatus.SUCCESS) {
          completedContainerCount.incrementAndGet();
          continue;
        }

        // ABORTED containers are retried but not counted as failures.
        if (exitStatus != ContainerExitStatus.ABORTED) {
          failedContainerCount.incrementAndGet();
        }
        allocatedContainerCount.decrementAndGet();
        requestedContainerCount.decrementAndGet();
        recordFailedCommand(status.getContainerId());
      }

      // need to reallocate failed containers
      int askAgainCount = totalContainerCount - requestedContainerCount.get();
      requestedContainerCount.addAndGet(askAgainCount);
      for (int i = 0; i < askAgainCount; i++) {
        resourceManager.addContainerRequest(setupContainerReqForRM());
      }

      if (completedContainerCount.get() == totalContainerCount) {
        done = true;
      }
    }

    /**
     * Assigns one command to each allocated container and starts it. Containers
     * for which no command is available are released instead of failing the
     * whole batch.
     */
    public void onContainersAllocated(List<Container> containers) {
      LOG.info("onContainersAllocated");
      int containerCnt = containers.size();
      LOG.info("containerCnt: " + containerCnt);

      // TODO: keep track of failed commands' history.
      List<String> commands = assembleCommands(containerCnt);

      FileSystem fs = null;
      try {
        fs = FileSystem.get(conf);
      } catch (IOException e) {
        LOG.error("Error in instantiating fs. Reason: " + e, e);
      }

      int next = 0;
      for (Container c : containers) {
        if (next >= commands.size()) {
          releaseSurplus(c);
          continue;
        }
        String cmdStr = commands.get(next++);
        // Track before launching so that a later completion report for this
        // container can put the command back on the retry queue.
        trackCommand(c.getId(), cmdStr);

        if (fs == null) {
          // Same outcome as any other launch failure below: log and move on.
          LOG.error("Skipping launch of container " + c.getId()
              + ": no FileSystem available");
          continue;
        }
        launch(c, cmdStr, fs);
      }
    }

    /** Gives back a container that has no command to run. */
    private void releaseSurplus(Container c) {
      LOG.warn("No command available for container " + c.getId() + "; releasing it");
      // assembleCommands() counted this container as newly allocated.
      allocatedContainerCount.decrementAndGet();
      releasedSurplusContainers.add(c.getId());
      resourceManager.releaseAssignedContainer(c.getId());
    }

    /**
     * Starts {@code cmdStr} in container {@code c}, redirecting its stdout and
     * stderr into the container log directory. Failures are logged, not thrown.
     */
    private void launch(Container c, String cmdStr, FileSystem fs) {
      LOG.info("running cmd: " + cmdStr);
      ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class);
      String shellLine = new StringBuilder()
          .append(cmdStr)
          .append(" 1> ").append(ApplicationConstants.LOG_DIR_EXPANSION_VAR)
          .append("/stdout")
          .append(" 2> ").append(ApplicationConstants.LOG_DIR_EXPANSION_VAR)
          .append("/stderr")
          .toString();
      ctx.setCommands(Collections.singletonList(shellLine));

      try {
        // TODO: get rid of the hardcoding of scribengin-1.0-SNAPSHOT.jar
        ctx.setLocalResources(
            Collections.singletonMap("scribeconsumer.jar",
                Util.newYarnAppResource(fs, new Path("/scribengin-1.0-SNAPSHOT.jar"),
                    LocalResourceType.FILE, LocalResourceVisibility.APPLICATION)));
        nodeManager.startContainer(c, ctx);
      } catch (YarnException e) {
        LOG.error("Error in onContainerAlloacted. Reason: " + e, e);
      } catch (IOException e) {
        LOG.error("Error in onContainerAlloacted. Reason: " + e, e);
      }
    }

    /** Node updates are ignored. */
    public void onNodesUpdated(List<NodeReport> updated) {}

    /** Logs the error, marks the application done and stops the RM client. */
    public void onError(Throwable e) {
      LOG.error("ResourceManager client reported an error: " + e, e);
      done = true;
      resourceManager.stop();
    }

    // Called when the ResourceManager wants the ApplicationMaster to shutdown
    // for being out of sync etc. The ApplicationMaster should not unregister
    // with the RM unless the ApplicationMaster wants to be the last attempt.
    public void onShutdownRequest() {
      done = true;
    }

    /** Progress is not tracked; always reports 0. */
    public float getProgress() {
      return 0;
    }
  }
}