package org.apache.hadoop.mapred;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FSError;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.ProtocolProxy;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.mapred.CoronaDirectTaskUmbilical.VersionedProtocolPointer;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.jvm.JvmMetrics;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.LogManager;

/**
 * Entry point for a child JVM launched by the Corona task tracker.
 *
 * <p>The JVM connects back to the task tracker over the umbilical RPC
 * channel given on the command line, then loops: ask for a task, run it,
 * sync its logs, and repeat until told to die or until the per-JVM task
 * limit is reached. All RPC proxies created along the way are tracked in
 * {@link #proxiesCreated} so they can be closed on exit.
 */
public class CoronaChild extends Child {
  /** List of proxies to close on cleanup. */
  static List<VersionedProtocolPointer> proxiesCreated =
      new ArrayList<VersionedProtocolPointer>();

  /**
   * Runs the child task loop.
   *
   * @param args {@code args[0]} = task tracker host, {@code args[1]} = port,
   *             {@code args[2]} = first task attempt id,
   *             {@code args[3]} = integer JVM id
   * @throws Throwable on any unrecoverable startup failure
   */
  public static void main(String[] args) throws Throwable {
    LOG.info("Corona Child starting");
    JobConf defaultConf = new JobConf();
    String host = args[0];
    int port = Integer.parseInt(args[1]);
    InetSocketAddress address = new InetSocketAddress(host, port);
    final TaskAttemptID firstTaskid = TaskAttemptID.forName(args[2]);
    // After this many consecutive empty polls we back off to a longer sleep.
    final int SLEEP_LONGER_COUNT = 5;
    int jvmIdInt = Integer.parseInt(args[3]);
    JVMId jvmId =
        new JVMId(firstTaskid.getJobID(), firstTaskid.isMap(), jvmIdInt);

    UserGroupInformation ticket = UserGroupInformation.login(defaultConf);
    int timeout = defaultConf.getInt("mapred.socket.timeout", 60000);
    // Umbilical back to the task tracker; tracked for cleanup in the
    // finally block below.
    TaskUmbilicalProtocol umbilical =
        ((ProtocolProxy<TaskUmbilicalProtocol>) RPC.getProtocolProxy(
            TaskUmbilicalProtocol.class, TaskUmbilicalProtocol.versionID,
            address, ticket, defaultConf,
            NetUtils.getDefaultSocketFactory(defaultConf), timeout)).getProxy();
    proxiesCreated.add(new VersionedProtocolPointer(umbilical));

    int numTasksToExecute = -1; //-1 signifies "no limit"
    int numTasksExecuted = 0;

    // Flush the logs of whatever task is current when the JVM goes down.
    Runtime.getRuntime().addShutdownHook(new Thread() {
      public void run() {
        try {
          if (taskid != null) {
            TaskLog.syncLogs(firstTaskid, taskid, isCleanup);
          }
        } catch (Throwable throwable) {
          // Best-effort only: the JVM is already shutting down and there is
          // nothing useful to do with a failure here.
        }
      }
    });

    Thread t = new Thread() {
      public void run() {
        //every so often wake up and syncLogs so that we can track
        //logs of the currently running task
        while (true) {
          try {
            Thread.sleep(5000);
            if (taskid != null) {
              TaskLog.syncLogs(firstTaskid, taskid, isCleanup);
            }
          } catch (InterruptedException ie) {
            // Deliberately ignored: this daemon thread just resumes polling.
          } catch (IOException iee) {
            LOG.error("Error in syncLogs: " + iee);
            System.exit(-1);
          }
        }
      }
    };
    t.setName("Thread for syncLogs");
    t.setDaemon(true);
    t.start();

    String pid = "";
    if (!Shell.WINDOWS) {
      pid = System.getenv().get("JVM_PID");
    }
    JvmContext context = new JvmContext(jvmId, pid);
    int idleLoopCount = 0;
    Task task = null;
    try {
      while (true) {
        taskid = null;
        JvmTask myTask = umbilical.getTask(context);
        if (myTask.shouldDie()) {
          break;
        } else {
          if (myTask.getTask() == null) {
            taskid = null;
            if (++idleLoopCount >= SLEEP_LONGER_COUNT) {
              //we sleep for a bigger interval when we don't receive
              //tasks for a while
              Thread.sleep(1500);
            } else {
              Thread.sleep(500);
            }
            continue;
          }
        }
        idleLoopCount = 0;
        task = myTask.getTask();
        taskid = task.getTaskID();
        isCleanup = task.isTaskCleanupTask();
        // reset the statistics for the task
        FileSystem.clearStatistics();

        //create the index file so that the log files
        //are viewable immediately
        TaskLog.syncLogs(firstTaskid, taskid, isCleanup);

        //setupWorkDir actually sets up the symlinks for the distributed
        //cache. After a task exits we wipe the workdir clean, and hence
        //the symlinks have to be rebuilt.
        JobConf job = new JobConf(task.getJobFile());

        //read from bigParam if it exists
        readBigParam(job);

        TaskRunner.setupWorkDir(job);

        {
          // Use conf written by CoronaTaskTracker.reconfigureLocalJobConf()
          // for some reason attempt conf is not updated between tasks
          // and DirectTaskUmbilical c'tor is falling back to local JT
          LocalDirAllocator lDir = new LocalDirAllocator("mapred.local.dir");
          Path umbilicalConfFile = lDir.getLocalPathToRead(
              CoronaTaskTracker.getLocalJobDir(task.getJobID().toString())
                  + Path.SEPARATOR + "job.xml", job);
          JobConf umbilicalConf = new JobConf(umbilicalConfFile);
          umbilical = convertToDirectUmbilicalIfNecessary(umbilical,
              umbilicalConf, task);
        }

        numTasksToExecute = job.getNumTasksToExecutePerJvm();
        assert(numTasksToExecute != 0);
        task.setConf(job);

        defaultConf.addResource(new Path(task.getJobFile()));

        // Initiate Java VM metrics
        JvmMetrics.init(task.getPhase().toString(), job.getSessionId());
        // use job-specified working directory
        FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
        try {
          task.run(job, umbilical); // run the task
        } finally {
          TaskLog.syncLogs(firstTaskid, taskid, isCleanup);
        }
        if (numTasksToExecute > 0 && ++numTasksExecuted == numTasksToExecute) {
          break;
        }
      }
    } catch (FSError e) {
      LOG.fatal("FSError from child", e);
      umbilical.fsError(taskid, e.getMessage());
    } catch (Exception exception) {
      LOG.warn("Error running child", exception);
      try {
        if (task != null) {
          // do cleanup for the task
          task.taskCleanup(umbilical);
        }
      } catch (Exception e) {
        // Use the (Object, Throwable) overload so the stack trace is kept;
        // the original concatenated the exception into the message.
        LOG.info("Error cleaning up", e);
      }
      // Report back any failures, for diagnostic purposes
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      exception.printStackTrace(new PrintStream(baos));
      if (taskid != null) {
        umbilical.reportDiagnosticInfo(taskid, baos.toString());
      }
    } catch (Throwable throwable) {
      LOG.fatal("Error running child : "
          + StringUtils.stringifyException(throwable));
      if (taskid != null) {
        Throwable tCause = throwable.getCause();
        String cause = tCause == null
            ? throwable.getMessage()
            : StringUtils.stringifyException(tCause);
        umbilical.fatalError(taskid, cause);
      }
    } finally {
      // Close every RPC proxy opened by this JVM (including any direct
      // umbilicals created in convertToDirectUmbilicalIfNecessary).
      for (VersionedProtocolPointer proxy : proxiesCreated) {
        RPC.stopProxy(proxy.getClient());
      }
      MetricsContext metricsContext = MetricsUtil.getContext("mapred");
      metricsContext.close();
      // Shutting down log4j of the child-vm...
      // This assumes that on return from Task.run()
      // there is no more logging done.
      LogManager.shutdown();
    }
  }

  /**
   * If "mapred.bigparam.path" is set in {@code job}, reads that file's entire
   * contents and stores them as "mapred.input.dir" (used when the input-dir
   * list is too large to pass through the normal configuration).
   *
   * @param job job configuration to read from and update in place
   * @throws IOException if the big-parameter file cannot be read
   */
  private static void readBigParam(JobConf job) throws IOException {
    String bigParamStr = job.get("mapred.bigparam.path", "");
    if (bigParamStr == null || bigParamStr.length() == 0) {
      return;
    }
    Path bigParamPath = new Path(bigParamStr);
    File file = new File(bigParamPath.toUri().getPath()).getAbsoluteFile();
    int MAX_BUFFER_SIZE = 1024;
    StringBuilder result = new StringBuilder();
    int totalChars = 0;
    BufferedReader in =
        new BufferedReader(new InputStreamReader(new FileInputStream(file)));
    // try/finally so the reader is closed even if read() throws; the
    // original leaked the stream on a read error.
    try {
      char[] buffer = new char[MAX_BUFFER_SIZE];
      int readChars;
      while ((readChars = in.read(buffer, 0, MAX_BUFFER_SIZE)) > 0) {
        result.append(buffer, 0, readChars);
        totalChars += readChars;
      }
    } finally {
      in.close();
    }
    job.set("mapred.input.dir", result.toString());
    LOG.info("Read mapred.input.dir: " + totalChars);
  }

  /**
   * Wraps {@code umbilical} in a {@link CoronaDirectTaskUmbilical} when the
   * job configures a direct job-tracker address. Only reducers need the
   * direct channel; map tasks get the original umbilical back unchanged.
   *
   * @param umbilical the current umbilical proxy
   * @param job       configuration holding the direct/fallback addresses
   * @param task      the task about to run
   * @return the direct umbilical if one was created, else {@code umbilical}
   * @throws IOException if the direct umbilical cannot be created
   */
  private static TaskUmbilicalProtocol convertToDirectUmbilicalIfNecessary(
      TaskUmbilicalProtocol umbilical, JobConf job, Task task)
      throws IOException {
    // We only need a direct umbilical for reducers.
    if (task.isMapTask()) {
      return umbilical;
    }
    InetSocketAddress directAddress = CoronaDirectTaskUmbilical.getAddress(
        job, CoronaDirectTaskUmbilical.DIRECT_UMBILICAL_JT_ADDRESS);
    InetSocketAddress secondaryAddress = CoronaDirectTaskUmbilical
        .getAddress(job,
            CoronaDirectTaskUmbilical.DIRECT_UMBILICAL_FALLBACK_ADDRESS);
    if (directAddress != null) {
      CoronaDirectTaskUmbilical direct =
          CoronaDirectTaskUmbilical.createDirectUmbilical(
              umbilical, directAddress, secondaryAddress, job);
      // Track the new proxies so main()'s finally block closes them too.
      proxiesCreated.addAll(direct.getCreatedProxies());
      return direct;
    }
    return umbilical;
  }
}