package com.scaleunlimited.cascading; import java.io.File; import java.io.FileNotFoundException; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TimeZone; import java.util.concurrent.ExecutionException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.mapred.JobConf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import cascading.flow.Flow; import cascading.flow.FlowProcess; import cascading.flow.FlowStep; import cascading.flow.hadoop.HadoopFlowProcess; import cascading.stats.CascadingStats.Status; import cascading.stats.FlowStats; import cascading.stats.FlowStepStats; import cascading.stats.hadoop.HadoopSliceStats; import cascading.stats.hadoop.HadoopSliceStats.Kind; import cascading.stats.hadoop.HadoopStepStats; import cascading.stats.local.LocalStepStats; import com.scaleunlimited.cascading.hadoop.HadoopUtils; public class FlowRunner { static final Logger LOGGER = LoggerFactory.getLogger(FlowRunner.class); private static final long FLOW_CHECK_INTERVAL = 1 * 1000L; private static final long TERMINATE_CHECK_INTERVAL = 100L; // Default number of flows to run in parallel private static final int DEFAULT_MAX_FLOWS = 100; private static class TaskStats { private String _flowName; private String _stepName; private int _mapCount; private int _reduceCount; private long _mapTime; private long _reduceTime; public TaskStats(String flowName, String stepName) { _flowName = flowName; _stepName = stepName; _mapCount = 0; _reduceCount = 0; _mapTime = 0; _reduceTime = 0; } public void incMapCount(int mapCount) { _mapCount += mapCount; } public void incReduceCount(int reduceCount) { _reduceCount += reduceCount; } public void incMapTime(long mapTime) { _mapTime += mapTime; } public void incReduceTime(long reduceTime) { _reduceTime += reduceTime; } public String getFlowName() { return _flowName; } public String getStepName() { return _stepName; } public int getMapCount() { return _mapCount; } public int getReduceCount() { return _reduceCount; } public long getMapTime() { return _mapTime; } public long getReduceTime() { return _reduceTime; } } private int _maxFlows; private final List<FlowFuture> _flowFutures = new ArrayList<FlowFuture>(); private Thread _statsThread; public FlowRunner() { this(DEFAULT_MAX_FLOWS); } public FlowRunner(int maxFlows) { this("FlowRunner", maxFlows, null, 0); } /** * Create a FlowRunner suitable for running up to maxFlows flows in parallel. * Note that maxFlows is ignored (treated as 1) when adding a flow, if we're * not in a real (non-local) Hadoop environment. * * @param runnerName Name of the FlowRunner, for stats & job naming purposes. * @param maxFlows Maximum number of flows to run in parallel * @param statsDir Where to put continuous job statistics (or null for no stats) * @param checkInterval How often to write out statistics, in milliseconds. */ public FlowRunner(String runnerName, int maxFlows, File statsDir, final long checkInterval) { if ((maxFlows > 1) && (HadoopUtils.isJobLocal(new JobConf()))) { LOGGER.warn("Running locally, so flows must be run serially for thread safety."); _maxFlows = 1; } else { _maxFlows = maxFlows; } createStats(runnerName, statsDir, checkInterval); } private void createStats(final String runnerName, File statsDir, final long checkInterval) { if (statsDir == null) { return; } // Since the caller wants stats, we need to fire up a thread that will check // the flows on a regular basis. statsDir.mkdirs(); final PrintStream statsStream = makeStatsStream(statsDir, runnerName, "stats.tsv"); final PrintStream detailsStream = makeStatsStream(statsDir, runnerName, "details.tsv"); final PrintStream summaryStream = makeStatsStream(statsDir, runnerName, "summary.tsv"); _statsThread = new Thread(new Runnable() { @Override public void run() { long startTime = System.currentTimeMillis(); // We want to set our next check time so that when the checkInterval is // added to it, we're on a checkInterval boundary. So that way we'd // get a check time of say 03:34:57, and the next check time would // be 03:35:00, if the interval was every minute. long nextCheckTime = startTime - (startTime % checkInterval); SimpleDateFormat timeFormatter = new SimpleDateFormat("yyyyMMdd hh:mm:ss"); timeFormatter.setTimeZone(TimeZone.getTimeZone("UTC")); // Set up for per-step summaries. For each unique step (Hadoop Job), we'll // keep track of total map-minutes, total reduce-minutes. Map<String, Long> mapMilliseconds = new HashMap<String, Long>(); Map<String, Long> reduceMilliseconds = new HashMap<String, Long>(); try { boolean terminating = false; while (!terminating) { // We sleep at the top, so that when we get interrupted (typically when a flow is done) // we collect the final stats for the flow. try { Thread.sleep(Math.max(0, nextCheckTime - System.currentTimeMillis())); } catch (InterruptedException e) { // We were interrupted, so terminate. LOGGER.info("Terminating Flow stats thread"); terminating = true; } Map<String, TaskStats> taskCounts = new HashMap<String, TaskStats>(); String timestamp = timeFormatter.format(nextCheckTime); nextCheckTime += checkInterval; synchronized (_flowFutures) { for (FlowFuture ff : _flowFutures) { collectStats(ff, taskCounts); } } // Output counts. Format is // <timestamp><tab><map tasks><tab><reduce tasks><tab><task details> String stats = makeStats(taskCounts, true); statsStream.println(String.format("%s\t%s", timestamp, stats)); // System.out.println("" + timestamp + "\t" + stats); // Generate a summary line // <timestamp> <map tasks> <reduce tasks> stats = makeStats(taskCounts, false); detailsStream.println(String.format("%s\t%s", timestamp, stats)); // For each of the tasks with map/reduce counts, output a separate line after the summary line. // <map tasks> <reduce tasks> <task name> <timestamp> for (TaskStats taskStat : taskCounts.values()) { String flowAndStepName = String.format("%s|%s", taskStat.getFlowName(), taskStat.getStepName()); int mapCount = taskStat.getMapCount(); addSlotTime(mapMilliseconds, flowAndStepName, mapCount, checkInterval); int reduceCount = taskStat.getReduceCount(); addSlotTime(reduceMilliseconds, flowAndStepName, reduceCount, checkInterval); if (mapCount + reduceCount > 0) { detailsStream.println(String.format("\t%d\t%d\t%s\t%s", mapCount, reduceCount, flowAndStepName, timestamp)); } } } // All done with Flow, write out summary results. for (String flowAndStepName : mapMilliseconds.keySet()) { long mapDuration = mapMilliseconds.get(flowAndStepName) / 60 * 1000L; long reduceDuration = reduceMilliseconds.get(flowAndStepName) / 60 * 1000L; summaryStream.println(String.format("%d\t%d\t%s", mapDuration, reduceDuration, flowAndStepName)); } } catch (Throwable t) { LOGGER.error("Exception while collecting stats for " + runnerName, t); } finally { IOUtils.closeQuietly(statsStream); IOUtils.closeQuietly(detailsStream); IOUtils.closeQuietly(summaryStream); } } private void addSlotTime(Map<String, Long> slotTime, String flowAndStepName, int slotCount, long checkInterval) { Long curSlotTime = slotTime.get(flowAndStepName); if (curSlotTime == null) { curSlotTime = new Long(0); } slotTime.put(flowAndStepName, curSlotTime + (slotCount * checkInterval)); } }, "FlowRunner stats"); // We don't want to hold up the JVM for this one thread. _statsThread.setDaemon(true); // We want to make sure the thread gets terminated on shutdown Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { @Override public void run() { terminate(); } }, "FlowRunner stats shutdown hook")); _statsThread.start(); } private PrintStream makeStatsStream(File statsDir, String runnerName, String suffix) { File statsFile = new File(statsDir, runnerName + "-" + suffix); statsFile.delete(); final PrintStream statsStream; try { statsStream = new PrintStream(statsFile, "UTF-8"); LOGGER.info("Logging stats to " + statsFile); } catch (FileNotFoundException e) { throw new RuntimeException("Can't create stats output file: " + statsFile, e); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Impossible exception", e); } return statsStream; } private String makeStats(Map<String, TaskStats> taskCounts, boolean includeTaskDetails) { // We want <# map tasks><tab><# reduce tasks><tab><task details> // where <task details> looks like <flowname|stepname=mapcount,reducecount; flowname|stepname=mapcount,reducecount> int mapCount = 0; int reduceCount = 0; StringBuilder taskDetails = new StringBuilder(); for (TaskStats stats : taskCounts.values()) { if ((stats.getMapCount() == 0) && (stats.getReduceCount() == 0)) { // TODO Figure out why we don't get any total map/reduce time values. // taskDetails.append(String.format("%s|%s=%dms,%dms; ", stats.getFlowName(), stats.getStepName(), stats.getMapTime(), stats.getReduceTime())); } else { mapCount += stats.getMapCount(); reduceCount += stats.getReduceCount(); if (includeTaskDetails) { taskDetails.append(String.format("%s|%s=%d,%d; ", stats.getFlowName(), stats.getStepName(), stats.getMapCount(), stats.getReduceCount())); } } } return String.format("%d\t%d\t%s", mapCount, reduceCount, taskDetails.toString()); } private void collectStats(FlowFuture ff, Map<String, TaskStats> taskCounts) { Flow flow = ff.getFlow(); FlowStats fs = flow.getFlowStats(); String flowId = flow.getID(); String flowName = flow.getName(); List<FlowStep> flowSteps = flow.getFlowSteps(); for (FlowStep flowStep : flowSteps) { FlowStepStats stepStats = flowStep.getFlowStepStats(); String stepId = flowStep.getID(); String stepName = flowStep.getName(); String countsKey = String.format("%s-%s", flowId, stepId); if (stepStats instanceof HadoopStepStats) { HadoopStepStats hadoopSS = (HadoopStepStats)stepStats; // We don't want/need info on task attempts hadoopSS.captureDetail(false); // We have one child for every task. We have to see if it's // running, and if so, whether it's a mapper or reducer Iterator<HadoopSliceStats> iter = hadoopSS.getChildren().iterator(); while (iter.hasNext()) { HadoopSliceStats sliceStats = iter.next(); // System.out.println(String.format("id=%s, kind=%s, status=%s", sliceStats.getID(), sliceStats.getKind(), sliceStats.getStatus())); if (sliceStats.getStatus() == Status.SUCCESSFUL) { // Set the total time // TODO this doesn't seem to be working, I get 0. // Plus it needs JobInProgress.Counter as a class, which means anyone using // cascading.utils winds up needing to include Hadoop. /* incrementCounts(taskCounts, countsKey, flowName, stepName, 0, 0, sliceStats.getCounterValue(JobInProgress.Counter.SLOTS_MILLIS_MAPS), sliceStats.getCounterValue(JobInProgress.Counter.SLOTS_MILLIS_REDUCES)); */ } else if (sliceStats.getStatus() == Status.RUNNING) { if (sliceStats.getKind() == Kind.MAPPER) { incrementCounts(taskCounts, countsKey, flowName, stepName, 1, 0, 0, 0); } else if (sliceStats.getKind() == Kind.REDUCER) { incrementCounts(taskCounts, countsKey, flowName, stepName, 0, 1, 0, 0); } } } } else if (stepStats instanceof LocalStepStats) { stepStats.captureDetail(); // map & reduce kind of run as one, so just add one to both if there's a group. incrementCounts(taskCounts, countsKey, flowName, stepName, 1, 0, 0, 0); if (flowStep.getGroups().size() > 0) { incrementCounts(taskCounts, countsKey, flowName, stepName, 0, 1, 0, 0); } } else { throw new RuntimeException("Unknown type returned by FlowStep.getFlowStepStats: " + stepStats.getClass()); } } } /** * Cascading will hang on to the HadoopStepStats for every flow, until we get rid of them. So we'll clear out * stats once a FlowFuture is done. */ private void clearStats(FlowFuture ff) { Flow flow = ff.getFlow(); FlowStats fs = ff.getFlow().getFlowStats(); List<FlowStep> flowSteps = flow.getFlowSteps(); for (FlowStep flowStep : flowSteps) { FlowStepStats stepStats = flowStep.getFlowStepStats(); if (stepStats instanceof HadoopStepStats) { HadoopStepStats hadoopSS = (HadoopStepStats)stepStats; hadoopSS.getTaskStats().clear(); } } } private void incrementCounts(Map<String, TaskStats> counts, String countsKey, String flowName, String stepName, int mapCount, int reduceCount, long mapTime, long reduceTime) { // If they're all zero, just ignore the call so we don't get extra entries. if ((mapCount == 0) && (reduceCount == 0) && (mapTime == 0) && (reduceTime == 0)) { return; } TaskStats curStats = counts.get(countsKey); if (curStats == null) { curStats = new TaskStats(flowName, stepName); } curStats.incMapCount(mapCount); curStats.incReduceCount(reduceCount); curStats.incMapTime(mapTime); curStats.incReduceTime(reduceTime); counts.put(countsKey, curStats); } /** * Wait for an open slot, and then start the Flow running, returning * the corresponding FlowFuture. * * FUTURE - we could make FlowFuture something where the flow doesn't * start until you call FlowFuture.start(). Then this routine could * immediately return the FlowFuture, and start it when we drop under * <maxFlows>. But that would require an async executor to constantly * be polling the running flows, determining when they are done. * * @param flow the Flow to run * @return FlowFuture * @throws InterruptedException */ public FlowFuture addFlow(Flow flow) throws InterruptedException { // Assume single-threaded unless we're a non-local Hadoop flow. boolean singleThreaded = true; FlowProcess<?> fp = HadoopUtils.undelegate(flow.getFlowProcess()); if (fp instanceof HadoopFlowProcess) { HadoopFlowProcess hfp = (HadoopFlowProcess)fp; singleThreaded = HadoopUtils.isJobLocal(hfp.getJobConf()); } String message; if (singleThreaded) { message = String.format( "Adding flow %s to single-threaded FlowRunner", flow.getName()); } else { message = String.format( "Adding flow %s to FlowRunner (which supports %d simultaneous flows)", flow.getName(), _maxFlows); } LOGGER.info(message); // Find an open spot, or loop until we get one. while (true) { synchronized (_flowFutures) { Iterator<FlowFuture> iter = _flowFutures.iterator(); while (iter.hasNext()) { FlowFuture ff = iter.next(); if (ff.isDone()) { clearStats(ff); iter.remove(); } } // Now that we've removed any flows that are done, see if we // can add the new flow. If we're single threaded then the // max number of flows at one time is just one. if (_flowFutures.size() < (singleThreaded ? 1 : _maxFlows)) { FlowFuture ff = new FlowFuture(flow); _flowFutures.add(ff); return ff; } } // No open slots, so loop Thread.sleep(FLOW_CHECK_INTERVAL); } } /** * @return false if there is room to add at least one more flow. */ public boolean isFull() { // Release any completed flows isDone(); synchronized (_flowFutures) { return (_flowFutures.size() >= _maxFlows); } } /** * Return true if all of the flows are done running. * * @return */ public boolean isDone() { synchronized (_flowFutures) { Iterator<FlowFuture> iter = _flowFutures.iterator(); while (iter.hasNext()) { FlowFuture ff = iter.next(); if (ff.isDone()) { iter.remove(); } else { return false; } } } // Nothing still running, so we're all done. return true; } /** * Wait until all of the flows have finished. * * @throws InterruptedException */ public void complete() throws InterruptedException { while (!isDone()) { Thread.sleep(FLOW_CHECK_INTERVAL); } terminate(); } public void terminate() { // First terminate all of the running flows. synchronized (_flowFutures) { Iterator<FlowFuture> iter = _flowFutures.iterator(); while (iter.hasNext()) { FlowFuture ff = iter.next(); if (ff.isDone()) { iter.remove(); } else { ff.cancel(true); } } } // And now interrupt the stats thread, as we should now have collected // all of the data. Thread statsThread = null; if (_statsThread != null) { synchronized(_statsThread) { // Somebody might have cleared the thread if ((_statsThread != null) && _statsThread.isAlive()) { _statsThread.interrupt(); // Save thread off, since we have to leave the synchronized // block before the actual running code can clear the thread // state. statsThread = _statsThread; _statsThread = null; } } } // Now we need to wait until the stats thread has really stopped, // so that stats results are available after a call to terminate(). // This is mostly so that unit tests can reliably run. if (statsThread != null) { // Wait until the stats thread has really stopped while (statsThread.isAlive()) { try { Thread.sleep(TERMINATE_CHECK_INTERVAL); } catch (InterruptedException e) { LOGGER.warn("Stats thread termination interrupted!"); } } } } @Override protected void finalize() throws Throwable { terminate(); super.finalize(); } /** * Convenience method for running a Flow and returning the result. * * @param flow Flow to run * @return Result of running the flow. * @throws InterruptedException * @throws ExecutionException */ public static FlowResult run(Flow flow) throws InterruptedException, ExecutionException { FlowFuture ff = new FlowFuture(flow); return ff.get(); } }