package com.scaleunlimited.cascading; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.Map; import java.util.concurrent.CancellationException; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import junit.framework.Assert; import org.junit.Test; import cascading.flow.Flow; import cascading.flow.FlowProcess; import cascading.operation.BaseOperation; import cascading.operation.Filter; import cascading.operation.FilterCall; import cascading.pipe.Each; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.pipe.assembly.SumBy; import cascading.tap.SinkMode; import cascading.tap.Tap; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntryCollector; import com.scaleunlimited.cascading.hadoop.HadoopPlatform; import com.scaleunlimited.cascading.hadoop.test.MiniClusterPlatform; import com.scaleunlimited.cascading.local.LocalPlatform; public class FlowRunnerTest extends Assert { private enum MyCounters { FILTER_REQUESTS, } @SuppressWarnings({ "serial", "rawtypes" }) private static class MyFilter extends BaseOperation implements Filter { private boolean _fails; private boolean _didDelay; public MyFilter(boolean fails) { _fails = fails; _didDelay = false; } @Override public boolean isRemove(FlowProcess process, FilterCall filterCall) { if (_fails) { throw new RuntimeException("We failed!"); } if (!_didDelay) { _didDelay = true; try { Thread.sleep(100L); } catch (InterruptedException e) { // ignore exception } } process.increment(MyCounters.FILTER_REQUESTS, 1); return false; } } @Test public void testAsyncOperation() throws Throwable { FlowRunner fr = new FlowRunner(); Assert.assertTrue(fr.isDone()); FlowFuture result0 = fr.addFlow(makeFlow("testAsyncOperation", 10, 0)); FlowFuture result1 = fr.addFlow(makeFlow("testAsyncOperation", 100, 1)); Assert.assertFalse(fr.isDone()); // Try the get() call on the future before it will have completed. Map<String, Long> counters0 = result0.get().getCounters(); Assert.assertEquals(10, (long)counters0.get(MyCounters.class.getName() + "." + MyCounters.FILTER_REQUESTS.name())); // Now wait for everything to complete. fr.complete(); Map<String, Long> counters1 = result1.get().getCounters(); Assert.assertEquals(100, (long)counters1.get(MyCounters.class.getName() + "." + MyCounters.FILTER_REQUESTS.name())); } @Test public void testShortWait() throws Exception { FlowRunner fr = new FlowRunner(); FlowFuture result = fr.addFlow(makeFlow("testShortWait", 100, 0)); try { // Wait for a very short amount of time. result.get(1, TimeUnit.NANOSECONDS); Assert.fail("No TimeoutException was thrown"); } catch (TimeoutException e) { // what we want } } @Test public void testFailureHandling() throws Exception { FlowRunner fr = new FlowRunner(); FlowFuture result = fr.addFlow(makeFlow("testFailureHandling", 100, 0, true)); try { result.get(); Assert.fail("No ExecutionException was thrown"); } catch (ExecutionException e) { // what we want } } @Test public void testCancelling() throws Exception { FlowRunner fr = new FlowRunner(); FlowFuture result = fr.addFlow(makeFlow("testCancelling", 100, 0, true)); // Have to interrupt running job to get it to be canceled Assert.assertFalse(result.isDone()); Assert.assertFalse(result.isCancelled()); Assert.assertFalse(result.cancel(false)); Assert.assertFalse(result.isDone()); Assert.assertFalse(result.isCancelled()); // Really cancel it. Assert.assertTrue(result.cancel(true)); Assert.assertTrue(result.isDone()); Assert.assertTrue(result.isCancelled()); try { result.get(); Assert.fail("No CancellationException was thrown"); } catch (CancellationException e) { // what we want } } @Test public void testIsFull() throws Throwable { System.setProperty("java.security.krb5.realm", ""); System.setProperty("java.security.krb5.kdc", ""); // TODO It would be better to test with a larger capacity, but it only // runs one flow at a time in local mode. // An empty runner shouldn't be full. FlowRunner fr = new FlowRunner(1); Assert.assertFalse(fr.isFull()); // There should be no room after we fill it up. FlowFuture result0 = fr.addFlow(makeFlow("testIsFull", 10, 0)); Assert.assertTrue(fr.isFull()); // There should be room after the first flow finishes. result0.get(); Assert.assertFalse(fr.isFull()); // There should be no room after we fill the empty slot. fr.addFlow(makeFlow("testIsFull", 10, 1)); Assert.assertTrue(fr.isFull()); // There should be room after everything completes. fr.complete(); Assert.assertFalse(fr.isFull()); } @Test public void testStatsLocal() throws Exception { final String logDirName = "build/test/FlowRunnerTest/testStatsLocal/log"; BasePlatform platform = new LocalPlatform(FlowRunnerTest.class); platform.setLogDir(new File(logDirName)); FlowRunner fr = new FlowRunner("testStatsLocal", 1, new File(logDirName), 10); FlowFuture ff = fr.addFlow(makeFlow("testStatsLocal", 10, 0, false, platform)); ff.get(); fr.terminate(); // We should some number of entries in the stats file. The sink name is now // part of the step name, so it's "group on total local". checkStatsFile(logDirName, "testStatsLocal", "group on total local", 1, 1); // And also in the summary file checkSummaryFile(platform.getLogDir().getAbsolutePath(), "testStatsLocal", "group on total"); } @Test public void testStatsHadoop() throws Exception { System.setProperty("java.security.krb5.realm", ""); System.setProperty("java.security.krb5.kdc", ""); final String logDirName = "build/test/FlowRunnerTest/testStatsHadoop/log"; BasePlatform platform = new HadoopPlatform(FlowRunnerTest.class); FlowRunner fr = new FlowRunner("testStatsHadoop", 1, new File(logDirName), 1000L); FlowFuture result = fr.addFlow(makeFlow("testStatsHadoop", 10, 0, false, platform)); result.get(); fr.terminate(); // We should some number of entries in the stats file // Unfortunately you get no stats for Hadoop when running in Hadoop local mode, as there // is no JobTracker // checkStatsFile(logDirName, "testStatsHadoop", "group on total", 0, 1); } @Test public void testTerminationHadoop() throws Exception { System.setProperty("java.security.krb5.realm", ""); System.setProperty("java.security.krb5.kdc", ""); BasePlatform platform = new HadoopPlatform(FlowRunnerTest.class); FlowRunner fr = new FlowRunner("testTerminationHadoop", 1, new File("build/test/FlowRunnerTest/testTerminationHadoop/log"), 100); fr.addFlow(makeFlow("testTerminationHadoop", 10, 0, false, platform)); fr.terminate(); } @Test public void testTerminationLocal() throws Exception { BasePlatform platform = new LocalPlatform(FlowRunnerTest.class); FlowRunner fr = new FlowRunner("testTerminationLocal", 1, new File("build/test/FlowRunnerTest/testTerminationLocal/log"), 10); fr.addFlow(makeFlow("testTerminationLocal", 10, 0, false, platform)); fr.terminate(); } @Test public void testStatsHadoopMiniCluster() throws Exception { final int numContainers = 2; MiniClusterPlatform platform = new MiniClusterPlatform(FlowRunnerTest.class, numContainers, "build/test/FlowRunnerTest/testStatsHadoopMiniCluster/log/"); platform.setJobPollingInterval(10); platform.setNumReduceTasks(numContainers); FlowRunner fr = new FlowRunner("testStatsHadoopMiniCluster", 1, platform.getLogDir(), 1000); FlowFuture result = fr.addFlow(makeFlow("testStatsHadoopMiniCluster", 10, 0, false, platform)); result.get(); fr.terminate(); // We should some number of entries in the stats file checkStatsFile(platform.getLogDir().getAbsolutePath(), "testStatsHadoopMiniCluster", "group on total (2/2) ...tsHadoopMiniCluster/out-0", 0, 2); // And check for something similar in the details file checkDetailsFile(platform.getLogDir().getAbsolutePath(), "testStatsHadoopMiniCluster", "group on total (2/2) ...tsHadoopMiniCluster/out-0", 0, 2); // And also in the summary file checkSummaryFile(platform.getLogDir().getAbsolutePath(), "testStatsHadoopMiniCluster", "group on total (2/2) ...tsHadoopMiniCluster/out-0"); platform.shutdown(); } private BufferedReader openStatsFile(String logDirName, String testName) throws FileNotFoundException { File statsFile = getStatsFile(logDirName, testName); assertTrue(statsFile.exists()); return new BufferedReader(new FileReader(statsFile)); } private File getStatsFile(String logDirName, String testName) { File statsDir = new File(logDirName); return new File(statsDir, testName + "-stats.tsv"); } private BufferedReader openDetailsFile(String logDirName, String testName) throws FileNotFoundException { File statsDir = new File(logDirName); File statsFile = new File(statsDir, testName + "-details.tsv"); assertTrue(statsFile.exists()); return new BufferedReader(new FileReader(statsFile)); } private BufferedReader openSummaryFile(String logDirName, String testName) throws FileNotFoundException { File statsDir = new File(logDirName); File statsFile = new File(statsDir, testName + "-summary.tsv"); assertTrue(statsFile.exists()); return new BufferedReader(new FileReader(statsFile)); } private void checkStatsFile(String logDirName, String testName, String stepName, int numMaps, int numReduces) throws IOException { String targetText = String.format("\t%d\t%d\t%s|%s=%d,%d;", numMaps, numReduces, testName, stepName, numMaps, numReduces); BufferedReader br = openStatsFile(logDirName, testName); String curLine; while ((curLine = br.readLine()) != null) { if (curLine.contains(targetText)) { return; } } fail("Couldn't find target line in stats file: " + getStatsFile(logDirName, testName)); } private void checkDetailsFile(String logDirName, String testName, String stepName, int numMaps, int numReduces) throws IOException { String targetText = String.format("\t%d\t%d\t%s|%s\t", numMaps, numReduces, testName, stepName); BufferedReader br = openDetailsFile(logDirName, testName); String curLine; while ((curLine = br.readLine()) != null) { if (curLine.contains(targetText)) { return; } } fail("Couldn't find target line in details file"); } private void checkSummaryFile(String logDirName, String testName, String stepName) throws IOException { String targetText = String.format("%s|%s", testName, stepName); BufferedReader br = openSummaryFile(logDirName, testName); String curLine; while ((curLine = br.readLine()) != null) { if (curLine.contains(targetText)) { return; } } String filename = logDirName + "/" + testName + "-summary.tsv"; fail(String.format("Couldn't find target line \"%s\" in summary file %s", targetText, filename)); } @SuppressWarnings("rawtypes") private Flow makeFlow(String testName, int numDatums, int id) throws Exception { return makeFlow(testName, numDatums, id, false); } @SuppressWarnings("rawtypes") private Flow makeFlow(String testName, int numDatums, int id, boolean fails) throws Exception { return makeFlow(testName, numDatums, id, fails, new HadoopPlatform(FlowRunnerTest.class)); } @SuppressWarnings({ "rawtypes", "unchecked" }) private Flow makeFlow(String testName, int numDatums, int id, boolean fails, BasePlatform platform) throws Exception { final Fields testFields = new Fields("user", "value"); BasePath testDir = platform.makePath("build/test/FlowRunnerTest/" + testName + "/"); BasePath in = platform.makePath(testDir, "in-" + id); Tap sourceTap = platform.makeTap(platform.makeBinaryScheme(testFields), in, SinkMode.REPLACE); TupleEntryCollector write = sourceTap.openForWrite(platform.makeFlowProcess()); for (int i = 0; i < numDatums; i++) { String username = "user-" + (i % 3); write.add(new Tuple(username, i)); } write.close(); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new MyFilter(fails)); pipe = new SumBy("sum values", pipe, new Fields("user"), new Fields("value"), new Fields("total"), Integer.class); pipe = new GroupBy("group on total", pipe, new Fields("total")); BasePath out = platform.makePath(testDir, "out-" + id); Tap sinkTap = platform.makeTap(platform.makeBinaryScheme(new Fields("user", "total")), out, SinkMode.REPLACE); Flow flow = platform.makeFlowConnector().connect(testName, sourceTap, sinkTap, pipe); FlowUtils.nameFlowSteps(flow); return flow; } }