/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.datapipeline;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.api.workflow.NodeStatus;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.datapipeline.mock.NaiveBayesClassifier;
import co.cask.cdap.datapipeline.mock.NaiveBayesTrainer;
import co.cask.cdap.datapipeline.mock.SpamMessage;
import co.cask.cdap.etl.api.batch.SparkCompute;
import co.cask.cdap.etl.api.batch.SparkSink;
import co.cask.cdap.etl.mock.batch.MockSink;
import co.cask.cdap.etl.mock.batch.MockSource;
import co.cask.cdap.etl.mock.batch.NodeStatesAction;
import co.cask.cdap.etl.mock.batch.aggregator.FieldCountAggregator;
import co.cask.cdap.etl.mock.batch.aggregator.IdentityAggregator;
import co.cask.cdap.etl.mock.test.HydratorTestBase;
import co.cask.cdap.etl.mock.transform.IdentityTransform;
import co.cask.cdap.etl.mock.transform.StringValueFilterTransform;
import co.cask.cdap.etl.proto.Engine;
import co.cask.cdap.etl.proto.v2.ETLBatchConfig;
import co.cask.cdap.etl.proto.v2.ETLPlugin;
import co.cask.cdap.etl.proto.v2.ETLStage;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.artifact.AppRequest;
import co.cask.cdap.proto.artifact.ArtifactSummary;
import co.cask.cdap.proto.id.ArtifactId;
import co.cask.cdap.proto.id.NamespaceId;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.DataSetManager;
import co.cask.cdap.test.StreamManager;
import co.cask.cdap.test.TestConfiguration;
import co.cask.cdap.test.WorkflowManager;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import org.junit.After;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

/**
 * Tests for batch data pipelines deployed from the data pipeline application artifact.
 */
public class DataPipelineTest extends HydratorTestBase {
  protected static final ArtifactId APP_ARTIFACT_ID =
    new ArtifactId(NamespaceId.DEFAULT.getNamespace(), "app", "1.0.0");
  protected static final ArtifactSummary APP_ARTIFACT = new ArtifactSummary("app", "1.0.0");
  private static int startCount = 0;

  @ClassRule
  public static final TestConfiguration CONFIG = new TestConfiguration("explore.enabled", false);

  @BeforeClass
  public static void setupTest() throws Exception {
    if (startCount++ > 0) {
      return;
    }
    setupBatchArtifacts(APP_ARTIFACT_ID, DataPipelineApp.class);
    // add some test plugins
    addPluginArtifact(new ArtifactId(NamespaceId.DEFAULT.getNamespace(), "spark-plugins", "1.0.0"),
                      APP_ARTIFACT_ID, NaiveBayesTrainer.class, NaiveBayesClassifier.class);
  }

  @After
  public void cleanupTest() throws Exception {
    getMetricsManager().resetAll();
  }
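  /**
   * Tests a pipeline consisting of a single mock source connected directly to a single mock sink.
   */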
  @Test
  public void testSinglePhase() throws Exception {
    /*
     * source --> sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .addStage(new ETLStage("source", MockSource.getPlugin("singleInput")))
      .addStage(new ETLStage("sink", MockSink.getPlugin("singleOutput")))
      .addConnection("source", "sink")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "SinglePhaseApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    Schema schema = Schema.recordOf(
      "testRecord",
      Schema.Field.of("name", Schema.of(Schema.Type.STRING))
    );

    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();

    // write records to source
    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, "singleInput");
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob));

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    // check sink
    DataSetManager<Table> sinkManager = getDataset("singleOutput");
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    validateMetric(2, appId, "source.records.out");
    validateMetric(2, appId, "sink.records.in");
  }

  @Test
  public void testSimpleMultiSource() throws Exception {
    /*
     * source1 --|
     *            |--> sink
     * source2 --|
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .addStage(new ETLStage("source1", MockSource.getPlugin("simpleMSInput1")))
      .addStage(new ETLStage("source2", MockSource.getPlugin("simpleMSInput2")))
      .addStage(new ETLStage("sink", MockSink.getPlugin("simpleMSOutput")))
      .addConnection("source1", "sink")
      .addConnection("source2", "sink")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "SimpleMultiSourceApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    Schema schema = Schema.recordOf(
      "testRecord",
      Schema.Field.of("name", Schema.of(Schema.Type.STRING))
    );

    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();

    // write one record to each source
    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, "simpleMSInput1");
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel));
    inputManager = getDataset(Id.Namespace.DEFAULT, "simpleMSInput2");
    MockSource.writeInput(inputManager, ImmutableList.of(recordBob));

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    // check sink
    DataSetManager<Table> sinkManager = getDataset("simpleMSOutput");
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    validateMetric(1, appId, "source1.records.out");
    validateMetric(1, appId, "source2.records.out");
    validateMetric(2, appId, "sink.records.in");
  }

  @Test
  public void testMultiSource() throws Exception {
    /*
     * source1 --|                  |--> sink1
     *           |--> transform1 --|
     * source2 --|                 |
     *                             |--> transform2 --> sink2
     *                                       ^
     *                                       |
     * source3 -----------------------------|
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .addStage(new ETLStage("source1", MockSource.getPlugin("msInput1")))
      .addStage(new ETLStage("source2", MockSource.getPlugin("msInput2")))
      .addStage(new ETLStage("source3", MockSource.getPlugin("msInput3")))
      .addStage(new ETLStage("transform1", IdentityTransform.getPlugin()))
      .addStage(new ETLStage("transform2", IdentityTransform.getPlugin()))
      .addStage(new ETLStage("sink1", MockSink.getPlugin("msOutput1")))
      .addStage(new ETLStage("sink2", MockSink.getPlugin("msOutput2")))
      .addConnection("source1", "transform1")
      .addConnection("source2", "transform1")
      .addConnection("transform1", "sink1")
      .addConnection("transform1", "transform2")
      .addConnection("transform2", "sink2")
      .addConnection("source3", "transform2")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "MultiSourceApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    Schema schema = Schema.recordOf(
      "testRecord",
      Schema.Field.of("name", Schema.of(Schema.Type.STRING))
    );

    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();

    // write one record to each source
    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, "msInput1");
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel));
    inputManager = getDataset(Id.Namespace.DEFAULT, "msInput2");
    MockSource.writeInput(inputManager, ImmutableList.of(recordBob));
    inputManager = getDataset(Id.Namespace.DEFAULT, "msInput3");
    MockSource.writeInput(inputManager, ImmutableList.of(recordJane));

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    // sink1 should get records from source1 and source2
    DataSetManager<Table> sinkManager = getDataset("msOutput1");
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    // sink2 should get all records
    sinkManager = getDataset("msOutput2");
    expected = ImmutableSet.of(recordSamuel, recordBob, recordJane);
    actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    validateMetric(1, appId, "source1.records.out");
    validateMetric(1, appId, "source2.records.out");
    validateMetric(1, appId, "source3.records.out");
    validateMetric(2, appId, "transform1.records.in");
    validateMetric(2, appId, "transform1.records.out");
    validateMetric(3, appId, "transform2.records.in");
    validateMetric(3, appId, "transform2.records.out");
    validateMetric(2, appId, "sink1.records.in");
    validateMetric(3, appId, "sink2.records.in");
  }

  @Test
  public void testMapRedSequentialAggregators() throws Exception {
    testSequentialAggregators(Engine.MAPREDUCE);
  }

  @Test
  public void testSparkSequentialAggregators() throws Exception {
    testSequentialAggregators(Engine.SPARK);
  }

  @Test
  public void testMapRedParallelAggregators() throws Exception {
    testParallelAggregators(Engine.MAPREDUCE);
  }
  @Test
  public void testSparkParallelAggregators() throws Exception {
    testParallelAggregators(Engine.SPARK);
  }

  /**
   * Runs a linear pipeline with two aggregators in sequence between two filters on the given engine.
   */
  private void testSequentialAggregators(Engine engine) throws Exception {
    String sourceName = "linearAggInput-" + engine.name();
    String sinkName = "linearAggOutput-" + engine.name();
    /*
     * source --> filter1 --> aggregator1 --> aggregator2 --> filter2 --> sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .setEngine(engine)
      .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
      .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
      .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "bob")))
      .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "jane")))
      .addStage(new ETLStage("aggregator1", IdentityAggregator.getPlugin()))
      .addStage(new ETLStage("aggregator2", IdentityAggregator.getPlugin()))
      .addConnection("source", "filter1")
      .addConnection("filter1", "aggregator1")
      .addConnection("aggregator1", "aggregator2")
      .addConnection("aggregator2", "filter2")
      .addConnection("filter2", "sink")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "LinearAggApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    Schema schema = Schema.recordOf(
      "testRecord",
      Schema.Field.of("name", Schema.of(Schema.Type.STRING))
    );

    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();

    // write three records to the source
    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, sourceName);
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    // check output
    DataSetManager<Table> sinkManager = getDataset(sinkName);
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    validateMetric(3, appId, "source.records.out");
    validateMetric(3, appId, "filter1.records.in");
    validateMetric(2, appId, "filter1.records.out");
    validateMetric(2, appId, "aggregator1.records.in");
    validateMetric(2, appId, "aggregator1.records.out");
    validateMetric(2, appId, "aggregator2.records.in");
    validateMetric(2, appId, "aggregator2.records.out");
    validateMetric(2, appId, "filter2.records.in");
    validateMetric(1, appId, "filter2.records.out");
    validateMetric(1, appId, "sink.records.out");
  }

  /**
   * Runs a pipeline where one source fans out to two aggregators, each writing to its own sink,
   * on the given engine.
   */
  private void testParallelAggregators(Engine engine) throws Exception {
    String sourceName = "pAggInput-" + engine.name();
    String sink1Name = "pAggOutput1-" + engine.name();
    String sink2Name = "pAggOutput2-" + engine.name();
    /*
     *          |--> agg1 --> sink1
     * source --|
     *          |--> agg2 --> sink2
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .setEngine(engine)
      .addStage(new ETLStage("source", MockSource.getPlugin(sourceName)))
      .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
      .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
      .addStage(new ETLStage("agg1", FieldCountAggregator.getPlugin("user", "string")))
      .addStage(new ETLStage("agg2", FieldCountAggregator.getPlugin("item", "long")))
      .addConnection("source", "agg1")
      .addConnection("source", "agg2")
      .addConnection("agg1", "sink1")
      .addConnection("agg2", "sink2")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "ParallelAggApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    Schema inputSchema = Schema.recordOf(
      "testRecord",
      Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
      Schema.Field.of("item", Schema.of(Schema.Type.LONG))
    );

    // write records to the source
    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, sourceName);
    MockSource.writeInput(inputManager, ImmutableList.of(
      StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(),
      StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build(),
      StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(),
      StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(),
      StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build()
    ));

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    Schema outputSchema1 = Schema.recordOf(
      "user.count",
      Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
      Schema.Field.of("ct", Schema.of(Schema.Type.LONG))
    );
    Schema outputSchema2 = Schema.recordOf(
      "item.count",
      Schema.Field.of("item", Schema.of(Schema.Type.LONG)),
      Schema.Field.of("ct", Schema.of(Schema.Type.LONG))
    );

    // check output
    DataSetManager<Table> sinkManager = getDataset(sink1Name);
    Set<StructuredRecord> expected = ImmutableSet.of(
      StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
      StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
      StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    sinkManager = getDataset(sink2Name);
    expected = ImmutableSet.of(
      StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
      StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
      StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
      StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
      StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
    actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    validateMetric(5, appId, "source.records.out");
    validateMetric(5, appId, "agg1.records.in");
    // 2 users, but FieldCountAggregator always emits an 'all' group
    validateMetric(3, appId, "agg1.aggregator.groups");
    validateMetric(3, appId, "agg1.records.out");
    validateMetric(5, appId, "agg2.records.in");
    // 4 items, but FieldCountAggregator always emits an 'all' group
    validateMetric(5, appId, "agg2.aggregator.groups");
    validateMetric(5, appId, "agg2.records.out");
    validateMetric(3, appId, "sink1.records.in");
    validateMetric(5, appId, "sink2.records.in");
  }

  @Test
  public void testSparkSinkAndCompute() throws Exception {
    // use the SparkSink to train a model
    testSinglePhaseWithSparkSink();
    // use a SparkCompute to classify all records going through the pipeline,
    // using the model built with the SparkSink
    testSinglePhaseWithSparkCompute();
  }
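  /**
   * Tests that a post-action runs after the pipeline completes and records the status of the workflow node.
   */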
  @Test
  public void testPostAction() throws Exception {
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .addStage(new ETLStage("source", MockSource.getPlugin("actionInput")))
      .addStage(new ETLStage("sink", MockSink.getPlugin("actionOutput")))
      .addPostAction(new ETLStage("tokenWriter", NodeStatesAction.getPlugin("tokenTable")))
      .addConnection("source", "sink")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "ActionApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    Schema schema = Schema.recordOf(
      "testRecord",
      Schema.Field.of("name", Schema.of(Schema.Type.STRING))
    );

    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();

    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, "actionInput");
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    DataSetManager<Table> tokenTableManager = getDataset(Id.Namespace.DEFAULT, "tokenTable");
    Table tokenTable = tokenTableManager.get();
    NodeStatus status = NodeStatus.valueOf(Bytes.toString(
      tokenTable.get(Bytes.toBytes("phase-1"), Bytes.toBytes("status"))));
    Assert.assertEquals(NodeStatus.COMPLETED, status);
  }

  private void testSinglePhaseWithSparkSink() throws Exception {
    /*
     * source --> sparksink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .addStage(new ETLStage("source", MockSource.getPlugin("messages")))
      .addStage(new ETLStage("customsink",
                             new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE,
                                           ImmutableMap.of("fileSetName", "modelFileSet",
                                                           "path", "output",
                                                           "fieldToClassify", SpamMessage.TEXT_FIELD,
                                                           "predictionField", SpamMessage.SPAM_PREDICTION_FIELD),
                                           null)))
      .addConnection("source", "customsink")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "SparkSinkApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    // set up five spam messages and five non-spam messages to be used for training the model
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());
    // write records to the source
    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, "messages");
    MockSource.writeInput(inputManager, messagesToWrite);

    // ingest some messages to be classified
    StreamManager textsToClassify = getStreamManager(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
    textsToClassify.send("how are you doing today");
    textsToClassify.send("free money money");
    textsToClassify.send("what are you doing today");
    textsToClassify.send("genuine report");

    // manually trigger the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
    // only 'free money money' should be predicted as spam
    Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);

    validateMetric(10, appId, "source.records.out");
    validateMetric(10, appId, "customsink.records.in");
  }

  private void testSinglePhaseWithSparkCompute() throws Exception {
    /*
     * source --> sparkcompute --> sink
     */
    String classifiedTextsTable = "classifiedTextTable";
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
      .addStage(new ETLStage("source", MockSource.getPlugin(NaiveBayesTrainer.TEXTS_TO_CLASSIFY)))
      .addStage(new ETLStage("sparkcompute",
                             new ETLPlugin(NaiveBayesClassifier.PLUGIN_NAME, SparkCompute.PLUGIN_TYPE,
                                           ImmutableMap.of("fileSetName", "modelFileSet",
                                                           "path", "output",
                                                           "fieldToClassify", SpamMessage.TEXT_FIELD,
                                                           "fieldToSet", SpamMessage.SPAM_PREDICTION_FIELD),
                                           null)))
      .addStage(new ETLStage("sink", MockSink.getPlugin(classifiedTextsTable)))
      .addConnection("source", "sparkcompute")
      .addConnection("sparkcompute", "sink")
      .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "SparkComputeApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    // write some messages to be classified
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("how are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("free money money").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("genuine report").toStructuredRecord());
    DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
    MockSource.writeInput(inputManager, messagesToWrite);

    // manually trigger the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(5, TimeUnit.MINUTES);

    DataSetManager<Table> classifiedTexts = getDataset(classifiedTextsTable);
    List<StructuredRecord> structuredRecords = MockSink.readOutput(classifiedTexts);

    Set<SpamMessage> results = new HashSet<>();
    for (StructuredRecord structuredRecord : structuredRecords) {
      results.add(SpamMessage.fromStructuredRecord(structuredRecord));
    }

    Set<SpamMessage> expected = new HashSet<>();
    expected.add(new SpamMessage("how are you doing today", 0.0));
    // only 'free money money' should be predicted as spam
    expected.add(new SpamMessage("free money money", 1.0));
    expected.add(new SpamMessage("what are you doing today", 0.0));
    expected.add(new SpamMessage("genuine report", 0.0));

    Assert.assertEquals(expected, results);

    validateMetric(4, appId, "source.records.out");
    validateMetric(4, appId, "sparkcompute.records.in");
    validateMetric(4, appId, "sink.records.in");
  }

  private void validateMetric(long expected, Id.Application appId,
                              String metric) throws TimeoutException, InterruptedException {
    Map<String, String> tags = ImmutableMap.of(Constants.Metrics.Tag.NAMESPACE, appId.getNamespaceId(),
                                               Constants.Metrics.Tag.APP, appId.getId(),
                                               Constants.Metrics.Tag.WORKFLOW, SmartWorkflow.NAME);
    getMetricsManager().waitForTotalMetricCount(tags, "user." + metric, expected, 20, TimeUnit.SECONDS);
    // waitForTotalMetricCount() won't throw an exception if the metric count is greater than expected,
    // so assert that the count is exactly what we expect
    Assert.assertEquals(expected, getMetricsManager().getTotalMetric(tags, "user." + metric));
  }
}