/* * Copyright © 2015 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.etl.batch; import co.cask.cdap.api.common.Bytes; import co.cask.cdap.api.data.format.StructuredRecord; import co.cask.cdap.api.data.schema.Schema; import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet; import co.cask.cdap.api.dataset.table.Table; import co.cask.cdap.api.workflow.NodeStatus; import co.cask.cdap.etl.batch.mapreduce.ETLMapReduce; import co.cask.cdap.etl.common.Constants; import co.cask.cdap.etl.mock.batch.MockExternalSink; import co.cask.cdap.etl.mock.batch.MockExternalSource; import co.cask.cdap.etl.mock.batch.MockSink; import co.cask.cdap.etl.mock.batch.MockSource; import co.cask.cdap.etl.mock.batch.NodeStatesAction; import co.cask.cdap.etl.mock.transform.ErrorTransform; import co.cask.cdap.etl.mock.transform.StringValueFilterTransform; import co.cask.cdap.etl.proto.Engine; import co.cask.cdap.etl.proto.v2.ETLBatchConfig; import co.cask.cdap.etl.proto.v2.ETLStage; import co.cask.cdap.format.StructuredRecordStringConverter; import co.cask.cdap.proto.Id; import co.cask.cdap.proto.ProgramRunStatus; import co.cask.cdap.proto.RunRecord; import co.cask.cdap.proto.artifact.AppRequest; import co.cask.cdap.test.ApplicationManager; import co.cask.cdap.test.DataSetManager; import co.cask.cdap.test.MapReduceManager; import co.cask.cdap.test.WorkflowManager; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; import org.apache.avro.generic.GenericRecord; import org.junit.Assert; import org.junit.Test; import java.io.File; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; /** * Tests for ETLBatch. */ public class ETLWorkflowTestRun extends ETLBatchTestBase { @Test public void testInvalidTransformConfigFailsToDeploy() { ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *") .addStage(new ETLStage("source", MockSource.getPlugin("inputTable"))) .addStage(new ETLStage("sink", MockSink.getPlugin("outputTable"))) .addStage(new ETLStage("transform", StringValueFilterTransform.getPlugin(null, null))) .addConnection("source", "transform") .addConnection("transform", "sink") .build(); AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig); Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "badConfig"); try { deployApplication(appId, appRequest); Assert.fail(); } catch (Exception e) { // expected } } @Test public void testPostAction() throws Exception { ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *") .addStage(new ETLStage("source", MockSource.getPlugin("actionInput"))) .addStage(new ETLStage("sink", MockSink.getPlugin("actionOutput"))) .addPostAction(new ETLStage("tokenWriter", NodeStatesAction.getPlugin("tokenTable"))) .addConnection("source", "sink") .build(); AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig); Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "ActionApp"); ApplicationManager appManager = deployApplication(appId, appRequest); Schema schema = Schema.recordOf( "testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)) ); StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build(); StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build(); StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build(); DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, "actionInput"); MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane)); WorkflowManager workflowManager = appManager.getWorkflowManager(ETLWorkflow.NAME); workflowManager.start(); workflowManager.waitForFinish(5, TimeUnit.MINUTES); DataSetManager<Table> tokenTableManager = getDataset(Id.Namespace.DEFAULT, "tokenTable"); Table tokenTable = tokenTableManager.get(); NodeStatus status = NodeStatus.valueOf(Bytes.toString( tokenTable.get(Bytes.toBytes(ETLMapReduce.NAME), Bytes.toBytes("status")))); Assert.assertEquals(NodeStatus.COMPLETED, status); } @Test public void testDAG() throws Exception { /* * ----- error transform ------------ * | |---- sink1 * source --------- string value filter -------- * | |---- sink2 * ---------------------------------- */ ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *") .addStage(new ETLStage("source", MockSource.getPlugin("daginput"))) .addStage(new ETLStage("sink1", MockSink.getPlugin("dagoutput1"))) .addStage(new ETLStage("sink2", MockSink.getPlugin("dagoutput2"))) .addStage(new ETLStage("error", ErrorTransform.getPlugin(), "errors")) .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("name", "samuel"))) .addConnection("source", "error") .addConnection("source", "filter") .addConnection("source", "sink2") .addConnection("error", "sink1") .addConnection("filter", "sink1") .addConnection("filter", "sink2") .build(); AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig); Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "DagApp"); ApplicationManager appManager = deployApplication(appId, appRequest); Schema schema = Schema.recordOf( "testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)) ); StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build(); StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build(); StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build(); DataSetManager<Table> inputManager = getDataset(Id.Namespace.DEFAULT, "daginput"); MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane)); MapReduceManager mrManager = appManager.getMapReduceManager(ETLMapReduce.NAME); mrManager.start(); mrManager.waitForFinish(5, TimeUnit.MINUTES); // sink1 should only get non-samuel records DataSetManager<Table> sink1Manager = getDataset("dagoutput1"); Set<StructuredRecord> expected = ImmutableSet.of(recordBob, recordJane); Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sink1Manager)); Assert.assertEquals(expected, actual); // sink2 should get bob and jane from the filter, plus everything again from the source Map<String, Integer> expectedCounts = ImmutableMap.of("samuel", 1, "bob", 2, "jane", 2); DataSetManager<Table> sink2Manager = getDataset("dagoutput2"); Map<String, Integer> actualCounts = new HashMap<>(); actualCounts.put("samuel", 0); actualCounts.put("bob", 0); actualCounts.put("jane", 0); for (StructuredRecord record : MockSink.readOutput(sink2Manager)) { String name = record.get("name"); actualCounts.put(name, actualCounts.get(name) + 1); } Assert.assertEquals(expectedCounts, actualCounts); // error dataset should have all records Set<StructuredRecord> expectedErrors = ImmutableSet.of(recordSamuel, recordBob, recordJane); Set<StructuredRecord> actualErrors = new HashSet<>(); DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset("errors"); try (TimePartitionedFileSet fileSet = fileSetManager.get()) { List<GenericRecord> records = readOutput(fileSet, Constants.ERROR_SCHEMA); for (GenericRecord record : records) { StructuredRecord invalidRecord = StructuredRecordStringConverter.fromJsonString( record.get(Constants.ErrorDataset.INVALIDENTRY).toString(), schema); actualErrors.add(invalidRecord); } } Assert.assertEquals(expectedErrors, actualErrors); } @Test public void testExternalDatasetTrackingMR() throws Exception { testExternalDatasetTracking(Engine.MAPREDUCE, false); } @Test public void testExternalDatasetTrackingSpark() throws Exception { testExternalDatasetTracking(Engine.SPARK, false); } @Test public void testBackwardsCompatibleExternalDatasetTrackingMR() throws Exception { testExternalDatasetTracking(Engine.MAPREDUCE, true); } @Test public void testBackwardsCompatibleExternalDatasetTrackingSpark() throws Exception { testExternalDatasetTracking(Engine.SPARK, true); } private void testExternalDatasetTracking(Engine engine, boolean backwardsCompatible) throws Exception { String suffix = engine.name() + (backwardsCompatible ? "-bc" : ""); // Define input/output datasets String expectedExternalDatasetInput = "fileInput-" + suffix; String expectedExternalDatasetOutput = "fileOutput-" + suffix; // Define input/output directories File inputDir = TMP_FOLDER.newFolder("input-" + suffix); String inputFile = "input-file1.txt"; File outputDir = TMP_FOLDER.newFolder("output-" + suffix); File outputSubDir1 = new File(outputDir, "subdir1"); File outputSubDir2 = new File(outputDir, "subdir2"); if (!backwardsCompatible) { // Assert that there are no external datasets Assert.assertNull(getDataset(Id.Namespace.DEFAULT, expectedExternalDatasetInput).get()); Assert.assertNull(getDataset(Id.Namespace.DEFAULT, expectedExternalDatasetOutput).get()); } ETLBatchConfig.Builder builder = ETLBatchConfig.builder("* * * * *"); ETLBatchConfig etlConfig = builder .setEngine(engine) // TODO: test multiple inputs CDAP-5654 .addStage(new ETLStage("source", MockExternalSource.getPlugin(expectedExternalDatasetInput, inputDir.getAbsolutePath()))) .addStage(new ETLStage("sink1", MockExternalSink.getPlugin( backwardsCompatible ? null : expectedExternalDatasetOutput, "dir1", outputSubDir1.getAbsolutePath()))) .addStage(new ETLStage("sink2", MockExternalSink.getPlugin( backwardsCompatible ? null : expectedExternalDatasetOutput, "dir2", outputSubDir2.getAbsolutePath()))) .addConnection("source", "sink1") .addConnection("source", "sink2") .build(); AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig); Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "ExternalDatasetApp-" + suffix); ApplicationManager appManager = deployApplication(appId, appRequest); Schema schema = Schema.recordOf( "testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)) ); StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build(); StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build(); StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build(); ImmutableList<StructuredRecord> allInput = ImmutableList.of(recordSamuel, recordBob, recordJane); // Create input files MockExternalSource.writeInput(new File(inputDir, inputFile).getAbsolutePath(), allInput); WorkflowManager workflowManager = appManager.getWorkflowManager(ETLWorkflow.NAME); workflowManager.start(); workflowManager.waitForFinish(5, TimeUnit.MINUTES); List<RunRecord> history = workflowManager.getHistory(); // there should be only one completed run Assert.assertEquals(1, history.size()); Assert.assertEquals(ProgramRunStatus.COMPLETED, history.get(0).getStatus()); // Assert output Assert.assertEquals(allInput, MockExternalSink.readOutput(outputSubDir1.getAbsolutePath())); Assert.assertEquals(allInput, MockExternalSink.readOutput(outputSubDir2.getAbsolutePath())); if (!backwardsCompatible) { // Assert that external datasets got created Assert.assertNotNull(getDataset(Id.Namespace.DEFAULT, expectedExternalDatasetInput).get()); Assert.assertNotNull(getDataset(Id.Namespace.DEFAULT, expectedExternalDatasetOutput).get()); } } }