/* * Copyright © 2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package org.example.plugin; import co.cask.cdap.api.common.Bytes; import co.cask.cdap.api.data.format.StructuredRecord; import co.cask.cdap.api.data.schema.Schema; import co.cask.cdap.api.dataset.lib.FileSet; import co.cask.cdap.api.dataset.lib.KeyValueTable; import co.cask.cdap.api.dataset.table.Table; import co.cask.cdap.datapipeline.DataPipelineApp; import co.cask.cdap.datapipeline.SmartWorkflow; import co.cask.cdap.etl.api.Transform; import co.cask.cdap.etl.api.batch.BatchAggregator; import co.cask.cdap.etl.api.batch.BatchSink; import co.cask.cdap.etl.api.batch.BatchSource; import co.cask.cdap.etl.api.batch.SparkCompute; import co.cask.cdap.etl.api.batch.SparkSink; import co.cask.cdap.etl.mock.batch.MockSink; import co.cask.cdap.etl.mock.batch.MockSource; import co.cask.cdap.etl.mock.test.HydratorTestBase; import co.cask.cdap.etl.proto.v2.ETLBatchConfig; import co.cask.cdap.etl.proto.v2.ETLPlugin; import co.cask.cdap.etl.proto.v2.ETLStage; import co.cask.cdap.proto.Id; import co.cask.cdap.proto.artifact.AppRequest; import co.cask.cdap.proto.artifact.ArtifactSummary; import co.cask.cdap.proto.id.ArtifactId; import co.cask.cdap.proto.id.NamespaceId; import co.cask.cdap.test.ApplicationManager; import co.cask.cdap.test.DataSetManager; import co.cask.cdap.test.TestConfiguration; import co.cask.cdap.test.WorkflowManager; import org.apache.twill.filesystem.Location; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.OutputStream; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; /** * Unit tests for our plugins. */ public class PipelineTest extends HydratorTestBase { private static final ArtifactSummary APP_ARTIFACT = new ArtifactSummary("data-pipeline", "1.0.0"); @ClassRule public static final TestConfiguration CONFIG = new TestConfiguration("explore.enabled", false); @BeforeClass public static void setupTestClass() throws Exception { ArtifactId parentArtifact = NamespaceId.DEFAULT.artifact(APP_ARTIFACT.getName(), APP_ARTIFACT.getVersion()); // add the data-pipeline artifact and mock plugins setupBatchArtifacts(parentArtifact, DataPipelineApp.class); // add our plugins artifact with the data-pipeline artifact as its parent. // this will make our plugins available to data-pipeline. addPluginArtifact(NamespaceId.DEFAULT.artifact("example-plugins", "1.0.0"), parentArtifact, TextFileSetSource.class, TextFileSetSink.class, WordCountAggregator.class, WordCountCompute.class, WordCountSink.class); } @Test public void testTextFileSource() throws Exception { // create the pipeline config String inputName = "sourceTestInput"; String outputName = "sourceTestOutput"; Map<String, String> sourceProperties = new HashMap<>(); sourceProperties.put(TextFileSetSource.Conf.FILESET_NAME, inputName); sourceProperties.put(TextFileSetSource.Conf.CREATE_IF_NOT_EXISTS, "true"); sourceProperties.put(TextFileSetSource.Conf.DELETE_INPUT_ON_SUCCESS, "true"); ETLStage source = new ETLStage("source", new ETLPlugin(TextFileSetSource.NAME, BatchSource.PLUGIN_TYPE, sourceProperties, null)); ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName)); ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *") .addStage(source) .addStage(sink) .addConnection(source.getName(), sink.getName()) .build(); // create the pipeline Id.Application pipelineId = Id.Application.from(Id.Namespace.DEFAULT, "textSourceTestPipeline"); ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig)); // write some data to the input fileset DataSetManager<FileSet> inputManager = getDataset(inputName); Location inputFile = inputManager.get().getBaseLocation().append("somedir"); String line1 = "Hello World!"; String line2 = "Good to meet you"; String line3 = "My name is Hal"; String inputText = line1 + "\n" + line2 + "\n" + line3; try (OutputStream outputStream = inputFile.getOutputStream()) { outputStream.write(inputText.getBytes(Charset.forName("UTF-8"))); } // run the pipeline Map<String, String> runtimeArgs = new HashMap<>(); runtimeArgs.put(String.format("dataset.%s.input.paths", inputName), inputFile.getName()); WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME); workflowManager.start(runtimeArgs); workflowManager.waitForFinish(4, TimeUnit.MINUTES); // check the pipeline output DataSetManager<Table> outputManager = getDataset(outputName); Set<StructuredRecord> outputRecords = new HashSet<>(); outputRecords.addAll(MockSink.readOutput(outputManager)); Set<StructuredRecord> expected = new HashSet<>(); expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA) .set("position", (long) inputText.indexOf(line1)) .set("text", line1) .build()); expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA) .set("position", (long) inputText.indexOf(line2)) .set("text", line2) .build()); expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA) .set("position", (long) inputText.indexOf(line3)) .set("text", line3) .build()); Assert.assertEquals(expected, outputRecords); // check that the input was deleted by the source Assert.assertFalse(inputFile.exists()); } @Test public void testTextFileSink() throws Exception { // create the pipeline config String inputName = "sinkTestInput"; String outputName = "sinkTestOutput"; ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName)); Map<String, String> sinkProperties = new HashMap<>(); sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName); sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|"); ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null)); ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *") .addStage(source) .addStage(sink) .addConnection(source.getName(), sink.getName()) .build(); // create the pipeline Id.Application pipelineId = Id.Application.from(Id.Namespace.DEFAULT, "textSinkTestPipeline"); ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig)); // write some data to the input fileset Schema inputSchema = Schema.recordOf("test", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.STRING))); Map<String, String> users = new HashMap<>(); users.put("samuel", "wallet"); users.put("dwayne", "rock"); users.put("christopher", "cowbell"); List<StructuredRecord> inputRecords = new ArrayList<>(); for (Map.Entry<String, String> userEntry : users.entrySet()) { String name = userEntry.getKey(); String item = userEntry.getValue(); inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build()); } DataSetManager<Table> inputManager = getDataset(inputName); MockSource.writeInput(inputManager, inputRecords); // run the pipeline Map<String, String> runtimeArgs = new HashMap<>(); String outputPath = "users"; runtimeArgs.put(String.format("dataset.%s.output.path", outputName), outputPath); WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME); workflowManager.start(runtimeArgs); workflowManager.waitForFinish(4, TimeUnit.MINUTES); // check the pipeline output DataSetManager<FileSet> outputManager = getDataset(outputName); FileSet output = outputManager.get(); Location outputDir = output.getBaseLocation().append(outputPath); Map<String, String> actual = new HashMap<>(); for (Location outputFile : outputDir.list()) { // mapreduce writes multiple files to the output directory. There is a _SUCCESS file, and various files // that end with .crc that do not have any content. The actual output should not be read from those files. if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) { continue; } try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) { String line; while ((line = reader.readLine()) != null) { String[] parts = line.split("\\|"); actual.put(parts[0], parts[1]); } } } Assert.assertEquals(actual, users); } @SuppressWarnings("ConstantConditions") @Test public void testWordCountSparkSink() throws Exception { String inputName = "sparkSinkInput"; String outputName = "sparkSinkOutput"; // create the pipeline config ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName)); Map<String, String> sinkProperties = new HashMap<>(); sinkProperties.put("field", "text"); sinkProperties.put("tableName", outputName); ETLStage sink = new ETLStage("sink", new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null)); ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *") .addStage(source) .addStage(sink) .addConnection(source.getName(), sink.getName()) .build(); // create the pipeline Id.Application pipelineId = Id.Application.from(Id.Namespace.DEFAULT, "sparkSinkTestPipeline"); ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig)); // write the input Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING))); DataSetManager<Table> inputManager = getDataset(inputName); List<StructuredRecord> inputRecords = new ArrayList<>(); inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build()); inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build()); inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build()); MockSource.writeInput(inputManager, inputRecords); WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME); workflowManager.start(); workflowManager.waitForFinish(4, TimeUnit.MINUTES); DataSetManager<KeyValueTable> outputManager = getDataset(outputName); KeyValueTable output = outputManager.get(); Assert.assertEquals(3L, Bytes.toLong(output.read("Hello"))); Assert.assertEquals(1L, Bytes.toLong(output.read("World"))); Assert.assertEquals(2L, Bytes.toLong(output.read("my"))); Assert.assertEquals(2L, Bytes.toLong(output.read("name"))); Assert.assertEquals(2L, Bytes.toLong(output.read("is"))); Assert.assertEquals(1L, Bytes.toLong(output.read("Hal"))); Assert.assertEquals(1L, Bytes.toLong(output.read("Sam"))); } @Test public void testStringCaseTransform() throws Exception { String inputName = "transformTestInput"; String outputName = "transformTestOutput"; // create the pipeline config ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName)); ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName)); Map<String, String> transformProperties = new HashMap<>(); transformProperties.put("lowerFields", "first"); transformProperties.put("upperFields", "last"); ETLStage transform = new ETLStage("transform", new ETLPlugin(StringCaseTransform.NAME, Transform.PLUGIN_TYPE, transformProperties, null)); ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *") .addStage(source) .addStage(sink) .addStage(transform) .addConnection(source.getName(), transform.getName()) .addConnection(transform.getName(), sink.getName()) .build(); // create the pipeline Id.Application pipelineId = Id.Application.from(Id.Namespace.DEFAULT, "transformTestPipeline"); ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig)); // write the input Schema schema = Schema.recordOf( "name", Schema.Field.of("first", Schema.of(Schema.Type.STRING)), Schema.Field.of("last", Schema.of(Schema.Type.STRING)) ); DataSetManager<Table> inputManager = getDataset(inputName); List<StructuredRecord> inputRecords = new ArrayList<>(); inputRecords.add(StructuredRecord.builder(schema).set("first", "Samuel").set("last", "Jackson").build()); MockSource.writeInput(inputManager, inputRecords); WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME); workflowManager.start(); workflowManager.waitForFinish(4, TimeUnit.MINUTES); DataSetManager<Table> outputManager = getDataset(outputName); List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager); List<StructuredRecord> expected = new ArrayList<>(); expected.add(StructuredRecord.builder(schema).set("first", "samuel").set("last", "JACKSON").build()); Assert.assertEquals(expected, outputRecords); } @Test public void testWordCountAggregator() throws Exception { testWordCount(BatchAggregator.PLUGIN_TYPE); } @Test public void testWordCountSparkCompute() throws Exception { testWordCount(SparkCompute.PLUGIN_TYPE); } public void testWordCount(String pluginType) throws Exception { String inputName = "wcInput-" + pluginType; String outputName = "wcOutput-" + pluginType; // create the pipeline config ETLStage source = new ETLStage("wcInput", MockSource.getPlugin(inputName)); ETLStage sink = new ETLStage("wcOutput", MockSink.getPlugin(outputName)); Map<String, String> aggProperties = new HashMap<>(); aggProperties.put("field", "text"); ETLStage agg = new ETLStage("middle", new ETLPlugin("WordCount", pluginType, aggProperties, null)); ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *") .addStage(source) .addStage(sink) .addStage(agg) .addConnection(source.getName(), agg.getName()) .addConnection(agg.getName(), sink.getName()) .build(); // create the pipeline Id.Application pipelineId = Id.Application.from(Id.Namespace.DEFAULT, "wcTestPipeline-" + pluginType); ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig)); // write the input Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING))); DataSetManager<Table> inputManager = getDataset(inputName); List<StructuredRecord> inputRecords = new ArrayList<>(); inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build()); inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build()); inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build()); MockSource.writeInput(inputManager, inputRecords); WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME); workflowManager.start(); workflowManager.waitForFinish(4, TimeUnit.MINUTES); DataSetManager<Table> outputManager = getDataset(outputName); Set<StructuredRecord> outputRecords = new HashSet<>(); outputRecords.addAll(MockSink.readOutput(outputManager)); Set<StructuredRecord> expected = new HashSet<>(); expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA) .set("word", "Hello") .set("count", 3L).build()); expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA) .set("word", "World") .set("count", 1L).build()); expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA) .set("word", "my") .set("count", 2L).build()); expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA) .set("word", "name") .set("count", 2L).build()); expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA) .set("word", "is") .set("count", 2L).build()); expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA) .set("word", "Hal") .set("count", 1L).build()); expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA) .set("word", "Sam") .set("count", 1L).build()); Assert.assertEquals(expected, outputRecords); } }