/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package co.cask.cdap.etl.batch;

import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.etl.mock.batch.MockSink;
import co.cask.cdap.etl.mock.batch.MockSource;
import co.cask.cdap.etl.mock.transform.StringValueFilterTransform;
import co.cask.cdap.etl.proto.Engine;
import co.cask.cdap.etl.proto.v2.ETLBatchConfig;
import co.cask.cdap.etl.proto.v2.ETLStage;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.artifact.AppRequest;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.DataSetManager;
import co.cask.cdap.test.WorkflowManager;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import org.junit.Assert;
import org.junit.Test;
import java.util.Set;
import java.util.concurrent.TimeUnit;

/**
 * Tests that an ETL batch pipeline with a branching DAG runs correctly on the Spark engine.
 */
public class ETLSparkTestRun extends ETLBatchTestBase {
@Test
public void testDAG() throws Exception {
/*
 *            ---------------- sink1
 *           |
 * source ---|
 *           |
 *            ---- filter ---- sink2
 */
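// build the pipeline config; the cron expression is required by the builder but is not
// used here, since the test starts the workflow manually. Engine.SPARK runs the batch
// pipeline on Spark instead of the default MapReduce engine.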
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
.setEngine(Engine.SPARK)
.addStage(new ETLStage("source", MockSource.getPlugin("sparkinput")))
.addStage(new ETLStage("sink1", MockSink.getPlugin("sparkoutput1")))
.addStage(new ETLStage("sink2", MockSink.getPlugin("sparkoutput2")))
.addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("name", "samuel")))
.addConnection("source", "sink1")
.addConnection("source", "filter")
.addConnection("filter", "sink2")
.build();
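// deploy the pipeline config as a CDAP application named "DagApp" in the default namespace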
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "DagApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
// write the input records for the mock source
Schema schema = Schema.recordOf(
"testRecord",
Schema.Field.of("name", Schema.of(Schema.Type.STRING))
);
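// three input records; the filter stage should drop only "samuel"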
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
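// the mock source reads its input from the backing "sparkinput" Table dataset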
DataSetManager<Table> sourceManager = getDataset(Id.Namespace.DEFAULT, "sparkinput");
MockSource.writeInput(sourceManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
// manually start the pipeline workflow and wait for the run to complete
WorkflowManager workflowManager = appManager.getWorkflowManager(ETLWorkflow.NAME);
workflowManager.start();
workflowManager.waitForFinish(4, TimeUnit.MINUTES);
// check the first sink, which received every record directly from the source, untransformed
DataSetManager<Table> sink1Manager = getDataset("sparkoutput1");
Set<StructuredRecord> actualOutput = Sets.newHashSet(MockSink.readOutput(sink1Manager));
Set<StructuredRecord> expectedOutput = ImmutableSet.of(recordSamuel, recordBob, recordJane);
Assert.assertEquals(expectedOutput, actualOutput);
// check the second sink, whose input passed through the filter, which removed one record
DataSetManager<Table> sink2Manager = getDataset("sparkoutput2");
actualOutput = Sets.newHashSet(MockSink.readOutput(sink2Manager));
expectedOutput = ImmutableSet.of(recordBob, recordJane);
Assert.assertEquals(expectedOutput, actualOutput);
}
}