/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.spark;

import co.cask.cdap.api.common.RuntimeArguments;
import co.cask.cdap.api.common.Scope;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionOutput;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;
import co.cask.cdap.proto.ProgramRunStatus;
import co.cask.cdap.spark.app.FileCountSparkProgram;
import co.cask.cdap.spark.app.ScalaFileCountSparkProgram;
import co.cask.cdap.spark.app.SparkAppUsingFileSet;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.DataSetManager;
import co.cask.cdap.test.SparkManager;
import co.cask.cdap.test.base.TestFrameworkTestBase;
import co.cask.tephra.TransactionFailureException;
import org.apache.twill.filesystem.Location;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
 * Unit tests for FileSet usage in Spark programs.
 */
public class SparkFileSetTestRun extends TestFrameworkTestBase {

  private ApplicationManager applicationManager;

  @Before
  public void init() {
    applicationManager = deployApplication(SparkAppUsingFileSet.class);
  }

  @Test
  public void testSparkWithFileSet() throws Exception {
    testSparkWithFileSet(applicationManager, FileCountSparkProgram.class.getSimpleName());
    testSparkWithFileSet(applicationManager, ScalaFileCountSparkProgram.class.getSimpleName());
  }

  @Test
  public void testSparkWithCustomFileSet() throws Exception {
    testSparkWithCustomFileSet(applicationManager, FileCountSparkProgram.class.getSimpleName());
    testSparkWithCustomFileSet(applicationManager, ScalaFileCountSparkProgram.class.getSimpleName());
  }

  @Test
  public void testSparkWithTimePartitionedFileSet() throws Exception {
    testSparkWithTimePartitionedFileSet(applicationManager, FileCountSparkProgram.class.getSimpleName());
    testSparkWithTimePartitionedFileSet(applicationManager, ScalaFileCountSparkProgram.class.getSimpleName());
  }

  @Test
  public void testSparkWithPartitionedFileSet() throws Exception {
    testSparkWithPartitionedFileSet(applicationManager, FileCountSparkProgram.class.getSimpleName());
    testSparkWithPartitionedFileSet(applicationManager, ScalaFileCountSparkProgram.class.getSimpleName());
  }

  private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    DataSetManager<FileSet> filesetManager = getDataset("fs");
    final FileSet fileset = filesetManager.get();
    Location location = fileset.getLocation("nn");
    prepareFileInput(location);

    Map<String, String> inputArgs = new HashMap<>();
    FileSetArguments.setInputPath(inputArgs, "nn");
    Map<String, String> outputArgs = new HashMap<>();
    FileSetArguments.setOutputPath(outputArgs, "xx");
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
    args.put("input", "fs");
    args.put("output", "fs");

    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForFinish(1, TimeUnit.MINUTES);
    Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.COMPLETED).size());

    validateFileOutput(fileset.getLocation("xx"), "custom:");

    // Cleanup paths after running the test
    fileset.getLocation("nn").delete(true);
    fileset.getLocation("xx").delete(true);
  }
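
  /*
   * A sketch of how the dataset-scoped arguments built above are resolved. The exact
   * property keys shown are assumptions for illustration, not verified constants:
   *
   *   Map<String, String> inputArgs = new HashMap<>();
   *   FileSetArguments.setInputPath(inputArgs, "nn");            // e.g. {"input.paths" -> "nn"}
   *   RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs); // e.g. {"dataset.fs.input.paths" -> "nn"}
   *
   * Scoping prefixes every key with the dataset name, so when the Spark program later
   * instantiates the "fs" dataset, the framework strips the matching prefix and hands only
   * the unscoped arguments to that dataset; arguments for one dataset cannot leak into another.
   */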
  private void testSparkWithCustomFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    final DataSetManager<SparkAppUsingFileSet.MyFileSet> myFileSetManager = getDataset("myfs");
    SparkAppUsingFileSet.MyFileSet myfileset = myFileSetManager.get();
    final FileSet fileset = myfileset.getEmbeddedFileSet();
    Location location = fileset.getLocation("nn");
    prepareFileInput(location);

    Map<String, String> inputArgs = new HashMap<>();
    FileSetArguments.setInputPath(inputArgs, "nn");
    Map<String, String> outputArgs = new HashMap<>();
    FileSetArguments.setOutputPath(outputArgs, "xx");
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", outputArgs));
    args.put("input", "myfs");
    args.put("output", "myfs");

    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForFinish(2, TimeUnit.MINUTES);
    Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.COMPLETED).size());
    validateFileOutput(fileset.getLocation("xx"));

    // Verify that onSuccess() was called and onFailure() was not.
    Assert.assertTrue(myfileset.getSuccessLocation().exists());
    Assert.assertFalse(myfileset.getFailureLocation().exists());
    myfileset.getSuccessLocation().delete();

    // Run the program again. It should fail due to the existing output.
    sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForFinish(2, TimeUnit.MINUTES);
    Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.FAILED).size());

    // Then we can verify that onFailure() was called.
    Assert.assertFalse(myfileset.getSuccessLocation().exists());
    Assert.assertTrue(myfileset.getFailureLocation().exists());

    // Cleanup the paths after running the Spark program
    fileset.getLocation("nn").delete(true);
    fileset.getLocation("xx").delete(true);
    myfileset.getSuccessLocation().delete(true);
    myfileset.getFailureLocation().delete(true);
  }

  private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager,
                                                   String sparkProgram) throws Exception {
    long customOutputPartitionKey = 123456789L;
    long customInputPartitionKey = 987654321L;

    DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
    long inputTime = System.currentTimeMillis();
    long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
    addTimePartition(tpfsManager, inputTime);
    addTimePartition(tpfsManager, customInputPartitionKey);

    Map<String, String> inputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
    Map<String, String> outputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputArgs));
    args.put("input", "tpfs");
    args.put("output", "tpfs");
    args.put("outputKey", String.valueOf(customOutputPartitionKey));
    args.put("inputKey", String.valueOf(customInputPartitionKey));

    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForFinish(10, TimeUnit.MINUTES);
    Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.COMPLETED).size());

    tpfsManager.flush();
    TimePartitionedFileSet tpfs = tpfsManager.get();

    PartitionDetail partition = tpfs.getPartitionByTime(outputTime);
    Assert.assertNotNull("Output partition is null when running without custom dataset arguments", partition);
    validateFileOutput(partition.getLocation());

    PartitionDetail customPartition = tpfs.getPartitionByTime(customOutputPartitionKey);
    Assert.assertNotNull("Output partition is null when running with custom dataset arguments", customPartition);
    validateFileOutput(customPartition.getLocation());

    // Cleanup after running the test
    tpfs.getPartitionOutput(inputTime).getLocation().delete(true);
    tpfs.getPartitionOutput(customInputPartitionKey).getLocation().delete(true);
    partition.getLocation().delete(true);
    customPartition.getLocation().delete(true);
    tpfs.dropPartition(inputTime);
    tpfs.dropPartition(customInputPartitionKey);
    tpfs.dropPartition(partition.getPartitionKey());
    tpfs.dropPartition(customPartition.getPartitionKey());
    tpfsManager.flush();
  }
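
  /*
   * Note on the partition filter used below: a PartitionFilter range condition is assumed
   * to match values with lower <= value < upper (string comparison), so
   * addRangeCondition("x", "na", "nx") selects the input partition x = "nn"
   * ("na" <= "nn" < "nx") while leaving the output partition x = "xx" outside the range.
   */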
  private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager,
                                               String sparkProgram) throws Exception {
    DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
    PartitionedFileSet pfs = pfsManager.get();
    PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    pfsManager.flush();

    Map<String, String> inputArgs = new HashMap<>();
    PartitionedFileSetArguments.setInputPartitionFilter(
      inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
    Map<String, String> outputArgs = new HashMap<>();
    final PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
    args.put("input", "pfs");
    args.put("output", "pfs");

    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForFinish(10, TimeUnit.MINUTES);
    Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.COMPLETED).size());

    pfsManager.flush();
    PartitionDetail partition = pfs.getPartition(outputKey);
    Assert.assertNotNull(partition);
    validateFileOutput(partition.getLocation());

    // Cleanup after the test completed
    location.delete(true);
    partition.getLocation().delete(true);
    pfs.dropPartition(partitionOutput.getPartitionKey());
    pfs.dropPartition(partition.getPartitionKey());
    pfsManager.flush();
  }

  private void prepareFileInput(Location location) throws IOException {
    try (OutputStreamWriter out = new OutputStreamWriter(location.getOutputStream())) {
      out.write("13 characters\n");
      out.write("7 chars\n");
    }
  }

  private void validateFileOutput(Location location) throws Exception {
    validateFileOutput(location, "");
  }

  private void validateFileOutput(Location location, String prefix) throws Exception {
    Assert.assertTrue(location.isDirectory());
    for (Location child : location.list()) {
      if (child.getName().startsWith("part-r-")) {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(child.getInputStream()))) {
          // The programs emit one "<line>:<length>" record per input line.
          String line = reader.readLine();
          Assert.assertNotNull(line);
          Assert.assertEquals(prefix + "13 characters:13", line);
          line = reader.readLine();
          Assert.assertNotNull(line);
          Assert.assertEquals(prefix + "7 chars:7", line);
          line = reader.readLine();
          Assert.assertNull(line);
          return;
        }
      }
    }
    Assert.fail("Output directory does not contain any part file: " + location.list());
  }

  private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime)
    throws IOException, TransactionFailureException, InterruptedException {
    TimePartitionedFileSet tpfs = tpfsManager.get();
    PartitionOutput partitionOutput = tpfs.getPartitionOutput(inputTime);
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    tpfsManager.flush();
  }
}