/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.test.app;

import co.cask.cdap.api.annotation.UseDataSet;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.CloseableIterator;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.workflow.AbstractWorkflow;
import co.cask.cdap.api.workflow.AbstractWorkflowAction;
import co.cask.cdap.api.workflow.WorkflowContext;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.twill.filesystem.Location;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.concurrent.TimeUnit;

/**
 * Workflow application containing local datasets.
 */
public class WorkflowAppWithLocalDatasets extends AbstractApplication {
  public static final String WORDCOUNT_DATASET = "wordcount";
  public static final String RESULT_DATASET = "result";
  public static final String CSV_FILESET_DATASET = "csvfileset";
  public static final String WORKFLOW_NAME = "WorkflowWithLocalDatasets";
  public static final String WORKFLOW_RUNS_DATASET = "workflowruns";

  @Override
  public void configure() {
    setName("WorkflowAppWithLocalDatasets");
    setDescription("App to test the local dataset functionality for the Workflow.");
    addSpark(new SparkCSVToSpaceProgram());
    addMapReduce(new WordCount());
    addWorkflow(new WorkflowWithLocalDatasets());
    createDataset(RESULT_DATASET, KeyValueTable.class);
    createDataset(WORKFLOW_RUNS_DATASET, KeyValueTable.class);
  }

  /**
   * Workflow that configures the local datasets.
   */
  public static class WorkflowWithLocalDatasets extends AbstractWorkflow {

    @Override
    public void initialize(WorkflowContext context) throws Exception {
      super.initialize(context);
      KeyValueTable workflowRuns = context.getDataset(WORKFLOW_RUNS_DATASET);
      workflowRuns.write(context.getRunId().getId(), "STARTED");
    }

    @Override
    public void destroy() {
      KeyValueTable workflowRuns = getContext().getDataset(WORKFLOW_RUNS_DATASET);
      String status = Bytes.toString(workflowRuns.read(getContext().getRunId().getId()));
      if (!"STARTED".equals(status)) {
        return;
      }
      if (getContext().getRuntimeArguments().containsKey("destroy.throw.exception")) {
        // Throwing an exception from destroy() should not affect the Workflow run status.
        throw new RuntimeException("destroy");
      }
      workflowRuns.write(getContext().getRunId().getId(), "COMPLETED");
    }

    @Override
    protected void configure() {
      setName(WORKFLOW_NAME);
      setDescription("Workflow program with local datasets.");
      createLocalDataset(WORDCOUNT_DATASET, KeyValueTable.class);
      createLocalDataset(CSV_FILESET_DATASET, FileSet.class, FileSetProperties.builder()
        .setInputFormat(TextInputFormat.class)
        .setOutputFormat(TextOutputFormat.class)
        .build());

      addAction(new LocalDatasetWriter());
      addSpark("JavaSparkCSVToSpaceConverter");
      addMapReduce("WordCount");
      addAction(new LocalDatasetReader("readerAction"));
    }
  }

  /**
   * Custom action writing to the local file set dataset.
   */
  public static class LocalDatasetWriter extends AbstractWorkflowAction {
    private static final Logger LOG = LoggerFactory.getLogger(LocalDatasetWriter.class);

    private Metrics metrics;

    @Override
    public void run() {
      String inputPath = getContext().getRuntimeArguments().get("input.path");
      FileSet fileSetDataset = getContext().getDataset(CSV_FILESET_DATASET);
      Location inputLocation = fileSetDataset.getLocation(inputPath);
      try {
        try (PrintWriter writer = new PrintWriter(inputLocation.getOutputStream())) {
          writer.write("this,text,has");
          writer.println();
          writer.write("two,words,text,inside");
          metrics.gauge("num.lines", 2);
        }
      } catch (Throwable t) {
        LOG.error("Exception occurred while running custom action", t);
      }
    }
  }

  /**
   * MapReduce program that simply counts the number of occurrences of the words in the input files.
   */
  public static class WordCount extends AbstractMapReduce {
    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
      // The MapReduce reads its input from the path supplied through the 'output.path'
      // runtime argument, i.e. the location written by the preceding Spark program.
      String inputPath = context.getRuntimeArguments().get("output.path");
      Map<String, String> fileSetArgs = new HashMap<>();
      FileSetArguments.addInputPath(fileSetArgs, inputPath);
      context.addInput(Input.ofDataset(CSV_FILESET_DATASET, fileSetArgs));
      context.addOutput(Output.ofDataset(WORDCOUNT_DATASET));
      Job job = context.getHadoopJob();
      job.setMapperClass(TokenizerMapper.class);
      job.setReducerClass(IntSumReducer.class);
      job.setNumReduceTasks(1);
    }
  }

  /**
   * Mapper to tokenize the line into words.
   */
  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, ONE);
      }
    }
  }

  /**
   * Reducer to write the word counts to the local Workflow dataset.
   */
  public static class IntSumReducer extends Reducer<Text, IntWritable, byte[], byte[]> {
    private IntWritable result = new IntWritable();
    private Metrics metrics;

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      metrics.count("num.words", sum);
      context.write(Bytes.toBytes(key.toString()), Bytes.toBytes(String.valueOf(result.get())));
    }
  }

  /**
   * Custom action that reads the local dataset and writes to the non-local dataset.
   */
  public static class LocalDatasetReader extends AbstractWorkflowAction {
    private static final Logger LOG = LoggerFactory.getLogger(LocalDatasetReader.class);
    private Metrics metrics;
    private final String actionName;

    private LocalDatasetReader(String name) {
      this.actionName = name;
    }

    @Override
    protected void configure() {
      super.configure();
      setName(actionName);
    }

    @UseDataSet("wordcount")
    private KeyValueTable wordCount;

    @UseDataSet("result")
    private KeyValueTable result;

    @Override
    public void run() {
      LOG.info("Read the local dataset");
      try {
        File waitFile = new File(getContext().getRuntimeArguments().get("wait.file"));
        waitFile.createNewFile();

        int uniqueWordCount = 0;
        CloseableIterator<KeyValue<byte[], byte[]>> scanner = wordCount.scan(null, null);
        try {
          while (scanner.hasNext()) {
            scanner.next();
            uniqueWordCount++;
          }
        } finally {
          scanner.close();
        }
        result.write("UniqueWordCount", String.valueOf(uniqueWordCount));
        metrics.gauge("unique.words", uniqueWordCount);
        File doneFile = new File(getContext().getRuntimeArguments().get("done.file"));
        while (!doneFile.exists()) {
          TimeUnit.MILLISECONDS.sleep(50);
        }
      } catch (Exception e) {
        // no-op
      }
    }
  }
}