/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package org.example.plugin;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.twill.filesystem.Location;

import java.io.IOException;
import javax.annotation.Nullable;

/**
 * Batch Source that reads from a FileSet that has its data formatted as text.
 *
 * LongWritable is the first parameter because that is the key used by Hadoop's {@link TextInputFormat}.
 * Similarly, Text is the second parameter because that is the value used by Hadoop's {@link TextInputFormat}.
 * {@link StructuredRecord} is the third parameter because that is what the source will output.
 * All the plugins included with Hydrator operate on StructuredRecord.
 */
@Plugin(type = BatchSource.PLUGIN_TYPE)
@Name(TextFileSetSource.NAME)
@Description("Reads from a FileSet that has its data formatted as text.")
public class TextFileSetSource extends BatchSource<LongWritable, Text, StructuredRecord> {
  public static final String NAME = "TextFileSet";
  public static final Schema OUTPUT_SCHEMA = Schema.recordOf(
    "textRecord",
    Schema.Field.of("position", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("text", Schema.of(Schema.Type.STRING))
  );
  private final Conf config;

  /**
   * Config properties for the plugin.
   */
  public static class Conf extends PluginConfig {
    public static final String FILESET_NAME = "fileSetName";
    public static final String CREATE_IF_NOT_EXISTS = "createIfNotExists";
    public static final String DELETE_INPUT_ON_SUCCESS = "deleteInputOnSuccess";

    // The Name annotation tells CDAP what the property name is. It is optional, and defaults to the variable name.
    // Note: only primitives (including boxed types) and String are supported.
    @Name(FILESET_NAME)
    @Description("The name of the FileSet to read from.")
    private String fileSetName;

    // A nullable field tells CDAP that this is an optional field.
    @Nullable
    @Name(CREATE_IF_NOT_EXISTS)
    @Description("Whether to create the FileSet if it doesn't already exist. Defaults to false.")
    private Boolean createIfNotExists;

    @Nullable
    @Name(DELETE_INPUT_ON_SUCCESS)
    @Description("Whether to delete the data read by the source after the run succeeds. Defaults to false.")
    private Boolean deleteInputOnSuccess;

    // Use a no-args constructor to set field defaults.
    public Conf() {
      fileSetName = "";
      createIfNotExists = false;
      deleteInputOnSuccess = false;
    }
  }

  // CDAP will pass in a config with its fields populated based on the configuration given when creating the pipeline.
  public TextFileSetSource(Conf config) {
    this.config = config;
  }

  // configurePipeline is called exactly once when the pipeline is being created.
  // Any static configuration should be performed here.
  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    // if the user has set createIfNotExists to true, create the FileSet here.
    if (config.createIfNotExists) {
      pipelineConfigurer.createDataset(config.fileSetName,
                                       FileSet.class,
                                       FileSetProperties.builder()
                                         .setInputFormat(TextInputFormat.class)
                                         .setOutputFormat(TextOutputFormat.class)
                                         .setEnableExploreOnCreate(true)
                                         .setExploreFormat("text")
                                         .setExploreSchema("text string")
                                         .build()
      );
    }
    // set the output schema of this stage so that stages further down the pipeline will know their input schema.
    pipelineConfigurer.getStageConfigurer().setOutputSchema(OUTPUT_SCHEMA);
  }

  // prepareRun is called before every pipeline run, and is used to configure what the input should be,
  // as well as any arguments the input should use. It is called by the client that is submitting the batch job.
  @Override
  public void prepareRun(BatchSourceContext context) throws IOException {
    context.setInput(Input.ofDataset(config.fileSetName));
  }

  // onRunFinish is called at the end of the pipeline run by the client that submitted the batch job.
  @Override
  public void onRunFinish(boolean succeeded, BatchSourceContext context) {
    // perform any actions that should happen at the end of the run.
    // in our case, we want to delete the data read during this run if the run succeeded.
    if (succeeded && config.deleteInputOnSuccess) {
      FileSet fileSet = context.getDataset(config.fileSetName);
      for (Location inputLocation : fileSet.getInputLocations()) {
        try {
          inputLocation.delete(true);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  // initialize is called by each job executor before any call to transform is made.
  // This occurs at the start of the batch job run, after the job has been successfully submitted.
  // For example, if mapreduce is the execution engine, each mapper will call initialize at the start of the program.
  @Override
  public void initialize(BatchRuntimeContext context) throws Exception {
    super.initialize(context);
    // create any resources required by transform()
  }

  // destroy is called by each job executor at the end of its life.
  // For example, if mapreduce is the execution engine, each mapper will call destroy at the end of the program.
  @Override
  public void destroy() {
    // clean up any resources created by initialize
  }

  // transform is used to transform the key-value pair output by the input into objects output by this source.
  // The output should be a StructuredRecord if you want the source to be compatible with the plugins included
  // with Hydrator.
  @Override
  public void transform(KeyValue<LongWritable, Text> input, Emitter<StructuredRecord> emitter) throws Exception {
    emitter.emit(StructuredRecord.builder(OUTPUT_SCHEMA)
                   .set("position", input.getKey().get())
                   .set("text", input.getValue().toString())
                   .build()
    );
  }
}
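
// ---------------------------------------------------------------------------------------------
// Illustrative only (not part of the plugin class above): a sketch of how a stage using this
// plugin might look in a Hydrator pipeline configuration. The stage name "textSource" and the
// dataset name "myTextFileSet" are hypothetical placeholders; the plugin name "TextFileSet", the
// plugin type (BatchSource.PLUGIN_TYPE, i.e. "batchsource"), and the property keys fileSetName,
// createIfNotExists, and deleteInputOnSuccess come from the class above. The exact JSON layout
// may vary between CDAP versions.
//
// {
//   "name": "textSource",
//   "plugin": {
//     "name": "TextFileSet",
//     "type": "batchsource",
//     "properties": {
//       "fileSetName": "myTextFileSet",
//       "createIfNotExists": "true",
//       "deleteInputOnSuccess": "false"
//     }
//   }
// }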