/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.example.plugin;
import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSink;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.util.Iterator;
import javax.annotation.Nullable;
/**
* Batch Sink that writes to a FileSet in text format.
* Each record will be written as a single line, with record fields separated by a configurable separator.
*
* StructuredRecord is the first type parameter because it is the input type to the sink.
* The second and third type parameters are the key and value types expected by Hadoop's {@link TextOutputFormat}.
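*
* For example, with the default ',' separator, a record with fields
* {name: "alice", age: 30} is written out as the line "alice,30".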
*/
@Plugin(type = BatchSink.PLUGIN_TYPE)
@Name(TextFileSetSink.NAME)
@Description("Writes to a FileSet in text format.")
public class TextFileSetSink extends BatchSink<StructuredRecord, NullWritable, Text> {
public static final String NAME = "TextFileSet";
private final Conf config;
/**
* Config properties for the plugin.
*/
public static class Conf extends PluginConfig {
public static final String FILESET_NAME = "fileSetName";
public static final String FIELD_SEPARATOR = "fieldSeparator";
// The Name annotation tells CDAP the property name. It is optional; if omitted, the property name
// defaults to the field name.
// Note: only primitives (including boxed types) and String are supported as property types.
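// For example, a pipeline configuration might set these properties as follows
// (the values here are illustrative):
//   "properties": { "fileSetName": "lines", "fieldSeparator": "|" }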
@Name(FILESET_NAME)
@Description("The name of the FileSet to read from.")
private String fileSetName;
@Nullable
@Name(FIELD_SEPARATOR)
@Description("The separator to use to join input record fields together. Defaults to ','.")
private String fieldSeparator;
// Use a no-args constructor to set field defaults.
public Conf() {
fileSetName = "";
fieldSeparator = ",";
}
}
// CDAP will pass in a config with its fields populated based on the configuration given when creating the pipeline.
public TextFileSetSink(Conf config) {
this.config = config;
}
// configurePipeline is called exactly once when the pipeline is being created.
// Any static configuration should be performed here.
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
// create the FileSet here. Enabling Explore with a single 'text' column allows the
// file contents to be queried with SQL, one row per line.
pipelineConfigurer.createDataset(config.fileSetName,
FileSet.class,
FileSetProperties.builder()
.setInputFormat(TextInputFormat.class)
.setOutputFormat(TextOutputFormat.class)
.setEnableExploreOnCreate(true)
.setExploreFormat("text")
.setExploreSchema("text string")
.build()
);
}
// prepareRun is called before every pipeline run, and is used to configure what the output should be,
// as well as any arguments the output should use. It is called by the client that is submitting the batch job.
@Override
public void prepareRun(BatchSinkContext context) throws Exception {
context.addOutput(Output.ofDataset(config.fileSetName));
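// A sketch, assuming illustrative runtime values: the output location within the
// FileSet could also be set explicitly (via FileSetArguments, from the same package
// as FileSet) before registering the output, e.g.:
//   Map<String, String> args = new HashMap<>();
//   FileSetArguments.setOutputPath(args, "some/output/path");
//   context.addOutput(Output.ofDataset(config.fileSetName, args));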
}
// onRunFinish is called at the end of the pipeline run by the client that submitted the batch job.
@Override
public void onRunFinish(boolean succeeded, BatchSinkContext context) {
// perform any actions that should happen at the end of the run.
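// For example (illustrative), partial output files could be cleaned up here
// when succeeded is false.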
}
// initialize is called by each job executor before any call to transform is made.
// This occurs at the start of the batch job run, after the job has been successfully submitted.
// For example, if mapreduce is the execution engine, each mapper will call initialize at the start of the program.
@Override
public void initialize(BatchRuntimeContext context) throws Exception {
super.initialize(context);
// create any resources required by transform()
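// For example (illustrative), a formatter or reusable buffer shared across
// transform() calls could be created here; this sink needs none.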
}
// destroy is called by each job executor at the end of its life.
// For example, if mapreduce is the execution engine, each mapper will call destroy at the end of the program.
@Override
public void destroy() {
// clean up any resources created by initialize
}
@Override
public void transform(StructuredRecord input, Emitter<KeyValue<NullWritable, Text>> emitter) throws Exception {
StringBuilder joinedFields = new StringBuilder();
Iterator<Schema.Field> fieldIter = input.getSchema().getFields().iterator();
if (!fieldIter.hasNext()) {
// a record should always have at least one field; emit nothing if it somehow does not
return;
}
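// Append the first field's value, then a separator before each subsequent field.
// Null field values are written as empty strings.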
Object val = input.get(fieldIter.next().getName());
if (val != null) {
joinedFields.append(val);
}
while (fieldIter.hasNext()) {
String fieldName = fieldIter.next().getName();
joinedFields.append(config.fieldSeparator);
val = input.get(fieldName);
if (val != null) {
joinedFields.append(val);
}
}
emitter.emit(new KeyValue<>(NullWritable.get(), new Text(joinedFields.toString())));
}
}