/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.batch.connector;

import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.api.workflow.WorkflowConfigurer;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.format.StructuredRecordStringConverter;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.util.HashMap;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * Internal batch source used as a connector between pipeline phases.
 * Though this extends BatchSource, it will not be instantiated through the plugin framework, but will
 * be created explicitly by the application.
 *
 * The batch connector is just a PartitionedFileSet, where each partition is named after the phase that wrote to it.
 * This way, multiple phases can have the same local PartitionedFileSet as a sink, and the source will read data
 * from all partitions.
 *
 * This is done because we don't want this to show up as a plugin that users can select and use, and also because
 * it uses features not exposed in the etl api (local workflow datasets).
 *
 * TODO: improve the storage format. It is currently JSON of the record, which is obviously not ideal.
 */
public class ConnectorSource extends BatchSource<LongWritable, Text, StructuredRecord> {
  static final Schema RECORD_WITH_SCHEMA = Schema.recordOf(
    "record",
    Schema.Field.of("schema", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("record", Schema.of(Schema.Type.STRING)));

  private final String datasetName;
  @Nullable
  private final Schema schema;

  public ConnectorSource(String datasetName, @Nullable Schema schema) {
    this.datasetName = datasetName;
    this.schema = schema;
  }

  // not the standard configurePipeline method. We need a WorkflowConfigurer to create a local dataset.
  // we may want to expose local datasets in cdap-etl-api, but that is a separate track.
  public void configure(WorkflowConfigurer workflowConfigurer) {
    Partitioning partitioning = Partitioning.builder()
      .addField("phase", Partitioning.FieldType.STRING)
      .build();
    workflowConfigurer.createLocalDataset(datasetName, PartitionedFileSet.class,
      PartitionedFileSetProperties.builder()
        .setPartitioning(partitioning)
        .setInputFormat(TextInputFormat.class)
        .setOutputFormat(TextOutputFormat.class)
        .build());
  }

  @Override
  public void prepareRun(BatchSourceContext context) throws Exception {
    Map<String, String> arguments = new HashMap<>();
    PartitionedFileSet inputFileset = context.getDataset(datasetName);
    // add every partition as input, since each phase that wrote to the dataset created its own partition
    for (PartitionDetail partitionDetail : inputFileset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
      PartitionedFileSetArguments.addInputPartition(arguments, partitionDetail);
    }
    context.setInput(datasetName, arguments);
  }

  @Override
  public void transform(KeyValue<LongWritable, Text> input, Emitter<StructuredRecord> emitter) throws Exception {
    StructuredRecord output;
    String inputStr = input.getValue().toString();
    if (schema == null) {
      // the schema is not known ahead of time, so each line is a wrapper record containing
      // both the schema and the record as json strings
      StructuredRecord recordWithSchema =
        StructuredRecordStringConverter.fromJsonString(inputStr, RECORD_WITH_SCHEMA);
      Schema outputSchema = Schema.parseJson((String) recordWithSchema.get("schema"));
      output = StructuredRecordStringConverter.fromJsonString((String) recordWithSchema.get("record"), outputSchema);
    } else {
      output = StructuredRecordStringConverter.fromJsonString(inputStr, schema);
    }
    emitter.emit(output);
  }
}
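
/*
 * Illustrative sketch (not part of the class above): how an application might wire this connector
 * between two phases. The dataset name "phase1.to.phase2" and the availability of a
 * WorkflowConfigurer at that point are assumptions made for the example.
 *
 *   ConnectorSource connectorSource = new ConnectorSource("phase1.to.phase2", null);
 *   connectorSource.configure(workflowConfigurer); // creates the local PartitionedFileSet
 *
 * Each upstream phase then writes its output as a partition keyed by its phase name, and this
 * source adds all partitions as input in prepareRun() before transform() converts each line back
 * into a StructuredRecord.
 */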