/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.spec;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.plugin.PluginConfigurer;
import co.cask.cdap.etl.api.PipelineConfigurable;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.common.DefaultPipelineConfigurer;
import co.cask.cdap.etl.planner.Dag;
import co.cask.cdap.etl.proto.Connection;
import co.cask.cdap.etl.proto.v2.ETLConfig;
import co.cask.cdap.etl.proto.v2.ETLPlugin;
import co.cask.cdap.etl.proto.v2.ETLStage;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * This is run at application configure time to take an application config {@link ETLConfig} and call
 * {@link PipelineConfigurable#configurePipeline(PipelineConfigurer)} on all plugins in the pipeline.
 * This generates a {@link PipelineSpec} which the programs understand.
 *
 * @param <C> the type of user provided config
 * @param <P> the pipeline specification generated from the config
 */
public abstract class PipelineSpecGenerator<C extends ETLConfig, P extends PipelineSpec> {
  /** Configurer used to register plugins and create error datasets; shared with subclasses. */
  protected final PluginConfigurer configurer;
  /** Dataset class used when a stage declares an error dataset. */
  private final Class<? extends Dataset> errorDatasetClass;
  /** Dataset properties used when a stage declares an error dataset. */
  private final DatasetProperties errorDatasetProperties;
  /** Plugin types that are treated as pipeline sources during validation. */
  private final Set<String> sourcePluginTypes;
  /** Plugin types that are treated as pipeline sinks during validation. */
  private final Set<String> sinkPluginTypes;

  /**
   * Creates a generator.
   *
   * @param configurer configurer used to register plugins and create error datasets
   * @param sourcePluginTypes plugin types that identify a stage as a source
   * @param sinkPluginTypes plugin types that identify a stage as a sink
   * @param errorDatasetClass dataset class to use for stage error datasets
   * @param errorDatasetProperties dataset properties to use for stage error datasets
   */
  protected PipelineSpecGenerator(PluginConfigurer configurer,
                                  Set<String> sourcePluginTypes,
                                  Set<String> sinkPluginTypes,
                                  Class<? extends Dataset> errorDatasetClass,
                                  DatasetProperties errorDatasetProperties) {
    this.configurer = configurer;
    this.sourcePluginTypes = sourcePluginTypes;
    this.sinkPluginTypes = sinkPluginTypes;
    this.errorDatasetClass = errorDatasetClass;
    this.errorDatasetProperties = errorDatasetProperties;
  }

  /**
   * Validate the user provided ETL config and generate a pipeline specification from it.
   * It will also register all plugins used by the pipeline and create any error datasets used by the pipeline.
   *
   * A valid pipeline has the following properties:
   * <ul>
   *   <li>All stages in the pipeline have a unique name.</li>
   *   <li>Source stages have at least one output and no inputs.</li>
   *   <li>Sink stages have at least one input and no outputs.</li>
   *   <li>There are no cycles in the pipeline.</li>
   *   <li>All inputs into a stage have the same schema.</li>
   * </ul>
   *
   * @param config user provided ETL config
   * @return the pipeline specification generated from the config
   */
  public abstract P generateSpec(C config);

  /**
   * Performs most of the validation and configuration needed by a pipeline.
   * Handles stages, connections, resources, and stage logging settings.
   *
   * Stages are configured in the order returned by {@link #validateConfig(ETLConfig)}, and the output
   * schema of each configured stage is set as the input schema of every stage it connects to.
   *
   * @param config user provided ETL config
   * @param specBuilder builder for creating a pipeline spec.
   * @throws IllegalArgumentException if the pipeline is invalid or a plugin cannot be found
   */
  protected void configureStages(ETLConfig config, PipelineSpec.Builder specBuilder) {
    // validate the config and determine the order we should configure the stages in.
    List<StageConnections> traversalOrder = validateConfig(config);

    // create every configurer up front, since configuring a stage needs to push its output schema
    // into the configurers of stages that have not been configured yet.
    Map<String, DefaultPipelineConfigurer> pluginConfigurers = new HashMap<>(traversalOrder.size());
    for (StageConnections stageConnections : traversalOrder) {
      String stageName = stageConnections.getStage().getName();
      pluginConfigurers.put(stageName, new DefaultPipelineConfigurer(configurer, stageName));
    }

    // configure the stages in order and build up the stage specs
    for (StageConnections stageConnections : traversalOrder) {
      ETLStage stage = stageConnections.getStage();
      String stageName = stage.getName();
      DefaultPipelineConfigurer pluginConfigurer = pluginConfigurers.get(stageName);

      StageSpec stageSpec = configureStage(stageConnections, pluginConfigurer);
      Schema outputSchema = stageSpec.getOutputSchema();

      // for each output, set their input schema to our output schema
      // NOTE(review): presumably setInputSchema rejects conflicting schemas from multiple inputs -- confirm.
      for (String outputStageName : stageConnections.getOutputs()) {
        pluginConfigurers.get(outputStageName).getStageConfigurer().setInputSchema(outputSchema);
      }
      specBuilder.addStage(stageSpec);
    }

    specBuilder.addConnections(config.getConnections())
      .setResources(config.getResources())
      .setStageLoggingEnabled(config.isStageLoggingEnabled());
  }

  /**
   * Configures a stage and returns the spec for it. If the stage declares an error dataset, that dataset
   * is created before the stage's plugin is configured.
   *
   * @param stageConnections the user provided configuration for the stage along with its connections
   * @param pluginConfigurer configurer used to configure the stage
   * @return the spec for the stage
   */
  private StageSpec configureStage(StageConnections stageConnections, DefaultPipelineConfigurer pluginConfigurer) {
    ETLStage stage = stageConnections.getStage();
    String stageName = stage.getName();
    ETLPlugin stagePlugin = stage.getPlugin();

    if (!Strings.isNullOrEmpty(stage.getErrorDatasetName())) {
      configurer.createDataset(stage.getErrorDatasetName(), errorDatasetClass, errorDatasetProperties);
    }

    PluginSpec pluginSpec = configurePlugin(stageName, stagePlugin, pluginConfigurer);

    // schemas are read after the plugin has been configured, so they reflect whatever the stage
    // configurer holds once configurePipeline has run.
    Schema inputSchema = pluginConfigurer.getStageConfigurer().getInputSchema();
    Schema outputSchema = pluginConfigurer.getStageConfigurer().getOutputSchema();

    return StageSpec.builder(stageName, pluginSpec)
      .setErrorDatasetName(stage.getErrorDatasetName())
      .setInputSchema(inputSchema)
      .setOutputSchema(outputSchema)
      .addInputs(stageConnections.getInputs())
      .addOutputs(stageConnections.getOutputs())
      .build();
  }

  /**
   * Configures a plugin and returns the spec for it. The plugin is registered through the
   * {@link PluginConfigurer} and then asked to configure itself with the given pipeline configurer.
   *
   * @param pluginId the unique plugin id
   * @param etlPlugin user provided configuration for the plugin
   * @param pipelineConfigurer configurer used to configure the plugin
   * @return the spec for the plugin
   * @throws IllegalArgumentException if no plugin of the requested type and name could be found
   * @throws RuntimeException if the plugin throws while configuring itself; the original exception is the cause
   */
  protected PluginSpec configurePlugin(String pluginId, ETLPlugin etlPlugin,
                                       PipelineConfigurer pipelineConfigurer) {
    // wrap the user's selector so the artifact that was selected can be recorded in the spec
    TrackedPluginSelector pluginSelector = new TrackedPluginSelector(etlPlugin.getPluginSelector());
    PipelineConfigurable plugin = configurer.usePlugin(etlPlugin.getType(),
                                                       etlPlugin.getName(),
                                                       pluginId,
                                                       etlPlugin.getPluginProperties(),
                                                       pluginSelector);
    if (plugin == null) {
      throw new IllegalArgumentException(
        String.format("No plugin of type %s and name %s could be found for stage %s.",
                      etlPlugin.getType(), etlPlugin.getName(), pluginId));
    }

    try {
      plugin.configurePipeline(pipelineConfigurer);
    } catch (Exception e) {
      // add the plugin and stage to the message so the user can tell which stage failed
      throw new RuntimeException(
        String.format("Exception while configuring plugin of type %s and name %s for stage %s: %s",
                      etlPlugin.getType(), etlPlugin.getName(), pluginId, e.getMessage()),
        e);
    }

    return new PluginSpec(etlPlugin.getType(),
                          etlPlugin.getName(),
                          etlPlugin.getProperties(),
                          pluginSelector.getSelectedArtifact());
  }

  /**
   * Validate that this is a valid pipeline. This method checks the following properties:
   * <ul>
   *   <li>The pipeline contains at least one stage.</li>
   *   <li>All stages in the pipeline have a unique name.</li>
   *   <li>Every connection starts and ends at an existing stage.</li>
   *   <li>Source stages have at least one output and no inputs.</li>
   *   <li>Sink stages have at least one input and no outputs.</li>
   *   <li>All other stages have at least one input and at least one output.</li>
   * </ul>
   *
   * Schema compatibility between stages is not checked here.
   * NOTE(review): cycle detection is presumably done by {@link Dag} when it is built or ordered -- confirm.
   *
   * Returns the stages in the order they should be configured to ensure that all input stages are configured
   * before their output.
   *
   * @param config the user provided configuration
   * @return the order to configure the stages in
   * @throws IllegalArgumentException if the pipeline is invalid
   */
  private List<StageConnections> validateConfig(ETLConfig config) {
    config.validate();
    if (config.getStages().isEmpty()) {
      throw new IllegalArgumentException("A pipeline must contain at least one stage.");
    }

    // check stage name uniqueness
    Set<String> stageNames = new HashSet<>();
    for (ETLStage stage : config.getStages()) {
      if (!stageNames.add(stage.getName())) {
        throw new IllegalArgumentException(
          String.format("Invalid pipeline. Multiple stages are named %s. Please ensure all stage names are unique",
                        stage.getName()));
      }
    }

    // check that the from and to are names of actual stages
    for (Connection connection : config.getConnections()) {
      if (!stageNames.contains(connection.getFrom())) {
        throw new IllegalArgumentException(
          String.format("Invalid connection %s. %s is not a stage.", connection, connection.getFrom()));
      }
      if (!stageNames.contains(connection.getTo())) {
        throw new IllegalArgumentException(
          String.format("Invalid connection %s. %s is not a stage.", connection, connection.getTo()));
      }
    }

    Dag dag = new Dag(config.getConnections());

    // check source plugins are sources in the dag
    // check sink plugins are sinks in the dag
    // check that other plugins are not sources or sinks in the dag
    Map<String, StageConnections> stages = new HashMap<>();
    for (ETLStage stage : config.getStages()) {
      String stageName = stage.getName();
      String pluginType = stage.getPlugin().getType();
      Set<String> stageInputs = dag.getNodeInputs(stageName);
      Set<String> stageOutputs = dag.getNodeOutputs(stageName);

      if (isSource(pluginType)) {
        if (!stageInputs.isEmpty()) {
          throw new IllegalArgumentException(
            String.format("Source %s has incoming connections from %s. Sources cannot have any incoming connections.",
                          stageName, Joiner.on(',').join(stageInputs)));
        }
        // a source that feeds nothing is not part of any connection, so it would never appear in the
        // traversal order below and would be dropped without any error.
        if (stageOutputs.isEmpty()) {
          throw new IllegalArgumentException(
            String.format("Source %s has no outgoing connections. Sources must have at least one outgoing connection.",
                          stageName));
        }
      } else if (isSink(pluginType)) {
        if (!stageOutputs.isEmpty()) {
          throw new IllegalArgumentException(
            String.format("Sink %s has outgoing connections to %s. Sinks cannot have any outgoing connections.",
                          stageName, Joiner.on(',').join(stageOutputs)));
        }
        // same reasoning as for sources: a sink with no inputs would be dropped without any error.
        if (stageInputs.isEmpty()) {
          throw new IllegalArgumentException(
            String.format("Sink %s has no incoming connections. Sinks must have at least one incoming connection.",
                          stageName));
        }
      } else {
        if (stageInputs.isEmpty()) {
          throw new IllegalArgumentException(
            String.format("Stage %s is unreachable, it has no incoming connections.", stageName));
        }
        if (stageOutputs.isEmpty()) {
          throw new IllegalArgumentException(
            String.format("Stage %s is a dead end, it has no outgoing connections.", stageName));
        }
      }

      stages.put(stageName, new StageConnections(stage, stageInputs, stageOutputs));
    }

    // order the stages the way the dag orders its nodes
    List<StageConnections> traversalOrder = new ArrayList<>(stages.size());
    for (String stageName : dag.getTopologicalOrder()) {
      traversalOrder.add(stages.get(stageName));
    }

    return traversalOrder;
  }

  /**
   * @param pluginType the plugin type of a stage
   * @return whether the plugin type is one of the configured source plugin types
   */
  private boolean isSource(String pluginType) {
    return sourcePluginTypes.contains(pluginType);
  }

  /**
   * @param pluginType the plugin type of a stage
   * @return whether the plugin type is one of the configured sink plugin types
   */
  private boolean isSink(String pluginType) {
    return sinkPluginTypes.contains(pluginType);
  }
}