/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.planner;

import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.PipelinePhase;
import co.cask.cdap.etl.proto.Connection;
import co.cask.cdap.etl.spec.PipelineSpec;
import co.cask.cdap.etl.spec.StageSpec;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * Takes a {@link PipelineSpec} and creates an execution plan from it.
 */
public class PipelinePlanner {
  private final Set<String> reduceTypes;
  private final Set<String> isolationTypes;
  private final Set<String> supportedPluginTypes;

  public PipelinePlanner(Set<String> supportedPluginTypes, Set<String> reduceTypes, Set<String> isolationTypes) {
    this.reduceTypes = ImmutableSet.copyOf(reduceTypes);
    this.isolationTypes = ImmutableSet.copyOf(isolationTypes);
    this.supportedPluginTypes = ImmutableSet.copyOf(supportedPluginTypes);
  }

  /**
   * Create an execution plan for the given logical pipeline. This is used for batch pipelines,
   * though it may eventually be useful for marking windowing points in realtime pipelines.
   *
   * A plan consists of one or more phases, with connections between phases.
   * A connection between phases indicates control flow, not necessarily data flow.
   * This class assumes that it receives a valid pipeline spec.
   * That is, the pipeline has no cycles, all its nodes have unique names,
   * sources don't have any input, sinks don't have any output,
   * everything else has both an input and an output, and so on.
   *
   * We start by inserting connector nodes into the logical dag,
   * which are used to mark boundaries between mapreduce jobs.
   * Each connector represents a node where we will need to write to a local dataset.
   *
   * Next, the logical pipeline is broken up into phases,
   * using the connectors as sinks in one phase and as sources in another.
   * After this point, connections between phases indicate control flow rather than data flow.
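   *
   * For example, given a logical pipeline like the following (stage names are hypothetical):
   *
   *   source --&gt; agg1 --&gt; agg2 --&gt; sink
   *
   * where agg1 and agg2 are both reduce types, a single mapreduce job cannot execute both
   * reduce stages, so a connector is inserted in front of agg2. Splitting on that connector
   * yields two phases, sketched roughly as:
   *
   *   phase1: source --&gt; agg1 --&gt; [agg2 connector]
   *   phase2: [agg2 connector] --&gt; agg2 --&gt; sink
   *
   * with a control-flow connection from phase1 to phase2. The exact connector names are
   * chosen by {@link ConnectorDag}.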
   *
   * @param spec the pipeline spec, representing a logical pipeline
   * @return the execution plan
   */
  public PipelinePlan plan(PipelineSpec spec) {
    // go through the stages and examine their plugin type to determine which stages are reduce stages
    Set<String> reduceNodes = new HashSet<>();
    Set<String> isolationNodes = new HashSet<>();
    Map<String, StageSpec> specs = new HashMap<>();
    for (StageSpec stage : spec.getStages()) {
      if (reduceTypes.contains(stage.getPlugin().getType())) {
        reduceNodes.add(stage.getName());
      }
      if (isolationTypes.contains(stage.getPlugin().getType())) {
        isolationNodes.add(stage.getName());
      }
      specs.put(stage.getName(), stage);
    }

    // insert connector stages into the logical pipeline
    ConnectorDag cdag = ConnectorDag.builder()
      .addConnections(spec.getConnections())
      .addReduceNodes(reduceNodes)
      .addIsolationNodes(isolationNodes)
      .build();
    cdag.insertConnectors();
    Set<String> connectorNodes = cdag.getConnectors();

    // now split the logical pipeline into pipeline phases, using the connectors as split points
    Map<String, Dag> subdags = new HashMap<>();
    // assign a name to each subdag
    for (Dag subdag : cdag.splitOnConnectors()) {
      String name = getPhaseName(subdag.getSources(), subdag.getSinks());
      subdags.put(name, subdag);
    }

    // build connections between phases
    Set<Connection> phaseConnections = new HashSet<>();
    for (Map.Entry<String, Dag> subdagEntry1 : subdags.entrySet()) {
      String dag1Name = subdagEntry1.getKey();
      Dag dag1 = subdagEntry1.getValue();
      for (Map.Entry<String, Dag> subdagEntry2 : subdags.entrySet()) {
        String dag2Name = subdagEntry2.getKey();
        Dag dag2 = subdagEntry2.getValue();
        if (dag1Name.equals(dag2Name)) {
          continue;
        }
        // if dag1 has any sinks that are a source in dag2, add a connection between the dags
        if (!Sets.intersection(dag1.getSinks(), dag2.getSources()).isEmpty()) {
          phaseConnections.add(new Connection(dag1Name, dag2Name));
        }
      }
    }

    // convert to the objects the programs expect
    Map<String, PipelinePhase> phases = new HashMap<>();
    for (Map.Entry<String, Dag> dagEntry : subdags.entrySet()) {
      phases.put(dagEntry.getKey(), dagToPipeline(dagEntry.getValue(), connectorNodes, specs));
    }
    return new PipelinePlan(phases, phaseConnections);
  }

  /**
   * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
   * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
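   *
   * As a rough sketch (stage names and plugin types are hypothetical): given a dag
   * [agg2 connector] --&gt; agg2 --&gt; sink, the connector is added under
   * {@link Constants#CONNECTOR_TYPE} with no stage spec, while agg2 and sink are added
   * under the plugin types declared in their specs, along with the connections between them.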
   *
   * @param dag the dag to convert
   * @param connectors connector nodes across all dags
   * @param specs specifications for every stage
   * @return the converted dag as a {@link PipelinePhase}
   */
  private PipelinePhase dagToPipeline(Dag dag, Set<String> connectors, Map<String, StageSpec> specs) {
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);

    for (String stageName : dag.getTopologicalOrder()) {
      Set<String> outputs = dag.getNodeOutputs(stageName);
      if (!outputs.isEmpty()) {
        phaseBuilder.addConnections(stageName, outputs);
      }

      // add connectors
      if (connectors.contains(stageName)) {
        // connectors have no stage spec, so they carry no error dataset
        phaseBuilder.addStage(Constants.CONNECTOR_TYPE, new StageInfo(stageName, null));
        continue;
      }

      // add other plugin types
      StageSpec spec = specs.get(stageName);
      String pluginType = spec.getPlugin().getType();
      StageInfo stageInfo = new StageInfo(stageName, spec.getErrorDatasetName());
      phaseBuilder.addStage(pluginType, stageInfo);
    }

    return phaseBuilder.build();
  }

  /**
   * Builds a deterministic name for a phase from its sources and sinks.
   * For example, sources {s2, s1} and sinks {k1} produce "s1.s2.to.k1".
   */
  @VisibleForTesting
  static String getPhaseName(Set<String> sources, Set<String> sinks) {
    // using sorted sets to guarantee the name is deterministic
    return Joiner.on('.').join(new TreeSet<>(sources)) + ".to." + Joiner.on('.').join(new TreeSet<>(sinks));
  }
}