/*
 * Copyright © 2015-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.batch.spark;

import co.cask.cdap.api.spark.AbstractSpark;
import co.cask.cdap.api.spark.SparkClientContext;
import co.cask.cdap.etl.api.batch.BatchAggregator;
import co.cask.cdap.etl.api.batch.BatchConfigurable;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.etl.api.batch.SparkSink;
import co.cask.cdap.etl.batch.AbstractAggregatorContext;
import co.cask.cdap.etl.batch.BatchPhaseSpec;
import co.cask.cdap.etl.batch.CompositeFinisher;
import co.cask.cdap.etl.batch.Finisher;
import co.cask.cdap.etl.batch.PipelinePluginInstantiator;
import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.DatasetContextLookupProvider;
import co.cask.cdap.etl.common.SetMultimapCodec;
import co.cask.cdap.etl.planner.StageInfo;
import com.google.common.base.Joiner;
import com.google.common.collect.SetMultimap;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Configures and sets up runs of {@link ETLSparkProgram}.
 */
public class ETLSpark extends AbstractSpark {
  private static final Logger LOG = LoggerFactory.getLogger(ETLSpark.class);
  private static final Gson GSON = new GsonBuilder()
    .registerTypeAdapter(SetMultimap.class, new SetMultimapCodec<>()).create();

  private final BatchPhaseSpec phaseSpec;
  private Finisher finisher;
  private List<File> cleanupFiles;

  public ETLSpark(BatchPhaseSpec phaseSpec) {
    this.phaseSpec = phaseSpec;
  }

  @Override
  protected void configure() {
    setName(phaseSpec.getPhaseName());
    setDescription("Spark phase executor. " + phaseSpec.getDescription());
    setMainClass(ETLSparkProgram.class);

    setExecutorResources(phaseSpec.getResources());
    setDriverResources(phaseSpec.getResources());

    if (phaseSpec.getPhase().getSources().size() != 1) {
      throw new IllegalArgumentException("Pipeline must contain exactly one source.");
    }
    if (phaseSpec.getPhase().getSinks().isEmpty()) {
      throw new IllegalArgumentException("Pipeline must contain at least one sink.");
    }

    // add source, sink, transform ids to the properties.
    // These are needed at runtime to instantiate the plugins.
    Map<String, String> properties = new HashMap<>();
    properties.put(Constants.PIPELINEID, GSON.toJson(phaseSpec));
    setProperties(properties);
  }
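  /**
   * Instantiates the source, sink, and aggregator plugins of this phase, calls their prepareRun methods,
   * and localizes a serialized copy of the source/sink factories so that {@link ETLSparkProgram} can
   * recreate them at runtime.
   */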
  @Override
  public void beforeSubmit(SparkClientContext context) throws Exception {
    cleanupFiles = new ArrayList<>();
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();

    context.setSparkConf(new SparkConf().set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m"));

    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

    // we checked at configure time that there is exactly one source
    String sourceName = phaseSpec.getPhase().getSources().iterator().next();
    BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
    DatasetContextLookupProvider lookupProvider = new DatasetContextLookupProvider(context);
    SparkBatchSourceContext sourceContext = new SparkBatchSourceContext(context, lookupProvider, sourceName);
    batchSource.prepareRun(sourceContext);

    SparkBatchSourceFactory sourceFactory = sourceContext.getSourceFactory();
    if (sourceFactory == null) {
      // TODO: Revisit what exception to throw
      throw new IllegalArgumentException("No input was set. Please make sure the source plugin calls setInput when " +
                                           "preparing the run.");
    }
    finishers.add(batchSource, sourceContext);

    // a SparkSink gets a spark plugin context, while a regular batch sink gets a batch sink context
    SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    for (String sinkName : phaseSpec.getPhase().getSinks()) {
      BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);

      if (batchSink instanceof SparkSink) {
        BasicSparkPluginContext sparkPluginContext = new BasicSparkPluginContext(context, lookupProvider, sinkName);
        ((SparkSink) batchSink).prepareRun(sparkPluginContext);
        finishers.add((SparkSink) batchSink, sparkPluginContext);
      } else {
        BatchSinkContext sinkContext = new SparkBatchSinkContext(sinkFactory, context, null, sinkName);
        batchSink.prepareRun(sinkContext);
        finishers.add(batchSink, sinkContext);
      }
    }

    // the planner should have guaranteed that each phase contains at most one aggregator
    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    Integer numPartitions = null;
    if (!aggregators.isEmpty()) {
      if (aggregators.size() > 1) {
        throw new IllegalArgumentException(String.format(
          "There was an error during planning. Phase %s has multiple aggregators %s.",
          phaseSpec.getPhaseName(), Joiner.on(',').join(aggregators)));
      }
      String aggregatorName = aggregators.iterator().next().getName();
      BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
      AbstractAggregatorContext aggregatorContext =
        new SparkAggregatorContext(context, new DatasetContextLookupProvider(context), aggregatorName);
      aggregator.prepareRun(aggregatorContext);
      finishers.add(aggregator, aggregatorContext);
      numPartitions = aggregatorContext.getNumPartitions();
    }

    // serialize the source and sink factories, followed by the aggregator's number of partitions
    // (-1 if there is no aggregator), into a config file that is localized to the Spark program
    File configFile = File.createTempFile("ETLSpark", ".config");
    cleanupFiles.add(configFile);
    try (OutputStream os = new FileOutputStream(configFile)) {
      sourceFactory.serialize(os);
      sinkFactory.serialize(os);
      DataOutput dataOutput = new DataOutputStream(os);
      dataOutput.writeInt(numPartitions == null ? -1 : numPartitions);
    }

    finisher = finishers.build();
    context.localize("ETLSpark.config", configFile.toURI());
  }
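  /**
   * Runs the finishers collected in beforeSubmit for every prepared stage, then deletes the temporary
   * config file created for this run.
   */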
  @Override
  public void onFinish(boolean succeeded, SparkClientContext context) throws Exception {
    finisher.onFinish(succeeded);
    for (File file : cleanupFiles) {
      if (!file.delete()) {
        LOG.warn("Failed to clean up resource {}", file);
      }
    }
  }
}