/* * Copyright © 2015-2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.etl.batch; import co.cask.cdap.api.app.AbstractApplication; import co.cask.cdap.api.dataset.lib.FileSetProperties; import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet; import co.cask.cdap.api.schedule.Schedules; import co.cask.cdap.etl.api.Transform; import co.cask.cdap.etl.api.batch.BatchSink; import co.cask.cdap.etl.api.batch.BatchSource; import co.cask.cdap.etl.batch.mapreduce.ETLMapReduce; import co.cask.cdap.etl.batch.spark.ETLSpark; import co.cask.cdap.etl.common.Constants; import co.cask.cdap.etl.common.PipelinePhase; import co.cask.cdap.etl.planner.PipelinePlan; import co.cask.cdap.etl.planner.PipelinePlanner; import co.cask.cdap.etl.proto.Engine; import co.cask.cdap.etl.proto.v2.ETLBatchConfig; import co.cask.cdap.etl.spec.PipelineSpecGenerator; import co.cask.cdap.etl.spec.StageSpec; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableSet; import org.apache.avro.mapreduce.AvroKeyInputFormat; import org.apache.avro.mapreduce.AvroKeyOutputFormat; import java.util.HashMap; import java.util.Set; /** * ETL Batch Application. */ public class ETLBatchApplication extends AbstractApplication<ETLBatchConfig> { public static final String SCHEDULE_NAME = "etlWorkflow"; public static final String DEFAULT_DESCRIPTION = "Extract-Transform-Load (ETL) Batch Application"; private static final Set<String> SUPPORTED_PLUGIN_TYPES = ImmutableSet.of( BatchSource.PLUGIN_TYPE, BatchSink.PLUGIN_TYPE, Transform.PLUGIN_TYPE); @Override public void configure() { ETLBatchConfig config = getConfig().convertOldConfig(); setDescription(DEFAULT_DESCRIPTION); PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> specGenerator = new BatchPipelineSpecGenerator( getConfigurer(), ImmutableSet.of(BatchSource.PLUGIN_TYPE), ImmutableSet.of(BatchSink.PLUGIN_TYPE), TimePartitionedFileSet.class, FileSetProperties.builder() .setInputFormat(AvroKeyInputFormat.class) .setOutputFormat(AvroKeyOutputFormat.class) .setEnableExploreOnCreate(true) .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe") .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat") .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat") .setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString()) .build()); BatchPipelineSpec spec = specGenerator.generateSpec(config); int sourceCount = 0; for (StageSpec stageSpec : spec.getStages()) { if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) { sourceCount++; } } if (sourceCount != 1) { throw new IllegalArgumentException("Invalid pipeline. There must only be one source."); } PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(), ImmutableSet.<String>of()); PipelinePlan plan = planner.plan(spec); if (plan.getPhases().size() != 1) { // should never happen if there is only one source throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase."); } PipelinePhase pipeline = plan.getPhases().values().iterator().next(); switch (config.getEngine()) { case MAPREDUCE: BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(ETLMapReduce.NAME, pipeline, config.getResources(), config.isStageLoggingEnabled(), new HashMap<String, String>()); addMapReduce(new ETLMapReduce(batchPhaseSpec)); break; case SPARK: batchPhaseSpec = new BatchPhaseSpec(ETLSpark.class.getSimpleName(), pipeline, config.getResources(), config.isStageLoggingEnabled(), new HashMap<String, String>()); addSpark(new ETLSpark(batchPhaseSpec)); break; default: throw new IllegalArgumentException( String.format("Invalid execution engine '%s'. Must be one of %s.", config.getEngine(), Joiner.on(',').join(Engine.values()))); } addWorkflow(new ETLWorkflow(spec, config.getEngine())); scheduleWorkflow(Schedules.builder(SCHEDULE_NAME) .setDescription("ETL Batch schedule") .createTimeSchedule(config.getSchedule()), ETLWorkflow.NAME); } }