/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.api.batch;
import co.cask.cdap.api.annotation.Beta;
import co.cask.cdap.etl.api.Aggregator;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurable;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.StageLifecycle;
import java.util.Iterator;
/**
* An {@link Aggregator} used in batch programs.
* As it is used in batch programs, a BatchAggregator must be parameterized
* with supported group key and value classes. Group keys and values can be a
* byte[], Boolean, Integer, Long, Float, Double, String, or StructuredRecord.
* If the group key is not one of those types and is being used in MapReduce,
* it must implement Hadoop's org.apache.hadoop.io.WritableComparable interface.
* If the group value is not one of those types and is being used in MapReduce,
* it must implement Hadoop's org.apache.hadoop.io.Writable interface.
* If the aggregator is being used in Spark, both the group key and value must implement the
* {@link java.io.Serializable} interface.
*
* @param <GROUP_KEY> group key type. Must be a supported type
* @param <GROUP_VALUE> group value type. Must be a supported type
* @param <OUT> output object type
*/
@Beta
public abstract class BatchAggregator<GROUP_KEY, GROUP_VALUE, OUT> extends BatchConfigurable<BatchAggregatorContext>
implements Aggregator<GROUP_KEY, GROUP_VALUE, OUT>, PipelineConfigurable, StageLifecycle<BatchRuntimeContext> {
/**
* Plugin type string for batch aggregator plugins.
*/
public static final String PLUGIN_TYPE = "batchaggregator";
/**
* Configure the pipeline. This is run once when the pipeline is being published.
* This is where you perform any static logic, like creating required datasets, performing schema validation,
* setting output schema, and things of that nature.
*
* <p>The default implementation does nothing; subclasses override it when they have static configuration to perform.
*
* @param pipelineConfigurer the configurer used to add required datasets and streams
*/
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
// no-op by default; subclasses may override
}
/**
* Prepare a pipeline run. This is run every time before a pipeline runs, in order to help set up the run.
* This is where you would set things like the number of partitions to use when grouping, and set the
* group key and value classes if they are not known at compile time.
*
* <p>The default implementation does nothing.
*
* @param context batch execution context
* @throws Exception if an overriding implementation encounters an error while preparing the run
*/
@Override
public void prepareRun(BatchAggregatorContext context) throws Exception {
// no-op by default; subclasses may override
}
/**
* Initialize the Batch Aggregator. Executed inside the Batch Run. This method is guaranteed to be invoked
* before any calls to {@link #groupBy(Object, Emitter)} and {@link #aggregate(Object, Iterator, Emitter)} are made.
*
* <p>The default implementation does nothing.
*
* @param context {@link BatchRuntimeContext}
* @throws Exception if there is any error during initialization
*/
@Override
public void initialize(BatchRuntimeContext context) throws Exception {
// no-op by default; subclasses may override
}
/**
* Destroy the Batch Aggregator. Executed at the end of the Batch Run.
*
* <p>The default implementation does nothing.
*/
@Override
public void destroy() {
// no-op by default; subclasses may override
}
}