/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package co.cask.cdap.api.dataset.lib.partitioned;

import co.cask.cdap.api.annotation.Beta;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.dataset.lib.DatasetStatePersistor;
import co.cask.cdap.api.dataset.lib.Partition;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.mapreduce.MapReduceContext;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * A utility class for batch processing (i.e. MapReduce). It configures a {@link PartitionedFileSet} as input to a
 * {@link MapReduceContext}, with runtime arguments that select the partitions to be processed by the run.
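 * <p>
 * A minimal usage sketch from a MapReduce program (the dataset names and the {@code KVTableStatePersistor}
 * arguments are illustrative, not prescribed by this API):
 * <pre>{@code
 * public class MyMapReduce extends AbstractMapReduce {
 *   private PartitionBatchInput.BatchPartitionCommitter committer;
 *
 *   public void beforeSubmit(MapReduceContext context) throws Exception {
 *     committer = PartitionBatchInput.setInput(context, "events",
 *                                              new KVTableStatePersistor("consumingState", "state.key"));
 *   }
 *
 *   public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
 *     committer.onFinish(succeeded);
 *   }
 * }
 * }</pre>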
*/
@Beta
public class PartitionBatchInput {
/**
* See {@link #setInput(MapReduceContext, String, DatasetStatePersistor, ConsumerConfiguration)},
* but using {@link ConsumerConfiguration#DEFAULT}.
*/
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext,
String partitionedFileSetName,
DatasetStatePersistor statePersistor) {
return setInput(mapreduceContext, partitionedFileSetName, statePersistor, ConsumerConfiguration.DEFAULT);
  }

  /**
 * Used from the beforeSubmit method of the implementing batch job to configure a {@link PartitionedFileSet} as
 * input, selecting the set of {@link Partition}s to be processed by this run of the batch job.
 * It does this by reading back the previous consumer state, determining the new partitions to read, computing the
 * new state, and persisting it. It then configures the dataset as input to the MapReduce context that is passed in.
*
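 * <p>
 * For example, to consume with a non-default configuration (a sketch, assuming the {@link ConsumerConfiguration}
 * builder methods shown; the dataset names and values are illustrative):
 * <pre>{@code
 * ConsumerConfiguration configuration = ConsumerConfiguration.builder()
 *   .setMaxWorkingSetSize(500)
 *   .setMaxRetries(3)
 *   .build();
 * committer = PartitionBatchInput.setInput(context, "events",
 *                                          new KVTableStatePersistor("consumingState", "state.key"),
 *                                          configuration);
 * }</pre>
 *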
* @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
* configured
* @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
* @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer state is
* managed
* @param consumerConfiguration defines parameters for the partition consumption
* @return a BatchPartitionCommitter used to persist the state of the partition consumer
*/
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext,
String partitionedFileSetName,
DatasetStatePersistor statePersistor,
ConsumerConfiguration consumerConfiguration) {
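    // create a consumer over the PartitionedFileSet; it reads back the previously persisted consumer state
    // (via the given DatasetStatePersistor) to determine which partitions have not yet been processed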
PartitionedFileSet partitionedFileSet = mapreduceContext.getDataset(partitionedFileSetName);
final PartitionConsumer partitionConsumer =
new ConcurrentPartitionConsumer(partitionedFileSet,
new DelegatingStatePersistor(mapreduceContext, statePersistor),
consumerConfiguration);
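    // consume the new partitions and register them as input partitions of this run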
final List<PartitionDetail> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
Map<String, String> arguments = new HashMap<>();
PartitionedFileSetArguments.addInputPartitions(arguments, consumedPartitions);
mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, arguments));
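    // the returned committer marks the consumed partitions as either processed or available for reprocessing,
    // depending on whether the run succeeded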
return new BatchPartitionCommitter() {
@Override
public void onFinish(boolean succeeded) {
partitionConsumer.onFinish(consumedPartitions, succeeded);
}
};
  }

  /**
 * Used to persist the state of the partition consumer. Call {@link #onFinish(boolean)} once processing of the
 * partitions is complete.
*/
public interface BatchPartitionCommitter {
void onFinish(boolean succeeded);
}
}