/*
* Copyright © 2014-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.internal.app.runtime.batch;

import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.InputFormatProvider;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.data.stream.StreamBatchReadable;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceSpecification;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.metrics.MetricsCollectionService;
import co.cask.cdap.api.metrics.MetricsContext;
import co.cask.cdap.api.plugin.Plugin;
import co.cask.cdap.app.metrics.ProgramUserMetrics;
import co.cask.cdap.app.program.Program;
import co.cask.cdap.app.runtime.Arguments;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.common.logging.LoggingContext;
import co.cask.cdap.data.stream.StreamInputFormatProvider;
import co.cask.cdap.data2.dataset2.DatasetFramework;
import co.cask.cdap.data2.metadata.lineage.AccessType;
import co.cask.cdap.data2.transaction.stream.StreamAdmin;
import co.cask.cdap.internal.app.runtime.AbstractContext;
import co.cask.cdap.internal.app.runtime.batch.dataset.DatasetInputFormatProvider;
import co.cask.cdap.internal.app.runtime.batch.dataset.DatasetOutputFormatProvider;
import co.cask.cdap.internal.app.runtime.batch.dataset.input.MapperInput;
import co.cask.cdap.internal.app.runtime.distributed.LocalizeResource;
import co.cask.cdap.internal.app.runtime.plugin.PluginInstantiator;
import co.cask.cdap.internal.app.runtime.workflow.BasicWorkflowToken;
import co.cask.cdap.internal.app.runtime.workflow.WorkflowProgramInfo;
import co.cask.cdap.logging.context.MapReduceLoggingContext;
import co.cask.cdap.logging.context.WorkflowProgramLoggingContext;
import co.cask.cdap.proto.Id;
import co.cask.cdap.proto.ProgramType;
import co.cask.cdap.proto.id.Ids;
import co.cask.cdap.proto.id.ProgramId;
import co.cask.tephra.TransactionContext;
import co.cask.tephra.TransactionSystemClient;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.twill.api.RunId;
import org.apache.twill.discovery.DiscoveryServiceClient;

import java.io.File;
import java.net.URI;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * MapReduce job runtime context. Provides access to the job specification and lets the
 * program configure inputs, outputs, mapper/reducer resources, and files to localize
 * before the job is submitted.
 */
public class BasicMapReduceContext extends AbstractContext implements MapReduceContext {
private final MapReduceSpecification spec;
private final LoggingContext loggingContext;
private final WorkflowProgramInfo workflowProgramInfo;
private final Metrics userMetrics;
private final Map<String, Plugin> plugins;
private final Map<String, OutputFormatProvider> outputFormatProviders;
private final TransactionContext txContext;
private final StreamAdmin streamAdmin;
private final File pluginArchive;
private final Map<String, LocalizeResource> resourcesToLocalize;
// key is input name, value is the MapperInput (configuration info) for that input
private Map<String, MapperInput> inputs;
private Job job;
private Resources mapperResources;
private Resources reducerResources;
public BasicMapReduceContext(Program program,
RunId runId,
Arguments runtimeArguments,
MapReduceSpecification spec,
@Nullable WorkflowProgramInfo workflowProgramInfo,
DiscoveryServiceClient discoveryServiceClient,
MetricsCollectionService metricsCollectionService,
TransactionSystemClient txClient,
DatasetFramework dsFramework,
StreamAdmin streamAdmin,
@Nullable File pluginArchive,
@Nullable PluginInstantiator pluginInstantiator) {
super(program, runId, runtimeArguments, Collections.<String>emptySet(),
createMetricsContext(program, runId.getId(), metricsCollectionService, workflowProgramInfo),
dsFramework, txClient, discoveryServiceClient, false, pluginInstantiator);
this.workflowProgramInfo = workflowProgramInfo;
this.userMetrics = new ProgramUserMetrics(getProgramMetrics());
this.loggingContext = createLoggingContext(program.getId(), runId, workflowProgramInfo);
this.spec = spec;
this.mapperResources = spec.getMapperResources();
this.reducerResources = spec.getReducerResources();
this.plugins = Maps.newHashMap(program.getApplicationSpecification().getPlugins());
this.txContext = getDatasetCache().newTransactionContext();
this.streamAdmin = streamAdmin;
this.pluginArchive = pluginArchive;
this.resourcesToLocalize = new HashMap<>();
this.inputs = new HashMap<>();
this.outputFormatProviders = new HashMap<>();
if (spec.getInputDataSet() != null) {
addInput(Input.ofDataset(spec.getInputDataSet()));
}
if (spec.getOutputDataSet() != null) {
setOutput(spec.getOutputDataSet());
}
}
public TransactionContext getTransactionContext() {
return txContext;
}
private LoggingContext createLoggingContext(Id.Program programId, RunId runId,
@Nullable WorkflowProgramInfo workflowProgramInfo) {
if (workflowProgramInfo == null) {
return new MapReduceLoggingContext(programId.getNamespaceId(), programId.getApplicationId(),
programId.getId(), runId.getId());
}
ProgramId workflowProgramId = Ids.namespace(programId.getNamespaceId()).app(programId.getApplicationId())
.workflow(workflowProgramInfo.getName());
return new WorkflowProgramLoggingContext(workflowProgramId.getNamespace(), workflowProgramId.getApplication(),
workflowProgramId.getProgram(), workflowProgramInfo.getRunId().getId(),
ProgramType.MAPREDUCE, programId.getId());
}
@Override
public String toString() {
return String.format("job=%s, %s", spec.getName(), super.toString());
}
@Override
public Map<String, Plugin> getPlugins() {
return plugins;
}
@Override
public MapReduceSpecification getSpecification() {
return spec;
}
/**
* Returns the {@link BasicWorkflowToken} if the MapReduce program is executed as part of a Workflow,
* or {@code null} otherwise.
*/
@Override
@Nullable
public BasicWorkflowToken getWorkflowToken() {
return workflowProgramInfo == null ? null : workflowProgramInfo.getWorkflowToken();
}
public void setJob(Job job) {
this.job = job;
}
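/**
 * Returns the underlying Hadoop {@link Job}, cast to the caller's expected type.
 * A minimal usage sketch (the tuning call shown is illustrative, not prescribed by this class):
 * <pre>{@code
 * // obtain the Hadoop job to apply engine-specific settings
 * Job job = context.getHadoopJob();
 * job.setNumReduceTasks(1);
 * }</pre>
 */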
@SuppressWarnings("unchecked")
@Override
public <T> T getHadoopJob() {
return (T) job;
}
@Override
public void setInput(StreamBatchReadable stream) {
setInput(new StreamInputFormatProvider(getProgram().getId().getNamespace(), stream, streamAdmin));
}
@Override
public void setInput(String datasetName) {
setInput(datasetName, ImmutableMap.<String, String>of());
}
@Override
public void setInput(String datasetName, Map<String, String> arguments) {
setInput(createInputFormatProvider(datasetName, arguments, null));
}
@Override
public void setInput(String datasetName, List<Split> splits) {
setInput(datasetName, ImmutableMap.<String, String>of(), splits);
}
@Override
public void setInput(String datasetName, Map<String, String> arguments, List<Split> splits) {
setInput(createInputFormatProvider(datasetName, arguments, splits));
}
@Override
public void setInput(String inputDatasetName, Dataset dataset) {
setInput(new DatasetInputFormatProvider(inputDatasetName, Collections.<String, String>emptyMap(),
dataset, null, MapReduceBatchReadableInputFormat.class));
}
@Override
public void setInput(InputFormatProvider inputFormatProvider) {
// with the setInput method, only one input is ever set, so the name does not matter much.
// use an immutable map so that any subsequent call to addInput fails fast.
inputs = ImmutableMap.of(inputFormatProvider.getInputFormatClassName(), new MapperInput(inputFormatProvider));
}
@Override
public void addInput(Input input) {
addInput(input, null);
}
@SuppressWarnings("unchecked")
private void addInput(String alias, InputFormatProvider inputFormatProvider, @Nullable Class<?> mapperClass) {
// prevent calls to addInput after setting a single input.
if (inputs instanceof ImmutableMap) {
throw new IllegalStateException("Cannot add inputs after setting a single input.");
}
if (mapperClass != null && !Mapper.class.isAssignableFrom(mapperClass)) {
throw new IllegalArgumentException("Specified mapper class must extend Mapper.");
}
if (inputs.containsKey(alias)) {
throw new IllegalArgumentException("Input already configured: " + alias);
}
inputs.put(alias, new MapperInput(inputFormatProvider, (Class<? extends Mapper>) mapperClass));
}
@Override
public void addInput(Input input, @Nullable Class<?> mapperCls) {
if (input instanceof Input.DatasetInput) {
Input.DatasetInput datasetInput = (Input.DatasetInput) input;
// the createInput method call translates the input name from a stream URI to the stream's name;
// see the implementation of createInput for details on this hack
Input.InputFormatProviderInput createdInput = createInput(datasetInput);
addInput(createdInput.getAlias(), createdInput.getInputFormatProvider(), mapperCls);
} else if (input instanceof Input.StreamInput) {
StreamBatchReadable streamBatchReadable = ((Input.StreamInput) input).getStreamBatchReadable();
addInput(input.getAlias(),
new StreamInputFormatProvider(getProgram().getId().getNamespace(), streamBatchReadable, streamAdmin),
mapperCls);
} else if (input instanceof Input.InputFormatProviderInput) {
addInput(input.getAlias(), ((Input.InputFormatProviderInput) input).getInputFormatProvider(), mapperCls);
} else {
// shouldn't happen unless a user defines their own Input class
throw new IllegalArgumentException(String.format("Input %s has unknown input class %s",
input.getName(), input.getClass().getCanonicalName()));
}
}
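/**
 * Sets a single dataset as the output, replacing any outputs configured so far.
 * A minimal sketch (the dataset name is illustrative):
 * <pre>{@code
 * // clears previously added outputs before registering this one
 * context.setOutput("results");
 * }</pre>
 */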
@Override
public void setOutput(String datasetName) {
clearOutputs();
addOutput(datasetName);
}
//TODO: update this to allow a BatchWritable once the DatasetOutputFormat can support taking an instance
// and not just the name
@Override
public void setOutput(String datasetName, Dataset dataset) {
clearOutputs();
addOutput(datasetName, new DatasetOutputFormatProvider(datasetName, Collections.<String, String>emptyMap(),
dataset, MapReduceBatchWritableOutputFormat.class));
}
@Override
public void addOutput(String datasetName) {
addOutput(datasetName, Collections.<String, String>emptyMap());
}
@Override
public void addOutput(String datasetName, Map<String, String> arguments) {
addOutput(Output.ofDataset(datasetName, arguments));
}
@Override
public void addOutput(String alias, OutputFormatProvider outputFormatProvider) {
if (this.outputFormatProviders.containsKey(alias)) {
throw new IllegalArgumentException("Output already configured: " + alias);
}
this.outputFormatProviders.put(alias, outputFormatProvider);
}
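/**
 * Adds an output for this MapReduce job. A minimal usage sketch, assuming a hypothetical
 * dataset {@code "results"} (the name and empty arguments are illustrative):
 * <pre>{@code
 * // dataset outputs are wrapped in a DatasetOutputFormatProvider below
 * context.addOutput(Output.ofDataset("results", Collections.<String, String>emptyMap()));
 * }</pre>
 */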
@Override
public void addOutput(Output output) {
if (output instanceof Output.DatasetOutput) {
String datasetName = output.getName();
Map<String, String> arguments = ((Output.DatasetOutput) output).getArguments();
// We could delay instantiating the Dataset until later, but for now we still have to maintain
// backward compatibility for the #setOutput(String, Dataset) method, so delaying this dataset's
// instantiation would add code complexity without much benefit. Once #setOutput(String, Dataset)
// is removed, we can postpone the instantiation.
DatasetOutputFormatProvider outputFormatProvider =
new DatasetOutputFormatProvider(datasetName, arguments,
getDataset(datasetName, arguments, AccessType.WRITE),
MapReduceBatchWritableOutputFormat.class);
addOutput(output.getAlias(), outputFormatProvider);
} else if (output instanceof Output.OutputFormatProviderOutput) {
addOutput(output.getAlias(), ((Output.OutputFormatProviderOutput) output).getOutputFormatProvider());
} else {
// shouldn't happen unless a user defines their own Output class
throw new IllegalArgumentException(String.format("Output %s has unknown output class %s",
output.getName(), output.getClass().getCanonicalName()));
}
}
private void clearOutputs() {
this.outputFormatProviders.clear();
}
/**
* Gets the MapperInputs for this MapReduce job.
*
* @return a mapping from input name to the MapperInput for that input
*/
Map<String, MapperInput> getMapperInputs() {
return ImmutableMap.copyOf(inputs);
}
/**
* Gets the OutputFormatProviders for this MapReduce job.
*
* @return the OutputFormatProviders for the MapReduce job
*/
Map<String, OutputFormatProvider> getOutputFormatProviders() {
return ImmutableMap.copyOf(outputFormatProviders);
}
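/**
 * Sets the resources to use for each mapper. A minimal sketch, assuming the single-argument
 * {@link Resources} constructor takes a memory amount in MB (the value shown is illustrative):
 * <pre>{@code
 * // request 1024 MB of memory per mapper container
 * context.setMapperResources(new Resources(1024));
 * }</pre>
 */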
@Override
public void setMapperResources(Resources resources) {
this.mapperResources = resources;
}
@Override
public void setReducerResources(Resources resources) {
this.reducerResources = resources;
}
@Override
public Metrics getMetrics() {
return userMetrics;
}
public LoggingContext getLoggingContext() {
return loggingContext;
}
public Resources getMapperResources() {
return mapperResources;
}
public Resources getReducerResources() {
return reducerResources;
}
public File getPluginArchive() {
return pluginArchive;
}
/**
* Returns information about the Workflow if the MapReduce program is executed as part of one;
* otherwise, returns {@code null}.
*/
@Override
@Nullable
public WorkflowProgramInfo getWorkflowInfo() {
return workflowProgramInfo;
}
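/**
 * Requests that a file be localized to each task container. A minimal usage sketch
 * (the resource name and URI are illustrative):
 * <pre>{@code
 * // the file becomes available to mappers/reducers under the given name
 * context.localize("lookup", URI.create("hdfs://example-host/data/lookup.txt"));
 * }</pre>
 */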
@Override
public void localize(String name, URI uri) {
localize(name, uri, false);
}
@Override
public void localize(String name, URI uri, boolean archive) {
resourcesToLocalize.put(name, new LocalizeResource(uri, archive));
}
Map<String, LocalizeResource> getResourcesToLocalize() {
return resourcesToLocalize;
}
private Input.InputFormatProviderInput createInput(Input.DatasetInput datasetInput) {
String datasetName = datasetInput.getName();
Map<String, String> datasetArgs = datasetInput.getArguments();
// keep track of the original alias to set it on the created Input before returning it
String originalAlias = datasetInput.getAlias();
// TODO: This is a hack for streams. It was introduced in Reactor 2.2.0; fix it when addressing CDAP-4158.
// This check is needed due to the implementation of AbstractMapReduce#useStreamInput(StreamBatchReadable).
// It can probably be removed once that method is removed (deprecated currently).
if (datasetName.startsWith(Constants.Stream.URL_PREFIX)) {
StreamBatchReadable streamBatchReadable = new StreamBatchReadable(URI.create(datasetName));
Input input = Input.of(streamBatchReadable.getStreamName(),
new StreamInputFormatProvider(getProgram().getId().getNamespace(),
streamBatchReadable, streamAdmin));
return (Input.InputFormatProviderInput) input.alias(originalAlias);
}
DatasetInputFormatProvider datasetInputFormatProvider =
new DatasetInputFormatProvider(datasetName, datasetArgs, getDataset(datasetName, datasetArgs, AccessType.READ),
datasetInput.getSplits(), MapReduceBatchReadableInputFormat.class);
return (Input.InputFormatProviderInput) Input.of(datasetName, datasetInputFormatProvider).alias(originalAlias);
}
private InputFormatProvider createInputFormatProvider(String datasetName,
Map<String, String> datasetArgs,
@Nullable List<Split> splits) {
return createInput((Input.DatasetInput) Input.ofDataset(datasetName, datasetArgs, splits)).getInputFormatProvider();
}
private static MetricsContext createMetricsContext(Program program, String runId, MetricsCollectionService service,
@Nullable WorkflowProgramInfo workflowProgramInfo) {
Map<String, String> tags = Maps.newHashMap();
tags.putAll(getMetricsContext(program, runId));
if (workflowProgramInfo != null) {
// If running inside a Workflow, also add the workflow-specific tags
tags.put(Constants.Metrics.Tag.WORKFLOW, workflowProgramInfo.getName());
tags.put(Constants.Metrics.Tag.WORKFLOW_RUN_ID, workflowProgramInfo.getRunId().getId());
tags.put(Constants.Metrics.Tag.NODE, workflowProgramInfo.getNodeId());
}
return service.getContext(tags);
}
}