/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.api.spark;

import co.cask.cdap.api.RuntimeContext;
import co.cask.cdap.api.ServiceDiscoverer;
import co.cask.cdap.api.TaskLocalizationContext;
import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.annotation.Beta;
import co.cask.cdap.api.data.DatasetInstantiationException;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.plugin.PluginContext;
import co.cask.cdap.api.stream.GenericStreamEventData;
import co.cask.cdap.api.stream.StreamEventDecoder;
import co.cask.cdap.api.workflow.WorkflowInfoProvider;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.Partition;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import java.io.Serializable;
import java.util.Collections;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * Spark program execution context. A user Spark program can interact with CDAP through this context.
 */
@Beta
public abstract class JavaSparkExecutionContext implements RuntimeContext, Transactional, WorkflowInfoProvider {

  /**
   * @return The specification used to configure this {@link Spark} job instance.
   */
  public abstract SparkSpecification getSpecification();

  /**
   * Returns the logical start time of this Spark job. The logical start time is the time this Spark
   * job was supposed to start if it was started by the scheduler; otherwise it is the current time when the
   * job runs.
   *
   * @return Time in milliseconds since epoch time (00:00:00 January 1, 1970 UTC).
   */
  public abstract long getLogicalStartTime();

  /**
   * Returns a {@link Serializable} {@link ServiceDiscoverer} for service discovery in a Spark program, which can be
   * passed into the Spark program's closures.
   *
   * @return A {@link Serializable} {@link ServiceDiscoverer}
   */
  public abstract ServiceDiscoverer getServiceDiscoverer();

  /**
   * Returns a {@link Serializable} {@link Metrics} instance that can be used to emit custom metrics from the user's
   * {@link Spark} program. It can also be passed into the Spark program's closures so that workers can emit their
   * own metrics.
   *
   * @return {@link Serializable} {@link Metrics} for {@link Spark} programs
   */
  public abstract Metrics getMetrics();

  /**
   * Returns a {@link Serializable} {@link PluginContext} that can be used to request plugin instances. The
   * instance returned can also be used in the Spark program's closures.
   *
   * @return A {@link Serializable} {@link PluginContext}.
   */
  public abstract PluginContext getPluginContext();

  /**
   * Returns a {@link Serializable} {@link TaskLocalizationContext} that can be used to retrieve files localized to
   * task containers. The instance returned can also be used in the Spark program's closures.
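   * <p>
   * A minimal usage sketch, not part of the original documentation. Here {@code sec} stands for this
   * execution context, {@code lines} is an existing {@code JavaRDD<String>}, and the resource name
   * {@code "lookup.txt"} is only an assumed file localized at submission time:
   * <pre>{@code
   * final TaskLocalizationContext localization = sec.getLocalizationContext();
   * JavaRDD<Long> sizes = lines.map(line -> {
   *   // Resolve the (assumed) localized file on the executor and use it alongside the record
   *   File lookup = localization.getLocalFile("lookup.txt");
   *   return lookup.length();
   * });
   * }</pre>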
   *
   * @return the {@link TaskLocalizationContext} for the {@link Spark} program
   */
  public abstract TaskLocalizationContext getLocalizationContext();

  /**
   * Creates a {@link JavaPairRDD} from the given {@link Dataset}.
   *
   * @param datasetName name of the Dataset
   * @param <K> key type
   * @param <V> value type
   * @return A new {@link JavaPairRDD} instance that reads from the given Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  public <K, V> JavaPairRDD<K, V> fromDataset(String datasetName) {
    return fromDataset(datasetName, Collections.<String, String>emptyMap());
  }

  /**
   * Creates a {@link JavaPairRDD} from the given {@link Dataset} with the given set of Dataset arguments.
   *
   * @param datasetName name of the Dataset
   * @param arguments arguments for the Dataset
   * @param <K> key type
   * @param <V> value type
   * @return A new {@link JavaPairRDD} instance that reads from the given Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  public <K, V> JavaPairRDD<K, V> fromDataset(String datasetName, Map<String, String> arguments) {
    return fromDataset(datasetName, arguments, null);
  }

  /**
   * Creates a {@link JavaPairRDD} from the given {@link Dataset} with the given set of Dataset arguments
   * and a custom list of {@link Split}s. Each {@link Split} will create a {@link Partition} in the
   * {@link JavaPairRDD}.
   *
   * @param datasetName name of the Dataset
   * @param arguments arguments for the Dataset
   * @param splits list of {@link Split} or {@code null} to use the default splits provided by the Dataset
   * @param <K> key type
   * @param <V> value type
   * @return A new {@link JavaPairRDD} instance that reads from the given Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  public abstract <K, V> JavaPairRDD<K, V> fromDataset(String datasetName,
                                                       Map<String, String> arguments,
                                                       @Nullable Iterable<? extends Split> splits);

  /**
   * Creates a {@link JavaRDD} that represents all events from the given stream.
   *
   * @param streamName name of the stream
   * @return A new {@link JavaRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  public JavaRDD<StreamEvent> fromStream(String streamName) {
    return fromStream(streamName, 0, Long.MAX_VALUE);
  }

  /**
   * Creates a {@link JavaRDD} that represents events from the given stream in the given time range.
   *
   * @param streamName name of the stream
   * @param startTime the starting time of the stream to be read in milliseconds (inclusive);
   *                  passing in {@code 0} means start reading from the first event available in the stream.
   * @param endTime the ending time of the stream to be read in milliseconds (exclusive);
   *                passing in {@link Long#MAX_VALUE} means read up to the latest event available in the stream.
   * @return A new {@link JavaRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  public abstract JavaRDD<StreamEvent> fromStream(String streamName, long startTime, long endTime);

  /**
   * Creates a {@link JavaPairRDD} that represents all events from the given stream. The key in the
   * resulting {@link JavaPairRDD} is the event timestamp. The stream body will be decoded as the given
   * value type. Currently it supports {@link Text}, {@link String} and {@link ByteWritable}.
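   * <p>
   * A minimal usage sketch, not part of the original documentation. The stream name {@code "purchases"}
   * and the {@code sec} context variable are assumptions:
   * <pre>{@code
   * // Key is the event timestamp, value is the event body decoded as a String
   * JavaPairRDD<Long, String> events = sec.fromStream("purchases", String.class);
   * long count = events.count();
   * }</pre>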
   *
   * @param streamName name of the stream
   * @param valueType type of the stream body to decode to
   * @return A new {@link JavaPairRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  public <V> JavaPairRDD<Long, V> fromStream(String streamName, Class<V> valueType) {
    return fromStream(streamName, 0, Long.MAX_VALUE, valueType);
  }

  /**
   * Creates a {@link JavaPairRDD} that represents events from the given stream in the given time range.
   * The key in the resulting {@link JavaPairRDD} is the event timestamp.
   * The stream body will be decoded as the given value type.
   * Currently it supports {@link Text}, {@link String} and {@link ByteWritable}.
   *
   * @param streamName name of the stream
   * @param startTime the starting time of the stream to be read in milliseconds (inclusive);
   *                  passing in {@code 0} means start reading from the first event available in the stream.
   * @param endTime the ending time of the stream to be read in milliseconds (exclusive);
   *                passing in {@link Long#MAX_VALUE} means read up to the latest event available in the stream.
   * @param valueType type of the stream body to decode to
   * @return A new {@link JavaPairRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  public abstract <V> JavaPairRDD<Long, V> fromStream(String streamName, long startTime, long endTime,
                                                      Class<V> valueType);

  /**
   * Creates a {@link JavaPairRDD} that represents events from the given stream in the given time range.
   * Each stream event will be decoded by an instance of the given {@link StreamEventDecoder} class.
   *
   * @param streamName name of the stream
   * @param startTime the starting time of the stream to be read in milliseconds (inclusive);
   *                  passing in {@code 0} means start reading from the first event available in the stream.
   * @param endTime the ending time of the stream to be read in milliseconds (exclusive);
   *                passing in {@link Long#MAX_VALUE} means read up to the latest event available in the stream.
   * @param decoderClass the {@link StreamEventDecoder} class for decoding {@link StreamEvent}
   * @param keyType the type of the decoded key
   * @param valueType the type of the decoded value
   * @return A new {@link JavaPairRDD} instance that reads from the given stream
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  public abstract <K, V> JavaPairRDD<K, V> fromStream(String streamName, long startTime, long endTime,
                                                      Class<? extends StreamEventDecoder<K, V>> decoderClass,
                                                      Class<K> keyType, Class<V> valueType);

  /**
   * Creates a {@link JavaPairRDD} that represents all events from the given stream.
   * The first entry in the pair is a {@link Long}, representing the event timestamp, while the second entry
   * is a {@link GenericStreamEventData}, which contains data decoded from the stream event body based on
   * the given {@link FormatSpecification}.
   *
   * @param streamName name of the stream
   * @param formatSpec the {@link FormatSpecification} describing the format in the stream
   * @param dataType type of the decoded event data
   * @param <T> value type
   * @return a new {@link JavaPairRDD} instance that reads from the given stream.
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  public <T> JavaPairRDD<Long, GenericStreamEventData<T>> fromStream(String streamName,
                                                                     FormatSpecification formatSpec,
                                                                     Class<T> dataType) {
    return fromStream(streamName, formatSpec, 0, Long.MAX_VALUE, dataType);
  }

  /**
   * Creates a {@link JavaPairRDD} that represents data from the given stream for events in the given
   * time range. The first entry in the pair is a {@link Long}, representing the event timestamp, while the
   * second entry is a {@link GenericStreamEventData}, which contains data decoded from the stream event body
   * based on the given {@link FormatSpecification}.
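   * <p>
   * A minimal usage sketch, not part of the original documentation. The stream name {@code "logs"},
   * the {@code bodySchema}, the time range and the {@code sec} context variable are assumptions, and
   * {@code StructuredRecord} (from {@code co.cask.cdap.api.data.format}) is used as the assumed decode target:
   * <pre>{@code
   * FormatSpecification format =
   *   new FormatSpecification("csv", bodySchema, Collections.<String, String>emptyMap());
   * JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> records =
   *   sec.fromStream("logs", format, startTime, endTime, StructuredRecord.class);
   * }</pre>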
   *
   * @param streamName name of the stream
   * @param formatSpec the {@link FormatSpecification} describing the format in the stream
   * @param startTime the starting time of the stream to be read in milliseconds (inclusive);
   *                  passing in {@code 0} means start reading from the first event available in the stream.
   * @param endTime the ending time of the stream to be read in milliseconds (exclusive);
   *                passing in {@link Long#MAX_VALUE} means read up to the latest event available in the stream.
   * @param dataType type of the decoded event data
   * @param <T> value type
   * @return a new {@link JavaPairRDD} instance that reads from the given stream.
   * @throws DatasetInstantiationException if the Stream doesn't exist
   */
  public abstract <T> JavaPairRDD<Long, GenericStreamEventData<T>> fromStream(String streamName,
                                                                              FormatSpecification formatSpec,
                                                                              long startTime, long endTime,
                                                                              Class<T> dataType);

  /**
   * Saves the given {@link JavaPairRDD} to the given {@link Dataset}.
   *
   * @param rdd the {@link JavaPairRDD} to be saved
   * @param datasetName name of the Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  public <K, V> void saveAsDataset(JavaPairRDD<K, V> rdd, String datasetName) {
    saveAsDataset(rdd, datasetName, Collections.<String, String>emptyMap());
  }

  /**
   * Saves the given {@link JavaPairRDD} to the given {@link Dataset} with the given set of Dataset arguments.
   *
   * @param rdd the {@link JavaPairRDD} to be saved
   * @param datasetName name of the Dataset
   * @param arguments arguments for the Dataset
   * @throws DatasetInstantiationException if the Dataset doesn't exist
   */
  public abstract <K, V> void saveAsDataset(JavaPairRDD<K, V> rdd, String datasetName, Map<String, String> arguments);
}