package org.apache.blur.spark;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Serializable;
import java.util.List;

import org.apache.blur.mapreduce.lib.BlurMutate;
import org.apache.blur.spark.util.JavaSparkUtil;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.RowMutation;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

/**
 * Template for a Spark Streaming job that loads data of type {@code T} into a
 * Blur table. Subclasses supply the input streams, the conversion of each
 * element to a {@link RowMutation}, and the per-RDD load function; this base
 * class wires them together into a streaming pipeline in {@link #run()}.
 *
 * <p>Instances are {@link Serializable} because the anonymous functions created
 * in {@link #run()} capture {@code this} and are shipped to Spark executors.
 *
 * @param <T> the element type of the input streams
 */
@SuppressWarnings("serial")
public abstract class BlurLoadSparkProcessor<T> implements Serializable {

  protected static final String MAPRED_OUTPUT_COMMITTER_CLASS = "mapred.output.committer.class";
  protected static final String MAPREDUCE_PARTITIONER_CLASS = "mapreduce.partitioner.class";
  protected static final String SPARK_STREAMING_BLOCK_INTERVAL = "spark.streaming.blockInterval";
  protected static final String SPARK_EXECUTOR_EXTRA_CLASS_PATH = "spark.executor.extraClassPath";
  protected static final String ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER =
      "org.apache.spark.serializer.KryoSerializer";
  protected static final String SPARK_SERIALIZER = "spark.serializer";

  /**
   * Builds and runs the streaming job: configures Spark (Kryo serialization,
   * project jars, subclass hooks), unions the subclass-provided input streams,
   * keys each element by the row id of its converted {@link RowMutation}, and
   * applies the subclass-provided load function to every batch. Blocks until
   * the streaming context terminates.
   *
   * @throws IOException if packing the project jars onto the classpath fails
   */
  public void run() throws IOException {
    SparkConf conf = new SparkConf();
    conf.setAppName(getAppName());
    // Kryo is used for executor-side serialization of the streamed records.
    conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
    JavaSparkUtil.packProjectJars(conf);
    // Let subclasses add or override any Spark settings before the context is created.
    setupSparkConf(conf);

    JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
    List<JavaDStream<T>> streamsList = getStreamsList(ssc);

    // Union all the streams if there is more than 1 stream
    JavaDStream<T> streams = unionStreams(ssc, streamsList);

    // Key every element by the row id of its RowMutation so downstream
    // processing can partition/group by row.
    JavaPairDStream<String, RowMutation> pairDStream =
        streams.mapToPair(new PairFunction<T, String, RowMutation>() {
          public Tuple2<String, RowMutation> call(T t) {
            RowMutation rowMutation = convert(t);
            return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
          }
        });

    pairDStream.foreachRDD(getFunction());

    ssc.start();
    ssc.awaitTermination();
  }

  /**
   * Gets the function that is applied to each batch of keyed mutations to
   * perform the actual load into Blur.
   *
   * @return the per-RDD load function
   */
  protected abstract Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction();

  /**
   * Collapses the given streams into a single stream, unioning them when more
   * than one is supplied.
   *
   * @param ssc the streaming context used to perform the union
   * @param streamsList the streams returned by {@link #getStreamsList(JavaStreamingContext)};
   *          must contain at least one stream
   * @return the single (possibly unioned) stream
   * @throws IllegalArgumentException if {@code streamsList} is empty
   */
  private JavaDStream<T> unionStreams(JavaStreamingContext ssc, List<JavaDStream<T>> streamsList) {
    if (streamsList == null || streamsList.isEmpty()) {
      // Fail fast with a clear message rather than an IndexOutOfBoundsException.
      throw new IllegalArgumentException("getStreamsList() must return at least one stream.");
    }
    JavaDStream<T> unionStreams;
    if (streamsList.size() > 1) {
      unionStreams = ssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
    } else {
      // Otherwise, just use the 1 stream
      unionStreams = streamsList.get(0);
    }
    return unionStreams;
  }

  /**
   * Gets the output path for the job. Not referenced by {@link #run()};
   * available as a hook for subclass load functions.
   *
   * @return the output path
   */
  protected abstract String getOutputPath();

  /**
   * Gets the storage level for the spark job, default of MEMORY_AND_DISK.
   *
   * @return the storage level
   */
  protected StorageLevel getStorageLevel() {
    return StorageLevel.MEMORY_AND_DISK();
  }

  /**
   * Called just before spark job is executed. Override to customize the Hadoop
   * configuration; the default implementation does nothing.
   *
   * @param configuration the Hadoop configuration to modify
   */
  protected void setupBlurHadoopConfig(Configuration configuration) {

  }

  /**
   * Add custom spark information. Override to set additional Spark properties;
   * the default implementation does nothing.
   *
   * @param conf the Spark configuration to modify
   */
  protected void setupSparkConf(SparkConf conf) {

  }

  /**
   * Gets the duration for the batch, default of 10 seconds.
   *
   * @return the batch duration
   */
  protected Duration getDuration() {
    return new Duration(10000);
  }

  /**
   * Gets the blur table name to load.
   *
   * @return the table name
   */
  protected abstract String getBlurTableName();

  /**
   * Gets the blur client for the table.
   *
   * @return the Blur thrift client
   */
  protected abstract Iface getBlurClient();

  /**
   * Gets the spark application name.
   *
   * @return the application name
   */
  protected abstract String getAppName();

  /**
   * Gets the list of streams to load into Blur. Must return at least one
   * stream.
   *
   * @param ssc the streaming context to create the streams from
   * @return the non-empty list of input streams
   */
  protected abstract List<JavaDStream<T>> getStreamsList(JavaStreamingContext ssc);

  /**
   * Converts the data into a {@link RowMutation} object (see also
   * {@link BlurMutate}).
   *
   * @param t the input element
   * @return the row mutation to load
   */
  protected abstract RowMutation convert(T t);

}