package org.apache.blur.spark;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Serializable;
import java.util.List;

import org.apache.blur.mapreduce.lib.BlurMutate;
import org.apache.blur.spark.util.JavaSparkUtil;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.RowMutation;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

/**
 * Template for a Spark Streaming job that loads data of type {@code T} into a
 * Blur table. Subclasses supply the input streams, the conversion of each
 * element to a {@link RowMutation}, and the per-RDD load function; this base
 * class wires them together into a streaming pipeline in {@link #run()}.
 *
 * <p>Instances are {@link Serializable} because the anonymous functions created
 * in {@link #run()} capture {@code this} and are shipped to Spark executors.
 *
 * @param <T> the element type of the input streams
 */
@SuppressWarnings("serial")
public abstract class BlurLoadSparkProcessor<T> implements Serializable {

  protected static final String MAPRED_OUTPUT_COMMITTER_CLASS = "mapred.output.committer.class";
  protected static final String MAPREDUCE_PARTITIONER_CLASS = "mapreduce.partitioner.class";
  protected static final String SPARK_STREAMING_BLOCK_INTERVAL = "spark.streaming.blockInterval";
  protected static final String SPARK_EXECUTOR_EXTRA_CLASS_PATH = "spark.executor.extraClassPath";
  protected static final String ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER =
      "org.apache.spark.serializer.KryoSerializer";
  protected static final String SPARK_SERIALIZER = "spark.serializer";

  /**
   * Builds and runs the streaming job: configures Spark (Kryo serialization,
   * project jars, subclass hooks), unions the subclass-provided input streams,
   * keys each element by the row id of its converted {@link RowMutation}, and
   * applies the subclass-provided load function to every batch. Blocks until
   * the streaming context terminates.
   *
   * @throws IOException if packing the project jars onto the classpath fails
   */
  public void run() throws IOException {
    SparkConf conf = new SparkConf();
    conf.setAppName(getAppName());
    // Kryo is used for executor-side serialization of the streamed records.
    conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
    JavaSparkUtil.packProjectJars(conf);
    // Let subclasses add or override any Spark settings before the context is created.
    setupSparkConf(conf);

    JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
    List<JavaDStream<T>> streamsList = getStreamsList(ssc);

    // Union all the streams if there is more than 1 stream
    JavaDStream<T> streams = unionStreams(ssc, streamsList);

    // Key every element by the row id of its RowMutation so downstream
    // processing can partition/group by row.
    JavaPairDStream<String, RowMutation> pairDStream =
        streams.mapToPair(new PairFunction<T, String, RowMutation>() {
          public Tuple2<String, RowMutation> call(T t) {
            RowMutation rowMutation = convert(t);
            return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
          }
        });

    pairDStream.foreachRDD(getFunction());

    ssc.start();
    ssc.awaitTermination();
  }

  /**
   * Gets the function that is applied to each batch of keyed mutations to
   * perform the actual load into Blur.
   *
   * @return the per-RDD load function
   */
  protected abstract Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction();

  /**
   * Collapses the given streams into a single stream, unioning them when more
   * than one is supplied.
   *
   * @param ssc the streaming context used to perform the union
   * @param streamsList the streams returned by {@link #getStreamsList(JavaStreamingContext)};
   *          must contain at least one stream
   * @return the single (possibly unioned) stream
   * @throws IllegalArgumentException if {@code streamsList} is empty
   */
  private JavaDStream<T> unionStreams(JavaStreamingContext ssc, List<JavaDStream<T>> streamsList) {
    if (streamsList == null || streamsList.isEmpty()) {
      // Fail fast with a clear message rather than an IndexOutOfBoundsException.
      throw new IllegalArgumentException("getStreamsList() must return at least one stream.");
    }
    JavaDStream<T> unionStreams;
    if (streamsList.size() > 1) {
      unionStreams = ssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
    } else {
      // Otherwise, just use the 1 stream
      unionStreams = streamsList.get(0);
    }
    return unionStreams;
  }

  /**
   * Gets the output path for the job. Not referenced by {@link #run()};
   * available as a hook for subclass load functions.
   *
   * @return the output path
   */
  protected abstract String getOutputPath();

  /**
   * Gets the storage level for the spark job, default of MEMORY_AND_DISK.
   *
   * @return the storage level
   */
  protected StorageLevel getStorageLevel() {
    return StorageLevel.MEMORY_AND_DISK();
  }

  /**
   * Called just before spark job is executed. Override to customize the Hadoop
   * configuration; the default implementation does nothing.
   *
   * @param configuration the Hadoop configuration to modify
   */
  protected void setupBlurHadoopConfig(Configuration configuration) {

  }

  /**
   * Add custom spark information. Override to set additional Spark properties;
   * the default implementation does nothing.
   *
   * @param conf the Spark configuration to modify
   */
  protected void setupSparkConf(SparkConf conf) {

  }

  /**
   * Gets the duration for the batch, default of 10 seconds.
   *
   * @return the batch duration
   */
  protected Duration getDuration() {
    return new Duration(10000);
  }

  /**
   * Gets the blur table name to load.
   *
   * @return the table name
   */
  protected abstract String getBlurTableName();

  /**
   * Gets the blur client for the table.
   *
   * @return the Blur thrift client
   */
  protected abstract Iface getBlurClient();

  /**
   * Gets the spark application name.
   *
   * @return the application name
   */
  protected abstract String getAppName();

  /**
   * Gets the list of streams to load into Blur. Must return at least one
   * stream.
   *
   * @param ssc the streaming context to create the streams from
   * @return the non-empty list of input streams
   */
  protected abstract List<JavaDStream<T>> getStreamsList(JavaStreamingContext ssc);

  /**
   * Converts the data into a {@link RowMutation} object (see also
   * {@link BlurMutate}).
   *
   * @param t the input element
   * @return the row mutation to load
   */
  protected abstract RowMutation convert(T t);

}