package com.xavient.dip.spark;

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import com.xavient.dip.common.AppArgs;
import com.xavient.dip.common.config.DiPConfiguration;
import com.xavient.dip.common.exceptions.DataIngestException;
import com.xavient.dip.common.utils.CmdLineParser;
import com.xavient.dip.common.utils.FlatJsonConverter;
import com.xavient.dip.spark.twitter.TopNLocationByTweets;
import com.xavient.dip.spark.twitter.TopNUsersWithMaxFollowers;
import com.xavient.dip.spark.writer.SparkHBaseWriter;
import com.xavient.dip.spark.writer.SparkHdfsWriter;
import com.xavient.dip.spark.writer.SparkJdbcSourceWriter;

/**
 * Entry point for the Spark Streaming Twitter ingestion pipeline. Consumes
 * tweets from a Kafka topic, flattens each JSON record into a value array,
 * and fans the stream out to HDFS, HBase, and a JDBC sink, alongside two
 * top-N analytics jobs.
 */
public class TwitterDataIngestion {

    public static void main(String[] args) throws DataIngestException {
        // Parse and validate the command-line arguments.
        CmdLineParser cmdLineParser = new CmdLineParser();
        final AppArgs appArgs = cmdLineParser.validateArgs(args);
        System.setProperty("HADOOP_USER_NAME", appArgs.getProperty(DiPConfiguration.HADOOP_USER_NAME));

        SparkConf conf = new SparkConf().setAppName("SparkTwitterStreaming").setMaster("local[*]");

        // Streaming context with a 1-second batch interval; try-with-resources
        // ensures the context is closed on exit.
        try (JavaStreamingContext jsc = new JavaStreamingContext(new JavaSparkContext(conf), new Duration(1000))) {

            // Receiver-based Kafka stream of (key, value) pairs, discovered via ZooKeeper.
            JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(jsc,
                    appArgs.getProperty(DiPConfiguration.ZK_HOST) + ":" + appArgs.getProperty(DiPConfiguration.ZK_PORT),
                    "spark-stream", getKafkaTopics(appArgs));

            // Flatten each raw JSON tweet into a value array; cache the result
            // because it is consumed by several writers and analytics below.
            JavaDStream<Object[]> twitterStreams = stream
                    .map(tuple -> FlatJsonConverter.convertToValuesArray(tuple._2))
                    .cache();

            // Persist the stream to HDFS and HBase.
            SparkHdfsWriter.write(twitterStreams, appArgs);
            new SparkHBaseWriter(jsc.sparkContext(), appArgs).write(twitterStreams);

            // Top-N analytics, written out through the JDBC source writer.
            SparkJdbcSourceWriter jdbcSourceWriter = new SparkJdbcSourceWriter(new SQLContext(jsc.sparkContext()), appArgs);
            int topN = Integer.parseInt(appArgs.getProperty("topN"));
            new TopNLocationByTweets(jdbcSourceWriter, topN).compute(twitterStreams);
            new TopNUsersWithMaxFollowers(jdbcSourceWriter, topN).compute(twitterStreams);

            jsc.start();
            jsc.awaitTermination();
        }
    }

    // Map each configured Kafka topic to a single receiver thread.
    private static Map<String, Integer> getKafkaTopics(AppArgs appArgs) {
        Map<String, Integer> topics = new HashMap<>();
        topics.put(appArgs.getProperty(DiPConfiguration.KAFKA_TOPIC), 1);
        return topics;
    }
}