/*
* Main class which submits the Flink program to the cluster.
* It also creates the source and the sinks.
*/
package com.xavient.dip.flink;
import java.util.Arrays;
import java.util.Properties;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.DateTimeBucketer;
import org.apache.flink.streaming.connectors.fs.RollingSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer08;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import com.xavient.dip.common.AppArgs;
import com.xavient.dip.common.config.DiPConfiguration;
import com.xavient.dip.common.utils.CmdLineParser;
import com.xavient.dip.common.utils.FlatJsonConverter;
import com.xavient.dip.flink.hbase.HBaseOutputFormat;
/**
 * Main class of the Flink streaming job.
 *
 * <p>The job wires together the following pipeline:
 * <ol>
 * <li>a Kafka source ({@link FlinkKafkaConsumer08}) that emits string records;</li>
 * <li>a map step that turns every record into an {@code Object[]} through
 * {@link FlatJsonConverter#convertToValuesArray};</li>
 * <li>an HDFS sink ({@link RollingSink}) that receives one delimited text line per
 * record, built from every array element except the first;</li>
 * <li>an HBase sink ({@link HBaseOutputFormat}) that receives the value arrays.</li>
 * </ol>
 */
public class FlinkTweeterStreamProcessor {

    /** Interval passed to {@code enableCheckpointing} (milliseconds per the Flink API). */
    private static final long CHECKPOINT_INTERVAL_MS = 5000L;

    /** Batch size handed to the HDFS rolling sink: 400 * 1024 * 1024 bytes. */
    private static final long HDFS_BATCH_SIZE_BYTES = 1024L * 1024L * 400L;

    /**
     * Builds the streaming topology from the command-line configuration and executes it.
     *
     * @param args command-line arguments, validated by {@link CmdLineParser#validateArgs}
     * @throws IllegalArgumentException if the HDFS output delimiter property is not set
     * @throws Exception if argument validation, job construction, or job execution fails
     */
    public static void main(String[] args) throws Exception {
        CmdLineParser parser = new CmdLineParser();
        AppArgs appArgs = parser.validateArgs(args);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(CHECKPOINT_INTERVAL_MS);

        Properties kafkaProperties = buildKafkaProperties(appArgs);

        // Creates the source from which the Flink program picks up the data.
        DataStream<String> kafkaSourceStream = env
                .addSource(new FlinkKafkaConsumer08<String>(appArgs.getProperty(DiPConfiguration.KAFKA_TOPIC),
                        new SimpleStringSchema(), kafkaProperties))
                .name("KafkaSource");

        DataStream<Object[]> tweeterStream = kafkaSourceStream
                .map(message -> FlatJsonConverter.convertToValuesArray(message)).name("Map Data");

        // Resolve the delimiter once, while the job graph is being built, instead of on
        // every field of every record. The map function then only captures this String
        // rather than the whole AppArgs object.
        final String delimiter = appArgs.getProperty(DiPConfiguration.HDFS_OUTPUT_DELIMITER);
        if (delimiter == null) {
            // Without this check the literal text "null" would end up between the fields.
            throw new IllegalArgumentException(
                    "Missing required property: " + DiPConfiguration.HDFS_OUTPUT_DELIMITER);
        }
        DataStream<String> hdfsStream = tweeterStream.map(values -> joinFields(values, delimiter));

        // NOTE(review): this sets the property only in the JVM that runs main();
        // confirm it is also visible in the JVMs where the HDFS sink actually runs.
        System.setProperty("HADOOP_USER_NAME", appArgs.getProperty(DiPConfiguration.HADOOP_USER_NAME));

        // HDFS sink: writes the delimited lines to HDFS.
        hdfsStream.addSink(createHdfsSink(appArgs)).name("HDFS Sink");

        // HBase sink: Flink writes the value arrays using the HBase output format.
        tweeterStream.writeUsingOutputFormat(new HBaseOutputFormat(appArgs)).name("HBASE Sink");

        env.execute();
    }

    /**
     * Assembles the properties handed to the Kafka consumer.
     *
     * @param appArgs validated application arguments
     * @return properties containing the bootstrap servers, the ZooKeeper connect
     *         string ({@code host:port}) and the consumer group id
     */
    private static Properties buildKafkaProperties(AppArgs appArgs) {
        Properties properties = new Properties();
        properties.setProperty(DiPConfiguration.KAFKA_BOOTSTRAP_SERVERS,
                appArgs.getProperty(DiPConfiguration.KAFKA_BOOTSTRAP_SERVERS));
        properties.setProperty("zookeeper.connect",
                appArgs.getProperty(DiPConfiguration.ZK_HOST) + ":" + appArgs.getProperty(DiPConfiguration.ZK_PORT));
        // NOTE(review): the constant itself is used as the group id (no appArgs lookup,
        // unlike the other settings) - confirm this is intended.
        properties.setProperty("group.id", DiPConfiguration.KAFKA_GROUP_ID);
        return properties;
    }

    /**
     * Creates the rolling HDFS sink rooted at the configured output path.
     *
     * @param appArgs validated application arguments
     * @return the configured sink, not yet attached to any stream
     */
    private static RollingSink<String> createHdfsSink(AppArgs appArgs) {
        RollingSink<String> hdfsSink = new RollingSink<>(appArgs.getProperty(DiPConfiguration.HDFS_OUTPUT_PATH));
        // The bucket format includes seconds.
        hdfsSink.setBucketer(new DateTimeBucketer("yyyy-MM-dd--HH-mm-ss"));
        hdfsSink.setBatchSize(HDFS_BATCH_SIZE_BYTES);
        hdfsSink.setInProgressPrefix("flink");
        hdfsSink.setInProgressSuffix(".text");
        return hdfsSink;
    }

    /**
     * Joins every element of {@code values} except the first into one delimited line.
     *
     * <p>Elements are rendered the way {@link StringBuilder#append(Object)} renders them,
     * so a {@code null} element becomes the text {@code "null"}. An array with zero or
     * one element yields an empty string.
     *
     * @param values converted record; the element at index 0 is skipped
     * @param delimiter separator placed between consecutive fields
     * @return the delimited line, without a trailing delimiter
     */
    private static String joinFields(Object[] values, String delimiter) {
        StringBuilder line = new StringBuilder();
        // Start at 1 to skip the first element without copying the array.
        for (int i = 1; i < values.length; i++) {
            if (i > 1) {
                line.append(delimiter);
            }
            line.append(values[i]);
        }
        return line.toString();
    }
}