package org.apache.pig.backend.hadoop.executionengine.spark_streaming.converter;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputFormat;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.spark_streaming.SparkUtil;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.dstream.DStream;
import org.apache.spark.streaming.twitter.TwitterUtils;

import scala.Function1;
import scala.Tuple2;
import scala.runtime.AbstractFunction1;

import twitter4j.Status;

import com.google.common.collect.Lists;
/**
 * Converter that configures a POLoad and exposes the data it loads as a DStream of Tuples.
 * Abuses the interface a bit in that there is no input DStream to convert in this case;
 * instead, the input is the source path of the POLoad.
*
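 * <p>A minimal usage sketch (assumed context: {@code pigContext}, {@code plan},
 * {@code ssc}, and {@code poLoad} come from the surrounding Pig-on-Spark plumbing;
 * this is illustrative, not part of the class contract):
 * <pre>{@code
 *   LoadConverter converter = new LoadConverter(pigContext, plan, ssc);
 *   JavaDStream<Tuple> tuples =
 *       converter.convert(Collections.<JavaDStream<Tuple>>emptyList(), poLoad);
 * }</pre>
 *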
* @author billg
*/
@SuppressWarnings({ "serial"})
public class LoadConverter implements POConverter<Tuple, Tuple, POLoad> {
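    // Scala Function1 that keeps only the Tuple value from the (Text, Tuple) pairs
    // produced by PigInputFormat.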
    private static final Function1<Tuple2<Text, Tuple>, Tuple> TO_VALUE_FUNCTION =
            new ToTupleFunction();

    private final PigContext pigContext;
    private final PhysicalPlan physicalPlan;
    private final JavaStreamingContext streamingContext;

    public LoadConverter(PigContext pigContext, PhysicalPlan physicalPlan,
            JavaStreamingContext streamingContext) {
        this.pigContext = pigContext;
        this.physicalPlan = physicalPlan;
        this.streamingContext = streamingContext;
    }
    @Override
    public JavaDStream<Tuple> convert(List<JavaDStream<Tuple>> predecessors, POLoad poLoad)
            throws IOException {
        if (predecessors != null && !predecessors.isEmpty()) {
            throw new RuntimeException("Should not have predecessors for Load. Got: " + predecessors);
        }
        configureLoader(physicalPlan, poLoad,
                streamingContext.ssc().sc().hadoopConfiguration(), pigContext);
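        // Heuristic source selection: scan the plan's operators for an "hdfs://" load path;
        // a path containing the "/_twitter" marker selects the Twitter source over HDFS.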
        boolean isTwitter = false;
        Iterator<PhysicalOperator> operators = physicalPlan.iterator();
        while (operators.hasNext()) {
            String operator = operators.next().toString();
            if (operator.contains("hdfs://")) {
                String url = "hdfs://" + operator.split("hdfs://")[1];
                if (url.contains("/_twitter")) {
                    isTwitter = true;
                }
                break;
            }
        }

        if (!isTwitter) {
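            // Monitor the load path with Spark Streaming's fileStream, using Pig's
            // PigInputFormat so newly arriving files are read as (Text, Tuple) records.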
            DStream<Tuple2<Text, Tuple>> hadoopDStream = streamingContext.ssc().fileStream(
                    poLoad.getLFile().getFileName(),
                    SparkUtil.getManifest(Text.class),
                    SparkUtil.getManifest(Tuple.class),
                    SparkUtil.getManifest(PigInputFormat.class));
            // Strip the keys, keeping only the Tuple values.
            return new JavaDStream<Tuple>(
                    hadoopDStream.map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class)),
                    SparkUtil.getManifest(Tuple.class));
        } else {
            // Twitter source: twitter4j reads OAuth credentials from system properties.
            // The values below are redacted placeholders; supply real credentials via
            // configuration rather than hardcoding them.
            System.setProperty("twitter4j.oauth.consumerKey", "mGkece93BmDILkPXXXXX");
            System.setProperty("twitter4j.oauth.consumerSecret", "K9RhnuOdZJlxDgxKJXXXXXXXXXXXXXXXXXXXXX");
            System.setProperty("twitter4j.oauth.accessToken", "2493987XXXXXXXXXXXXXXXXXXXXXXXXXFPRs0Ho7");
            System.setProperty("twitter4j.oauth.accessTokenSecret", "XXXXXXXXXXXXXXXXXXXXikQ0KxfqByVrtzs3jYP");
            JavaDStream<Status> tweets = TwitterUtils.createStream(streamingContext);
            // Map each Status to a single-field Tuple holding the tweet text.
            DStream<Tuple> statuses = tweets.dstream().map(
                    new TweetFunction(), SparkUtil.getManifest(Tuple.class));
            return new JavaDStream<Tuple>(statuses, SparkUtil.getManifest(Tuple.class));
        }
    }
    /** Wraps the text of a tweet in a single-field Pig Tuple. */
    private static class TweetFunction extends Function<Status, Tuple> implements Serializable {
        @Override
        public Tuple call(Status status) throws Exception {
            ArrayList<String> fields = new ArrayList<String>();
            fields.add(status.getText());
            return TupleFactory.getInstance().newTupleNoCopy(fields);
        }
    }
    /** Scala-side mapper that projects the Tuple value out of a (Text, Tuple) pair. */
    private static class ToTupleFunction extends AbstractFunction1<Tuple2<Text, Tuple>, Tuple>
            implements Serializable {
        @Override
        public Tuple apply(Tuple2<Text, Tuple> v1) {
            return v1._2();
        }
    }
    /**
     * Configures the Hadoop Configuration for a POLoad; lifted from JobControlCompiler.
     * TODO: refactor JobControlCompiler so this logic can be shared instead of copied.
     *
     * @param physicalPlan the plan containing the load operator
     * @param poLoad the load operator to configure
     * @param configuration the Hadoop configuration to populate
     * @param pigContext the current PigContext, serialized into the configuration
     * @return the populated configuration
     * @throws IOException if the load location cannot be set or serialization fails
     */
    private static Configuration configureLoader(PhysicalPlan physicalPlan, POLoad poLoad,
            Configuration configuration, PigContext pigContext) throws IOException {
        Job job = new Job(configuration);
        LoadFunc loadFunc = poLoad.getLoadFunc();
        loadFunc.setLocation(poLoad.getLFile().getFileName(), job);

        // Store the input FileSpecs.
        ArrayList<FileSpec> pigInputs = new ArrayList<FileSpec>();
        pigInputs.add(poLoad.getLFile());

        ArrayList<List<OperatorKey>> inpTargets = Lists.newArrayList();
        ArrayList<String> inpSignatures = Lists.newArrayList();
        ArrayList<Long> inpLimits = Lists.newArrayList();

        // Store the target operators for tuples read from this input.
        List<PhysicalOperator> loadSuccessors = physicalPlan.getSuccessors(poLoad);
        List<OperatorKey> loadSuccessorsKeys = Lists.newArrayList();
        if (loadSuccessors != null) {
            for (PhysicalOperator loadSuccessor : loadSuccessors) {
                loadSuccessorsKeys.add(loadSuccessor.getOperatorKey());
            }
        }
        inpTargets.add(loadSuccessorsKeys);
        inpSignatures.add(poLoad.getSignature());
        inpLimits.add(poLoad.getLimit());

        configuration.set("pig.inputs", ObjectSerializer.serialize(pigInputs));
        configuration.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
        configuration.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatures));
        configuration.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));
        configuration.set("pig.pigContext", ObjectSerializer.serialize(pigContext));
        configuration.set("udf.import.list",
                ObjectSerializer.serialize(PigContext.getPackageImportList()));
        return configuration;
    }
}