package org.apache.pig.backend.hadoop.executionengine.spark_streaming.converter;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputFormat;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.spark_streaming.SparkUtil;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.dstream.DStream;
import org.apache.spark.streaming.twitter.TwitterUtils;

import scala.Function1;
import scala.Tuple2;
import scala.runtime.AbstractFunction1;

import twitter4j.Status;

import com.google.common.collect.Lists;

/**
 * Converter that loads data via POLoad and converts it to a DStream<Tuple>. Abuses the interface a
 * bit in that there is no input DStream to convert in this case; instead, the input is the source
 * path of the POLoad.
 *
 * @author billg
 */
@SuppressWarnings({ "serial" })
public class LoadConverter implements POConverter<Tuple, Tuple, POLoad> {

    private static final Function1<Tuple2<Text, Tuple>, Tuple> TO_VALUE_FUNCTION = new ToTupleFunction();

    private PigContext pigContext;
    private PhysicalPlan physicalPlan;
    private JavaStreamingContext sparkContext;

    public LoadConverter(PigContext pigContext, PhysicalPlan physicalPlan,
            JavaStreamingContext sparkContext) {
        this.pigContext = pigContext;
        this.physicalPlan = physicalPlan;
        this.sparkContext = sparkContext;
    }
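    /**
     * Builds a JavaDStream<Tuple> for the given POLoad. There is no input DStream to
     * convert; the stream is created either from an HDFS file stream read via
     * PigInputFormat or, when the load path contains "/_twitter", from a live
     * Twitter stream.
     */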
    @Override
    public JavaDStream<Tuple> convert(List<JavaDStream<Tuple>> predecessorRdds, POLoad poLoad)
            throws IOException {
        // if (predecessorRdds.size() != 0) {
        //     throw new RuntimeException("Should not have predecessors for Load. Got: " + predecessorRdds);
        // }

        configureLoader(physicalPlan, poLoad, sparkContext.ssc().sc().hadoopConfiguration(),
                this.pigContext);

        // Walk the physical plan looking for an hdfs:// load path; a path containing
        // "/_twitter" is treated as a marker for the Twitter stream source below.
        Iterator<PhysicalOperator> top = physicalPlan.iterator();
        boolean isTwitter = false;
        while (top.hasNext()) {
            String load = top.next().toString();
            if (load.contains("hdfs://")) {
                String[] splitted = load.split("hdfs://");
                String url = "hdfs://" + splitted[1];
                if (url.contains("/_twitter")) {
                    isTwitter = true;
                }
                break;
            }
        }

        if (!isTwitter) {
            // Monitor the load path for new files and read them with PigInputFormat.
            DStream<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.ssc().fileStream(
                    poLoad.getLFile().getFileName(),
                    SparkUtil.getManifest(Text.class),
                    SparkUtil.getManifest(Tuple.class),
                    SparkUtil.getManifest(PigInputFormat.class));

            // Drop the key and keep only the Tuple values.
            JavaDStream<Tuple> hdfsTuple = new JavaDStream<Tuple>(
                    hadoopRDD.map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class)),
                    SparkUtil.getManifest(Tuple.class));
            hdfsTuple.print();
            return hdfsTuple;
        } else {
            // OAuth credentials for the Twitter stream (redacted).
            System.setProperty("twitter4j.oauth.consumerKey", "mGkece93BmDILkPXXXXX");
            System.setProperty("twitter4j.oauth.consumerSecret", "K9RhnuOdZJlxDgxKJXXXXXXXXXXXXXXXXXXXXX");
            System.setProperty("twitter4j.oauth.accessToken", "2493987XXXXXXXXXXXXXXXXXXXXXXXXXFPRs0Ho7");
            System.setProperty("twitter4j.oauth.accessTokenSecret", "XXXXXXXXXXXXXXXXXXXXikQ0KxfqByVrtzs3jYP");

            JavaDStream<Status> dtweets = TwitterUtils.createStream(sparkContext);

            // Map each Status to a single-field Tuple holding the tweet text.
            tweetFunction fnc = new tweetFunction();
            DStream<Tuple> dstatuses = dtweets.dstream().map(fnc, SparkUtil.getManifest(Tuple.class));
            dstatuses.print();
            JavaDStream<Tuple> tweetTuple = new JavaDStream<Tuple>(dstatuses,
                    SparkUtil.getManifest(Tuple.class));
            return tweetTuple;
        }
    }
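    /**
     * Maps a twitter4j Status to a single-field Tuple containing the tweet text.
     */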
    private static class tweetFunction extends Function<Status, Tuple> implements Serializable {
        @Override
        public Tuple call(Status status) throws Exception {
            ArrayList<String> al = new ArrayList<String>();
            // Optionally strip non-printable characters:
            // String replacedTweet = status.getText().replaceAll("[^\\x20-\\x7e]", "");
            String replacedTweet = status.getText();
            al.add(replacedTweet);
            TupleFactory mTupleFactory = TupleFactory.getInstance();
            return mTupleFactory.newTupleNoCopy(al);
        }
    }

    /**
     * Extracts the Tuple value from a (Text, Tuple) pair read via PigInputFormat.
     */
    private static class ToTupleFunction extends AbstractFunction1<Tuple2<Text, Tuple>, Tuple>
            implements Function1<Tuple2<Text, Tuple>, Tuple>, Serializable {

        @Override
        public Tuple apply(Tuple2<Text, Tuple> v1) {
            return v1._2();
        }
    }

    /**
     * Stolen from JobControlCompiler.
     * TODO: refactor so this code can be shared.
     *
     * @param physicalPlan
     * @param poLoad
     * @param configuration
     * @param pigContext
     * @return the configuration, with the load inputs serialized into it
     * @throws java.io.IOException
     */
    private static Configuration configureLoader(PhysicalPlan physicalPlan, POLoad poLoad,
            Configuration configuration, PigContext pigContext) throws IOException {
        Job job = new Job(configuration);
        LoadFunc loadFunc = poLoad.getLoadFunc();
        loadFunc.setLocation(poLoad.getLFile().getFileName(), job);

        // Stolen from JobControlCompiler: store the input filespecs.
        ArrayList<FileSpec> pigInputs = new ArrayList<FileSpec>();
        pigInputs.add(poLoad.getLFile());
        ArrayList<List<OperatorKey>> inpTargets = Lists.newArrayList();
        ArrayList<String> inpSignatures = Lists.newArrayList();
        ArrayList<Long> inpLimits = Lists.newArrayList();

        // Store the target operators for tuples read from this input.
        List<PhysicalOperator> loadSuccessors = physicalPlan.getSuccessors(poLoad);
        List<OperatorKey> loadSuccessorsKeys = Lists.newArrayList();
        if (loadSuccessors != null) {
            for (PhysicalOperator loadSuccessor : loadSuccessors) {
                loadSuccessorsKeys.add(loadSuccessor.getOperatorKey());
            }
        }
        inpTargets.add(loadSuccessorsKeys);
        inpSignatures.add(poLoad.getSignature());
        inpLimits.add(poLoad.getLimit());

        configuration.set("pig.inputs", ObjectSerializer.serialize(pigInputs));
        configuration.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
        configuration.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatures));
        configuration.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));
        configuration.set("pig.pigContext", ObjectSerializer.serialize(pigContext));
        configuration.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList()));
        return configuration;
    }
}