package org.apache.pig.backend.hadoop.executionengine.spark_streaming.converter;

import java.io.IOException;
import java.io.Serializable;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSort;
import org.apache.pig.backend.hadoop.executionengine.spark_streaming.SparkUtil;
import org.apache.pig.data.Tuple;

import scala.Tuple2;
import scala.runtime.AbstractFunction1;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.dstream.DStream;

/**
 * Converts the POSort physical operator into a Spark Streaming job: each
 * tuple is lifted into a (tuple, null) pair, every batch RDD of the stream
 * is sorted by key with the operator's comparator, and the keys are then
 * unwrapped back into a plain stream of tuples.
 */
@SuppressWarnings("serial")
public class SortConverter implements POConverter<Tuple, Tuple, POSort> {

    private static final Log LOG = LogFactory.getLog(SortConverter.class);

    @Override
    public JavaDStream<Tuple> convert(List<JavaDStream<Tuple>> predecessors,
            final POSort sortOperator) throws IOException {
        SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
        JavaDStream<Tuple> rdd = predecessors.get(0);

        // Lift each tuple into a (tuple, null) pair so the stream can be
        // sorted by key.
        DStream<Tuple2<Tuple, Object>> rddPair = rdd.dstream().map(
                new ToKeyValueFunction(),
                SparkUtil.<Tuple, Object> getTuple2Manifest());

        JavaPairDStream<Tuple, Object> r = new JavaPairDStream<Tuple, Object>(
                rddPair, SparkUtil.getManifest(Tuple.class),
                SparkUtil.getManifest(Object.class));

        // Sort every batch RDD of the stream with the comparator carried by
        // the POSort operator.
        JavaPairDStream<Tuple, Object> sorted = r.transform(
                new Function<JavaPairRDD<Tuple, Object>, JavaPairRDD<Tuple, Object>>() {
                    @Override
                    public JavaPairRDD<Tuple, Object> call(
                            JavaPairRDD<Tuple, Object> in) throws Exception {
                        return in.sortByKey(sortOperator.getmComparator(), false);
                    }
                });

        // Strip the dummy values and return the sorted tuples.
        JavaDStream<Tuple> mapped = new JavaDStream<Tuple>(
                sorted.dstream().map(new ToValueFunction(),
                        SparkUtil.getManifest(Tuple.class)),
                SparkUtil.getManifest(Tuple.class));
        return mapped;
    }

    private static class ToValueFunction extends
            AbstractFunction1<Tuple2<Tuple, Object>, Tuple> implements Serializable {

        @Override
        public Tuple apply(Tuple2<Tuple, Object> t) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Sort ToValueFunction in " + t);
            }
            // The original tuple was stored in the key slot; the value is a
            // null dummy.
            return t._1;
        }
    }

    private static class ToKeyValueFunction extends
            AbstractFunction1<Tuple, Tuple2<Tuple, Object>> implements Serializable {

        @Override
        public Tuple2<Tuple, Object> apply(Tuple t) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Sort ToKeyValueFunction in " + t);
            }
            // Use the whole tuple as the sort key; the value slot is unused.
            Tuple key = t;
            Object value = null;
            Tuple2<Tuple, Object> out = new Tuple2<Tuple, Object>(key, value);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Sort ToKeyValueFunction out " + out);
            }
            return out;
        }
    }
}