package org.apache.pig.backend.hadoop.executionengine.spark.converter;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PODistinct;
import org.apache.pig.backend.hadoop.executionengine.spark.SparkUtil;
import org.apache.pig.data.Tuple;
import org.apache.spark.rdd.PairRDDFunctions;
import org.apache.spark.rdd.RDD;
import scala.Function1;
import scala.Function2;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.runtime.AbstractFunction1;
import scala.runtime.AbstractFunction2;
@SuppressWarnings({ "serial" })
public class DistinctConverter implements POConverter<Tuple, Tuple, PODistinct> {
private static final Log LOG = LogFactory.getLog(DistinctConverter.class);
private static final Function1<Tuple, Tuple2<Tuple, Object>> TO_KEY_VALUE_FUNCTION = new ToKeyValueFunction();
private static final Function2<Object, Object, Object> MERGE_VALUES_FUNCTION = new MergeValuesFunction();
private static final Function1<Tuple2<Tuple, Object>, Tuple> TO_VALUE_FUNCTION = new ToValueFunction();
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
PODistinct poDistinct) throws IOException {
SparkUtil.assertPredecessorSize(predecessors, poDistinct, 1);
RDD<Tuple> rdd = predecessors.get(0);
ClassTag<Tuple2<Tuple, Object>> tuple2ClassManifest = SparkUtil
.<Tuple, Object> getTuple2Manifest();
RDD<Tuple2<Tuple, Object>> rddPairs = rdd.map(TO_KEY_VALUE_FUNCTION,
tuple2ClassManifest);
PairRDDFunctions<Tuple, Object> pairRDDFunctions
= new PairRDDFunctions<Tuple, Object>(
rddPairs, SparkUtil.getManifest(Tuple.class),
SparkUtil.getManifest(Object.class), null);
int parallelism = SparkUtil.getParallelism(predecessors, poDistinct);
return pairRDDFunctions.reduceByKey(MERGE_VALUES_FUNCTION, parallelism)
.map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
private static final class ToKeyValueFunction extends
AbstractFunction1<Tuple, Tuple2<Tuple, Object>> implements
Serializable {
@Override
public Tuple2<Tuple, Object> apply(Tuple t) {
if (LOG.isDebugEnabled())
LOG.debug("DistinctConverter.ToKeyValueFunction in " + t);
Tuple key = t;
Object value = null; // value
// (key, value)
Tuple2<Tuple, Object> out = new Tuple2<Tuple, Object>(key, value);
if (LOG.isDebugEnabled())
LOG.debug("DistinctConverter.ToKeyValueFunction out " + out);
return out;
}
}
private static final class MergeValuesFunction extends
AbstractFunction2<Object, Object, Object> implements Serializable {
@Override
public Object apply(Object arg0, Object arg1) {
return null;
}
}
private static final class ToValueFunction extends
AbstractFunction1<Tuple2<Tuple, Object>, Tuple> implements
Serializable {
@Override
public Tuple apply(Tuple2<Tuple, Object> input) {
return input._1;
}
}
}