package org.apache.pig.backend.hadoop.executionengine.spark.converter;

import java.io.IOException;
import java.io.Serializable;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach;
import org.apache.pig.backend.hadoop.executionengine.spark.KryoSerializer;
import org.apache.pig.backend.hadoop.executionengine.spark.SparkUtil;
import org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil;
import org.apache.pig.data.SchemaTupleBackend;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.rdd.RDD;

/**
 * Converter that converts an RDD to another RDD by applying a POForEach.
 */
@SuppressWarnings({ "serial" })
public class ForEachConverter implements POConverter<Tuple, Tuple, POForEach> {

    private byte[] confBytes;

    public ForEachConverter(byte[] confBytes) {
        this.confBytes = confBytes;
    }

    @Override
    public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POForEach physicalOperator) {
        SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
        RDD<Tuple> rdd = predecessors.get(0);
        ForEachFunction forEachFunction = new ForEachFunction(physicalOperator, this.confBytes);
        // preservesPartitioning = true: applying POForEach does not change how
        // tuples are partitioned across the cluster.
        return rdd.toJavaRDD().mapPartitions(forEachFunction, true).rdd();
    }

    private static class ForEachFunction implements
            FlatMapFunction<Iterator<Tuple>, Tuple>, Serializable {

        private POForEach poForEach;
        private byte[] confBytes;
        private transient JobConf jobConf;

        private ForEachFunction(POForEach poForEach, byte[] confBytes) {
            this.poForEach = poForEach;
            this.confBytes = confBytes;
        }

        /**
         * Lazily deserializes the JobConf on the executor and sets up the
         * UDFContext and SchemaTupleBackend. Runs at most once per task
         * (jobConf is transient, so it is null after deserialization).
         */
        void initializeJobConf() {
            if (this.jobConf == null) {
                this.jobConf = KryoSerializer.deserializeJobConf(this.confBytes);
                PigMapReduce.sJobConfInternal.set(jobConf);
                try {
                    MapRedUtil.setupUDFContext(jobConf);
                    PigContext pc = (PigContext) ObjectSerializer.deserialize(
                            jobConf.get("pig.pigContext"));
                    SchemaTupleBackend.initialize(jobConf, pc);
                } catch (IOException ioe) {
                    String msg = "Problem while configuring UDFContext from ForEachConverter.";
                    throw new RuntimeException(msg, ioe);
                }
            }
        }

        public Iterable<Tuple> call(final Iterator<Tuple> input) {
            initializeJobConf();

            // Set the input schema on any UDFs at the leaves of the inner plan
            // so they can resolve their signatures on the executor side.
            PhysicalOperator[] planLeafOps = poForEach.getPlanLeafOps();
            if (planLeafOps != null) {
                for (PhysicalOperator op : planLeafOps) {
                    if (op.getClass() == POUserFunc.class) {
                        POUserFunc udf = (POUserFunc) op;
                        udf.setFuncInputSchema();
                    }
                }
            }

            // Lazily pull input tuples through the POForEach plan: each input
            // tuple is attached to the operator and its results are drained.
            return new Iterable<Tuple>() {
                @Override
                public Iterator<Tuple> iterator() {
                    return new POOutputConsumerIterator(input) {
                        protected void attach(Tuple tuple) {
                            poForEach.setInputs(null);
                            poForEach.attachInput(tuple);
                        }

                        protected Result getNextResult() throws ExecException {
                            return poForEach.getNextTuple();
                        }
                    };
                }
            };
        }
    }
}
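
// Usage sketch (illustrative, not part of this file): the Spark launcher is
// expected to construct one converter per physical operator and call convert()
// with the RDDs of the operator's predecessors. The serializeJobConf call below
// is an assumption, mirroring the deserializeJobConf used above; only
// deserializeJobConf is confirmed by this file.
//
//   byte[] confBytes = KryoSerializer.serializeJobConf(jobConf);   // assumed API
//   ForEachConverter converter = new ForEachConverter(confBytes);
//   RDD<Tuple> output =
//       converter.convert(Collections.singletonList(inputRdd), poForEach);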