package org.apache.pig.backend.stratosphere.executionengine.pactLayer.relationalOperators; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.pig.PigException; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.PactOperator; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.Result; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.SOStatus; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.expressionOperators.POProject; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.plans.PactPlan; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.plans.PactPlanVisitor; import org.apache.pig.data.AccumulativeBag; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.plan.DependencyOrderWalker; import org.apache.pig.impl.plan.NodeIdGenerator; import org.apache.pig.impl.plan.OperatorKey; import org.apache.pig.impl.plan.VisitorException; import org.apache.pig.pen.Illustrator; import org.apache.pig.pen.util.ExampleTuple; import org.apache.pig.pen.util.LineageTracer; public class SOForEach extends PactOperator { private static final long serialVersionUID = 1L; protected List<PactPlan> inputPlans; protected List<PactOperator> opsToBeReset; protected static final TupleFactory mTupleFactory = TupleFactory.getInstance(); //Since the plan has a generate, this needs to be maintained //as the generate can potentially return multiple tuples for //same call. protected boolean processingPlan = false; //its holds the iterators of the databags given by the input expressions which need flattening. transient protected Iterator<Tuple> [] its = null; //This holds the outputs given out by the input expressions of any datatype protected Object [] bags = null; //This is the template whcih contains tuples and is flattened out in createTuple() to generate the final output protected Object[] data = null; // store result types of the plan leaves protected byte[] resultTypes = null; // array version of isToBeFlattened - this is purely // for optimization - instead of calling isToBeFlattened.get(i) // we can do the quicker array access - isToBeFlattenedArray[i]. // Also we can store "boolean" values rather than "Boolean" objects // so we can also save on the Boolean.booleanValue() calls protected boolean[] isToBeFlattenedArray; ExampleTuple tIn = null; protected int noItems; protected PactOperator[] planLeafOps = null; protected Tuple inpTuple; public SOForEach(OperatorKey k) { this(k,-1,null,null); } public SOForEach(OperatorKey k, int rp, List inp) { this(k,rp,inp,null); } public SOForEach(OperatorKey k, int rp) { this(k,rp,null,null); } public SOForEach(OperatorKey k, List inp) { this(k,-1,inp,null); } public SOForEach(OperatorKey k, int rp, List<PactPlan> inp, List<Boolean> isToBeFlattened){ super(k, rp); setUpFlattens(isToBeFlattened); this.inputPlans = inp; opsToBeReset = new ArrayList<PactOperator>(); getLeaves(); } @Override public void visit(PactPlanVisitor v) throws VisitorException { v.visitSOForEach(this); } @Override public String name() { return getAliasString() + "SOForEach" + "(" + getFlatStr() + ")" + "[" + DataType.findTypeName(resultType) + "]" + " - " + mKey.toString(); } String getFlatStr() { if(isToBeFlattenedArray ==null) { return ""; } StringBuilder sb = new StringBuilder(); for (Boolean b : isToBeFlattenedArray) { sb.append(b); sb.append(','); } if(sb.length()>0){ sb.deleteCharAt(sb.length()-1); } return sb.toString(); } @Override public boolean supportsMultipleInputs() { return false; } @Override public boolean supportsMultipleOutputs() { return false; } /** * Calls getNext on the generate operator inside the nested * physical plan and returns it maintaining an additional state * to denote the begin and end of the nested plan processing. */ @Override public Result getNext(Tuple t) throws ExecException { Result res = null; Result inp = null; //The nested plan is under processing //So return tuples that the generate oper //returns if(processingPlan){ while(true) { res = processPlan(); if(res.returnStatus==SOStatus.STATUS_OK) { return res; } if(res.returnStatus==SOStatus.STATUS_EOP) { processingPlan = false; for(PactPlan plan : inputPlans) { plan.detachInput(); } break; } if(res.returnStatus==SOStatus.STATUS_ERR) { return res; } if(res.returnStatus==SOStatus.STATUS_NULL) { continue; } } } //The nested plan processing is done or is //yet to begin. So process the input and start //nested plan processing on the input tuple //read while (true) { inp = processInput(); if (inp.returnStatus == SOStatus.STATUS_EOP || inp.returnStatus == SOStatus.STATUS_ERR) { return inp; } if (inp.returnStatus == SOStatus.STATUS_NULL) { continue; } attachInputToPlans((Tuple) inp.result); inpTuple = (Tuple)inp.result; for (PactOperator po : opsToBeReset) { po.reset(); } res = processPlan(); processingPlan = true; return res; } } protected Result processPlan() throws ExecException{ Result res = new Result(); //We check if all the databags have exhausted the tuples. If so we enforce the reading of new data by setting data and its to null if(its != null) { boolean restartIts = true; for(int i = 0; i < noItems; ++i) { if(its[i] != null && isToBeFlattenedArray[i] == true) { restartIts &= !its[i].hasNext(); } } //this means that all the databags have reached their last elements. so we need to force reading of fresh databags if(restartIts) { its = null; data = null; } } if(its == null) { //getNext being called for the first time OR starting with a set of new data from inputs its = new Iterator[noItems]; bags = new Object[noItems]; for(int i = 0; i < noItems; ++i) { //Getting the iterators //populate the input data Result inputData = null; switch(resultTypes[i]) { case DataType.BAG: case DataType.TUPLE : case DataType.BYTEARRAY : case DataType.MAP : case DataType.BOOLEAN : case DataType.INTEGER : case DataType.DOUBLE : case DataType.LONG : case DataType.FLOAT : case DataType.CHARARRAY : inputData = planLeafOps[i].getNext(getDummy(resultTypes[i]), resultTypes[i]); break; default: { int errCode = 2080; String msg = "Foreach currently does not handle type " + DataType.findTypeName(resultTypes[i]); throw new ExecException(msg, errCode, PigException.BUG); } } if (inputData.returnStatus == SOStatus.STATUS_BATCH_OK) { continue; } if(inputData.returnStatus == SOStatus.STATUS_EOP) { //we are done with all the elements. Time to return. its = null; bags = null; return inputData; } // if we see a error just return it if(inputData.returnStatus == SOStatus.STATUS_ERR) { return inputData; } // Object input = null; bags[i] = inputData.result; if(inputData.result instanceof DataBag && isToBeFlattenedArray[i]) { its[i] = ((DataBag)bags[i]).iterator(); } else { its[i] = null; } } } // if accumulating, we haven't got data yet for some fields, just return if (isAccumulative() && isAccumStarted()) { res.returnStatus = SOStatus.STATUS_BATCH_OK; return res; } while(true) { if(data == null) { //getNext being called for the first time or starting on new input data //we instantiate the template array and start populating it with data data = new Object[noItems]; for(int i = 0; i < noItems; ++i) { if(isToBeFlattenedArray[i] && bags[i] instanceof DataBag) { if(its[i].hasNext()) { data[i] = its[i].next(); } else { //the input set is null, so we return. This is // caught above and this function recalled with // new inputs. its = null; data = null; res.returnStatus = SOStatus.STATUS_NULL; return res; } } else { data[i] = bags[i]; } } if(reporter!=null) { reporter.progress(); } //createTuple(data); res.result = createTuple(data); res.returnStatus = SOStatus.STATUS_OK; return res; } else { //we try to find the last expression which needs flattening and start iterating over it //we also try to update the template array for(int index = noItems - 1; index >= 0; --index) { if(its[index] != null && isToBeFlattenedArray[index]) { if(its[index].hasNext()) { data[index] = its[index].next(); res.result = createTuple(data); res.returnStatus = SOStatus.STATUS_OK; return res; } else{ // reset this index's iterator so cross product can be achieved // we would be resetting this way only for the indexes from the end // when the first index which needs to be flattened has reached the // last element in its iterator, we won't come here - instead, we reset // all iterators at the beginning of this method. its[index] = ((DataBag)bags[index]).iterator(); data[index] = its[index].next(); } } } } } //return null; } /** * * @param data array that is the template for the final flattened tuple * @return the final flattened tuple */ protected Tuple createTuple(Object[] data) throws ExecException { Tuple out = mTupleFactory.newTuple(); for(int i = 0; i < data.length; ++i) { Object in = data[i]; if(isToBeFlattenedArray[i] && in instanceof Tuple) { Tuple t = (Tuple)in; int size = t.size(); for(int j = 0; j < size; ++j) { out.append(t.get(j)); } } else { out.append(in); } } if (inpTuple != null) { return illustratorMarkup(inpTuple, out, 0); } else { return illustratorMarkup2(data, out); } } protected void attachInputToPlans(Tuple t) { //super.attachInput(t); for(PactPlan p : inputPlans) { p.attachInput(t); } } public void getLeaves() { if (inputPlans != null) { int i=-1; if(isToBeFlattenedArray == null) { isToBeFlattenedArray = new boolean[inputPlans.size()]; } planLeafOps = new PactOperator[inputPlans.size()]; for(PactPlan p : inputPlans) { ++i; PactOperator leaf = p.getLeaves().get(0); planLeafOps[i] = leaf; if(leaf instanceof POProject && leaf.getResultType() == DataType.TUPLE && ((POProject)leaf).isProjectToEnd() ) { isToBeFlattenedArray[i] = true; } } } // we are calculating plan leaves // so lets reinitialize reInitialize(); } private void reInitialize() { if(planLeafOps != null) { noItems = planLeafOps.length; resultTypes = new byte[noItems]; for (int i = 0; i < resultTypes.length; i++) { resultTypes[i] = planLeafOps[i].getResultType(); } } else { noItems = 0; resultTypes = null; } if(inputPlans != null) { for (PactPlan pp : inputPlans) { try { ResetFinder lf = new ResetFinder(pp, opsToBeReset); lf.visit(); } catch (VisitorException ve) { String errMsg = "Internal Error: Unexpected error looking for nested operators which need to be reset in FOREACH"; throw new RuntimeException(errMsg, ve); } } } } public List<PactPlan> getInputPlans() { return inputPlans; } public void setInputPlans(List<PactPlan> plans) { inputPlans = plans; planLeafOps = null; getLeaves(); } public void addInputPlan(PactPlan plan, boolean flatten) { inputPlans.add(plan); // add to planLeafOps // copy existing leaves PactOperator[] newPlanLeafOps = new PactOperator[planLeafOps.length + 1]; for (int i = 0; i < planLeafOps.length; i++) { newPlanLeafOps[i] = planLeafOps[i]; } // add to the end newPlanLeafOps[planLeafOps.length] = plan.getLeaves().get(0); planLeafOps = newPlanLeafOps; // add to isToBeFlattenedArray // copy existing values boolean[] newIsToBeFlattenedArray = new boolean[isToBeFlattenedArray.length + 1]; for(int i = 0; i < isToBeFlattenedArray.length; i++) { newIsToBeFlattenedArray[i] = isToBeFlattenedArray[i]; } // add to end newIsToBeFlattenedArray[isToBeFlattenedArray.length] = flatten; isToBeFlattenedArray = newIsToBeFlattenedArray; // we just added a leaf - reinitialize reInitialize(); } public void setToBeFlattened(List<Boolean> flattens) { setUpFlattens(flattens); } public List<Boolean> getToBeFlattened() { List<Boolean> result = null; if(isToBeFlattenedArray != null) { result = new ArrayList<Boolean>(); for (int i = 0; i < isToBeFlattenedArray.length; i++) { result.add(isToBeFlattenedArray[i]); } } return result; } public boolean inProcessing() { return processingPlan; } protected void setUpFlattens(List<Boolean> isToBeFlattened) { if(isToBeFlattened == null) { isToBeFlattenedArray = null; } else { isToBeFlattenedArray = new boolean[isToBeFlattened.size()]; int i = 0; for (Iterator<Boolean> it = isToBeFlattened.iterator(); it.hasNext();) { isToBeFlattenedArray[i++] = it.next(); } } } /** * Visits a pipeline and calls reset on all the nodes. Currently only * pays attention to limit nodes, each of which need to be told to reset * their limit. */ private class ResetFinder extends PactPlanVisitor { ResetFinder(PactPlan plan, List<PactOperator> toBeReset) { super(plan, new DependencyOrderWalker<PactOperator, PactPlan>(plan)); } /* @Override public void visitDistinct(PODistinct d) throws VisitorException { // FIXME: add only if limit is present opsToBeReset.add(d); } @Override public void visitLimit(POLimit limit) throws VisitorException { opsToBeReset.add(limit); } @Override public void visitSort(POSort sort) throws VisitorException { // FIXME: add only if limit is present opsToBeReset.add(sort); }*/ /* (non-Javadoc) * @see org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor#visitProject(org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject) */ /* @Override public void visitProject(POProject proj) throws VisitorException { if(proj instanceof PORelationToExprProject) { opsToBeReset.add(proj); } }*/ } /** * @return the opsToBeReset */ public List<PactOperator> getOpsToBeReset() { return opsToBeReset; } /** * @param opsToBeReset the opsToBeReset to set */ public void setOpsToBeReset(List<PactOperator> opsToBeReset) { this.opsToBeReset = opsToBeReset; } private Tuple illustratorMarkup2(Object[] in, Object out) { if(illustrator != null) { ExampleTuple tOut = new ExampleTuple((Tuple) out); illustrator.getLineage().insert(tOut); boolean synthetic = false; for (Object tIn : in) { synthetic |= ((ExampleTuple) tIn).synthetic; illustrator.getLineage().union(tOut, (Tuple) tIn); } illustrator.addData(tOut); int i; for (i = 0; i < noItems; ++i) { if (((DataBag)bags[i]).size() < 2) { break; } } if (i >= noItems && !illustrator.getEqClassesShared()) { illustrator.getEquivalenceClasses().get(0).add(tOut); } tOut.synthetic = synthetic; return tOut; } else { return (Tuple) out; } } @Override public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) { if(illustrator != null) { ExampleTuple tOut = new ExampleTuple((Tuple) out); illustrator.addData(tOut); if (!illustrator.getEqClassesShared()) { illustrator.getEquivalenceClasses().get(0).add(tOut); } LineageTracer lineageTracer = illustrator.getLineage(); lineageTracer.insert(tOut); tOut.synthetic = ((ExampleTuple) in).synthetic; lineageTracer.union((ExampleTuple) in , tOut); return tOut; } else { return (Tuple) out; } } @Override public void setIllustrator(Illustrator illustrator) { // TODO Auto-generated method stub } }