/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;

import java.util.List;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.InternalCachedBag;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;

public class POJoinPackage extends POPackage {

    private static final long serialVersionUID = 1L;

    private POOptimizedForEach forEach;
    private boolean newKey = true;
    private Tuple res = null;
    private boolean lastInputTuple = false;
    private static final Tuple t1 = null; // unused in this class
    private static final Result eopResult = new Result(POStatus.STATUS_EOP, null);

    private boolean firstTime = true;
    private boolean useDefaultBag = false;

    public static final String DEFAULT_CHUNK_SIZE = "1000";

    private long chunkSize = Long.parseLong(DEFAULT_CHUNK_SIZE);
    private Result forEachResult;
    private DataBag[] dbs = null;
    private int lastBagIndex;

    public POJoinPackage(OperatorKey k, int rp, POPackage p, POForEach f) {
        super(k, rp);
        String scope = getOperatorKey().getScope();
        NodeIdGenerator nig = NodeIdGenerator.getGenerator();
        forEach = new POOptimizedForEach(new OperatorKey(scope, nig.getNextNodeId(scope)));
        if (p != null) {
            setKeyType(p.getKeyType());
            setNumInps(p.getNumInps());
            lastBagIndex = numInputs - 1;
            setInner(p.getInner());
            setKeyInfo(p.getKeyInfo());
            this.isKeyTuple = p.isKeyTuple;
            this.isKeyCompound = p.isKeyCompound;
        }
        if (f != null) {
            setInputPlans(f.getInputPlans());
            setToBeFlattened(f.getToBeFlattened());
        }
    }

    @Override
    public void visit(PhyPlanVisitor v) throws VisitorException {
        v.visitJoinPackage(this);
    }

    @Override
    public String name() {
        String fString = forEach.getFlatStr();
        return "POJoinPackage" + "(" + fString + ")" + "["
                + DataType.findTypeName(resultType) + "]" + " - " + mKey.toString();
    }
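
    /*
     * Illustrative example (relation names are hypothetical): for a join such as
     *
     *   C = JOIN A BY k, B BY k, D BY k;
     *
     * numInputs is 3; for each key, the A and B tuples are materialized into
     * bags, while the D tuples (the last input) are streamed through in chunks
     * of at most chunkSize tuples, each chunk handed to the inner ForEach as
     * the tuple (key, bag(A), bag(B), chunk(D)).
     */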
    /**
     * Calls the inner ForEach's getNextTuple to get each ForEach result. The
     * input for POJoinPackage is a (key, NullableTuple) pair. We materialize
     * the first n-1 inputs into bags and feed input #n to the delegated
     * ForEach operator one tuple at a time, so the input to ForEach is
     *
     *   (input#1, input#2, input#3, ..., input#n[i]), i = 1..k,
     *
     * where input #n consists of k tuples. For every ForEach input, all of the
     * results are pulled from ForEach. getNextTuple will be called multiple
     * times for a particular input; it returns one output tuple from ForEach
     * per call, so internal state is maintained to keep track of where we are.
     */
    @Override
    public Result getNextTuple() throws ExecException {
        if (firstTime) {
            firstTime = false;
            if (PigMapReduce.sJobConfInternal.get() != null) {
                String bagType = PigMapReduce.sJobConfInternal.get().get("pig.cachedbag.type");
                if (bagType != null && bagType.equalsIgnoreCase("default")) {
                    useDefaultBag = true;
                }
            }
        }

        // If a previous call to forEach.getNextTuple()
        // has still not returned all of its output, process it
        if (forEach.processingPlan) {
            forEachResult = forEach.getNextTuple();
            switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
            }
        }

        NullableTuple it = null;
        // If we see a new key, materialize the first n-1 inputs and construct the
        // ForEach input tuple res = (key, input#1, input#2, ..., input#n); the
        // only missing value is input#n, which we will fetch one tuple at a time,
        // fill into res, and feed to ForEach.
        // After this block, we have the first tuple of input#n in hand (kept in
        // the variable it).
        if (newKey) {
            lastInputTuple = false;
            // Put the first n-1 inputs into bags
            dbs = new DataBag[numInputs];
            for (int i = 0; i < numInputs - 1; i++) {
                dbs[i] = useDefaultBag
                        ? BagFactory.getInstance().newDefaultBag()
                        // In the very rare case that there is a POStream after
                        // this POJoinPackage in the pipeline which is also
                        // blocking the pipeline, the constructor argument should
                        // be 2 * numInputs. But we don't want to pay that penalty
                        // all the time for one obscure case.
                        : new InternalCachedBag(numInputs - 1);
            }
            // For the last bag, we always use a NonSpillableDataBag.
            dbs[lastBagIndex] = new NonSpillableDataBag((int) chunkSize);

            // For each NullableTuple in the input, put it into the corresponding
            // bag based on its index, except for the last input, which we stream.
            // The tuples arrive in index order, starting from index 0, such that
            // all tuples for a given index arrive before any tuple for the next
            // index does.
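            // Tuples for inputs 0..n-2 are accumulated below; as soon as the
            // first tuple of the last input is seen, the loop stops and the
            // last input is streamed in chunks instead. For example
            // (illustrative numbers): with the default chunkSize of 1000 and
            // 2500 last-input tuples for this key, the inner ForEach runs over
            // chunks of 1000, 1000 and 500 tuples, so at most chunkSize tuples
            // of the last input are held in this bag at once.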
            while (tupIter.hasNext()) {
                it = tupIter.next();
                int itIndex = it.getIndex();
                if (itIndex != numInputs - 1) {
                    dbs[itIndex].add(getValueTuple(it, itIndex));
                } else {
                    lastInputTuple = true;
                    break;
                }
                if (getReporter() != null) {
                    getReporter().progress();
                }
            }

            // If we don't have any tuple for input#n,
            // we do not need any further processing; return EOP
            if (!lastInputTuple) {
                // We return at this point because we ought to have a flatten
                // on this last input, and we have an empty bag, which should
                // result in this key being taken out of the output
                newKey = true;
                return eopResult;
            }

            res = mTupleFactory.newTuple(numInputs + 1);
            for (int i = 0; i < dbs.length; i++)
                res.set(i + 1, dbs[i]);
            res.set(0, key);

            // If any input is marked inner and its corresponding bag is empty,
            // there can be no join output for this key, so we can just return
            for (int i = 0; i < dbs.length - 1; i++) {
                if (inner[i] && dbs[i].size() == 0) {
                    detachInput();
                    return eopResult;
                }
            }

            newKey = false;

            // Set up the bag for the last input to contain a chunk of chunkSize
            // values, OR the entire bag if it has fewer than chunkSize values.
            // The idea is that in most cases there are more than chunkSize
            // values, and in those cases we send the last bag as a series of
            // smaller chunked bags, thus holding less in memory.
            // The first tuple can be retrieved directly from "it".
            dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
            for (int i = 0; i < chunkSize - 1 && tupIter.hasNext(); i++) {
                it = tupIter.next();
                dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
            }

            // Attach the input to ForEach and pull an output tuple from it
            forEach.attachInput(res);
            forEachResult = forEach.getNextTuple();
            switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
            }
        }

        // Keep attaching input tuples to ForEach until either:
        // 1. ForEach produces output, or
        // 2. there are no more tuples for input#n
        while (true) {
            if (tupIter.hasNext()) {
                // Try setting up a bag of chunkSize tuples, OR the remainder
                // of the last input's bag (if fewer than chunkSize are left),
                // for the next ForEach call
                dbs[lastBagIndex].clear(); // clear the last chunk
                for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) {
                    it = tupIter.next();
                    dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
                }
            } else {
                // If we do not have any more tuples for input#n, return EOP
                detachInput();
                newKey = true;
                return eopResult;
            }

            // Attach the input to ForEach and pull an output tuple from it
            forEach.attachInput(res);
            forEachResult = forEach.getNextTuple();
            switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
            }
        }
    }

    public List<PhysicalPlan> getInputPlans() {
        return forEach.getInputPlans();
    }

    public void setInputPlans(List<PhysicalPlan> plans) {
        forEach.setInputPlans(plans);
    }

    public void setToBeFlattened(List<Boolean> flattens) {
        forEach.setToBeFlattened(flattens);
    }

    /**
     * @return the forEach
     */
    public POOptimizedForEach getForEach() {
        return forEach;
    }

    /**
     * @param chunkSize the chunk size for the biggest input
     */
    public void setChunkSize(long chunkSize) {
        this.chunkSize = chunkSize;
    }
}
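
// Usage sketch (hypothetical, for illustration only; the variable names and
// the call site are assumptions, not taken from this file). As the constructor
// above shows, a POJoinPackage is built by fusing an existing POPackage with
// the POForEach that immediately follows it, roughly like this:
//
//   POJoinPackage joinPkg = new POJoinPackage(
//           new OperatorKey(scope,
//                   NodeIdGenerator.getGenerator().getNextNodeId(scope)),
//           pkg.getRequestedParallelism(), pkg, foreach);
//   joinPkg.setChunkSize(Long.parseLong(POJoinPackage.DEFAULT_CHUNK_SIZE));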