/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.pig.PigException; import org.apache.pig.FuncSpec; import org.apache.pig.PigWarning; import org.apache.pig.data.DataType; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROperPlan; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROpPlanVisitor; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.ConstantExpression; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PODistinct; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POFilter; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLimit; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCombinerPackage; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPartialAgg; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPreCombinerLocalRearrange; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSort; import org.apache.pig.impl.plan.CompilationMessageCollector; import org.apache.pig.impl.plan.DependencyOrderWalker; import org.apache.pig.impl.plan.DepthFirstWalker; import org.apache.pig.impl.plan.OperatorKey; import org.apache.pig.impl.plan.NodeIdGenerator; import org.apache.pig.impl.plan.PlanException; import org.apache.pig.impl.plan.PlanWalker; import org.apache.pig.impl.plan.VisitorException; import org.apache.pig.impl.plan.CompilationMessageCollector.MessageType; import org.apache.pig.impl.plan.optimizer.OptimizerException; import org.apache.pig.impl.util.Pair; /** * Optimize map reduce plans to use the combiner where possible. * Algebriac functions and distinct in nested plan of a foreach are partially * computed in the map and combine phase. * A new foreach statement with initial and intermediate forms of algebraic * functions are added to map and combine plans respectively. * * If bag portion of group-by result is projected or a non algebraic * expression/udf has bag as input, combiner will not be used. This is because * the use of combiner in such case is likely to degrade performance * as there will not be much reduction in data size in combine stage to * offset the cost of the additional number of times (de)serialization is done. * * * Major areas for enhancement: * 1. use of combiner in cogroup * 2. queries with order-by, limit or sort in a nested foreach after group-by * 3. case where group-by is followed by filter that has algebraic expression * * * * */ public class CombinerOptimizer extends MROpPlanVisitor { private static final String DISTINCT_UDF_CLASSNAME = org.apache.pig.builtin.Distinct.class.getName(); private Log log = LogFactory.getLog(getClass()); private CompilationMessageCollector messageCollector = null; private boolean doMapAgg; public CombinerOptimizer(MROperPlan plan, boolean doMapAgg) { this(plan, doMapAgg, new CompilationMessageCollector()); } public CombinerOptimizer(MROperPlan plan, boolean doMapAgg, CompilationMessageCollector messageCollector) { super(plan, new DepthFirstWalker<MapReduceOper, MROperPlan>(plan)); this.messageCollector = messageCollector; this.doMapAgg = doMapAgg; } public CompilationMessageCollector getMessageCollector() { return messageCollector; } @Override public void visitMROp(MapReduceOper mr) throws VisitorException { log.trace("Entering CombinerOptimizer.visitMROp"); if (mr.reducePlan.isEmpty()) return; // part one - check if this MR job represents a group-by + foreach // Find the POLocalRearrange in the map. I'll need it later. List<PhysicalOperator> mapLeaves = mr.mapPlan.getLeaves(); if (mapLeaves == null || mapLeaves.size() != 1) { messageCollector.collect("Expected map to have single leaf!", MessageType.Warning, PigWarning.MULTI_LEAF_MAP); return; } PhysicalOperator mapLeaf = mapLeaves.get(0); if (!(mapLeaf instanceof POLocalRearrange)) { return; } POLocalRearrange rearrange = (POLocalRearrange)mapLeaf; List<PhysicalOperator> reduceRoots = mr.reducePlan.getRoots(); if (reduceRoots.size() != 1) { messageCollector.collect("Expected reduce to have single leaf", MessageType.Warning, PigWarning.MULTI_LEAF_REDUCE); return; } // I expect that the first root should always be a POPackage. If // not, I don't know what's going on, so I'm out of here. PhysicalOperator root = reduceRoots.get(0); if (!(root instanceof POPackage)) { messageCollector.collect("Expected reduce root to be a POPackage", MessageType.Warning, PigWarning.NON_PACKAGE_REDUCE_PLAN_ROOT); return; } POPackage pack = (POPackage)root; List<PhysicalOperator> packSuccessors = mr.reducePlan.getSuccessors(root); if (packSuccessors == null || packSuccessors.size() != 1) return; PhysicalOperator successor = packSuccessors.get(0); if (successor instanceof POLimit) { //POLimit is acceptable, as long has it has a single foreach // as successor List<PhysicalOperator> limitSucs = mr.reducePlan.getSuccessors(successor); if(limitSucs != null && limitSucs.size() == 1 && limitSucs.get(0) instanceof POForEach) { // the code below will now further examine // the foreach successor = limitSucs.get(0); } } if (successor instanceof POForEach) { POForEach foreach = (POForEach)successor; List<PhysicalPlan> feInners = foreach.getInputPlans(); // find algebraic operators and also check if the foreach statement // is suitable for combiner use List<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps = findAlgebraicOps(feInners); if(algebraicOps == null || algebraicOps.size() == 0){ // the plan is not combinable or there is nothing to combine //we're done return; } if (mr.combinePlan.getRoots().size() != 0) { messageCollector.collect("Wasn't expecting to find anything already " + "in the combiner!", MessageType.Warning, PigWarning.NON_EMPTY_COMBINE_PLAN); return; } log.info("Choosing to move algebraic foreach to combiner"); try { // replace PODistinct->Project[*] with distinct udf (which is Algebriac) for(Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps ){ if(! (op2plan.first instanceof PODistinct)) continue; DistinctPatcher distinctPatcher = new DistinctPatcher(op2plan.second); distinctPatcher.visit(); if(distinctPatcher.getDistinct() == null){ int errCode = 2073; String msg = "Problem with replacing distinct operator with distinct built-in function."; throw new PlanException(msg, errCode, PigException.BUG); } op2plan.first = distinctPatcher.getDistinct(); } //create new map foreach POForEach mfe = createForEachWithGrpProj(foreach, rearrange.getKeyType()); Map<PhysicalOperator, Integer> op2newpos = new HashMap<PhysicalOperator, Integer>(); Integer pos = 1; //create plan for each algebraic udf and add as inner plan in map-foreach for(Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps ){ PhysicalPlan udfPlan = createPlanWithPredecessors(op2plan.first, op2plan.second); mfe.addInputPlan(udfPlan, false); op2newpos.put(op2plan.first, pos++); } changeFunc(mfe, POUserFunc.INITIAL); // since we will only be creating SingleTupleBag as input to // the map foreach, we should flag the POProjects in the map // foreach inner plans to also use SingleTupleBag for (PhysicalPlan mpl : mfe.getInputPlans()) { try { new fixMapProjects(mpl).visit(); } catch (VisitorException e) { int errCode = 2089; String msg = "Unable to flag project operator to use single tuple bag."; throw new PlanException(msg, errCode, PigException.BUG, e); } } //create new combine foreach POForEach cfe = createForEachWithGrpProj(foreach, rearrange.getKeyType()); //add algebraic functions with appropriate projection addAlgebraicFuncToCombineFE(cfe, op2newpos); changeFunc(cfe, POUserFunc.INTERMEDIATE); //fix projection and function time for algebraic functions in reduce foreach for(Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps ){ setProjectInput(op2plan.first, op2plan.second, op2newpos.get(op2plan.first)); ((POUserFunc)op2plan.first).setAlgebraicFunction(POUserFunc.FINAL); } // we have modified the foreach inner plans - so set them // again for the foreach so that foreach can do any re-initialization // around them. // FIXME - this is a necessary evil right now because the leaves are explicitly // stored in the POForeach as a list rather than computed each time at // run time from the plans for optimization. Do we want to have the Foreach // compute the leaves each time and have Java optimize it (will Java optimize?)? mfe.setInputPlans(mfe.getInputPlans()); cfe.setInputPlans(cfe.getInputPlans()); foreach.setInputPlans(foreach.getInputPlans()); //tell POCombinerPackage which fields need projected and // which placed in bags. First field is simple project // rest need to go into bags int numFields = algebraicOps.size() + 1; // algebraic funcs + group key boolean[] bags = new boolean[numFields]; bags[0] = false; for (int i = 1; i < numFields; i++) { bags[i] = true; } // Use the POCombiner package in the combine plan // as it needs to act differently than the regular // package operator. mr.combinePlan = new PhysicalPlan(); POCombinerPackage combinePack = new POCombinerPackage(pack, bags); mr.combinePlan.add(combinePack); mr.combinePlan.add(cfe); mr.combinePlan.connect(combinePack, cfe); // No need to connect projections in cfe to cp, because // PigCombiner directly attaches output from package to // root of remaining plan. POLocalRearrange mlr = getNewRearrange(rearrange); POPartialAgg mapAgg = null; if(doMapAgg){ mapAgg = createPartialAgg(cfe); } // A specialized local rearrange operator will replace // the normal local rearrange in the map plan. This behaves // like the regular local rearrange in the getNext() // as far as getting its input and constructing the // "key" out of the input. It then returns a tuple with // two fields - the key in the first position and the // "value" inside a bag in the second position. This output // format resembles the format out of a Package. This output // will feed to the map foreach which expects this format. // If the key field isn't in the project of the combiner or map foreach, // it is added to the end (This is required so that we can // set up the inner plan of the new Local Rearrange leaf in the map // and combine plan to contain just the project of the key). patchUpMap(mr.mapPlan, getPreCombinerLR(rearrange), mfe, mapAgg, mlr); POLocalRearrange clr = getNewRearrange(rearrange); mr.combinePlan.add(clr); mr.combinePlan.connect(cfe, clr); // Change the package operator in the reduce plan to // be the POCombiner package, as it needs to act // differently than the regular package operator. POCombinerPackage newReducePack = new POCombinerPackage(pack, bags); mr.reducePlan.replace(pack, newReducePack); // the replace() above only changes // the plan and does not change "inputs" to // operators // set up "inputs" for the operator after // package correctly List<PhysicalOperator> packList = new ArrayList<PhysicalOperator>(); packList.add(newReducePack); List<PhysicalOperator> sucs = mr.reducePlan.getSuccessors(newReducePack); // there should be only one successor to package sucs.get(0).setInputs(packList); } catch (Exception e) { int errCode = 2018; String msg = "Internal error. Unable to introduce the combiner for optimization."; throw new OptimizerException(msg, errCode, PigException.BUG, e); } } } /** * Translate POForEach in combiner into a POPartialAgg * @param combineFE * @return partial aggregate operator * @throws CloneNotSupportedException */ private POPartialAgg createPartialAgg(POForEach combineFE) throws CloneNotSupportedException { String scope = combineFE.getOperatorKey().scope; POPartialAgg poAgg = new POPartialAgg(new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope))); poAgg.addOriginalLocation(combineFE.getAlias(), combineFE.getOriginalLocations()); poAgg.setResultType(combineFE.getResultType()); //first plan in combine foreach is the group key poAgg.setKeyPlan(combineFE.getInputPlans().get(0).clone()); List<PhysicalPlan> valuePlans = new ArrayList<PhysicalPlan>(); for(int i=1; i<combineFE.getInputPlans().size(); i++){ valuePlans.add(combineFE.getInputPlans().get(i).clone()); } poAgg.setValuePlans(valuePlans); return poAgg; } /** * find algebraic operators and also check if the foreach statement * is suitable for combiner use * @param feInners inner plans of foreach * @return null if plan is not combinable, otherwise list of combinable operators * @throws VisitorException */ private List<Pair<PhysicalOperator, PhysicalPlan>> findAlgebraicOps(List<PhysicalPlan> feInners) throws VisitorException { ArrayList<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps = new ArrayList<Pair<PhysicalOperator, PhysicalPlan>>(); //check each foreach inner plan for(PhysicalPlan pplan : feInners){ //check for presence of non combinable operators AlgebraicPlanChecker algChecker = new AlgebraicPlanChecker(pplan); algChecker.visit(); if(algChecker.sawNonAlgebraic){ return null; } //if we found a combinable distinct add that to list if(algChecker.sawDistinctAgg){ algebraicOps.add(new Pair<PhysicalOperator, PhysicalPlan>(algChecker.getDistinct(), pplan)); continue; } List<PhysicalOperator> roots = pplan.getRoots(); //combinable operators have to be attached to POProject root(s) // if root does not have a successor that is combinable, the project // has to be projecting the group column . Otherwise this MR job //is considered not combinable as we don't want to use combiner for // cases where this foreach statement is projecting bags (likely to // bad for performance because of additional (de)serialization costs) for(PhysicalOperator root : roots){ if(root instanceof ConstantExpression){ continue; } if(! (root instanceof POProject)){ // how can this happen? - expect root of inner plan to be // constant or project. not combining it //TODO: Warn return null; } POProject proj = (POProject)root; POUserFunc combineUdf = getAlgebraicSuccessor(proj, pplan); if(combineUdf == null){ if(proj.isProjectToEnd()){ //project-star or project to end // not combinable return null; } // Check to see if this is a projection of the grouping column. // If so, it will be a projection of col 0 List<Integer> cols = proj.getColumns(); if (cols != null && cols.size() == 1 && cols.get(0) == 0) { //it is project of grouping column, so the plan is still //combinable continue; }else{ //not combinable return null; } } // The algebraic udf can have more than one input. Add the udf only once boolean exist = false; for (Pair<PhysicalOperator, PhysicalPlan> pair : algebraicOps) { if (pair.first.equals(combineUdf)) { exist = true; break; } } if (!exist) algebraicOps.add(new Pair<PhysicalOperator, PhysicalPlan>(combineUdf, pplan)); } } return algebraicOps; } /** * Look for a algebraic POUserFunc as successor to this project, called * recursively to skip any other projects seen on the way. * @param proj project * @param pplan physical plan * @return null if any operator other POProject or algebraic POUserFunc is * found while going down the plan, otherwise algebraic POUserFunc is returned */ private POUserFunc getAlgebraicSuccessor(POProject proj, PhysicalPlan pplan) { //check if root is followed by combinable operator List<PhysicalOperator> succs = pplan.getSuccessors(proj); if(succs == null || succs.size() == 0){ return null; } if(succs.size() > 1){ //project shared by more than one operator - does not happen // in plans generated today // won't try to combine this return null; } PhysicalOperator succ = succs.get(0); if(succ instanceof POProject){ return getAlgebraicSuccessor((POProject) succ, pplan); } if(succ instanceof POUserFunc && ((POUserFunc)succ).combinable() ){ return (POUserFunc)succ; } //some other operator ? can't combine return null; } /** * Create a new foreach with same scope,alias as given foreach * add an inner plan that projects the group column, which is going to be * the first input * @param foreach source foreach * @param keyType type for group-by key * @return new POForeach */ private POForEach createForEachWithGrpProj(POForEach foreach, byte keyType) { String scope = foreach.getOperatorKey().scope; POForEach newFE = new POForEach(createOperatorKey(scope), new ArrayList<PhysicalPlan>()); newFE.addOriginalLocation(foreach.getAlias(), foreach.getOriginalLocations()); newFE.setResultType(foreach.getResultType()); //create plan that projects the group column PhysicalPlan grpProjPlan = new PhysicalPlan(); //group by column is the first column POProject proj = new POProject(createOperatorKey(scope), 1, 0); proj.setResultType(keyType); grpProjPlan.add(proj); newFE.addInputPlan(grpProjPlan, false); return newFE; } /** * Create new plan and add to it the clones of operator algeOp and its * predecessors from the physical plan pplan . * @param algeOp algebraic operator * @param pplan physical plan that has algeOp * @return new plan * @throws CloneNotSupportedException * @throws PlanException */ private PhysicalPlan createPlanWithPredecessors(PhysicalOperator algeOp, PhysicalPlan pplan) throws CloneNotSupportedException, PlanException { PhysicalPlan newplan = new PhysicalPlan(); addPredecessorsToPlan(algeOp, pplan, newplan); return newplan; } /** * Recursively clone op and its predecessors from pplan and add them to newplan * @param op * @param pplan * @param newplan * @return * @throws CloneNotSupportedException * @throws PlanException */ private PhysicalOperator addPredecessorsToPlan(PhysicalOperator op, PhysicalPlan pplan, PhysicalPlan newplan) throws CloneNotSupportedException, PlanException { PhysicalOperator newOp = op.clone(); newplan.add(newOp); if(pplan.getPredecessors(op) == null || pplan.getPredecessors(op).size() == 0){ return newOp; } for(PhysicalOperator pred : pplan.getPredecessors(op)){ PhysicalOperator newPred = addPredecessorsToPlan(pred, pplan, newplan); newplan.connect(newPred, newOp); } return newOp; } /** * add algebraic functions with appropriate projection to new foreach in combiner * @param cfe - the new foreach in combiner * @param op2newpos - mapping of physical operator to position in input * @throws CloneNotSupportedException * @throws PlanException */ private void addAlgebraicFuncToCombineFE(POForEach cfe, Map<PhysicalOperator, Integer> op2newpos) throws CloneNotSupportedException, PlanException { //an array that we will first populate with physical operators in order //of their position in input. Used while adding plans to combine foreach // just so that output of combine foreach same positions as input. That // means the same operator to position mapping can be used by reduce as well PhysicalOperator[] opsInOrder = new PhysicalOperator[op2newpos.size() + 1]; for(Map.Entry<PhysicalOperator, Integer> op2pos : op2newpos.entrySet()){ opsInOrder[op2pos.getValue()] = op2pos.getKey(); } // first position is used by group column and a plan has been added for it, //so start with 1 for(int i=1; i < opsInOrder.length; i++){ //create new inner plan for foreach //add cloned copy of given physical operator and a new project. // Even if the udf in query takes multiple input, only one project // needs to be added because input to this udf //will be the INITIAL version of udf evaluated in map. PhysicalPlan newPlan = new PhysicalPlan(); PhysicalOperator newOp = opsInOrder[i].clone(); newPlan.add(newOp); POProject proj = new POProject( createOperatorKey(cfe.getOperatorKey().getScope()), 1, i ); proj.setResultType(DataType.BAG); newPlan.add(proj); newPlan.connect(proj, newOp); cfe.addInputPlan(newPlan, false); } } /** * Replace old POLocalRearrange with new pre-combine LR, * add new map foreach, new map-local-rearrange, and connect them * * @param mapPlan * @param preCombinerLR * @param mfe * @param mapAgg * @param mlr * @throws PlanException */ private void patchUpMap(PhysicalPlan mapPlan, POPreCombinerLocalRearrange preCombinerLR, POForEach mfe, POPartialAgg mapAgg, POLocalRearrange mlr) throws PlanException { POLocalRearrange oldLR = (POLocalRearrange)mapPlan.getLeaves().get(0); mapPlan.replace(oldLR, preCombinerLR); mapPlan.add(mfe); mapPlan.connect(preCombinerLR, mfe); //the operator before local rearrange PhysicalOperator opBeforeLR = mfe; if(mapAgg != null){ mapPlan.add(mapAgg); mapPlan.connect(mfe, mapAgg); opBeforeLR = mapAgg; } mapPlan.add(mlr); mapPlan.connect(opBeforeLR, mlr); } /** * @param rearrange * @return */ private POPreCombinerLocalRearrange getPreCombinerLR(POLocalRearrange rearrange) { String scope = rearrange.getOperatorKey().scope; POPreCombinerLocalRearrange pclr = new POPreCombinerLocalRearrange( createOperatorKey(scope), rearrange.getRequestedParallelism(), rearrange.getInputs()); pclr.setPlans(rearrange.getPlans()); return pclr; } private OperatorKey createOperatorKey(String scope) { return new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)); } /** * @param op * @param index * @param plan * @throws PlanException */ private void setProjectInput(PhysicalOperator op, PhysicalPlan plan, int index) throws PlanException { String scope = op.getOperatorKey().scope; POProject proj = new POProject(new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)), op.getRequestedParallelism(), index); proj.setResultType(DataType.BAG); // Remove old connections and elements from the plan plan.trimAbove(op); plan.add(proj); plan.connect(proj, op); List<PhysicalOperator> inputs = new ArrayList<PhysicalOperator>(1); inputs.add(proj); op.setInputs(inputs); } /** * Change the algebriac function type for algebraic functions in map and combine * In map and combine the algebraic functions will be leaf of the plan * @param fe * @param type * @throws PlanException */ private void changeFunc(POForEach fe, byte type) throws PlanException { for(PhysicalPlan plan : fe.getInputPlans()){ List<PhysicalOperator> leaves = plan.getLeaves(); if (leaves == null || leaves.size() != 1) { int errCode = 2019; String msg = "Expected to find plan with single leaf. Found " + leaves.size() + " leaves."; throw new PlanException(msg, errCode, PigException.BUG); } PhysicalOperator leaf = leaves.get(0); if(leaf instanceof POProject){ continue; } if (!(leaf instanceof POUserFunc)) { int errCode = 2020; String msg = "Expected to find plan with UDF or project leaf. Found " + leaf.getClass().getSimpleName(); throw new PlanException(msg, errCode, PigException.BUG); } POUserFunc func = (POUserFunc)leaf; try { func.setAlgebraicFunction(type); } catch (ExecException e) { int errCode = 2075; String msg = "Could not set algebraic function type."; throw new PlanException(msg, errCode, PigException.BUG, e); } } } /** * create new Local rearrange by cloning existing rearrange and * add plan for projecting the key * @param rearrange * @return * @throws PlanException * @throws CloneNotSupportedException */ private POLocalRearrange getNewRearrange(POLocalRearrange rearrange) throws PlanException, CloneNotSupportedException { POLocalRearrange newRearrange = rearrange.clone(); // Set the projection to be the key PhysicalPlan newPlan = new PhysicalPlan(); String scope = newRearrange.getOperatorKey().scope; POProject proj = new POProject(new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)), -1, 0); proj.setResultType(newRearrange.getKeyType()); newPlan.add(proj); List<PhysicalPlan> plans = new ArrayList<PhysicalPlan>(1); plans.add(newPlan); newRearrange.setPlansFromCombiner(plans); return newRearrange; } /** * Checks if there is something that prevents the use of algebraic interface, * and looks for the PODistinct that can be used as algebraic * */ private static class AlgebraicPlanChecker extends PhyPlanVisitor { boolean sawNonAlgebraic = false; boolean sawDistinctAgg = false; private boolean sawForeach = false; private PODistinct distinct = null; AlgebraicPlanChecker(PhysicalPlan plan) { super(plan, new DependencyOrderWalker<PhysicalOperator, PhysicalPlan>(plan)); } /* (non-Javadoc) * @see org.apache.pig.impl.plan.PlanVisitor#visit() */ @Override public void visit() throws VisitorException { super.visit(); // if we saw foreach and distinct agg its ok // else if we only saw foreach, mark it as non algebraic if(sawForeach && !sawDistinctAgg) { sawNonAlgebraic = true; } } @Override public void visitDistinct(PODistinct distinct) throws VisitorException { this.distinct = distinct; if(sawDistinctAgg) { // we want to combine only in the case where there is only // one PODistinct which is the only input to an agg // we apparently have seen a PODistinct before, so lets not // combine. sawNonAlgebraic = true; return; } // check that this distinct is the only input to an agg // We could have the following two cases // script 1: // .. // b = group a by ... // c = foreach b { x = distinct a; generate AGG(x), ...} // The above script leads to the following plan for AGG(x): // POUserFunc(org.apache.pig.builtin.COUNT)[long] // | // |---Project[bag][*] // | // |---PODistinct[bag] // | // |---Project[tuple][1] // script 2: // .. // b = group a by ... // c = foreach b { x = distinct a; generate AGG(x.$1), ...} // The above script leads to the following plan for AGG(x.$1): // POUserFunc(org.apache.pig.builtin.IntSum)[long] // | // |---Project[bag][1] // | // |---Project[bag][*] // | // |---PODistinct[bag] // | // |---Project[tuple][1] // So tracing from the PODistinct to its successors upto the leaf, we should // see a Project[bag][*] as the immediate successor and an optional Project[bag] // as the next successor till we see the leaf. PhysicalOperator leaf = mPlan.getLeaves().get(0); // the leaf has to be a POUserFunc (need not be algebraic) if(leaf instanceof POUserFunc) { // we want to combine only in the case where there is only // one PODistinct which is the only input to an agg. // Do not combine if there are additional inputs. List<PhysicalOperator> preds = mPlan.getPredecessors(leaf); if (preds.size() > 1) { sawNonAlgebraic = true; return; } List<PhysicalOperator> immediateSuccs = mPlan.getSuccessors(distinct); if(immediateSuccs.size() == 1 && immediateSuccs.get(0) instanceof POProject) { if(checkSuccessorIsLeaf(leaf, immediateSuccs.get(0))) { // script 1 above sawDistinctAgg = true; return; } else { // check for script 2 scenario above List<PhysicalOperator> nextSuccs = mPlan.getSuccessors(immediateSuccs.get(0)); if(nextSuccs.size() == 1) { PhysicalOperator op = nextSuccs.get(0); if(op instanceof POProject) { if(checkSuccessorIsLeaf(leaf, op)) { sawDistinctAgg = true; return; } } } } } } // if we did not return above, that means we did not see // the pattern we expected sawNonAlgebraic = true; } /** * @return the distinct */ public PODistinct getDistinct() { if(sawNonAlgebraic) return null; return distinct; } @Override public void visitLimit(POLimit limit) throws VisitorException { sawNonAlgebraic = true; } private boolean checkSuccessorIsLeaf(PhysicalOperator leaf, PhysicalOperator opToCheck) { List<PhysicalOperator> succs = mPlan.getSuccessors(opToCheck); if(succs.size() == 1) { PhysicalOperator op = succs.get(0); if(op == leaf) { return true; } } return false; } @Override public void visitFilter(POFilter filter) throws VisitorException { sawNonAlgebraic = true; } @Override public void visitPOForEach(POForEach fe) throws VisitorException { // we need to allow foreach as input for distinct // but don't want it for other things (why?). So lets // flag the presence of Foreach and if this is present // with a distinct agg, it will be allowed. sawForeach = true; } @Override public void visitSort(POSort sort) throws VisitorException { sawNonAlgebraic = true; } } /** * A visitor to replace * Project[bag][*] * | * |---PODistinct[bag] * with * POUserFunc(org.apache.pig.builtin.Distinct)[DataBag] */ private static class DistinctPatcher extends PhyPlanVisitor { private POUserFunc distinct = null; /** * @param plan * @param walker */ public DistinctPatcher(PhysicalPlan plan, PlanWalker<PhysicalOperator, PhysicalPlan> walker) { super(plan, walker); } /** * @param physicalPlan */ public DistinctPatcher(PhysicalPlan physicalPlan) { this(physicalPlan, new DependencyOrderWalker<PhysicalOperator, PhysicalPlan>(physicalPlan)); } /* (non-Javadoc) * @see org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor#visitProject(org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject) */ @Override public void visitProject(POProject proj) throws VisitorException { // check if this project is preceded by PODistinct and // has the return type bag List<PhysicalOperator> preds = mPlan.getPredecessors(proj); if(preds == null) return; // this is a leaf project and so not interesting for patching PhysicalOperator pred = preds.get(0); if(preds.size() == 1 && pred instanceof PODistinct) { if(distinct != null) { // we should not already have been patched since the // Project-Distinct pair should occur only once int errCode = 2076; String msg = "Unexpected Project-Distinct pair while trying to set up plans for use with combiner."; throw new OptimizerException(msg, errCode, PigException.BUG); } // we have stick in the POUserfunc(org.apache.pig.builtin.Distinct)[DataBag] // in place of the Project-PODistinct pair PhysicalOperator distinctPredecessor = mPlan.getPredecessors(pred).get(0); POUserFunc func = null; try { String scope = proj.getOperatorKey().scope; List<PhysicalOperator> funcInput = new ArrayList<PhysicalOperator>(); FuncSpec fSpec = new FuncSpec(DISTINCT_UDF_CLASSNAME); funcInput.add(distinctPredecessor); // explicitly set distinctPredecessor's result type to // be tuple - this is relevant when distinctPredecessor is // originally a POForeach with return type BAG - we need to // set it to tuple so we get a stream of tuples. distinctPredecessor.setResultType(DataType.TUPLE); func = new POUserFunc(new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)),-1, funcInput, fSpec); func.setResultType(DataType.BAG); mPlan.replace(proj, func); mPlan.remove(pred); // connect the the newly added "func" to // the predecessor to the earlier PODistinct mPlan.connect(distinctPredecessor, func); } catch (PlanException e) { int errCode = 2077; String msg = "Problem with reconfiguring plan to add distinct built-in function."; throw new OptimizerException(msg, errCode, PigException.BUG, e); } distinct = func; } } POUserFunc getDistinct(){ return distinct; } } private static class fixMapProjects extends PhyPlanVisitor { public fixMapProjects(PhysicalPlan plan) { this(plan, new DepthFirstWalker<PhysicalOperator, PhysicalPlan>( plan)); } /** * @param plan * @param walker */ public fixMapProjects(PhysicalPlan plan, PlanWalker<PhysicalOperator, PhysicalPlan> walker) { super(plan, walker); } /* * (non-Javadoc) * * @see org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor#visitProject(org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject) */ @Override public void visitProject(POProject proj) throws VisitorException { if (proj.getResultType() == DataType.BAG) { // IMPORTANT ASSUMPTION: // we should be calling this visitor only for // fixing up the projects in the map's foreach // inner plan. In the map side, we are dealing // with single tuple bags - so set the flag in // the project to use single tuple bags. If in // future we don't have single tuple bags in the // input to map's foreach, we should NOT be doing // this! proj.setResultSingleTupleBag(true); } } } }