PruneColumns.java example

Explorer
hadoop-pig-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.logicalLayer.optimizer;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadPushDown;
import org.apache.pig.PigException;
import org.apache.pig.LoadPushDown.RequiredField;
import org.apache.pig.LoadPushDown.RequiredFieldList;
import org.apache.pig.LoadPushDown.RequiredFieldResponse;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.ColumnPruner;
import org.apache.pig.impl.logicalLayer.ExpressionOperator;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.LOCast;
import org.apache.pig.impl.logicalLayer.LOCogroup;
import org.apache.pig.impl.logicalLayer.LOCross;
import org.apache.pig.impl.logicalLayer.LODistinct;
import org.apache.pig.impl.logicalLayer.LOFilter;
import org.apache.pig.impl.logicalLayer.LOForEach;
import org.apache.pig.impl.logicalLayer.LOJoin;
import org.apache.pig.impl.logicalLayer.LOLoad;
import org.apache.pig.impl.logicalLayer.LOMapLookup;
import org.apache.pig.impl.logicalLayer.LOProject;
import org.apache.pig.impl.logicalLayer.LOSort;
import org.apache.pig.impl.logicalLayer.LOSplit;
import org.apache.pig.impl.logicalLayer.LOSplitOutput;
import org.apache.pig.impl.logicalLayer.LOStore;
import org.apache.pig.impl.logicalLayer.LOStream;
import org.apache.pig.impl.logicalLayer.LOUnion;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.logicalLayer.RelationalOperator;
import org.apache.pig.impl.logicalLayer.TopLevelProjectFinder;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.plan.MapKeysInfo;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.plan.ProjectionMap.Column;
import org.apache.pig.impl.plan.optimizer.OptimizerException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;

class RequiredInfo {
    List<RequiredFields> requiredFieldsList;

    RequiredInfo(List<RequiredFields> requiredFieldsList)
    {
        this.requiredFieldsList = requiredFieldsList;
    }
}

public class PruneColumns extends LogicalTransformer {
    private boolean safeToPrune = true;
    private static Log log = LogFactory.getLog(PruneColumns.class);
    Map<RelationalOperator, RequiredInfo> cachedRequiredInfo = new HashMap<RelationalOperator, RequiredInfo>();
    private Map<LOLoad, RequiredFields> prunedLoaderColumnsMap = new HashMap<LOLoad, RequiredFields>();
    ColumnPruner pruner;
    public PruneColumns(LogicalPlan plan) {
        super(plan);
        pruner = new ColumnPruner(plan);
    }

    @Override
    public boolean check(List<LogicalOperator> nodes) throws OptimizerException {
        if((nodes == null) || (nodes.size() <= 0)) {
            int errCode = 2177;
            String msg = "Cannot retrieve operator from null or empty list.";
            throw new OptimizerException(msg, errCode, PigException.BUG);
        }
        
        try {
            LogicalOperator lo = nodes.get(0);
            if (lo == null) {
                int errCode = 2178;
                String msg = "The matching node from the optimizor framework is null";
                throw new OptimizerException(msg, errCode, PigException.BUG);
            }
            if ((lo instanceof LOForEach||lo instanceof LOSplit)&&lo.getSchema()!=null)
                return true;
            return false;
        } catch (Exception e) {
            int errCode = 2179;
            String msg = "Error while performing checks to prune columns.";
            throw new OptimizerException(msg, errCode, PigException.BUG, e);
        }
    }
    
    @Override
    // transform will pick every LOForEach and LOSplit
    public void transform(List<LogicalOperator> nodes)
            throws OptimizerException {
        if((nodes == null) || (nodes.size() <= 0)) {
            int errCode = 2177;
            String msg = "Cannot retrieve operator from null or empty list.";
            throw new OptimizerException(msg, errCode, PigException.BUG);
        }
        try {
            LogicalOperator lo = nodes.get(0);
            if (lo == null || !(lo instanceof LOForEach || lo instanceof LOSplit)) {
                int errCode = 2178;
                String msg = "Expected " + LOForEach.class.getSimpleName() + " or " + LOSplit.class.getSimpleName();
                throw new OptimizerException(msg, errCode, PigException.BUG);
            }

            // Check if we have saved requiredInfo, if so, we will use that as required output fields for that operator;
            // Otherwise means we require every output field
            RequiredInfo requiredOutputInfo = cachedRequiredInfo.get(lo);

            if (requiredOutputInfo==null)
            {
                List<RequiredFields> requiredOutputFieldsList = new ArrayList<RequiredFields>();
                List<LogicalOperator> successors = mPlan.getSuccessors(lo);
                if (successors==null)
                {
                    requiredOutputFieldsList.add(new RequiredFields(true));
                }
                else
                {
                    // The only case requiredOutputFieldsList more than 1 element is when the current 
                    // operator is LOSplit
                    for (int i=0;i<successors.size();i++)
                    {
                        requiredOutputFieldsList.add(new RequiredFields(true));
                    }
                }
                requiredOutputInfo = new RequiredInfo(requiredOutputFieldsList);
            }
            processNode(lo, requiredOutputInfo);
        } catch (OptimizerException oe) {
            throw oe;
        } catch (Exception e) {
            int errCode = 2181;
            String msg = "Unable to prune columns.";
            throw new OptimizerException(msg, errCode, PigException.BUG, e);
        }
    }

    // We recursively collect required fields from forEach from bottom to top, until one of the following conditions occurs:
    //   1. If we see another LOForEach, we simply stop because optimizor will pick that foreach later and start from there
    //   2. If we see LOStore, LOStream, LODistinct, we stop, LOStore, LOStream, LODistinct require all fields, we cannot push upward
    //   3. If we see LOLoad, we set required fields and stop, LOLoad suppose to read only required fields
    //   4. If we see LOSplit, we save requiredInfo and quit. optimizor will pick that split after all its successors are visited
    // For all other operators, we recursively call processNode for all its parents
    // 
    // Inside processNode, we will collect required input columns from required output columns. Required input/output columns 
    //     also include required map keys referred by the logical plan beneath. Required input columns come from two sources:
    //   1. Relevant input fields of required output fields
    //   2. Required input fields of the logic operator
    //
    // lo: logical operator to process
    // requiredOutputFields: requiredFieldsList below this operator
    public void processNode(LogicalOperator lo, RequiredInfo requiredOutputInfo) throws OptimizerException
    {
        try
        {
            if (!safeToPrune)
                return;
            if (!(lo instanceof RelationalOperator))
            {
                int errCode = 2182;
                String msg = "Only relational operator can be used in column prune optimization.";
                throw new OptimizerException(msg, errCode, PigException.BUG);
            }
            if (lo.getSchema()==null)
            {
                safeToPrune = false;
                return;
            }
            RelationalOperator rlo = (RelationalOperator)lo;
            List<LogicalOperator> predecessors = (mPlan.getPredecessors(rlo) == null ? null
                    : new ArrayList<LogicalOperator>(mPlan.getPredecessors(rlo)));
            
            // Now we have collected required output fields of LOLoad (include requried map keys).
            // We need to push these into the loader
            if (rlo instanceof LOLoad)
            {
                // LOLoad has only one output
                RequiredFields loaderRequiredFields = requiredOutputInfo.requiredFieldsList.get(0);
                prunedLoaderColumnsMap.put((LOLoad)rlo, loaderRequiredFields);
                return;
            }
            
            // If the predecessor is one of LOStore/LOStream/LODistinct, we stop to trace up.
            // We require all input fields. We stop processing here. The optimizer will
            // pick the next ForEach and start processing from there
            if (rlo instanceof LOStore || rlo instanceof LOStream || rlo instanceof LODistinct) {
                return;
            }
            
            // merge requiredOutputFields and process the predecessor
            if (rlo instanceof LOSplit)
            {
                List<RequiredFields> requiredInputFieldsList = new ArrayList<RequiredFields>();
                RequiredFields requiredFields = new RequiredFields(false);
                for (int i=0;i<mPlan.getSuccessors(rlo).size();i++)
                {
                    RequiredFields rf = null;
                    try {
                        rf = requiredOutputInfo.requiredFieldsList.get(i);
                    } catch (Exception e) {
                    }
                    if (rf!=null)
                    {
                        rf.reIndex(0);
                        requiredFields.merge(rf);
                    } else {
                        // need all fields
                        List<Pair<Integer, Integer>> l = new ArrayList<Pair<Integer, Integer>>();
                        for (int j=0;j<rlo.getSchema().size();j++)
                            l.add(new Pair<Integer, Integer>(0, j));
                        rf = new RequiredFields(l);
                        requiredFields.merge(rf);
                        break;
                    }
                }
                requiredInputFieldsList.add(requiredFields);
                if (predecessors.get(0) instanceof LOForEach || predecessors.get(0) instanceof LOSplit)
                    cachedRequiredInfo.put((RelationalOperator)predecessors.get(0), new RequiredInfo(requiredInputFieldsList));
                else
                    processNode(predecessors.get(0), new RequiredInfo(requiredInputFieldsList));
                return;
            }
            
            // Initialize requiredInputFieldsList
            List<RequiredFields> requiredInputFieldsList = new ArrayList<RequiredFields>(); 
            for (int i=0;i<predecessors.size();i++)
                requiredInputFieldsList.add(null);
            
            // Map required output columns to required input columns.
            // We also collect required output map keys into input map keys.
            // Since we have already processed Split, so every remaining operator
            // have only one element in requiredOutputFieldList, so we get the first 
            // element and process
            RequiredFields requiredOutputFields = requiredOutputInfo.requiredFieldsList.get(0);
            
            // needAllFields means we require every individual output column and all map keys of that output.
            // We convert needAllFields to individual fields here to facilitate further processing
            if (requiredOutputFields.needAllFields())
            {
                List<Pair<Integer, Integer>> outputList = new ArrayList<Pair<Integer, Integer>>(); 
                for (int j=0;j<rlo.getSchema().size();j++)
                    outputList.add(new Pair<Integer, Integer>(0, j));
                requiredOutputFields = new RequiredFields(outputList);
                for (int i=0;i<requiredOutputFields.size();i++)
                    requiredOutputFields.setMapKeysInfo(i, new MapKeysInfo(true));
            }
            
            if (requiredOutputFields.getFields()==null)
            {
                int errCode = 2184;
                String msg = "Fields list inside RequiredFields is null.";
                throw new OptimizerException(msg, errCode, PigException.BUG);
            }
            
            for (int i=0;i<requiredOutputFields.size();i++)
            {
                Pair<Integer, Integer> requiredOutputField = requiredOutputFields.getField(i);
                MapKeysInfo outputMapKeysInfo = requiredOutputFields.getMapKeysInfo(i);

                List<RequiredFields> relevantFieldsList = rlo.getRelevantInputs(requiredOutputField.first, requiredOutputField.second);
                
                // We do not have any relevant input fields for this output, continue to next output 
                if (relevantFieldsList==null)
                    continue;
                
                for (int j=0;j<relevantFieldsList.size();j++)
                {
                    RequiredFields relevantFields = relevantFieldsList.get(j);
                    
                    if (relevantFields!=null && relevantFields.needAllFields())
                    {
                        requiredInputFieldsList.set(j, new RequiredFields(true));
                        continue;
                    }
                    
                    // Mapping output map keys to input map keys
                    if (rlo instanceof LOCogroup)
                    {
                        if (j!=0 && relevantFields!=null && !relevantFields.needAllFields())
                        {
                            for (Pair<Integer, Integer> pair : relevantFields.getFields())
                                relevantFields.setMapKeysInfo(pair.first, pair.second, 
                                        new MapKeysInfo(true));
                        }
                    }
                    else if (rlo instanceof LOForEach)
                    {
                        // Relay map keys from output to input
                        LogicalPlan forEachPlan = ((LOForEach)rlo).getRelevantPlan(requiredOutputField.second);
                        if (relevantFields.getFields()!=null && relevantFields.getFields().size()!=0)
                        {
                            int index = ((LOForEach)rlo).getForEachPlans().indexOf(forEachPlan);
                            // We check if the field get flattened, if it does, then we do not relay output map keys to input map keys.
                            // There are two situations:
                            // 1. input column is tuple, bag, or other simple type, there is no concept of map key, so we do not relay
                            // 2. input column is map, flatten does not do anything, we can still relay
                            boolean nonflatten = false;
                            if (!((LOForEach)rlo).getFlatten().get(index))
                            {
                                nonflatten = true;
                            }
                            else
                            {
                                // Foreach plan is flattened, check if there is only one input for this foreach plan 
                                // and input schema for that input is not map, if so, it is a dummy flatten 
                                if (forEachPlan.getRoots().size()==1 && forEachPlan.getRoots().get(0) instanceof LOProject)
                                {
                                    LOProject loProj = (LOProject)forEachPlan.getRoots().get(0);
                                    if (loProj.getExpression().getSchema()!=null &&
                                            loProj.getExpression().getSchema().getField(loProj.getCol()).type!=DataType.BAG)
                                        nonflatten = true;
                                }
                            }
                            if (nonflatten && outputMapKeysInfo!=null && isSimpleProjectCast(forEachPlan))
                            {
                                Pair<Integer, Integer> inputColumn = relevantFields.getFields().get(0);
                                relevantFields.setMapKeysInfo(inputColumn.first, inputColumn.second, outputMapKeysInfo);
                            }
                        }
                        
                        // Collect required map keys in foreach plan here.
                        // This is the only logical operator that we collect map keys 
                        // which are introduced by the operator here.
                        // For all other logical operators, it is attached to required fields
                        // of that logical operator, will process in required fields processing
                        // section
                        for (Pair<Integer, Integer> relevantField : relevantFields.getFields())
                        {
                            MapKeysInfo mapKeysInfo = getMapKeysInPlan(forEachPlan, relevantField.second);
                            relevantFields.mergeMapKeysInfo(0, relevantField.second, mapKeysInfo);
                        }
                    }
                    else
                    {
                        // For all other logical operators, we have one output column mapping to one or more input column.
                        // We copy the output map keys from the output column to the according input column
                        if (relevantFields!=null && relevantFields.getFields()!=null && outputMapKeysInfo!=null)
                        {
                            for (Pair<Integer, Integer> pair : relevantFields.getFields())
                                relevantFields.setMapKeysInfo(pair.first, pair.second, 
                                        outputMapKeysInfo);
                        }
                    }
                    
                    // Now we aggregate the input columns of this output column to the required input columns
                    if (requiredInputFieldsList.get(j)==null)
                        requiredInputFieldsList.set(j, relevantFields);
                    else
                    {
                        requiredInputFieldsList.get(j).merge(relevantFields);
                    }
                }
            }
            
            // Merge with required input fields of this logical operator.
            // RequiredInputFields come from two sources, one is mapping from required output to input, 
            // the other is from the operator itself. Here we use getRequiredFields to get the second part,
            // and merge with the first part
            List<RequiredFields> requiredFieldsListOfLOOp;
            
            // For LOForEach, requiredFields all flattened fields. Even the flattened fields get pruned, 
            // it may expand the number of rows in the result. So flattened fields shall not be pruned.
            // LOForEach.getRequiredFields does not give the required fields. RequiredFields means that field
            // is required by all the outputs. The pipeline does not work correctly without that field. 
            // LOForEach.getRequiredFields give all the input fields referred in the LOForEach statement, but those
            // fields can still be pruned (which means, not required)
            // Eg:
            // B = foreach A generate a0, a1, a2+a3;
            // LOForEach.getRequiredFields gives (a0, a1, a2, a3);
            // However, a2,a3 can be pruned if we do not need the a2+a3 for LOForEach.
            // So here, we do not use LOForEach.getRequiredFields, instead, any flattened fields are required fields
            if (rlo instanceof LOForEach) {
                List<Pair<Integer, Integer>> flattenedInputs = new ArrayList<Pair<Integer, Integer>>();
                for (int i=0;i<rlo.getSchema().size();i++) {
                    if (((LOForEach)rlo).isInputFlattened(i)) {
                        flattenedInputs.add(new Pair<Integer, Integer>(0, i));
                    }
                }
                if (!flattenedInputs.isEmpty()) {
                    requiredFieldsListOfLOOp = new ArrayList<RequiredFields>();
                    requiredFieldsListOfLOOp.add(new RequiredFields(flattenedInputs));
                }
                else
                    requiredFieldsListOfLOOp = null;
            }
            // For LOCross/LOUnion, actually we do not require any field here
            else if (rlo instanceof LOCross || rlo instanceof LOUnion)
                requiredFieldsListOfLOOp = null;
            else
                requiredFieldsListOfLOOp = rlo.getRequiredFields();
            
            if (requiredFieldsListOfLOOp!=null)
            {
                for (int i=0;i<requiredFieldsListOfLOOp.size();i++)
                {
                    RequiredFields requiredFieldsOfLOOp = requiredFieldsListOfLOOp.get(i);
                    if (requiredInputFieldsList.get(i)==null)
                        requiredInputFieldsList.set(i, requiredFieldsOfLOOp);
                    else
                    {
                        requiredInputFieldsList.get(i).merge(requiredFieldsOfLOOp);
                    }
                }
                
                // Collect required map keys of this operator
                // Cases are:
                // 1. Single predecessor: LOFilter, LOSplitOutput, LOSort
                // 2. Multiple predecessors: LOJoin
                // 3. LOForEach do not have operator-wise required fields, we
                //    have already processed it
                // 4. LOCogroup require all map keys (even if we cogroup by a0#'k1', a0 itself will be in bag a
                //    and we have no way to figure out which keys are referenced for a0. So we do not process it and
                //    simply require all map keys)
                // 5. Other operators do not have required fields, no need to process
                if (rlo instanceof LOFilter || rlo instanceof LOSplitOutput || rlo instanceof LOSort)
                {
                    List<LogicalPlan> innerPlans = new ArrayList<LogicalPlan>();
                    if (rlo instanceof LOFilter)
                    {
                        innerPlans.add(((LOFilter)rlo).getComparisonPlan());
                    }
                    else if (rlo instanceof LOSplitOutput)
                    {
                        innerPlans.add(((LOSplitOutput)rlo).getConditionPlan());
                    }
                    else if (rlo instanceof LOSort)
                    {
                        innerPlans.addAll(((LOSort)rlo).getSortColPlans());
                    }
                    for (LogicalPlan p : innerPlans)
                    {
                        for (RequiredFields rf : requiredFieldsListOfLOOp)
                        {
                            if (rf.getFields()==null)
                                continue;
                            for (Pair<Integer, Integer> pair : rf.getFields())
                            {
                                MapKeysInfo mapKeysInfo = getMapKeysInPlan(p, pair.second);
                                if (mapKeysInfo!=null && !mapKeysInfo.needAllKeys() && mapKeysInfo.getKeys()!=null)
                                    requiredInputFieldsList.get(0).mergeMapKeysInfo(0, pair.second, 
                                            mapKeysInfo);
                            }
                        }
                    }
                }
                else if (rlo instanceof LOJoin)
                {
                    for (int i=0;i<predecessors.size();i++)
                    {
                        Collection<LogicalPlan> joinPlans = ((LOJoin)rlo).getJoinPlans().get(predecessors.get(i));
                        if (joinPlans==null)
                            continue;
                        for (LogicalPlan p : joinPlans)
                        {
                            RequiredFields rf = requiredFieldsListOfLOOp.get(i);
                            if (rf.getFields()==null)
                                continue;
                            for (Pair<Integer, Integer> pair : rf.getFields())
                            {
                                MapKeysInfo mapKeysInfo = getMapKeysInPlan(p, pair.second);
                                if (mapKeysInfo!=null && !mapKeysInfo.needAllKeys() && mapKeysInfo.getKeys()!=null)
                                    requiredInputFieldsList.get(i).mergeMapKeysInfo(i, pair.second, 
                                            mapKeysInfo);
                            }
                        }
                    }
                }
            }
            
            // Now we finish the current logical operator, we need to process next logical operator. There are two cases:
            // 1. If the predecessor is LOForEach or LOSplit, we put requiredOutputFieldsList into cache and exit, the optimizer
            //    will invoke transform() on LOForEach or LOSplit and continue to process
            // 2. If the predecessor is otherwise, we then recursively collect required fields for the predecessor
            for (int i=0;i<predecessors.size();i++)
            {
            	RelationalOperator predecessor = (RelationalOperator)predecessors.get(i);
                
                List<RequiredFields> newRequiredOutputFieldsList = new ArrayList<RequiredFields>();
            	
                // In this optimization, we only prune columns and do not change structure of logical plan
                // So if we do not require anything from the input, we change it to require the first field
                if (requiredInputFieldsList.get(i)==null || requiredInputFieldsList.get(i).getNeedNoFields())
                {
                    List<Pair<Integer, Integer>> dummyFields = new ArrayList<Pair<Integer, Integer>>();
                    dummyFields.add(new Pair<Integer, Integer>(i, 0));
                    requiredInputFieldsList.set(i, new RequiredFields(dummyFields));
                }
                // For all logical operator with one output, reindex the output to 0
                if (!(predecessor instanceof LOSplit))
                {
                    if (requiredInputFieldsList.get(i)!=null)
                        requiredInputFieldsList.get(i).reIndex(0);
                    newRequiredOutputFieldsList.add(requiredInputFieldsList.get(i));
                }
            	if (predecessor instanceof LOForEach)
            	{
            	    cachedRequiredInfo.put(predecessor, new RequiredInfo(newRequiredOutputFieldsList));
            	    continue;
            	}
            	
            	if (predecessor instanceof LOSplit)
            	{
            	    int outputIndex = mPlan.getSuccessors(predecessor).indexOf(rlo);
            	    if (outputIndex==-1)
            	    {
            	        int errCode = 2186;
                        String msg = "Cannot locate node from successor";
                        throw new OptimizerException(msg, errCode, PigException.BUG);
            	    }
            	    if (requiredInputFieldsList.get(i)!=null)
                    {
            	        requiredInputFieldsList.get(i).reIndex(outputIndex);
                    }
            	    if (cachedRequiredInfo.containsKey(predecessor))
            	        newRequiredOutputFieldsList = cachedRequiredInfo.get(predecessor).requiredFieldsList;
        	        
            	    while (newRequiredOutputFieldsList.size()<=outputIndex)
            	        newRequiredOutputFieldsList.add(null);
        	        newRequiredOutputFieldsList.set(outputIndex, requiredInputFieldsList.get(i));

            	    cachedRequiredInfo.put(predecessor, new RequiredInfo(newRequiredOutputFieldsList));
            	    continue;
            	}
            	processNode(predecessors.get(i), new RequiredInfo(newRequiredOutputFieldsList));
            }
        } catch (FrontendException e) {
            int errCode = 2211;
            String msg = "Unable to prune columns when processing node " + lo;
            throw new OptimizerException(msg, errCode, PigException.BUG, e);
        }
    }
    
    // Get map keys in an inner plan of a particular input. 
    private MapKeysInfo getMapKeysInPlan(LogicalPlan plan, int column) throws OptimizerException
    {
        // Determine if this foreach/cogroup plan can relate map keys from output columns to input columns.
        // For criteria of this, see the comment of method isMapKeyRelayableInInnerPlan
        // If this is true, the reference of the column here does not actually used by the logical operator:
        // eg: B = foreach A generate a0;
        //   We relay map key of a0 to B.$0. Appearance of a0 on its own here does not mean we need all map keys of a0
        //   So once we see this situation, we stop collecting required map keys of this logical operator
        if (isSimpleProjectCast(plan))
            return null;
        
        boolean requireAll = false;
        List<String> mapKeys = null;
        TopLevelProjectFinder projectFinder = new TopLevelProjectFinder(plan);
        try {
            projectFinder.visit();
        } catch (VisitorException ve) {
            int errCode = 2200;
            String msg = "Error getting top level project ";
            throw new OptimizerException(msg, errCode, PigException.BUG, ve);
        }
        for (LOProject project : projectFinder.getProjectSet())
        {
            if (!project.isStar() && project.getCol()==column)  // LOProject for that column 
            {
                List<LogicalOperator> successors = plan.getSuccessors(project);
                // If there are LOCast(s) in the middle (can only be cast to map, otherwise, there will not be maplookup below)
                // it is fine, we can ignore LOCast and continue to look for LOMapLookup 
                while (successors!=null && successors.size()==1 && successors.get(0) instanceof LOCast)
                {
                    LOCast cast = (LOCast)successors.get(0);
                    successors = plan.getSuccessors(cast);
                }
                if (successors!=null && successors.size()==1 && successors.get(0) instanceof LOMapLookup)
                {
                    LOMapLookup loMapLookup = (LOMapLookup)successors.get(0);
                    if (loMapLookup.getLookUpKey()!=null)
                    {
                        if (mapKeys==null)
                            mapKeys = new ArrayList<String>();
                        if (!mapKeys.contains(loMapLookup.getLookUpKey()))
                            mapKeys.add(loMapLookup.getLookUpKey());
                    }
                    requireAll = false;
                }
                else
                {
                    requireAll = true;
                }
            }
        }
        return new MapKeysInfo(requireAll, mapKeys);
    }
    
    // Figure if we need to relay output map keys to input map keys. It is used for inner plan for LOForEach
    // and groupByPlan for LOCogroup
    // There are several cases:
    // 1. UDF, we cannot figure out how each input field is used in UDF, so for each input field,
    //    we require everything
    // 2. Map constant, which do not requires any data input
    // 3. BinCond (B = foreach A generate a0==0?a1:a2; when a1, a2 is map)
    //    This situation is complex. Two branches (a1, a2) is relayed independently, if it relays, then we cannot 
    //    collect map keys of that branches as the required operator map keys.
    //    However, it is unlikely that user will refer a map key of that output, we simply say we need all map keys of 
    //    both map inputs
    // 4. Cast (B = foreach A generate a0 as (map[]);)
    //    The only cases we can cast a field to a map is the input field is a map already or the field is a byte array
    //    If the input field is a byte array, then the input field do not have concept of map keys. So we do not need
    //    to figure out the map key for this input. If the input field is a map, then this cast is just a 1 to 1 mapping.
    //    It is case 5
    // 5. 1 to 1 mapping (B = foreach A generate a0;)
    // 6. Map resolution (B = foreach A generate a0#'key1' as b0)
    //    Since we only trace one level map keys, so relay no key (since all the key in the following script refer to b0#'k'
    //    actually refer to a0#'key1'#'k', we do not relay second level map key 'k'), but require 'key1' for a0
    // 
    // Based on the above observation, the algorithm to map output map keys to input map keys are:
    // 1. If a output column map to multiple input column, then we do not relay this output
    // 2. Find top level project for that plan, we collect map key only when we have only one input 
    //    associate with it
    // 3. Check if the predecessor of the project is null, if not, stop relaying input map keys
    // 4. Check the successors of that project, if it is not a null or cast, stop relaying input map keys
    // The qualifying logical plan should takes one project as root, optionally followed by one or more casts:
    //
    //      Project
    //         |
    //       Cast*
    private boolean isSimpleProjectCast(LogicalPlan innerPlan) throws OptimizerException
    {
        TopLevelProjectFinder projectFinder = new TopLevelProjectFinder(innerPlan);
        
        try {
            projectFinder.visit();
        } catch (VisitorException ve) {
            throw new OptimizerException();
        }
        boolean relayingMapKeys = false;
        if (projectFinder.getProjectSet()!=null && projectFinder.getProjectSet().size()==1)
        {
            LOProject project = projectFinder.getProjectSet().iterator().next();
            if (innerPlan.getPredecessors(project)==null)
            {
                relayingMapKeys = true;
                LogicalOperator pred = project;
                while (innerPlan.getSuccessors(pred)!=null)
                {
                    if (innerPlan.getSuccessors(pred).size()!=1)
                        return false;
                    if (!(innerPlan.getSuccessors(pred).get(0) instanceof LOCast))
                    {
                        return false;
                    }
                    pred = innerPlan.getSuccessors(pred).get(0);
                }
            }
            if (relayingMapKeys)
                return true;
        }
        return false;
    }
    
    // Prune fields of LOLoad, and use ColumePruner to prune all the downstream logical operators
    private void pruneLoader(LOLoad load, RequiredFields loaderRequiredFields) throws FrontendException
    {
        RequiredFieldList requiredFieldList = new RequiredFieldList();

        if (loaderRequiredFields==null || loaderRequiredFields.needAllFields())
            return;
        Schema loadSchema = load.getSchema();
        for (int i=0;i<loaderRequiredFields.size();i++)
        {
            Pair<Integer, Integer> pair = loaderRequiredFields.getField(i);
            MapKeysInfo mapKeysInfo = loaderRequiredFields.getMapKeysInfo(i);
            RequiredField requiredField = new RequiredField();
            requiredField.setIndex(pair.second);
            requiredField.setAlias(loadSchema.getField(pair.second).alias);
            requiredField.setType(loadSchema.getField(pair.second).type);
            if (mapKeysInfo!=null && !mapKeysInfo.needAllKeys())
            {
                List<RequiredField> subFieldList = new ArrayList<RequiredField>();
                for (String key : mapKeysInfo.getKeys())
                {
                    RequiredField mapKeyField = new RequiredField();
                    mapKeyField.setIndex(-1);
                    mapKeyField.setType(DataType.UNKNOWN);
                    mapKeyField.setAlias(key);
                    subFieldList.add(mapKeyField);
                }
                requiredField.setSubFields(subFieldList);
            }
            // Sort requiredFieldList, loader expect required field list sorted by index
            int j=0;
            while (requiredFieldList.getFields().size()>j && requiredFieldList.getFields().get(j).getIndex()<pair.second)
                j++;
            requiredFieldList.getFields().add(j, requiredField);
        }
        
        boolean[] columnRequired = new boolean[load.getSchema().size()];
        RequiredFieldResponse response = null;
        try {
            response = load.pushProjection(requiredFieldList);    
            
        } catch (FrontendException e) {
            log.warn("fieldsToRead on "+load+" throw an exception, skip it");
        }
        
        // If the request is not granted, probably the loader support position prune only,
        // and do not prune map key pruning (such as PigStorage). Drop all map keys (means 
        // we do not prune map keys) and try again
        if (response==null || !response.getRequiredFieldResponse())
        {
            for (RequiredField rf : requiredFieldList.getFields())
            {
                if (rf.getType() == DataType.MAP)
                    rf.setSubFields(null);
            }
            try {
                response = load.pushProjection(requiredFieldList);    
            } catch (FrontendException e) {
                log.warn("fieldsToRead on "+load+" throw an exception, skip it");
            }
        }
        
        // Loader does not support column pruning, insert foreach
        LogicalOperator forEach = null;
        if (response==null || !response.getRequiredFieldResponse())
        {
            List<Integer> columnsToProject = new ArrayList<Integer>();
            for (RequiredField rf : requiredFieldList.getFields())
                columnsToProject.add(rf.getIndex());
            
            forEach = load.insertPlainForEachAfter(columnsToProject);
        }
        
        // Begin to prune
        for (Pair<Integer, Integer> pair: loaderRequiredFields.getFields())
            columnRequired[pair.second] = true;
        
        List<Pair<Integer, Integer>> pruneList = new ArrayList<Pair<Integer, Integer>>();
        for (int i=0;i<columnRequired.length;i++)
        {
            if (!columnRequired[i])
                pruneList.add(new Pair<Integer, Integer>(0, i));
        }

        StringBuffer message = new StringBuffer();
        if (pruneList.size()!=0)
        {
            if (forEach == null)
                pruner.addPruneMap(load, pruneList);
            else
                pruner.addPruneMap(forEach, pruneList);

            message.append("Columns pruned for " + load.getAlias() + ": ");
            for (int i=0;i<pruneList.size();i++)
            {
                message.append("$"+pruneList.get(i).second);
                if (i!=pruneList.size()-1)
                    message.append(", ");
            }
            log.info(message);
        }
        else
            log.info("No column pruned for " + load.getAlias());
        message = new StringBuffer();;
        for (RequiredField rf : requiredFieldList.getFields())
        {
            if (rf.getSubFields()!=null)
            {
                message.append("Map key required for " + load.getAlias()+": ");
                if (rf.getIndex()!=-1)
                    message.append("$"+rf.getIndex());
                else
                    message.append(rf.getAlias());
                message.append("->[");
                for (int i=0;i<rf.getSubFields().size();i++)
                {
                    RequiredField keyrf = rf.getSubFields().get(i);
                    message.append(keyrf);
                    if (i!=rf.getSubFields().size()-1)
                        message.append(",");
                }
                message.append("] ");
            }
        }
        if (message.length()!=0)
            log.info(message);
        else
            log.info("No map keys pruned for " + load.getAlias());
    }
    
    public void prune() throws OptimizerException {
        try {
            if (!safeToPrune)
                return;
            
            for (LOLoad load : prunedLoaderColumnsMap.keySet())
                pruneLoader(load, prunedLoaderColumnsMap.get(load));
            
            if (!pruner.isEmpty())
                pruner.visit();
        }
        catch (FrontendException e) {
            int errCode = 2212;
            String msg = "Unable to prune plan";
            throw new OptimizerException(msg, errCode, PigException.BUG, e);
        }
    }    
}