PartitionFilterOptimizer.java example

Explorer
hadoop-pig-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.logicalLayer.optimizer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.PigException;
import org.apache.pig.Expression.BinaryExpression;
import org.apache.pig.Expression.Column;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.LOFilter;
import org.apache.pig.impl.logicalLayer.LOLoad;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.logicalLayer.PColFilterExtractor;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.plan.optimizer.OptimizerException;

/**
 * When the load statement in a pig script is loading a table from a metadata
 * system (like owl), the load can be followed by a filter which can contain
 * conditions on partition columns. This filter can also contain conditions on
 * non partition columns. This optimizer looks at the logical plan and checks if
 * there is a load followed by such a filter which has conditions on partition
 * columns. If so, it extracts the conditions on partition columns out of the
 * filter.
 */
public class PartitionFilterOptimizer extends
        LogicalTransformer {
    
    /**
     * the partition keys returned by {@code LoadMetadata.getPartitionKeys()}
     * for the load matched by the last successful {@link #check(List)}
     */
    private String[] partitionKeys;
    
    /**
     * a reference to the LoadMetadata implementation 
     */
    private LoadMetadata loadMetadata;

    /**
     * a reference to the LoadFunc implementation
     */
    private LoadFunc loadFunc;
    
    /**
     * the load operator matched by the last call to {@link #check(List)}
     */
    private LOLoad loLoad;

    /**
     * the filter operator which is the sole successor of {@link #loLoad}
     */
    private LOFilter loFilter;
    
    /**
     * to ensure we only do the optimization once for performance reasons
     */
    private Set<LogicalOperator> alreadyChecked = new HashSet<LogicalOperator>();
    
    /**
     * a map between column names as present in 
     * {@link LOLoad#getDeterminedSchema()} (the names reported by the load
     * function) and as present in {@link LOLoad#getSchema()}. The two will be
     * different when the user has provided a schema in the load statement
     */
    private Map<String, String> colNameMap = new HashMap<String, String>();
    
    /**
     * a map between column names as present in {@link LOLoad#getSchema()} and
     * as present in {@link LOLoad#getDeterminedSchema()} (the names reported
     * by the load function). The two will be different when the user has
     * provided a schema in the load statement.
     */
    private Map<String, String> reverseColNameMap = new HashMap<String, String>();
    

    protected PartitionFilterOptimizer(LogicalPlan plan) {
        super(plan);
    }

    /**
     * Checks whether the given node is a load which is followed by exactly one
     * filter and whose load function implements {@link LoadMetadata} and
     * reports at least one partition key. On success the matched load, filter,
     * load function and partition keys are remembered for 
     * {@link #transform(List)}.
     * 
     * @param nodes the candidate operators; must contain exactly one 
     * {@link LOLoad} for the check to succeed
     * @return true if the load-filter pattern was found and the load reports
     * partition keys, false otherwise (including when the load was already 
     * checked before)
     * @throws OptimizerException if nodes is null or empty, or if the 
     * partition keys could not be retrieved
     */
    @Override
    public boolean check(List<LogicalOperator> nodes) throws OptimizerException 
    {
        if((nodes == null) || (nodes.size() <= 0)) {
            int errCode = 2052;
            String msg = "Internal error. Cannot retrieve operator from null " +
                    "or empty list.";
            throw new OptimizerException(msg, errCode, PigException.BUG);
        }
        if(nodes.size() != 1 || !(nodes.get(0) instanceof LOLoad)) {
            return false;
        }
        if (!alreadyChecked.add(nodes.get(0))) {
            return false;
        }
        // the instanceof guard above guarantees this cast is safe
        loLoad = (LOLoad)nodes.get(0);

        List<LogicalOperator> sucs = mPlan.getSuccessors(loLoad);
        if(sucs == null || sucs.size() != 1 || !(sucs.get(0) instanceof LOFilter)) {
            return false;
        }
        loFilter = (LOFilter)sucs.get(0);
        
        // we have to check more only if LoadFunc implements LoadMetadata
        loadFunc = loLoad.getLoadFunc();
        if(!(loadFunc instanceof LoadMetadata)) {
            return false;
        }
        loadMetadata = (LoadMetadata)loadFunc;
        try {
            partitionKeys = loadMetadata.getPartitionKeys(
                    loLoad.getInputFile().getFileName(), new Job(loLoad.getConfiguration()));
            if(partitionKeys == null || partitionKeys.length == 0) {
                return false;
            }
        } catch (IOException e) {
            int errCode = 2209;
            // keep the original IOException as the cause so that the real
            // reason for the failure is not lost
            throw new OptimizerException(
                    "Internal error while processing any partition filter " +
                    "conditions in the filter after the load" ,
                    errCode,
                    PigException.BUG,
                    e
            );
        }
        
        // we found a load-filter pattern where the load returns partition keys
        return true;
    }

    /**
     * Extracts the conditions on partition columns from the filter matched in
     * {@link #check(List)}, hands them to the load function through
     * {@link LoadMetadata#setPartitionFilter(Expression)} and removes the 
     * filter from the plan if the extractor reports it as removable. This
     * method works on the state set up by {@link #check(List)}; the argument 
     * is not consulted.
     * 
     * @param nodes not used
     * @throws OptimizerException wrapping any exception raised while 
     * extracting or pushing the partition filter
     */
    @Override
    public void transform(List<LogicalOperator> nodes)
            throws OptimizerException {
        try {
            setupColNameMaps();
            PColFilterExtractor pColFilterFinder = new PColFilterExtractor(
                    loFilter.getComparisonPlan(), getMappedKeys(partitionKeys));
            pColFilterFinder.visit();
            Expression partitionFilter = pColFilterFinder.getPColCondition();
            if(partitionFilter != null) {
                // the column names in the filter may be the ones provided by
                // the user in the schema in the load statement - we may need
                // to replace them with partition column names as given by
                // LoadFunc.getSchema()
                updateMappedColNames(partitionFilter);
                loadMetadata.setPartitionFilter(partitionFilter);
                if(pColFilterFinder.isFilterRemovable()) {
                    // remove this filter from the plan                  
                    mPlan.removeAndReconnect(loFilter);
                }
            }
        } catch (Exception e) {
            int errCode = 2209;
            throw new OptimizerException(
                    "Internal error while processing any partition filter " +
                    "conditions in the filter after the load:" ,
                    errCode,
                    PigException.BUG,
                    e
            );
        }
    }
    
    

    /**
     * Walks the expression and renames every {@link Column} in it from the 
     * name used in {@link LOLoad#getSchema()} to the name used in
     * {@link LOLoad#getDeterminedSchema()}. A column for which no mapping is
     * known keeps its current name.
     * 
     * @param expr the expression to update in place
     */
    private void updateMappedColNames(Expression expr) {
        if(expr instanceof BinaryExpression) {
            updateMappedColNames(((BinaryExpression) expr).getLhs());
            updateMappedColNames(((BinaryExpression) expr).getRhs());
        } else if (expr instanceof Column) {
            Column col = (Column) expr;
            String loadFuncName = reverseColNameMap.get(col.getName());
            // never overwrite a valid column name with null when the name
            // has no known mapping
            if(loadFuncName != null) {
                col.setName(loadFuncName);
            }
        }
    }

    /**
     * The partition keys in the argument are as reported by 
     * {@code LoadMetadata.getPartitionKeys()}.
     * The user may have renamed these by providing a schema with different names
     * in the load statement - this method will replace the former names with
     * the latter names.
     * @param partitionKeys the partition keys as reported by the load function
     * @return the names of the partition keys as looked up in 
     * {@link #colNameMap}, in the same order as the argument; an entry is null
     * if the key has no mapping
     */
    private List<String> getMappedKeys(String[] partitionKeys) {
        List<String> mappedKeys = new ArrayList<String>(partitionKeys.length);
        for (String partitionKey : partitionKeys) {
            mappedKeys.add(colNameMap.get(partitionKey));
        }
        return mappedKeys;
    }

    
    
    /**
     * Fills {@link #colNameMap} and {@link #reverseColNameMap} for the current
     * load by pairing, position by position, the fields of
     * {@link LOLoad#getDeterminedSchema()} with the fields of 
     * {@link LOLoad#getSchema()}. A field which has no counterpart in 
     * {@link LOLoad#getSchema()} is mapped to itself.
     * 
     * @throws FrontendException if a schema field cannot be retrieved
     */
    private void setupColNameMaps() throws FrontendException {
        // this optimizer instance is used for more than one load - drop the
        // names of any previously processed load so that they cannot be 
        // picked up for the current one
        colNameMap.clear();
        reverseColNameMap.clear();

        Schema loadFuncSchema = loLoad.getDeterminedSchema();
        Schema loLoadSchema = loLoad.getSchema();
        for(int i = 0; i < loadFuncSchema.size(); i++) {
            String loadFuncAlias = loadFuncSchema.getField(i).alias;
            String loLoadAlias = (i < loLoadSchema.size()) 
                    ? loLoadSchema.getField(i).alias 
                    : loadFuncAlias;
            
            colNameMap.put(loadFuncAlias, loLoadAlias);
            reverseColNameMap.put(loLoadAlias, loadFuncAlias);
        }
    }

}