/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.impl.logicalLayer.optimizer; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.hadoop.mapreduce.Job; import org.apache.pig.Expression; import org.apache.pig.LoadFunc; import org.apache.pig.LoadMetadata; import org.apache.pig.PigException; import org.apache.pig.Expression.BinaryExpression; import org.apache.pig.Expression.Column; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.LOFilter; import org.apache.pig.impl.logicalLayer.LOLoad; import org.apache.pig.impl.logicalLayer.LogicalOperator; import org.apache.pig.impl.logicalLayer.LogicalPlan; import org.apache.pig.impl.logicalLayer.PColFilterExtractor; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.plan.optimizer.OptimizerException; /** * When the load statement in a pig script is loading a table from a meta data * system (like owl), the load can be followed by a filter which can contain * conditions on partition columns. This filter can also contain conditions on * non partition columns. This optimizer looks at the logical plan and checks if * there is a load followed by such a filter which has conditions on partition * columns. If so, it extracts the conditions on partition columns out of the * filter. */ public class PartitionFilterOptimizer extends LogicalTransformer { private String[] partitionKeys; /** * a reference to the LoadMetada implementation */ private LoadMetadata loadMetadata; /** * a reference to the LoadFunc implementation */ private LoadFunc loadFunc; private LOLoad loLoad; private LOFilter loFilter; /** * to ensure we only do the optimization once for performance reasons */ private Set<LogicalOperator> alreadyChecked = new HashSet<LogicalOperator>(); /** * a map between column names as reported in * {@link LoadMetadata#getSchema(String, org.apache.hadoop.conf.Configuration)} * and as present in {@link LOLoad#getSchema()}. The two will be different * when the user has provided a schema in the load statement */ private Map<String, String> colNameMap = new HashMap<String, String>(); /** * a map between column nameas as present in {@link LOLoad#getSchema()} and * as reported in * {@link LoadMetadata#getSchema(String, org.apache.hadoop.conf.Configuration)}. * The two will be different when the user has provided a schema in the * load statement. */ private Map<String, String> reverseColNameMap = new HashMap<String, String>(); protected PartitionFilterOptimizer(LogicalPlan plan) { super(plan); } @Override public boolean check(List<LogicalOperator> nodes) throws OptimizerException { if((nodes == null) || (nodes.size() <= 0)) { int errCode = 2052; String msg = "Internal error. Cannot retrieve operator from null " + "or empty list."; throw new OptimizerException(msg, errCode, PigException.BUG); } if(nodes.size() != 1|| !(nodes.get(0) instanceof LOLoad )) { return false; } if (!alreadyChecked.add(nodes.get(0))) { return false; } if(nodes.get(0) instanceof LOLoad) { loLoad = (LOLoad)nodes.get(0); } List<LogicalOperator> sucs = mPlan.getSuccessors(loLoad); if(sucs == null || sucs.size() != 1 || !(sucs.get(0) instanceof LOFilter)) { return false; } loFilter = (LOFilter)sucs.get(0); // we have to check more only if LoadFunc implements LoadMetada loadFunc = loLoad.getLoadFunc(); if(!(loadFunc instanceof LoadMetadata)) { return false; } loadMetadata = (LoadMetadata)loadFunc; try { partitionKeys = loadMetadata.getPartitionKeys( loLoad.getInputFile().getFileName(), new Job(loLoad.getConfiguration())); if(partitionKeys == null || partitionKeys.length == 0) { return false; } } catch (IOException e) { int errCode = 2209; throw new OptimizerException( "Internal error while processing any partition filter " + "conditions in the filter after the load" , errCode, PigException.BUG ); } // we found a load-filter pattern where the load returns partition keys return true; } @Override public void transform(List<LogicalOperator> nodes) throws OptimizerException { try { setupColNameMaps(); PColFilterExtractor pColFilterFinder = new PColFilterExtractor( loFilter.getComparisonPlan(), getMappedKeys(partitionKeys)); pColFilterFinder.visit(); Expression partitionFilter = pColFilterFinder.getPColCondition(); if(partitionFilter != null) { // the column names in the filter may be the ones provided by // the user in the schema in the load statement - we may need // to replace them with partition column names as given by // LoadFunc.getSchema() updateMappedColNames(partitionFilter); loadMetadata.setPartitionFilter(partitionFilter); if(pColFilterFinder.isFilterRemovable()) { // remove this filter from the plan mPlan.removeAndReconnect(loFilter); } } } catch (Exception e) { int errCode = 2209; throw new OptimizerException( "Internal error while processing any partition filter " + "conditions in the filter after the load:" , errCode, PigException.BUG, e ); } } /** * @param expr */ private void updateMappedColNames(Expression expr) { if(expr instanceof BinaryExpression) { updateMappedColNames(((BinaryExpression) expr).getLhs()); updateMappedColNames(((BinaryExpression) expr).getRhs()); } else if (expr instanceof Column) { Column col = (Column) expr; col.setName(reverseColNameMap.get(col.getName())); } } /** * The partition keys in the argument are as reported by * {@link LoadMetadata#getPartitionKeys(String, org.apache.hadoop.conf.Configuration)}. * The user may have renamed these by providing a schema with different names * in the load statement - this method will replace the former names with * the latter names. * @param partitionKeys * @return */ private List<String> getMappedKeys(String[] partitionKeys) { List<String> mappedKeys = new ArrayList<String>(partitionKeys.length); for (int i = 0; i < partitionKeys.length; i++) { mappedKeys.add(colNameMap.get(partitionKeys[i])); } return mappedKeys; } /** * @throws FrontendException * */ private void setupColNameMaps() throws FrontendException { Schema loadFuncSchema = loLoad.getDeterminedSchema(); Schema loLoadSchema = loLoad.getSchema(); for(int i = 0; i < loadFuncSchema.size(); i++) { colNameMap.put(loadFuncSchema.getField(i).alias, (i < loLoadSchema.size() ? loLoadSchema.getField(i).alias : loadFuncSchema.getField(i).alias)); reverseColNameMap.put((i < loLoadSchema.size() ? loLoadSchema.getField(i).alias : loadFuncSchema.getField(i).alias), loadFuncSchema.getField(i).alias); } } }