/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.newplan.logical.rules; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.pig.data.DataType; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.newplan.DependencyOrderWalker; import org.apache.pig.newplan.Operator; import org.apache.pig.newplan.OperatorPlan; import org.apache.pig.newplan.OperatorSubPlan; import org.apache.pig.newplan.ReverseDependencyOrderWalker; import org.apache.pig.newplan.logical.expression.LogicalExpression; import org.apache.pig.newplan.logical.expression.LogicalExpressionPlan; import org.apache.pig.newplan.logical.expression.LogicalExpressionVisitor; import org.apache.pig.newplan.logical.expression.MapLookupExpression; import org.apache.pig.newplan.logical.expression.UserFuncExpression; import org.apache.pig.newplan.logical.optimizer.AllExpressionVisitor; import org.apache.pig.newplan.logical.relational.LOCogroup; import org.apache.pig.newplan.logical.relational.LOFilter; import org.apache.pig.newplan.logical.relational.LOGenerate; import org.apache.pig.newplan.logical.relational.LOJoin; import org.apache.pig.newplan.logical.relational.LOLoad; import org.apache.pig.newplan.logical.relational.LOSort; import org.apache.pig.newplan.logical.relational.LOSplitOutput; import org.apache.pig.newplan.logical.relational.LOStore; import org.apache.pig.newplan.logical.relational.LOUnion; import org.apache.pig.newplan.logical.relational.LogicalRelationalOperator; import org.apache.pig.newplan.logical.relational.LogicalSchema; import org.apache.pig.newplan.logical.relational.LogicalSchema.LogicalFieldSchema; /** * This filter Marks every Load Operator which has a Map * with MAP_MARKER_ANNOTATION. The annotation value is * <code>Map<Integer,Set<String>><code> where Integer is the column number * of the field and Set is the set of Keys in this field ( field is a map field only ). * * It does this for only the top level schema in load. * * Algorithm: * Traverse the Plan in ReverseDependency order ( ie. Sink to Source ) * For LogicalRelationalOperators having MapLookupExpression in their * expressionPlan collect uid and keys related to it. This is * retained in the visitor * For ForEach having nested LogicalPlan use the same visitor hence * there is no distinction required * At Sources find all the uids provided by this source and annotate this * LogicalRelationalOperator ( load ) with <code>Map<Integer,Set<String>></code> * containing only the column numbers that this LogicalRelationalOperator generates * * NOTE: This is a simple Map Pruner. If a map key is mentioned in the script * then this pruner assumes you need the key. This pruner is not as optimized * as column pruner ( which removes a column if it is mentioned but never used ) * */ public class MapKeysPruneHelper { public static final String REQUIRED_MAPKEYS = "MapPruner:RequiredKeys"; private OperatorPlan currentPlan; private OperatorSubPlan subplan; public MapKeysPruneHelper(OperatorPlan currentPlan) { this.currentPlan = currentPlan; if (currentPlan instanceof OperatorSubPlan) { subplan = new OperatorSubPlan(((OperatorSubPlan)currentPlan).getBasePlan()); } else { subplan = new OperatorSubPlan(currentPlan); } } @SuppressWarnings("unchecked") public boolean check() throws FrontendException { // First check if we have a load with a map in it or not List<Operator> sources = currentPlan.getSources(); for( Operator source : sources ) { LogicalSchema schema = ((LogicalRelationalOperator)source).getSchema(); // If any of the loads has a null schema we dont know the ramifications here // so we skip this optimization if( schema == null ) { return false; } } // Now we check what keys are needed MapMarker marker = new MapMarker(currentPlan); marker.visit(); // If the uid is the input uid of LOStore, LOCogroup, LOUnion, UserFunc, that means // the entire map may be used. For simplicity, we do not prune any map key in this case Set<Long> fullMapUids = new HashSet<Long>(); FullMapCollector collector = new FullMapCollector(currentPlan, fullMapUids); collector.visit(); // If we have found specific keys which are needed then we return true; // Else if we dont have any specific keys we return false boolean hasAnnotation = false; for( Operator source : sources ) { Map<Integer,Set<String>> annotationValue = (Map<Integer, Set<String>>) ((LogicalRelationalOperator)source).getAnnotation(REQUIRED_MAPKEYS); // Now for all full maps found in sinks we cannot prune them at source if( ! fullMapUids.isEmpty() && annotationValue != null && !annotationValue.isEmpty() ) { Integer[] annotationKeyArray = annotationValue.keySet().toArray( new Integer[0] ); LogicalSchema sourceSchema = ((LogicalRelationalOperator)source).getSchema(); for( Integer col : annotationKeyArray ) { if( fullMapUids.contains(sourceSchema.getField(col).uid)) { annotationValue.remove( col ); } } } if ( annotationValue != null && annotationValue.isEmpty()) { ((LogicalRelationalOperator)source).removeAnnotation(REQUIRED_MAPKEYS); annotationValue = null; } // Can we still prune any keys if( annotationValue != null ) { hasAnnotation = true; subplan.add(source); } } // If all the sinks dont have any schema, we cant to any optimization return hasAnnotation; } /** * This function checks if the schema has a map. * We dont check for a nested structure. * @param schema Schema to be checked * @return true if it has a map, else false * @throws NullPointerException incase Schema is null */ private boolean hasMap(LogicalSchema schema ) { for( LogicalFieldSchema field : schema.getFields() ) { if( field.type == DataType.MAP ) { return true; } } return false; } /** * This function returns a set of Uids corresponding to * map datatype in the first level of this schema * @param schema Schema having fields * @return */ private static Set<Long> getMapUids(LogicalSchema schema ) { Set<Long> uids = new HashSet<Long>(); if( schema != null ) { for( LogicalFieldSchema field : schema.getFields() ) { uids.add( field.uid ); } } return uids; } public OperatorPlan reportChanges() { return subplan; } /** * This class collects all the information required to create * the list of keys required for a map */ static public class MapMarker extends AllExpressionVisitor { Map<Long,Set<String>> inputUids = null; protected MapMarker(OperatorPlan plan) throws FrontendException { super(plan, new ReverseDependencyOrderWalker(plan)); inputUids = new HashMap<Long,Set<String>>(); } @Override public void visit(LOLoad load) throws FrontendException { if( load.getSchema() != null ) { Map<Integer,Set<String>> annotation = new HashMap<Integer,Set<String>>(); for( int i=0; i<load.getSchema().size(); i++) { LogicalFieldSchema field = load.getSchema().getField(i); if( inputUids.containsKey( field.uid ) ) { annotation.put(i, inputUids.get( field.uid ) ); } } load.annotate(REQUIRED_MAPKEYS, annotation); } } @Override public void visit(LOFilter filter) throws FrontendException { currentOp = filter; MapExprMarker v = (MapExprMarker) getVisitor(filter.getFilterPlan()); v.visit(); mergeUidKeys( v.inputUids ); } @Override public void visit(LOJoin join) throws FrontendException { currentOp = join; Collection<LogicalExpressionPlan> c = join.getExpressionPlanValues(); for (LogicalExpressionPlan plan : c) { MapExprMarker v = (MapExprMarker) getVisitor(plan); v.visit(); mergeUidKeys( v.inputUids ); } } @Override public void visit(LOGenerate gen) throws FrontendException { currentOp = gen; Collection<LogicalExpressionPlan> plans = gen.getOutputPlans(); for( LogicalExpressionPlan plan : plans ) { MapExprMarker v = (MapExprMarker) getVisitor(plan); v.visit(); mergeUidKeys( v.inputUids ); } } @Override public void visit(LOSort sort) throws FrontendException { currentOp = sort; Collection<LogicalExpressionPlan> c = sort.getSortColPlans(); for (LogicalExpressionPlan plan : c) { MapExprMarker v = (MapExprMarker) getVisitor(plan); v.visit(); mergeUidKeys( v.inputUids ); } } @Override public void visit(LOSplitOutput splitOutput) throws FrontendException { currentOp = splitOutput; MapExprMarker v = (MapExprMarker) getVisitor(splitOutput.getFilterPlan()); v.visit(); mergeUidKeys( v.inputUids ); if (splitOutput.getSchema()!=null) { for (LogicalFieldSchema fs : splitOutput.getSchema().getFields()) { long inputUid = splitOutput.getInputUids(fs.uid); if( inputUid!=-1) { Set<String> mapKeySet = inputUids.get(fs.uid); if (mapKeySet!=null) { if (inputUids.containsKey(inputUid)) inputUids.get(inputUid).addAll(mapKeySet); else inputUids.put(inputUid, mapKeySet); } } } } } private void mergeUidKeys( Map<Long, Set<String> > inputMap ) { for( Map.Entry<Long, Set<String>> entry : inputMap.entrySet() ) { if( inputUids.containsKey(entry.getKey()) ) { Set<String> mapKeySet = inputUids.get(entry.getKey()); mapKeySet.addAll(entry.getValue()); } else { inputUids.put(entry.getKey(), inputMap.get(entry.getKey())); } } } @Override protected LogicalExpressionVisitor getVisitor(LogicalExpressionPlan expr) throws FrontendException { return new MapExprMarker(expr ); } static class MapExprMarker extends LogicalExpressionVisitor { Map<Long,Set<String>> inputUids = null; protected MapExprMarker(OperatorPlan p) throws FrontendException { super(p, new DependencyOrderWalker(p)); inputUids = new HashMap<Long,Set<String>>(); } @Override public void visit(MapLookupExpression op) throws FrontendException { Long uid = op.getMap().getFieldSchema().uid; String key = op.getLookupKey(); HashSet<String> mapKeySet = null; if( inputUids.containsKey(uid) ) { mapKeySet = (HashSet<String>) inputUids.get(uid); } else { mapKeySet = new HashSet<String>(); inputUids.put(uid, mapKeySet); } mapKeySet.add(key); } } } static public class FullMapCollector extends AllExpressionVisitor { Set<Long> fullMapUids = new HashSet<Long>(); protected FullMapCollector(OperatorPlan plan, Set<Long> fullMapUids) throws FrontendException { super(plan, new ReverseDependencyOrderWalker(plan)); this.fullMapUids = fullMapUids; } @Override public void visit(LOStore store) throws FrontendException { super.visit(store); Set<Long> uids = getMapUids(store.getSchema()); fullMapUids.addAll(uids); } @SuppressWarnings("unchecked") @Override public void visit(LOUnion union) throws FrontendException { super.visit(union); List<Operator> preds = plan.getPredecessors(union); if (preds!=null) { for (Operator pred : preds) { LogicalSchema schema = ((LogicalRelationalOperator)pred).getSchema(); Set<Long> uids = getMapUids(schema); fullMapUids.addAll(uids); } } } @SuppressWarnings("unchecked") @Override public void visit(LOCogroup cogroup) throws FrontendException { super.visit(cogroup); List<Operator> preds = plan.getPredecessors(cogroup); if (preds!=null) { for (Operator pred : preds) { LogicalSchema schema = ((LogicalRelationalOperator)pred).getSchema(); Set<Long> uids = getMapUids(schema); fullMapUids.addAll(uids); } } } @Override public void visit(LOSplitOutput splitOutput) throws FrontendException { super.visit(splitOutput); if (splitOutput.getSchema()!=null) { for (LogicalFieldSchema fs : splitOutput.getSchema().getFields()) { if (fullMapUids.contains(fs.uid) && splitOutput.getInputUids(fs.uid)!=-1) fullMapUids.add(splitOutput.getInputUids(fs.uid)); } } } @Override protected LogicalExpressionVisitor getVisitor(LogicalExpressionPlan expr) throws FrontendException { return new FullMapExpCollector(expr, fullMapUids); } static class FullMapExpCollector extends LogicalExpressionVisitor { Set<Long> fullMapUids = new HashSet<Long>(); protected FullMapExpCollector(OperatorPlan plan, Set<Long> fullMapUids) throws FrontendException { super(plan, new DependencyOrderWalker(plan)); this.fullMapUids = fullMapUids; } @Override public void visit(UserFuncExpression userFunc) throws FrontendException { List<Operator> succs = userFunc.getPlan().getSuccessors(userFunc); if (succs==null) return; LogicalExpression succ = (LogicalExpression)succs.get(0); if (succ.getFieldSchema()!=null && succ.getFieldSchema().type==DataType.MAP) fullMapUids.add(succ.getFieldSchema().uid); } } } }