/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.impl.logicalLayer; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pig.PigException; import org.apache.pig.data.DataType; import org.apache.pig.impl.logicalLayer.optimizer.SchemaRemover; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException; import org.apache.pig.impl.plan.Operator; import org.apache.pig.impl.plan.OperatorKey; import org.apache.pig.impl.plan.PlanException; import org.apache.pig.impl.plan.ProjectionMap; import org.apache.pig.impl.plan.RequiredFields; import org.apache.pig.impl.plan.VisitorException; import org.apache.pig.impl.util.MultiMap; import org.apache.pig.impl.util.Pair; public class LOForEach extends RelationalOperator { private static final long serialVersionUID = 2L; /** * The foreach operator supports nested query plans. At this point its one * level of nesting. Foreach can have a list of operators that need to be * applied over the input. */ private ArrayList<LogicalPlan> mForEachPlans; private ArrayList<Boolean> mFlatten; private ArrayList<Schema> mUserDefinedSchema = null; private static Log log = LogFactory.getLog(LOForEach.class); // Cache the information of generating inner plan for each output schema while generating output schema, // for later use in caculate relevant field private List<LogicalPlan> mSchemaPlanMapping = new ArrayList<LogicalPlan>(); /** * @param plan * Logical plan this operator is a part of. * @param k * Operator key to assign to this node. * @param foreachPlans * the list of plans that are applied for each input * @param flattenList * boolean list that tells which elements of the foreach * projection should be flattened. */ public LOForEach(LogicalPlan plan, OperatorKey k, ArrayList<LogicalPlan> foreachPlans, ArrayList<Boolean> flattenList) { super(plan, k); mForEachPlans = foreachPlans; mFlatten = flattenList; } public LOForEach(LogicalPlan plan, OperatorKey k, ArrayList<LogicalPlan> foreachPlans, ArrayList<Boolean> flattenList, ArrayList<Schema> userDefinedSchemaList) { super(plan, k); mForEachPlans = foreachPlans; mFlatten = flattenList; mUserDefinedSchema = userDefinedSchemaList; } public ArrayList<LogicalPlan> getForEachPlans() { return mForEachPlans; } public void setForEachPlans(ArrayList<LogicalPlan> foreachPlans) { mForEachPlans = foreachPlans; } public List<Boolean> getFlatten() { return mFlatten; } public void setFlatten(ArrayList<Boolean> flattenList) { mFlatten = flattenList; } public List<Schema> getUserDefinedSchema() { return mUserDefinedSchema; } public void setUserDefinedSchema(ArrayList<Schema> userDefinedSchema) { mUserDefinedSchema = userDefinedSchema; } @Override public String name() { return getAliasString() + "ForEach " + mKey.scope + "-" + mKey.id; } @Override public boolean supportsMultipleInputs() { return false; } @Override public void visit(LOVisitor v) throws VisitorException { v.visit(this); } public byte getType() { return DataType.BAG ; } private void updateAliasCount(Map<String, Integer> aliases, String alias) { if((null == aliases) || (null == alias)) return; Integer count = aliases.get(alias); if(null == count) { aliases.put(alias, 1); } else { aliases.put(alias, count + 1); } } @Override public Schema getSchema() throws FrontendException { log.debug("Entering getSchema"); if (!mIsSchemaComputed) { List<Schema.FieldSchema> fss = new ArrayList<Schema.FieldSchema>( mForEachPlans.size()); mSchemaPlanMapping = new ArrayList<LogicalPlan>(); for (LogicalPlan plan : mForEachPlans) { log.debug("Number of leaves in " + plan + " = " + plan.getLeaves().size()); for(int i = 0; i < plan.getLeaves().size(); ++i) { log.debug("Leaf" + i + "= " + plan.getLeaves().get(i)); } //LogicalOperator op = plan.getRoots().get(0); LogicalOperator op = plan.getLeaves().get(0); log.debug("op: " + op.getClass().getName() + " " + op); } log.debug("Printed the leaves of the generate plans"); Map<Schema.FieldSchema, String> flattenAlias = new HashMap<Schema.FieldSchema, String>(); Map<String, Boolean> inverseFlattenAlias = new HashMap<String, Boolean>(); Map<String, Integer> aliases = new HashMap<String, Integer>(); for (int planCtr = 0; planCtr < mForEachPlans.size(); ++planCtr) { LogicalPlan plan = mForEachPlans.get(planCtr); LogicalOperator op = plan.getLeaves().get(0); log.debug("op: " + op.getClass().getName() + " " + op); log.debug("Flatten: " + mFlatten.get(planCtr)); Schema.FieldSchema planFs; if(op instanceof LOProject) { //the check for the type is required for statements like //foreach cogroup { // a1 = order a by *; // generate a1; //} //In the above script, the generate a1, will translate to //project(a1) -> project(*) and will not be translated to a sequence of projects //As a result the project(*) will remain but the return type is a bag //project(*) with a data type set to tuple indicates a project(*) from an input //that has no schema if( (((LOProject)op).isStar() ) && (((LOProject)op).getType() == DataType.TUPLE) ) { mSchema = null; mIsSchemaComputed = true; return mSchema; } } try { planFs = ((ExpressionOperator)op).getFieldSchema(); log.debug("planFs: " + planFs); Schema userDefinedSchema = null; if(null != mUserDefinedSchema) { userDefinedSchema = mUserDefinedSchema.get(planCtr); } if(null != planFs) { String outerCanonicalAlias = op.getAlias(); if(null == outerCanonicalAlias) { outerCanonicalAlias = planFs.alias; } log.debug("Outer canonical alias: " + outerCanonicalAlias); if(mFlatten.get(planCtr)) { //need to extract the children and create the aliases //assumption here is that flatten is only for one column //i.e., flatten(A), flatten(A.x) and NOT //flatten(B.(x,y,z)) Schema s = planFs.schema; if(null != s && s.isTwoLevelAccessRequired()) { // this is the case where the schema is that of // a bag which has just one tuple fieldschema which // in turn has a list of fieldschemas. The schema // after flattening would consist of the fieldSchemas // present in the tuple // check that indeed we only have one field schema // which is that of a tuple if(s.getFields().size() != 1) { int errCode = 1008; String msg = "Expected a bag schema with a single " + "element of type "+ DataType.findTypeName(DataType.TUPLE) + " but got a bag schema with multiple elements."; throw new FrontendException(msg, errCode, PigException.INPUT, false, null); } Schema.FieldSchema tupleFS = s.getField(0); if(tupleFS.type != DataType.TUPLE) { int errCode = 1009; String msg = "Expected a bag schema with a single " + "element of type "+ DataType.findTypeName(DataType.TUPLE) + " but got an element of type " + DataType.findTypeName(tupleFS.type); throw new FrontendException(msg, errCode, PigException.INPUT, false, null); } s = tupleFS.schema; } if(null != s && s.size()!=0) { for(int i = 0; i < s.size(); ++i) { Schema.FieldSchema fs; fs = Schema.FieldSchema.copyAndLink(s.getField(i), op); log.debug("fs: " + fs); if(null != userDefinedSchema) { Schema.FieldSchema userDefinedFieldSchema; try { if(i < userDefinedSchema.size()) { userDefinedFieldSchema = userDefinedSchema.getField(i); fs = fs.mergePrefixFieldSchema(userDefinedFieldSchema); } } catch (SchemaMergeException sme) { int errCode = 1016; String msg = "Problems in merging user defined schema"; throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme); } outerCanonicalAlias = null; } String innerCanonicalAlias = fs.alias; Schema.FieldSchema newFs; if((null != outerCanonicalAlias) && (null != innerCanonicalAlias)) { String disambiguatorAlias = outerCanonicalAlias + "::" + innerCanonicalAlias; newFs = new Schema.FieldSchema(disambiguatorAlias, fs.schema, fs.type); newFs.setParent(s.getField(i).canonicalName, op); fss.add(newFs); mSchemaPlanMapping.add(plan); updateAliasCount(aliases, disambiguatorAlias); //it's fine if there are duplicates //we just need to record if its due to //flattening } else { newFs = new Schema.FieldSchema(fs); newFs.setParent(s.getField(i).canonicalName, op); fss.add(newFs); mSchemaPlanMapping.add(plan); } updateAliasCount(aliases, innerCanonicalAlias); flattenAlias.put(newFs, innerCanonicalAlias); inverseFlattenAlias.put(innerCanonicalAlias, true); } } else { Schema.FieldSchema newFs; if(null != userDefinedSchema) { if(!DataType.isSchemaType(planFs.type)) { if(userDefinedSchema.size() > 1) { int errCode = 1017; String msg = "Schema mismatch. A basic type on flattening cannot have more than one column. User defined schema: " + userDefinedSchema; throw new FrontendException(msg, errCode, PigException.INPUT, false, null); } newFs = new Schema.FieldSchema(null, planFs.type); try { newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0)); } catch (SchemaMergeException sme) { int errCode = 1016; String msg = "Problems in merging user defined schema"; throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme); } updateAliasCount(aliases, newFs.alias); fss.add(newFs); mSchemaPlanMapping.add(plan); newFs.setParent(planFs.canonicalName, op); } else { for(Schema.FieldSchema ufs: userDefinedSchema.getFields()) { Schema.FieldSchema.setFieldSchemaDefaultType(ufs, DataType.BYTEARRAY); newFs = new Schema.FieldSchema(ufs); fss.add(newFs); mSchemaPlanMapping.add(plan); newFs.setParent(null, op); updateAliasCount(aliases, ufs.alias); } } } else { if(!DataType.isSchemaType(planFs.type)) { newFs = new Schema.FieldSchema(planFs.alias, planFs.type); } else { newFs = new Schema.FieldSchema(null, DataType.BYTEARRAY); } fss.add(newFs); mSchemaPlanMapping.add(plan); newFs.setParent( planFs.canonicalName, op ); } } } else { //just populate the schema with the field schema of the expression operator //check if the user has defined a schema for the operator; compare the schema //with that of the expression operator field schema and then add it to the list Schema.FieldSchema newFs = Schema.FieldSchema.copyAndLink(planFs, op); if(null != userDefinedSchema) { try { newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0)); updateAliasCount(aliases, newFs.alias); } catch (SchemaMergeException sme) { int errCode = 1016; String msg = "Problems in merging user defined schema"; throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme); } } newFs.setParent(planFs.canonicalName, op); fss.add(newFs); mSchemaPlanMapping.add(plan); } } else { //did not get a valid list of field schemas String outerCanonicalAlias = null; if(null != userDefinedSchema) { Schema.FieldSchema userDefinedFieldSchema = new Schema.FieldSchema(userDefinedSchema.getField(0)); fss.add(userDefinedFieldSchema); mSchemaPlanMapping.add(plan); userDefinedFieldSchema.setParent(null, op); updateAliasCount(aliases, userDefinedFieldSchema.alias); } else { mSchema = null; mIsSchemaComputed = true; return mSchema; } } } catch (FrontendException fee) { mSchema = null; mIsSchemaComputed = false; throw fee; } } //check for duplicate column names and throw an error if there are duplicates //ensure that flatten gets rid of duplicate column names when the checks are //being done log.debug(" flattenAlias: " + flattenAlias); log.debug(" inverseFlattenAlias: " + inverseFlattenAlias); log.debug(" aliases: " + aliases); log.debug(" fss.size: " + fss.size()); boolean duplicates = false; Map<String, Integer> duplicateAliases = new HashMap<String, Integer>(); for(Map.Entry<String, Integer> e: aliases.entrySet()) { Integer count = e.getValue(); if(count > 1) {//not checking for null here as counts are intitalized to 1 Boolean inFlatten = false; log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias); inFlatten = inverseFlattenAlias.get(e.getKey()); log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias); if((null == inFlatten) || (!inFlatten)) { duplicates = true; duplicateAliases.put(e.getKey(), count); } } } if(duplicates) { String errMessage; StringBuffer sb = new StringBuffer("Found duplicates in schema. "); if(duplicateAliases.size() > 0) { Set<Map.Entry<String, Integer>> es = duplicateAliases.entrySet(); Iterator<Map.Entry<String, Integer>> iter = es.iterator(); Map.Entry<String, Integer> e = iter.next(); sb.append(": "); sb.append(e.getValue()); sb.append(" columns"); while(iter.hasNext()) { e = iter.next(); sb.append(", "); sb.append(e.getKey()); sb.append(": "); sb.append(e.getValue()); sb.append(" columns"); } } sb.append(". Please alias the columns with unique names."); errMessage = sb.toString(); log.debug(errMessage); int errCode = 1007; throw new FrontendException(errMessage, errCode, PigException.INPUT, false, null); } mSchema = new Schema(fss); //add the aliases that are unique after flattening for(int i=0;i<mSchema.getFields().size();i++) { Schema.FieldSchema fs = mSchema.getFields().get(i); String alias = flattenAlias.get(fs); Integer count = aliases.get(alias); if (null == count) count = 1; log.debug("alias: " + alias); if((null != alias) && (count == 1)) { mSchema.addAlias(alias, fs); } } mIsSchemaComputed = true; } log.debug("Exiting getSchema"); return mSchema; } public void unsetSchema() throws VisitorException{ for(LogicalPlan plan: mForEachPlans) { SchemaRemover sr = new SchemaRemover(plan); sr.visit(); } super.unsetSchema(); mSchemaPlanMapping = new ArrayList<LogicalPlan>(); } private void doAllSuccessors(LogicalPlan lp, LogicalOperator node, Set<LogicalOperator> seen, Collection<LogicalOperator> fifo) throws VisitorException { if (!seen.contains(node)) { // We haven't seen this one before. Collection<LogicalOperator> succs = lp.getSuccessors(node); if (succs != null && succs.size() > 0) { // Do all our predecessors before ourself for (LogicalOperator op : succs) { doAllSuccessors(lp, op, seen, fifo); } } // Now do ourself seen.add(node); fifo.add(node); } } public Schema dumpNestedSchema(String alias, String nestedAlias) throws IOException { boolean found = false; // To avoid non-deterministic traversal, // we do a traversal from leaf to root with ReverseDependencyOrderWalker // this way schema we print is always the latest schema in the order in the script // Also, since we do not allow union, join, cogroup, cross etc as part of inner plan // we have a tree (not a DAG) as part of inner plan and hence traversal is simpler for(LogicalPlan lp : mForEachPlans) { // Following walk is highly inefficient as we create a fifo list of all elements // we need to traverse and then check for the suitable element // but should be fine as our innerplans are expected to be small // Also, although we are sure that innerplan is a tree instead of DAG // We keep the algorithm assuming it is DAG, to avoid bugs later // This is borrowed logic from ReverseDependencyOrderWalker ;) List<LogicalOperator> fifo = new ArrayList<LogicalOperator>(); Set<LogicalOperator> seen = new HashSet<LogicalOperator>(); for(LogicalOperator op : lp.getRoots()) { doAllSuccessors(lp, op, seen, fifo); } for(LogicalOperator op: fifo) { if(!(op instanceof LOProject) && nestedAlias.equalsIgnoreCase(op.mAlias)) { found = true; // Expression operators do not have any schema if(op instanceof RelationalOperator) { Schema nestedSc = op.getSchema(); if(nestedSc == null) { System.out.println("Schema for "+ alias+ "::" + nestedAlias + " unknown."); } else { System.out.println(alias+ "::" + nestedAlias + ": " + nestedSc.toString()); } return nestedSc; } else { int errCode = 1113; String msg = "Describe nested expression is not supported"; throw new FrontendException (msg, errCode, PigException.INPUT, false, null); } } } } if(!found) { int errCode = 1114; String msg = "Unable to find schema for nested alias "+ nestedAlias; throw new FrontendException (msg, errCode, PigException.INPUT, false, null); } return null; } /** * @see org.apache.pig.impl.plan.Operator#clone() * Do not use the clone method directly. Operators are cloned when logical plans * are cloned using {@link LogicalPlanCloner} */ @Override protected Object clone() throws CloneNotSupportedException { // Do generic LogicalOperator cloning LOForEach forEachClone = (LOForEach)super.clone(); // create deep copies of attributes specific to foreach if(mFlatten != null) { forEachClone.mFlatten = new ArrayList<Boolean>(); for (Iterator<Boolean> it = mFlatten.iterator(); it.hasNext();) { forEachClone.mFlatten.add(Boolean.valueOf(it.next())); } } if(mForEachPlans != null) { forEachClone.mForEachPlans = new ArrayList<LogicalPlan>(); for (Iterator<LogicalPlan> it = mForEachPlans.iterator(); it.hasNext();) { LogicalPlanCloneHelper lpCloneHelper = new LogicalPlanCloneHelper(it.next()); forEachClone.mForEachPlans.add(lpCloneHelper.getClonedPlan()); } } if(mUserDefinedSchema != null) { forEachClone.mUserDefinedSchema = new ArrayList<Schema>(); for (Iterator<Schema> it = mUserDefinedSchema.iterator(); it.hasNext();) { Schema s = it.next(); forEachClone.mUserDefinedSchema.add(s != null ? s.clone() : null); } } return forEachClone; } @Override public ProjectionMap getProjectionMap() { if(mIsProjectionMapComputed) return mProjectionMap; mIsProjectionMapComputed = true; Schema outputSchema; try { outputSchema = getSchema(); } catch (FrontendException fee) { mProjectionMap = null; return mProjectionMap; } if(outputSchema == null) { mProjectionMap = null; return mProjectionMap; } List<LogicalOperator> predecessors = (ArrayList<LogicalOperator>)mPlan.getPredecessors(this); if(predecessors == null) { mProjectionMap = null; return mProjectionMap; } LogicalOperator predecessor = predecessors.get(0); Schema inputSchema; try { inputSchema = predecessor.getSchema(); } catch (FrontendException fee) { mProjectionMap = null; return mProjectionMap; } List<LogicalPlan> foreachPlans = getForEachPlans(); List<Boolean> flattenList = getFlatten(); MultiMap<Integer, ProjectionMap.Column> mapFields = new MultiMap<Integer, ProjectionMap.Column>(); List<Integer> addedFields = new ArrayList<Integer>(); int outputColumn = 0; for(int i = 0; i < foreachPlans.size(); ++i) { LogicalPlan foreachPlan = foreachPlans.get(i); List<LogicalOperator> leaves = foreachPlan.getLeaves(); if(leaves == null || leaves.size() > 1) { mProjectionMap = null; return mProjectionMap; } int inputColumn = -1; boolean mapped = false; LOCast cast = null; if(leaves.get(0) instanceof LOProject || leaves.get(0) instanceof LOCast) { //find out if this project is a chain of projects Pair<LOProject, LOCast> pair = LogicalPlan.chainOfProjects(foreachPlan); if (pair != null) { LOProject topProject = pair.first; cast = pair.second; if (topProject != null) { inputColumn = topProject.getCol(); mapped = true; } } } Schema.FieldSchema leafFS; try { leafFS = ((ExpressionOperator)leaves.get(0)).getFieldSchema(); } catch (FrontendException fee) { mProjectionMap = null; return mProjectionMap; } if(leafFS == null) { mProjectionMap = null; return mProjectionMap; } if(flattenList.get(i)) { Schema innerSchema = leafFS.schema; if(innerSchema != null) { if(innerSchema.isTwoLevelAccessRequired()) { // this is the case where the schema is that of // a bag which has just one tuple fieldschema which // in turn has a list of fieldschemas. The schema // after flattening would consist of the fieldSchemas // present in the tuple // check that indeed we only have one field schema // which is that of a tuple if(innerSchema.getFields().size() != 1) { mProjectionMap = null; return mProjectionMap; } Schema.FieldSchema tupleFS; try { tupleFS = innerSchema.getField(0); } catch (FrontendException fee) { mProjectionMap = null; return mProjectionMap; } if(tupleFS.type != DataType.TUPLE) { mProjectionMap = null; return mProjectionMap; } innerSchema = tupleFS.schema; } //innerSchema could be modified and hence the second check if(innerSchema != null) { for(int j = 0; j < innerSchema.size(); ++j) { if(mapped) { //map each flattened column to the original column if (cast != null) { mapFields.put(outputColumn++, new ProjectionMap.Column( new Pair<Integer, Integer>(0, inputColumn), true, cast.getType() ) ); } else { mapFields.put(outputColumn++, new ProjectionMap.Column(new Pair<Integer, Integer>(0, inputColumn)) ); } } else { addedFields.add(outputColumn++); } } } else { //innerSchema is null if(mapped) { //map each flattened column to the original column if (cast != null) { mapFields.put(outputColumn++, new ProjectionMap.Column( new Pair<Integer, Integer>(0, inputColumn), true, cast.getType() ) ); } else { mapFields.put(outputColumn++, new ProjectionMap.Column(new Pair<Integer, Integer>(0, inputColumn)) ); } } else { addedFields.add(outputColumn++); } } } else { //innerSchema is null if(mapped) { //map each flattened column to the original column if (cast != null) { mapFields.put(outputColumn++, new ProjectionMap.Column( new Pair<Integer, Integer>(0, inputColumn), true, cast.getType() ) ); } else { mapFields.put(outputColumn++, new ProjectionMap.Column(new Pair<Integer, Integer>(0, inputColumn)) ); } } else { addedFields.add(outputColumn++); } } } else { //not a flattened column if(mapped) { if (cast != null) { mapFields.put(outputColumn++, new ProjectionMap.Column( new Pair<Integer, Integer>(0, inputColumn), true, cast.getType() ) ); } else { mapFields.put(outputColumn++, new ProjectionMap.Column(new Pair<Integer, Integer>(0, inputColumn)) ); } } else { addedFields.add(outputColumn++); } } } List<Pair<Integer, Integer>> removedFields = new ArrayList<Pair<Integer, Integer>>(); //if the size of the map is zero then set it to null if(mapFields.size() == 0) { mapFields = null; } if(addedFields.size() == 0) { addedFields = null; } if(inputSchema == null) { //if input schema is null then there are no removedFields removedFields = null; } else { //input schema is not null. Need to compute the removedFields //compute the set difference between the input schema and mapped fields Set<Integer> removedSet = new HashSet<Integer>(); for(int i = 0; i < inputSchema.size(); ++i) { removedSet.add(i); } if(mapFields != null) { Set<Integer> mappedSet = new HashSet<Integer>(); for(Integer key: mapFields.keySet()) { List<ProjectionMap.Column> values = (ArrayList<ProjectionMap.Column>) mapFields.get(key); for (ProjectionMap.Column value : values) { mappedSet.add(value.getInputColumn().second); } } removedSet.removeAll(mappedSet); } if(removedSet.size() == 0) { removedFields = null; } else { for(Integer i: removedSet) { removedFields.add(new Pair<Integer, Integer>(0, i)); } } } mProjectionMap = new ProjectionMap(mapFields, removedFields, addedFields); return mProjectionMap; } @Override public List<RequiredFields> getRequiredFields() { List<RequiredFields> requiredFields = new ArrayList<RequiredFields>(); Set<Pair<Integer, Integer>> fields = new HashSet<Pair<Integer, Integer>>(); Set<LOProject> projectSet = new HashSet<LOProject>(); boolean starRequired = false; for (LogicalPlan plan : getForEachPlans()) { TopLevelProjectFinder projectFinder = new TopLevelProjectFinder( plan); try { projectFinder.visit(); } catch (VisitorException ve) { requiredFields.clear(); requiredFields.add(null); return requiredFields; } projectSet.addAll(projectFinder.getProjectSet()); if(projectFinder.getProjectStarSet() != null) { starRequired = true; } } if(starRequired) { requiredFields.add(new RequiredFields(true)); return requiredFields; } else { for (LOProject project : projectSet) { for (int inputColumn : project.getProjection()) { fields.add(new Pair<Integer, Integer>(0, inputColumn)); } } if(fields.size() == 0) { requiredFields.add(new RequiredFields(false, true)); } else { requiredFields.add(new RequiredFields(new ArrayList<Pair<Integer, Integer>>(fields))); } return (requiredFields.size() == 0? null: requiredFields); } } /* (non-Javadoc) * @see org.apache.pig.impl.plan.Operator#rewire(org.apache.pig.impl.plan.Operator, org.apache.pig.impl.plan.Operator) */ @Override public void rewire(Operator<LOVisitor> oldPred, int oldPredIndex, Operator<LOVisitor> newPred, boolean useOldPred) throws PlanException { super.rewire(oldPred, oldPredIndex, newPred, useOldPred); LogicalOperator previous = (LogicalOperator) oldPred; LogicalOperator current = (LogicalOperator) newPred; for(LogicalPlan plan: mForEachPlans) { try { ProjectFixerUpper projectFixer = new ProjectFixerUpper( plan, previous, oldPredIndex, current, useOldPred, this); projectFixer.visit(); } catch (VisitorException ve) { int errCode = 2144; String msg = "Problem while fixing project inputs during rewiring."; throw new PlanException(msg, errCode, PigException.BUG, ve); } } } /** * A helper method to check if the foreach has a flattened element * * @return true if any of the expressions in the foreach has a flatten; * false otherwise */ public Pair<Boolean, List<Integer>> hasFlatten() { boolean hasFlatten = false; List<Integer> flattenedColumns = new ArrayList<Integer>(); for (int i = 0; i < mFlatten.size(); ++i) { Boolean b = mFlatten.get(i); if (b.equals(true)) { hasFlatten = true; flattenedColumns.add(i); } } return new Pair<Boolean, List<Integer>>(hasFlatten, flattenedColumns); } public LogicalPlan getRelevantPlan(int column) { if (column<0) return null; if (mSchema == null) return null; return mSchemaPlanMapping.get(column); } public boolean isInputFlattened(int column) throws FrontendException { for (int i=0;i<mForEachPlans.size();i++) { LogicalPlan forEachPlan = mForEachPlans.get(i); TopLevelProjectFinder projectFinder = new TopLevelProjectFinder(forEachPlan); projectFinder.visit(); for (LOProject project : projectFinder.getProjectList()) { if (project.getCol()==column) { if (mFlatten.get(i)) return true; } } } return false; } @Override public List<RequiredFields> getRelevantInputs(int output, int column) throws FrontendException { if (!mIsSchemaComputed) getSchema(); if (output!=0) return null; if (column<0) return null; List<RequiredFields> result = new ArrayList<RequiredFields>(); if (mSchema == null) return null; if (mSchema.size()<=column) { return null; } LogicalPlan plan = getRelevantPlan(column); TopLevelProjectFinder projectFinder = new TopLevelProjectFinder( plan); try { projectFinder.visit(); } catch (VisitorException ve) { return null; } if(projectFinder.getProjectStarSet() != null) { result.add(new RequiredFields(true)); return result; } ArrayList<Pair<Integer, Integer>> inputList = new ArrayList<Pair<Integer, Integer>>(); for (LOProject project : projectFinder.getProjectSet()) { for (int inputColumn : project.getProjection()) { if (!inputList.contains(new Pair<Integer, Integer>(0, inputColumn))) inputList.add(new Pair<Integer, Integer>(0, inputColumn)); } } if (inputList.size()==0) return null; result.add(new RequiredFields(inputList)); return result; } @Override public boolean pruneColumns(List<Pair<Integer, Integer>> columns) throws FrontendException { if (!mIsSchemaComputed) getSchema(); if (mSchema == null) { log.warn("Cannot prune columns in foreach, no schema information found"); return false; } List<LogicalOperator> predecessors = mPlan.getPredecessors(this); if (predecessors == null) { int errCode = 2190; throw new FrontendException("Cannot find predecessors for foreach", errCode, PigException.BUG); } if (predecessors.size() != 1) { int errCode = 2193; throw new FrontendException("Foreach can only have 1 predecessor", errCode, PigException.BUG); } if (predecessors.get(0).getSchema() == null) { int errCode = 2194; throw new FrontendException("Expect schema", errCode, PigException.BUG); } for (Pair<Integer, Integer> column : columns) { if (column.first != 0) { int errCode = 2191; throw new FrontendException( "foreach only take 1 input, cannot prune input with index " + column.first, errCode, PigException.BUG); } if (column.second < 0) { int errCode = 2192; throw new FrontendException("Column to prune does not exist", errCode, PigException.BUG); } } List<Integer> planToRemove = new ArrayList<Integer>(); for (int i = 0; i < mForEachPlans.size(); i++) { LogicalPlan plan = mForEachPlans.get(i); TopLevelProjectFinder projectFinder = new TopLevelProjectFinder( plan); try { projectFinder.visit(); } catch (VisitorException ve) { int errCode = 2195; throw new FrontendException("Fail to visit foreach inner plan", errCode, PigException.BUG); } // this inner plan need all fields, cannot remove if (projectFinder.getProjectStarSet() != null) { continue; } // Constant plan, we never remove constant field if (projectFinder.getProjectSet().size()==0) { continue; } boolean anyPruned = false; for (LOProject loProject : projectFinder.getProjectSet()) { Pair<Integer, Integer> pair = new Pair<Integer, Integer>(0, loProject.getCol()); if (columns.contains(pair)) { anyPruned = true; break; } } if (anyPruned) { planToRemove.add(i); } } while (planToRemove.size() > 0) { int index = planToRemove.get(planToRemove.size()-1); if (mUserDefinedSchema!=null) { for (int i=mUserDefinedSchema.size()-1;i>=0;i--) { if (getRelevantPlan(i)==mForEachPlans.get(index)) mUserDefinedSchema.remove(i); } } mForEachPlans.remove(index); mFlatten.remove(index); planToRemove.remove(planToRemove.size()-1); } // Adjust col# in LOProject in every forEachPlan, pruneColumnInPlan will check if the col# need to adjust, // if so, change the col# inside that LOProject for (int i=columns.size()-1;i>=0;i--) { Pair<Integer, Integer> column = columns.get(i); for (LogicalPlan plan : mForEachPlans) { pruneColumnInPlan(plan, column.second); } } super.pruneColumns(columns); return true; } }