/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.logicalLayer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.PigException;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.optimizer.SchemaRemover;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException;
import org.apache.pig.impl.plan.Operator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.ProjectionMap;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;
/**
 * Logical representation of a FOREACH ... GENERATE statement. Each generated
 * expression is described by one nested inner {@link LogicalPlan}; expressions
 * flagged for flattening have their bag/tuple results expanded into top-level
 * output columns when the schema is computed.
 */
public class LOForEach extends RelationalOperator {
private static final long serialVersionUID = 2L;
/**
 * The foreach operator supports nested query plans. At this point its one
 * level of nesting. Foreach can have a list of operators that need to be
 * applied over the input.
 */
private ArrayList<LogicalPlan> mForEachPlans;
// Parallel to mForEachPlans: mFlatten.get(i) is true when the result of
// mForEachPlans.get(i) should be flattened into top-level columns.
private ArrayList<Boolean> mFlatten;
// Optional user defined (AS clause) schemas, one entry per inner plan;
// the list itself, or individual entries, may be null.
private ArrayList<Schema> mUserDefinedSchema = null;
private static Log log = LogFactory.getLog(LOForEach.class);
// Cache the information of generating inner plan for each output schema while generating output schema,
// for later use in calculate relevant field
private List<LogicalPlan> mSchemaPlanMapping = new ArrayList<LogicalPlan>();
/**
 * @param plan
 *            Logical plan this operator is a part of.
 * @param k
 *            Operator key to assign to this node.
 * @param foreachPlans
 *            the list of plans that are applied for each input
 * @param flattenList
 *            boolean list that tells which elements of the foreach
 *            projection should be flattened.
 */
public LOForEach(LogicalPlan plan, OperatorKey k,
        ArrayList<LogicalPlan> foreachPlans, ArrayList<Boolean> flattenList) {
    // Delegate to the full constructor; no user defined schemas are supplied.
    this(plan, k, foreachPlans, flattenList, null);
}
/**
 * @param plan
 *            Logical plan this operator is a part of.
 * @param k
 *            Operator key to assign to this node.
 * @param foreachPlans
 *            the list of plans that are applied for each input
 * @param flattenList
 *            boolean list that tells which elements of the foreach
 *            projection should be flattened.
 * @param userDefinedSchemaList
 *            user defined (AS clause) schemas, one per inner plan; entries
 *            may be null when no schema was given for that expression
 */
public LOForEach(LogicalPlan plan, OperatorKey k,
        ArrayList<LogicalPlan> foreachPlans, ArrayList<Boolean> flattenList,
        ArrayList<Schema> userDefinedSchemaList) {
    super(plan, k);
    this.mForEachPlans = foreachPlans;
    this.mFlatten = flattenList;
    this.mUserDefinedSchema = userDefinedSchemaList;
}
/**
 * @return the inner plans, one per generated expression
 */
public ArrayList<LogicalPlan> getForEachPlans() {
    return this.mForEachPlans;
}

/**
 * Replaces the inner plans of this foreach.
 *
 * @param foreachPlans the new list of inner plans
 */
public void setForEachPlans(ArrayList<LogicalPlan> foreachPlans) {
    this.mForEachPlans = foreachPlans;
}

/**
 * @return per-expression flatten flags, parallel to the inner plan list
 */
public List<Boolean> getFlatten() {
    return this.mFlatten;
}

/**
 * Replaces the flatten flags of this foreach.
 *
 * @param flattenList the new flatten flags, parallel to the inner plan list
 */
public void setFlatten(ArrayList<Boolean> flattenList) {
    this.mFlatten = flattenList;
}

/**
 * @return user defined (AS clause) schemas, or null when none were given
 */
public List<Schema> getUserDefinedSchema() {
    return this.mUserDefinedSchema;
}

/**
 * Replaces the user defined schemas of this foreach.
 *
 * @param userDefinedSchema the new schemas, parallel to the inner plan list
 */
public void setUserDefinedSchema(ArrayList<Schema> userDefinedSchema) {
    this.mUserDefinedSchema = userDefinedSchema;
}
/**
 * @return the display name of this operator, e.g. "alias: ForEach scope-id"
 */
@Override
public String name() {
    StringBuilder sb = new StringBuilder(getAliasString());
    sb.append("ForEach ").append(mKey.scope).append('-').append(mKey.id);
    return sb.toString();
}
/**
 * Foreach consumes exactly one input relation.
 *
 * @return false, always
 */
@Override
public boolean supportsMultipleInputs() {
    return false;
}
/**
 * Standard visitor dispatch hook.
 *
 * @param v the visitor to call back into
 * @throws VisitorException if the visitor fails on this node
 */
@Override
public void visit(LOVisitor v) throws VisitorException {
    v.visit(this);
}
/**
 * The result of a foreach is always a bag of tuples.
 *
 * @return {@link DataType#BAG}
 */
public byte getType() {
    return DataType.BAG;
}
/**
 * Increments the occurrence count of {@code alias} in the given map,
 * starting it at 1 on first sight. A null map or null alias is ignored.
 *
 * @param aliases alias-to-count map to update
 * @param alias   alias whose count should be bumped
 */
private void updateAliasCount(Map<String, Integer> aliases, String alias) {
    if (aliases == null || alias == null) {
        return;
    }
    Integer count = aliases.get(alias);
    aliases.put(alias, count == null ? 1 : count + 1);
}
/**
 * Computes (and caches) the output schema of this foreach. For every inner
 * plan the field schema of its leaf expression is appended to the output;
 * flattened bag/tuple columns are expanded into their constituent fields and
 * disambiguated with the "outer::inner" alias convention when needed. User
 * defined (AS clause) schemas are merged in where present. The mapping from
 * output field to the inner plan that generates it is recorded in
 * mSchemaPlanMapping for later relevant-field computation.
 *
 * BUGFIX: the duplicate-column error message previously omitted the name of
 * the first duplicated alias ("Found duplicates in schema. : 2 columns, ...");
 * the message now lists every duplicate alias with its count.
 *
 * @return the output schema, or null when it cannot be determined
 * @throws FrontendException on bad flatten schemas, schema merge failures,
 *         or duplicate column names that do not stem from flattening
 */
@Override
public Schema getSchema() throws FrontendException {
    log.debug("Entering getSchema");
    if (!mIsSchemaComputed) {
        List<Schema.FieldSchema> fss = new ArrayList<Schema.FieldSchema>(
                mForEachPlans.size());
        mSchemaPlanMapping = new ArrayList<LogicalPlan>();
        for (LogicalPlan plan : mForEachPlans) {
            log.debug("Number of leaves in " + plan + " = " + plan.getLeaves().size());
            for(int i = 0; i < plan.getLeaves().size(); ++i) {
                log.debug("Leaf" + i + "= " + plan.getLeaves().get(i));
            }
            LogicalOperator op = plan.getLeaves().get(0);
            log.debug("op: " + op.getClass().getName() + " " + op);
        }
        log.debug("Printed the leaves of the generate plans");
        // alias bookkeeping: flattenAlias maps an output field back to its
        // pre-disambiguation alias, inverseFlattenAlias records which aliases
        // came from flattening, aliases counts occurrences of each alias
        Map<Schema.FieldSchema, String> flattenAlias = new HashMap<Schema.FieldSchema, String>();
        Map<String, Boolean> inverseFlattenAlias = new HashMap<String, Boolean>();
        Map<String, Integer> aliases = new HashMap<String, Integer>();
        for (int planCtr = 0; planCtr < mForEachPlans.size(); ++planCtr) {
            LogicalPlan plan = mForEachPlans.get(planCtr);
            LogicalOperator op = plan.getLeaves().get(0);
            log.debug("op: " + op.getClass().getName() + " " + op);
            log.debug("Flatten: " + mFlatten.get(planCtr));
            Schema.FieldSchema planFs;
            if(op instanceof LOProject) {
                //the check for the type is required for statements like
                //foreach cogroup {
                //    a1 = order a by *;
                //    generate a1;
                //}
                //In the above script, the generate a1, will translate to
                //project(a1) -> project(*) and will not be translated to a sequence of projects
                //As a result the project(*) will remain but the return type is a bag
                //project(*) with a data type set to tuple indicates a project(*) from an input
                //that has no schema
                if( (((LOProject)op).isStar() ) && (((LOProject)op).getType() == DataType.TUPLE) ) {
                    mSchema = null;
                    mIsSchemaComputed = true;
                    return mSchema;
                }
            }
            try {
                planFs = ((ExpressionOperator)op).getFieldSchema();
                log.debug("planFs: " + planFs);
                Schema userDefinedSchema = null;
                if(null != mUserDefinedSchema) {
                    userDefinedSchema = mUserDefinedSchema.get(planCtr);
                }
                if(null != planFs) {
                    String outerCanonicalAlias = op.getAlias();
                    if(null == outerCanonicalAlias) {
                        outerCanonicalAlias = planFs.alias;
                    }
                    log.debug("Outer canonical alias: " + outerCanonicalAlias);
                    if(mFlatten.get(planCtr)) {
                        //need to extract the children and create the aliases
                        //assumption here is that flatten is only for one column
                        //i.e., flatten(A), flatten(A.x) and NOT
                        //flatten(B.(x,y,z))
                        Schema s = planFs.schema;
                        if(null != s && s.isTwoLevelAccessRequired()) {
                            // this is the case where the schema is that of
                            // a bag which has just one tuple fieldschema which
                            // in turn has a list of fieldschemas. The schema
                            // after flattening would consist of the fieldSchemas
                            // present in the tuple
                            // check that indeed we only have one field schema
                            // which is that of a tuple
                            if(s.getFields().size() != 1) {
                                int errCode = 1008;
                                String msg = "Expected a bag schema with a single " +
                                        "element of type "+ DataType.findTypeName(DataType.TUPLE) +
                                        " but got a bag schema with multiple elements.";
                                throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
                            }
                            Schema.FieldSchema tupleFS = s.getField(0);
                            if(tupleFS.type != DataType.TUPLE) {
                                int errCode = 1009;
                                String msg = "Expected a bag schema with a single " +
                                        "element of type "+ DataType.findTypeName(DataType.TUPLE) +
                                        " but got an element of type " +
                                        DataType.findTypeName(tupleFS.type);
                                throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
                            }
                            s = tupleFS.schema;
                        }
                        if(null != s && s.size()!=0) {
                            // flatten expands each inner field into one output column
                            for(int i = 0; i < s.size(); ++i) {
                                Schema.FieldSchema fs;
                                fs = Schema.FieldSchema.copyAndLink(s.getField(i), op);
                                log.debug("fs: " + fs);
                                if(null != userDefinedSchema) {
                                    Schema.FieldSchema userDefinedFieldSchema;
                                    try {
                                        if(i < userDefinedSchema.size()) {
                                            userDefinedFieldSchema = userDefinedSchema.getField(i);
                                            fs = fs.mergePrefixFieldSchema(userDefinedFieldSchema);
                                        }
                                    } catch (SchemaMergeException sme) {
                                        int errCode = 1016;
                                        String msg = "Problems in merging user defined schema";
                                        throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
                                    }
                                    // user supplied names win; no outer:: prefix needed
                                    outerCanonicalAlias = null;
                                }
                                String innerCanonicalAlias = fs.alias;
                                Schema.FieldSchema newFs;
                                if((null != outerCanonicalAlias) && (null != innerCanonicalAlias)) {
                                    String disambiguatorAlias = outerCanonicalAlias + "::" + innerCanonicalAlias;
                                    newFs = new Schema.FieldSchema(disambiguatorAlias, fs.schema, fs.type);
                                    newFs.setParent(s.getField(i).canonicalName, op);
                                    fss.add(newFs);
                                    mSchemaPlanMapping.add(plan);
                                    updateAliasCount(aliases, disambiguatorAlias);
                                    //it's fine if there are duplicates
                                    //we just need to record if its due to
                                    //flattening
                                } else {
                                    newFs = new Schema.FieldSchema(fs);
                                    newFs.setParent(s.getField(i).canonicalName, op);
                                    fss.add(newFs);
                                    mSchemaPlanMapping.add(plan);
                                }
                                updateAliasCount(aliases, innerCanonicalAlias);
                                flattenAlias.put(newFs, innerCanonicalAlias);
                                inverseFlattenAlias.put(innerCanonicalAlias, true);
                            }
                        } else {
                            // flatten of a scalar, or of a bag/tuple with no
                            // inner schema: a single output column results
                            Schema.FieldSchema newFs;
                            if(null != userDefinedSchema) {
                                if(!DataType.isSchemaType(planFs.type)) {
                                    if(userDefinedSchema.size() > 1) {
                                        int errCode = 1017;
                                        String msg = "Schema mismatch. A basic type on flattening cannot have more than one column. User defined schema: " + userDefinedSchema;
                                        throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
                                    }
                                    newFs = new Schema.FieldSchema(null, planFs.type);
                                    try {
                                        newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0));
                                    } catch (SchemaMergeException sme) {
                                        int errCode = 1016;
                                        String msg = "Problems in merging user defined schema";
                                        throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
                                    }
                                    updateAliasCount(aliases, newFs.alias);
                                    fss.add(newFs);
                                    mSchemaPlanMapping.add(plan);
                                    newFs.setParent(planFs.canonicalName, op);
                                } else {
                                    for(Schema.FieldSchema ufs: userDefinedSchema.getFields()) {
                                        Schema.FieldSchema.setFieldSchemaDefaultType(ufs, DataType.BYTEARRAY);
                                        newFs = new Schema.FieldSchema(ufs);
                                        fss.add(newFs);
                                        mSchemaPlanMapping.add(plan);
                                        newFs.setParent(null, op);
                                        updateAliasCount(aliases, ufs.alias);
                                    }
                                }
                            } else {
                                if(!DataType.isSchemaType(planFs.type)) {
                                    newFs = new Schema.FieldSchema(planFs.alias, planFs.type);
                                } else {
                                    newFs = new Schema.FieldSchema(null, DataType.BYTEARRAY);
                                }
                                fss.add(newFs);
                                mSchemaPlanMapping.add(plan);
                                newFs.setParent( planFs.canonicalName, op );
                            }
                        }
                    } else {
                        //just populate the schema with the field schema of the expression operator
                        //check if the user has defined a schema for the operator; compare the schema
                        //with that of the expression operator field schema and then add it to the list
                        Schema.FieldSchema newFs = Schema.FieldSchema.copyAndLink(planFs, op);
                        if(null != userDefinedSchema) {
                            try {
                                newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0));
                                updateAliasCount(aliases, newFs.alias);
                            } catch (SchemaMergeException sme) {
                                int errCode = 1016;
                                String msg = "Problems in merging user defined schema";
                                throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
                            }
                        }
                        newFs.setParent(planFs.canonicalName, op);
                        fss.add(newFs);
                        mSchemaPlanMapping.add(plan);
                    }
                } else {
                    //did not get a valid field schema for this expression;
                    //fall back to the user defined schema, or give up entirely
                    if(null != userDefinedSchema) {
                        Schema.FieldSchema userDefinedFieldSchema = new Schema.FieldSchema(userDefinedSchema.getField(0));
                        fss.add(userDefinedFieldSchema);
                        mSchemaPlanMapping.add(plan);
                        userDefinedFieldSchema.setParent(null, op);
                        updateAliasCount(aliases, userDefinedFieldSchema.alias);
                    } else {
                        mSchema = null;
                        mIsSchemaComputed = true;
                        return mSchema;
                    }
                }
            } catch (FrontendException fee) {
                // leave the schema uncomputed so a later call retries
                mSchema = null;
                mIsSchemaComputed = false;
                throw fee;
            }
        }
        //check for duplicate column names and throw an error if there are duplicates
        //ensure that flatten gets rid of duplicate column names when the checks are
        //being done
        log.debug(" flattenAlias: " + flattenAlias);
        log.debug(" inverseFlattenAlias: " + inverseFlattenAlias);
        log.debug(" aliases: " + aliases);
        log.debug(" fss.size: " + fss.size());
        boolean duplicates = false;
        Map<String, Integer> duplicateAliases = new HashMap<String, Integer>();
        for(Map.Entry<String, Integer> e: aliases.entrySet()) {
            Integer count = e.getValue();
            if(count > 1) {//not checking for null here as counts are initialized to 1
                Boolean inFlatten = false;
                log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
                inFlatten = inverseFlattenAlias.get(e.getKey());
                log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
                // duplicates caused purely by flattening are tolerated
                if((null == inFlatten) || (!inFlatten)) {
                    duplicates = true;
                    duplicateAliases.put(e.getKey(), count);
                }
            }
        }
        if(duplicates) {
            StringBuffer sb = new StringBuffer("Found duplicates in schema. ");
            // BUGFIX: previously the first duplicate's alias name was never
            // appended, yielding a message like ": 2 columns, b: 2 columns"
            boolean first = true;
            for(Map.Entry<String, Integer> e: duplicateAliases.entrySet()) {
                if(!first) {
                    sb.append(", ");
                }
                sb.append(e.getKey());
                sb.append(": ");
                sb.append(e.getValue());
                sb.append(" columns");
                first = false;
            }
            sb.append(". Please alias the columns with unique names.");
            String errMessage = sb.toString();
            log.debug(errMessage);
            int errCode = 1007;
            throw new FrontendException(errMessage, errCode, PigException.INPUT, false, null);
        }
        mSchema = new Schema(fss);
        //add the aliases that are unique after flattening
        for(int i=0;i<mSchema.getFields().size();i++) {
            Schema.FieldSchema fs = mSchema.getFields().get(i);
            String alias = flattenAlias.get(fs);
            Integer count = aliases.get(alias);
            if (null == count) count = 1;
            log.debug("alias: " + alias);
            if((null != alias) && (count == 1)) {
                mSchema.addAlias(alias, fs);
            }
        }
        mIsSchemaComputed = true;
    }
    log.debug("Exiting getSchema");
    return mSchema;
}
/**
 * Clears cached schema state on this operator and inside every inner plan,
 * forcing recomputation on the next {@link #getSchema()} call.
 *
 * @throws VisitorException if clearing an inner plan's schemas fails
 */
public void unsetSchema() throws VisitorException {
    for (LogicalPlan innerPlan : mForEachPlans) {
        new SchemaRemover(innerPlan).visit();
    }
    super.unsetSchema();
    mSchemaPlanMapping = new ArrayList<LogicalPlan>();
}
/**
 * Recursively appends {@code node} and everything reachable from it to
 * {@code fifo}, successors first. The resulting order therefore has leaves
 * before their predecessors (reverse dependency order).
 *
 * @param lp   the plan being walked
 * @param node the current operator
 * @param seen operators already emitted (guards against revisiting in a DAG)
 * @param fifo output collection receiving the traversal order
 */
private void doAllSuccessors(LogicalPlan lp,
                             LogicalOperator node,
                             Set<LogicalOperator> seen,
                             Collection<LogicalOperator> fifo) throws VisitorException {
    if (seen.contains(node)) {
        return;
    }
    // emit everything downstream of this node first
    Collection<LogicalOperator> successors = lp.getSuccessors(node);
    if (successors != null) {
        for (LogicalOperator successor : successors) {
            doAllSuccessors(lp, successor, seen, fifo);
        }
    }
    // then the node itself
    seen.add(node);
    fifo.add(node);
}
/**
 * Prints and returns the schema of a relational operator nested inside this
 * foreach, identified by its nested alias (used by DESCRIBE on nested
 * aliases). Operators are collected successors-first (leaves before roots),
 * mirroring ReverseDependencyOrderWalker, so the first match found is the
 * latest definition of the alias in script order. Inner plans may not
 * contain union/join/cogroup/cross, so they are trees, but the traversal
 * tolerates a DAG. Collecting every operator into a list before scanning is
 * inefficient in general, but inner plans are expected to be small.
 *
 * @param alias       alias of this foreach's relation (for display only)
 * @param nestedAlias alias of the nested operator to describe
 * @return the nested operator's schema, possibly null when unknown
 * @throws IOException if the alias names an expression, or is not found
 */
public Schema dumpNestedSchema(String alias, String nestedAlias) throws IOException {
    for (LogicalPlan innerPlan : mForEachPlans) {
        List<LogicalOperator> ordered = new ArrayList<LogicalOperator>();
        Set<LogicalOperator> visited = new HashSet<LogicalOperator>();
        for (LogicalOperator root : innerPlan.getRoots()) {
            doAllSuccessors(innerPlan, root, visited, ordered);
        }
        for (LogicalOperator op : ordered) {
            if (op instanceof LOProject || !nestedAlias.equalsIgnoreCase(op.mAlias)) {
                continue;
            }
            if (!(op instanceof RelationalOperator)) {
                // expression operators carry no schema of their own
                int errCode = 1113;
                String msg = "Describe nested expression is not supported";
                throw new FrontendException (msg, errCode, PigException.INPUT, false, null);
            }
            Schema nestedSchema = op.getSchema();
            if (nestedSchema == null) {
                System.out.println("Schema for "+ alias+ "::" + nestedAlias + " unknown.");
            } else {
                System.out.println(alias+ "::" + nestedAlias + ": " + nestedSchema.toString());
            }
            return nestedSchema;
        }
    }
    // every matching operator either returns or throws above, so reaching
    // this point means the nested alias was never found
    int errCode = 1114;
    String msg = "Unable to find schema for nested alias "+ nestedAlias;
    throw new FrontendException (msg, errCode, PigException.INPUT, false, null);
}
/**
* @see org.apache.pig.impl.plan.Operator#clone()
* Do not use the clone method directly. Operators are cloned when logical plans
* are cloned using {@link LogicalPlanCloner}
*/
@Override
protected Object clone() throws CloneNotSupportedException {
// Do generic LogicalOperator cloning
LOForEach forEachClone = (LOForEach)super.clone();
// create deep copies of attributes specific to foreach
if(mFlatten != null) {
forEachClone.mFlatten = new ArrayList<Boolean>();
for (Iterator<Boolean> it = mFlatten.iterator(); it.hasNext();) {
forEachClone.mFlatten.add(Boolean.valueOf(it.next()));
}
}
if(mForEachPlans != null) {
forEachClone.mForEachPlans = new ArrayList<LogicalPlan>();
for (Iterator<LogicalPlan> it = mForEachPlans.iterator(); it.hasNext();) {
LogicalPlanCloneHelper lpCloneHelper = new LogicalPlanCloneHelper(it.next());
forEachClone.mForEachPlans.add(lpCloneHelper.getClonedPlan());
}
}
if(mUserDefinedSchema != null) {
forEachClone.mUserDefinedSchema = new ArrayList<Schema>();
for (Iterator<Schema> it = mUserDefinedSchema.iterator(); it.hasNext();) {
Schema s = it.next();
forEachClone.mUserDefinedSchema.add(s != null ? s.clone() : null);
}
}
return forEachClone;
}
/**
 * Builds (and caches) the projection map describing how output columns of
 * this foreach relate to input columns: columns produced by a chain of
 * projects (optionally topped by a cast) map back to their input column,
 * everything else counts as an added column, and unreferenced input columns
 * are reported as removed. Returns (and caches) null whenever the mapping
 * cannot be determined.
 *
 * Refactored: the column-recording logic that was previously duplicated in
 * four places is extracted into {@link #recordOutputColumn}.
 *
 * @return the projection map, or null when it cannot be computed
 */
@Override
public ProjectionMap getProjectionMap() {
    if(mIsProjectionMapComputed) return mProjectionMap;
    mIsProjectionMapComputed = true;
    Schema outputSchema;
    try {
        outputSchema = getSchema();
    } catch (FrontendException fee) {
        mProjectionMap = null;
        return mProjectionMap;
    }
    if(outputSchema == null) {
        mProjectionMap = null;
        return mProjectionMap;
    }
    List<LogicalOperator> predecessors = mPlan.getPredecessors(this);
    if(predecessors == null) {
        mProjectionMap = null;
        return mProjectionMap;
    }
    // foreach has exactly one input
    LogicalOperator predecessor = predecessors.get(0);
    Schema inputSchema;
    try {
        inputSchema = predecessor.getSchema();
    } catch (FrontendException fee) {
        mProjectionMap = null;
        return mProjectionMap;
    }
    List<LogicalPlan> foreachPlans = getForEachPlans();
    List<Boolean> flattenList = getFlatten();
    MultiMap<Integer, ProjectionMap.Column> mapFields = new MultiMap<Integer, ProjectionMap.Column>();
    List<Integer> addedFields = new ArrayList<Integer>();
    int outputColumn = 0;
    for(int i = 0; i < foreachPlans.size(); ++i) {
        LogicalPlan foreachPlan = foreachPlans.get(i);
        List<LogicalOperator> leaves = foreachPlan.getLeaves();
        if(leaves == null || leaves.size() > 1) {
            mProjectionMap = null;
            return mProjectionMap;
        }
        int inputColumn = -1;
        boolean mapped = false;
        LOCast cast = null;
        if(leaves.get(0) instanceof LOProject || leaves.get(0) instanceof LOCast) {
            //find out if this project is a chain of projects
            Pair<LOProject, LOCast> pair = LogicalPlan.chainOfProjects(foreachPlan);
            if (pair != null) {
                LOProject topProject = pair.first;
                cast = pair.second;
                if (topProject != null) {
                    inputColumn = topProject.getCol();
                    mapped = true;
                }
            }
        }
        Schema.FieldSchema leafFS;
        try {
            leafFS = ((ExpressionOperator)leaves.get(0)).getFieldSchema();
        } catch (FrontendException fee) {
            mProjectionMap = null;
            return mProjectionMap;
        }
        if(leafFS == null) {
            mProjectionMap = null;
            return mProjectionMap;
        }
        if(flattenList.get(i)) {
            Schema innerSchema = leafFS.schema;
            if(innerSchema != null && innerSchema.isTwoLevelAccessRequired()) {
                // this is the case where the schema is that of a bag which
                // has just one tuple fieldschema; the schema after flattening
                // consists of the fieldSchemas present in that tuple
                if(innerSchema.getFields().size() != 1) {
                    mProjectionMap = null;
                    return mProjectionMap;
                }
                Schema.FieldSchema tupleFS;
                try {
                    tupleFS = innerSchema.getField(0);
                } catch (FrontendException fee) {
                    mProjectionMap = null;
                    return mProjectionMap;
                }
                if(tupleFS.type != DataType.TUPLE) {
                    mProjectionMap = null;
                    return mProjectionMap;
                }
                innerSchema = tupleFS.schema;
            }
            // with an inner schema, flattening yields one output column per
            // inner field; without one, a single output column results
            int flattenedColumns = (innerSchema == null) ? 1 : innerSchema.size();
            for (int j = 0; j < flattenedColumns; ++j) {
                outputColumn = recordOutputColumn(mapFields, addedFields,
                        outputColumn, mapped, inputColumn, cast);
            }
        } else {
            //not a flattened column
            outputColumn = recordOutputColumn(mapFields, addedFields,
                    outputColumn, mapped, inputColumn, cast);
        }
    }
    List<Pair<Integer, Integer>> removedFields = new ArrayList<Pair<Integer, Integer>>();
    //if the size of the map is zero then set it to null
    if(mapFields.size() == 0) {
        mapFields = null;
    }
    if(addedFields.size() == 0) {
        addedFields = null;
    }
    if(inputSchema == null) {
        //if input schema is null then there are no removedFields
        removedFields = null;
    } else {
        //input schema is not null. Need to compute the removedFields
        //compute the set difference between the input schema and mapped fields
        Set<Integer> removedSet = new HashSet<Integer>();
        for(int i = 0; i < inputSchema.size(); ++i) {
            removedSet.add(i);
        }
        if(mapFields != null) {
            Set<Integer> mappedSet = new HashSet<Integer>();
            for(Integer key: mapFields.keySet()) {
                List<ProjectionMap.Column> values = (ArrayList<ProjectionMap.Column>) mapFields.get(key);
                for (ProjectionMap.Column value : values) {
                    mappedSet.add(value.getInputColumn().second);
                }
            }
            removedSet.removeAll(mappedSet);
        }
        if(removedSet.size() == 0) {
            removedFields = null;
        } else {
            for(Integer i: removedSet) {
                removedFields.add(new Pair<Integer, Integer>(0, i));
            }
        }
    }
    mProjectionMap = new ProjectionMap(mapFields, removedFields, addedFields);
    return mProjectionMap;
}

/**
 * Records one output column: when the column traces back to an input column
 * ({@code mapped}), an entry mapping output to input is added (carrying the
 * cast type when a cast sits on top of the project chain); otherwise the
 * column is recorded as added.
 *
 * @return the index of the next output column
 */
private int recordOutputColumn(MultiMap<Integer, ProjectionMap.Column> mapFields,
        List<Integer> addedFields, int outputColumn, boolean mapped,
        int inputColumn, LOCast cast) {
    if (mapped) {
        if (cast != null) {
            mapFields.put(outputColumn,
                    new ProjectionMap.Column(
                            new Pair<Integer, Integer>(0, inputColumn), true, cast.getType()
                    )
            );
        } else {
            mapFields.put(outputColumn,
                    new ProjectionMap.Column(new Pair<Integer, Integer>(0, inputColumn))
            );
        }
    } else {
        addedFields.add(outputColumn);
    }
    return outputColumn + 1;
}
/**
 * Computes the input fields this foreach needs: all fields when any inner
 * plan contains a project-star, otherwise the union of the columns projected
 * by the top-level projects across all inner plans. When no columns are
 * projected at all, the result marks "no fields required".
 *
 * @return a single-element list of required fields; the element is null when
 *         an inner plan could not be visited
 */
@Override
public List<RequiredFields> getRequiredFields() {
    List<RequiredFields> requiredFields = new ArrayList<RequiredFields>();
    Set<LOProject> projects = new HashSet<LOProject>();
    boolean needsAllColumns = false;
    for (LogicalPlan innerPlan : getForEachPlans()) {
        TopLevelProjectFinder finder = new TopLevelProjectFinder(innerPlan);
        try {
            finder.visit();
        } catch (VisitorException ve) {
            // signal "unknown" with a single null entry
            requiredFields.clear();
            requiredFields.add(null);
            return requiredFields;
        }
        projects.addAll(finder.getProjectSet());
        if (finder.getProjectStarSet() != null) {
            needsAllColumns = true;
        }
    }
    if (needsAllColumns) {
        requiredFields.add(new RequiredFields(true));
        return requiredFields;
    }
    Set<Pair<Integer, Integer>> columns = new HashSet<Pair<Integer, Integer>>();
    for (LOProject project : projects) {
        for (int inputColumn : project.getProjection()) {
            columns.add(new Pair<Integer, Integer>(0, inputColumn));
        }
    }
    if (columns.isEmpty()) {
        requiredFields.add(new RequiredFields(false, true));
    } else {
        requiredFields.add(new RequiredFields(new ArrayList<Pair<Integer, Integer>>(columns)));
    }
    return (requiredFields.size() == 0 ? null : requiredFields);
}
/* (non-Javadoc)
 * @see org.apache.pig.impl.plan.Operator#rewire(org.apache.pig.impl.plan.Operator, org.apache.pig.impl.plan.Operator)
 */
@Override
public void rewire(Operator<LOVisitor> oldPred, int oldPredIndex, Operator<LOVisitor> newPred, boolean useOldPred) throws PlanException {
    super.rewire(oldPred, oldPredIndex, newPred, useOldPred);
    LogicalOperator oldOp = (LogicalOperator) oldPred;
    LogicalOperator newOp = (LogicalOperator) newPred;
    // fix up project inputs inside every inner plan to point at the new predecessor
    for (LogicalPlan innerPlan : mForEachPlans) {
        try {
            new ProjectFixerUpper(innerPlan, oldOp, oldPredIndex, newOp, useOldPred, this).visit();
        } catch (VisitorException ve) {
            int errCode = 2144;
            String msg = "Problem while fixing project inputs during rewiring.";
            throw new PlanException(msg, errCode, PigException.BUG, ve);
        }
    }
}
/**
 * A helper method to check if the foreach has a flattened element.
 *
 * @return a pair whose first member is true when at least one expression is
 *         flattened, and whose second member lists the indexes of the
 *         flattened expressions
 */
public Pair<Boolean, List<Integer>> hasFlatten() {
    List<Integer> flattenedColumns = new ArrayList<Integer>();
    for (int column = 0; column < mFlatten.size(); ++column) {
        if (mFlatten.get(column).booleanValue()) {
            flattenedColumns.add(column);
        }
    }
    boolean hasFlatten = !flattenedColumns.isEmpty();
    return new Pair<Boolean, List<Integer>>(hasFlatten, flattenedColumns);
}
/**
 * Looks up the inner plan that generated the given output column, using the
 * mapping recorded during schema computation.
 *
 * @param column output column index
 * @return the generating inner plan, or null when the column is negative or
 *         the schema has not been computed
 */
public LogicalPlan getRelevantPlan(int column) {
    if (column < 0 || mSchema == null) {
        return null;
    }
    return mSchemaPlanMapping.get(column);
}
/**
 * Checks whether the given input column is consumed by a flattened
 * expression of this foreach.
 *
 * @param column input column index
 * @return true when some flattened inner plan projects this column
 * @throws FrontendException if visiting an inner plan fails
 */
public boolean isInputFlattened(int column) throws FrontendException {
    for (int i = 0; i < mForEachPlans.size(); i++) {
        TopLevelProjectFinder finder = new TopLevelProjectFinder(mForEachPlans.get(i));
        finder.visit();
        for (LOProject project : finder.getProjectList()) {
            if (project.getCol() == column && mFlatten.get(i)) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Computes which input columns the given output column depends on, by
 * inspecting the top-level projects of the inner plan that generates it.
 *
 * @param output output index; foreach has only output 0
 * @param column output column index
 * @return a single-element list of required fields, or null when the column
 *         is invalid, the schema is unknown, or no inputs are referenced
 * @throws FrontendException if schema computation fails
 */
@Override
public List<RequiredFields> getRelevantInputs(int output, int column) throws FrontendException {
    if (!mIsSchemaComputed) {
        getSchema();
    }
    // only output 0 exists and the column index must be in range
    if (output != 0 || column < 0) {
        return null;
    }
    if (mSchema == null || mSchema.size() <= column) {
        return null;
    }
    List<RequiredFields> result = new ArrayList<RequiredFields>();
    TopLevelProjectFinder finder = new TopLevelProjectFinder(getRelevantPlan(column));
    try {
        finder.visit();
    } catch (VisitorException ve) {
        return null;
    }
    if (finder.getProjectStarSet() != null) {
        result.add(new RequiredFields(true));
        return result;
    }
    // collect distinct projected input columns, preserving first-seen order
    ArrayList<Pair<Integer, Integer>> inputs = new ArrayList<Pair<Integer, Integer>>();
    for (LOProject project : finder.getProjectSet()) {
        for (int inputColumn : project.getProjection()) {
            Pair<Integer, Integer> candidate = new Pair<Integer, Integer>(0, inputColumn);
            if (!inputs.contains(candidate)) {
                inputs.add(candidate);
            }
        }
    }
    if (inputs.isEmpty()) {
        return null;
    }
    result.add(new RequiredFields(inputs));
    return result;
}
/**
 * Prunes the given input columns from this foreach: inner plans whose
 * top-level projects reference only pruned columns are dropped entirely
 * (along with their flatten flags and user defined schemas), and column
 * indexes in the surviving plans are shifted down to match the pruned input.
 *
 * @param columns (input index, column index) pairs to prune; the input
 *                index must be 0 since foreach has a single input
 * @return true when pruning was performed
 * @throws FrontendException when the predecessor or its schema is missing,
 *         a column pair is invalid, or an inner plan cannot be visited
 */
@Override
public boolean pruneColumns(List<Pair<Integer, Integer>> columns)
throws FrontendException {
    if (!mIsSchemaComputed)
        getSchema();
    if (mSchema == null) {
        log.warn("Cannot prune columns in foreach, no schema information found");
        return false;
    }
    List<LogicalOperator> predecessors = mPlan.getPredecessors(this);
    if (predecessors == null) {
        int errCode = 2190;
        throw new FrontendException("Cannot find predecessors for foreach",
                errCode, PigException.BUG);
    }
    if (predecessors.size() != 1) {
        int errCode = 2193;
        throw new FrontendException("Foreach can only have 1 predecessor",
                errCode, PigException.BUG);
    }
    if (predecessors.get(0).getSchema() == null) {
        int errCode = 2194;
        throw new FrontendException("Expect schema", errCode,
                PigException.BUG);
    }
    // validate every column pair before mutating anything
    for (Pair<Integer, Integer> column : columns) {
        if (column.first != 0) {
            int errCode = 2191;
            throw new FrontendException(
                    "foreach only take 1 input, cannot prune input with index "
                            + column.first, errCode, PigException.BUG);
        }
        if (column.second < 0) {
            int errCode = 2192;
            throw new FrontendException("Column to prune does not exist", errCode, PigException.BUG);
        }
    }
    // collect the indexes of inner plans that only touch pruned columns
    List<Integer> planToRemove = new ArrayList<Integer>();
    for (int i = 0; i < mForEachPlans.size(); i++) {
        LogicalPlan plan = mForEachPlans.get(i);
        TopLevelProjectFinder projectFinder = new TopLevelProjectFinder(
                plan);
        try {
            projectFinder.visit();
        } catch (VisitorException ve) {
            int errCode = 2195;
            throw new FrontendException("Fail to visit foreach inner plan",
                    errCode, PigException.BUG);
        }
        // this inner plan need all fields, cannot remove
        if (projectFinder.getProjectStarSet() != null) {
            continue;
        }
        // Constant plan, we never remove constant field
        if (projectFinder.getProjectSet().size()==0)
        {
            continue;
        }
        boolean anyPruned = false;
        for (LOProject loProject : projectFinder.getProjectSet()) {
            Pair<Integer, Integer> pair = new Pair<Integer, Integer>(0,
                    loProject.getCol());
            if (columns.contains(pair)) {
                anyPruned = true;
                break;
            }
        }
        if (anyPruned) {
            planToRemove.add(i);
        }
    }
    // remove plans from the highest index down so earlier indexes stay valid;
    // user defined schemas mapped to a removed plan are dropped first
    while (planToRemove.size() > 0) {
        int index = planToRemove.get(planToRemove.size()-1);
        if (mUserDefinedSchema!=null) {
            for (int i=mUserDefinedSchema.size()-1;i>=0;i--) {
                if (getRelevantPlan(i)==mForEachPlans.get(index))
                    mUserDefinedSchema.remove(i);
            }
        }
        mForEachPlans.remove(index);
        mFlatten.remove(index);
        planToRemove.remove(planToRemove.size()-1);
    }
    // Adjust col# in LOProject in every forEachPlan, pruneColumnInPlan will check if the col# need to adjust,
    // if so, change the col# inside that LOProject
    for (int i=columns.size()-1;i>=0;i--) {
        Pair<Integer, Integer> column = columns.get(i);
        for (LogicalPlan plan : mForEachPlans) {
            pruneColumnInPlan(plan, column.second);
        }
    }
    super.pruneColumns(columns);
    return true;
}
}