LOForEach.java example

Explorer
Cloud-Stenography-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.logicalLayer;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.Iterator;

import org.apache.pig.PigException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException;
import org.apache.pig.impl.logicalLayer.optimizer.SchemaRemover;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.ProjectionMap;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.data.DataType;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


public class LOForEach extends LogicalOperator {

    private static final long serialVersionUID = 2L;

    /**
     * The foreach operator supports nested query plans. At this point its one
     * level of nesting. Foreach can have a list of operators that need to be
     * applied over the input.
     */

    private ArrayList<LogicalPlan> mForEachPlans;
    private ArrayList<Boolean> mFlatten;
    private ArrayList<Schema> mUserDefinedSchema = null;
    private static Log log = LogFactory.getLog(LOForEach.class);

    /**
     * @param plan
     *            Logical plan this operator is a part of.
     * @param k
     *            Operator key to assign to this node.
     * @param foreachPlans
     *            the list of plans that are applied for each input
     * @param flattenList
     *            boolean list that tells which elements of the foreach
     *            projection should be flattened.
     */

    public LOForEach(LogicalPlan plan, OperatorKey k,
            ArrayList<LogicalPlan> foreachPlans, ArrayList<Boolean> flattenList) {

        super(plan, k);
        mForEachPlans = foreachPlans;
        mFlatten = flattenList;
    }

    public LOForEach(LogicalPlan plan, OperatorKey k,
            ArrayList<LogicalPlan> foreachPlans, ArrayList<Boolean> flattenList,
            ArrayList<Schema> userDefinedSchemaList) {

        super(plan, k);
        mForEachPlans = foreachPlans;
        mFlatten = flattenList;
        mUserDefinedSchema = userDefinedSchemaList;
    }

    public ArrayList<LogicalPlan> getForEachPlans() {
        return mForEachPlans;
    }

    public void setForEachPlans(ArrayList<LogicalPlan> foreachPlans) {
        mForEachPlans = foreachPlans;
    }

    public List<Boolean> getFlatten() {
        return mFlatten;
    }

    public void setFlatten(ArrayList<Boolean> flattenList) {
        mFlatten = flattenList;
    }

    public List<Schema> getUserDefinedSchema() {
        return mUserDefinedSchema;
    }

    public void setUserDefinedSchema(ArrayList<Schema> userDefinedSchema) {
        mUserDefinedSchema = userDefinedSchema;
    }

    @Override
    public String name() {
        return "ForEach " + mKey.scope + "-" + mKey.id;
    }

    @Override
    public boolean supportsMultipleInputs() {
        return false;
    }

    @Override
    public void visit(LOVisitor v) throws VisitorException {
        v.visit(this);
    }

    public byte getType() {
        return DataType.BAG ;
    }

    private void updateAliasCount(Map<String, Integer> aliases, String alias) {
        if((null == aliases) || (null == alias)) return;
		Integer count = aliases.get(alias);
		if(null == count) {
			aliases.put(alias, 1);
		} else {
			aliases.put(alias, ++count);
		}
    }

    @Override
    public Schema getSchema() throws FrontendException {
        log.debug("Entering getSchema");
        if (!mIsSchemaComputed) {
            List<Schema.FieldSchema> fss = new ArrayList<Schema.FieldSchema>(
                    mForEachPlans.size());

            for (LogicalPlan plan : mForEachPlans) {
                log.debug("Number of leaves in " + plan + " = " + plan.getLeaves().size());
                for(int i = 0; i < plan.getLeaves().size(); ++i) {
                    log.debug("Leaf" + i + "= " + plan.getLeaves().get(i));
                }
                //LogicalOperator op = plan.getRoots().get(0);
                LogicalOperator op = plan.getLeaves().get(0);
                log.debug("op: " + op.getClass().getName() + " " + op);
            }
            log.debug("Printed the leaves of the generate plans");

            Map<Schema.FieldSchema, String> flattenAlias = new HashMap<Schema.FieldSchema, String>();
            Map<String, Boolean> inverseFlattenAlias = new HashMap<String, Boolean>();
            Map<String, Integer> aliases = new HashMap<String, Integer>();

            for (int planCtr = 0; planCtr < mForEachPlans.size(); ++planCtr) {
                LogicalPlan plan = mForEachPlans.get(planCtr);
                LogicalOperator op = plan.getLeaves().get(0);
                log.debug("op: " + op.getClass().getName() + " " + op);
                log.debug("Flatten: " + mFlatten.get(planCtr));
                Schema.FieldSchema planFs;

                if(op instanceof LOProject) {
                    //the check for the type is required for statements like
                    //foreach cogroup {
                    // a1 = order a by *;
                    // generate a1;
                    //}
                    //In the above script, the generate a1, will translate to 
                    //project(a1) -> project(*) and will not be translated to a sequence of projects
                    //As a result the project(*) will remain but the return type is a bag
                    //project(*) with a data type set to tuple indicates a project(*) from an input
                    //that has no schema
                    if( (((LOProject)op).isStar() ) && (((LOProject)op).getType() == DataType.TUPLE) ) {
                        mSchema = null;
                        mIsSchemaComputed = true;
                        return mSchema;
                    }
                }
                
                try {
	                planFs = ((ExpressionOperator)op).getFieldSchema();
                    log.debug("planFs: " + planFs);
                    Schema userDefinedSchema = null;
                    if(null != mUserDefinedSchema) {
                        userDefinedSchema = mUserDefinedSchema.get(planCtr);
                    }
					if(null != planFs) {
						String outerCanonicalAlias = op.getAlias();
						if(null == outerCanonicalAlias) {
							outerCanonicalAlias = planFs.alias;
						}
						log.debug("Outer canonical alias: " + outerCanonicalAlias);
						if(mFlatten.get(planCtr)) {
							//need to extract the children and create the aliases
							//assumption here is that flatten is only for one column
							//i.e., flatten(A), flatten(A.x) and NOT
							//flatten(B.(x,y,z))
							Schema s = planFs.schema;
							if(null != s && s.isTwoLevelAccessRequired()) {
							    // this is the case where the schema is that of
					            // a bag which has just one tuple fieldschema which
					            // in turn has a list of fieldschemas. The schema
							    // after flattening would consist of the fieldSchemas
							    // present in the tuple
					            
					            // check that indeed we only have one field schema
					            // which is that of a tuple
					            if(s.getFields().size() != 1) {
					                int errCode = 1008;
					                String msg = "Expected a bag schema with a single " +
                                    "element of type "+ DataType.findTypeName(DataType.TUPLE) +
                                    " but got a bag schema with multiple elements.";
					                throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
					            }
					            Schema.FieldSchema tupleFS = s.getField(0);
					            if(tupleFS.type != DataType.TUPLE) {
					                int errCode = 1009;
					                String msg = "Expected a bag schema with a single " +
                                    "element of type "+ DataType.findTypeName(DataType.TUPLE) +
                                    " but got an element of type " +
                                    DataType.findTypeName(tupleFS.type);
					                throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
					            }
					            s = tupleFS.schema;
							    
							}
							if(null != s) {
								for(int i = 0; i < s.size(); ++i) {
                                    Schema.FieldSchema fs;
                                    fs = new Schema.FieldSchema(s.getField(i));
                                    fs.setParent(s.getField(i).canonicalName, op);
									log.debug("fs: " + fs);
                                    if(null != userDefinedSchema) {
                                        Schema.FieldSchema userDefinedFieldSchema;
                                        try {
                                            if(i < userDefinedSchema.size()) {
                                                userDefinedFieldSchema = userDefinedSchema.getField(i);
                                                fs = fs.mergePrefixFieldSchema(userDefinedFieldSchema);
                                            }
                                        } catch (SchemaMergeException sme) {
                                            int errCode = 1016;
                                            String msg = "Problems in merging user defined schema";
                                            throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
                                        }
                                        outerCanonicalAlias = null;
                                    }
									String innerCanonicalAlias = fs.alias;
                                    Schema.FieldSchema newFs;
									if((null != outerCanonicalAlias) && (null != innerCanonicalAlias)) {
										String disambiguatorAlias = outerCanonicalAlias + "::" + innerCanonicalAlias;
										newFs = new Schema.FieldSchema(disambiguatorAlias, fs.schema, fs.type);
                                        newFs.setParent(s.getField(i).canonicalName, op);
                                        fss.add(newFs);
                                        updateAliasCount(aliases, disambiguatorAlias);
										//it's fine if there are duplicates
										//we just need to record if its due to
										//flattening
									} else {
										newFs = new Schema.FieldSchema(fs);
                                        newFs.setParent(s.getField(i).canonicalName, op);
										fss.add(newFs);
									}
                                    updateAliasCount(aliases, innerCanonicalAlias);
									flattenAlias.put(newFs, innerCanonicalAlias);
									inverseFlattenAlias.put(innerCanonicalAlias, true);
								}
							} else {
                                Schema.FieldSchema newFs;
                                if(null != userDefinedSchema) {
                                    if(!DataType.isSchemaType(planFs.type)) {
                                        if(userDefinedSchema.size() > 1) {
                                            int errCode = 1017;
                                            String msg = "Schema mismatch. A basic type on flattening cannot have more than one column. User defined schema: " + userDefinedSchema;
                                            throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
                                        }
								        newFs = new Schema.FieldSchema(null, planFs.type);
                                        try {
                                            newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0));
                                        } catch (SchemaMergeException sme) {
                                            int errCode = 1016;
                                            String msg = "Problems in merging user defined schema";
                                            throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
                                        }
                                        updateAliasCount(aliases, newFs.alias);
                                        fss.add(newFs);
                                        newFs.setParent(null, op);
                                    } else {
                                        for(Schema.FieldSchema ufs: userDefinedSchema.getFields()) {
                                            Schema.FieldSchema.setFieldSchemaDefaultType(ufs, DataType.BYTEARRAY);
                                            newFs = new Schema.FieldSchema(ufs);
                                            fss.add(newFs);
                                            newFs.setParent(null, op);
                                            updateAliasCount(aliases, ufs.alias);
                                        }
                                    }
								} else {
                                    if(!DataType.isSchemaType(planFs.type)) {
								        newFs = new Schema.FieldSchema(planFs.alias, planFs.type);
                                    } else {
								        newFs = new Schema.FieldSchema(null, DataType.BYTEARRAY);
                                    }
                                    fss.add(newFs);
                                    newFs.setParent(null, op);
                                }
							}
						} else {
							//just populate the schema with the field schema of the expression operator
                            //check if the user has defined a schema for the operator; compare the schema
                            //with that of the expression operator field schema and then add it to the list
                            Schema.FieldSchema newFs = new Schema.FieldSchema(planFs);
                            if(null != userDefinedSchema) {
                                try {
                                    newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0));
                                    updateAliasCount(aliases, newFs.alias);
                                } catch (SchemaMergeException sme) {
                                    int errCode = 1016;
                                    String msg = "Problems in merging user defined schema";
                                    throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
                                }
                            }
                            newFs.setParent(planFs.canonicalName, op);
                            fss.add(newFs);
						}
					} else {
						//did not get a valid list of field schemas
                        String outerCanonicalAlias = null;
                        if(null != userDefinedSchema) {
                            Schema.FieldSchema userDefinedFieldSchema = new Schema.FieldSchema(userDefinedSchema.getField(0));
                            fss.add(userDefinedFieldSchema);
                            userDefinedFieldSchema.setParent(null, op);
                            updateAliasCount(aliases, userDefinedFieldSchema.alias);
                        } else {
                            mSchema = null;
                            mIsSchemaComputed = true;
                            return mSchema;
                        }
					}
                } catch (FrontendException fee) {
                    mSchema = null;
                    mIsSchemaComputed = false;
                    throw fee;
                }
            }
			//check for duplicate column names and throw an error if there are duplicates
			//ensure that flatten gets rid of duplicate column names when the checks are
			//being done
			log.debug(" flattenAlias: " + flattenAlias);
			log.debug(" inverseFlattenAlias: " + inverseFlattenAlias);
			log.debug(" aliases: " + aliases);
			log.debug(" fss.size: " + fss.size());
			boolean duplicates = false;
			Map<String, Integer> duplicateAliases = new HashMap<String, Integer>();
			for(String alias: aliases.keySet()) {
				Integer count = aliases.get(alias);
				if(count > 1) {//not checking for null here as counts are intitalized to 1
					Boolean inFlatten = false;
					log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
					inFlatten = inverseFlattenAlias.get(alias);
					log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
					if((null == inFlatten) || (!inFlatten)) {
						duplicates = true;
						duplicateAliases.put(alias, count);
					}
				}
			}
			if(duplicates) {
				String errMessage = "Found duplicates in schema. ";
				if(duplicateAliases.size() > 0) {
					Set<String> duplicateCols = duplicateAliases.keySet();
					Iterator<String> iter = duplicateCols.iterator();
					String col = iter.next();
					errMessage += col + ": " + duplicateAliases.get(col) + " columns";
					while(iter.hasNext()) {
						col = iter.next();
						errMessage += ", " + col + ": " + duplicateAliases.get(col) + " columns";
					}
				}
				errMessage += ". Please alias the columns with unique names.";
				log.debug(errMessage);
				int errCode = 1007;
				throw new FrontendException(errMessage, errCode, PigException.INPUT, false, null);
			}
            mSchema = new Schema(fss);
			//add the aliases that are unique after flattening
			for(Schema.FieldSchema fs: mSchema.getFields()) {
				String alias = flattenAlias.get(fs);
				Integer count = aliases.get(alias);
				if (null == count) count = 1;
				log.debug("alias: " + alias);
				if((null != alias) && (count == 1)) {
					mSchema.addAlias(alias, fs);
				}
			}
            mIsSchemaComputed = true;
        }
        log.debug("Exiting getSchema");
        return mSchema;
    }

    public void unsetSchema() throws VisitorException{
        for(LogicalPlan plan: mForEachPlans) {
            SchemaRemover sr = new SchemaRemover(plan);
            sr.visit();
        }
        super.unsetSchema();
    }

    /**
     * @see org.apache.pig.impl.plan.Operator#clone()
     * Do not use the clone method directly. Operators are cloned when logical plans
     * are cloned using {@link LogicalPlanCloner}
     */
    @Override
    protected Object clone() throws CloneNotSupportedException {
        // Do generic LogicalOperator cloning
        LOForEach forEachClone = (LOForEach)super.clone();
        
        // create deep copies of attributes specific to foreach
        if(mFlatten != null) {
            forEachClone.mFlatten = new ArrayList<Boolean>();
            for (Iterator<Boolean> it = mFlatten.iterator(); it.hasNext();) {
                forEachClone.mFlatten.add(new Boolean(it.next()));
            }
        }
        
        if(mForEachPlans != null) {
            forEachClone.mForEachPlans = new ArrayList<LogicalPlan>();
            for (Iterator<LogicalPlan> it = mForEachPlans.iterator(); it.hasNext();) {
                LogicalPlanCloneHelper lpCloneHelper = new LogicalPlanCloneHelper(it.next());
                forEachClone.mForEachPlans.add(lpCloneHelper.getClonedPlan());
            }
        }
        
        if(mUserDefinedSchema != null) {
            forEachClone.mUserDefinedSchema = new ArrayList<Schema>();
            for (Iterator<Schema> it = mUserDefinedSchema.iterator(); it.hasNext();) {
                Schema s = it.next();
                forEachClone.mUserDefinedSchema.add(s != null ? s.clone() : null);
            }
        }
        return forEachClone;
    }

    @Override
    public ProjectionMap getProjectionMap() {
        Schema outputSchema;
        
        try {
            outputSchema = getSchema();
        } catch (FrontendException fee) {
            return null;
        }
        
        if(outputSchema == null) {
            return null;
        }
        
        List<LogicalOperator> predecessors = (ArrayList<LogicalOperator>)mPlan.getPredecessors(this);
        if(predecessors == null) {
            return null;
        }
        
        LogicalOperator predecessor = predecessors.get(0);
        
        Schema inputSchema;
        
        try {
            inputSchema = predecessor.getSchema();
        } catch (FrontendException fee) {
            return null;
        }
        
        List<LogicalPlan> foreachPlans = getForEachPlans();
        List<Boolean> flattenList = getFlatten();
        
        MultiMap<Integer, Pair<Integer, Integer>> mapFields = new MultiMap<Integer, Pair<Integer, Integer>>();
        List<Integer> addedFields = new ArrayList<Integer>();
        int outputColumn = 0;
        
        for(int i = 0; i < foreachPlans.size(); ++i) {
            LogicalPlan foreachPlan = foreachPlans.get(i);
            List<LogicalOperator> leaves = foreachPlan.getLeaves();
            if(leaves == null || leaves.size() > 1) {
                return null;
            }
            
            int inputColumn = -1;
            boolean mapped = false;
            
            if(leaves.get(0) instanceof LOProject) {
                //find out if this project is a chain of projects
                if(LogicalPlan.chainOfProjects(foreachPlan)) {
                    LOProject rootProject = (LOProject)foreachPlan.getRoots().get(0);
                    inputColumn = rootProject.getCol();
                    if(inputSchema != null) {
                        mapped = true;
                    }
                }
            }
            
            Schema.FieldSchema leafFS;
            try {
                leafFS = ((ExpressionOperator)leaves.get(0)).getFieldSchema();
            } catch (FrontendException fee) {
                return null;
            }
            
            if(leafFS == null) {
                return null;
            }
            
            if(flattenList.get(i)) {
                Schema innerSchema = leafFS.schema;
                
                if(innerSchema != null) {                    
                    if(innerSchema.isTwoLevelAccessRequired()) {
                        // this is the case where the schema is that of
                        // a bag which has just one tuple fieldschema which
                        // in turn has a list of fieldschemas. The schema
                        // after flattening would consist of the fieldSchemas
                        // present in the tuple
                        
                        // check that indeed we only have one field schema
                        // which is that of a tuple
                        if(innerSchema.getFields().size() != 1) {
                            return null;
                        }
                        Schema.FieldSchema tupleFS;
                        try {
                            tupleFS = innerSchema.getField(0);
                        } catch (FrontendException fee) {
                            return null;
                        }
                        
                        if(tupleFS.type != DataType.TUPLE) {
                            return null;
                        }
                        innerSchema = tupleFS.schema;
                    }
                    
                    //innerSchema could be modified and hence the second check
                    if(innerSchema != null) {
                        for(int j = 0; j < innerSchema.size(); ++j) {
                            if(mapped) {
                                //map each flattened column to the original column
                                mapFields.put(outputColumn++, new Pair<Integer, Integer>(0, inputColumn));
                            } else {
                                addedFields.add(outputColumn++);
                            }
                        }
                    } else {
                        //innerSchema is null; check for schema type
                        if(DataType.isSchemaType(leafFS.type)) {
                            //flattening a null schema results in a bytearray
                            if(mapped) {
                                //map each flattened column to the original column
                                mapFields.put(outputColumn++, new Pair<Integer, Integer>(0, inputColumn));
                            } else {
                                addedFields.add(outputColumn++);
                            }
                        } else {
                            mapFields.put(outputColumn++, new Pair<Integer, Integer>(0, inputColumn));
                        }
                    }
                } else {
                    //innerSchema is null; check for schema type
                    if(DataType.isSchemaType(leafFS.type)) {
                        //flattening a null schema results in a bytearray
                        if(mapped) {
                            //map each flattened column to the original column
                            mapFields.put(outputColumn++, new Pair<Integer, Integer>(0, inputColumn));
                        } else {
                            addedFields.add(outputColumn++);
                        }
                    } else {
                        mapFields.put(outputColumn++, new Pair<Integer, Integer>(0, inputColumn));
                    }
                }
            } else {
                //not a flattened column
                if(mapped) {
                    mapFields.put(outputColumn++, new Pair<Integer, Integer>(0, inputColumn));
                } else {
                    addedFields.add(outputColumn++);
                }
            }
        }
        
        List<Pair<Integer, Integer>> removedFields = new ArrayList<Pair<Integer, Integer>>();
       
        if(inputSchema == null) {
            //if input schema is null then there are no mappedFields and removedFields
            mapFields = null;
            removedFields = null;
        } else {
            
            //if the size of the map is zero then set it to null
            if(mapFields.size() == 0) {
                mapFields = null;
            }
            
            if(addedFields.size() == 0) {
                addedFields = null;
            }
            
            //input schema is not null. Need to compute the removedFields
            //compute the set difference between the input schema and mapped fields
            
            Set<Integer> removedSet = new HashSet<Integer>();
            for(int i = 0; i < inputSchema.size(); ++i) {
                removedSet.add(i);
            }
            
            if(mapFields != null) {
                Set<Integer> mappedSet = new HashSet<Integer>();
                for(Integer key: mapFields.keySet()) {
                    List<Pair<Integer, Integer>> values = (ArrayList<Pair<Integer, Integer>>)mapFields.get(key);
                    for(Pair<Integer, Integer> value: values) {
                        mappedSet.add(value.second);
                    }
                }
                removedSet.removeAll(mappedSet);
            }
            
            if(removedSet.size() == 0) {
                removedFields = null;
            } else {
                for(Integer i: removedSet) {
                    removedFields.add(new Pair<Integer, Integer>(0, i));
                }
            }
        }

        return new ProjectionMap(mapFields, removedFields, addedFields);
    }

    @Override
    public List<RequiredFields> getRequiredFields() {
        List<RequiredFields> requiredFields = new ArrayList<RequiredFields>();
        Set<Pair<Integer, Integer>> fields = new HashSet<Pair<Integer, Integer>>();
        Set<LOProject> projectSet = new HashSet<LOProject>();
        boolean starRequired = false;

        for (LogicalPlan plan : getForEachPlans()) {
            TopLevelProjectFinder projectFinder = new TopLevelProjectFinder(
                    plan);
            try {
                projectFinder.visit();
            } catch (VisitorException ve) {
                requiredFields.clear();
                requiredFields.add(null);
                return requiredFields;
            }
            projectSet.addAll(projectFinder.getProjectSet());
            if(projectFinder.getProjectStarSet() != null) {
                starRequired = true;
            }
        }

        if(starRequired) {
            requiredFields.add(new RequiredFields(true));
            return requiredFields;
        } else {
            for (LOProject project : projectSet) {
                for (int inputColumn : project.getProjection()) {
                    fields.add(new Pair<Integer, Integer>(0, inputColumn));
                }
            }
    
            if(fields.size() == 0) {
                requiredFields.add(new RequiredFields(false, true));
            } else {                
                requiredFields.add(new RequiredFields(new ArrayList<Pair<Integer, Integer>>(fields)));
            }
            return (requiredFields.size() == 0? null: requiredFields);
        }
    }

}