/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer;

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.plans.MROpPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCounter;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PORank;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POUnion;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.Operator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.MultiMap;

/**
 * An operator model for a Map Reduce job.
 * Acts as a host to the plans that will
 * execute in map, reduce and optionally combine
 * phases. These will be embedded in the MROperPlan
 * in order to capture the dependencies amongst jobs.
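 *
 * <p>A minimal construction sketch (illustrative only; in practice the
 * MRCompiler in this package builds and wires these operators, and the
 * scope string and plan contents below are hypothetical):
 * <pre>
 *   String scope = "scope-1";
 *   NodeIdGenerator nig = NodeIdGenerator.getGenerator();
 *   MapReduceOper mro =
 *       new MapReduceOper(new OperatorKey(scope, nig.getNextNodeId(scope)));
 *   mro.mapPlan.add(loadOp);  // loadOp: some root PhysicalOperator (hypothetical)
 *   mro.setMapDone(true);     // mark the map-phase plan complete
 * </pre>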
 */
public class MapReduceOper extends Operator<MROpPlanVisitor> {
    private static final long serialVersionUID = 1L;

    //The physical plan that should be executed
    //in the map phase
    public PhysicalPlan mapPlan;

    //The physical plan that should be executed
    //in the reduce phase
    public PhysicalPlan reducePlan;

    //The physical plan that should be executed
    //in the combine phase if one exists. Will be used
    //by the optimizer.
    public PhysicalPlan combinePlan;

    // key for the map plan
    // this is needed when the key is null to create
    // an appropriate NullableXXXWritable object
    public byte mapKeyType;

    //Indicates that the map plan creation
    //is complete
    boolean mapDone = false;

    //Indicates that the reduce plan creation
    //is complete
    boolean reduceDone = false;

    // Indicates that there is an operator which uses endOfAllInput flag in the
    // map plan
    boolean endOfAllInputInMap = false;

    // Indicates that there is an operator which uses endOfAllInput flag in the
    // reduce plan
    boolean endOfAllInputInReduce = false;

    //Indicates if this job is an order by job
    boolean globalSort = false;

    // Indicates if this is a limit after a sort
    boolean limitAfterSort = false;

    // Indicates if the entire purpose of this map reduce job is doing limit,
    // and nothing else. This is to help POPackageAnnotator find the right
    // POPackage to annotate.
    boolean limitOnly = false;

    OPER_FEATURE feature = OPER_FEATURE.NONE;

    // If true, putting an identity combine in this
    // mapreduce job will speed things up.
    boolean needsDistinctCombiner = false;

    // If true, we will use a secondary key in the map-reduce job
    boolean useSecondaryKey = false;

    //The quantiles file name if globalSort is true
    String quantFile;

    //The sort order of the columns;
    //asc is true and desc is false
    boolean[] sortOrder;

    // Sort order for secondary keys
    boolean[] secondarySortOrder;

    public Set<String> UDFs;

    public Set<PhysicalOperator> scalars;

    // Indicates if a UDF comparator is used
    boolean isUDFComparatorUsed = false;

    transient NodeIdGenerator nig;

    private String scope;

    int requestedParallelism = -1;

    // estimated at runtime
    int estimatedParallelism = -1;

    // calculated at runtime
    int runtimeParallelism = -1;

    /* Name of the custom partitioner used */
    String customPartitioner = null;

    // Last POLimit value in this map reduce operator, needed by LimitAdjuster
    // to add an additional map reduce operator with 1 reducer after this
    long limit = -1;

    // POLimit can also have an expression. See PIG-1926
    PhysicalPlan limitPlan = null;

    // Indicates that this MROper is a splitter MROper.
    // That is, this MROper ends due to a POSplit operator.
    private boolean splitter = false;

    // Set to true if it is a skewed join
    private boolean skewedJoin = false;

    // Name of the partition file generated by the sampling process,
    // used by skewed join
    private String skewedJoinPartitionFile;

    // Flag to communicate from MRCompiler to JobControlCompiler what kind of
    // comparator is used by Hadoop for sorting for this MROper.
    // By default, set to false, which will make Pig provide raw comparators.
    // Set to true in the indexing job generated in map-side cogroup and merge join.
    private boolean usingTypedComparator = false;

    // Flag to indicate if small input splits need to be combined to form a larger
    // one in order to reduce the number of mappers. For merge join, splits of
    // both tables must NOT be combined, for correctness.
    private boolean combineSmallSplits = true;

    // Map of a physical operator in the physical plan to the one in the MR plan: only needed
    // if the physical operator is changed/replaced in MR compilation due to, e.g., optimization
    public MultiMap<PhysicalOperator, PhysicalOperator> phyToMRMap;

    private enum OPER_FEATURE {
        NONE,
        // Indicates that this job is a sampling job
        SAMPLER,
        // Indicates that this job is a merge indexer
        INDEXER,
        // Indicates that this job is a group by job
        GROUPBY,
        // Indicates that this job is a cogroup job
        COGROUP,
        // Indicates that this job is a regular join job
        HASHJOIN
    }

    public MapReduceOper(OperatorKey k) {
        super(k);
        mapPlan = new PhysicalPlan();
        combinePlan = new PhysicalPlan();
        reducePlan = new PhysicalPlan();
        UDFs = new HashSet<String>();
        scalars = new HashSet<PhysicalOperator>();
        nig = NodeIdGenerator.getGenerator();
        scope = k.getScope();
        phyToMRMap = new MultiMap<PhysicalOperator, PhysicalOperator>();
    }
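
    /**
     * Indents each line of the given multi-line string with the given
     * prefix; used by {@link #name()} to nest the map and reduce plan
     * explanations under this operator's header line.
     */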
    private String shiftStringByTabs(String DFStr, String tab) {
        StringBuilder sb = new StringBuilder();
        String[] spl = DFStr.split("\n");
        for (int i = 0; i < spl.length; i++) {
            sb.append(tab);
            sb.append(spl[i]);
            sb.append("\n");
        }
        sb.delete(sb.length() - "\n".length(), sb.length());
        return sb.toString();
    }

    /**
     * Uses the string representation of the
     * component plans to identify itself.
     */
    @Override
    public String name() {
        String udfStr = getUDFsAsStr();
        StringBuilder sb = new StringBuilder("MapReduce" + "(" + requestedParallelism +
                (udfStr.equals("") ? "" : ",") + udfStr + ")" + " - " + mKey.toString() + ":\n");
        int index = sb.length();
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        if (!mapPlan.isEmpty()) {
            mapPlan.explain(baos);
            String mp = new String(baos.toByteArray());
            sb.append(shiftStringByTabs(mp, "| "));
        } else {
            sb.append("Map Plan Empty");
        }
        if (!reducePlan.isEmpty()) {
            baos.reset();
            reducePlan.explain(baos);
            String rp = new String(baos.toByteArray());
            sb.insert(index, shiftStringByTabs(rp, "| ") + "\n");
        } else {
            sb.insert(index, "Reduce Plan Empty" + "\n");
        }
        return sb.toString();
    }

    private String getUDFsAsStr() {
        StringBuilder sb = new StringBuilder();
        if (UDFs != null && UDFs.size() > 0) {
            for (String str : UDFs) {
                sb.append(str.substring(str.lastIndexOf('.') + 1));
                sb.append(',');
            }
            sb.deleteCharAt(sb.length() - 1);
        }
        return sb.toString();
    }

    @Override
    public boolean supportsMultipleInputs() {
        return true;
    }

    @Override
    public boolean supportsMultipleOutputs() {
        return true;
    }

    @Override
    public void visit(MROpPlanVisitor v) throws VisitorException {
        v.visitMROp(this);
    }

    public boolean isMapDone() {
        return mapDone;
    }

    public void setMapDone(boolean mapDone) {
        this.mapDone = mapDone;
    }

    // Marks the map plan done; if the plan has more than one leaf, joins the
    // leaves under a POUnion so the finished plan has a single leaf.
    public void setMapDoneSingle(boolean mapDone) throws PlanException {
        this.mapDone = mapDone;
        if (mapDone && mapPlan.getLeaves().size() > 1) {
            mapPlan.addAsLeaf(getUnion());
        }
    }

    public void setMapDoneMultiple(boolean mapDone) throws PlanException {
        this.mapDone = mapDone;
        if (mapDone && mapPlan.getLeaves().size() > 0) {
            mapPlan.addAsLeaf(getUnion());
        }
    }

    private POUnion getUnion() {
        return new POUnion(new OperatorKey(scope, nig.getNextNodeId(scope)));
    }

    public boolean isReduceDone() {
        return reduceDone;
    }

    public void setReduceDone(boolean reduceDone) {
        this.reduceDone = reduceDone;
    }

    public boolean isGlobalSort() {
        return globalSort;
    }

    public boolean isSkewedJoin() {
        return (skewedJoinPartitionFile != null);
    }

    public void setSkewedJoinPartitionFile(String file) {
        skewedJoinPartitionFile = file;
    }

    public String getSkewedJoinPartitionFile() {
        return skewedJoinPartitionFile;
    }

    public void setSkewedJoin(boolean skJoin) {
        this.skewedJoin = skJoin;
    }

    public boolean getSkewedJoin() {
        return skewedJoin;
    }

    public void setGlobalSort(boolean globalSort) {
        this.globalSort = globalSort;
    }

    public boolean isLimitAfterSort() {
        return limitAfterSort;
    }

    public void setLimitAfterSort(boolean las) {
        limitAfterSort = las;
    }

    public boolean isLimitOnly() {
        return limitOnly;
    }

    public void setLimitOnly(boolean limitOnly) {
        this.limitOnly = limitOnly;
    }

    public boolean isIndexer() {
        return (feature == OPER_FEATURE.INDEXER);
    }

    public void markIndexer() {
        feature = OPER_FEATURE.INDEXER;
    }

    public boolean isSampler() {
        return (feature == OPER_FEATURE.SAMPLER);
    }

    public void markSampler() {
        feature = OPER_FEATURE.SAMPLER;
    }

    public boolean isGroupBy() {
        return (feature == OPER_FEATURE.GROUPBY);
    }

    public void markGroupBy() {
        feature = OPER_FEATURE.GROUPBY;
    }

    public boolean isCogroup() {
        return (feature == OPER_FEATURE.COGROUP);
    }

    public void markCogroup() {
        feature = OPER_FEATURE.COGROUP;
    }

    public boolean isRegularJoin() {
        return (feature == OPER_FEATURE.HASHJOIN);
    }

    public void markRegularJoin() {
        feature = OPER_FEATURE.HASHJOIN;
    }

    public boolean needsDistinctCombiner() {
        return needsDistinctCombiner;
    }

    public void setNeedsDistinctCombiner(boolean nic) {
        needsDistinctCombiner = nic;
    }

    public String getQuantFile() {
        return quantFile;
    }
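
    // The quantiles file is produced by the sampling job that precedes an
    // order-by job; it determines how sort keys are partitioned across reducers.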
    public void setQuantFile(String quantFile) {
        this.quantFile = quantFile;
    }

    public void setSortOrder(boolean[] sortOrder) {
        if (null == sortOrder) return;
        this.sortOrder = new boolean[sortOrder.length];
        for (int i = 0; i < sortOrder.length; ++i) {
            this.sortOrder[i] = sortOrder[i];
        }
    }

    public void setSecondarySortOrder(boolean[] secondarySortOrder) {
        if (null == secondarySortOrder) return;
        this.secondarySortOrder = new boolean[secondarySortOrder.length];
        for (int i = 0; i < secondarySortOrder.length; ++i) {
            this.secondarySortOrder[i] = secondarySortOrder[i];
        }
    }

    public boolean[] getSortOrder() {
        return sortOrder;
    }

    public boolean[] getSecondarySortOrder() {
        return secondarySortOrder;
    }

    /**
     * @return whether end of all input is set in the map plan
     */
    public boolean isEndOfAllInputSetInMap() {
        return endOfAllInputInMap;
    }

    /**
     * @param endOfAllInputInMap the endOfAllInputInMap to set
     */
    public void setEndOfAllInputInMap(boolean endOfAllInputInMap) {
        this.endOfAllInputInMap = endOfAllInputInMap;
    }

    /**
     * @return whether end of all input is set in the reduce plan
     */
    public boolean isEndOfAllInputSetInReduce() {
        return endOfAllInputInReduce;
    }

    /**
     * @param endOfAllInputInReduce the endOfAllInputInReduce to set
     */
    public void setEndOfAllInputInReduce(boolean endOfAllInputInReduce) {
        this.endOfAllInputInReduce = endOfAllInputInReduce;
    }

    public int getRequestedParallelism() {
        return requestedParallelism;
    }

    public String getCustomPartitioner() {
        return customPartitioner;
    }

    public void setSplitter(boolean spl) {
        splitter = spl;
    }

    public boolean isSplitter() {
        return splitter;
    }

    public boolean getUseSecondaryKey() {
        return useSecondaryKey;
    }

    public void setUseSecondaryKey(boolean useSecondaryKey) {
        this.useSecondaryKey = useSecondaryKey;
    }

    protected boolean usingTypedComparator() {
        return usingTypedComparator;
    }

    protected void useTypedComparator(boolean useTypedComparator) {
        this.usingTypedComparator = useTypedComparator;
    }

    protected void noCombineSmallSplits() {
        combineSmallSplits = false;
    }

    public boolean combineSmallSplits() {
        return combineSmallSplits;
    }

    public boolean isRankOperation() {
        return getRankOperationId().size() != 0;
    }

    // Collects the operation IDs of all PORank operators rooted in the map plan.
    public ArrayList<String> getRankOperationId() {
        ArrayList<String> operationIDs = new ArrayList<String>();
        Iterator<PhysicalOperator> mapRoots = this.mapPlan.getRoots().iterator();
        while (mapRoots.hasNext()) {
            PhysicalOperator operation = mapRoots.next();
            if (operation instanceof PORank) {
                operationIDs.add(((PORank) operation).getOperationID());
            }
        }
        return operationIDs;
    }

    public boolean isCounterOperation() {
        return (getCounterOperation() != null);
    }

    public boolean isRowNumber() {
        POCounter counter = getCounterOperation();
        return (counter != null) ? counter.isRowNumber() : false;
    }

    public String getOperationID() {
        POCounter counter = getCounterOperation();
        return (counter != null) ? counter.getOperationID() : null;
    }

    // Returns the first POCounter found among the leaves of the map plan,
    // then the reduce plan, or null if neither plan ends in a POCounter.
    private POCounter getCounterOperation() {
        PhysicalOperator operator;
        Iterator<PhysicalOperator> it = this.mapPlan.getLeaves().iterator();
        while (it.hasNext()) {
            operator = it.next();
            if (operator instanceof POCounter) {
                return (POCounter) operator;
            }
        }
        it = this.reducePlan.getLeaves().iterator();
        while (it.hasNext()) {
            operator = it.next();
            if (operator instanceof POCounter) {
                return (POCounter) operator;
            }
        }
        return null;
    }
}