package org.apache.pig.backend.stratosphere.executionengine;

import java.io.IOException;
import java.net.Socket;
import java.net.SocketException;
import java.net.SocketImplFactory;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.ExecType;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.HDataStorage;
import org.apache.pig.backend.hadoop.executionengine.HExecutionEngine;
import org.apache.pig.backend.stratosphere.executionengine.pactLayer.PactOperator;
import org.apache.pig.backend.stratosphere.executionengine.pactLayer.plans.PactPlan;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.logical.optimizer.DanglingNestedNodeRemover;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.newplan.logical.optimizer.SchemaResetter;
import org.apache.pig.newplan.logical.optimizer.UidResetter;
import org.apache.pig.newplan.logical.relational.LOCogroup;
import org.apache.pig.newplan.logical.relational.LOCross;
import org.apache.pig.newplan.logical.relational.LOFilter;
import org.apache.pig.newplan.logical.relational.LOJoin;
import org.apache.pig.newplan.logical.relational.LOLoad;
import org.apache.pig.newplan.logical.relational.LOStore;
import org.apache.pig.newplan.logical.relational.LogToPactTranslationVisitor;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.rules.InputOutputFileValidator;
import org.apache.pig.newplan.logical.rules.LoadStoreFuncDupSignatureValidator;
import org.apache.pig.newplan.logical.visitor.SortInfoSetter;
import org.apache.pig.newplan.logical.visitor.StoreAliasSetter;
import org.apache.pig.pen.POOptimizeDisabler;

public class StratoEngine extends HExecutionEngine {

    // key: the operator key from the logical plan that originated the pact plan
    // val: the operator key for the root of the pact plan
    protected Map<OperatorKey, OperatorKey> logicalToPactKeys;

    protected Map<Operator, PactOperator> newLogToPactMap;

    private static final String HADOOP_SITE = "hadoop-site.xml";
    private static final String CORE_SITE = "core-site.xml";
    private static final String FILE_SYSTEM_LOCATION = "fs.default.name";
    private static final String ALTERNATIVE_FILE_SYSTEM_LOCATION = "fs.defaultFS";
    private static final String YARN_SITE = "yarn-site.xml";

    private final Log log = LogFactory.getLog(getClass());

    private LogicalPlan newPreoptimizedPlan;

    protected Map<Operator, PactOperator> newLogToPhyMap;

    public StratoEngine(PigContext pigContext) {
        super(pigContext);
    }

    @Override
    public void init() throws ExecException {
        init(this.pigContext.getProperties());
    }
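    /**
     * Initializes the engine from the given properties: builds a Hadoop configuration from the
     * resource files found on the classpath, merges the Pig properties over it via
     * {@link #recomputeProperties(JobConf, Properties)}, normalizes the job tracker and
     * file system addresses, and creates the {@link HDataStorage} backing this engine.
     */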
    @SuppressWarnings("deprecation")
    private void init(Properties properties) throws ExecException {
        this.logicalToPactKeys = new HashMap<OperatorKey, OperatorKey>();

        // First set the ssh socket factory
        setSSHFactory();

        String cluster = null;
        String nameNode = null;

        // We need to build a configuration object first in the manner described below
        // and then get back a properties object to inspect the JOB_TRACKER_LOCATION
        // and FILE_SYSTEM_LOCATION. The reason to do this is that if we looked only at
        // the existing properties object, we might not get the right settings. So we want
        // to read the configurations in the order specified below and only then look
        // for JOB_TRACKER_LOCATION and FILE_SYSTEM_LOCATION.

        // Hadoop by default specifies two resources, loaded in order from the classpath:
        // 1. hadoop-default.xml : read-only defaults for Hadoop.
        // 2. hadoop-site.xml    : site-specific configuration for a given Hadoop installation.
        // The settings from the "properties" object are then added to override any existing
        // properties. All of the above is accomplished in the method call below.

        // Check for the existence of hadoop-site.xml or core-site.xml
        JobConf jc = null; // TODO: a JobConf object shouldn't be needed in the Stratosphere version
        if (this.pigContext.getExecType() == ExecType.STRATO) {
            Configuration testConf = new Configuration();
            ClassLoader cl = testConf.getClassLoader();
            URL hadoop_site = cl.getResource(HADOOP_SITE);
            URL core_site = cl.getResource(CORE_SITE);

            if (hadoop_site == null && core_site == null) {
                throw new ExecException(
                        "Cannot find hadoop configurations in classpath "
                        + "(neither hadoop-site.xml nor core-site.xml was found in the classpath)."
                        + " If you plan to use local mode, please put -x local option in command line",
                        4010);
            }

            jc = new JobConf();
            jc.addResource("pig-cluster-hadoop-site.xml");
            jc.addResource(YARN_SITE);

            // Trick to invoke the static initializer of DistributedFileSystem so that
            // hdfs-default.xml is added to the configuration
            new DistributedFileSystem();

            // The method below alters the properties object by overriding the
            // hadoop properties with the values from properties and recomputing
            // the properties
            recomputeProperties(jc, properties);
        } // endif -- TODO: remove this when deployed to cluster
        else if (this.pigContext.getExecType() == ExecType.LOCAL_STRATO) {
            jc = new JobConf(false);
            jc.addResource("core-default.xml");
            jc.addResource("mapred-default.xml");
            jc.addResource("yarn-default.xml");
            recomputeProperties(jc, properties);

            properties.setProperty("mapreduce.framework.name", "local");
            properties.setProperty(JOB_TRACKER_LOCATION, LOCAL);
            properties.setProperty(FILE_SYSTEM_LOCATION, "file:///");
            properties.setProperty(ALTERNATIVE_FILE_SYSTEM_LOCATION, "file:///");
        } else {
            throw new RuntimeException("Execution type not supported");
        }

        cluster = properties.getProperty(JOB_TRACKER_LOCATION);
        nameNode = properties.getProperty(FILE_SYSTEM_LOCATION);
        if (nameNode == null) {
            nameNode = (String) pigContext.getProperties().get(ALTERNATIVE_FILE_SYSTEM_LOCATION);
        }

        if (cluster != null && cluster.length() > 0) {
            if (!cluster.contains(":") && !cluster.equalsIgnoreCase(LOCAL)) {
                cluster = cluster + ":50020";
            }
            properties.setProperty(JOB_TRACKER_LOCATION, cluster);
        }

        if (nameNode != null && nameNode.length() > 0) {
            if (!nameNode.contains(":") && !nameNode.equalsIgnoreCase(LOCAL)) {
                nameNode = nameNode + ":8020";
            }
            properties.setProperty(FILE_SYSTEM_LOCATION, nameNode);
        }

        log.info("Connecting to hadoop file system at: " + (nameNode == null ? LOCAL : nameNode));

        // constructor sets DEFAULT_REPLICATION_FACTOR_KEY
        ds = new HDataStorage(properties);

        if (cluster != null && !cluster.equalsIgnoreCase(LOCAL)) {
            // TODO: Connecting to Nephele Job Manager...
            log.info("Connecting to Nephele JobManager at: *MISSING*");
        }
    }
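    // Installs the SSH tunneling socket factory when the "ssh.gateway" property is set,
    // so that subsequent socket connections are routed through the configured gateway.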
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void setSSHFactory() {
        Properties properties = this.pigContext.getProperties();
        String g = properties.getProperty("ssh.gateway");
        if (g == null || g.length() == 0) {
            return;
        }
        try {
            Class clazz = Class.forName("org.apache.pig.shock.SSHSocketImplFactory");
            SocketImplFactory f =
                    (SocketImplFactory) clazz.getMethod("getFactory", new Class[0]).invoke(0, new Object[0]);
            Socket.setSocketImplFactory(f);
        } catch (SocketException e) {
            // a socket factory is already installed; nothing to do
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Method to apply pig properties to the JobConf
     * (replaces properties with the resulting jobConf values).
     * @param jobConf JobConf with the appropriate hadoop resource files
     * @param properties Pig properties that will override hadoop properties;
     *                   properties might be modified
     */
    @SuppressWarnings("deprecation")
    private void recomputeProperties(JobConf jobConf, Properties properties) {
        // We need to load the properties from the hadoop configuration,
        // and we want to override these with any existing properties we have.
        if (jobConf != null && properties != null) {
            // Set user properties on the jobConf to ensure that defaults
            // and deprecation are applied correctly
            Enumeration<Object> propertiesIter = properties.keys();
            while (propertiesIter.hasMoreElements()) {
                String key = (String) propertiesIter.nextElement();
                String val = properties.getProperty(key);
                // We do not put user.name, see PIG-1419
                if (!key.equals("user.name")) {
                    jobConf.set(key, val);
                }
            }
            // Clear user-defined properties and re-populate from the jobConf
            properties.clear();
            Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
            while (iter.hasNext()) {
                Map.Entry<String, String> entry = iter.next();
                properties.put(entry.getKey(), entry.getValue());
            }
        }
    }

    @SuppressWarnings("unchecked")
    @Override
    public PactPlan compileS(LogicalPlan plan, Properties properties) throws FrontendException {
        if (plan == null) {
            int errCode = 2041;
            String msg = "No Plan to compile";
            throw new FrontendException(msg, errCode, PigException.BUG);
        }

        newPreoptimizedPlan = new LogicalPlan(plan);

        if (pigContext.inIllustrator) {
            // disable all PO-specific optimizations
            POOptimizeDisabler pod = new POOptimizeDisabler(plan);
            pod.visit();
        }

        DanglingNestedNodeRemover danglingNestedNodeRemover = new DanglingNestedNodeRemover(plan);
        danglingNestedNodeRemover.visit();

        UidResetter uidResetter = new UidResetter(plan);
        uidResetter.visit();

        SchemaResetter schemaResetter = new SchemaResetter(plan, true /* skip duplicate uid check */);
        schemaResetter.visit();

        HashSet<String> optimizerRules = null;
        try {
            optimizerRules = (HashSet<String>) ObjectSerializer.deserialize(
                    pigContext.getProperties().getProperty("pig.optimizer.rules"));
        } catch (IOException ioe) {
            int errCode = 2110;
            String msg = "Unable to deserialize optimizer rules.";
            throw new FrontendException(msg, errCode, PigException.BUG, ioe);
        }
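        // Illustrative note: as in upstream Pig, "pig.optimizer.rules" is expected to carry a
        // serialized HashSet<String> of rule names to turn OFF, which a caller could set up as:
        //   HashSet<String> disabled = new HashSet<String>();
        //   disabled.add("MergeFilter");
        //   properties.setProperty("pig.optimizer.rules", ObjectSerializer.serialize(disabled));
        // (sketch only; assumes the caller handles the IOException thrown by serialize)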
        if (pigContext.inIllustrator) {
            // disable MergeForEach and the other rules listed below in illustrator mode
            if (optimizerRules == null) {
                optimizerRules = new HashSet<String>();
            }
            optimizerRules.add("MergeForEach");
            optimizerRules.add("PartitionFilterOptimizer");
            optimizerRules.add("LimitOptimizer");
            optimizerRules.add("SplitFilter");
            optimizerRules.add("PushUpFilter");
            optimizerRules.add("MergeFilter");
            optimizerRules.add("PushDownForEachFlatten");
            optimizerRules.add("ColumnMapKeyPrune");
            optimizerRules.add("AddForEach");
            optimizerRules.add("GroupByConstParallelSetter");
        }

        // Check if we have duplicate load/store func signatures
        LoadStoreFuncDupSignatureValidator loadStoreFuncDupSignatureValidator =
                new LoadStoreFuncDupSignatureValidator(plan);
        loadStoreFuncDupSignatureValidator.validate();

        StoreAliasSetter storeAliasSetter = new StoreAliasSetter(plan);
        storeAliasSetter.visit();

        // run the optimizer
        LogicalPlanOptimizer optimizer = new LogicalPlanOptimizer(plan, 100, optimizerRules);
        optimizer.optimize();

        // compute whether the output data is sorted or not
        SortInfoSetter sortInfoSetter = new SortInfoSetter(plan);
        sortInfoSetter.visit();

        if (!pigContext.inExplain) {
            // Validate input/output files. There is currently no validation framework in the
            // new logical plan, so this validator is placed here first.
            // We might decide to move it out to a validator framework in the future.
            InputOutputFileValidator validator = new InputOutputFileValidator(plan, pigContext);
            validator.validate();
        }

        // check that only supported operators are used
        Iterator<Operator> operators = plan.getOperators();
        while (operators.hasNext()) {
            Operator current = operators.next();
            if (!(current instanceof LOLoad || current instanceof LOStore
                    || current instanceof LOFilter || current instanceof LOJoin
                    || current instanceof LOCross || current instanceof LOCogroup)) {
                throw new FrontendException("Operator " + current.getName()
                        + " is not currently supported", -2013, PigException.ERROR);
            }
        }

        // translate the new logical plan to a pact plan
        LogToPactTranslationVisitor translator = new LogToPactTranslationVisitor(plan);
        translator.setPigContext(pigContext);
        translator.visit();
        newLogToPhyMap = translator.getLogToPhyMap();
        return translator.getPhysicalPlan();
    }

    public LogicalPlan getNewPlan() {
        return newPreoptimizedPlan;
    }
}
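/*
 * Minimal usage sketch (illustrative only; assumes a PigContext configured for
 * ExecType.STRATO or ExecType.LOCAL_STRATO and an already-built LogicalPlan):
 *
 *   StratoEngine engine = new StratoEngine(pigContext);
 *   engine.init();  // wires up the file system configuration
 *   PactPlan pactPlan = engine.compileS(logicalPlan, pigContext.getProperties());
 */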