package org.apache.pig.backend.stratosphere.executionengine;
import java.io.IOException;
import java.net.Socket;
import java.net.SocketException;
import java.net.SocketImplFactory;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.ExecType;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.HDataStorage;
import org.apache.pig.backend.hadoop.executionengine.HExecutionEngine;
import org.apache.pig.backend.stratosphere.executionengine.pactLayer.PactOperator;
import org.apache.pig.backend.stratosphere.executionengine.pactLayer.plans.PactPlan;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.logical.optimizer.DanglingNestedNodeRemover;
import org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer;
import org.apache.pig.newplan.logical.optimizer.SchemaResetter;
import org.apache.pig.newplan.logical.optimizer.UidResetter;
import org.apache.pig.newplan.logical.relational.LOCogroup;
import org.apache.pig.newplan.logical.relational.LOCross;
import org.apache.pig.newplan.logical.relational.LOFilter;
import org.apache.pig.newplan.logical.relational.LOJoin;
import org.apache.pig.newplan.logical.relational.LOLoad;
import org.apache.pig.newplan.logical.relational.LOStore;
import org.apache.pig.newplan.logical.relational.LogToPactTranslationVisitor;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.rules.InputOutputFileValidator;
import org.apache.pig.newplan.logical.rules.LoadStoreFuncDupSignatureValidator;
import org.apache.pig.newplan.logical.visitor.SortInfoSetter;
import org.apache.pig.newplan.logical.visitor.StoreAliasSetter;
import org.apache.pig.pen.POOptimizeDisabler;
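/**
 * Execution engine that targets Stratosphere instead of Hadoop MapReduce. It wires up the
 * Hadoop/Pig configuration and data storage on {@link #init()}, and compiles new logical plans
 * into {@link PactPlan}s via {@link LogToPactTranslationVisitor}.
 *
 * <p>A minimal usage sketch, assuming a logical plan has already been built and the engine is
 * driven directly rather than through the usual Pig front end (variable names are illustrative):
 * <pre>
 * PigContext ctx = new PigContext(ExecType.LOCAL_STRATO, new Properties());
 * StratoEngine engine = new StratoEngine(ctx);
 * engine.init();                                        // sets up configuration and HDataStorage
 * PactPlan pactPlan = engine.compileS(logicalPlan, ctx.getProperties());
 * </pre>
 */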
public class StratoEngine extends HExecutionEngine {
// key: the operator key from the logical plan that originated the pact plan
// val: the operator key for the root of the pact plan
protected Map<OperatorKey, OperatorKey> logicalToPactKeys;
protected Map<Operator, PactOperator> newLogToPactMap;
private static final String HADOOP_SITE = "hadoop-site.xml";
private static final String CORE_SITE = "core-site.xml";
private static final String FILE_SYSTEM_LOCATION = "fs.default.name";
private static final String ALTERNATIVE_FILE_SYSTEM_LOCATION = "fs.defaultFS";
private static final String YARN_SITE = "yarn-site.xml";
private final Log log = LogFactory.getLog(getClass());
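// snapshot of the logical plan taken before optimization (see getNewPlan())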
private LogicalPlan newPreoptimizedPlan;
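// mapping from new-logical-plan operators to the PACT operators produced by translation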
protected Map<Operator, PactOperator> newLogToPhyMap;
public StratoEngine(PigContext pigContext) {
super(pigContext);
}
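/**
 * Initializes the engine using the properties held by the enclosing {@link PigContext}.
 */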
@Override
public void init() throws ExecException {
init(this.pigContext.getProperties());
}
@SuppressWarnings("deprecation")
private void init(Properties properties) throws ExecException {
this.logicalToPactKeys = new HashMap<OperatorKey, OperatorKey>();
//First set the ssh socket factory
setSSHFactory();
String cluster = null;
String nameNode = null;
// We need to build a configuration object first in the manner described below
// and then get back a properties object to inspect the JOB_TRACKER_LOCATION
// and FILE_SYSTEM_LOCATION. The reason is that if we looked only at
// the existing properties object, we might not get the right settings. So we want
// to read the configurations in the order specified below and only then look
// for JOB_TRACKER_LOCATION and FILE_SYSTEM_LOCATION.
// Hadoop by default specifies two resources, loaded in-order from the classpath:
// 1. hadoop-default.xml : Read-only defaults for hadoop.
// 2. hadoop-site.xml: Site-specific configuration for a given hadoop installation.
// Now add the settings from "properties" object to override any existing properties
// All of the above is accomplished in the method call below
// Check existence of hadoop-site.xml or core-site.xml
JobConf jc = null; //TODO: JobConf object shouldn't be needed in Stratosphere version
if(this.pigContext.getExecType() == ExecType.STRATO){
Configuration testConf = new Configuration();
ClassLoader cl = testConf.getClassLoader();
URL hadoop_site = cl.getResource( HADOOP_SITE );
URL core_site = cl.getResource( CORE_SITE );
if( hadoop_site == null && core_site == null ) {
throw new ExecException("Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath)." +
" If you plan to use local mode, please put -x local option in command line",
4010);
}
jc = new JobConf();
jc.addResource("pig-cluster-hadoop-site.xml");
jc.addResource(YARN_SITE);
// Trick to invoke static initializer of DistributedFileSystem to add hdfs-default.xml
// into configuration
new DistributedFileSystem();
//the method below alters the properties object by overriding the
//hadoop properties with the values from properties and recomputing
//the properties
recomputeProperties(jc, properties);
} //endif -- TODO: remove this when deployed to cluster
else if (this.pigContext.getExecType() == ExecType.LOCAL_STRATO) {
jc = new JobConf(false);
jc.addResource("core-default.xml");
jc.addResource("mapred-default.xml");
jc.addResource("yarn-default.xml");
recomputeProperties(jc, properties);
properties.setProperty("mapreduce.framework.name", "local");
properties.setProperty(JOB_TRACKER_LOCATION, LOCAL );
properties.setProperty(FILE_SYSTEM_LOCATION, "file:///");
properties.setProperty(ALTERNATIVE_FILE_SYSTEM_LOCATION, "file:///");
}
else {
throw new RuntimeException("Execution type " + this.pigContext.getExecType() + " is not supported");
}
cluster = properties.getProperty(JOB_TRACKER_LOCATION);
nameNode = properties.getProperty(FILE_SYSTEM_LOCATION);
if (nameNode==null)
nameNode = (String)pigContext.getProperties().get(ALTERNATIVE_FILE_SYSTEM_LOCATION);
if (cluster != null && cluster.length() > 0) {
if(!cluster.contains(":") && !cluster.equalsIgnoreCase(LOCAL)) {
cluster = cluster + ":50020";
}
properties.setProperty(JOB_TRACKER_LOCATION, cluster);
}
if (nameNode!=null && nameNode.length() > 0) {
if(!nameNode.contains(":") && !nameNode.equalsIgnoreCase(LOCAL)) {
nameNode = nameNode + ":8020";
}
properties.setProperty(FILE_SYSTEM_LOCATION, nameNode);
}
log.info("Connecting to hadoop file system at: " + (nameNode==null? LOCAL: nameNode) ) ;
// constructor sets DEFAULT_REPLICATION_FACTOR_KEY
ds = new HDataStorage(properties);
if(cluster != null && !cluster.equalsIgnoreCase(LOCAL)){
//TODO: Connecting to Nephele Job Manager...
log.info("Connecting to Nephele JobManager at: *MISSING*");
}
}
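/**
 * Installs an SSH tunnelling socket factory when the "ssh.gateway" property is set. The factory
 * class is loaded reflectively so the SSH support classes are only pulled in when needed.
 */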
@SuppressWarnings({ "unchecked", "rawtypes" })
private void setSSHFactory(){
Properties properties = this.pigContext.getProperties();
String g = properties.getProperty("ssh.gateway");
if (g == null || g.length() == 0) return;
try {
Class clazz = Class.forName("org.apache.pig.shock.SSHSocketImplFactory");
SocketImplFactory f = (SocketImplFactory) clazz.getMethod("getFactory", new Class[0]).invoke(null, new Object[0]);
Socket.setSocketImplFactory(f);
}
catch (SocketException e) {
// Socket.setSocketImplFactory throws SocketException if a factory is already installed; ignore that case
}
catch (Exception e){
throw new RuntimeException(e);
}
}
/**
 * Applies Pig properties to the JobConf and then replaces the contents of the
 * properties object with the resulting JobConf values.
 * @param jobConf JobConf populated with the appropriate hadoop resource files
 * @param properties Pig properties that will override hadoop properties; this object is modified in place
 */
@SuppressWarnings("deprecation")
private void recomputeProperties(JobConf jobConf, Properties properties) {
// We need to load the properties from the hadoop configuration
// We want to override these with any existing properties we have.
if (jobConf != null && properties != null) {
// set user properties on the jobConf to ensure that defaults
// and deprecation handling are applied correctly
Enumeration<Object> propertiesIter = properties.keys();
while (propertiesIter.hasMoreElements()) {
String key = (String) propertiesIter.nextElement();
String val = properties.getProperty(key);
// We do not put user.name, See PIG-1419
if (!key.equals("user.name"))
jobConf.set(key, val);
}
//clear user defined properties and re-populate
properties.clear();
Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
while (iter.hasNext()) {
Map.Entry<String, String> entry = iter.next();
properties.put(entry.getKey(), entry.getValue());
}
}
}
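/**
 * Compiles a new logical plan into a {@link PactPlan}: normalizes the plan (dangling nested
 * nodes, uids, schemas), runs the logical optimizer, validates input/output files, checks that
 * only the currently supported operators appear, and finally translates the result with
 * {@link LogToPactTranslationVisitor}.
 *
 * @param plan the logical plan to compile
 * @param properties Pig properties; not consulted directly here, configuration is read from the PigContext
 * @return the compiled PACT plan
 * @throws FrontendException if the plan is null, fails validation, or uses an unsupported operator
 */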
@SuppressWarnings("unchecked")
@Override
public PactPlan compileS(LogicalPlan plan, Properties properties) throws FrontendException {
if (plan == null) {
int errCode = 2041;
String msg = "No Plan to compile";
throw new FrontendException(msg, errCode, PigException.BUG);
}
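// keep a copy of the plan before optimization so it can be retrieved later via getNewPlan()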
newPreoptimizedPlan = new LogicalPlan(plan);
if (pigContext.inIllustrator) {
// disable all PO-specific optimizations
POOptimizeDisabler pod = new POOptimizeDisabler( plan );
pod.visit();
}
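// normalize the plan before optimization: drop dangling nested nodes, then reset uids and schemas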
DanglingNestedNodeRemover danglingNestedNodeRemover = new DanglingNestedNodeRemover( plan );
danglingNestedNodeRemover.visit();
UidResetter uidResetter = new UidResetter( plan );
uidResetter.visit();
SchemaResetter schemaResetter = new SchemaResetter( plan, true /*skip duplicate uid check*/ );
schemaResetter.visit();
HashSet<String> optimizerRules = null;
try {
optimizerRules = (HashSet<String>) ObjectSerializer
.deserialize(pigContext.getProperties().getProperty(
"pig.optimizer.rules"));
} catch (IOException ioe) {
int errCode = 2110;
String msg = "Unable to deserialize optimizer rules.";
throw new FrontendException(msg, errCode, PigException.BUG, ioe);
}
if (pigContext.inIllustrator) {
// disable optimizer rules that interfere with the illustrator
if (optimizerRules == null)
optimizerRules = new HashSet<String>();
optimizerRules.add("MergeForEach");
optimizerRules.add("PartitionFilterOptimizer");
optimizerRules.add("LimitOptimizer");
optimizerRules.add("SplitFilter");
optimizerRules.add("PushUpFilter");
optimizerRules.add("MergeFilter");
optimizerRules.add("PushDownForEachFlatten");
optimizerRules.add("ColumnMapKeyPrune");
optimizerRules.add("AddForEach");
optimizerRules.add("GroupByConstParallelSetter");
}
// Check for duplicate load/store func signatures
LoadStoreFuncDupSignatureValidator loadStoreFuncDupSignatureValidator = new LoadStoreFuncDupSignatureValidator(plan);
loadStoreFuncDupSignatureValidator.validate();
StoreAliasSetter storeAliasSetter = new StoreAliasSetter( plan );
storeAliasSetter.visit();
// run optimizer
LogicalPlanOptimizer optimizer = new LogicalPlanOptimizer( plan, 100, optimizerRules );
optimizer.optimize();
// compute whether output data is sorted or not
SortInfoSetter sortInfoSetter = new SortInfoSetter( plan );
sortInfoSetter.visit();
if (!pigContext.inExplain) {
// Validate input/output file. Currently no validation framework in
// new logical plan, put this validator here first.
// We might decide to move it out to a validator framework in future
InputOutputFileValidator validator = new InputOutputFileValidator( plan, pigContext );
validator.validate();
}
// check that only supported operators are used
Iterator<Operator> operators = plan.getOperators();
while (operators.hasNext()) {
Operator current = operators.next();
if (!( current instanceof LOLoad || current instanceof LOStore ||
current instanceof LOFilter || current instanceof LOJoin ||
current instanceof LOCross || current instanceof LOCogroup )) {
throw new FrontendException("Operator " + current.getName()
+ " is not currently supported", -2013, PigException.ERROR);
}
}
// translate new logical plan to pact plan
LogToPactTranslationVisitor translator = new LogToPactTranslationVisitor(plan);
translator.setPigContext(pigContext);
translator.visit();
newLogToPhyMap = translator.getLogToPhyMap();
return translator.getPhysicalPlan();
}
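/**
 * @return the copy of the logical plan taken before optimization in {@link #compileS}
 */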
public LogicalPlan getNewPlan() {
return newPreoptimizedPlan;
}
}