/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.sysml.runtime.controlprogram; import java.io.BufferedWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.Level; import org.apache.sysml.api.DMLScript; import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; import org.apache.sysml.conf.CompilerConfig; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.hops.recompile.Recompiler; import org.apache.sysml.lops.Lop; import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.parser.DMLProgram; import org.apache.sysml.parser.DataIdentifier; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.parser.ParForStatementBlock; import org.apache.sysml.parser.StatementBlock; import org.apache.sysml.parser.VariableSet; import 
org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.controlprogram.parfor.DataPartitioner; import org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerLocal; import org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerRemoteMR; import org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerRemoteSpark; import org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker; import org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue; import org.apache.sysml.runtime.controlprogram.parfor.ParForBody; import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter; import org.apache.sysml.runtime.controlprogram.parfor.RemoteDPParForMR; import org.apache.sysml.runtime.controlprogram.parfor.RemoteDPParForSpark; import org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn; import org.apache.sysml.runtime.controlprogram.parfor.RemoteParForMR; import org.apache.sysml.runtime.controlprogram.parfor.RemoteParForSpark; import org.apache.sysml.runtime.controlprogram.parfor.ResultMerge; import org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalAutomatic; import org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalFile; import org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalMemory; import org.apache.sysml.runtime.controlprogram.parfor.ResultMergeRemoteMR; import org.apache.sysml.runtime.controlprogram.parfor.ResultMergeRemoteSpark; import org.apache.sysml.runtime.controlprogram.parfor.Task; import org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner; import org.apache.sysml.runtime.controlprogram.parfor.TaskPartitionerFactoring; import 
org.apache.sysml.runtime.controlprogram.parfor.TaskPartitionerFactoringCmax; import org.apache.sysml.runtime.controlprogram.parfor.TaskPartitionerFactoringCmin; import org.apache.sysml.runtime.controlprogram.parfor.TaskPartitionerFixedsize; import org.apache.sysml.runtime.controlprogram.parfor.TaskPartitionerNaive; import org.apache.sysml.runtime.controlprogram.parfor.TaskPartitionerStatic; import org.apache.sysml.runtime.controlprogram.parfor.mqo.RuntimePiggybacking; import org.apache.sysml.runtime.controlprogram.parfor.opt.CostEstimator; import org.apache.sysml.runtime.controlprogram.parfor.opt.CostEstimator.TestMeasure; import org.apache.sysml.runtime.controlprogram.parfor.opt.CostEstimatorHops; import org.apache.sysml.runtime.controlprogram.parfor.opt.OptTree; import org.apache.sysml.runtime.controlprogram.parfor.opt.OptTreeConverter; import org.apache.sysml.runtime.controlprogram.parfor.opt.OptimizationWrapper; import org.apache.sysml.runtime.controlprogram.parfor.opt.OptimizerRuleBased; import org.apache.sysml.runtime.controlprogram.parfor.opt.ProgramRecompiler; import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysml.runtime.controlprogram.parfor.stat.Stat; import org.apache.sysml.runtime.controlprogram.parfor.stat.StatisticMonitor; import org.apache.sysml.runtime.controlprogram.parfor.stat.Timing; import org.apache.sysml.runtime.controlprogram.parfor.util.IDHandler; import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence; import org.apache.sysml.runtime.instructions.cp.BooleanObject; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.cp.DoubleObject; import org.apache.sysml.runtime.instructions.cp.IntObject; import org.apache.sysml.runtime.instructions.cp.StringObject; import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import 
org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.io.IOUtilFunctions; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.OutputInfo; import org.apache.sysml.runtime.util.UtilFunctions; import org.apache.sysml.utils.Statistics; import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer; /** * The ParForProgramBlock has the same execution semantics as a ForProgamBlock but executes * the independent iterations in parallel. See ParForStatementBlock for the loop dependency * analysis. At runtime level, iterations are guaranteed to be completely independent. * * NEW FUNCTIONALITIES (not for BI 2.0 release) * TODO: reduction variables (operations: +=, -=, /=, *=, min, max) * TODO: papply(A,1:2,FUN) language construct (compiled to ParFOR) via DML function repository => modules OK, but second-order functions required * */ public class ParForProgramBlock extends ForProgramBlock { // execution modes public enum PExecMode { LOCAL, //local (master) multi-core execution mode REMOTE_MR, //remote (MR cluster) execution mode REMOTE_MR_DP, //remote (MR cluster) execution mode, fused with data partitioning REMOTE_SPARK, //remote (Spark cluster) execution mode REMOTE_SPARK_DP,//remote (Spark cluster) execution mode, fused with data partitioning UNSPECIFIED } // task partitioner public enum PTaskPartitioner { FIXED, //fixed-sized task partitioner, uses tasksize NAIVE, //naive task partitioner (tasksize=1) STATIC, //static task partitioner (numIterations/numThreads) FACTORING, //factoring task partitioner FACTORING_CMIN, //constrained factoring task partitioner, uses tasksize as min constraint FACTORING_CMAX, //constrained factoring task partitioner, uses tasksize as max constraint UNSPECIFIED } public enum PDataPartitionFormat { NONE, ROW_WISE, ROW_BLOCK_WISE, ROW_BLOCK_WISE_N, COLUMN_WISE, COLUMN_BLOCK_WISE, COLUMN_BLOCK_WISE_N, BLOCK_WISE_M_N; /** * Note: Robust version of valueOf in 
order to return NONE without exception * if misspelled or non-existing and for case-insensitivity. * * @param s data partition format as string * @return data partition format */ public static PDataPartitionFormat parsePDataPartitionFormat(String s) { if (s.equalsIgnoreCase("ROW_WISE")) return ROW_WISE; else if (s.equalsIgnoreCase("ROW_BLOCK_WISE")) return ROW_BLOCK_WISE; else if (s.equalsIgnoreCase("ROW_BLOCK_WISE_N")) return ROW_BLOCK_WISE_N; else if (s.equalsIgnoreCase("COLUMN_WISE")) return COLUMN_WISE; else if (s.equalsIgnoreCase("COLUMN_BLOCK_WISE")) return COLUMN_BLOCK_WISE; else if (s.equalsIgnoreCase("COLUMN_BLOCK_WISE_N")) return COLUMN_BLOCK_WISE_N; else if (s.equalsIgnoreCase("BLOCK_WISE_M_N")) return BLOCK_WISE_M_N; else return NONE; } } /** * Convenience class to package PDataPartitionFormat and its parameters. */ public static class PartitionFormat implements Serializable { private static final long serialVersionUID = 4729309847778707801L; public static final PartitionFormat NONE = new PartitionFormat(PDataPartitionFormat.NONE, -1); public static final PartitionFormat ROW_WISE = new PartitionFormat(PDataPartitionFormat.ROW_WISE, -1); public static final PartitionFormat COLUMN_WISE = new PartitionFormat(PDataPartitionFormat.COLUMN_WISE, -1); public final PDataPartitionFormat _dpf; public final int _N; public PartitionFormat(PDataPartitionFormat dpf, int N) { _dpf = dpf; _N = N; } @Override public boolean equals(Object o) { return (o instanceof PartitionFormat) && _dpf == ((PartitionFormat)o)._dpf && _N == ((PartitionFormat)o)._N; } @Override public String toString() { return _dpf.name()+","+_N; } public static PartitionFormat valueOf(String value) { String[] parts = value.split(","); return new PartitionFormat(PDataPartitionFormat .parsePDataPartitionFormat(parts[0]), Integer.parseInt(parts[1])); } public boolean isBlockwise() { return _dpf == PDataPartitionFormat.COLUMN_BLOCK_WISE_N || _dpf == PDataPartitionFormat.ROW_BLOCK_WISE_N; } public long 
getNumParts(MatrixCharacteristics mc) { switch( _dpf ) { case ROW_WISE: return mc.getRows(); case ROW_BLOCK_WISE: return mc.getNumRowBlocks(); case ROW_BLOCK_WISE_N: return (long)Math.ceil((double)mc.getRows()/_N); case COLUMN_WISE: return mc.getCols(); case COLUMN_BLOCK_WISE: return mc.getNumColBlocks(); case COLUMN_BLOCK_WISE_N: return (long)Math.ceil((double)mc.getCols()/_N); default: throw new RuntimeException("Unsupported partition format: "+_dpf); } } } public enum PDataPartitioner { NONE, // no data partitioning LOCAL, // local file based partition split on master node REMOTE_MR, // remote partition split using a reblock MR job REMOTE_SPARK, // remote partition split using a spark job UNSPECIFIED, } public enum PResultMerge { LOCAL_MEM, // in-core (in-memory) result merge (output and one input at a time) LOCAL_FILE, // out-of-core result merge (file format dependent) LOCAL_AUTOMATIC, // decides between MEM and FILE based on the size of the output matrix REMOTE_MR, // remote MR parallel result merge REMOTE_SPARK, // remote Spark parallel result merge UNSPECIFIED, } //optimizer public enum POptMode{ NONE, //no optimization, use defaults and specified parameters RULEBASED, //rule-based rewritings with memory constraints CONSTRAINED, //same as rule-based but with given params as constraints HEURISTIC, //smae as rule-based but with time-based cost estimates } // internal parameters public static final boolean OPTIMIZE = true; // run all automatic optimizations on top-level parfor public static final boolean USE_PB_CACHE = false; // reuse copied program blocks whenever possible, not there can be issues related to recompile public static boolean USE_RANGE_TASKS_IF_USEFUL = true; // use range tasks whenever size>3, false, otherwise wrong split order in remote public static final boolean USE_STREAMING_TASK_CREATION = true; // start working while still creating tasks, prevents blocking due to too small task queue public static final boolean ALLOW_NESTED_PARALLELISM = 
		true; // if not, transparently change parfor to for on program conversions (local,remote)
	public static boolean ALLOW_REUSE_MR_JVMS = true; // potential benefits: less setup costs per task, NOTE> cannot be used MR4490 in Hadoop 1.0.3, still not fixed in 1.1.1
	public static boolean ALLOW_REUSE_MR_PAR_WORKER = ALLOW_REUSE_MR_JVMS; //potential benefits: less initialization, reuse in-memory objects and result consolidation!
	public static final boolean USE_PARALLEL_RESULT_MERGE = false; // if result merge is run in parallel or serial
	public static final boolean USE_PARALLEL_RESULT_MERGE_REMOTE = true; // if remote result merge should be run in parallel for multiple result vars
	public static final boolean ALLOW_DATA_COLOCATION = true;
	public static final boolean CREATE_UNSCOPED_RESULTVARS = true;
	public static boolean ALLOW_REUSE_PARTITION_VARS = true; //reuse partition input matrices, applied only if read-only in surrounding loops
	public static final int WRITE_REPLICATION_FACTOR = 1;
	public static final int MAX_RETRYS_ON_ERROR = 1;
	public static final boolean FORCE_CP_ON_REMOTE_MR = true; // compile body to CP if exec type forced to MR
	public static final boolean LIVEVAR_AWARE_EXPORT = true; //export only read variables according to live variable analysis
	public static final boolean RESET_RECOMPILATION_FLAGs = true;

	//filename templates for intermediate parfor files (%ID%/%VAR% replaced at runtime)
	public static final String PARFOR_FNAME_PREFIX = "/parfor/";
	public static final String PARFOR_MR_TASKS_TMP_FNAME = PARFOR_FNAME_PREFIX + "%ID%_MR_taskfile";
	public static final String PARFOR_MR_RESULT_TMP_FNAME = PARFOR_FNAME_PREFIX + "%ID%_MR_results";
	public static final String PARFOR_MR_RESULTMERGE_FNAME = PARFOR_FNAME_PREFIX + "%ID%_resultmerge%VAR%";
	public static final String PARFOR_DATAPARTITIONS_FNAME = PARFOR_FNAME_PREFIX + "%ID%_datapartitions%VAR%";

	public static final String PARFOR_COUNTER_GROUP_NAME = "SystemML ParFOR Counters";

	// static ID generator sequences
	private static IDSequence _pfIDSeq = null; //parfor program block IDs
	private static IDSequence _pwIDSeq = null; //parworker IDs

	// runtime parameters
	protected HashMap<String,String> _params = null; //raw parfor parameters (kept in sync by setters)
	protected int _numThreads = -1;
	protected PTaskPartitioner _taskPartitioner = null;
	protected long _taskSize = -1;
	protected PDataPartitioner _dataPartitioner = null;
	protected PResultMerge _resultMerge = null;
	protected PExecMode _execMode = null;
	protected POptMode _optMode = null;
	protected boolean _monitor = false;
	protected Level _optLogLevel = null;

	//specifics used for optimization
	protected long _numIterations = -1;
	protected String[] _iterablePredicateVarsOriginal = null;

	//specifics used for data partitioning
	protected LocalVariableMap _variablesDPOriginal = null; //original (unpartitioned) variables
	protected LocalVariableMap _variablesDPReuse = null;    //partitioned variables kept for reuse
	protected String _colocatedDPMatrix = null;
	protected boolean _tSparseCol = false;
	protected int _replicationDP = WRITE_REPLICATION_FACTOR;
	protected int _replicationExport = -1;

	//specifics used for result partitioning
	protected boolean _jvmReuse = true;

	//specifics used for recompilation
	protected double _oldMemoryBudget = -1;
	protected double _recompileMemoryBudget = -1;

	//specifics for caching
	protected boolean _enableCPCaching = true;
	protected boolean _enableRuntimePiggybacking = false;

	//specifics for spark
	protected Collection<String> _variablesRP = null;
	protected Collection<String> _variablesECache = null;

	// program block meta data
	protected long _ID = -1;
	protected int _IDPrefix = -1;
	protected ArrayList<String> _resultVars = null;
	protected IDSequence _resultVarsIDSeq = null;
	protected IDSequence _dpVarsIDSeq = null;
	protected boolean _monitorReport = false;
	protected boolean _hasFunctions = true;

	// local parworker data
	protected long[] _pwIDs = null;
	protected HashMap<Long,ArrayList<ProgramBlock>> _pbcache = null;

	static {
		//init static ID sequence generators
		_pfIDSeq = new IDSequence();
		_pwIDSeq = new IDSequence();
	}

	/**
	 * Convenience constructor: delegates to the main constructor with ID -1.
	 */
	public ParForProgramBlock(Program prog, String[] iterPredVars, HashMap<String,String> params)
		throws DMLRuntimeException
	{
		this( -1,
			prog, iterPredVars, params);
	}

	/**
	 * ParForProgramBlock constructor. It reads the specified parameter settings, where defaults
	 * for non-specified parameters have been set in ParForStatementBlock.validate(). Furthermore,
	 * it generates the IDs for the ParWorkers.
	 * 
	 * @param ID parfor program block id (-1 for auto-assignment via setParForProgramBlockIDs)
	 * @param prog runtime program
	 * @param iterPredVars iterable predicate variables (index 0 holds the iteration variable
	 *        name, index 3 the increment entry; see usage in execute) -- TODO confirm full layout
	 * @param params map of parfor parameters, keyed by the ParForStatementBlock constants;
	 *        malformed values cause a RuntimeException
	 */
	public ParForProgramBlock(int ID, Program prog, String[] iterPredVars, HashMap<String,String> params)
	{
		super(prog, iterPredVars);

		//init internal flags according to DML config
		initInternalConfigurations(ConfigurationManager.getDMLConfig());

		//ID generation and setting
		setParForProgramBlockIDs( ID );
		_resultVarsIDSeq = new IDSequence();
		_dpVarsIDSeq = new IDSequence();

		//parse and use internal parameters (already set to default if not specified)
		_params = params;
		try {
			_numThreads = Integer.parseInt( getParForParam(ParForStatementBlock.PAR) );
			_taskPartitioner = PTaskPartitioner.valueOf( getParForParam(ParForStatementBlock.TASK_PARTITIONER) );
			//NOTE(review): _taskSize is a long but parsed via Integer.parseInt -- confirm task sizes always fit in int
			_taskSize = Integer.parseInt( getParForParam(ParForStatementBlock.TASK_SIZE) );
			_dataPartitioner = PDataPartitioner.valueOf( getParForParam(ParForStatementBlock.DATA_PARTITIONER) );
			_resultMerge = PResultMerge.valueOf( getParForParam(ParForStatementBlock.RESULT_MERGE) );
			_execMode = PExecMode.valueOf( getParForParam(ParForStatementBlock.EXEC_MODE) );
			_optMode = POptMode.valueOf( getParForParam(ParForStatementBlock.OPT_MODE) );
			_optLogLevel = Level.toLevel( getParForParam(ParForStatementBlock.OPT_LOG) );
			_monitor = (Integer.parseInt(getParForParam(ParForStatementBlock.PROFILE) ) == 1);
		}
		catch(Exception ex) {
			//any malformed parameter is a setup error, hence unchecked exception
			throw new RuntimeException("Error parsing specified ParFOR parameters.",ex);
		}

		//reset the internal opt mode if optimization globally disabled.
		if( !OPTIMIZE )
			_optMode = POptMode.NONE;

		_variablesDPOriginal = new LocalVariableMap();
		_variablesDPReuse = new LocalVariableMap();

		//create IDs for all parworkers
		if( _execMode == PExecMode.LOCAL /*&& _optMode==POptMode.NONE*/ )
			setLocalParWorkerIDs();

		//initialize program block cache if necessary
		if( USE_PB_CACHE )
			_pbcache = new HashMap<Long, ArrayList<ProgramBlock>>();

		//created profiling report after parfor exec
		_monitorReport = _monitor;

		//materialized meta data (reused for all invocations)
		_hasFunctions = ProgramRecompiler.containsAtLeastOneFunction(this);

		LOG.trace("PARFOR: ParForProgramBlock created with mode = "+_execMode+", optmode = "+_optMode+", numThreads = "+_numThreads);
	}

	public long getID() {
		return _ID;
	}

	public PExecMode getExecMode() {
		return _execMode;
	}

	public HashMap<String,String> getParForParams() {
		return _params;
	}

	/** Returns the unquoted, upper-cased value of the given parfor parameter (null if unset). */
	public String getParForParam(String key) {
		String tmp = getParForParams().get(key);
		return (tmp == null) ? null : UtilFunctions.unquote(tmp).toUpperCase();
	}

	public ArrayList<String> getResultVariables() {
		return _resultVars;
	}

	public void setResultVariables(ArrayList<String> resultVars) {
		_resultVars = resultVars;
	}

	public void disableOptimization() {
		_optMode = POptMode.NONE;
	}

	public POptMode getOptimizationMode() {
		return _optMode;
	}

	public int getDegreeOfParallelism() {
		return _numThreads;
	}

	public void setDegreeOfParallelism(int k) {
		_numThreads = k;
		_params.put(ParForStatementBlock.PAR, String.valueOf(_numThreads)); //kept up-to-date for copies
		setLocalParWorkerIDs();
	}

	public void setCPCaching(boolean flag) {
		_enableCPCaching = flag;
	}

	public void setRuntimePiggybacking(boolean flag) {
		_enableRuntimePiggybacking = flag;
	}

	public void setExecMode( PExecMode mode ) {
		_execMode = mode;
		_params.put(ParForStatementBlock.EXEC_MODE, String.valueOf(_execMode)); //kept up-to-date for copies
	}

	public void setTaskPartitioner( PTaskPartitioner partitioner ) {
		_taskPartitioner = partitioner;
		_params.put(ParForStatementBlock.TASK_PARTITIONER, String.valueOf(_taskPartitioner)); //kept up-to-date for copies
	}

	public void setTaskSize( long tasksize ) {
		_taskSize = tasksize;
		_params.put(ParForStatementBlock.TASK_SIZE, String.valueOf(_taskSize)); //kept up-to-date for copies
	}

	public void setDataPartitioner(PDataPartitioner partitioner) {
		_dataPartitioner = partitioner;
		_params.put(ParForStatementBlock.DATA_PARTITIONER, String.valueOf(_dataPartitioner)); //kept up-to-date for copies
	}

	public void enableColocatedPartitionedMatrix( String varname ) {
		//only called from optimizer
		_colocatedDPMatrix = varname;
	}

	public void setTransposeSparseColumnVector( boolean flag ) {
		_tSparseCol = flag;
	}

	public void setPartitionReplicationFactor( int rep ) {
		//only called from optimizer
		_replicationDP = rep;
	}

	public void setExportReplicationFactor( int rep ) {
		//only called from optimizer
		_replicationExport = rep;
	}

	public void disableJVMReuse() {
		//only called from optimizer
		_jvmReuse = false;
	}

	public void disableMonitorReport() {
		_monitorReport = false;
	}

	public void setResultMerge(PResultMerge merge) {
		_resultMerge = merge;
		_params.put(ParForStatementBlock.RESULT_MERGE, String.valueOf(_resultMerge)); //kept up-to-date for copies
	}

	public void setRecompileMemoryBudget( double localMem ) {
		_recompileMemoryBudget = localMem;
	}

	public void setSparkRepartitionVariables(Collection<String> vars) {
		_variablesRP = vars;
	}

	public Collection<String> getSparkRepartitionVariables() {
		return _variablesRP;
	}

	public void setSparkEagerCacheVariables(Collection<String> vars) {
		_variablesECache = vars;
	}

	public long getNumIterations() {
		return _numIterations;
	}

	public boolean hasFunctions() {
		return _hasFunctions;
	}

	/** Initializes the static MR JVM-reuse flags from the given DML configuration. */
	public static void initInternalConfigurations( DMLConfig conf ) {
		ALLOW_REUSE_MR_JVMS = conf.getBooleanValue(DMLConfig.JVM_REUSE);
		ALLOW_REUSE_MR_PAR_WORKER = ALLOW_REUSE_MR_JVMS;
	}

	/**
	 * Executes the parfor: evaluates the iteration predicate (from/to/incr) once,
	 * optionally optimizes the parfor body, performs data partitioning and spark
	 * repartitioning/eager caching, dispatches to the configured execution backend
	 * (local threads, remote MR, or remote Spark), and finally restores shared
	 * variables and optimizer modifications.
	 * 
	 * @param ec execution context
	 * @throws DMLRuntimeException on zero increment, undefined execution mode, or
	 *         any failure of the parallel execution
	 */
	@Override
	public void execute(ExecutionContext ec)
		throws DMLRuntimeException
	{
		ParForStatementBlock sb = (ParForStatementBlock)getStatementBlock();

		// add the iterable predicate variable to the variable set
		String iterVarName = _iterablePredicateVars[0];

		// evaluate from, to, incr only once (assumption: known at for entry)
		IntObject from = executePredicateInstructions( 1, _fromInstructions, ec );
		IntObject to = executePredicateInstructions( 2, _toInstructions, ec );
		//default increment: +1 for ascending, -1 for descending ranges
		IntObject incr = (_incrementInstructions == null || _incrementInstructions.isEmpty()) && _iterablePredicateVars[3]==null ?
			new IntObject((from.getLongValue()<=to.getLongValue()) ? 1 : -1) :
			executePredicateInstructions( 3, _incrementInstructions, ec );

		if ( incr.getLongValue() == 0 ) //would produce infinite loop
			throw new DMLRuntimeException(this.printBlockErrorLocation() + "Expression for increment of variable '" + iterVarName + "' must evaluate to a non-zero value.");

		//early exit on num iterations = zero
		if( computeNumIterations(from, to, incr) <= 0 )
			return; //avoid unnecessary optimization/initialization

		///////
		//OPTIMIZATION of ParFOR body (incl all child parfor PBs)
		///////
		if( _optMode != POptMode.NONE ) {
			updateIterablePredicateVars( iterVarName, from, to, incr );
			OptimizationWrapper.setLogLevel(_optLogLevel); //set optimizer log level
			OptimizationWrapper.optimize( _optMode, sb, this, ec, _monitor ); //core optimize

			//take changed iterable predicate into account
			iterVarName = _iterablePredicateVars[0];
			from = executePredicateInstructions( 1, _fromInstructions, ec );
			to = executePredicateInstructions( 2, _toInstructions, ec );
			incr = executePredicateInstructions( 3, _incrementInstructions, ec );
		}

		///////
		//DATA PARTITIONING of read-only parent variables of type (matrix,unpartitioned)
		///////
		Timing time = _monitor ? new Timing(true) : null;

		//partitioning on demand (note: for fused data partitioning and execute the optimizer set
		//the data partitioner to NONE in order to prevent any side effects)
		handleDataPartitioning( ec );

		//repartitioning of variables for spark cpmm/zipmm in order prevent unnecessary shuffle
		handleSparkRepartitioning( ec );

		//eager rdd caching of variables for spark in order prevent read/write contention
		handleSparkEagerCaching( ec );

		if( _monitor )
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_DATA_T, time.stop());

		// initialize iter var to form value
		IntObject iterVar = new IntObject(iterVarName, from.getLongValue() );

		///////
		//begin PARALLEL EXECUTION of (PAR)FOR body
		///////
		LOG.trace("EXECUTE PARFOR ID = "+_ID+" with mode = "+_execMode+", numThreads = "+_numThreads+", taskpartitioner = "+_taskPartitioner);

		if( _monitor ) {
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTHREADS, _numThreads);
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKSIZE, _taskSize);
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKPARTITIONER, _taskPartitioner.ordinal());
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_DATAPARTITIONER, _dataPartitioner.ordinal());
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_EXECMODE, _execMode.ordinal());
		}

		//preserve shared input/result variables of cleanup
		ArrayList<String> varList = ec.getVarList();
		HashMap<String, Boolean> varState = ec.pinVariables(varList);

		try
		{
			switch( _execMode )
			{
				case LOCAL: //create parworkers as local threads
					if (DMLScript.USE_ACCELERATOR) {
						//in GPU mode: hand the master GPU context back to the pool and
						//size the degree of parallelism by the number of devices
						GPUContextPool.returnToPool(ec.getGPUContext());
						ec.setGPUContext(null);
						setDegreeOfParallelism(GPUContextPool.getDeviceCount());
					}
					executeLocalParFor(ec, iterVar, from, to, incr);
					break;

				case REMOTE_MR: // create parworkers as MR tasks (one job per parfor)
					executeRemoteMRParFor(ec, iterVar, from, to, incr);
					break;

				case REMOTE_MR_DP: // create parworkers as MR tasks (one job per parfor)
					executeRemoteMRParForDP(ec, iterVar, from, to, incr);
					break;

				case REMOTE_SPARK: // create parworkers as Spark tasks (one job per parfor)
					executeRemoteSparkParFor(ec, iterVar, from, to, incr);
					break;

				case REMOTE_SPARK_DP: // create parworkers as Spark tasks (one job per parfor)
					executeRemoteSparkParForDP(ec, iterVar, from, to, incr);
					break;

				default:
					throw new DMLRuntimeException("Undefined execution mode: '"+_execMode+"'.");
			}
		}
		catch(Exception ex) {
			throw new DMLRuntimeException("PARFOR: Failed to execute loop in parallel.",ex);
		}

		//reset state of shared input/result variables
		ec.unpinVariables(varList, varState);

		//cleanup unpinned shared variables
		cleanupSharedVariables(ec, varState);

		//set iteration var to TO value (+ increment) for FOR equivalence
		iterVar = new IntObject( iterVarName, to.getLongValue() ); //consistent with for
		ec.setVariable(iterVarName, iterVar);

		//ensure that subsequent program blocks never see partitioned data (invalid plans!)
		//we can replace those variables, because partitioning only applied for read-only matrices
		for( String var : _variablesDPOriginal.keySet() )
		{
			//cleanup partitioned matrix (if not reused)
			if( !_variablesDPReuse.keySet().contains(var) )
				VariableCPInstruction.processRemoveVariableInstruction(ec, var);

			//reset to original matrix
			MatrixObject mo = (MatrixObject) _variablesDPOriginal.get( var );
			ec.setVariable(var, mo);
		}

		///////
		//end PARALLEL EXECUTION of (PAR)FOR body
		///////

		//print profiling report (only if top-level parfor because otherwise in parallel context)
		if( _monitorReport )
			LOG.info("\n"+StatisticMonitor.createReport());

		//reset flags/modifications made by optimizer
		//TODO reset of hop parallelism constraint (e.g., ba+*)
		for( String dpvar : _variablesDPOriginal.keySet() ) //release forced exectypes
			ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, dpvar, ec, false);
		//release forced exectypes for fused dp/exec
		if( _execMode == PExecMode.REMOTE_MR_DP || _execMode == PExecMode.REMOTE_SPARK_DP )
			ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, _colocatedDPMatrix, ec, false);
		resetIterablePredicateVars();
		resetOptimizerFlags(); //after release, deletes dp_varnames

		//execute exit instructions (usually empty)
		executeInstructions(_exitInstructions, ec);
	}

	/**
	 * Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
	 * This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
	 * below for details of the realization.
	 * 
	 * @param ec execution context
	 * @param itervar iteration variable object
	 * @param from loop lower bound (inclusive)
	 * @param to loop upper bound (inclusive)
	 * @param incr loop increment
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 * @throws InterruptedException if InterruptedException occurs (thread join)
	 */
	private void executeLocalParFor( ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr )
		throws DMLRuntimeException, InterruptedException
	{
		LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);

		/* Step 1) init parallel workers, task queue and threads
		 *         start threads (from now on waiting for tasks)
		 * Step 2) create tasks
		 *         put tasks into queue
		 *         mark end of task input stream
		 * Step 3) join all threads (wait for finished work)
		 * Step 4) collect results from each parallel worker
		 */

		Timing time = new Timing(true);

		int numExecutedTasks = 0;
		int numExecutedIterations = 0;

		//restrict recompilation to thread local memory
		setMemoryBudget();

		//enable runtime piggybacking if required
		if( _enableRuntimePiggybacking )
			RuntimePiggybacking.start( _numThreads ); //default piggybacking worker

		try
		{
			// Step 1) init parallel workers, task queue and threads
			LocalTaskQueue<Task> queue = new LocalTaskQueue<Task>();
			Thread[] threads = new Thread[_numThreads];
			LocalParWorker[] workers = new LocalParWorker[_numThreads];
			for( int i=0; i<_numThreads; i++ ) {
				//create parallel workers as (lazy) deep copies
				//including preparation of update-in-place variables
				workers[i] = createParallelWorker( _pwIDs[i], queue, ec );
				threads[i] = new Thread( workers[i] );
				threads[i].setPriority(Thread.MAX_PRIORITY);
			}

			// start threads (from now on waiting for tasks)
			for( Thread thread : threads )
				thread.start();

			//maintain statistics
			long tinit = (long) time.stop();
			if( DMLScript.STATISTICS )
				Statistics.incrementParForInitTime(tinit);
			if( _monitor )
				StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);

			// Step 2) create tasks
			TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
			long numIterations = partitioner.getNumIterations();
			long numCreatedTasks = -1;
			if( USE_STREAMING_TASK_CREATION ) {
				//put tasks into queue (parworker start work on first tasks while creating tasks)
				numCreatedTasks = partitioner.createTasks(queue);
			}
			else {
				List<Task> tasks = partitioner.createTasks();
				numCreatedTasks = tasks.size();

				// put tasks into queue
				for( Task t : tasks )
					queue.enqueueTask( t );

				// mark end of task input stream
				queue.closeInput();
			}
			if( _monitor )
				StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());

			// Step 3) join all threads (wait for finished work)
			for( Thread thread : threads )
				thread.join();

			if( _monitor )
				StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());

			// Step 4) collecting results from each parallel worker
			//obtain results
			LocalVariableMap [] localVariables = new LocalVariableMap [_numThreads];
			for( int i=0; i<_numThreads; i++ ) {
				localVariables[i] = workers[i].getVariables();
				numExecutedTasks += workers[i].getExecutedTasks();
				numExecutedIterations += workers[i].getExecutedIterations();
			}

			//consolidate results into global symbol table
			consolidateAndCheckResults( ec, numIterations, numCreatedTasks,
				numExecutedIterations, numExecutedTasks, localVariables );

			// Step 5) cleanup local parworkers (e.g., remove created functions)
			for( int i=0; i<_numThreads; i++ ) {
				Collection<String> fnNames = workers[i].getFunctionNames();
				if( fnNames!=null && !fnNames.isEmpty() )
					for( String fn : fnNames ) {
						String[] parts = DMLProgram.splitFunctionKey(fn);
						_prog.removeFunctionProgramBlock(parts[0], parts[1]);
					}
			}

			// Frees up the GPUContexts used in the threaded Parfor and sets
			// the main thread to use the GPUContext
			if (DMLScript.USE_ACCELERATOR) {
				for (int i = 0; i < _numThreads; i++) {
					GPUContext gCtx = workers[i].getExecutionContext().getGPUContext();
					GPUContextPool.returnToPool(gCtx);
				}
				ec.setGPUContext(GPUContextPool.getFromPool());
				ec.getGPUContext().initializeThread();
			}
		}
		finally
		{
			//remove thread-local memory budget (reset to original budget)
			//(in finally to prevent error side effects for multiple scripts in one jvm)
			resetMemoryBudget();

			//disable runtime piggybacking
			if( _enableRuntimePiggybacking )
				RuntimePiggybacking.stop();

			if( _monitor ) {
				StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
				StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
				StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
			}
		}
	}

	/**
	 * Executes the parfor remotely via a single MR job: the serialized parfor body
	 * and a task file are handed to the job, and each mapper processes parfor tasks.
	 * 
	 * @param ec execution context
	 * @param itervar iteration variable object
	 * @param from loop lower bound (inclusive)
	 * @param to loop upper bound (inclusive)
	 * @param incr loop increment
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 * @throws IOException if IOException occurs (task file I/O)
	 */
	private void executeRemoteMRParFor( ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr )
		throws DMLRuntimeException, IOException
	{
		/* Step 0) check and recompile MR inst
		 * Step 1) serialize child PB and inst
		 * Step 2) create tasks
		 *         serialize tasks
		 * Step 3) submit MR Jobs and wait for results
		 * Step 4) collect results from each parallel worker
		 */

		Timing time = ( _monitor ? new Timing(true) : null );

		// Step 0) check and compile to CP (if forced remote parfor)
		boolean flagForced = false;
		if( FORCE_CP_ON_REMOTE_MR && (_optMode == POptMode.NONE || (_optMode == POptMode.CONSTRAINED && _execMode==PExecMode.REMOTE_MR)) ) {
			//tid = 0 because replaced in remote parworker
			flagForced = checkMRAndRecompileToCP(0);
		}

		// Step 1) init parallel workers (serialize PBs)
		// NOTES: each mapper changes filenames with regard to his ID as we submit a single job,
		//        cannot reuse serialized string, since variables are serialized as well.
		ParForBody body = new ParForBody( _childBlocks, _resultVars, ec );
		String program = ProgramConverter.serializeParForBody( body );

		if( _monitor )
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());

		// Step 2) create tasks
		TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
		String taskFile = constructTaskFileName();
		String resultFile = constructResultFileName();

		long numIterations = partitioner.getNumIterations();
		//NOTE(review): assumes to >= 1; Math.log10 of a non-positive value yields NaN/-Infinity -- confirm upper bound is always positive here
		int maxDigits = (int)Math.log10(to.getLongValue()) + 1;
		long numCreatedTasks = -1;
		if( USE_STREAMING_TASK_CREATION ) {
			LocalTaskQueue<Task> queue = new LocalTaskQueue<Task>();

			//put tasks into queue and start writing to taskFile
			numCreatedTasks = partitioner.createTasks(queue);
			taskFile = writeTasksToFile( taskFile, queue, maxDigits );
		}
		else {
			//sequentially create tasks and write to disk
			List<Task> tasks = partitioner.createTasks();
			numCreatedTasks = tasks.size();
			taskFile = writeTasksToFile( taskFile, tasks, maxDigits );
		}

		if( _monitor )
			StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());

		//write matrices to HDFS
		exportMatricesToHDFS(ec);

		// Step 3) submit MR job (wait for finished work)
		MatrixObject colocatedDPMatrixObj = (_colocatedDPMatrix!=null)?
ec.getMatrixObject(_colocatedDPMatrix) : null; RemoteParForJobReturn ret = RemoteParForMR.runJob(_ID, program, taskFile, resultFile, colocatedDPMatrixObj, _enableCPCaching, _numThreads, WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR, getMinMemory(ec), (ALLOW_REUSE_MR_JVMS & _jvmReuse) ); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop()); // Step 4) collecting results from each parallel worker int numExecutedTasks = ret.getNumExecutedTasks(); int numExecutedIterations = ret.getNumExecutedIterations(); //consolidate results into global symbol table consolidateAndCheckResults( ec, numIterations, numCreatedTasks, numExecutedIterations , numExecutedTasks, ret.getVariables() ); if( flagForced ) //see step 0 releaseForcedRecompile(0); if( _monitor ) { StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop()); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations); } } private void executeRemoteMRParForDP( ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr ) throws DMLRuntimeException, IOException { /* Step 0) check and recompile MR inst * Step 1) serialize child PB and inst * Step 2) create tasks * serialize tasks * Step 3) submit MR Jobs and wait for results * Step 4) collect results from each parallel worker */ Timing time = ( _monitor ? 
new Timing(true) : null ); // Step 0) check and compile to CP (if forced remote parfor) boolean flagForced = checkMRAndRecompileToCP(0); // Step 1) prepare partitioned input matrix (needs to happen before serializing the program) ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock(); MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix ); PartitionFormat inputDPF = sb.determineDataPartitionFormat( _colocatedDPMatrix ); inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N); //mark matrix var as partitioned // Step 2) init parallel workers (serialize PBs) // NOTES: each mapper changes filenames with regard to his ID as we submit a single job, // cannot reuse serialized string, since variables are serialized as well. ParForBody body = new ParForBody( _childBlocks, _resultVars, ec ); String program = ProgramConverter.serializeParForBody( body ); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop()); // Step 3) create tasks TaskPartitioner partitioner = createTaskPartitioner(from, to, incr); String resultFile = constructResultFileName(); long numIterations = partitioner.getNumIterations(); long numCreatedTasks = numIterations;//partitioner.createTasks().size(); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop()); //write matrices to HDFS exportMatricesToHDFS(ec); // Step 4) submit MR job (wait for finished work) OutputInfo inputOI = ((inputMatrix.getSparsity()<0.1 && inputDPF==PartitionFormat.COLUMN_WISE)|| (inputMatrix.getSparsity()<0.001 && inputDPF==PartitionFormat.ROW_WISE))? 
OutputInfo.BinaryCellOutputInfo : OutputInfo.BinaryBlockOutputInfo; RemoteParForJobReturn ret = RemoteDPParForMR.runJob(_ID, itervar.getName(), _colocatedDPMatrix, program, resultFile, inputMatrix, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads, _replicationDP ); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop()); // Step 5) collecting results from each parallel worker int numExecutedTasks = ret.getNumExecutedTasks(); int numExecutedIterations = ret.getNumExecutedIterations(); //consolidate results into global symbol table consolidateAndCheckResults( ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables() ); if( flagForced ) //see step 0 releaseForcedRecompile(0); inputMatrix.unsetPartitioned(); if( _monitor ) { StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop()); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations); } } private void executeRemoteSparkParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException { Timing time = ( _monitor ? new Timing(true) : null ); // Step 0) check and compile to CP (if forced remote parfor) boolean flagForced = false; if( FORCE_CP_ON_REMOTE_MR && (_optMode == POptMode.NONE || (_optMode == POptMode.CONSTRAINED && _execMode==PExecMode.REMOTE_SPARK)) ) { //tid = 0 because replaced in remote parworker flagForced = checkMRAndRecompileToCP(0); } // Step 1) init parallel workers (serialize PBs) // NOTES: each mapper changes filenames with regard to his ID as we submit a single job, // cannot reuse serialized string, since variables are serialized as well. 
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec); HashMap<String, byte[]> clsMap = new HashMap<String, byte[]>(); String program = ProgramConverter.serializeParForBody(body, clsMap); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop()); // Step 2) create tasks TaskPartitioner partitioner = createTaskPartitioner(from, to, incr); long numIterations = partitioner.getNumIterations(); //sequentially create tasks as input to parfor job List<Task> tasks = partitioner.createTasks(); long numCreatedTasks = tasks.size(); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop()); //write matrices to HDFS exportMatricesToHDFS(ec); // Step 3) submit Spark parfor job (no lazy evaluation, since collect on result) //MatrixObject colocatedDPMatrixObj = (_colocatedDPMatrix!=null)? (MatrixObject)ec.getVariable(_colocatedDPMatrix) : null; RemoteParForJobReturn ret = RemoteParForSpark.runJob(_ID, program, clsMap, tasks, ec, _enableCPCaching, _numThreads); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop()); // Step 4) collecting results from each parallel worker int numExecutedTasks = ret.getNumExecutedTasks(); int numExecutedIterations = ret.getNumExecutedIterations(); //consolidate results into global symbol table consolidateAndCheckResults( ec, numIterations, numCreatedTasks, numExecutedIterations , numExecutedTasks, ret.getVariables() ); if( flagForced ) //see step 0 releaseForcedRecompile(0); if( _monitor ) { StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop()); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations); } } private void executeRemoteSparkParForDP( ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr ) throws DMLRuntimeException, IOException { Timing time = ( _monitor ? 
new Timing(true) : null ); // Step 0) check and compile to CP (if forced remote parfor) boolean flagForced = checkMRAndRecompileToCP(0); // Step 1) prepare partitioned input matrix (needs to happen before serializing the progam) ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock(); MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix ); PartitionFormat inputDPF = sb.determineDataPartitionFormat( _colocatedDPMatrix ); inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N); //mark matrix var as partitioned // Step 2) init parallel workers (serialize PBs) // NOTES: each mapper changes filenames with regard to his ID as we submit a single job, // cannot reuse serialized string, since variables are serialized as well. ParForBody body = new ParForBody( _childBlocks, _resultVars, ec ); HashMap<String, byte[]> clsMap = new HashMap<String, byte[]>(); String program = ProgramConverter.serializeParForBody( body, clsMap ); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop()); // Step 3) create tasks TaskPartitioner partitioner = createTaskPartitioner(from, to, incr); String resultFile = constructResultFileName(); long numIterations = partitioner.getNumIterations(); long numCreatedTasks = numIterations;//partitioner.createTasks().size(); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop()); //write matrices to HDFS, except DP matrix which is the input to the RemoteDPParForSpark job exportMatricesToHDFS(ec, _colocatedDPMatrix); // Step 4) submit MR job (wait for finished work) //TODO runtime support for binary cell partitioning //OutputInfo inputOI = ((inputMatrix.getSparsity()<0.1 && inputDPF==PDataPartitionFormat.COLUMN_WISE)|| // (inputMatrix.getSparsity()<0.001 && inputDPF==PDataPartitionFormat.ROW_WISE))? 
// OutputInfo.BinaryCellOutputInfo : OutputInfo.BinaryBlockOutputInfo; OutputInfo inputOI = OutputInfo.BinaryBlockOutputInfo; RemoteParForJobReturn ret = RemoteDPParForSpark.runJob(_ID, itervar.getName(), _colocatedDPMatrix, program, clsMap, resultFile, inputMatrix, ec, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads ); if( _monitor ) StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop()); // Step 5) collecting results from each parallel worker int numExecutedTasks = ret.getNumExecutedTasks(); int numExecutedIterations = ret.getNumExecutedIterations(); //consolidate results into global symbol table consolidateAndCheckResults( ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables() ); if( flagForced ) //see step 0 releaseForcedRecompile(0); inputMatrix.unsetPartitioned(); if( _monitor ) { StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop()); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks); StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations); } } private void handleDataPartitioning( ExecutionContext ec ) throws DMLRuntimeException { PDataPartitioner dataPartitioner = _dataPartitioner; if( dataPartitioner != PDataPartitioner.NONE ) { ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock(); if( sb == null ) throw new DMLRuntimeException("ParFor statement block required for reasoning about data partitioning."); ArrayList<String> vars = sb.getReadOnlyParentVars(); for( String var : vars ) { Data dat = ec.getVariable(var); //skip non-existing input matrices (which are due to unknown sizes marked for //partitioning but typically related branches are never executed) if( dat != null && dat instanceof MatrixObject ) { MatrixObject moVar = (MatrixObject) dat; //unpartitioned input PartitionFormat dpf = sb.determineDataPartitionFormat( var ); LOG.trace("PARFOR ID = "+_ID+", Partitioning read-only input variable "+var+" 
(format="+dpf+", mode="+_dataPartitioner+")"); if( dpf != PartitionFormat.NONE ) { if( dataPartitioner != PDataPartitioner.REMOTE_SPARK && dpf.isBlockwise() ) { LOG.warn("PARFOR ID = "+_ID+", Switching data partitioner from " + dataPartitioner + " to " + PDataPartitioner.REMOTE_SPARK.name()+" for blockwise-n partitioning."); dataPartitioner = PDataPartitioner.REMOTE_SPARK; } Timing ltime = new Timing(true); //input data partitioning (reuse if possible) Data dpdatNew = _variablesDPReuse.get(var); if( dpdatNew == null ) //no reuse opportunity { DataPartitioner dp = createDataPartitioner( dpf, dataPartitioner, ec ); //disable binary cell for sparse if consumed by MR jobs if( !OptimizerRuleBased.allowsBinaryCellPartitions(moVar, dpf ) || OptimizerUtils.isSparkExecutionMode() ) //TODO support for binarycell { dp.disableBinaryCell(); } MatrixObject moVarNew = dp.createPartitionedMatrixObject(moVar, constructDataPartitionsFileName()); dpdatNew = moVarNew; //skip remaining partitioning logic if not partitioned (e.g., too small) if( moVar == moVarNew ) continue; //skip to next } ec.setVariable(var, dpdatNew); //recompile parfor body program ProgramRecompiler.rFindAndRecompileIndexingHOP(sb,this,var,ec,true); //store original and partitioned matrix (for reuse if applicable) _variablesDPOriginal.put(var, moVar); if( ALLOW_REUSE_PARTITION_VARS && ProgramRecompiler.isApplicableForReuseVariable(sb.getDMLProg(), sb, var) ) { _variablesDPReuse.put(var, dpdatNew); } LOG.trace("Partitioning and recompilation done in "+ltime.stop()+"ms"); } } } } } private void handleSparkRepartitioning( ExecutionContext ec ) throws DMLRuntimeException { if( OptimizerUtils.isSparkExecutionMode() && _variablesRP != null && !_variablesRP.isEmpty() ) { SparkExecutionContext sec = (SparkExecutionContext) ec; for( String var : _variablesRP ) sec.repartitionAndCacheMatrixObject(var); } } private void handleSparkEagerCaching( ExecutionContext ec ) throws DMLRuntimeException { if( 
OptimizerUtils.isSparkExecutionMode() && _variablesECache != null && !_variablesECache.isEmpty() )
{
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	//trigger eager RDD caching for all marked matrix variables
	for( String var : _variablesECache )
		sec.cacheMatrixObject(var);
}
}

/**
 * Cleanup result variables of parallel workers after result merge.
 *
 * @param ec execution context
 * @param out output matrix
 * @param in array of input matrix objects
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private void cleanWorkerResultVariables(ExecutionContext ec, MatrixObject out, MatrixObject[] in)
	throws DMLRuntimeException
{
	for( MatrixObject tmp : in ) {
		//check for empty inputs (no iterations executed)
		if( tmp != null && tmp != out )
			ec.cleanupMatrixObject(tmp);
	}
}

/**
 * Create empty matrix objects and scalars for all unscoped vars
 * (created within the parfor).
 *
 * NOTE: parfor gives no guarantees on the values of those objects - hence
 * we return -1 for scalars and empty matrix objects.
 *
 * @param out local variable map
 * @param sb statement block
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private void createEmptyUnscopedVariables( LocalVariableMap out, StatementBlock sb )
	throws DMLRuntimeException
{
	VariableSet updated = sb.variablesUpdated();
	VariableSet livein = sb.liveIn();

	//for all vars IN <updated> AND NOT IN <livein>
	for( String var : updated.getVariableNames() )
		if( !livein.containsVariable(var) )
		{
			//create empty output
			DataIdentifier dat = updated.getVariable(var);
			DataType datatype = dat.getDataType();
			ValueType valuetype = dat.getValueType();
			Data dataObj = null;
			switch( datatype )
			{
				case SCALAR:
					//scalars get dummy values (-1 / "-1" / false) as placeholders
					switch( valuetype ) {
						case BOOLEAN: dataObj = new BooleanObject(var,false); break;
						case INT:     dataObj = new IntObject(var,-1); break;
						case DOUBLE:  dataObj = new DoubleObject(var,-1d); break;
						case STRING:  dataObj = new StringObject(var,"-1"); break;
						default:
							throw new DMLRuntimeException("Value type not supported: "+valuetype);
					}
					break;
				case MATRIX:
					//currently we do not create any unscoped matrix object outputs
					//because metadata (e.g., outputinfo) not known at this place.
					break;
				case UNKNOWN:
					break;
				default:
					throw new DMLRuntimeException("Data type not supported: "+datatype);
			}
			if( dataObj != null )
				out.put(var, dataObj);
		}
}

/**
 * Exports matrix variables of the symbol table via MatrixObject.exportData
 * (e.g., to HDFS), except variables on the given blacklist; with live-variable-aware
 * export enabled, only variables read in the parfor body are exported.
 *
 * @param ec execution context
 * @param blacklistNames variable names to exclude from export
 * @throws CacheException if the export fails
 */
private void exportMatricesToHDFS(ExecutionContext ec, String... blacklistNames)
	throws CacheException
{
	ParForStatementBlock sb = (ParForStatementBlock)getStatementBlock();
	HashSet<String> blacklist = new HashSet<String>(Arrays.asList(blacklistNames));

	if( LIVEVAR_AWARE_EXPORT && sb != null)
	{
		//optimization to prevent unnecessary export of matrices
		//export only variables that are read in the body
		VariableSet varsRead = sb.variablesRead();
		for (String key : ec.getVariables().keySet() ) {
			if( varsRead.containsVariable(key) && !blacklist.contains(key) ) {
				Data d = ec.getVariable(key);
				if( d.getDataType() == DataType.MATRIX )
					((MatrixObject)d).exportData(_replicationExport);
			}
		}
	}
	else
	{
		//export all matrices in symbol table
		for (String key : ec.getVariables().keySet() ) {
			if( !blacklist.contains(key) ) {
				Data d = ec.getVariable(key);
				if( d.getDataType() == DataType.MATRIX )
					((MatrixObject)d).exportData(_replicationExport);
			}
		}
	}
}

//NOTE(review): intentionally a no-op; see TODO below before enabling the commented logic.
private void cleanupSharedVariables( ExecutionContext ec, HashMap<String,Boolean> varState )
	throws DMLRuntimeException
{
	//TODO needs as precondition a systematic treatment of persistent read information.
/*
if( LIVEVAR_AWARE_CLEANUP && _sb != null)
{
	//cleanup shared variables after they are unpinned
	VariableSet liveout = _sb.liveOut();
	for( Entry<String, Boolean> var : varState.entrySet() )
	{
		String varname = var.getKey();
		boolean unpinned = var.getValue();
		String fprefix = ConfigurationManager.getConfig().getTextValue("scratch")
			+ Lop.FILE_SEPARATOR + Lop.PROCESS_PREFIX + DMLScript.getUUID();
		//delete unpinned vars if not in liveout (similar like rmvar) and not persistent input
		if( unpinned && !liveout.containsVariable(varname) )
		{
			VariableCPInstruction.processRemoveVariableInstruction(ec,varname);
		}
	}
}
*/
}

/**
 * Creates a new or partially recycled instance of a parallel worker. Therefore the symbol table, and child
 * program blocks are deep copied. Note that entries of the symbol table are not deep copied because they are replaced
 * anyway on the next write. In case of recycling the deep copies of program blocks are recycled from previous
 * executions of this parfor.
 *
 * @param pwID parworker id
 * @param queue task queue
 * @param ec execution context
 * @return local parworker
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec)
	throws DMLRuntimeException
{
	LocalParWorker pw = null;

	try
	{
		//create deep copies of required elements child blocks
		ArrayList<ProgramBlock> cpChildBlocks = null;
		HashSet<String> fnNames = new HashSet<String>();
		if( USE_PB_CACHE )
		{
			if( _pbcache.containsKey(pwID) ) {
				//reuse deep-copied program blocks from a previous execution of this parfor
				cpChildBlocks = _pbcache.get(pwID);
			}
			else {
				cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
				_pbcache.put(pwID, cpChildBlocks);
			}
		}
		else {
			cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
		}

		//deep copy execution context (including prepare parfor update-in-place)
		ExecutionContext cpEc
= ProgramConverter.createDeepCopyExecutionContext(ec);

// If GPU mode is enabled, gets a GPUContext from the pool of GPUContexts
// and sets it in the ExecutionContext
if (DMLScript.USE_ACCELERATOR){
	cpEc.setGPUContext(GPUContextPool.getFromPool());
}

//prepare basic update-in-place variables (vars dropped on result merge)
prepareUpdateInPlaceVariables(cpEc, pwID);

//copy compiler configuration (for jmlc w/o global config)
CompilerConfig cconf = ConfigurationManager.getCompilerConfig();

//create the actual parallel worker
ParForBody body = new ParForBody( cpChildBlocks, _resultVars, cpEc );
pw = new LocalParWorker( pwID, queue, body, cconf, MAX_RETRYS_ON_ERROR, _monitor );
pw.setFunctionNames(fnNames);
}
catch(Exception ex) {
	throw new DMLRuntimeException(ex);
}

return pw;
}

/**
 * Creates a new task partitioner according to the specified runtime parameter.
 *
 * @param from lower bound of the iteration range (inclusive)
 * @param to upper bound of the iteration range (inclusive)
 * @param incr increment of the iteration variable
 * @return task partitioner
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private TaskPartitioner createTaskPartitioner( IntObject from, IntObject to, IntObject incr )
	throws DMLRuntimeException
{
	TaskPartitioner tp = null;

	switch( _taskPartitioner )
	{
		case FIXED:
			tp = new TaskPartitionerFixedsize( _taskSize, _iterablePredicateVars[0], from, to, incr );
			break;
		case NAIVE:
			tp = new TaskPartitionerNaive( _taskSize, _iterablePredicateVars[0], from, to, incr );
			break;
		case STATIC:
			tp = new TaskPartitionerStatic( _taskSize, _numThreads, _iterablePredicateVars[0], from, to, incr );
			break;
		case FACTORING:
			tp = new TaskPartitionerFactoring( _taskSize,_numThreads, _iterablePredicateVars[0], from, to, incr );
			break;
		case FACTORING_CMIN:
			//for constrained factoring the tasksize is used as the minimum constraint
			tp = new TaskPartitionerFactoringCmin( _taskSize,_numThreads, _taskSize, _iterablePredicateVars[0], from, to, incr );
			break;
		case FACTORING_CMAX:
			//for constrained factoring the tasksize is used as the maximum constraint
			tp = new
TaskPartitionerFactoringCmax( _taskSize,_numThreads, _taskSize, _iterablePredicateVars[0], from, to, incr );
break;
default:
	throw new DMLRuntimeException("Undefined task partitioner: '"+_taskPartitioner+"'.");
}

return tp;
}

/**
 * Creates a new data partitioner according to the specified runtime parameter.
 *
 * @param dpf data partition format
 * @param dataPartitioner data partitioner
 * @param ec execution context
 * @return data partitioner
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private DataPartitioner createDataPartitioner(PartitionFormat dpf, PDataPartitioner dataPartitioner, ExecutionContext ec)
	throws DMLRuntimeException
{
	DataPartitioner dp = null;

	//determine max degree of parallelism
	int numReducers = ConfigurationManager.getNumReducers();
	int maxNumRed = InfrastructureAnalyzer.getRemoteParallelReduceTasks();
	//correction max number of reducers on yarn clusters
	if( InfrastructureAnalyzer.isYarnEnabled() )
		maxNumRed = (int)Math.max( maxNumRed, YarnClusterAnalyzer.getNumCores()/2 );
	int numRed = Math.min(numReducers,maxNumRed);

	//create data partitioner
	switch( dataPartitioner )
	{
		case LOCAL:
			dp = new DataPartitionerLocal(dpf, _numThreads);
			break;
		case REMOTE_MR:
			dp = new DataPartitionerRemoteMR( dpf, _ID, numRed, _replicationDP, ALLOW_REUSE_MR_JVMS, false );
			break;
		case REMOTE_SPARK:
			dp = new DataPartitionerRemoteSpark( dpf, ec, numRed, _replicationDP, false );
			break;
		default:
			throw new DMLRuntimeException("Unknown data partitioner: '" +dataPartitioner.name()+"'.");
	}

	return dp;
}

/**
 * Creates a result merge implementation for the given strategy; the degree of
 * parallelism for remote merges is derived from the cluster configuration.
 *
 * @param prm result merge strategy
 * @param out output matrix (merge target)
 * @param in per-worker result matrices
 * @param fname file name for intermediate merge output
 * @param ec execution context
 * @return result merge implementation
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private ResultMerge createResultMerge( PResultMerge prm, MatrixObject out, MatrixObject[] in, String fname, ExecutionContext ec )
	throws DMLRuntimeException
{
	ResultMerge rm = null;

	//determine degree of parallelism
	int maxMap = -1, maxRed = -1;
	if( OptimizerUtils.isSparkExecutionMode() )
	{
		maxMap = (int) SparkExecutionContext.getDefaultParallelism(true);
		maxRed = maxMap; //equal map/reduce
	}
	else
	{
		int numReducers = ConfigurationManager.getNumReducers();
maxMap = InfrastructureAnalyzer.getRemoteParallelMapTasks();
maxRed = Math.min(numReducers, InfrastructureAnalyzer.getRemoteParallelReduceTasks());
//correction max number of reducers on yarn clusters
if( InfrastructureAnalyzer.isYarnEnabled() ) {
	maxMap = (int)Math.max( maxMap, YarnClusterAnalyzer.getNumCores() );
	maxRed = (int)Math.max( maxRed, YarnClusterAnalyzer.getNumCores()/2 );
}
}
int numMap = Math.max(_numThreads, maxMap);
int numRed = maxRed;

//create result merge implementation
switch( prm )
{
	case LOCAL_MEM:
		rm = new ResultMergeLocalMemory( out, in, fname );
		break;
	case LOCAL_FILE:
		rm = new ResultMergeLocalFile( out, in, fname );
		break;
	case LOCAL_AUTOMATIC:
		rm = new ResultMergeLocalAutomatic( out, in, fname );
		break;
	case REMOTE_MR:
		rm = new ResultMergeRemoteMR( out, in, fname, _ID, numMap, numRed,
			WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR, ALLOW_REUSE_MR_JVMS );
		break;
	case REMOTE_SPARK:
		rm = new ResultMergeRemoteSpark( out, in, fname, ec, numMap, numRed );
		break;
	default:
		throw new DMLRuntimeException("Undefined result merge: '" +prm.toString()+"'.");
}

return rm;
}

/**
 * Recompile program block hierarchy to forced CP if MR instructions or functions.
 * Returns true if recompile was necessary and possible
 *
 * @param tid thread id
 * @return true if recompile was necessary and possible
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private boolean checkMRAndRecompileToCP(long tid)
	throws DMLRuntimeException
{
	//no MR instructions, ok
	if( !OptTreeConverter.rContainsMRJobInstruction(this, true) )
		return false;

	//no statement block, failed
	ParForStatementBlock sb = (ParForStatementBlock)getStatementBlock();
	if( sb == null ) {
		LOG.warn("Missing parfor statement block for recompile.");
		return false;
	}

	//try recompile MR instructions to CP
	HashSet<String> fnStack = new HashSet<String>();
	Recompiler.recompileProgramBlockHierarchy2Forced(_childBlocks, tid, fnStack, ExecType.CP);

	return true;
}

//reverts a previously forced CP recompilation back to unconstrained exec types
private void releaseForcedRecompile(long tid)
	throws DMLRuntimeException
{
	HashSet<String> fnStack = new HashSet<String>();
	Recompiler.recompileProgramBlockHierarchy2Forced(_childBlocks, tid, fnStack, null);
}

//writes the given list of tasks to a task file (one compact task per line)
private String writeTasksToFile(String fname, List<Task> tasks, int maxDigits)
	throws DMLRuntimeException, IOException
{
	BufferedWriter br = null;
	try
	{
		Path path = new Path(fname);
		FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
		br = new BufferedWriter(new OutputStreamWriter(fs.create(path,true)));

		boolean flagFirst = true; //workaround for keeping gen order
		for( Task t : tasks ) {
			br.write( createTaskFileLine( t, maxDigits, flagFirst ) );
			if( flagFirst )
				flagFirst = false;
		}
	}
	catch(Exception ex) {
		throw new DMLRuntimeException("Error writing tasks to taskfile "+fname, ex);
	}
	finally {
		IOUtilFunctions.closeSilently(br);
	}

	return fname;
}

//streaming variant: drains the task queue and writes each task to the task file
private String writeTasksToFile(String fname, LocalTaskQueue<Task> queue, int maxDigits)
	throws DMLRuntimeException, IOException
{
	BufferedWriter br = null;
	try
	{
		Path path = new Path( fname );
		FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
		br = new BufferedWriter(new OutputStreamWriter(fs.create(path,true)));

		Task t = null;
boolean flagFirst = true; //workaround for keeping gen order
while( (t = queue.dequeueTask()) != LocalTaskQueue.NO_MORE_TASKS ) {
	br.write( createTaskFileLine( t, maxDigits, flagFirst ) );
	if( flagFirst )
		flagFirst = false;
}
}
catch(Exception ex) {
	throw new DMLRuntimeException("Error writing tasks to taskfile "+fname, ex);
}
finally {
	IOUtilFunctions.closeSilently(br);
}

return fname;
}

//serializes a single task into one line of the task file
private String createTaskFileLine( Task t, int maxDigits, boolean flagFirst )
{
	//always pad to max digits in order to preserve task order
	String ret = t.toCompactString(maxDigits) + (flagFirst?" ":"") + "\n";
	return ret;
}

/**
 * Consolidates the results of all parallel workers into the global symbol table
 * (via parallel or sequential result merge) and validates the executed
 * task/iteration counters against the expected counts.
 *
 * @param ec execution context
 * @param expIters expected number of iterations
 * @param expTasks expected number of tasks
 * @param numIters number of actually executed iterations
 * @param numTasks number of actually executed tasks
 * @param results symbol tables of all parallel workers
 * @throws DMLRuntimeException if the merge fails or counters are inconsistent
 */
private void consolidateAndCheckResults(ExecutionContext ec, long expIters, long expTasks, long numIters, long numTasks, LocalVariableMap [] results)
	throws DMLRuntimeException
{
	Timing time = new Timing(true);

	//result merge
	if( checkParallelRemoteResultMerge() )
	{
		//execute result merge in parallel for all result vars
		int par = Math.min( _resultVars.size(), InfrastructureAnalyzer.getLocalParallelism() );
		if( InfrastructureAnalyzer.isLocalMode() ) {
			int parmem = (int)Math.floor(OptimizerUtils.getLocalMemBudget() / InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer());
			par = Math.min(par, Math.max(parmem, 1)); //reduce k if necessary
		}

		try
		{
			//enqueue all result vars as tasks
			LocalTaskQueue<String> q = new LocalTaskQueue<String>();
			for( String var : _resultVars ) //foreach non-local write
				if( ec.getVariable(var) instanceof MatrixObject ) //robustness scalars
					q.enqueueTask(var);
			q.closeInput();

			//run result merge workers
			ResultMergeWorker[] rmWorkers = new ResultMergeWorker[par];
			for( int i=0; i<par; i++ )
				rmWorkers[i] = new ResultMergeWorker(q, results, ec);
			for( int i=0; i<par; i++ ) //start all
				rmWorkers[i].start();
			for( int i=0; i<par; i++ ) { //wait for all
				rmWorkers[i].join();
				if( !rmWorkers[i].finishedNoError() )
					throw new DMLRuntimeException("Error occured in parallel result merge worker.");
			}
		}
		catch(Exception ex) {
			throw new DMLRuntimeException(ex);
		}
	}
	else
	{
		//execute result merge sequentially for all result vars
		for( String var : _resultVars ) //foreach non-local write
		{
			Data dat = ec.getVariable(var);
			if( dat instanceof MatrixObject ) //robustness scalars
			{
				MatrixObject out = (MatrixObject) dat;
				MatrixObject[] in = new MatrixObject[ results.length ];
				for( int i=0; i< results.length; i++ )
					in[i] = (MatrixObject) results[i].get( var );
				String fname = constructResultMergeFileName();

				ResultMerge rm = createResultMerge(_resultMerge, out, in, fname, ec);
				MatrixObject outNew = null;
				if( USE_PARALLEL_RESULT_MERGE )
					outNew = rm.executeParallelMerge( _numThreads );
				else
					outNew = rm.executeSerialMerge();

				//cleanup existing var
				Data exdata = ec.removeVariable(var);
				if( exdata != null && exdata != outNew && exdata instanceof MatrixObject )
					ec.cleanupMatrixObject((MatrixObject)exdata);

				//cleanup of intermediate result variables
				cleanWorkerResultVariables( ec, out, in );

				//set merged result variable
				ec.setVariable(var, outNew);
			}
		}
	}

	//handle unscoped variables (vars created in parfor, but potentially used afterwards)
	ParForStatementBlock sb = (ParForStatementBlock)getStatementBlock();
	if( CREATE_UNSCOPED_RESULTVARS && sb != null && ec.getVariables() != null ) //sb might be null for nested parallelism
		createEmptyUnscopedVariables( ec.getVariables(), sb );

	//check expected counters
	if( numTasks != expTasks || numIters !=expIters ) //consistency check
		throw new DMLRuntimeException("PARFOR: Number of executed tasks does not match the number of created tasks: tasks "+numTasks+"/"+expTasks+", iters "+numIters+"/"+expIters+".");

	if( DMLScript.STATISTICS )
		Statistics.incrementParForMergeTime((long) time.stop());
}

/**
 * NOTE: Currently we use a fixed rule (multiple results AND REMOTE_MR -> only selected by the optimizer
 * if mode was REMOTE_MR as well).
 *
 * TODO The optimizer should explicitly decide about parallel result merge and its degree of parallelism.
 *
 * @return true if parallel remote result merge should be applied
*/
private boolean checkParallelRemoteResultMerge()
{
	//parallel merge only pays off for multiple result vars merged remotely
	return (USE_PARALLEL_RESULT_MERGE_REMOTE && _resultVars.size() > 1
		&& ( _resultMerge == PResultMerge.REMOTE_MR
		   ||_resultMerge == PResultMerge.REMOTE_SPARK) );
}

//assigns a unique parfor ID, incorporating the given prefix for nested remote parfor
private void setParForProgramBlockIDs(int IDPrefix)
{
	_IDPrefix = IDPrefix;
	if( _IDPrefix == -1 ) //not specified
		_ID = _pfIDSeq.getNextID(); //generated new ID
	else //remote case (further nested parfors are all in one JVM)
		_ID = IDHandler.concatIntIDsToLong(_IDPrefix, (int)_pfIDSeq.getNextID());
}

/**
 * Assigns unique IDs to all local parallel workers.
 *
 * TODO rework id handling in order to enable worker reuse
 */
private void setLocalParWorkerIDs()
{
	if( _numThreads<=0 )
		return;

	//set all parworker IDs required if PExecMode.LOCAL is used
	_pwIDs = new long[ _numThreads ];
	for( int i=0; i<_numThreads; i++ )
	{
		if(_IDPrefix == -1)
			_pwIDs[i] = _pwIDSeq.getNextID();
		else
			_pwIDs[i] = IDHandler.concatIntIDsToLong(_IDPrefix,(int)_pwIDSeq.getNextID());

		if( _monitor )
			StatisticMonitor.putPfPwMapping(_ID, _pwIDs[i]);
	}
}

//computes the number of loop iterations: ceil((to - from + 1) / incr)
private long computeNumIterations( IntObject from, IntObject to, IntObject incr )
{
	return (long)Math.ceil(((double)(to.getLongValue() - from.getLongValue() + 1)) / incr.getLongValue());
}

//replaces the iterable predicate with concrete values (original kept for reset)
private void updateIterablePredicateVars(String iterVarName, IntObject from, IntObject to, IntObject incr)
{
	_numIterations = computeNumIterations(from, to, incr);

	//keep original iterable predicate
	_iterablePredicateVarsOriginal = new String[4];
	System.arraycopy(_iterablePredicateVars, 0, _iterablePredicateVarsOriginal, 0, 4);

	_iterablePredicateVars[0] = iterVarName;
	_iterablePredicateVars[1] = from.getStringValue();
	_iterablePredicateVars[2] = to.getStringValue();
	_iterablePredicateVars[3] = incr.getStringValue();
}

private void resetIterablePredicateVars()
{
	//reset of modified for optimization (opt!=NONE)
	if( _iterablePredicateVarsOriginal!=null )
		System.arraycopy(_iterablePredicateVarsOriginal, 0, _iterablePredicateVars, 0, 4);
}

/**
 * NOTE: Only required for remote parfor.
Hence, there is no need to transfer DMLConfig to * the remote workers (MR job) since nested remote parfor is not supported. * * @return task file name */ private String constructTaskFileName() { String scratchSpaceLoc = ConfigurationManager.getScratchSpace(); StringBuilder sb = new StringBuilder(); sb.append(scratchSpaceLoc); sb.append(Lop.FILE_SEPARATOR); sb.append(Lop.PROCESS_PREFIX); sb.append(DMLScript.getUUID()); sb.append(PARFOR_MR_TASKS_TMP_FNAME.replaceAll("%ID%", String.valueOf(_ID))); return sb.toString(); } /** * NOTE: Only required for remote parfor. Hence, there is no need to transfer DMLConfig to * the remote workers (MR job) since nested remote parfor is not supported. * * @return result file name */ private String constructResultFileName() { String scratchSpaceLoc = ConfigurationManager.getScratchSpace(); StringBuilder sb = new StringBuilder(); sb.append(scratchSpaceLoc); sb.append(Lop.FILE_SEPARATOR); sb.append(Lop.PROCESS_PREFIX); sb.append(DMLScript.getUUID()); sb.append(PARFOR_MR_RESULT_TMP_FNAME.replaceAll("%ID%", String.valueOf(_ID))); return sb.toString(); } private String constructResultMergeFileName() { String scratchSpaceLoc = ConfigurationManager.getScratchSpace(); String fname = PARFOR_MR_RESULTMERGE_FNAME; fname = fname.replaceAll("%ID%", String.valueOf(_ID)); //replace workerID fname = fname.replaceAll("%VAR%", String.valueOf(_resultVarsIDSeq.getNextID())); StringBuilder sb = new StringBuilder(); sb.append(scratchSpaceLoc); sb.append(Lop.FILE_SEPARATOR); sb.append(Lop.PROCESS_PREFIX); sb.append(DMLScript.getUUID()); sb.append(fname); return sb.toString(); } private String constructDataPartitionsFileName() { String scratchSpaceLoc = ConfigurationManager.getScratchSpace(); String fname = PARFOR_DATAPARTITIONS_FNAME; fname = fname.replaceAll("%ID%", String.valueOf(_ID)); //replace workerID fname = fname.replaceAll("%VAR%", String.valueOf(_dpVarsIDSeq.getNextID())); StringBuilder sb = new StringBuilder(); sb.append(scratchSpaceLoc); 
sb.append(Lop.FILE_SEPARATOR); sb.append(Lop.PROCESS_PREFIX); sb.append(DMLScript.getUUID()); sb.append(fname); return sb.toString(); } private long getMinMemory(ExecutionContext ec) { long ret = -1; //if forced remote exec and single node if( DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE && _execMode == PExecMode.REMOTE_MR && _optMode == POptMode.NONE ) { try { ParForStatementBlock sb = (ParForStatementBlock)getStatementBlock(); OptTree tree = OptTreeConverter.createAbstractOptTree(-1, -1, sb, this, new HashSet<String>(), ec); CostEstimator est = new CostEstimatorHops( OptTreeConverter.getAbstractPlanMapping() ); double mem = est.getEstimate(TestMeasure.MEMORY_USAGE, tree.getRoot()); ret = (long) (mem * ( 1d/OptimizerUtils.MEM_UTIL_FACTOR )); } catch(Exception e) { LOG.error("Failed to analyze minmum memory requirements.", e); } } return ret; } private void setMemoryBudget() { if( _recompileMemoryBudget > 0 ) { // store old budget for reset after exec _oldMemoryBudget = (double)InfrastructureAnalyzer.getLocalMaxMemory(); // scale budget with applied mem util factor (inverted during getMemBudget() ) long newMaxMem = (long) (_recompileMemoryBudget / OptimizerUtils.MEM_UTIL_FACTOR); InfrastructureAnalyzer.setLocalMaxMemory( newMaxMem ); } } private void resetMemoryBudget() { if( _recompileMemoryBudget > 0 ) { InfrastructureAnalyzer.setLocalMaxMemory((long)_oldMemoryBudget); } } private void resetOptimizerFlags() { //reset all state that was set but is not guaranteed to be overwritten by optimizer _variablesDPOriginal.removeAll(); _iterablePredicateVarsOriginal = null; _colocatedDPMatrix = null; _replicationDP = WRITE_REPLICATION_FACTOR; _replicationExport = -1; _jvmReuse = true; _recompileMemoryBudget = -1; _enableRuntimePiggybacking = false; _variablesRP = null; _variablesECache = null; } /** * Helper class for parallel invocation of REMOTE_MR result merge for multiple variables. 
*/
	private class ResultMergeWorker extends Thread
	{
		//NOTE(review): non-static inner class by design - workers call outer helpers
		//(constructResultMergeFileName, createResultMerge, cleanWorkerResultVariables)
		//and read outer state (_resultMerge, _numThreads).
		private LocalTaskQueue<String> _q = null;     //shared queue of result variable names to merge
		private LocalVariableMap[] _refVars = null;   //per-worker result symbol tables (merge inputs)
		private ExecutionContext _ec = null;          //execution context holding the merge outputs
		private boolean _success = false;             //true only if all dequeued variables merged without error

		public ResultMergeWorker( LocalTaskQueue<String> q, LocalVariableMap[] results, ExecutionContext ec )
		{
			_q = q;
			_refVars = results;
			_ec = ec;
		}

		//indicates whether run() completed all merge tasks without an exception
		public boolean finishedNoError() {
			return _success;
		}

		@Override
		public void run()
		{
			try
			{
				while( true )
				{
					String varname = _q.dequeueTask();
					if( varname == LocalTaskQueue.NO_MORE_TASKS ) // task queue closed (no more tasks)
						break; //intentional reference comparison against the sentinel object

					//read current output var under lock (variable map shared across merge workers)
					MatrixObject out = null;
					synchronized( _ec.getVariables() ){
						out = _ec.getMatrixObject(varname);
					}

					//collect the per-worker versions of this result variable as merge inputs
					MatrixObject[] in = new MatrixObject[ _refVars.length ];
					for( int i=0; i< _refVars.length; i++ )
						in[i] = (MatrixObject) _refVars[i].get( varname );
					String fname = constructResultMergeFileName();

					ResultMerge rm = createResultMerge(_resultMerge, out, in, fname, _ec);
					MatrixObject outNew = null;
					if( USE_PARALLEL_RESULT_MERGE )
						outNew = rm.executeParallelMerge( _numThreads );
					else
						outNew = rm.executeSerialMerge();

					//publish merged result under lock
					synchronized( _ec.getVariables() ){
						_ec.getVariables().put( varname, outNew);
					}

					//cleanup of intermediate result variables
					cleanWorkerResultVariables( _ec, out, in );
				}

				//only reached if every task merged cleanly
				_success = true;
			}
			catch(Exception ex)
			{
				//error is logged and surfaced to the caller via finishedNoError()
				LOG.error("Error executing result merge: ", ex);
			}
		}
	}

	public String printBlockErrorLocation(){
		return "ERROR: Runtime error in parfor program block generated from parfor statement block between lines " + _beginLine + " and " + _endLine + " -- ";
	}
}