/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.hops;

import java.util.HashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.conf.CompilerConfig;
import org.apache.sysml.conf.CompilerConfig.ConfigType;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.Hop.DataOpTypes;
import org.apache.sysml.hops.Hop.FileFormatTypes;
import org.apache.sysml.hops.Hop.OpOp2;
import org.apache.sysml.hops.rewrite.HopRewriteUtils;
import org.apache.sysml.lops.Checkpoint;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.lops.compile.Dag;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.functionobjects.IntegerDivide;
import org.apache.sysml.runtime.functionobjects.Modulus;
import org.apache.sysml.runtime.instructions.cp.Data;
import org.apache.sysml.runtime.instructions.cp.ScalarObject;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.util.IndexRange;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;

public class OptimizerUtils
{
	private static final Log LOG = LogFactory.getLog(OptimizerUtils.class.getName());

	////////////////////////////////////////////////////////
	// Optimizer constants and flags (incl tuning knobs)  //
	////////////////////////////////////////////////////////

	/**
	 * Utilization factor used in deciding whether an operation should be scheduled on CP or MR.
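	 * For example, with a 10GB max JVM heap, the effective CP memory budget
	 * is 0.7 * 10GB = 7GB (see getLocalMemBudget()).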
	 * NOTE: it is important that MEM_UTIL_FACTOR+CacheableData.CACHING_BUFFER_SIZE < 1.0
	 */
	public static double MEM_UTIL_FACTOR = 0.7d;

	/** Default blocksize if unspecified or for testing purposes */
	public static final int DEFAULT_BLOCKSIZE = 1000;

	/** Default frame blocksize */
	public static final int DEFAULT_FRAME_BLOCKSIZE = 1000;

	/** Default optimization level if unspecified */
	public static final OptimizationLevel DEFAULT_OPTLEVEL =
			OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT;

	/**
	 * Default memory size, which is used if the actual estimate cannot be computed,
	 * e.g., when input/output dimensions are unknown. The default is set to a large
	 * value so that operations are scheduled on MR while avoiding overflows as well.
	 */
	public static double DEFAULT_SIZE;

	public static final long DOUBLE_SIZE = 8;
	public static final long INT_SIZE = 4;
	public static final long CHAR_SIZE = 1;
	public static final long BOOLEAN_SIZE = 1;
	public static final double INVALID_SIZE = -1d; // memory estimate not computed

	//constants for valid CP matrix dimension sizes / nnz (dense/sparse)
	public static final long MAX_NUMCELLS_CP_DENSE = Integer.MAX_VALUE;
	public static final long MAX_NNZ_CP_SPARSE =
			(MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR) ?
			Long.MAX_VALUE : Integer.MAX_VALUE;

	/**
	 * Enables common subexpression elimination in dags. There is, however, a potential tradeoff
	 * between computation redundancy and data transfer between MR jobs. Since we do not reason
	 * about transferred data yet, this rewrite rule is enabled by default.
	 */
	public static boolean ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = true;

	/**
	 * Enables constant folding in dags. Constant folding computes simple expressions of binary
	 * operations and literals and replaces the hop sub-DAG with a new literal operator.
	 */
	public static boolean ALLOW_CONSTANT_FOLDING = true;

	public static boolean ALLOW_ALGEBRAIC_SIMPLIFICATION = true;
	public static boolean ALLOW_OPERATOR_FUSION = true;

	/**
	 * Enables if-else branch removal for constant predicates (original literals or
	 * results of constant folding).
	 */
	public static boolean ALLOW_BRANCH_REMOVAL = true;

	public static boolean ALLOW_AUTO_VECTORIZATION = true;

	/**
	 * Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
	 * expressions are defined as binary operations on literals and nrow/ncol. This applies
	 * only to exact size information.
	 */
	public static boolean ALLOW_SIZE_EXPRESSION_EVALUATION = true;

	/**
	 * Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
	 * expressions are defined as binary operations on literals and b(+) or b(*) on nrow/ncol.
	 * This applies also to worst-case size information.
	 */
	public static boolean ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = true;

	public static boolean ALLOW_RAND_JOB_RECOMPILE = true;

	/**
	 * Enables CP-side data transformation for small files.
	 */
	public static boolean ALLOW_TRANSFORM_RECOMPILE = true;

	/**
	 * Enables parfor runtime piggybacking of MR jobs into the packed jobs for
	 * scan sharing.
	 */
	public static boolean ALLOW_RUNTIME_PIGGYBACKING = true;

	/**
	 * Enables interprocedural analysis between the main script and functions as well as functions
	 * and other functions. This includes, for example, propagating statistics into functions
	 * if safe to do so (e.g., if called once).
	 */
	public static boolean ALLOW_INTER_PROCEDURAL_ANALYSIS = true;

	/**
	 * Enables sum-product rewrites such as mapmultchains. In the future, this will cover
	 * all sum-product related rewrites.
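	 * For example, a chain such as t(X) %*% (X %*% v) can be compiled into a
	 * single fused mapmultchain operator instead of two separate matrix multiplies.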
	 */
	public static boolean ALLOW_SUM_PRODUCT_REWRITES = true;

	/**
	 * Enables a specific hop dag rewrite that splits hop dags after csv persistent reads with
	 * unknown size in order to allow for recompilation.
	 */
	public static boolean ALLOW_SPLIT_HOP_DAGS = true;

	/**
	 * Enables a specific rewrite that enables update-in-place for loop variables that are
	 * only read/updated via cp leftindexing.
	 */
	public static boolean ALLOW_LOOP_UPDATE_IN_PLACE = true;

	/**
	 * Specifies a multiplier computing the degree of parallelism of parallel
	 * text read/write out of the available degree of parallelism. Set it to 1.0
	 * to get a number of threads equal to the number of virtual cores.
	 */
	public static final double PARALLEL_CP_READ_PARALLELISM_MULTIPLIER = 1.0;
	public static final double PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER = 1.0;

	/**
	 * Enables the use of CombineSequenceFileInputFormat with splitsize = 2x hdfs blocksize,
	 * if the sort buffer size is large enough and parallelism is not hurt. This solves two
	 * issues: (1) it combines small files (depending on producers), and (2) it reduces the
	 * task latency of large jobs with many tasks by a factor of 2.
	 */
	public static final boolean ALLOW_COMBINE_FILE_INPUT_FORMAT = true;

	/**
	 * Enables automatic csv-binary block reblock.
	 */
	public static boolean ALLOW_FRAME_CSV_REBLOCK = true;

	public static long GPU_MEMORY_BUDGET = -1;

	//////////////////////
	// Optimizer levels //
	//////////////////////

	/**
	 * Optimization Types for Compilation
	 *
	 * O0 STATIC - Decisions for scheduling operations on CP/MR are based on
	 * a predefined set of rules, which check if the dimensions are below a
	 * fixed/static threshold (OLD method of choosing between CP and MR).
	 * The optimization scope is LOCAL, i.e., per statement block.
	 * Advanced rewrites like constant folding, common subexpression elimination,
	 * or inter-procedural analysis are NOT applied.
	 *
	 * O1 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
	 * based on the amount of memory required to perform that operation.
	 * It does NOT take the execution time into account.
	 * The optimization scope is LOCAL, i.e., per statement block.
	 * Advanced rewrites like constant folding, common subexpression elimination,
	 * or inter-procedural analysis are NOT applied.
	 *
	 * O2 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
	 * based on the amount of memory required to perform that operation.
	 * It does NOT take the execution time into account.
	 * The optimization scope is LOCAL, i.e., per statement block.
	 * All advanced rewrites are applied. This is the default optimization
	 * level of SystemML.
	 *
	 * O3 RESOURCE TIME_MEMORY_BASED - Same as O2 but with additional
	 * resource optimization; operation scheduling is time- and memory-based.
	 *
	 * O4 GLOBAL TIME_MEMORY_BASED - Operation scheduling on CP or MR as well as
	 * many other rewrites of data flow properties such as block size, partitioning,
	 * replication, vectorization, etc. are done with the optimization objective of
	 * minimizing execution time under hard memory constraints per operation and
	 * execution context. The optimization scope is GLOBAL, i.e., program-wide.
	 * All advanced rewrites are applied. This optimization level requires more
	 * optimization time but has higher optimization potential.
	 *
	 * O5 DEBUG MODE - All optimizations, global and local, which interfere with
	 * breakpoints are NOT applied. This optimization level is REQUIRED for the
	 * compiler running in debug mode.
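	 *
	 * A minimal sketch of selecting a level via the SystemML configuration file,
	 * assuming the standard property name read through DMLConfig.OPTIMIZATION_LEVEL:
	 *
	 * <pre>
	 * &lt;!-- SystemML-config.xml: select the default memory-based level O2 --&gt;
	 * &lt;optlevel&gt;2&lt;/optlevel&gt;
	 * </pre>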
	 */
	public enum OptimizationLevel {
		O0_LOCAL_STATIC,
		O1_LOCAL_MEMORY_MIN,
		O2_LOCAL_MEMORY_DEFAULT,
		O3_LOCAL_RESOURCE_TIME_MEMORY,
		O4_GLOBAL_TIME_MEMORY,
		O5_DEBUG_MODE,
	};

	public static OptimizationLevel getOptLevel() {
		int optlevel = ConfigurationManager.getCompilerConfig().getInt(ConfigType.OPT_LEVEL);
		return OptimizationLevel.values()[optlevel];
	}

	public static boolean isMemoryBasedOptLevel() {
		return (getOptLevel() != OptimizationLevel.O0_LOCAL_STATIC);
	}

	public static boolean isOptLevel( OptimizationLevel level ) {
		return (getOptLevel() == level);
	}

	public static CompilerConfig constructCompilerConfig( DMLConfig dmlconf )
		throws DMLRuntimeException
	{
		//create default compiler configuration
		CompilerConfig cconf = new CompilerConfig();

		//each script sets its own block size, opt level etc
		cconf.set(ConfigType.BLOCK_SIZE, dmlconf.getIntValue( DMLConfig.DEFAULT_BLOCK_SIZE ));

		//handle optimization level
		int optlevel = dmlconf.getIntValue(DMLConfig.OPTIMIZATION_LEVEL);
		if( optlevel < 0 || optlevel > 7 )
			throw new DMLRuntimeException("Error: invalid optimization level '"+optlevel+"' (valid values: 0-5).");

		// This overrides any optimization level that is present in the configuration file.
		// Why? This simplifies the calling logic: the user does not have to maintain two
		// config files or, worse, edit the config file every time they want to call the debugger.
		if( DMLScript.ENABLE_DEBUG_MODE ) {
			optlevel = 5;
		}

		switch( optlevel )
		{
			// opt level 0: static dimensionality
			case 0:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O0_LOCAL_STATIC.ordinal());
				ALLOW_CONSTANT_FOLDING = false;
				ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
				ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
				ALLOW_AUTO_VECTORIZATION = false;
				ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
				ALLOW_BRANCH_REMOVAL = false;
				ALLOW_SUM_PRODUCT_REWRITES = false;
				break;
			// opt level 1: memory-based (no advanced rewrites)
			case 1:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O1_LOCAL_MEMORY_MIN.ordinal());
				ALLOW_CONSTANT_FOLDING = false;
				ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
				ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
				ALLOW_AUTO_VECTORIZATION = false;
				ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
				ALLOW_BRANCH_REMOVAL = false;
				ALLOW_SUM_PRODUCT_REWRITES = false;
				ALLOW_LOOP_UPDATE_IN_PLACE = false;
				break;
			// opt level 2: memory-based (all advanced rewrites)
			case 2:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
				break;
			// opt level 3: resource optimization, time- and memory-based (O2 w/ resource optimization)
			case 3:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O3_LOCAL_RESOURCE_TIME_MEMORY.ordinal());
				break;
			// opt level 4: global, time- and memory-based (all advanced rewrites)
			case 4:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O4_GLOBAL_TIME_MEMORY.ordinal());
				break;
			// opt level 5: debug mode (no interfering rewrites)
			case 5:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O5_DEBUG_MODE.ordinal());
				ALLOW_CONSTANT_FOLDING = false;
				ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
				ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
				ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
				ALLOW_BRANCH_REMOVAL = false;
				ALLOW_SIZE_EXPRESSION_EVALUATION = false;
				ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = false;
				ALLOW_RAND_JOB_RECOMPILE = false;
				ALLOW_SUM_PRODUCT_REWRITES = false;
				ALLOW_SPLIT_HOP_DAGS = false;
				cconf.set(ConfigType.ALLOW_DYN_RECOMPILATION, false);
				cconf.set(ConfigType.ALLOW_INDIVIDUAL_SB_SPECIFIC_OPS, false);
				break;
			// opt levels 6 and 7: SPOOF w/o fused operators, otherwise same as O2
			// (hidden optimization levels not
			// documented on purpose, as they will
			// be removed once SPOOF is production ready)
			case 6:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
				ALLOW_AUTO_VECTORIZATION = false;
				break;
			case 7:
				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
				ALLOW_OPERATOR_FUSION = false;
				ALLOW_AUTO_VECTORIZATION = false;
				ALLOW_SUM_PRODUCT_REWRITES = false;
				break;
		}

		//handle parallel text io (incl awareness of thread contention in <jdk8)
		if( !dmlconf.getBooleanValue(DMLConfig.CP_PARALLEL_TEXTIO) ) {
			cconf.set(ConfigType.PARALLEL_CP_READ_TEXTFORMATS, false);
			cconf.set(ConfigType.PARALLEL_CP_WRITE_TEXTFORMATS, false);
			cconf.set(ConfigType.PARALLEL_CP_READ_BINARYFORMATS, false);
			cconf.set(ConfigType.PARALLEL_CP_WRITE_BINARYFORMATS, false);
		}
		else if( InfrastructureAnalyzer.isJavaVersionLessThanJDK8()
			&& InfrastructureAnalyzer.getLocalParallelism() > 1 )
		{
			LOG.warn("Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE < 1.8"
				+ " (java.version=" + System.getProperty("java.version") + ").");
			cconf.set(ConfigType.PARALLEL_CP_READ_TEXTFORMATS, false);
		}

		//handle parallel matrix mult / rand configuration
		if( !dmlconf.getBooleanValue(DMLConfig.CP_PARALLEL_MATRIXMULT) ) {
			cconf.set(ConfigType.PARALLEL_CP_MATRIX_OPERATIONS, false);
		}

		return cconf;
	}

	public static long getDefaultSize() {
		//we need to set default_size larger than any execution context
		//memory budget; however, it should not produce overflows on sum
		return Math.max( InfrastructureAnalyzer.getLocalMaxMemory(),
				Math.max(InfrastructureAnalyzer.getRemoteMaxMemoryMap(),
						InfrastructureAnalyzer.getRemoteMaxMemoryReduce()));
	}

	public static void resetDefaultSize() {
		DEFAULT_SIZE = getDefaultSize();
	}

	public static int getDefaultFrameSize() {
		return DEFAULT_FRAME_BLOCKSIZE;
	}

	/**
	 * Returns the memory budget (according to the util factor) in bytes.
	 *
	 * @return local memory budget
	 */
	public static double getLocalMemBudget() {
		double ret = InfrastructureAnalyzer.getLocalMaxMemory();
		return ret * OptimizerUtils.MEM_UTIL_FACTOR;
	}

	public static double getRemoteMemBudgetMap() {
		return getRemoteMemBudgetMap(false);
	}

	public static double getRemoteMemBudgetMap(boolean subtractSortBuffer) {
		double ret = InfrastructureAnalyzer.getRemoteMaxMemoryMap();
		if( subtractSortBuffer )
			ret -= InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
		return ret * OptimizerUtils.MEM_UTIL_FACTOR;
	}

	public static double getRemoteMemBudgetReduce() {
		double ret = InfrastructureAnalyzer.getRemoteMaxMemoryReduce();
		return ret * OptimizerUtils.MEM_UTIL_FACTOR;
	}

	public static boolean checkSparkBroadcastMemoryBudget( double size ) {
		double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
		double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

		//basic requirement: the broadcast needs to fit once in the remote broadcast memory
		//and twice into the local memory budget because we have to create a partitioned broadcast
		//in memory and hand it over to the spark context as in-memory object
		return ( size < memBudgetExec && 2*size < memBudgetLocal );
	}

	public static boolean checkSparkBroadcastMemoryBudget( long rlen, long clen, long brlen, long bclen, long nnz ) {
		double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
		double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

		double sp = getSparsity(rlen, clen, nnz);
		double size = estimateSizeExactSparsity(rlen, clen, sp);
		double sizeP = estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);

		//basic requirement: the broadcast needs to fit once in the remote broadcast memory
		//and twice into the local memory budget because we have to create a partitioned broadcast
		//in memory and hand it over to the spark context as in-memory object
		return ( OptimizerUtils.isValidCPDimensions(rlen, clen)
			&& sizeP < memBudgetExec && size+sizeP < memBudgetLocal );
	}

	public static boolean checkSparkCollectMemoryBudget( MatrixCharacteristics mc, long memPinned ) {
		return checkSparkCollectMemoryBudget(
				mc.getRows(), mc.getCols(),
				mc.getRowsPerBlock(), mc.getColsPerBlock(),
				mc.getNonZeros(), memPinned);
	}

	public static boolean checkSparkCollectMemoryBudget( long rlen, long clen, int brlen, int bclen, long nnz, long memPinned ) {
		//compute size of output matrix and its blocked representation
		double sp = getSparsity(rlen, clen, nnz);
		double memMatrix = estimateSizeExactSparsity(rlen, clen, sp);
		double memPMatrix = estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);

		//check if both output matrix and partitioned matrix fit into local mem budget
		return (memPinned + memMatrix + memPMatrix < getLocalMemBudget());
	}

	public static boolean checkSparseBlockCSRConversion( MatrixCharacteristics mcIn ) {
		return Checkpoint.CHECKPOINT_SPARSE_CSR
			&& OptimizerUtils.getSparsity(mcIn) < MatrixBlock.SPARSITY_TURN_POINT;
	}

	/**
	 * Returns the number of reducers that potentially run in parallel.
	 * This is either just the configured value (SystemML config) or
	 * the minimum of the configured value and the available reduce slots.
	 *
	 * @param configOnly true if configured value
	 * @return number of reducers
	 */
	public static int getNumReducers( boolean configOnly ) {
		if( isSparkExecutionMode() )
			return SparkExecutionContext.getDefaultParallelism(false);

		int ret = ConfigurationManager.getNumReducers();
		if( !configOnly ) {
			ret = Math.min(ret, InfrastructureAnalyzer.getRemoteParallelReduceTasks());

			//correct the max number of reducers on yarn clusters
			if( InfrastructureAnalyzer.isYarnEnabled() )
				ret = (int)Math.max( ret, YarnClusterAnalyzer.getNumCores()/2 );
		}

		return ret;
	}

	public static int getNumMappers() {
		if( isSparkExecutionMode() )
			return SparkExecutionContext.getDefaultParallelism(false);

		int ret = InfrastructureAnalyzer.getRemoteParallelMapTasks();

		//correct the max number of mappers on yarn clusters
		if( InfrastructureAnalyzer.isYarnEnabled() )
			ret = (int)Math.max( ret, YarnClusterAnalyzer.getNumCores() );

		return ret;
	}

	public static RUNTIME_PLATFORM getDefaultExecutionMode() {
		//default execution type is hybrid (cp+mr)
		RUNTIME_PLATFORM ret = RUNTIME_PLATFORM.HYBRID;

		//switch default to hybrid_spark (cp+spark) if in spark driver
		String sparkenv = System.getenv().get("SPARK_ENV_LOADED");
		if( sparkenv != null && sparkenv.equals("1") )
			ret = RUNTIME_PLATFORM.HYBRID_SPARK;

		return ret;
	}

	public static boolean isSparkExecutionMode() {
		return (   DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK
				|| DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK);
	}

	public static boolean isHadoopExecutionMode() {
		return (   DMLScript.rtplatform == RUNTIME_PLATFORM.HADOOP
				|| DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID);
	}

	public static boolean isHybridExecutionMode() {
		return (   DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID
				|| DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK );
	}

	/**
	 * Returns the degree of parallelism used for parallel text read.
	 * This is computed as the number of virtual cores scaled by the
	 * PARALLEL_CP_READ_PARALLELISM_MULTIPLIER. If PARALLEL_CP_READ_TEXTFORMATS
	 * is disabled, this method returns 1.
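	 * For example, on a machine with 16 virtual cores and the default
	 * multiplier of 1.0, this method returns 16 read threads.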
	 *
	 * @return degree of parallelism
	 */
	public static int getParallelTextReadParallelism() {
		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS) )
			return 1; // sequential execution

		//compute degree of parallelism for parallel text read
		double dop = InfrastructureAnalyzer.getLocalParallelism()
				* PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
		return (int) Math.round(dop);
	}

	public static int getParallelBinaryReadParallelism() {
		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_BINARYFORMATS) )
			return 1; // sequential execution

		//compute degree of parallelism for parallel binary read
		double dop = InfrastructureAnalyzer.getLocalParallelism()
				* PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
		return (int) Math.round(dop);
	}

	/**
	 * Returns the degree of parallelism used for parallel text write.
	 * This is computed as the number of virtual cores scaled by the
	 * PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER. If PARALLEL_CP_WRITE_TEXTFORMATS
	 * is disabled, this method returns 1.
	 *
	 * @return degree of parallelism
	 */
	public static int getParallelTextWriteParallelism() {
		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_WRITE_TEXTFORMATS) )
			return 1; // sequential execution

		//compute degree of parallelism for parallel text write
		double dop = InfrastructureAnalyzer.getLocalParallelism()
				* PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
		return (int) Math.round(dop);
	}

	public static int getParallelBinaryWriteParallelism() {
		if( !ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_WRITE_BINARYFORMATS) )
			return 1; // sequential execution

		//compute degree of parallelism for parallel binary write
		double dop = InfrastructureAnalyzer.getLocalParallelism()
				* PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
		return (int) Math.round(dop);
	}

	////////////////////////
	//  Memory Estimates  //
	////////////////////////

	public static long estimateSize(MatrixCharacteristics mc) {
		return estimateSizeExactSparsity(mc);
	}

	public static long estimateSizeExactSparsity(MatrixCharacteristics mc) {
		return estimateSizeExactSparsity(
				mc.getRows(), mc.getCols(), mc.getNonZeros());
	}

	/**
	 * Estimates the footprint (in bytes) for an in-memory representation of a
	 * matrix with dimensions=(nrows,ncols) and number of non-zeros nnz.
	 *
	 * @param nrows number of rows
	 * @param ncols number of cols
	 * @param nnz number of non-zeros
	 * @return memory footprint
	 */
	public static long estimateSizeExactSparsity(long nrows, long ncols, long nnz) {
		double sp = getSparsity(nrows, ncols, nnz);
		return estimateSizeExactSparsity(nrows, ncols, sp);
	}

	/**
	 * Estimates the footprint (in bytes) for an in-memory representation of a
	 * matrix with dimensions=(nrows,ncols) and sparsity=sp.
	 *
	 * This function can be used directly in Hops when the actual sparsity is
	 * known, i.e., when <code>sp</code> is guaranteed to give a worst-case estimate
	 * (e.g., Rand with a fixed sparsity). In all other cases, estimateSize()
	 * must be used so that worst-case estimates are computed, whenever
	 * applicable.
	 *
	 * @param nrows number of rows
	 * @param ncols number of cols
	 * @param sp sparsity
	 * @return memory footprint
	 */
	public static long estimateSizeExactSparsity(long nrows, long ncols, double sp) {
		return MatrixBlock.estimateSizeInMemory(nrows, ncols, sp);
	}

	/**
	 * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
	 * matrix with the given matrix characteristics.
	 *
	 * @param mc matrix characteristics
	 * @return memory estimate
	 */
	public static long estimatePartitionedSizeExactSparsity(MatrixCharacteristics mc) {
		return estimatePartitionedSizeExactSparsity(
				mc.getRows(), mc.getCols(),
				mc.getRowsPerBlock(), mc.getColsPerBlock(),
				mc.getNonZeros());
	}

	/**
	 * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
	 * matrix with dimensions=(nrows,ncols) and number of non-zeros nnz.
	 *
	 * @param rlen number of rows
	 * @param clen number of cols
	 * @param brlen rows per block
	 * @param bclen cols per block
	 * @param nnz number of non-zeros
	 * @return memory estimate
	 */
	public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long brlen, long bclen, long nnz) {
		double sp = getSparsity(rlen, clen, nnz);
		return estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);
	}

	/**
	 * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
	 * matrix with dimensions=(nrows,ncols) and sparsity=sp.
	 *
	 * @param rlen number of rows
	 * @param clen number of cols
	 * @param brlen rows per block
	 * @param bclen cols per block
	 * @param sp sparsity
	 * @return memory estimate
	 */
	public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long brlen, long bclen, double sp)
	{
		long ret = 0;

		//check for guaranteed existence of empty blocks (less nnz than total number of blocks)
		long tnrblks = (long)Math.ceil((double)rlen/brlen);
		long tncblks = (long)Math.ceil((double)clen/bclen);
		long nnz = (long) Math.ceil(sp * rlen * clen);
		if( nnz < tnrblks * tncblks ) {
			long lrlen = Math.min(rlen, brlen);
			long lclen = Math.min(clen, bclen);
			return nnz * estimateSizeExactSparsity(lrlen, lclen, 1)
				 + (tnrblks * tncblks - nnz) * estimateSizeEmptyBlock(lrlen, lclen);
		}

		//estimate size of full brlen x bclen blocks
		long nrblks = rlen / brlen;
		long ncblks = clen / bclen;
		if( nrblks * ncblks > 0 )
			ret += nrblks * ncblks * estimateSizeExactSparsity(brlen, bclen, sp);

		//estimate size of bottom boundary blocks
		long lrlen = rlen % brlen;
		if( ncblks > 0 && lrlen > 0 )
			ret += ncblks * estimateSizeExactSparsity(lrlen, bclen, sp);

		//estimate size of right boundary blocks
		long lclen = clen % bclen;
		if( nrblks > 0 && lclen > 0 )
			ret += nrblks * estimateSizeExactSparsity(brlen, lclen, sp);

		//estimate size of bottom right boundary block
		if( lrlen > 0 && lclen > 0 )
			ret += estimateSizeExactSparsity(lrlen, lclen, sp);

		return ret;
	}

	/**
	 * Similar to estimateSizeExactSparsity() except that it provides a worst-case
	 * estimate by assuming a fully dense matrix (sparsity 1.0).
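	 * For example, estimateSize(1000, 1000) assumes a dense block of
	 * 1,000,000 cells, i.e., roughly 8MB of payload plus header overhead.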
	 *
	 * @param nrows number of rows
	 * @param ncols number of cols
	 * @return memory estimate
	 */
	public static long estimateSize(long nrows, long ncols) {
		return estimateSizeExactSparsity(nrows, ncols, 1.0);
	}

	public static long estimateSizeEmptyBlock(long nrows, long ncols) {
		return estimateSizeExactSparsity(0, 0, 0.0d);
	}

	public static long estimateSizeTextOutput( long rows, long cols, long nnz, OutputInfo oinfo ) {
		long bsize = MatrixBlock.estimateSizeOnDisk(rows, cols, nnz);
		if( oinfo == OutputInfo.TextCellOutputInfo || oinfo == OutputInfo.MatrixMarketOutputInfo )
			return bsize * 3;
		else if( oinfo == OutputInfo.CSVOutputInfo )
			return bsize * 2;

		//unknown output info
		return bsize;
	}

	/**
	 * Indicates if the given indexing range is block aligned, i.e., it does not require
	 * global aggregation of blocks.
	 *
	 * @param ixrange indexing range
	 * @param mc matrix characteristics
	 * @return true if indexing range is block aligned
	 */
	public static boolean isIndexingRangeBlockAligned(IndexRange ixrange, MatrixCharacteristics mc) {
		long rl = ixrange.rowStart;
		long ru = ixrange.rowEnd;
		long cl = ixrange.colStart;
		long cu = ixrange.colEnd;
		long brlen = mc.getRowsPerBlock();
		long bclen = mc.getColsPerBlock();
		return isIndexingRangeBlockAligned(rl, ru, cl, cu, brlen, bclen);
	}

	/**
	 * Indicates if the given indexing range is block aligned, i.e., it does not require
	 * global aggregation of blocks.
	 *
	 * @param rl rows lower
	 * @param ru rows upper
	 * @param cl cols lower
	 * @param cu cols upper
	 * @param brlen rows per block
	 * @param bclen cols per block
	 * @return true if indexing range is block aligned
	 */
	public static boolean isIndexingRangeBlockAligned(long rl, long ru, long cl, long cu, long brlen, long bclen) {
		return rl != -1 && ru != -1 && cl != -1 && cu != -1
			&& ((rl-1) % brlen == 0 && (cl-1) % bclen == 0
			 || (rl-1) / brlen == (ru-1) / brlen && (cl-1) % bclen == 0
			 || (rl-1) % brlen == 0 && (cl-1) / bclen == (cu-1) / bclen);
	}

	/**
	 * Returns false if the dimensions are known to be invalid; otherwise true.
	 *
	 * @param rows number of rows
	 * @param cols number of cols
	 * @return true if dimensions valid
	 */
	public static boolean isValidCPDimensions( long rows, long cols ) {
		//the current CP runtime implementation requires that rows and cols
		//are integers since we use a single matrixblock to represent the
		//entire matrix
		return (rows <= Integer.MAX_VALUE && cols <= Integer.MAX_VALUE);
	}

	/**
	 * Returns false if schema and names are not properly specified; otherwise true.
	 * Both must be non-null, of equal length, and of length greater than 0.
	 *
	 * @param schema the schema
	 * @param names the names
	 * @return false if schema and names are not properly specified
	 */
	public static boolean isValidCPDimensions( ValueType[] schema, String[] names ) {
		// Length of schema and names to be same, and > 0.
		return (schema != null && names != null
			&& schema.length > 0 && schema.length == names.length);
	}

	/**
	 * Determines if the given matrix size can be represented in CP data structures. Note that
	 * if the sparsity is unknown, it needs to be specified such that nnz = rows*cols (i.e., 1.0).
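	 * For example, a dense 50,000 x 50,000 matrix (2.5 * 10^9 cells) exceeds
	 * MAX_NUMCELLS_CP_DENSE (Integer.MAX_VALUE) and is hence not a valid CP matrix,
	 * while the same dimensions with sparsity 0.001 (2.5 * 10^6 non-zeros) are
	 * valid in sparse format.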
	 *
	 * @param rows number of rows
	 * @param cols number of cols
	 * @param sparsity the sparsity
	 * @return true if valid matrix size
	 */
	public static boolean isValidCPMatrixSize( long rows, long cols, double sparsity )
	{
		boolean ret = true;

		//the current CP runtime implementation has several limitations:
		//1) for dense: 16GB because we use a linearized array (bounded to int in java)
		//2) for sparse: 2G x 2G nnz because (1) nnz maintained as long, (2) potential changes
		//   to dense, and (3) sparse row arrays also of max int size (worst case in case of skew)
		long nnz = (long)(sparsity * rows * cols);
		boolean sparse = MatrixBlock.evalSparseFormatInMemory(rows, cols, nnz);

		if( sparse ) { //SPARSE
			//check max nnz (dependent on sparse block format)
			ret = (nnz <= MAX_NNZ_CP_SPARSE);
		}
		else { //DENSE
			//check number of matrix cells
			ret = ((rows * cols) <= MAX_NUMCELLS_CP_DENSE);
		}

		return ret;
	}

	/**
	 * Indicates if the given matrix characteristics exceed the threshold for
	 * caching, i.e., the matrix should be cached.
	 *
	 * @param dim2 dimension 2 (number of columns)
	 * @param outMem estimated output memory in bytes
	 * @return true if the given matrix characteristics exceed the threshold
	 */
	public static boolean exceedsCachingThreshold(long dim2, double outMem) {
		return !(dim2 > 1 && outMem < getLocalMemBudget()
			  || dim2 == 1 && outMem < getLocalMemBudget()/3);
	}

	/**
	 * Wrapper over internal filename construction for external usage.
	 *
	 * @return unique temp file name
	 */
	public static String getUniqueTempFileName() {
		return ConfigurationManager.getScratchSpace()
			+ Lop.FILE_SEPARATOR + Lop.PROCESS_PREFIX + DMLScript.getUUID()
			+ Lop.FILE_SEPARATOR + ProgramConverter.CP_ROOT_THREAD_ID
			+ Lop.FILE_SEPARATOR + Dag.getNextUniqueFilenameSuffix();
	}

	public static boolean allowsToFilterEmptyBlockOutputs( Hop hop )
		throws HopsException
	{
		boolean ret = true;
		for( Hop p : hop.getParent() ) {
			p.optFindExecType(); //ensure exec type evaluated
			ret &= ( p.getExecType() == ExecType.CP
				|| (p instanceof AggBinaryOp && allowsToFilterEmptyBlockOutputs(p))
				|| (p instanceof DataOp && ((DataOp)p).getDataOpType() == DataOpTypes.PERSISTENTWRITE
					&& ((DataOp)p).getInputFormatType() == FileFormatTypes.TEXT))
				&& !(p instanceof FunctionOp || (p instanceof DataOp
					&& ((DataOp)p).getInputFormatType() != FileFormatTypes.TEXT)); //no function call or transient write
		}
		return ret;
	}

	public static int getConstrainedNumThreads(int maxNumThreads)
	{
		//by default max local parallelism (vcores)
		int ret = InfrastructureAnalyzer.getLocalParallelism();

		//apply external max constraint (e.g., set by parfor or other rewrites)
		if( maxNumThreads > 0 ) {
			ret = Math.min(ret, maxNumThreads);
		}

		//apply global multi-threading constraint
		if( !ConfigurationManager.isParallelMatrixOperations() ) {
			ret = 1;
		}

		return ret;
	}

	////////////////////////
	// Sparsity Estimates //
	////////////////////////

	/**
	 * Estimates the result sparsity for matrix multiplication A %*% B.
	 *
	 * @param sp1 sparsity of A
	 * @param sp2 sparsity of B
	 * @param m nrow(A)
	 * @param k ncol(A), nrow(B)
	 * @param n ncol(B)
	 * @param worstcase true if worst case
	 * @return the sparsity
	 */
	public static double getMatMultSparsity(double sp1, double sp2, long m, long k, long n, boolean worstcase) {
		if( worstcase ) {
			double nnz1 = sp1 * m * k;
			double nnz2 = sp2 * k * n;
			return Math.min(1, nnz1/m) * Math.min(1, nnz2/n);
		}
		else
			return (1 - Math.pow(1-sp1*sp2, k));
	}

	public static double getLeftIndexingSparsity( long rlen1, long clen1, long nnz1, long rlen2, long clen2, long nnz2 )
	{
		boolean scalarRhs = (rlen2 == 0 && clen2 == 0);

		//infer worst-case output nnz
		long lnnz = -1;
		if( nnz1 >= 0 && scalarRhs )
			lnnz = nnz1 + 1;             // nnz(left) + scalar
		else if( nnz1 >= 0 && nnz2 >= 0 )
			lnnz = nnz1 + nnz2;          // nnz(left) + nnz(right)
		else if( nnz1 >= 0 && rlen2 > 0 && clen2 > 0 )
			lnnz = nnz1 + rlen2*clen2;   // nnz(left) + nnz(right_dense)
		lnnz = Math.min(lnnz, rlen1*clen1);

		return getSparsity(rlen1, clen1, (lnnz >= 0) ? lnnz : rlen1*clen1);
	}

	/**
	 * Determines if a given binary op is potentially conditional sparse safe.
	 *
	 * @param op the HOP OpOp2
	 * @return true if potentially conditional sparse safe
	 */
	public static boolean isBinaryOpConditionalSparseSafe( OpOp2 op ) {
		return ( op == OpOp2.GREATER || op == OpOp2.LESS
			|| op == OpOp2.NOTEQUAL || op == OpOp2.EQUAL
			|| op == OpOp2.MINUS);
	}

	/**
	 * Determines if a given binary op with a scalar literal guarantees an output
	 * sparsity which is exactly the same as its matrix input sparsity.
	 *
	 * @param op the HOP OpOp2
	 * @param lit literal operator
	 * @return true if output sparsity same as matrix input sparsity
	 */
	public static boolean isBinaryOpConditionalSparseSafeExact( OpOp2 op, LiteralOp lit ) {
		double val = HopRewriteUtils.getDoubleValueSafe(lit);
		return ( op == OpOp2.NOTEQUAL && val == 0);
	}

	public static double getBinaryOpSparsityConditionalSparseSafe( double sp1, OpOp2 op, LiteralOp lit ) {
		double val = HopRewriteUtils.getDoubleValueSafe(lit);
		return (  (op == OpOp2.GREATER  && val == 0)
				|| (op == OpOp2.LESS     && val == 0)
				|| (op == OpOp2.NOTEQUAL && val == 0)
				|| (op == OpOp2.EQUAL    && val != 0)
				|| (op == OpOp2.MINUS    && val == 0)) ? sp1 : 1.0;
	}

	/**
	 * Estimates the result sparsity for matrix-matrix binary operations (A op B).
	 *
	 * @param sp1 sparsity of A
	 * @param sp2 sparsity of B
	 * @param op binary operation
	 * @param worstcase true if worst case
	 * @return result sparsity for matrix-matrix binary operations
	 */
	public static double getBinaryOpSparsity(double sp1, double sp2, OpOp2 op, boolean worstcase)
	{
		// default is worst-case estimate for robustness
		double ret = 1.0;

		if( worstcase )
		{
			//NOTE: for matrix-scalar operations this estimate is too conservative, because
			//Math.min(1, sp1 + sp2) will always give a sparsity 1 if we pass sp2=1 for scalars.
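			//For example, a matrix-scalar plus with sp1=0.1 and sp2=1 yields
			//min(1, 1.1) = 1.0 even though the true output sparsity may be 0.1 (e.g., X+0).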
			//In order to do better (with guarantees), we need to take the actual values into account.
			switch(op) {
				case PLUS:
				case MINUS:
				case LESS:
				case GREATER:
				case NOTEQUAL:
				case MIN:
				case MAX:
				case OR:
					ret = Math.min(1, sp1 + sp2); break;
				case MULT:
				case AND:
					ret = Math.min(sp1, sp2); break;
				case DIV:
				case MODULUS:
				case POW:
				case MINUS_NZ:
				case LOG_NZ:
					ret = sp1; break;
				//case EQUAL: //does not work on worst-case estimates
				//	ret = 1-Math.abs(sp1-sp2); break;
				default:
					ret = 1.0;
			}
		}
		else
		{
			switch(op) {
				case PLUS:
				case MINUS:
					// result[i,j] != 0 iff A[i,j] != 0 || B[i,j] != 0
					// worst case estimate = sp1+sp2
					ret = (1 - (1-sp1)*(1-sp2));
					break;
				case MULT:
					// result[i,j] != 0 iff A[i,j] != 0 && B[i,j] != 0
					// worst case estimate = min(sp1,sp2)
					ret = sp1 * sp2;
					break;
				case DIV:
					ret = 1.0; // worst case estimate
					break;
				case LESS:
				case LESSEQUAL:
				case GREATER:
				case GREATEREQUAL:
				case EQUAL:
				case NOTEQUAL:
					ret = 1.0; // purely data-dependent operations, and hence worst-case estimate
					break;
				//MIN, MAX, AND, OR, LOG, POW
				default:
					ret = 1.0;
			}
		}

		return ret;
	}

	public static double getSparsity( MatrixCharacteristics mc ) {
		return getSparsity(mc.getRows(), mc.getCols(), mc.getNonZeros());
	}

	public static double getSparsity( long dim1, long dim2, long nnz ) {
		if( dim1 <= 0 || dim2 <= 0 || nnz < 0 )
			return 1.0;
		else
			return Math.min(((double)nnz)/dim1/dim2, 1.0);
	}

	public static String toMB(double inB) {
		if( inB < 0 )
			return "-";
		return String.format("%.0f", inB/(1024*1024));
	}

	/**
	 * Function to evaluate simple size expressions over literals and nrow/ncol.
	 *
	 * It returns the exact result of such expressions if known, otherwise
	 * Long.MAX_VALUE if unknown.
	 *
	 * @param root the root high-level operator
	 * @param valMemo memoization table of hop IDs to evaluated values
	 * @return size expression value
	 * @throws HopsException if HopsException occurs
	 */
	public static long rEvalSimpleLongExpression( Hop root, HashMap<Long, Long> valMemo )
		throws HopsException
	{
		long ret = Long.MAX_VALUE;

		//for simplicity and robustness call double and cast.
		HashMap<Long, Double> dvalMemo = new HashMap<Long, Double>();
		double tmp = rEvalSimpleDoubleExpression(root, dvalMemo);
		if( tmp != Double.MAX_VALUE )
			ret = UtilFunctions.toLong( tmp );

		return ret;
	}

	public static long rEvalSimpleLongExpression( Hop root, HashMap<Long, Long> valMemo, LocalVariableMap vars )
		throws HopsException
	{
		long ret = Long.MAX_VALUE;

		//for simplicity and robustness call double and cast.
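		//For example, given vars = {n -> 1000}, a datagen size expression n/2 + 1
		//evaluates to 501, whereas any unknown leaf yields Long.MAX_VALUE.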
		HashMap<Long, Double> dvalMemo = new HashMap<Long, Double>();
		double tmp = rEvalSimpleDoubleExpression(root, dvalMemo, vars);
		if( tmp != Double.MAX_VALUE )
			ret = UtilFunctions.toLong( tmp );

		return ret;
	}

	public static double rEvalSimpleDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
		throws HopsException
	{
		//memoization (prevent redundant computation of common subexpr)
		if( valMemo.containsKey(root.getHopID()) )
			return valMemo.get(root.getHopID());

		double ret = Double.MAX_VALUE;

		//always use constants
		if( root instanceof LiteralOp )
			ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);

		//advanced size expression evaluation
		if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION ) {
			if( root instanceof UnaryOp )
				ret = rEvalSimpleUnaryDoubleExpression(root, valMemo);
			else if( root instanceof BinaryOp )
				ret = rEvalSimpleBinaryDoubleExpression(root, valMemo);
		}

		valMemo.put(root.getHopID(), ret);
		return ret;
	}

	public static double rEvalSimpleDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
		throws HopsException
	{
		//memoization (prevent redundant computation of common subexpr)
		if( valMemo.containsKey(root.getHopID()) )
			return valMemo.get(root.getHopID());

		double ret = Double.MAX_VALUE;

		if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION ) {
			if( root instanceof LiteralOp )
				ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);
			else if( root instanceof UnaryOp )
				ret = rEvalSimpleUnaryDoubleExpression(root, valMemo, vars);
			else if( root instanceof BinaryOp )
				ret = rEvalSimpleBinaryDoubleExpression(root, valMemo, vars);
			else if( root instanceof DataOp ) {
				String name = root.getName();
				Data dat = vars.get(name);
				if( dat != null && dat instanceof ScalarObject )
					ret = ((ScalarObject)dat).getDoubleValue();
			}
		}

		valMemo.put(root.getHopID(), ret);
		return ret;
	}

	protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
		throws HopsException
	{
		//memoization (prevent redundant computation of common subexpr)
		if( valMemo.containsKey(root.getHopID()) )
			return valMemo.get(root.getHopID());

		double ret = Double.MAX_VALUE;

		UnaryOp uroot = (UnaryOp) root;
		Hop input = uroot.getInput().get(0);

		if( uroot.getOp() == Hop.OpOp1.NROW )
			ret = (input.getDim1() > 0) ? input.getDim1() : Double.MAX_VALUE;
		else if( uroot.getOp() == Hop.OpOp1.NCOL )
			ret = (input.getDim2() > 0) ? input.getDim2() : Double.MAX_VALUE;
		else {
			double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo);
			if( lval != Double.MAX_VALUE ) {
				switch( uroot.getOp() ) {
					case SQRT:            ret = Math.sqrt(lval); break;
					case ROUND:           ret = Math.round(lval); break;
					case CAST_AS_BOOLEAN: ret = (lval != 0) ? 1 : 0; break;
					case CAST_AS_INT:     ret = UtilFunctions.toLong(lval); break;
					case CAST_AS_DOUBLE:  ret = lval; break;
					default:              ret = Double.MAX_VALUE;
				}
			}
		}

		valMemo.put(root.getHopID(), ret);
		return ret;
	}

	protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
		throws HopsException
	{
		//memoization (prevent redundant computation of common subexpr)
		if( valMemo.containsKey(root.getHopID()) )
			return valMemo.get(root.getHopID());

		double ret = Double.MAX_VALUE;

		UnaryOp uroot = (UnaryOp) root;
		Hop input = uroot.getInput().get(0);

		if( uroot.getOp() == Hop.OpOp1.NROW )
			ret = (input.getDim1() > 0) ? input.getDim1() : Double.MAX_VALUE;
		else if( uroot.getOp() == Hop.OpOp1.NCOL )
			ret = (input.getDim2() > 0) ? input.getDim2() : Double.MAX_VALUE;
		else {
			double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo, vars);
			if( lval != Double.MAX_VALUE ) {
				switch( uroot.getOp() ) {
					case SQRT:            ret = Math.sqrt(lval); break;
					case ROUND:           ret = Math.round(lval); break;
					case CAST_AS_BOOLEAN: ret = (lval != 0) ? 1 : 0; break;
					case CAST_AS_INT:     ret = UtilFunctions.toLong(lval); break;
					case CAST_AS_DOUBLE:  ret = lval; break;
					default:              ret = Double.MAX_VALUE;
				}
			}
		}

		valMemo.put(root.getHopID(), ret);
		return ret;
	}

	protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
		throws HopsException
	{
		//memoization (prevent redundant computation of common subexpr)
		if( valMemo.containsKey(root.getHopID()) )
			return valMemo.get(root.getHopID());

		double ret = Double.MAX_VALUE;

		BinaryOp broot = (BinaryOp) root;

		double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo);
		double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo);

		//note: positive and negative values might be valid subexpressions
		if( lret != Double.MAX_VALUE && rret != Double.MAX_VALUE ) { //if known
			switch( broot.getOp() ) {
				case PLUS:  ret = lret + rret; break;
				case MINUS: ret = lret - rret; break;
				case MULT:  ret = lret * rret; break;
				case DIV:   ret = lret / rret; break;
				case MIN:   ret = Math.min(lret, rret); break;
				case MAX:   ret = Math.max(lret, rret); break;
				case POW:   ret = Math.pow(lret, rret); break;
				//special mod / intdiv for runtime consistency
				case MODULUS: ret = Modulus.getFnObject().execute(lret, rret); break;
				case INTDIV:  ret = IntegerDivide.getFnObject().execute(lret, rret); break;
				default: ret = Double.MAX_VALUE;
			}
		}

		valMemo.put(root.getHopID(), ret);
		return ret;
	}

	protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
		throws HopsException
	{
		//memoization (prevent redundant computation of common subexpr)
		if( valMemo.containsKey(root.getHopID()) )
			return valMemo.get(root.getHopID());

		double ret = Double.MAX_VALUE;

		BinaryOp broot = (BinaryOp) root;

		double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo, vars);
		double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo, vars);

		//note: positive and negative values might be valid subexpressions
		if( lret != Double.MAX_VALUE && rret != Double.MAX_VALUE ) { //if known
			switch( broot.getOp() ) {
				case PLUS:  ret = lret + rret; break;
				case MINUS: ret = lret - rret; break;
				case MULT:  ret = lret * rret; break;
				case DIV:   ret = lret / rret; break;
				case MIN:   ret = Math.min(lret, rret); break;
				case MAX:   ret = Math.max(lret, rret); break;
				case POW:   ret = Math.pow(lret, rret); break;
				//special mod / intdiv for runtime consistency
				case MODULUS: ret = Modulus.getFnObject().execute(lret, rret); break;
				case INTDIV:  ret = IntegerDivide.getFnObject().execute(lret, rret); break;
				default: ret = Double.MAX_VALUE;
			}
		}

		valMemo.put(root.getHopID(), ret);
		return ret;
	}
}