/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.hops;

import java.util.HashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.ibm.bi.dml.api.DMLScript;
import com.ibm.bi.dml.api.DMLScript.RUNTIME_PLATFORM;
import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.conf.DMLConfig;
import com.ibm.bi.dml.hops.Hop.DataOpTypes;
import com.ibm.bi.dml.hops.Hop.FileFormatTypes;
import com.ibm.bi.dml.hops.Hop.OpOp2;
import com.ibm.bi.dml.hops.rewrite.HopRewriteUtils;
import com.ibm.bi.dml.lops.LopProperties.ExecType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.LocalVariableMap;
import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.instructions.cp.Data;
import com.ibm.bi.dml.runtime.instructions.cp.ScalarObject;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.data.SparseRow;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
import com.ibm.bi.dml.yarn.ropt.YarnClusterAnalyzer;

public class OptimizerUtils 
{
    private static final Log LOG = LogFactory.getLog(OptimizerUtils.class.getName());
    
    ////////////////////////////////////////////////////////
    // Optimizer constants and flags (incl tuning knobs)  //
    ////////////////////////////////////////////////////////

    /**
     * Utilization factor used in deciding whether an operation is to be scheduled on CP or MR.
     * NOTE: it is important that MEM_UTIL_FACTOR + CacheableData.CACHING_BUFFER_SIZE < 1.0.
     */
    public static double MEM_UTIL_FACTOR = 0.7d;
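    /* Worked example (hypothetical heap size, not part of the original code):
     * with a 4GB JVM heap, the utilization factor yields a CP memory budget of
     *
     *   double budget = 4096d * 1024 * 1024 * MEM_UTIL_FACTOR; //0.7 * 4GB ~ 2.8GB
     *
     * Operations whose memory estimate exceeds this budget are scheduled on MR
     * rather than CP; see getLocalMemBudget() below.
     */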
    /**
     * Default memory size, which is used when the actual estimate cannot be computed
     * -- for example, when input/output dimensions are unknown. In case of ROBUST,
     * the default is set to a large value so that operations are scheduled on MR.
     */
    public static double DEFAULT_SIZE;
    
    public static final long DOUBLE_SIZE = 8;
    public static final long INT_SIZE = 4;
    public static final long CHAR_SIZE = 1;
    public static final long BOOLEAN_SIZE = 1;
    public static final double BIT_SIZE = (double)1/8;
    public static final double INVALID_SIZE = -1d; // memory estimate not computed
    public static final long MAX_NUMCELLS_CP_DENSE = Integer.MAX_VALUE;
    
    /**
     * Enables/disables dynamic re-compilation of lops/instructions.
     * If enabled, we recompile each program block that contains at least
     * one hop that requires re-compilation (e.g., unknown statistics
     * during compilation, or program blocks in functions).
     */
    public static boolean ALLOW_DYN_RECOMPILATION = true;
    public static boolean ALLOW_PARALLEL_DYN_RECOMPILATION = ALLOW_DYN_RECOMPILATION && true;
    
    /**
     * Enables/disables putting operations with data-dependent output
     * size into individual statement blocks / program blocks.
     * Since recompilation is done on the granularity of program blocks,
     * this enables recompilation of subsequent operations according
     * to the actual output size. This rewrite might limit the opportunity
     * for piggybacking and therefore should only be applied if
     * dynamic recompilation is enabled as well.
     */
    public static boolean ALLOW_INDIVIDUAL_SB_SPECIFIC_OPS = ALLOW_DYN_RECOMPILATION && true;

    /**
     * Enables common subexpression elimination in dags. There is, however, a potential tradeoff
     * between computation redundancy and data transfer between MR jobs. Since we do not reason
     * about transferred data yet, this rewrite rule is enabled by default.
     */
    public static boolean ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = true;

    /**
     * Enables constant folding in dags. Constant folding computes simple expressions of binary
     * operations and literals and replaces the hop sub-DAG with a new literal operator.
     */
    public static boolean ALLOW_CONSTANT_FOLDING = true;
    
    /**
     * Enables algebraic simplification rewrites in dags.
     */
    public static boolean ALLOW_ALGEBRAIC_SIMPLIFICATION = true;
    
    /**
     * Enables if-else branch removal for constant predicates (original literals or
     * results of constant folding).
     */
    public static boolean ALLOW_BRANCH_REMOVAL = true;
    
    /**
     * Enables automatic vectorization of operations in dags.
     */
    public static boolean ALLOW_AUTO_VECTORIZATION = true;
    
    /**
     * Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
     * expressions are defined as binary operations on literals and nrow/ncol. This applies
     * only to exact size information.
     */
    public static boolean ALLOW_SIZE_EXPRESSION_EVALUATION = true;

    /**
     * Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
     * expressions are defined as binary operations on literals and b(+) or b(*) on nrow/ncol.
     * This applies also to worst-case size information.
     */
    public static boolean ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = true;

    /**
     * Enables recompilation of datagen (rand) jobs.
     */
    public static boolean ALLOW_RAND_JOB_RECOMPILE = true;
    
    /**
     * Enables CP-side data transformation for small files.
     */
    public static boolean ALLOW_TRANSFORM_RECOMPILE = true;

    /**
     * Enables parfor runtime piggybacking of MR jobs into the packed jobs for
     * scan sharing.
     */
    public static boolean ALLOW_RUNTIME_PIGGYBACKING = true;
    
    /**
     * Enables interprocedural analysis between main script and functions as well as functions
     * and other functions. This includes, for example, propagating statistics into functions
     * if safe to do so (e.g., if called once).
     */
    public static boolean ALLOW_INTER_PROCEDURAL_ANALYSIS = true;

    /**
     * Enables sum product rewrites such as mapmultchains. In the future, this will cover
     * all sum-product related rewrites.
     */
    public static boolean ALLOW_SUM_PRODUCT_REWRITES = true;
    
    /**
     * Enables a specific hop dag rewrite that splits hop dags after csv persistent reads with
     * unknown size in order to allow for recompile.
     */
    public static boolean ALLOW_SPLIT_HOP_DAGS = true;
    
    /**
     * Enables parallel read/write of all text formats (textcell, csv, mm)
     * and binary formats (binary block).
     */
    public static boolean PARALLEL_CP_READ_TEXTFORMATS = true;
    public static boolean PARALLEL_CP_WRITE_TEXTFORMATS = true;
    public static boolean PARALLEL_CP_READ_BINARYFORMATS = true;
    public static boolean PARALLEL_CP_WRITE_BINARYFORMATS = true;
    
    /**
     * Specifies a multiplier computing the degree of parallelism of parallel
     * text read/write out of the available degree of parallelism. Set it to 1.0
     * to get a number of threads equal to the number of virtual cores.
     */
    public static final double PARALLEL_CP_READ_PARALLELISM_MULTIPLIER = 1.0;
    public static final double PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER = 1.0;

    /**
     * Enables multi-threaded matrix multiply for mm, mmchain, and tsmm.
     */
    public static boolean PARALLEL_CP_MATRIX_MULTIPLY = true;
    
    /**
     * Enables the use of CombineSequenceFileInputFormat with splitsize = 2x hdfs blocksize,
     * if the sort buffer size is large enough and parallelism is not hurt. This solves two
     * issues: (1) it combines small files (depending on producers), and (2) it reduces the
     * task latency of large jobs with many tasks by a factor of 2.
     */
    public static final boolean ALLOW_COMBINE_FILE_INPUT_FORMAT = true;
    
    //////////////////////
    // Optimizer levels //
    //////////////////////

    private static OptimizationLevel _optLevel = OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT;
    
    /**
     * Optimization Types for Compilation
     * 
     *  O0 STATIC - Decisions for scheduling operations on CP/MR are based on
     *  a predefined set of rules, which check if the dimensions are below a
     *  fixed/static threshold (OLD method of choosing between CP and MR).
     *  The optimization scope is LOCAL, i.e., per statement block.
     *  Advanced rewrites like constant folding, common subexpression elimination,
     *  or inter-procedural analysis are NOT applied.
     * 
     *  O1 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
     *  based on the amount of memory required to perform that operation.
     *  It does NOT take the execution time into account.
     *  The optimization scope is LOCAL, i.e., per statement block.
     *  Advanced rewrites like constant folding, common subexpression elimination,
     *  or inter-procedural analysis are NOT applied.
     * 
     *  O2 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
     *  based on the amount of memory required to perform that operation.
     *  It does NOT take the execution time into account.
     *  The optimization scope is LOCAL, i.e., per statement block.
     *  All advanced rewrites are applied. This is the default optimization
     *  level of SystemML.
     * 
     *  O3 RESOURCE TIME_MEMORY_BASED - Like O2, but additionally performs
     *  time- and memory-based resource optimization.
     * 
     *  O4 GLOBAL TIME_MEMORY_BASED - Operation scheduling on CP or MR as well as
     *  many other rewrites of data flow properties such as block size, partitioning,
     *  replication, vectorization, etc are done with the optimization objective of
     *  minimizing execution time under hard memory constraints per operation and
     *  execution context. The optimization scope is GLOBAL, i.e., program-wide.
     *  All advanced rewrites are applied. This optimization level requires more
     *  optimization time but has higher optimization potential.
     * 
     *  O5 DEBUG MODE - All optimizations, global and local, which interfere with
     *  breakpoints are NOT applied. This optimization level is REQUIRED for the
     *  compiler running in debug mode.
     */
    public enum OptimizationLevel { 
        O0_LOCAL_STATIC, 
        O1_LOCAL_MEMORY_MIN, 
        O2_LOCAL_MEMORY_DEFAULT, 
        O3_LOCAL_RESOURCE_TIME_MEMORY,
        O4_GLOBAL_TIME_MEMORY, 
        O5_DEBUG_MODE,
    };
    
    public static OptimizationLevel getOptLevel() {
        return _optLevel;
    }
    
    public static boolean isMemoryBasedOptLevel() {
        return (_optLevel != OptimizationLevel.O0_LOCAL_STATIC);
    }
    
    public static boolean isOptLevel( OptimizationLevel level ) {
        return (_optLevel == level);
    }
    /**
     * Sets the optimization level and, for levels with restricted rewrites,
     * disables the corresponding rewrite flags.
     * 
     * @param optlevel the optimization level (0-5)
     * @throws DMLRuntimeException
     */
    public static void setOptimizationLevel( int optlevel ) 
        throws DMLRuntimeException
    {
        if( optlevel < 0 || optlevel > 5 )
            throw new DMLRuntimeException("Error: invalid optimization level '"+optlevel+"' (valid values: 0-5).");
        
        // This overrides any optimization level that is present in the configuration file.
        // Why? This simplifies the calling logic: the user does not have to maintain two
        // config files or, worse, edit the config file every time the debugger is invoked.
        if(DMLScript.ENABLE_DEBUG_MODE) {
            optlevel = 5;
        }
        
        switch( optlevel )
        {
            // opt level 0: static dimensionality
            case 0:
                _optLevel = OptimizationLevel.O0_LOCAL_STATIC;
                ALLOW_CONSTANT_FOLDING = false;
                ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
                ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
                ALLOW_AUTO_VECTORIZATION = false;
                ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
                ALLOW_BRANCH_REMOVAL = false;
                ALLOW_SUM_PRODUCT_REWRITES = false;
                break;
            // opt level 1: memory-based (no advanced rewrites)
            case 1:
                _optLevel = OptimizationLevel.O1_LOCAL_MEMORY_MIN;
                ALLOW_CONSTANT_FOLDING = false;
                ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
                ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
                ALLOW_AUTO_VECTORIZATION = false;
                ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
                ALLOW_BRANCH_REMOVAL = false;
                ALLOW_SUM_PRODUCT_REWRITES = false;
                break;
            // opt level 2: memory-based (all advanced rewrites)
            case 2:
                _optLevel = OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT;
                break;
            // opt level 3: resource optimization, time- and memory-based (O2 with resource optimization)
            case 3:
                _optLevel = OptimizationLevel.O3_LOCAL_RESOURCE_TIME_MEMORY;
                break;
            // opt level 4: global, time- and memory-based (all advanced rewrites)
            case 4:
                _optLevel = OptimizationLevel.O4_GLOBAL_TIME_MEMORY;
                break;
            // opt level 5: debug mode (no interfering rewrites)
            case 5:
                _optLevel = OptimizationLevel.O5_DEBUG_MODE;
                ALLOW_CONSTANT_FOLDING = false;
                ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
                ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
                ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
                ALLOW_BRANCH_REMOVAL = false;
                ALLOW_DYN_RECOMPILATION = false;
                ALLOW_SIZE_EXPRESSION_EVALUATION = false;
                ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = false;
                ALLOW_RAND_JOB_RECOMPILE = false;
                ALLOW_SUM_PRODUCT_REWRITES = false;
                ALLOW_SPLIT_HOP_DAGS = false;
                break;
        }
        setDefaultSize();
        
        //handle parallel text io (incl awareness of thread contention in <jdk8)
        if (!ConfigurationManager.getConfig().getBooleanValue(DMLConfig.CP_PARALLEL_TEXTIO)) {
            PARALLEL_CP_READ_TEXTFORMATS = false;
            PARALLEL_CP_WRITE_TEXTFORMATS = false;
            PARALLEL_CP_READ_BINARYFORMATS = false;
            PARALLEL_CP_WRITE_BINARYFORMATS = false;
        }
        else if( InfrastructureAnalyzer.isJavaVersionLessThanJDK8()
            && InfrastructureAnalyzer.getLocalParallelism() > 1 )
        {
            LOG.warn("Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE < 1.8"
                + " (java.version=" + System.getProperty("java.version") + ").");
            //disable parallel text read
            PARALLEL_CP_READ_TEXTFORMATS = false;
        }

        //handle parallel matrix mult / rand configuration
        if (!ConfigurationManager.getConfig().getBooleanValue(DMLConfig.CP_PARALLEL_MATRIXMULT)) {
            PARALLEL_CP_MATRIX_MULTIPLY = false;
        }
    }
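    /* Illustrative usage (a minimal sketch; in practice the level comes from the
     * SystemML configuration, and this method assumes a loaded DMLConfig because
     * it reads the parallel text-io and matrix-mult settings):
     *
     *   OptimizerUtils.setOptimizationLevel(1);
     *   //now: isMemoryBasedOptLevel() == true, but advanced rewrites such as
     *   //ALLOW_CONSTANT_FOLDING and ALLOW_ALGEBRAIC_SIMPLIFICATION are disabled
     */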
    /**
     * 
     */
    public static void setDefaultSize() 
    {
        //we need to set default_size larger than any execution context
        //memory budget, however, it should not produce overflows on sum
        DEFAULT_SIZE = Math.max( InfrastructureAnalyzer.getLocalMaxMemory(),
            Math.max(InfrastructureAnalyzer.getRemoteMaxMemoryMap(),
                InfrastructureAnalyzer.getRemoteMaxMemoryReduce()));
    }
    
    /**
     * Returns the memory budget (according to the util factor) of the current JVM in bytes.
     * 
     * @return
     */
    public static double getLocalMemBudget()
    {
        double ret = InfrastructureAnalyzer.getLocalMaxMemory();
        return ret * OptimizerUtils.MEM_UTIL_FACTOR;
    }
    
    /**
     * 
     * @return
     */
    public static double getRemoteMemBudgetMap()
    {
        return getRemoteMemBudgetMap(false);
    }
    
    /**
     * 
     * @param subtractSortBuffer
     * @return
     */
    public static double getRemoteMemBudgetMap(boolean subtractSortBuffer)
    {
        double ret = InfrastructureAnalyzer.getRemoteMaxMemoryMap();
        if( subtractSortBuffer )
            ret -= InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
        return ret * OptimizerUtils.MEM_UTIL_FACTOR;
    }
    
    /**
     * 
     * @return
     */
    public static double getRemoteMemBudgetReduce()
    {
        double ret = InfrastructureAnalyzer.getRemoteMaxMemoryReduce();
        return ret * OptimizerUtils.MEM_UTIL_FACTOR;
    }

    /**
     * 
     * @param size
     * @return
     */
    public static boolean checkSparkBroadcastMemoryBudget( double size )
    {
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        //basic requirement: the broadcast needs to fit once in the remote broadcast memory
        //and twice into the local memory budget because we have to create a partitioned
        //broadcast and hand it over to the spark context as an in-memory object
        return ( size < memBudgetExec && 2*size < memBudgetLocal );
    }
    
    /**
     * 
     * @param rlen
     * @param clen
     * @param brlen
     * @param bclen
     * @param nnz
     * @return
     */
    public static boolean checkSparkBroadcastMemoryBudget( long rlen, long clen, long brlen, long bclen, long nnz )
    {
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        double sp = getSparsity(rlen, clen, nnz);
        double size = estimateSizeExactSparsity(rlen, clen, sp);
        double sizeP = estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);
        
        //basic requirement: the broadcast needs to fit once in the remote broadcast memory
        //and twice into the local memory budget because we have to create a partitioned
        //broadcast and hand it over to the spark context as an in-memory object
        return ( OptimizerUtils.isValidCPDimensions(rlen, clen)
            && sizeP < memBudgetExec && size+sizeP < memBudgetLocal );
    }

    /**
     * 
     * @param rlen
     * @param clen
     * @param brlen
     * @param bclen
     * @param nnz
     * @param memPinned
     * @return
     */
    public static boolean checkSparkCollectMemoryBudget( long rlen, long clen, int brlen, int bclen, long nnz, long memPinned )
    {
        //compute size of output matrix and its blocked representation
        double sp = getSparsity(rlen, clen, nnz);
        double memMatrix = estimateSizeExactSparsity(rlen, clen, sp);
        double memPMatrix = estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);
        
        //check if both output matrix and partitioned matrix fit into local mem budget
        return (memPinned + memMatrix + memPMatrix < getLocalMemBudget());
    }
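    /* Worked example (hypothetical sizes): broadcasting a dense 10,000 x 1,000
     * matrix requires roughly 10^7 * 8 bytes = 80MB. The checks above therefore
     * need about 80MB of remote broadcast memory and about 160MB of the local
     * budget, because the partitioned broadcast is materialized locally in
     * addition to the original in-memory object:
     *
     *   boolean ok = OptimizerUtils.checkSparkBroadcastMemoryBudget(
     *       10000, 1000, 1000, 1000, 10000L * 1000 ); //fully dense
     */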
    /**
     * Returns the number of reducers that potentially run in parallel.
     * This is either just the configured value (SystemML config) or
     * the minimum of configured value and available reduce slots.
     * 
     * @param configOnly
     * @return
     */
    public static int getNumReducers( boolean configOnly )
    {
        int ret = ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS);
        if( !configOnly ) {
            ret = Math.min(ret, InfrastructureAnalyzer.getRemoteParallelReduceTasks());
            
            //correction max number of reducers on yarn clusters
            if( InfrastructureAnalyzer.isYarnEnabled() )
                ret = (int)Math.max( ret, YarnClusterAnalyzer.getNumCores()/2 );
        }
        
        return ret;
    }
    
    /**
     * 
     * @return
     */
    public static int getNumMappers()
    {
        int ret = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        
        //correction max number of mappers on yarn clusters
        if( InfrastructureAnalyzer.isYarnEnabled() )
            ret = (int)Math.max( ret, YarnClusterAnalyzer.getNumCores() );
        
        return ret;
    }
    
    /**
     * 
     * @return
     */
    public static boolean isSparkExecutionMode()
    {
        return (   DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK
                || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK);
    }
    
    /**
     * 
     * @return
     */
    public static boolean isHybridExecutionMode()
    {
        return (   DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID
                || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK );
    }
    
    /**
     * Returns the degree of parallelism used for parallel text read.
     * This is computed as the number of virtual cores scaled by
     * PARALLEL_CP_READ_PARALLELISM_MULTIPLIER. If PARALLEL_CP_READ_TEXTFORMATS
     * is disabled, this method returns 1.
     * 
     * @return
     */
    public static int getParallelTextReadParallelism()
    {
        if( !PARALLEL_CP_READ_TEXTFORMATS )
            return 1; // sequential execution
        
        //compute degree of parallelism for parallel text read
        double dop = InfrastructureAnalyzer.getLocalParallelism()
                     * PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
        return (int) Math.round(dop);
    }
    
    /**
     * 
     * @return
     */
    public static int getParallelBinaryReadParallelism()
    {
        if( !PARALLEL_CP_READ_BINARYFORMATS )
            return 1; // sequential execution
        
        //compute degree of parallelism for parallel binary read
        double dop = InfrastructureAnalyzer.getLocalParallelism()
                     * PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
        return (int) Math.round(dop);
    }
    
    /**
     * Returns the degree of parallelism used for parallel text write.
     * This is computed as the number of virtual cores scaled by
     * PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER. If PARALLEL_CP_WRITE_TEXTFORMATS
     * is disabled, this method returns 1.
     * 
     * @return
     */
    public static int getParallelTextWriteParallelism()
    {
        if( !PARALLEL_CP_WRITE_TEXTFORMATS )
            return 1; // sequential execution

        //compute degree of parallelism for parallel text write
        double dop = InfrastructureAnalyzer.getLocalParallelism()
                     * PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
        return (int) Math.round(dop);
    }

    /**
     * 
     * @return
     */
    public static int getParallelBinaryWriteParallelism()
    {
        if( !PARALLEL_CP_WRITE_BINARYFORMATS )
            return 1; // sequential execution

        //compute degree of parallelism for parallel binary write
        double dop = InfrastructureAnalyzer.getLocalParallelism()
                     * PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
        return (int) Math.round(dop);
    }
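    /* Worked example: on a machine with 16 virtual cores and the default
     * multiplier of 1.0, getParallelTextReadParallelism() returns
     * round(16 * 1.0) = 16 threads; a multiplier of 0.5 would yield 8.
     * With PARALLEL_CP_READ_TEXTFORMATS disabled, it short-circuits to 1
     * (sequential read).
     */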
    ////////////////////////
    // Memory Estimates   //
    ////////////////////////
    
    /**
     * 
     * @param mc
     * @return
     */
    public static long estimateSizeExactSparsity(MatrixCharacteristics mc)
    {
        return estimateSizeExactSparsity(
                mc.getRows(),
                mc.getCols(),
                mc.getNonZeros());
    }
    
    /**
     * Estimates the footprint (in bytes) for an in-memory representation of a
     * matrix with dimensions=(nrows,ncols) and number of non-zeros nnz.
     * 
     * @param nrows
     * @param ncols
     * @param nnz
     * @return
     */
    public static long estimateSizeExactSparsity(long nrows, long ncols, long nnz)
    {
        double sp = getSparsity(nrows, ncols, nnz);
        return estimateSizeExactSparsity(nrows, ncols, sp);
    }
    
    /**
     * Estimates the footprint (in bytes) for an in-memory representation of a
     * matrix with dimensions=(nrows,ncols) and sparsity=sp.
     * 
     * This function can be used directly in Hops, when the actual sparsity is
     * known, i.e., when <code>sp</code> is guaranteed to give a worst-case estimate
     * (e.g., Rand with a fixed sparsity). In all other cases, estimateSize()
     * must be used so that worst-case estimates are computed, whenever
     * applicable.
     * 
     * @param nrows
     * @param ncols
     * @param sp
     * @return
     */
    public static long estimateSizeExactSparsity(long nrows, long ncols, double sp)
    {
        return MatrixBlock.estimateSizeInMemory(nrows, ncols, sp);
    }
    
    /**
     * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
     * matrix with the given matrix characteristics.
     * 
     * @param mc
     * @return
     */
    public static long estimatePartitionedSizeExactSparsity(MatrixCharacteristics mc)
    {
        return estimatePartitionedSizeExactSparsity(
                mc.getRows(),
                mc.getCols(),
                mc.getRowsPerBlock(),
                mc.getColsPerBlock(),
                mc.getNonZeros());
    }
    
    /**
     * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
     * matrix with dimensions=(rlen,clen) and number of non-zeros nnz.
     * 
     * @param rlen
     * @param clen
     * @param brlen
     * @param bclen
     * @param nnz
     * @return
     */
    public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long brlen, long bclen, long nnz)
    {
        double sp = getSparsity(rlen, clen, nnz);
        return estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);
    }
    
    /**
     * Estimates the footprint (in bytes) for a partitioned in-memory representation of a
     * matrix with dimensions=(rlen,clen) and sparsity=sp.
     * 
     * @param rlen
     * @param clen
     * @param brlen
     * @param bclen
     * @param sp
     * @return
     */
    public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long brlen, long bclen, double sp)
    {
        long ret = 0;

        //check for guaranteed existence of empty blocks (less nnz than total number of blocks)
        long tnrblks = (long)Math.ceil((double)rlen/brlen);
        long tncblks = (long)Math.ceil((double)clen/bclen);
        long nnz = (long) Math.ceil(sp * rlen * clen);
        if( nnz < tnrblks * tncblks ) {
            long lrlen = Math.min(rlen, brlen);
            long lclen = Math.min(clen, bclen);
            return nnz * estimateSizeExactSparsity(lrlen, lclen, 1)
                 + (tnrblks * tncblks - nnz) * estimateSizeEmptyBlock(lrlen, lclen);
        }
        
        //estimate size of full brlen x bclen blocks
        long nrblks = rlen / brlen;
        long ncblks = clen / bclen;
        if( nrblks * ncblks > 0 )
            ret += nrblks * ncblks * estimateSizeExactSparsity(brlen, bclen, sp);

        //estimate size of bottom boundary blocks
        long lrlen = rlen % brlen;
        if( ncblks > 0 && lrlen > 0 )
            ret += ncblks * estimateSizeExactSparsity(lrlen, bclen, sp);
        
        //estimate size of right boundary blocks
        long lclen = clen % bclen;
        if( nrblks > 0 && lclen > 0 )
            ret += nrblks * estimateSizeExactSparsity(brlen, lclen, sp);
        
        //estimate size of bottom right boundary block
        if( lrlen > 0 && lclen > 0 )
            ret += estimateSizeExactSparsity(lrlen, lclen, sp);
        
        return ret;
    }
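    /* Worked example (dense case, sp=1.0): a 2500 x 2500 matrix with
     * 1000 x 1000 blocks decomposes into 2x2 full blocks, two 500x1000 bottom
     * boundary blocks, two 1000x500 right boundary blocks, and one 500x500
     * corner block; the estimate sums the in-memory size of each:
     *
     *   long size = OptimizerUtils.estimatePartitionedSizeExactSparsity(
     *       2500, 2500, 1000, 1000, 1.0 );
     */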
    /**
     * Similar to estimateSizeExactSparsity() except that it provides a worst-case
     * estimate (sparsity 1.0) when the optimization type is ROBUST.
     * 
     * @param nrows
     * @param ncols
     * @return
     */
    public static long estimateSize(long nrows, long ncols)
    {
        return estimateSizeExactSparsity(nrows, ncols, 1.0);
    }
    
    /**
     * 
     * @param nrows
     * @param ncols
     * @return
     */
    public static long estimateSizeEmptyBlock(long nrows, long ncols)
    {
        return estimateSizeExactSparsity(0, 0, 0.0d);
    }
    
    /**
     * Estimates the memory footprint of a SparseRow with <code>clen</code>
     * columns and <code>sp</code> sparsity. This method accounts for the
     * overhead incurred by extra cells allocated (but not used) for SparseRow.
     * It assumes that non-zeros are uniformly distributed in the matrix --
     * i.e., #estimated nnz in a given SparseRow = clen*sp.
     * 
     * @param clen
     * @param sp
     * @return estimated size in bytes
     */
    public static long estimateRowSize(long clen, double sp)
    {
        if ( sp == 0 )
            return 0;
        
        int basicSize = 28;
        int cellSize = 12; // every cell takes 12 (8+4) bytes
        if ( sp == 1 ) {
            return clen * cellSize;
        }
        long numCells = SparseRow.initialCapacity;
        if ( (long) (sp*clen) > numCells ) {
            numCells = (long) (sp*clen);
        }
        long allocatedCells = (long)Math.pow(2, Math.ceil(Math.log(numCells)/Math.log(2)) );
        long rowSize = basicSize + allocatedCells * cellSize;
        return rowSize;
    }
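    /* Worked example: for clen=1000 and sp=0.1, the expected nnz per row is
     * 100, and the allocation is rounded up to the next power of two (128
     * cells), so estimateRowSize(1000, 0.1) = 28 + 128*12 = 1564 bytes rather
     * than 28 + 100*12 = 1228, accounting for unused capacity of the backing
     * arrays.
     */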
    public static long estimateSizeTextOutput( long rows, long cols, long nnz, OutputInfo oinfo )
    {
        long bsize = MatrixBlock.estimateSizeOnDisk(rows, cols, nnz);
        if( oinfo == OutputInfo.TextCellOutputInfo || oinfo == OutputInfo.MatrixMarketOutputInfo )
            return bsize * 3;
        else if( oinfo == OutputInfo.CSVOutputInfo )
            return bsize * 2;
        
        //unknown output info
        return bsize;
    }
    
    /**
     * Returns false if dimensions are known to be invalid; otherwise true.
     * 
     * @param rows
     * @param cols
     * @return
     */
    public static boolean isValidCPDimensions( long rows, long cols )
    {
        //the current CP runtime implementation requires that rows and cols
        //are integers since we use a single matrixblock to represent the
        //entire matrix
        return (rows <= Integer.MAX_VALUE && cols <= Integer.MAX_VALUE);
    }
    
    /**
     * Determines if the matrix size can be represented in CP data structures. Note that,
     * if the sparsity is unknown, it needs to be specified such that nnz amounts to
     * rows*cols (worst case).
     * 
     * @param rows
     * @param cols
     * @param sparsity
     * @return
     */
    public static boolean isValidCPMatrixSize( long rows, long cols, double sparsity )
    {
        boolean ret = true;
        
        //the current CP runtime implementation has several limitations:
        //1) for dense: 16GB because we use a linearized array (bounded to int in java)
        //2) for sparse: 2G x 2G nnz because (1) nnz maintained as long, (2) potential changes
        //   to dense, and (3) sparse row arrays also of max int size (worst case in case of skew)
        long nnz = (long)(sparsity * rows * cols);
        boolean sparse = MatrixBlock.evalSparseFormatInMemory(rows, cols, nnz);
        
        if( sparse ) //SPARSE
        {
            //check max nnz
            ret = (nnz <= Long.MAX_VALUE);
        }
        else //DENSE
        {
            //check number of matrix cells
            ret = ((rows * cols) <= MAX_NUMCELLS_CP_DENSE);
        }
        
        return ret;
    }
    
    /**
     * 
     * @param hop
     * @return
     * @throws HopsException
     */
    public static boolean allowsToFilterEmptyBlockOutputs( Hop hop )
        throws HopsException
    {
        boolean ret = true;
        for( Hop p : hop.getParent() ) {
            p.optFindExecType(); //ensure exec type evaluated
            ret &=   (  p.getExecType()==ExecType.CP
                     ||(p instanceof AggBinaryOp && allowsToFilterEmptyBlockOutputs(p) )
                     ||(p instanceof DataOp && ((DataOp)p).getDataOpType()==DataOpTypes.PERSISTENTWRITE && ((DataOp)p).getInputFormatType()==FileFormatTypes.TEXT))
                  && !(p instanceof FunctionOp || (p instanceof DataOp && ((DataOp)p).getInputFormatType()!=FileFormatTypes.TEXT) ); //no function call or transient write
        }
        
        return ret;
    }
    
    /**
     * 
     * @param maxNumThreads
     * @return
     */
    public static int getConstrainedNumThreads(int maxNumThreads)
    {
        //by default max local parallelism (vcores)
        int ret = InfrastructureAnalyzer.getLocalParallelism();
        
        //apply external max constraint (e.g., set by parfor or other rewrites)
        if( maxNumThreads > 0 ) {
            ret = Math.min(ret, maxNumThreads);
        }
        
        //apply global multi-threading constraint
        if( !PARALLEL_CP_MATRIX_MULTIPLY ) {
            ret = 1;
        }
        
        return ret;
    }
    
    ////////////////////////
    // Sparsity Estimates //
    ////////////////////////
    
    /**
     * Estimates the result sparsity for Matrix Multiplication A %*% B.
     * 
     * @param sp1 -- sparsity of A
     * @param sp2 -- sparsity of B
     * @param m -- nrow(A)
     * @param k -- ncol(A), nrow(B)
     * @param n -- ncol(B)
     * @param worstcase -- whether to compute a worst-case or expected-case estimate
     * @return
     */
    public static double getMatMultSparsity(double sp1, double sp2, long m, long k, long n, boolean worstcase)
    {
        if( worstcase ) {
            double nnz1 = sp1 * m * k;
            double nnz2 = sp2 * k * n;
            return Math.min(1, nnz1/m) * Math.min(1, nnz2/n);
        }
        else
            return (1 - Math.pow(1-sp1*sp2, k) );
    }
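    /* Worked example: for sp1 = sp2 = 0.01 and k = 1000, the expected-case
     * estimate is 1 - (1 - 0.01*0.01)^1000 ~ 0.095, i.e., the product of two
     * 1% sparse matrices is expected to be roughly 9.5% dense because every
     * output cell aggregates k independent products. The worst case instead
     * multiplies the capped row/column densities min(1, nnz1/m) * min(1, nnz2/n).
     */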
    /**
     * 
     * @param rlen1
     * @param clen1
     * @param nnz1
     * @param rlen2
     * @param clen2
     * @param nnz2
     * @return
     */
    public static double getLeftIndexingSparsity( long rlen1, long clen1, long nnz1, long rlen2, long clen2, long nnz2 )
    {
        boolean scalarRhs = (rlen2==0 && clen2==0);
        
        //infer worst-case output nnz
        long lnnz = -1;
        if( nnz1>=0 && scalarRhs )
            lnnz = nnz1+1;               // nnz(left) + scalar
        else if( nnz1>=0 && nnz2>=0 )
            lnnz = nnz1 + nnz2;          // nnz(left) + nnz(right)
        else if( nnz1>=0 && rlen2>0 && clen2>0 )
            lnnz = nnz1 + rlen2*clen2;   // nnz(left) + nnz(right_dense)
        lnnz = Math.min(lnnz, rlen1*clen1);
        
        return getSparsity(rlen1, clen1, (lnnz>=0) ? lnnz : rlen1*clen1);
    }
    
    /**
     * Determines if a given binary op is potentially conditional sparse safe.
     * 
     * @param op
     * @return
     */
    public static boolean isBinaryOpConditionalSparseSafe( OpOp2 op )
    {
        return (   op==OpOp2.GREATER
                || op==OpOp2.LESS
                || op==OpOp2.NOTEQUAL
                || op==OpOp2.EQUAL
                || op==OpOp2.MINUS);
    }
    
    /**
     * Determines if a given binary op with scalar literal guarantees an output
     * sparsity which is exactly the same as its matrix input sparsity.
     * 
     * @param op
     * @param lit
     * @return
     */
    public static boolean isBinaryOpConditionalSparseSafeExact( OpOp2 op, LiteralOp lit )
    {
        double val = HopRewriteUtils.getDoubleValueSafe(lit);
        return ( op==OpOp2.NOTEQUAL && val==0);
    }
    
    /**
     * 
     * @param sp1
     * @param op
     * @param lit
     * @return
     */
    public static double getBinaryOpSparsityConditionalSparseSafe( double sp1, OpOp2 op, LiteralOp lit )
    {
        double val = HopRewriteUtils.getDoubleValueSafe(lit);
        return (  (op==OpOp2.GREATER  && val==0)
                ||(op==OpOp2.LESS     && val==0)
                ||(op==OpOp2.NOTEQUAL && val==0)
                ||(op==OpOp2.EQUAL    && val!=0)
                ||(op==OpOp2.MINUS    && val==0)) ? sp1 : 1.0;
    }
    
    /**
     * Estimates the result sparsity for matrix-matrix binary operations (A op B).
     * 
     * NOTE: append has specific computation
     * 
     * @param sp1 -- sparsity of A
     * @param sp2 -- sparsity of B
     * @param op -- binary operation
     * @param worstcase -- whether to compute a worst-case or expected-case estimate
     * @return
     */
    public static double getBinaryOpSparsity(double sp1, double sp2, OpOp2 op, boolean worstcase)
    {
        // default is worst-case estimate for robustness
        double ret = 1.0;
        
        if( worstcase )
        {
            //NOTE: for matrix-scalar operations this estimate is too conservative, because
            //Math.min(1, sp1 + sp2) will always give a sparsity 1 if we pass sp2=1 for scalars.
            //In order to do better (with guarantees), we need to take the actual values into account
            switch(op) {
                case PLUS:
                case MINUS:
                case LESS:
                case GREATER:
                case NOTEQUAL:
                case MIN:
                case MAX:
                case OR:
                    ret = Math.min(1, sp1 + sp2); break;
                case MULT:
                case AND:
                    ret = Math.min(sp1, sp2); break;
                case DIV:
                case MODULUS:
                case POW:
                case MINUS_NZ:
                case LOG_NZ:
                    ret = sp1; break;
                //case EQUAL: //does not work on worst-case estimates, but on
                //    ret = 1-Math.abs(sp1-sp2); break;
                default:
                    ret = 1.0;
            }
        }
        else
        {
            switch(op) {
                case PLUS:
                case MINUS:
                    // result[i,j] != 0 iff A[i,j] != 0 || B[i,j] != 0
                    // worst case estimate = sp1+sp2
                    ret = (1 - (1-sp1)*(1-sp2)); break;
                case MULT:
                    // result[i,j] != 0 iff A[i,j] != 0 && B[i,j] != 0
                    // worst case estimate = min(sp1,sp2)
                    ret = sp1 * sp2; break;
                case DIV:
                    ret = 1.0; // worst case estimate
                    break;
                case LESS:
                case LESSEQUAL:
                case GREATER:
                case GREATEREQUAL:
                case EQUAL:
                case NOTEQUAL:
                    ret = 1.0; // purely data-dependent operations, and hence worst-case estimate
                    break;
                //MIN, MAX, AND, OR, LOG, POW
                default:
                    ret = 1.0;
            }
        }
        
        return ret;
    }
    
    public static double getSparsity( long dim1, long dim2, long nnz )
    {
        if( dim1<=0 || dim2<=0 || nnz<0 )
            return 1.0;
        else
            return Math.min(((double)nnz)/dim1/dim2, 1.0);
    }
    
    public static String toMB(double inB) {
        if ( inB < 0 )
            return "-";
        return String.format("%.0f", inB/(1024*1024) );
    }
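    /* Worked example: for two 1% sparse inputs, A+B is estimated at
     * 1 - 0.99*0.99 ~ 0.0199 in the expected case (non-zero if either input
     * is non-zero) vs. min(1, 0.01+0.01) = 0.02 in the worst case, whereas
     * A*B drops to 0.01*0.01 = 0.0001 expected vs. min(0.01, 0.01) = 0.01
     * worst case.
     */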
    /**
     * Function to evaluate simple size expressions over literals and nrow/ncol.
     * 
     * It returns the exact result of these expressions if known, otherwise
     * Long.MAX_VALUE if unknown.
     * 
     * @param root
     * @param valMemo
     * @return
     * @throws HopsException
     */
    public static long rEvalSimpleLongExpression( Hop root, HashMap<Long, Long> valMemo )
        throws HopsException
    {
        long ret = Long.MAX_VALUE;
        
        //for simplicity and robustness call double and cast.
        HashMap<Long, Double> dvalMemo = new HashMap<Long, Double>();
        double tmp = rEvalSimpleDoubleExpression(root, dvalMemo);
        if( tmp != Double.MAX_VALUE )
            ret = UtilFunctions.toLong( tmp );
        
        return ret;
    }
    
    /**
     * 
     * @param root
     * @param valMemo
     * @param vars
     * @return
     * @throws HopsException
     */
    public static long rEvalSimpleLongExpression( Hop root, HashMap<Long, Long> valMemo, LocalVariableMap vars )
        throws HopsException
    {
        long ret = Long.MAX_VALUE;
        
        //for simplicity and robustness call double and cast.
        HashMap<Long, Double> dvalMemo = new HashMap<Long, Double>();
        double tmp = rEvalSimpleDoubleExpression(root, dvalMemo, vars);
        if( tmp != Double.MAX_VALUE )
            ret = UtilFunctions.toLong( tmp );
        
        return ret;
    }
    
    /**
     * 
     * @param root
     * @param valMemo
     * @return
     * @throws HopsException
     */
    public static double rEvalSimpleDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
        throws HopsException
    {
        //memoization (prevent redundant computation of common subexpr)
        if( valMemo.containsKey(root.getHopID()) )
            return valMemo.get(root.getHopID());
        
        double ret = Double.MAX_VALUE;
        
        //always use constants
        if( root instanceof LiteralOp )
            ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);
        
        //advanced size expression evaluation
        if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION ) {
            if( root instanceof UnaryOp )
                ret = rEvalSimpleUnaryDoubleExpression(root, valMemo);
            else if( root instanceof BinaryOp )
                ret = rEvalSimpleBinaryDoubleExpression(root, valMemo);
        }
        
        valMemo.put(root.getHopID(), ret);
        return ret;
    }
    
    /**
     * 
     * @param root
     * @param valMemo
     * @param vars
     * @return
     * @throws HopsException
     */
    public static double rEvalSimpleDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
        throws HopsException
    {
        //memoization (prevent redundant computation of common subexpr)
        if( valMemo.containsKey(root.getHopID()) )
            return valMemo.get(root.getHopID());
        
        double ret = Double.MAX_VALUE;
        
        if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION ) {
            if( root instanceof LiteralOp )
                ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);
            else if( root instanceof UnaryOp )
                ret = rEvalSimpleUnaryDoubleExpression(root, valMemo, vars);
            else if( root instanceof BinaryOp )
                ret = rEvalSimpleBinaryDoubleExpression(root, valMemo, vars);
            else if( root instanceof DataOp ) {
                String name = root.getName();
                Data dat = vars.get(name);
                if( dat != null && dat instanceof ScalarObject )
                    ret = ((ScalarObject)dat).getDoubleValue();
            }
        }
        
        valMemo.put(root.getHopID(), ret);
        return ret;
    }
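    /* Illustrative sketch (hypothetical hop dag rootHop): for a datagen
     * parameter such as rows = nrow(X)/2 + 1, the recursive evaluation resolves
     * nrow(X) from the compiled dimensions, folds the binary operators
     * bottom-up, and memoizes results per hop ID so that shared subexpressions
     * are evaluated only once:
     *
     *   HashMap<Long, Double> memo = new HashMap<Long, Double>();
     *   double rows = OptimizerUtils.rEvalSimpleDoubleExpression(rootHop, memo);
     *   //Double.MAX_VALUE indicates the expression could not be evaluated
     */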
    /**
     * 
     * @param root
     * @param valMemo
     * @return
     * @throws HopsException
     */
    protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
        throws HopsException
    {
        //memoization (prevent redundant computation of common subexpr)
        if( valMemo.containsKey(root.getHopID()) )
            return valMemo.get(root.getHopID());
        
        double ret = Double.MAX_VALUE;
        
        UnaryOp uroot = (UnaryOp) root;
        Hop input = uroot.getInput().get(0);
        
        if(uroot.getOp() == Hop.OpOp1.NROW)
            ret = (input.getDim1() > 0) ? input.getDim1() : Double.MAX_VALUE;
        else if( uroot.getOp() == Hop.OpOp1.NCOL )
            ret = (input.getDim2() > 0) ? input.getDim2() : Double.MAX_VALUE;
        else {
            double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo);
            if( lval != Double.MAX_VALUE ) {
                switch( uroot.getOp() ) {
                    case SQRT:  ret = Math.sqrt(lval); break;
                    case ROUND: ret = Math.round(lval); break;
                    case CAST_AS_BOOLEAN: ret = (lval != 0) ? 1 : 0; break;
                    case CAST_AS_INT: ret = UtilFunctions.toLong(lval); break;
                    case CAST_AS_DOUBLE: ret = lval; break;
                    default: ret = Double.MAX_VALUE;
                }
            }
        }
        
        valMemo.put(root.getHopID(), ret);
        return ret;
    }
    
    /**
     * 
     * @param root
     * @param valMemo
     * @param vars
     * @return
     * @throws HopsException
     */
    protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
        throws HopsException
    {
        //memoization (prevent redundant computation of common subexpr)
        if( valMemo.containsKey(root.getHopID()) )
            return valMemo.get(root.getHopID());
        
        double ret = Double.MAX_VALUE;
        
        UnaryOp uroot = (UnaryOp) root;
        Hop input = uroot.getInput().get(0);
        
        if(uroot.getOp() == Hop.OpOp1.NROW)
            ret = (input.getDim1() > 0) ? input.getDim1() : Double.MAX_VALUE;
        else if( uroot.getOp() == Hop.OpOp1.NCOL )
            ret = (input.getDim2() > 0) ? input.getDim2() : Double.MAX_VALUE;
        else {
            double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo, vars);
            if( lval != Double.MAX_VALUE ) {
                switch( uroot.getOp() ) {
                    case SQRT:  ret = Math.sqrt(lval); break;
                    case ROUND: ret = Math.round(lval); break;
                    case CAST_AS_BOOLEAN: ret = (lval != 0) ? 1 : 0; break;
                    case CAST_AS_INT: ret = UtilFunctions.toLong(lval); break;
                    case CAST_AS_DOUBLE: ret = lval; break;
                    default: ret = Double.MAX_VALUE;
                }
            }
        }
        
        valMemo.put(root.getHopID(), ret);
        return ret;
    }
    
    /**
     * 
     * @param root
     * @param valMemo
     * @return
     * @throws HopsException
     */
    protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo )
        throws HopsException
    {
        //memoization (prevent redundant computation of common subexpr)
        if( valMemo.containsKey(root.getHopID()) )
            return valMemo.get(root.getHopID());
        
        double ret = Double.MAX_VALUE;

        BinaryOp broot = (BinaryOp) root;
        
        double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo);
        double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo);
        
        //note: positive and negative values might be valid subexpressions
        if( lret != Double.MAX_VALUE && rret != Double.MAX_VALUE ) { //if known
            switch( broot.getOp() ) {
                case PLUS:  ret = lret + rret; break;
                case MINUS: ret = lret - rret; break;
                case MULT:  ret = lret * rret; break;
                case DIV:   ret = lret / rret; break;
                case MIN:   ret = Math.min(lret, rret); break;
                case MAX:   ret = Math.max(lret, rret); break;
                case POW:   ret = Math.pow(lret, rret); break;
                default: ret = Double.MAX_VALUE;
            }
        }
        
        valMemo.put(root.getHopID(), ret);
        return ret;
    }
    
    /**
     * 
     * @param root
     * @param valMemo
     * @param vars
     * @return
     * @throws HopsException
     */
    protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap<Long, Double> valMemo, LocalVariableMap vars )
        throws HopsException
    {
        //memoization (prevent redundant computation of common subexpr)
        if( valMemo.containsKey(root.getHopID()) )
            return valMemo.get(root.getHopID());
        
        double ret = Double.MAX_VALUE;

        BinaryOp broot = (BinaryOp) root;
        
        double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo, vars);
        double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo, vars);
        
        //note: positive and negative values might be valid subexpressions
        if( lret != Double.MAX_VALUE && rret != Double.MAX_VALUE ) { //if known
            switch( broot.getOp() ) {
                case PLUS:  ret = lret + rret; break;
                case MINUS: ret = lret - rret; break;
                case MULT:  ret = lret * rret; break;
                case DIV:   ret = lret / rret; break;
                case MIN:   ret = Math.min(lret, rret); break;
                case MAX:   ret = Math.max(lret, rret); break;
                case POW:   ret = Math.pow(lret, rret); break;
                default: ret = Double.MAX_VALUE;
            }
        }
        valMemo.put(root.getHopID(), ret);
        return ret;
    }
}