/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.controlprogram.context;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import com.ibm.bi.dml.api.DMLScript;
import com.ibm.bi.dml.api.MLContext;
import com.ibm.bi.dml.api.MLContextProxy;
import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.lops.Checkpoint;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.Program;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.instructions.spark.SPInstruction;
import com.ibm.bi.dml.runtime.instructions.spark.data.BroadcastObject;
import com.ibm.bi.dml.runtime.instructions.spark.data.LineageObject;
import com.ibm.bi.dml.runtime.instructions.spark.data.PartitionedBroadcastMatrix;
import com.ibm.bi.dml.runtime.instructions.spark.data.PartitionedMatrixBlock;
import com.ibm.bi.dml.runtime.instructions.spark.data.RDDObject;
import com.ibm.bi.dml.runtime.instructions.spark.functions.CopyBinaryCellFunction;
import com.ibm.bi.dml.runtime.instructions.spark.functions.CopyBlockPairFunction;
import com.ibm.bi.dml.runtime.instructions.spark.functions.CopyTextInputFunction;
import com.ibm.bi.dml.runtime.instructions.spark.utils.RDDAggregateUtils;
import com.ibm.bi.dml.runtime.instructions.spark.utils.SparkUtils;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixCell;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.utils.Statistics;
public class SparkExecutionContext extends ExecutionContext
{
private static final Log LOG = LogFactory.getLog(SparkExecutionContext.class.getName());
//internal configurations
private static boolean LAZY_SPARKCTX_CREATION = true;
private static boolean ASYNCHRONOUS_VAR_DESTROY = true;
private static boolean FAIR_SCHEDULER_MODE = true;
//executor memory and relative fractions as obtained from the spark configuration
private static long _memExecutors = -1; //mem per executor
private static double _memRatioData = -1;
private static double _memRatioShuffle = -1;
private static int _numExecutors = -1; //total executors
private static int _defaultPar = -1; //total vcores
private static boolean _confOnly = false; //infrastructure info based on config
// Only one SparkContext may be active per JVM. You must stop() the active SparkContext before creating a new one.
// This limitation may eventually be removed; see SPARK-2243 for more details.
private static JavaSparkContext _spctx = null;
protected SparkExecutionContext(Program prog)
{
//protected constructor to force use of ExecutionContextFactory
this( true, prog );
}
protected SparkExecutionContext(boolean allocateVars, Program prog)
{
//protected constructor to force use of ExecutionContextFactory
super( allocateVars, prog );
//spark context creation via internal initializer
if( !(LAZY_SPARKCTX_CREATION && OptimizerUtils.isHybridExecutionMode()) ) {
initSparkContext();
}
}
/**
* Returns the used singleton spark context. In case of lazy spark context
* creation, this method blocks until the spark context is created.
*
* @return the singleton JavaSparkContext
*/
public JavaSparkContext getSparkContext()
{
//lazy spark context creation on demand (lazy instead of asynchronous
//to avoid waiting for an uninitialized spark context on close)
if( LAZY_SPARKCTX_CREATION ) {
initSparkContext();
}
//return the created spark context
return _spctx;
}
/**
* Returns the used singleton spark context without requiring an execution
* context instance; creates the spark context if necessary.
*
* @return the singleton JavaSparkContext
*/
public static JavaSparkContext getSparkContextStatic()
{
initSparkContext();
return _spctx;
}
/**
* Stops the singleton spark context and clears the static reference to
* ensure the stopped context is never used again.
*/
public void close()
{
synchronized( SparkExecutionContext.class ) {
if( _spctx != null )
{
//stop the spark context if existing
_spctx.stop();
//make sure stopped context is never used again
_spctx = null;
}
}
}
public static boolean isLazySparkContextCreation(){
return LAZY_SPARKCTX_CREATION;
}
/**
* Creates the singleton spark context if it does not exist yet. The context
* is obtained from an active MLContext if available, created with a local
* configuration for tests, or created with the default cluster configuration.
*/
private synchronized static void initSparkContext()
{
//check for redundant spark context init
if( _spctx != null )
return;
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
//create a default spark context (master, appname, etc refer to system properties
//as given in the spark configuration or during spark-submit)
MLContext mlCtx = MLContextProxy.getActiveMLContext();
if(mlCtx != null)
{
// This is the case when DML is called through MLContext (e.g., spark shell):
// reuse the externally provided spark context
// TODO clean up the passing of static variables later as this involves minimal changes to DMLScript
_spctx = new JavaSparkContext(mlCtx.getSparkContext());
}
else
{
if(DMLScript.USE_LOCAL_SPARK_CONFIG) {
// local mode for integration testing (local[*] uses all available cores)
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("My local integration test app");
// This setting is discouraged in spark, but was added only for those test cases that cannot stop the context properly
// conf.set("spark.driver.allowMultipleContexts", "true");
conf.set("spark.ui.enabled", "false");
_spctx = new JavaSparkContext(conf);
}
else //default cluster setup
{
//setup systemml-preferred spark configuration (w/o user choice)
SparkConf conf = new SparkConf();
//always set unlimited result size (required for cp collect)
conf.set("spark.driver.maxResultSize", "0");
//always use the fair scheduler (for single jobs, it's equivalent to fifo
//but for concurrent jobs in parfor it ensures better data locality because
//round robin assignment mitigates the problem of 'sticky slots')
if( FAIR_SCHEDULER_MODE ) {
conf.set("spark.scheduler.mode", "FAIR");
}
_spctx = new JavaSparkContext(conf);
}
}
//globally add binaryblock serialization framework for all hdfs read/write operations
//TODO if spark context passed in from outside (mlcontext), we need to clean this up at the end
if( MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION )
MRJobConfiguration.addBinaryBlockSerializationFramework( _spctx.hadoopConfiguration() );
//statistics maintenance
if( DMLScript.STATISTICS ){
Statistics.setSparkCtxCreateTime(System.nanoTime()-t0);
}
}
/**
* Spark instructions should call this for all matrix inputs except broadcast
* variables.
*
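* A minimal usage sketch from within a spark instruction (assuming a matrix
* variable "X" bound in the symbol table and {@code sec} the given
* SparkExecutionContext):
* <pre>{@code
* JavaPairRDD<MatrixIndexes, MatrixBlock> in =
* sec.getBinaryBlockRDDHandleForVariable("X");
* }</pre>
*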
* @param varname variable name of the matrix input
* @return binary-block matrix rdd for the given variable
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
@SuppressWarnings("unchecked")
public JavaPairRDD<MatrixIndexes,MatrixBlock> getBinaryBlockRDDHandleForVariable( String varname )
throws DMLRuntimeException, DMLUnsupportedOperationException
{
return (JavaPairRDD<MatrixIndexes,MatrixBlock>) getRDDHandleForVariable( varname, InputInfo.BinaryBlockInputInfo);
}
/**
* Returns an RDD handle for the given variable in the specified input format,
* creating the RDD from in-memory or HDFS data if necessary.
*
* @param varname variable name of the matrix input
* @param inputInfo expected input format of the created rdd
* @return matrix rdd for the given variable
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
public JavaPairRDD<?,?> getRDDHandleForVariable( String varname, InputInfo inputInfo )
throws DMLRuntimeException, DMLUnsupportedOperationException
{
MatrixObject mo = getMatrixObject(varname);
return getRDDHandleForMatrixObject(mo, inputInfo);
}
/**
* This call returns an RDD handle for a given matrix object. This includes
* the creation of RDDs for in-memory or binary-block HDFS data.
*
*
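* A minimal usage sketch (assuming a matrix object {@code mo} obtained via
* {@code getMatrixObject}):
* <pre>{@code
* JavaPairRDD<MatrixIndexes, MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
* sec.getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
* }</pre>
*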
* @param mo matrix object for which to obtain the rdd handle
* @param inputInfo expected input format of the created rdd
* @return matrix rdd for the given matrix object
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
@SuppressWarnings("unchecked")
public JavaPairRDD<?,?> getRDDHandleForMatrixObject( MatrixObject mo, InputInfo inputInfo )
throws DMLRuntimeException, DMLUnsupportedOperationException
{
//NOTE: MB this logic should be integrated into MatrixObject
//However, for now we cannot assume that spark libraries are
//always available and hence only store generic references in
//matrix object while all the logic is in the SparkExecContext
JavaPairRDD<?,?> rdd = null;
//CASE 1: rdd already existing (reuse if checkpoint or trigger
//pending rdd operations if not yet cached, but prevent re-evaluation
//of rdd operations if already executed and cached)
if( mo.getRDDHandle()!=null
&& (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false)) )
{
//return existing rdd handling (w/o input format change)
rdd = mo.getRDDHandle().getRDD();
}
//CASE 2: dirty in memory data or cached result of rdd operations
else if( mo.isDirty() || mo.isCached(false) )
{
//get in-memory matrix block and parallelize it
MatrixBlock mb = mo.acquireRead(); //pin matrix in memory
rdd = toJavaPairRDD(getSparkContext(), mb, (int)mo.getNumRowsPerBlock(), (int)mo.getNumColumnsPerBlock());
mo.release(); //unpin matrix
//keep rdd handle for future operations on it
RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
mo.setRDDHandle(rddhandle);
}
//CASE 3: non-dirty (file exists on HDFS)
else
{
// parallelize hdfs-resident file
// For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
if(inputInfo == InputInfo.BinaryBlockInputInfo) {
rdd = getSparkContext().hadoopFile( mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
//note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
//recordreader returns; the javadoc explicitly recommends copying all key/value pairs
rdd = ((JavaPairRDD<MatrixIndexes, MatrixBlock>)rdd).mapToPair( new CopyBlockPairFunction() ); //cp is workaround for read bug
}
else if(inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
rdd = getSparkContext().hadoopFile( mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
rdd = ((JavaPairRDD<LongWritable, Text>)rdd).mapToPair( new CopyTextInputFunction() ); //cp is workaround for read bug
}
else if(inputInfo == InputInfo.BinaryCellInputInfo) {
rdd = getSparkContext().hadoopFile( mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>)rdd).mapToPair( new CopyBinaryCellFunction() ); //cp is workaround for read bug
}
else {
throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
}
//keep rdd handle for future operations on it
RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
rddhandle.setHDFSFile(true);
mo.setRDDHandle(rddhandle);
}
return rdd;
}
/**
* TODO So far we only create broadcast variables but never destroy
* them. This is a memory leak which might lead to executor out-of-memory.
* However, in order to handle this, we need to keep track of when broadcast
* variables are no longer required.
*
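* A minimal usage sketch (assuming a matrix variable "W" that fits into the
* broadcast memory budget):
* <pre>{@code
* PartitionedBroadcastMatrix bc = sec.getBroadcastForVariable("W");
* //pass bc into map-side functions to access individual blocks
* }</pre>
*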
* @param varname variable name of the broadcast input
* @return partitioned broadcast for the given variable
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
@SuppressWarnings("unchecked")
public PartitionedBroadcastMatrix getBroadcastForVariable( String varname )
throws DMLRuntimeException, DMLUnsupportedOperationException
{
MatrixObject mo = getMatrixObject(varname);
PartitionedBroadcastMatrix bret = null;
if( mo.getBroadcastHandle()!=null
&& mo.getBroadcastHandle().isValid() )
{
//reuse existing broadcast handle
bret = mo.getBroadcastHandle().getBroadcast();
}
else
{
//obtain meta data for matrix
int brlen = (int) mo.getNumRowsPerBlock();
int bclen = (int) mo.getNumColumnsPerBlock();
//create partitioned matrix block and release memory consumed by input
MatrixBlock mb = mo.acquireRead();
PartitionedMatrixBlock pmb = new PartitionedMatrixBlock(mb, brlen, bclen);
mo.release();
//determine coarse-grained partitioning
int numPerPart = PartitionedBroadcastMatrix.computeBlocksPerPartition(mo.getNumRows(), mo.getNumColumns(), brlen, bclen);
int numParts = (int) Math.ceil((double)pmb.getNumRowBlocks()*pmb.getNumColumnBlocks() / numPerPart);
Broadcast<PartitionedMatrixBlock>[] ret = new Broadcast[numParts];
//create coarse-grained partitioned broadcasts
if( numParts > 1 ) {
for( int i=0; i<numParts; i++ ) {
int offset = i * numPerPart;
int numBlks = Math.min(numPerPart, pmb.getNumRowBlocks()*pmb.getNumColumnBlocks()-offset);
PartitionedMatrixBlock tmp = pmb.createPartition(offset, numBlks);
ret[i] = getSparkContext().broadcast(tmp);
}
}
else { //single partition
ret[0] = getSparkContext().broadcast( pmb);
}
bret = new PartitionedBroadcastMatrix(ret);
BroadcastObject bchandle = new BroadcastObject(bret, varname);
mo.setBroadcastHandle(bchandle);
}
return bret;
}
/**
* Keeps the output rdd of a spark rdd operation as meta data of the
* corresponding matrix object in the symbol table.
*
* Spark instructions should call this for all matrix outputs.
*
*
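* A typical output-handling sketch from within a spark instruction (the
* variable names "out" and "in" and the rdd {@code outRDD} are hypothetical):
* <pre>{@code
* sec.setRDDHandleForVariable("out", outRDD);
* sec.addLineageRDD("out", "in"); //guard input lineage for cleanup
* }</pre>
*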
* @param varname variable name of the matrix output
* @param rdd output rdd of a spark instruction
* @throws DMLRuntimeException
*/
public void setRDDHandleForVariable(String varname, JavaPairRDD<MatrixIndexes,?> rdd)
throws DMLRuntimeException
{
MatrixObject mo = getMatrixObject(varname);
RDDObject rddhandle = new RDDObject(rdd, varname);
mo.setRDDHandle( rddhandle );
}
/**
* Utility method for creating an RDD out of an in-memory matrix block.
*
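* For example, a 2500 x 2500 input with brlen = bclen = 1000 is split into a
* 3 x 3 grid of blocks, where the boundary blocks cover the remaining 500
* rows/columns. A minimal sketch (assuming a given JavaSparkContext sc):
* <pre>{@code
* MatrixBlock mb = new MatrixBlock(2500, 2500, false); //dense 2500 x 2500
* JavaPairRDD<MatrixIndexes, MatrixBlock> rdd =
* SparkExecutionContext.toJavaPairRDD(sc, mb, 1000, 1000); //9 blocks
* }</pre>
*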
* @param sc java spark context
* @param src in-memory matrix block
* @param brlen number of rows per block
* @param bclen number of columns per block
* @return rdd of index-block pairs
* @throws DMLUnsupportedOperationException
* @throws DMLRuntimeException
*/
public static JavaPairRDD<MatrixIndexes,MatrixBlock> toJavaPairRDD(JavaSparkContext sc, MatrixBlock src, int brlen, int bclen)
throws DMLRuntimeException, DMLUnsupportedOperationException
{
LinkedList<Tuple2<MatrixIndexes,MatrixBlock>> list = new LinkedList<Tuple2<MatrixIndexes,MatrixBlock>>();
if( src.getNumRows() <= brlen
&& src.getNumColumns() <= bclen )
{
list.addLast(new Tuple2<MatrixIndexes,MatrixBlock>(new MatrixIndexes(1,1), src));
}
else
{
boolean sparse = src.isInSparseFormat();
//create and write subblocks of matrix
for(int blockRow = 0; blockRow < (int)Math.ceil(src.getNumRows()/(double)brlen); blockRow++)
for(int blockCol = 0; blockCol < (int)Math.ceil(src.getNumColumns()/(double)bclen); blockCol++)
{
int maxRow = (blockRow*brlen + brlen < src.getNumRows()) ? brlen : src.getNumRows() - blockRow*brlen;
int maxCol = (blockCol*bclen + bclen < src.getNumColumns()) ? bclen : src.getNumColumns() - blockCol*bclen;
MatrixBlock block = new MatrixBlock(maxRow, maxCol, sparse);
int row_offset = blockRow*brlen;
int col_offset = blockCol*bclen;
//copy submatrix to block
src.sliceOperations( row_offset, row_offset+maxRow-1,
col_offset, col_offset+maxCol-1, block );
//append block to sequence file
MatrixIndexes indexes = new MatrixIndexes(blockRow+1, blockCol+1);
list.addLast(new Tuple2<MatrixIndexes,MatrixBlock>(indexes, block));
}
}
return sc.parallelizePairs(list);
}
/**
* This method is a generic abstraction for calls from the buffer pool.
* See {@link #toMatrixBlock(JavaPairRDD, int, int, int, int, long)}.
*
* @param rdd rdd handle of the matrix variable
* @param rlen number of rows
* @param clen number of columns
* @param brlen number of rows per block
* @param bclen number of columns per block
* @param nnz number of non-zeros (-1 if unknown)
* @return collected matrix block
* @throws DMLRuntimeException
*/
@SuppressWarnings("unchecked")
public static MatrixBlock toMatrixBlock(RDDObject rdd, int rlen, int clen, int brlen, int bclen, long nnz)
throws DMLRuntimeException
{
return toMatrixBlock(
(JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd.getRDD(),
rlen, clen, brlen, bclen, nnz);
}
/**
* Utility method for creating a single matrix block out of an RDD. Note that this collect call
* might trigger execution of any pending transformations.
*
* NOTE: This is an unguarded utility function, which requires memory for both the output matrix
* and its collected, blocked representation.
*
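* A minimal usage sketch (collecting a blocked 10000 x 10 matrix rdd with
* 1000 x 1000 blocks and unknown number of non-zeros):
* <pre>{@code
* MatrixBlock out = SparkExecutionContext.toMatrixBlock(rdd, 10000, 10, 1000, 1000, -1);
* }</pre>
*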
* @param rdd rdd of index-block pairs
* @param rlen number of rows
* @param clen number of columns
* @param brlen number of rows per block
* @param bclen number of columns per block
* @param nnz number of non-zeros (-1 if unknown)
* @return collected matrix block
* @throws DMLRuntimeException
*/
public static MatrixBlock toMatrixBlock(JavaPairRDD<MatrixIndexes,MatrixBlock> rdd, int rlen, int clen, int brlen, int bclen, long nnz)
throws DMLRuntimeException
{
MatrixBlock out = null;
if( rlen <= brlen && clen <= bclen ) //SINGLE BLOCK
{
//special case without copy and nnz maintenance
List<Tuple2<MatrixIndexes,MatrixBlock>> list = rdd.collect();
if( list.size()>1 )
throw new DMLRuntimeException("Expecting no more than one result block.");
else if( list.size()==1 )
out = list.get(0)._2();
else //empty (e.g., after ops w/ outputEmpty=false)
out = new MatrixBlock(rlen, clen, true);
}
else //MULTIPLE BLOCKS
{
//determine target sparse/dense representation
long lnnz = (nnz >= 0) ? nnz : (long)rlen * clen;
boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, lnnz);
//create output matrix block (w/ lazy allocation)
out = new MatrixBlock(rlen, clen, sparse);
List<Tuple2<MatrixIndexes,MatrixBlock>> list = rdd.collect();
//copy blocks one-at-a-time into output matrix block
for( Tuple2<MatrixIndexes,MatrixBlock> keyval : list )
{
//unpack index-block pair
MatrixIndexes ix = keyval._1();
MatrixBlock block = keyval._2();
//compute row/column block offsets
int row_offset = (int)(ix.getRowIndex()-1)*brlen;
int col_offset = (int)(ix.getColumnIndex()-1)*bclen;
int rows = block.getNumRows();
int cols = block.getNumColumns();
if( sparse ) { //SPARSE OUTPUT
//append block to sparse target in order to avoid shifting
//note: this append requires a final sort of sparse rows
out.appendToSparse(block, row_offset, col_offset);
}
else { //DENSE OUTPUT
out.copy( row_offset, row_offset+rows-1,
col_offset, col_offset+cols-1, block, false );
}
}
//post-processing output matrix
if( sparse )
out.sortSparseRows();
out.recomputeNonZeros();
out.examSparsity();
}
return out;
}
/**
* Utility method for collecting an RDD into a partitioned matrix block,
* e.g., as preparation for broadcast creation.
*
* @param rdd rdd of index-block pairs
* @param rlen number of rows
* @param clen number of columns
* @param brlen number of rows per block
* @param bclen number of columns per block
* @param nnz number of non-zeros (-1 if unknown)
* @return partitioned matrix block
* @throws DMLRuntimeException
*/
public static PartitionedMatrixBlock toPartitionedMatrixBlock(JavaPairRDD<MatrixIndexes,MatrixBlock> rdd, int rlen, int clen, int brlen, int bclen, long nnz)
throws DMLRuntimeException
{
PartitionedMatrixBlock out = new PartitionedMatrixBlock(rlen, clen, brlen, bclen);
List<Tuple2<MatrixIndexes,MatrixBlock>> list = rdd.collect();
//copy blocks one-at-a-time into output matrix block
for( Tuple2<MatrixIndexes,MatrixBlock> keyval : list )
{
//unpack index-block pair
MatrixIndexes ix = keyval._1();
MatrixBlock block = keyval._2();
out.setMatrixBlock((int)ix.getRowIndex(), (int)ix.getColumnIndex(), block);
}
return out;
}
/**
* Writes the given rdd to HDFS in the specified output format and returns
* the aggregate number of non-zero values for meta data maintenance.
*
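* A minimal usage sketch (the rdd handle {@code ro} and the output path are
* hypothetical):
* <pre>{@code
* long nnz = SparkExecutionContext.writeRDDtoHDFS(ro, "hdfs:/tmp/X", OutputInfo.BinaryBlockOutputInfo);
* }</pre>
*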
* @param rdd rdd handle of the matrix variable
* @param path hdfs output path
* @param oinfo expected output format
* @return aggregate number of non-zero values
*/
@SuppressWarnings("unchecked")
public static long writeRDDtoHDFS( RDDObject rdd, String path, OutputInfo oinfo )
{
JavaPairRDD<MatrixIndexes,MatrixBlock> lrdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd.getRDD();
//recompute nnz
long nnz = SparkUtils.computeNNZFromBlocks(lrdd);
//save file is an action which also triggers nnz maintenance
lrdd.saveAsHadoopFile(path,
oinfo.outputKeyClass,
oinfo.outputValueClass,
oinfo.outputFormatClass);
//return nnz aggregate of all blocks
return nnz;
}
/**
* Returns the available memory budget for broadcast variables in bytes.
* In detail, this takes into account the total executor memory as well
* as relative ratios for data and shuffle. Note that this is a conservative
* estimate since both data memory and shuffle memory might not be fully
* utilized.
*
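* A worked example with the default fractions: given 4GB executor memory,
* spark.storage.memoryFraction = 0.6, and spark.shuffle.memoryFraction = 0.2,
* the returned budget is 0.7 * (4GB - 4GB * (0.6 + 0.2)) = 0.56GB.
*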
* @return broadcast memory budget in bytes
*/
public static double getBroadcastMemoryBudget()
{
if( _memExecutors < 0 || _memRatioData < 0 || _memRatioShuffle < 0 )
analyzeSparkConfiguation();
//70% of remaining free memory
double membudget = OptimizerUtils.MEM_UTIL_FACTOR *
( _memExecutors
- _memExecutors*(_memRatioData+_memRatioShuffle) );
return membudget;
}
/**
* Returns the configured aggregate data memory across all executors.
*
* @return configured total data memory in bytes
*/
public static double getConfiguredTotalDataMemory() {
return getConfiguredTotalDataMemory(false);
}
/**
* Returns the configured aggregate data memory across all executors,
* optionally refreshing the current number of executors.
*
* @param refresh true to obtain the current number of executors from the spark context
* @return configured total data memory in bytes
*/
public static double getConfiguredTotalDataMemory(boolean refresh)
{
if( _memExecutors < 0 || _memRatioData < 0 )
analyzeSparkConfiguation();
//always get the current num executors on refresh because this might
//change if not all executors are initially allocated and it is plan-relevant
if( refresh && !_confOnly ) {
JavaSparkContext jsc = getSparkContextStatic();
int numExec = Math.max(jsc.sc().getExecutorMemoryStatus().size() - 1, 1);
return _memExecutors * _memRatioData * numExec;
}
else
return ( _memExecutors * _memRatioData * _numExecutors );
}
public static int getNumExecutors()
{
if( _numExecutors < 0 )
analyzeSparkConfiguation();
return _numExecutors;
}
public static int getDefaultParallelism() {
return getDefaultParallelism(false);
}
/**
* Returns the default degree of parallelism, i.e., the total number of vcores.
*
* @param refresh true to obtain the current parallelism from the spark context
* @return default parallelism
*/
public static int getDefaultParallelism(boolean refresh)
{
if( _defaultPar < 0 && !refresh )
analyzeSparkConfiguation();
//always get the current default parallelism on refresh because this might
//change if not all executors are initially allocated and it is plan-relevant
if( refresh && !_confOnly )
return getSparkContextStatic().defaultParallelism();
else
return _defaultPar;
}
/**
* Analyzes the spark configuration to obtain executor memory, data and
* shuffle memory fractions, the number of executors, and the default
* parallelism. If the configuration is insufficient, this information is
* obtained from the spark context instead.
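*
* For example, a hypothetical configuration of
* <pre>{@code
* spark.executor.memory     4g   //_memExecutors = 4 * 1024^3 bytes
* spark.executor.instances  6    //_numExecutors = 6
* spark.executor.cores      8    //_defaultPar = 6 * 8 = 48
* }</pre>
* is sufficient to derive the plan-relevant cluster characteristics without
* creating a spark context.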
*/
public static void analyzeSparkConfiguation()
{
SparkConf sconf = new SparkConf();
//parse absolute executor memory
String tmp = sconf.get("spark.executor.memory", "512m");
if ( tmp.endsWith("g") || tmp.endsWith("G") )
_memExecutors = Long.parseLong(tmp.substring(0,tmp.length()-1)) * 1024 * 1024 * 1024;
else if ( tmp.endsWith("m") || tmp.endsWith("M") )
_memExecutors = Long.parseLong(tmp.substring(0,tmp.length()-1)) * 1024 * 1024;
else if( tmp.endsWith("k") || tmp.endsWith("K") )
_memExecutors = Long.parseLong(tmp.substring(0,tmp.length()-1)) * 1024;
else //no unit suffix: interpret as plain number of bytes
_memExecutors = Long.parseLong(tmp);
//get data and shuffle memory ratios (defaults if not specified in the job conf)
_memRatioData = sconf.getDouble("spark.storage.memoryFraction", 0.6); //default 60%
_memRatioShuffle = sconf.getDouble("spark.shuffle.memoryFraction", 0.2); //default 20%
int numExecutors = sconf.getInt("spark.executor.instances", -1);
int numCoresPerExec = sconf.getInt("spark.executor.cores", -1);
int defaultPar = sconf.getInt("spark.default.parallelism", -1);
if( numExecutors > 1 && (defaultPar > 1 || numCoresPerExec > 1) ) {
_numExecutors = numExecutors;
_defaultPar = (defaultPar>1) ? defaultPar : numExecutors * numCoresPerExec;
_confOnly = true;
}
else {
//get default parallelism (total number of executors and cores)
//note: spark context provides this information while conf does not
//(for num executors we need to correct for driver and local mode)
JavaSparkContext jsc = getSparkContextStatic();
_numExecutors = Math.max(jsc.sc().getExecutorMemoryStatus().size() - 1, 1);
_defaultPar = jsc.defaultParallelism();
_confOnly = false; //implies env info refresh w/ spark context
}
//note: required time for infrastructure analysis on 5 node cluster: ~5-20ms.
}
/**
* Checks for a JDK version below 8 in combination with multi-threaded
* executors and, if detected, raises a validation warning about potential
* thread contention (see JDK-7032154).
*/
public void checkAndRaiseValidationWarningJDKVersion()
{
//check for jdk version less than jdk8
boolean isLtJDK8 = InfrastructureAnalyzer.isJavaVersionLessThanJDK8();
//check multi-threaded executors
int numExecutors = getNumExecutors();
int numCores = getDefaultParallelism();
boolean multiThreaded = (numCores > numExecutors);
//check for jdk version less than 8 (and raise warning if multi-threaded)
if( isLtJDK8 && multiThreaded)
{
//get the jre version
String version = System.getProperty("java.version");
LOG.warn("########################################################################################");
LOG.warn("### WARNING: Multi-threaded text reblock may lead to thread contention on JRE < 1.8 ####");
LOG.warn("### java.version = " + version);
LOG.warn("### total number of executors = " + numExecutors);
LOG.warn("### total number of cores = " + numCores);
LOG.warn("### JDK-7032154: Performance tuning of sun.misc.FloatingDecimal/FormattedFloatingDecimal");
LOG.warn("### Workaround: Convert text to binary w/ changed configuration of one executor per core");
LOG.warn("########################################################################################");
}
}
///////////////////////////////////////////
// Cleanup of RDDs and Broadcast variables
///////
/**
* Adds a child rdd object to the lineage of a parent rdd.
*
* @param varParent variable name of the parent matrix
* @param varChild variable name of the child matrix
* @throws DMLRuntimeException
*/
public void addLineageRDD(String varParent, String varChild)
throws DMLRuntimeException
{
RDDObject parent = getMatrixObject(varParent).getRDDHandle();
RDDObject child = getMatrixObject(varChild).getRDDHandle();
parent.addLineageChild( child );
}
/**
* Adds a child broadcast object to the lineage of a parent rdd.
*
* @param varParent variable name of the parent matrix
* @param varChild variable name of the child matrix (broadcast)
* @throws DMLRuntimeException
*/
public void addLineageBroadcast(String varParent, String varChild)
throws DMLRuntimeException
{
RDDObject parent = getMatrixObject(varParent).getRDDHandle();
BroadcastObject child = getMatrixObject(varChild).getBroadcastHandle();
parent.addLineageChild( child );
}
@Override
public void cleanupMatrixObject( MatrixObject mo )
throws DMLRuntimeException
{
//NOTE: this method overwrites the default behavior of cleanupMatrixObject
//and hence is transparently used by rmvar instructions and other users. The
//core difference is the lineage-based cleanup of RDD and broadcast variables.
try
{
if ( mo.isCleanupEnabled() )
{
//compute ref count only if matrix cleanup actually necessary
if ( !getVariables().hasReferences(mo) )
{
//clean cached data
mo.clearData();
//clean hdfs data
if( mo.isFileExists() ) {
String fpath = mo.getFileName();
if (fpath != null) {
MapReduceTool.deleteFileIfExistOnHDFS(fpath);
MapReduceTool.deleteFileIfExistOnHDFS(fpath + ".mtd");
}
}
//cleanup RDD and broadcast variables (recursive)
//note: requires that mo.clearData already removed back references
if( mo.getRDDHandle()!=null ) {
rCleanupLineageObject(mo.getRDDHandle());
}
if( mo.getBroadcastHandle()!=null ) {
rCleanupLineageObject(mo.getBroadcastHandle());
}
}
}
}
catch(Exception ex)
{
throw new DMLRuntimeException(ex);
}
}
private void rCleanupLineageObject(LineageObject lob)
{
//abort recursive cleanup if still consumers
if( lob.getNumReferences() > 0 )
return;
//abort if still reachable through matrix object (via back references for
//robustness in function calls and to prevent repeated scans of the symbol table)
if( lob.hasBackReference() )
return;
//cleanup current lineage object (from driver/executors)
if( lob instanceof RDDObject )
cleanupRDDVariable(((RDDObject)lob).getRDD());
else if( lob instanceof BroadcastObject ) {
PartitionedBroadcastMatrix pbm = ((BroadcastObject)lob).getBroadcast();
for( Broadcast<PartitionedMatrixBlock> bc : pbm.getBroadcasts() )
cleanupBroadcastVariable(bc);
}
//recursively process lineage children
for( LineageObject c : lob.getLineageChilds() ){
c.decrementNumReferences();
rCleanupLineageObject(c);
}
}
/**
* This call destroys a broadcast variable at all executors and the driver.
* Hence, it is intended to be used on rmvar only. Depending on the
* ASYNCHRONOUS_VAR_DESTROY configuration, this is asynchronous or not.
*
*
* @param bvar broadcast variable to destroy
*/
public void cleanupBroadcastVariable(Broadcast<?> bvar)
{
//in comparison to 'unpersist' (which would only delete the broadcast from the executors),
//this call also deletes related data from the driver.
if( bvar.isValid() ) {
bvar.destroy( ASYNCHRONOUS_VAR_DESTROY );
}
}
/**
* This call removes an rdd variable from executor memory and disk if required.
* Hence, it is intended to be used on rmvar only. Depending on the
* ASYNCHRONOUS_VAR_DESTROY configuration, this is asynchronous or not.
*
* @param rvar rdd variable to unpersist
*/
public void cleanupRDDVariable(JavaPairRDD<?,?> rvar)
{
if( rvar.getStorageLevel()!=StorageLevel.NONE() ) {
rvar.unpersist( ASYNCHRONOUS_VAR_DESTROY );
}
}
/**
* Repartitions and persists the rdd of the given matrix variable in order to
* avoid repeated shuffles; the repartitioned rdd is registered as a
* checkpoint rdd in place of the current rdd handle.
*
* @param var variable name of the matrix
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var )
throws DMLRuntimeException, DMLUnsupportedOperationException
{
//get input rdd and default storage level
MatrixObject mo = getMatrixObject(var);
JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
//repartition and persist rdd (force creation of shuffled rdd via merge)
JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in);
out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
.count(); //trigger caching to prevent contention
//create new rdd handle, in-place of current matrix object
RDDObject inro = mo.getRDDHandle(); //guaranteed to exist (see above)
RDDObject outro = new RDDObject(out, var); //create new rdd object
outro.setCheckpointRDD(true); //mark as checkpointed
outro.addLineageChild(inro); //keep lineage to prevent cycles on cleanup
mo.setRDDHandle(outro);
}
/**
* Triggers distributed caching of the rdd of the given matrix variable.
*
* @param var variable name of the matrix
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
@SuppressWarnings("unchecked")
public void cacheMatrixObject( String var )
throws DMLRuntimeException, DMLUnsupportedOperationException
{
//get input rdd and default storage level
MatrixObject mo = getMatrixObject(var);
JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
//persist rdd (force rdd caching)
in.count(); //trigger caching to prevent contention
}
/**
* Assigns all spark jobs submitted from the current thread to the given
* fair scheduler pool (only if fair scheduling is enabled).
*
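* A usage sketch for concurrent jobs (pool name and worker id hypothetical):
* <pre>{@code
* sec.setThreadLocalSchedulerPool("parforPool" + workerID);
* //... submit spark jobs from this thread ...
* sec.cleanupThreadLocalSchedulerPool();
* }</pre>
*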
* @param poolName name of the fair scheduler pool
*/
public void setThreadLocalSchedulerPool(String poolName) {
if( FAIR_SCHEDULER_MODE ) {
getSparkContext().sc().setLocalProperty(
"spark.scheduler.pool", poolName);
}
}
/**
* Clears the fair scheduler pool assignment of the current thread.
*/
public void cleanupThreadLocalSchedulerPool() {
if( FAIR_SCHEDULER_MODE ) {
getSparkContext().sc().setLocalProperty(
"spark.scheduler.pool", null);
}
}
///////////////////////////////////////////
// Debug String Handling (see explain); TODO to be removed
///////
/**
* Collects lineage debug information for the given instruction and its output
* variable (used for PRINT_EXPLAIN_WITH_LINEAGE via MLContext only).
*
* @param inst spark instruction
* @param outputVarName variable name of the instruction output
* @throws DMLRuntimeException
*/
public void setDebugString(SPInstruction inst, String outputVarName)
throws DMLRuntimeException
{
RDDObject parentLineage = getMatrixObject(outputVarName).getRDDHandle();
if( parentLineage == null || parentLineage.getRDD() == null )
return;
MLContextProxy.addRDDForInstructionForMonitoring(inst, parentLineage.getRDD().id());
JavaPairRDD<?, ?> out = parentLineage.getRDD();
JavaPairRDD<?, ?> in1 = null;
JavaPairRDD<?, ?> in2 = null;
String input1VarName = null;
String input2VarName = null;
if(parentLineage.getLineageChilds() != null) {
for(LineageObject child : parentLineage.getLineageChilds()) {
if(child instanceof RDDObject) {
if(in1 == null) {
in1 = ((RDDObject) child).getRDD();
input1VarName = child.getVarName();
}
else if(in2 == null) {
in2 = ((RDDObject) child).getRDD();
input2VarName = child.getVarName();
}
else {
throw new DMLRuntimeException("PRINT_EXPLAIN_WITH_LINEAGE not yet supported for three outputs");
}
}
}
}
setLineageInfoForExplain(inst, out, in1, input1VarName, in2, input2VarName);
}
// The most expensive operation here is rdd.toDebugString(), which can be a major hit because
// it unrolls the lazy evaluation of Spark. Hence, it is guarded by the flag 'PRINT_EXPLAIN_WITH_LINEAGE',
// which is enabled only through MLContext. This way, it does not affect our performance evaluation through the non-MLContext path.
private void setLineageInfoForExplain(SPInstruction inst,
JavaPairRDD<?, ?> out,
JavaPairRDD<?, ?> in1, String in1Name,
JavaPairRDD<?, ?> in2, String in2Name) throws DMLRuntimeException {
// RDDInfo outInfo = org.apache.spark.storage.RDDInfo.fromRdd(out.rdd());
// First fetch start lines from input RDDs
String startLine1 = null;
String startLine2 = null;
int i1length = 0, i2length = 0;
if(in1 != null) {
String [] lines = in1.toDebugString().split("\\r?\\n");
startLine1 = SparkUtils.getStartLineFromSparkDebugInfo(lines[0]); // lines[0].substring(4, lines[0].length());
i1length = lines.length;
}
if(in2 != null) {
String [] lines = in2.toDebugString().split("\\r?\\n");
startLine2 = SparkUtils.getStartLineFromSparkDebugInfo(lines[0]); // lines[0].substring(4, lines[0].length());
i2length = lines.length;
}
String outDebugString = "";
int skip = 0;
// Now process output RDD and replace inputRDD debug string by the matrix variable name
String [] outLines = out.toDebugString().split("\\r?\\n");
for(int i = 0; i < outLines.length; i++) {
if(skip > 0) {
skip--;
// outDebugString += "\nSKIP:" + outLines[i];
}
else if(startLine1 != null && outLines[i].contains(startLine1)) {
String prefix = SparkUtils.getPrefixFromSparkDebugInfo(outLines[i]); // outLines[i].substring(0, outLines[i].length() - startLine1.length());
outDebugString += "\n" + prefix + "[[" + in1Name + "]]";
//outDebugString += "\n{" + prefix + "}[[" + in1Name + "]] => " + outLines[i];
skip = i1length - 1;
}
else if(startLine2 != null && outLines[i].contains(startLine2)) {
String prefix = SparkUtils.getPrefixFromSparkDebugInfo(outLines[i]); // outLines[i].substring(0, outLines[i].length() - startLine2.length());
outDebugString += "\n" + prefix + "[[" + in2Name + "]]";
skip = i2length - 1;
}
else {
outDebugString += "\n" + outLines[i];
}
}
MLContext mlContext = MLContextProxy.getActiveMLContext();
if(mlContext != null && mlContext.getMonitoringUtil() != null) {
mlContext.getMonitoringUtil().setLineageInfo(inst, outDebugString);
}
else {
throw new DMLRuntimeException("The method setLineageInfoForExplain should be called only through MLContext");
}
}
}