/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.ibm.bi.dml.runtime.instructions.spark;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.storage.StorageLevel;

import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.context.ExecutionContext;
import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.instructions.Instruction;
import com.ibm.bi.dml.runtime.instructions.InstructionUtils;
import com.ibm.bi.dml.runtime.instructions.cp.BooleanObject;
import com.ibm.bi.dml.runtime.instructions.cp.CPOperand;
import com.ibm.bi.dml.runtime.instructions.spark.data.RDDObject;
import com.ibm.bi.dml.runtime.instructions.spark.functions.CopyBlockFunction;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.operators.Operator;

public class CheckpointSPInstruction extends UnarySPInstruction
{
    //default storage level
    private StorageLevel _level = null;

    public CheckpointSPInstruction(Operator op, CPOperand in, CPOperand out, StorageLevel level, String opcode, String istr) {
        super(op, in, out, opcode, istr);
        _sptype = SPINSTRUCTION_TYPE.Reorg;
        _level = level;
    }

    public static Instruction parseInstruction( String str )
        throws DMLRuntimeException
    {
        String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
        InstructionUtils.checkNumFields(parts, 3);

        String opcode = parts[0];
        CPOperand in = new CPOperand(parts[1]);
        CPOperand out = new CPOperand(parts[2]);
        StorageLevel level = StorageLevel.fromString(parts[3]);

        return new CheckpointSPInstruction(null, in, out, level, opcode, str);
    }

    @Override
    public void processInstruction(ExecutionContext ec)
        throws DMLUnsupportedOperationException, DMLRuntimeException
    {
        SparkExecutionContext sec = (SparkExecutionContext)ec;

        // Step 1: early abort on non-existing inputs
        // -------
        // (checkpoints are generated for all read-only variables in loops; due to unbounded scoping and
        // conditional control flow they do not necessarily exist in the symbol table during runtime -
        // this is valid if relevant branches are never entered)
        if( sec.getVariable( input1.getName() ) == null ) {
            //add a dummy entry to the input, which will be immediately overwritten by the null output
            sec.setVariable( input1.getName(), new BooleanObject(false));
            return;
        }

        //get input rdd handle
        JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable( input1.getName() );
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics( input1.getName() );

        // Step 2: Checkpoint given rdd (only if not already in the requested storage level, to prevent redundancy)
        // -------
        // Note that persist is a lazy operation that is triggered on demand by subsequent rdd operations.
        // This prevents unnecessary overhead if the dataset is only consumed by cp operations.
        JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
        if( !in.getStorageLevel().equals( _level ) )
        {
            //avoid an unnecessarily large number of partitions
            boolean coalesce = false;
            int numPartitions = -1;
            if( mcIn.dimsKnown(true) ) {
                double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
                double matrixPSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(mcIn);
                numPartitions = (int) Math.max(Math.ceil(matrixPSize/hdfsBlockSize), 1);
                coalesce = ( numPartitions < in.partitions().size() );
            }
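            // Worked example with hypothetical numbers (not from the original source):
            // for an estimated partitioned matrix size of 2048MB and a 128MB HDFS block
            // size, numPartitions = max(ceil(2048/128), 1) = 16; if the input rdd
            // currently has, say, 400 partitions, coalesce becomes true and we
            // shuffle-free merge down to 16 partitions before persisting.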
            //checkpoint pre-processing rdd operations
            if( coalesce ) {
                //merge partitions without shuffle if too many partitions
                out = in.coalesce( numPartitions );
            }
            else {
                //since persist is an in-place marker for a storage level, we
                //apply a narrow shallow copy to allow for short-circuit collects
                out = in.mapValues(new CopyBlockFunction(false));
            }

            //actual checkpoint into given storage level
            out = out.persist( _level );
        }
        else {
            out = in; //pass-through
        }

        // Step 3: In-place update of input matrix rdd handle and set as output
        // -------
        // We use this in-place approach for two reasons. First, it is correct because our checkpoint
        // injection rewrites guarantee that after checkpoint instructions there are no consumers of the
        // given input. Second, it is beneficial because otherwise we would need to pass in-memory objects
        // and filenames to the new matrix object in order to prevent repeated reads from hdfs and
        // unnecessary caching and subsequent collects. Note that the in-place update requires us to
        // explicitly handle lineage information in order to prevent cycles on cleanup.
        MatrixObject mo = sec.getMatrixObject( input1.getName() );
        if( out != in ) {                        //prevent unnecessary lineage info
            RDDObject inro = mo.getRDDHandle();  //guaranteed to exist (see above)
            RDDObject outro = new RDDObject(out, output.getName()); //create new rdd object
            outro.setCheckpointRDD(true);        //mark as checkpointed
            outro.addLineageChild(inro);         //keep lineage to prevent cycles on cleanup
            mo.setRDDHandle(outro);
        }
        sec.setVariable( output.getName(), mo);
    }
}
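// Illustrative note on the serialized instruction format (the delimiters, opcode,
// and variable names below are assumptions for illustration, not taken from this
// file): parseInstruction() expects the opcode plus three fields (input operand,
// output operand, storage level), e.g. a string along the lines of
//   chkpoint°_mVar1·MATRIX·DOUBLE°_mVar1·MATRIX·DOUBLE°MEMORY_AND_DISK
// which would yield opcode "chkpoint", input/output "_mVar1", and the target level
// via StorageLevel.fromString("MEMORY_AND_DISK").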