/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.hops.cost;
import java.util.ArrayList;
import java.util.HashSet;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.lops.DataGen;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.lops.MapMult;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.lops.MMTSJ.MMTSJType;
import org.apache.sysml.lops.compile.JobType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.instructions.CPInstructionParser;
import org.apache.sysml.runtime.instructions.Instruction;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.MRInstructionParser;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.instructions.cp.CPInstruction;
import org.apache.sysml.runtime.instructions.cp.CPInstruction.CPINSTRUCTION_TYPE;
import org.apache.sysml.runtime.instructions.cp.FunctionCallCPInstruction;
import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction;
import org.apache.sysml.runtime.instructions.mr.BinaryMRInstructionBase;
import org.apache.sysml.runtime.instructions.mr.CM_N_COVInstruction;
import org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction;
import org.apache.sysml.runtime.instructions.mr.GroupedAggregateInstruction;
import org.apache.sysml.runtime.instructions.mr.IDistributedCacheConsumer;
import org.apache.sysml.runtime.instructions.mr.MMTSJMRInstruction;
import org.apache.sysml.runtime.instructions.mr.MRInstruction;
import org.apache.sysml.runtime.instructions.mr.MapMultChainInstruction;
import org.apache.sysml.runtime.instructions.mr.PickByCountInstruction;
import org.apache.sysml.runtime.instructions.mr.RemoveEmptyMRInstruction;
import org.apache.sysml.runtime.instructions.mr.TernaryInstruction;
import org.apache.sysml.runtime.instructions.mr.UnaryMRInstructionBase;
import org.apache.sysml.runtime.instructions.mr.MRInstruction.MRINSTRUCTION_TYPE;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes;
import org.apache.sysml.yarn.ropt.MRJobResourceInstruction;
import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;
public class CostEstimatorStaticRuntime extends CostEstimator
{
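//Note: this estimator is exercised through the CostEstimator base class; see
//CostEstimationWrapper in this package, which typically instantiates
//CostEstimatorStaticRuntime and applies it to a runtime program.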
//time-conversion
private static final long DEFAULT_FLOPS = 2L * 1024 * 1024 * 1024; //2GFLOPS
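//note: all nflop estimates below are converted to time as nflops / DEFAULT_FLOPS,
//i.e., an operation costed at 1e9 flops maps to ~0.5s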
//private static final long UNKNOWN_TIME = -1;
//floating point operations
private static final double DEFAULT_NFLOP_NOOP = 10;
private static final double DEFAULT_NFLOP_UNKNOWN = 1;
private static final double DEFAULT_NFLOP_CP = 1;
private static final double DEFAULT_NFLOP_TEXT_IO = 350;
//MR job latency
private static final double DEFAULT_MR_JOB_LATENCY_LOCAL = 2;
private static final double DEFAULT_MR_JOB_LATENCY_REMOTE = 20;
private static final double DEFAULT_MR_TASK_LATENCY_LOCAL = 0.001;
private static final double DEFAULT_MR_TASK_LATENCY_REMOTE = 1.5;
//IO READ throughput
private static final double DEFAULT_MBS_FSREAD_BINARYBLOCK_DENSE = 200;
private static final double DEFAULT_MBS_FSREAD_BINARYBLOCK_SPARSE = 100;
private static final double DEFAULT_MBS_HDFSREAD_BINARYBLOCK_DENSE = 150;
private static final double DEFAULT_MBS_HDFSREAD_BINARYBLOCK_SPARSE = 75;
//IO WRITE throughput
private static final double DEFAULT_MBS_FSWRITE_BINARYBLOCK_DENSE = 150;
private static final double DEFAULT_MBS_FSWRITE_BINARYBLOCK_SPARSE = 75;
private static final double DEFAULT_MBS_HDFSWRITE_BINARYBLOCK_DENSE = 120;
private static final double DEFAULT_MBS_HDFSWRITE_BINARYBLOCK_SPARSE = 60;
private static final double DEFAULT_MBS_HDFSWRITE_TEXT_DENSE = 40;
private static final double DEFAULT_MBS_HDFSWRITE_TEXT_SPARSE = 30;
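//note (illustrative): these throughput defaults imply, e.g., that writing a 120MB
//dense matrix to HDFS in binary block format costs 120/120 = 1s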
@Override
@SuppressWarnings("unused")
protected double getCPInstTimeEstimate( Instruction inst, VarStats[] vs, String[] args )
throws DMLRuntimeException
{
CPInstruction cpinst = (CPInstruction)inst;
//load time into mem
double ltime = 0;
if( !vs[0]._inmem ){
ltime += getHDFSReadTime( vs[0]._rlen, vs[0]._clen, vs[0].getSparsity() );
//eviction costs
if( CacheableData.CACHING_WRITE_CACHE_ON_READ &&
LazyWriteBuffer.getWriteBufferSize()<MatrixBlock.estimateSizeOnDisk(vs[0]._rlen, vs[0]._clen, (long)((vs[0]._nnz<0)? vs[0]._rlen*vs[0]._clen:vs[0]._nnz)) )
{
ltime += Math.abs( getFSWriteTime( vs[0]._rlen, vs[0]._clen, vs[0].getSparsity() ));
}
vs[0]._inmem = true;
}
if( !vs[1]._inmem ){
ltime += getHDFSReadTime( vs[1]._rlen, vs[1]._clen, vs[1].getSparsity() );
//eviction costs
if( CacheableData.CACHING_WRITE_CACHE_ON_READ &&
LazyWriteBuffer.getWriteBufferSize()<MatrixBlock.estimateSizeOnDisk(vs[1]._rlen, vs[1]._clen, (long)((vs[1]._nnz<0)? vs[1]._rlen*vs[1]._clen:vs[1]._nnz)) )
{
ltime += Math.abs( getFSWriteTime( vs[1]._rlen, vs[1]._clen, vs[1].getSparsity()) );
}
vs[1]._inmem = true;
}
if( LOG.isDebugEnabled() && ltime!=0 ) {
LOG.debug("Cost["+cpinst.getOpcode()+" - read] = "+ltime);
}
//exec time CP instruction
String opcode = (cpinst instanceof FunctionCallCPInstruction) ? InstructionUtils.getOpCode(cpinst.toString()) : cpinst.getOpcode();
double etime = getInstTimeEstimate(opcode, vs, args, ExecType.CP);
//write time caching
double wtime = 0;
//double wtime = getFSWriteTime( vs[2]._rlen, vs[2]._clen, (vs[2]._nnz<0)? 1.0:(double)vs[2]._nnz/vs[2]._rlen/vs[2]._clen );
if( inst instanceof VariableCPInstruction && ((VariableCPInstruction)inst).getOpcode().equals("write") )
wtime += getHDFSWriteTime(vs[2]._rlen, vs[2]._clen, vs[2].getSparsity(), ((VariableCPInstruction)inst).getInput3().getName() );
if( LOG.isDebugEnabled() && wtime!=0 ) {
LOG.debug("Cost["+cpinst.getOpcode()+" - write] = "+wtime);
}
//total costs
double costs = ltime + etime + wtime;
//if( LOG.isDebugEnabled() )
// LOG.debug("Costs CP instruction = "+costs);
return costs;
}
@Override
protected double getMRJobInstTimeEstimate( Instruction inst, VarStats[] vs, String[] args )
throws DMLRuntimeException
{
MRJobInstruction jinst = (MRJobInstruction) inst;
//infrastructure properties
boolean localJob = InfrastructureAnalyzer.isLocalMode();
int maxPMap = InfrastructureAnalyzer.getRemoteParallelMapTasks();
int maxPRed = Math.min( InfrastructureAnalyzer.getRemoteParallelReduceTasks(),
ConfigurationManager.getNumReducers() );
double blocksize = ((double)InfrastructureAnalyzer.getHDFSBlockSize())/(1024*1024);
//correction max number of mappers/reducers on yarn clusters
if( InfrastructureAnalyzer.isYarnEnabled() ) {
maxPMap = (int)Math.max( maxPMap, YarnClusterAnalyzer.getNumCores() );
//artificially reduced by another factor 2 (i.e., cores/2/2), in order to prefer
//map-side processing even with a smaller degree of parallelism
maxPRed = (int)Math.max( maxPRed, YarnClusterAnalyzer.getNumCores()/2 /2 );
}
//yarn-specific: take degree of parallelism into account
if( jinst instanceof MRJobResourceInstruction ){
int maxTasks = (int)((MRJobResourceInstruction)jinst).getMaxMRTasks();
maxPMap = Math.min(maxPMap, maxTasks);
maxPRed = Math.min(maxPRed, maxTasks);
}
//job properties
boolean mapOnly = jinst.isMapOnly();
String rdInst = jinst.getIv_randInstructions();
String rrInst = jinst.getIv_recordReaderInstructions();
String mapInst = jinst.getIv_instructionsInMapper();
String shfInst = jinst.getIv_shuffleInstructions();
String aggInst = jinst.getIv_aggInstructions();
String otherInst = jinst.getIv_otherInstructions();
byte[] inIx = getInputIndexes( jinst.getInputVars() );
byte[] retIx = jinst.getIv_resultIndices();
byte[] mapOutIx = getMapOutputIndexes(inIx, retIx, rdInst, mapInst, shfInst, aggInst, otherInst);
int numMap = computeNumMapTasks(vs, inIx, blocksize, maxPMap, jinst.getJobType());
int numPMap = Math.min(numMap, maxPMap);
int numEPMap = Math.max(Math.min(numMap, maxPMap/2),1); //effective map dop
int numRed = computeNumReduceTasks( vs, mapOutIx, jinst.getJobType() );
int numPRed = Math.min(numRed, maxPRed);
int numEPRed = Math.max(Math.min(numRed, maxPRed/2),1); //effective reduce dop
LOG.debug("Meta nmap = "+numMap+", nred = "+numRed+"; npmap = "+numPMap+", npred = "+numPRed+"; nepmap = "+numEPMap+", nepred = "+numEPRed);
//step 0: export if inputs in mem
double exportCosts = 0;
for( int i=0; i<jinst.getInputVars().length; i++ )
if( vs[i]._inmem )
exportCosts += getHDFSWriteTime(vs[i]._rlen, vs[i]._clen, vs[i].getSparsity());
//step 1: MR job / task latency (normalization by effective dop)
double jobLatencyCosts = localJob ? DEFAULT_MR_JOB_LATENCY_LOCAL : DEFAULT_MR_JOB_LATENCY_REMOTE;
double taskLatencyCosts = (numMap / numEPMap + numEPRed)
* (localJob ? DEFAULT_MR_TASK_LATENCY_LOCAL : DEFAULT_MR_TASK_LATENCY_REMOTE);
double latencyCosts = jobLatencyCosts + taskLatencyCosts;
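//e.g., (illustrative) a remote job with numMap=100, numEPMap=10, and numEPRed=5
//accumulates 20 + (100/10 + 5) * 1.5 = 42.5s of pure latency costs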
//step 2: parallel read of inputs (normalization by effective dop)
double hdfsReadCosts = 0;
for( int i=0; i<jinst.getInputVars().length; i++ )
hdfsReadCosts += getHDFSReadTime(vs[i]._rlen, vs[i]._clen, vs[i].getSparsity());
hdfsReadCosts /= numEPMap;
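//e.g., (illustrative) 800MB of dense binary-block input at 150MB/s, read with
//an effective map dop of 4, costs (800/150)/4 ~= 1.3s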
//step 3: parallel MR instructions
String[] mapperInst = new String[]{rdInst, rrInst, mapInst};
String[] reducerInst = new String[]{shfInst, aggInst, otherInst};
//map instructions compute/distcache read (normalization by effective dop)
double mapDCReadCost = 0; //read through distributed cache
double mapCosts = 0; //map compute cost
double shuffleCosts = 0;
double reduceCosts = 0; //reduce compute costs
for( String instCat : mapperInst )
if( instCat != null && instCat.length()>0 ) {
String[] linst = instCat.split( Lop.INSTRUCTION_DELIMITOR );
for( String tmp : linst ){
//map compute costs
Object[] o = extractMRInstStatistics(tmp, vs);
String opcode = InstructionUtils.getOpCode(tmp);
mapCosts += getInstTimeEstimate(opcode, (VarStats[])o[0], (String[])o[1], ExecType.MR);
//dist cache read costs
int dcIndex = getDistcacheIndex(tmp);
if( dcIndex >= 0 ) {
mapDCReadCost += Math.min(getFSReadTime(vs[dcIndex]._rlen, vs[dcIndex]._clen, vs[dcIndex].getSparsity()),
getFSReadTime(DistributedCacheInput.PARTITION_SIZE, 1, 1.0)) //32MB partitions
* numMap; //read in each task
}
}
}
mapCosts /= numEPMap;
mapDCReadCost /= numEPMap;
if( !mapOnly )
{
//shuffle costs (normalization by effective map/reduce dop)
for( int i=0; i<mapOutIx.length; i++ )
{
shuffleCosts += ( getFSWriteTime(vs[mapOutIx[i]]._rlen, vs[mapOutIx[i]]._clen, vs[mapOutIx[i]].getSparsity()) / numEPMap
+ getFSWriteTime(vs[mapOutIx[i]]._rlen, vs[mapOutIx[i]]._clen, vs[mapOutIx[i]].getSparsity())*4 / numEPRed
+ getFSReadTime(vs[mapOutIx[i]]._rlen, vs[mapOutIx[i]]._clen, vs[mapOutIx[i]].getSparsity()) / numEPRed);
//correction of shuffle costs (necessary because the above shuffle does not consider the number of blocks)
//TODO this is a workaround - we need to address the number of map output blocks in a more systematic way
for( String instCat : reducerInst )
if( instCat != null && instCat.length()>0 ) {
String[] linst = instCat.split( Lop.INSTRUCTION_DELIMITOR );
for( String tmp : linst ) {
if(InstructionUtils.getMRType(tmp)==MRINSTRUCTION_TYPE.Aggregate)
shuffleCosts += numMap * getFSWriteTime(vs[mapOutIx[i]]._rlen, vs[mapOutIx[i]]._clen, vs[mapOutIx[i]].getSparsity()) / numEPMap
+ numPMap * getFSWriteTime(vs[mapOutIx[i]]._rlen, vs[mapOutIx[i]]._clen, vs[mapOutIx[i]].getSparsity()) / numEPMap
+ numPMap * getFSReadTime(vs[mapOutIx[i]]._rlen, vs[mapOutIx[i]]._clen, vs[mapOutIx[i]].getSparsity()) / numEPRed;
}
}
}
//reduce instructions compute (normalization by effective dop)
for( String instCat : reducerInst )
if( instCat != null && instCat.length()>0 ) {
String[] linst = instCat.split( Lop.INSTRUCTION_DELIMITOR );
for( String tmp : linst ){
Object[] o = extractMRInstStatistics(tmp, vs);
if(InstructionUtils.getMRType(tmp)==MRINSTRUCTION_TYPE.Aggregate)
o[1] = new String[]{String.valueOf(numMap)};
String opcode = InstructionUtils.getOpCode(tmp);
reduceCosts += getInstTimeEstimate(opcode, (VarStats[])o[0], (String[])o[1], ExecType.MR);
}
}
reduceCosts /= numEPRed;
}
//step 4: parallel write of outputs (normalization by effective dop)
double hdfsWriteCosts = 0;
for( int i=0; i<jinst.getOutputVars().length; i++ )
{
hdfsWriteCosts += getHDFSWriteTime(vs[retIx[i]]._rlen, vs[retIx[i]]._clen, vs[retIx[i]].getSparsity());
}
hdfsWriteCosts /= ((mapOnly)? numEPMap : numEPRed);
//debug output
if( LOG.isDebugEnabled() ) {
LOG.debug("Costs Export = "+exportCosts);
LOG.debug("Costs Latency = "+latencyCosts);
LOG.debug("Costs HDFS Read = "+hdfsReadCosts);
LOG.debug("Costs Distcache Read = "+mapDCReadCost);
LOG.debug("Costs Map Exec = "+mapCosts);
LOG.debug("Costs Shuffle = "+shuffleCosts);
LOG.debug("Costs Reduce Exec = "+reduceCosts);
LOG.debug("Costs HDFS Write = "+hdfsWriteCosts);
}
//aggregate individual cost factors
return exportCosts + latencyCosts +
hdfsReadCosts + mapCosts + mapDCReadCost +
shuffleCosts +
reduceCosts + hdfsWriteCosts;
}
private Object[] extractMRInstStatistics( String inst, VarStats[] stats )
throws DMLRuntimeException
{
Object[] ret = new Object[2]; //stats, attrs
VarStats[] vs = new VarStats[3];
String[] attr = null;
String[] parts = InstructionUtils.getInstructionParts(inst);
String opcode = parts[0];
if( opcode.equals(DataGen.RAND_OPCODE) )
{
vs[0] = _unknownStats;
vs[1] = _unknownStats;
vs[2] = stats[Integer.parseInt(parts[2])];
int type = 2;
//handle instruction patching: use min/max/sparsity only if not variable placeholders
if( !parts[7].contains(Lop.VARIABLE_NAME_PLACEHOLDER)
&& !parts[8].contains(Lop.VARIABLE_NAME_PLACEHOLDER) )
{
double minValue = Double.parseDouble(parts[7]);
double maxValue = Double.parseDouble(parts[8]);
double sparsity = Double.parseDouble(parts[9]);
if( minValue == 0.0 && maxValue == 0.0 )
type = 0;
else if( sparsity == 1.0 && minValue == maxValue )
type = 1;
}
attr = new String[]{String.valueOf(type)};
}
else if( opcode.equals(DataGen.SEQ_OPCODE) )
{
vs[0] = _unknownStats;
vs[1] = _unknownStats;
vs[2] = stats[Integer.parseInt(parts[2])];
}
else //general case
{
String inst2 = replaceInstructionPatch( inst );
MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(inst2);
if( mrinst instanceof UnaryMRInstructionBase )
{
UnaryMRInstructionBase uinst = (UnaryMRInstructionBase) mrinst;
vs[0] = uinst.input>=0 ? stats[ uinst.input ] : _unknownStats;
vs[1] = _unknownStats;
vs[2] = stats[ uinst.output ];
if( vs[0] == null ) //scalar input, e.g., print
vs[0] = _scalarStats;
if( vs[2] == null ) //scalar output
vs[2] = _scalarStats;
if( mrinst instanceof MMTSJMRInstruction )
{
String type = ((MMTSJMRInstruction)mrinst).getMMTSJType().toString();
attr = new String[]{type};
}
else if( mrinst instanceof CM_N_COVInstruction )
{
if( opcode.equals("cm") )
attr = new String[]{parts[parts.length-2]};
}
else if( mrinst instanceof GroupedAggregateInstruction )
{
if( opcode.equals("groupedagg") )
{
AggregateOperationTypes type = CMOperator.getAggOpType(parts[2], parts[3]);
attr = new String[]{String.valueOf(type.ordinal())};
}
}
}
else if( mrinst instanceof BinaryMRInstructionBase )
{
BinaryMRInstructionBase binst = (BinaryMRInstructionBase) mrinst;
vs[0] = stats[ binst.input1 ];
vs[1] = stats[ binst.input2 ];
vs[2] = stats[ binst.output ];
if( vs[0] == null ) //scalar input,
vs[0] = _scalarStats;
if( vs[1] == null ) //scalar input,
vs[1] = _scalarStats;
if( vs[2] == null ) //scalar output
vs[2] = _scalarStats;
if( opcode.equals("rmempty") ) {
RemoveEmptyMRInstruction rbinst = (RemoveEmptyMRInstruction) mrinst;
attr = new String[]{rbinst.isRemoveRows()?"0":"1"};
}
}
else if( mrinst instanceof TernaryInstruction )
{
TernaryInstruction tinst = (TernaryInstruction) mrinst;
vs[0] = stats[ tinst.input1 ];
vs[1] = stats[ tinst.input2 ];
vs[2] = stats[ tinst.input3 ];
if( vs[0] == null ) //scalar input,
vs[0] = _scalarStats;
if( vs[1] == null ) //scalar input,
vs[1] = _scalarStats;
if( vs[2] == null ) //scalar input
vs[2] = _scalarStats;
}
else if( mrinst instanceof PickByCountInstruction )
{
PickByCountInstruction pinst = (PickByCountInstruction) mrinst;
vs[0] = stats[ pinst.input1 ];
vs[2] = stats[ pinst.output ];
if( vs[0] == null ) //scalar input,
vs[0] = _scalarStats;
if( vs[1] == null ) //scalar input,
vs[1] = _scalarStats;
if( vs[2] == null ) //scalar input
vs[2] = _scalarStats;
}
else if( mrinst instanceof MapMultChainInstruction)
{
MapMultChainInstruction minst = (MapMultChainInstruction) mrinst;
vs[0] = stats[ minst.getInput1() ];
vs[1] = stats[ minst.getInput2() ];
if( minst.getInput3()>=0 )
vs[2] = stats[ minst.getInput3() ];
if( vs[0] == null ) //scalar input,
vs[0] = _scalarStats;
if( vs[1] == null ) //scalar input,
vs[1] = _scalarStats;
if( vs[2] == null ) //scalar input
vs[2] = _scalarStats;
}
}
//maintain var status (CP output always inmem)
vs[2]._inmem = true;
ret[0] = vs;
ret[1] = attr;
return ret;
}
/////////////////////
// Utilities //
/////////////////////
private byte[] getInputIndexes(String[] inputVars)
{
byte[] inIx = new byte[inputVars.length];
for( int i=0; i<inIx.length; i++ )
inIx[i] = (byte)i;
return inIx;
}
private byte[] getMapOutputIndexes( byte[] inIx, byte[] retIx, String rdInst, String mapInst, String shfInst, String aggInst, String otherInst )
throws DMLRuntimeException
{
//note: this is a simplified version of MRJobConfiguration.setUpOutputIndexesForMapper
//map indices
HashSet<Byte> ixMap = new HashSet<Byte>();
for( byte ix : inIx )
ixMap.add(ix);
if( rdInst!=null && rdInst.length()>0 ) {
rdInst = replaceInstructionPatch(rdInst);
DataGenMRInstruction[] ins = MRInstructionParser.parseDataGenInstructions(rdInst);
for( DataGenMRInstruction inst : ins )
for( byte ix : inst.getAllIndexes() )
ixMap.add(ix);
}
if( mapInst!=null && mapInst.length()>0 ) {
mapInst = replaceInstructionPatch(mapInst);
MRInstruction[] ins = MRInstructionParser.parseMixedInstructions(mapInst);
for( MRInstruction inst : ins )
for( byte ix : inst.getAllIndexes() )
ixMap.add(ix);
}
//reduce indices
HashSet<Byte> ixRed = new HashSet<Byte>();
for( byte ix : retIx )
ixRed.add(ix);
if( shfInst!=null && shfInst.length()>0 ) {
shfInst = replaceInstructionPatch(shfInst);
MRInstruction[] ins = MRInstructionParser.parseMixedInstructions(shfInst);
for( MRInstruction inst : ins )
for( byte ix : inst.getAllIndexes() )
ixRed.add(ix);
}
if( aggInst!=null && aggInst.length()>0 ) {
aggInst = replaceInstructionPatch(aggInst);
MRInstruction[] ins = MRInstructionParser.parseAggregateInstructions(aggInst);
for( MRInstruction inst : ins )
for( byte ix : inst.getAllIndexes() )
ixRed.add(ix);
}
if( otherInst!=null && otherInst.length()>0 ) {
otherInst = replaceInstructionPatch(otherInst);
MRInstruction[] ins = MRInstructionParser.parseMixedInstructions(otherInst);
for( MRInstruction inst : ins )
for( byte ix : inst.getAllIndexes() )
ixRed.add(ix);
}
//intersection (map indexes also consumed by reduce side)
ixMap.retainAll(ixRed);
//copy result
byte[] ret = new byte[ixMap.size()];
int i = 0;
for( byte ix : ixMap )
ret[i++] = ix;
return ret;
}
private int computeNumMapTasks( VarStats[] vs, byte[] inputIx, double blocksize, int maxPMap, JobType jobtype )
{
//special cases
if( jobtype == JobType.DATAGEN )
return maxPMap;
//input size, num blocks
double mapInputSize = 0;
int numBlocks = 0;
for( int i=0; i<inputIx.length; i++ )
{
//input size
mapInputSize += ((double)MatrixBlock.estimateSizeOnDisk((long)vs[inputIx[i]]._rlen, (long)vs[inputIx[i]]._clen, (long)vs[inputIx[i]]._nnz)) / (1024*1024);
//num blocks
int lret = (int) Math.ceil((double)vs[inputIx[i]]._rlen/vs[inputIx[i]]._brlen)
*(int) Math.ceil((double)vs[inputIx[i]]._clen/vs[inputIx[i]]._bclen);
numBlocks = Math.max(lret, numBlocks);
}
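//e.g., (illustrative) 1,000MB of input with a 128MB HDFS block size yields
//ceil(1000/128) = 8 candidate map tasks, further capped by the number of matrix blocks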
return Math.max(1, Math.min( (int)Math.ceil(mapInputSize/blocksize),numBlocks ));
}
private int computeNumReduceTasks( VarStats[] vs, byte[] mapOutIx, JobType jobtype )
{
int ret = -1;
//TODO for jobtype==JobType.MMCJ common dim
switch( jobtype )
{
case REBLOCK:
case CSV_REBLOCK: {
for( int i=0; i<mapOutIx.length; i++ )
{
int lret = (int) Math.ceil((double)vs[mapOutIx[i]]._rlen/vs[mapOutIx[i]]._brlen)
*(int) Math.ceil((double)vs[mapOutIx[i]]._clen/vs[mapOutIx[i]]._bclen);
ret = Math.max(lret, ret);
}
break;
}
default: {
for( int i=0; i<mapOutIx.length; i++ )
{
int lret = (int) Math.ceil((double)vs[mapOutIx[i]]._rlen/ConfigurationManager.getBlocksize())
*(int) Math.ceil((double)vs[mapOutIx[i]]._clen/ConfigurationManager.getBlocksize());
ret = Math.max(lret, ret);
}
break;
}
}
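//e.g., (illustrative) a 10,000 x 10,000 map output with 1,000 x 1,000 blocks
//yields ceil(10) * ceil(10) = 100 reduce tasks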
return Math.max(1, ret);
}
private int getDistcacheIndex(String inst)
throws DMLRuntimeException
{
ArrayList<Byte> indexes = new ArrayList<Byte>();
if( InstructionUtils.isDistributedCacheUsed(inst) ) {
MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(inst);
if( mrinst instanceof IDistributedCacheConsumer )
((IDistributedCacheConsumer)mrinst).addDistCacheIndex(inst, indexes);
}
if( !indexes.isEmpty() )
return indexes.get(0);
else
return -1;
}
/////////////////////
// I/O Costs //
/////////////////////
/**
* Returns the estimated read time from HDFS.
* NOTE: Does not handle unknowns.
*
* @param dm number of rows
* @param dn number of columns
* @param ds sparsity factor
* @return estimated HDFS read time
*/
private double getHDFSReadTime( long dm, long dn, double ds )
{
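//e.g., (illustrative) a dense 10,000 x 1,000 matrix occupies ~8B per cell on disk,
//i.e., ~76MB, which at 150MB/s costs ~0.5s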
boolean sparse = MatrixBlock.evalSparseFormatOnDisk(dm, dn, (long)(ds*dm*dn));
double ret = ((double)MatrixBlock.estimateSizeOnDisk((long)dm, (long)dn, (long)(ds*dm*dn))) / (1024*1024);
if( sparse )
ret /= DEFAULT_MBS_HDFSREAD_BINARYBLOCK_SPARSE;
else //dense
ret /= DEFAULT_MBS_HDFSREAD_BINARYBLOCK_DENSE;
return ret;
}
private double getHDFSWriteTime( long dm, long dn, double ds )
{
boolean sparse = MatrixBlock.evalSparseFormatOnDisk(dm, dn, (long)(ds*dm*dn));
double bytes = (double)MatrixBlock.estimateSizeOnDisk((long)dm, (long)dn, (long)(ds*dm*dn));
double mbytes = bytes / (1024*1024);
double ret = -1;
if( sparse )
ret = mbytes / DEFAULT_MBS_HDFSWRITE_BINARYBLOCK_SPARSE;
else //dense
ret = mbytes / DEFAULT_MBS_HDFSWRITE_BINARYBLOCK_DENSE;
//if( LOG.isDebugEnabled() )
// LOG.debug("Costs[export] = "+ret+"s, "+mbytes+" MB ("+dm+","+dn+","+ds+").");
return ret;
}
private double getHDFSWriteTime( long dm, long dn, double ds, String format )
{
boolean sparse = MatrixBlock.evalSparseFormatOnDisk(dm, dn, (long)(ds*dm*dn));
double bytes = (double)MatrixBlock.estimateSizeOnDisk((long)dm, (long)dn, (long)(ds*dm*dn));
double mbytes = bytes / (1024*1024);
double ret = -1;
if( format.equals("textcell") || format.equals("csv") )
{
if( sparse )
ret = mbytes / DEFAULT_MBS_HDFSWRITE_TEXT_SPARSE;
else //dense
ret = mbytes / DEFAULT_MBS_HDFSWRITE_TEXT_DENSE;
ret *= 2.75; //text commonly 2x-3.5x larger than binary
}
else
{
if( sparse )
ret = mbytes / DEFAULT_MBS_HDFSWRITE_BINARYBLOCK_SPARSE;
else //dense
ret = mbytes / DEFAULT_MBS_HDFSWRITE_BINARYBLOCK_DENSE;
}
//if( LOG.isDebugEnabled() )
// LOG.debug("Costs[export] = "+ret+"s, "+mbytes+" MB ("+dm+","+dn+","+ds+").");
return ret;
}
/**
* Returns the estimated read time from local FS.
* NOTE: Does not handle unknowns.
*
* @param dm number of rows
* @param dn number of columns
* @param ds sparsity factor
* @return estimated local file system read time
*/
private double getFSReadTime( long dm, long dn, double ds )
{
boolean sparse = MatrixBlock.evalSparseFormatOnDisk(dm, dn, (long)(ds*dm*dn));
double ret = ((double)MatrixBlock.estimateSizeOnDisk((long)dm, (long)dn, (long)(ds*dm*dn))) / (1024*1024);
if( sparse )
ret /= DEFAULT_MBS_FSREAD_BINARYBLOCK_SPARSE;
else //dense
ret /= DEFAULT_MBS_FSREAD_BINARYBLOCK_DENSE;
return ret;
}
private double getFSWriteTime( long dm, long dn, double ds )
{
boolean sparse = MatrixBlock.evalSparseFormatOnDisk(dm, dn, (long)(ds*dm*dn));
double ret = ((double)MatrixBlock.estimateSizeOnDisk((long)dm, (long)dn, (long)(ds*dm*dn))) / (1024*1024);
if( sparse )
ret /= DEFAULT_MBS_FSWRITE_BINARYBLOCK_SPARSE;
else //dense
ret /= DEFAULT_MBS_FSWRITE_BINARYBLOCK_DENSE;
return ret;
}
/////////////////////
// Operation Costs //
/////////////////////
private double getInstTimeEstimate(String opcode, VarStats[] vs, String[] args, ExecType et)
throws DMLRuntimeException
{
boolean inMR = (et == ExecType.MR);
return getInstTimeEstimate(opcode, inMR,
vs[0]._rlen, vs[0]._clen, (vs[0]._nnz<0)? 1.0:(double)vs[0]._nnz/vs[0]._rlen/vs[0]._clen,
vs[1]._rlen, vs[1]._clen, (vs[1]._nnz<0)? 1.0:(double)vs[1]._nnz/vs[1]._rlen/vs[1]._clen,
vs[2]._rlen, vs[2]._clen, (vs[2]._nnz<0)? 1.0:(double)vs[2]._nnz/vs[2]._rlen/vs[2]._clen,
args);
}
/**
* Returns the estimated instruction execution time, w/o data transfer and single-threaded.
* For scalar inputs, dims must be set to 1 before invocation.
*
* NOTE: Does not handle unknowns.
*
* @param opcode instruction opcode
* @param inMR true if executed as MR instruction, false for CP
* @param d1m number of rows of the first input
* @param d1n number of columns of the first input
* @param d1s sparsity factor of the first input
* @param d2m number of rows of the second input
* @param d2n number of columns of the second input
* @param d2s sparsity factor of the second input
* @param d3m number of rows of the third operand (typically the output)
* @param d3n number of columns of the third operand (typically the output)
* @param d3s sparsity factor of the third operand (typically the output)
* @param args optional instruction-specific arguments
* @return estimated instruction execution time
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private double getInstTimeEstimate( String opcode, boolean inMR, long d1m, long d1n, double d1s, long d2m, long d2n, double d2s, long d3m, long d3n, double d3s, String[] args ) throws DMLRuntimeException
{
double nflops = getNFLOP(opcode, inMR, d1m, d1n, d1s, d2m, d2n, d2s, d3m, d3n, d3s, args);
double time = nflops / DEFAULT_FLOPS;
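//e.g., (illustrative) a dense 1,000 x 1,000 matrix multiply ("ba+*") is costed
//at 2*(1000*1000*1000)/2 = 1e9 flops, i.e., 1e9 / 2e9 FLOPS = 0.5s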
if( LOG.isDebugEnabled() )
LOG.debug("Cost["+opcode+"] = "+time+"s, "+nflops+" flops ("+d1m+","+d1n+","+d1s+","+d2m+","+d2n+","+d2s+","+d3m+","+d3n+","+d3s+").");
return time;
}
private double getNFLOP( String optype, boolean inMR, long d1m, long d1n, double d1s, long d2m, long d2n, double d2s, long d3m, long d3n, double d3s, String[] args )
throws DMLRuntimeException
{
//operation costs in FLOP on matrix block level (for CP and MR instructions)
//(excludes IO and parallelism; assumes known dims for all inputs, outputs )
boolean leftSparse = MatrixBlock.evalSparseFormatInMemory(d1m, d1n, (long)(d1s*d1m*d1n));
boolean rightSparse = MatrixBlock.evalSparseFormatInMemory(d2m, d2n, (long)(d2s*d2m*d2n));
boolean onlyLeft = (d1m>=0 && d1n>=0 && d2m<0 && d2n<0 );
boolean allExists = (d1m>=0 && d1n>=0 && d2m>=0 && d2n>=0 && d3m>=0 && d3n>=0 );
//NOTE: all instruction types that are equivalent in CP and MR are only
//included in CP to prevent redundancy
CPINSTRUCTION_TYPE cptype = CPInstructionParser.String2CPInstructionType.get(optype);
if( cptype != null ) //for CP Ops and equivalent MR ops
{
//general approach: count of floating point *, /, +, -, ^, and builtin ops
switch(cptype)
{
case AggregateBinary: //opcodes: ba+*, cov
if( optype.equals("ba+*") ) { //matrix mult
//reduction by factor 2 because matrix mult better than
//average flop count
if( !leftSparse && !rightSparse )
return 2 * (d1m * d1n * ((d2n>1)?d1s:1.0) * d2n) /2;
else if( !leftSparse && rightSparse )
return 2 * (d1m * d1n * d1s * d2n * d2s) /2;
else if( leftSparse && !rightSparse )
return 2 * (d1m * d1n * d1s * d2n) /2;
else //leftSparse && rightSparse
return 2 * (d1m * d1n * d1s * d2n * d2s) /2;
}
else if( optype.equals("cov") ) {
//note: output always scalar, d3 used as weights block
//same runtime for 2 and 3 inputs (weights block optional)
return 23 * d1m; //(11+3*k+)
}
return 0;
case MMChain:
//reduction by factor 2 because matrix mult better than average flop count
//(mmchain is essentially two matrix-vector multiplications)
if( !leftSparse )
return (2+2) * (d1m * d1n) /2;
else
return (2+2) * (d1m * d1n * d1s) /2;
case AggregateTernary: //opcodes: tak+*
return 6 * d1m * d1n; //2*1(*) + 4 (k+)
case AggregateUnary: //opcodes: uak+, uark+, uack+, uasqk+, uarsqk+, uacsqk+,
// uamean, uarmean, uacmean, uavar, uarvar, uacvar,
// uamax, uarmax, uarimax, uacmax, uamin, uarmin, uacmin,
// ua+, uar+, uac+, ua*, uatrace, uaktrace,
// nrow, ncol, length, cm
if( optype.equals("nrow") || optype.equals("ncol") || optype.equals("length") )
return DEFAULT_NFLOP_NOOP;
else if( optype.equals( "cm" ) ) {
double xcm = 1;
switch( Integer.parseInt(args[0]) ) {
case 0: xcm=1; break; //count
case 1: xcm=8; break; //mean
case 2: xcm=16; break; //cm2
case 3: xcm=31; break; //cm3
case 4: xcm=51; break; //cm4
case 5: xcm=16; break; //variance
}
return (leftSparse) ? xcm * (d1m * d1s + 1) : xcm * d1m;
}
else if( optype.equals("uatrace") || optype.equals("uaktrace") )
return 2 * d1m * d1n;
else if( optype.equals("ua+") || optype.equals("uar+") || optype.equals("uac+") ){
//sparse safe operations
if( !leftSparse ) //dense
return d1m * d1n;
else //sparse
return d1m * d1n * d1s;
}
else if( optype.equals("uak+") || optype.equals("uark+") || optype.equals("uack+"))
return 4 * d1m * d1n; //1*k+
else if( optype.equals("uasqk+") || optype.equals("uarsqk+") || optype.equals("uacsqk+"))
return 5 * d1m * d1n; // +1 for multiplication to square term
else if( optype.equals("uamean") || optype.equals("uarmean") || optype.equals("uacmean"))
return 7 * d1m * d1n; //1*k+
else if( optype.equals("uavar") || optype.equals("uarvar") || optype.equals("uacvar"))
return 14 * d1m * d1n;
else if( optype.equals("uamax") || optype.equals("uarmax") || optype.equals("uacmax")
|| optype.equals("uamin") || optype.equals("uarmin") || optype.equals("uacmin")
|| optype.equals("uarimax") || optype.equals("ua*") )
return d1m * d1n;
return 0;
case ArithmeticBinary: //opcodes: +, -, *, /, ^ (incl. ^2, *2)
//note: covers scalar-scalar, scalar-matrix, matrix-matrix
if( (optype.equals("+") || optype.equals("-")) //sparse safe
&& ( leftSparse || rightSparse ) )
return d1m*d1n*d1s + d2m*d2n*d2s;
else
return d3m*d3n;
case Ternary: //opcodes: ctable
if( optype.equals("ctable") ){
if( leftSparse )
return d1m * d1n * d1s; //add
else
return d1m * d1n;
}
return 0;
case BooleanBinary: //opcodes: &&, ||
return 1; //always scalar-scalar
case BooleanUnary: //opcodes: !
return 1; //always scalar-scalar
case Builtin: //opcodes: log
//note: covers scalar-scalar, scalar-matrix, matrix-matrix
//note: can be unary or binary
if( allExists ) //binary
return 3 * d3m * d3n;
else //unary
return d3m * d3n;
case BuiltinBinary: //opcodes: max, min, solve
//note: covers scalar-scalar, scalar-matrix, matrix-matrix
if( optype.equals("solve") ) //see also MultiReturnBuiltin
return d1m * d1n * d1n; //for 1kx1k ~ 1GFLOP -> 0.5s
else //default
return d3m * d3n;
case BuiltinUnary: //opcodes: exp, abs, sin, cos, tan, sign, sqrt, plogp, print, round, sprop, sigmoid
//TODO add cost functions for commons math builtins: inverse, cholesky
if( optype.equals("print") ) //scalar only
return 1;
else
{
double xbu = 1; //default for all ops
if( optype.equals("plogp") ) xbu = 2;
else if( optype.equals("round") ) xbu = 4;
if( optype.equals("sin") || optype.equals("tan") || optype.equals("round")
|| optype.equals("abs") || optype.equals("sqrt") || optype.equals("sprop")
|| optype.equals("sigmoid") || optype.equals("sign") ) //sparse-safe
{
if( leftSparse ) //sparse
return xbu * d1m * d1n * d1s;
else //dense
return xbu * d1m * d1n;
}
else
return xbu * d1m * d1n;
}
case Reorg: //opcodes: r', rdiag
case MatrixReshape: //opcodes: rshape
if( leftSparse )
return d1m * d1n * d1s;
else
return d1m * d1n;
case Append: //opcodes: append
return DEFAULT_NFLOP_CP *
(((leftSparse) ? d1m * d1n * d1s : d1m * d1n ) +
((rightSparse) ? d2m * d2n * d2s : d2m * d2n ));
case RelationalBinary: //opcodes: ==, !=, <, >, <=, >=
//note: relational ops are not sparse-safe
return d3m * d3n; //covers all combinations of scalar and matrix
case File: //opcodes: rm, mv
return DEFAULT_NFLOP_NOOP;
case Variable: //opcodes: assignvar, cpvar, rmvar, rmfilevar, assignvarwithfile, attachfiletovar, valuepick, iqsize, read, write, createvar, setfilename, castAsMatrix
if( optype.equals("write") ){
boolean text = args[0].equals("textcell") || args[0].equals("csv");
double xwrite = text ? DEFAULT_NFLOP_TEXT_IO : DEFAULT_NFLOP_CP;
if( !leftSparse )
return d1m * d1n * xwrite;
else
return d1m * d1n * d1s * xwrite;
}
else if ( optype.equals("inmem-iqm") )
//note: assumes uniform distribution
return 2 * d1m + //sum of weights
5 + 0.25d * d1m + //scan to lower quantile
8 * 0.5 * d1m; //scan from lower to upper quantile
else
return DEFAULT_NFLOP_NOOP;
case Rand: //opcodes: rand, seq
if( optype.equals(DataGen.RAND_OPCODE) ){
int nflopRand = 32; //per random number
switch(Integer.parseInt(args[0])) {
case 0: return DEFAULT_NFLOP_NOOP; //empty matrix
case 1: return d3m * d3n * 8; //allocate, arrayfill
case 2: //full rand
{
if( d3s==1.0 )
return d3m * d3n * nflopRand + d3m * d3n * 8; //DENSE gen (incl allocate)
else
return (d3s>=MatrixBlock.SPARSITY_TURN_POINT)?
2 * d3m * d3n * nflopRand + d3m * d3n * 8: //DENSE gen (incl allocate)
3 * d3m * d3n * d3s * nflopRand + d3m * d3n * d3s * 24; //SPARSE gen (incl allocate)
}
}
}
else //seq
return d3m * d3n * DEFAULT_NFLOP_CP;
case StringInit: //sinit
return d3m * d3n * DEFAULT_NFLOP_CP;
case External: //opcodes: extfunct
//note: should be invoked independently for multiple outputs
return d1m * d1n * d1s * DEFAULT_NFLOP_UNKNOWN;
case MultiReturnBuiltin: //opcodes: qr, lu, eigen
//note: they all have cubic complexity, the scaling factor refers to commons.math
double xf = 2; //default, e.g., qr
if( optype.equals("eigen") )
xf = 32;
else if ( optype.equals("lu") )
xf = 16;
return xf * d1m * d1n * d1n; //for 1kx1k ~ 2GFLOP -> 1s
case ParameterizedBuiltin: //opcodes: cdf, invcdf, groupedagg, rmempty
if( optype.equals("cdf") || optype.equals("invcdf"))
return DEFAULT_NFLOP_UNKNOWN; //scalar call to commons.math
else if( optype.equals("groupedagg") ){
double xga = 1;
switch( Integer.parseInt(args[0]) ) {
case 0: xga=4; break; //sum, see uk+
case 1: xga=1; break; //count, see cm
case 2: xga=8; break; //mean
case 3: xga=16; break; //cm2
case 4: xga=31; break; //cm3
case 5: xga=51; break; //cm4
case 6: xga=16; break; //variance
}
return 2 * d1m + xga * d1m; //scan for min/max, groupedagg
}
else if( optype.equals("rmempty") ){
switch(Integer.parseInt(args[0])){
case 0: //remove rows
return ((leftSparse) ? d1m : d1m * Math.ceil(1.0d/d1s)/2) +
DEFAULT_NFLOP_CP * d3m * d2m;
case 1: //remove cols
return d1n * Math.ceil(1.0d/d1s)/2 +
DEFAULT_NFLOP_CP * d3m * d2m;
}
}
return 0;
case QSort: //opcodes: sort
if( optype.equals("sort") ){
//note: mergesort since comparator used
double sortCosts = 0;
if( onlyLeft )
sortCosts = DEFAULT_NFLOP_CP * d1m + d1m;
else //w/ weights
sortCosts = DEFAULT_NFLOP_CP * ((leftSparse)?d1m*d1s:d1m);
return sortCosts + d1m*(int)(Math.log(d1m)/Math.log(2)) + //mergesort
DEFAULT_NFLOP_CP * d1m;
}
return 0;
case MatrixIndexing: //opcodes: rangeReIndex, leftIndex
if( optype.equals("leftIndex") ){
return DEFAULT_NFLOP_CP * ((leftSparse)? d1m*d1n*d1s : d1m*d1n)
+ 2 * DEFAULT_NFLOP_CP * ((rightSparse)? d2m*d2n*d2s : d2m*d2n );
}
else if( optype.equals("rangeReIndex") ){
return DEFAULT_NFLOP_CP * ((leftSparse)? d2m*d2n*d2s : d2m*d2n );
}
return 0;
case MMTSJ: //opcodes: tsmm
//diff to ba+* only upper triangular matrix
//reduction by factor 2 because matrix mult better than
//average flop count
if( MMTSJType.valueOf(args[0]).isLeft() ) { //lefttranspose
if( !rightSparse ) //dense
return d1m * d1n * d1s * d1n /2;
else //sparse
return d1m * d1n * d1s * d1n * d1s /2;
}
else if(onlyLeft) { //righttranspose
if( !leftSparse ) //dense
return (double)d1m * d1n * d1m /2;
else //sparse
return d1m * d1n * d1s //reorg sparse
+ d1m * d1n * d1s * d1n * d1s /2; //core tsmm
}
return 0;
case Partition:
return d1m * d1n * d1s + //partitioning costs
(inMR ? 0 : //include write cost if in CP
getHDFSWriteTime(d1m, d1n, d1s)* DEFAULT_FLOPS);
case INVALID:
return 0;
default:
throw new DMLRuntimeException("CostEstimator: unsupported instruction type: "+optype);
}
}
//if not found in CP instructions
MRINSTRUCTION_TYPE mrtype = MRInstructionParser.String2MRInstructionType.get(optype);
if ( mrtype != null ) //for specific MR ops
{
switch(mrtype)
{
case Aggregate: //opcodes: a+, ak+, asqk+, a*, amax, amin, amean
//TODO should be aggregate unary
int numMap = Integer.parseInt(args[0]);
if( optype.equals("ak+") )
return 4 * numMap * d1m * d1n * d1s;
else if( optype.equals("asqk+") )
return 5 * numMap * d1m * d1n * d1s; // +1 for multiplication to square term
else if( optype.equals("avar") )
return 14 * numMap * d1m * d1n * d1s;
else
return numMap * d1m * d1n * d1s;
case AggregateBinary: //opcodes: cpmm, rmm, mapmult
//note: copy from CP costs
if( optype.equals("cpmm") || optype.equals("rmm")
|| optype.equals(MapMult.OPCODE) ) //matrix mult
{
//reduction by factor 2 because matrix mult better than
//average flop count
if( !leftSparse && !rightSparse )
return 2 * (d1m * d1n * ((d2n>1)?d1s:1.0) * d2n) /2;
else if( !leftSparse && rightSparse )
return 2 * (d1m * d1n * d1s * d2n * d2s) /2;
else if( leftSparse && !rightSparse )
return 2 * (d1m * d1n * d1s * d2n) /2;
else //leftSparse && rightSparse
return 2 * (d1m * d1n * d1s * d2n * d2s) /2;
}
return 0;
case MapMultChain: //opcodes: mapmultchain
//assume dense input2 and input3
return 2 * d1m * d2n * d1n * ((d2n>1)?d1s:1.0) //ba(+*)
+ d1m * d2n //cellwise b(*)
+ d1m * d2n //r(t)
+ 2 * d2n * d1n * d1m * (leftSparse?d1s:1.0) //ba(+*)
+ d2n * d1n; //r(t)
case ArithmeticBinary: //opcodes: s-r, so, max, min,
// >, >=, <, <=, ==, !=
//TODO Should be relational
//note: relational ops are not sparse-safe
return d3m * d3n; //covers all combinations of scalar and matrix
case CombineUnary: //opcodes: combineunary
return d1m * d1n * d1s;
case CombineBinary: //opcodes: combinebinary
return d1m * d1n * d1s
+ d2m * d2n * d2s;
case CombineTernary: //opcodes: combinetertiary
return d1m * d1n * d1s
+ d2m * d2n * d2s
+ d3m * d3n * d3s;
case Unary: //opcodes: log, slog, pow
//TODO requires opcode consolidation (builtin, arithmetic)
//note: covers scalar, matrix, matrix-scalar
return d3m * d3n;
case Ternary: //opcodes: ctabletransform, ctabletransformscalarweight, ctabletransformhistogram, ctabletransformweightedhistogram
//note: copy from cp
if( leftSparse )
return d1m * d1n * d1s; //add
else
return d1m * d1n;
case Quaternary:
//TODO pattern-specific costs; requires all 4 inputs
return d1m * d1n * d1s *4;
case Reblock: //opcodes: rblk
return DEFAULT_NFLOP_CP * ((leftSparse)? d1m*d1n*d1s : d1m*d1n);
case Replicate: //opcodes: rep
return DEFAULT_NFLOP_CP * ((leftSparse)? d1m*d1n*d1s : d1m*d1n);
case CM_N_COV: //opcodes: mean
double xcm = 8;
return (leftSparse) ? xcm * (d1m * d1s + 1) : xcm * d1m;
case GroupedAggregate: //opcodes: groupedagg
//TODO: need to consolidate categories (ParameterizedBuiltin)
//copy from CP operation
double xga = 1;
switch( Integer.parseInt(args[0]) ) {
case 0: xga=4; break; //sum, see uk+
case 1: xga=1; break; //count, see cm
case 2: xga=8; break; //mean
case 3: xga=16; break; //cm2
case 4: xga=31; break; //cm3
case 5: xga=51; break; //cm4
case 6: xga=16; break; //variance
}
return 2 * d1m + xga * d1m; //scan for min/max, groupedagg
case PickByCount: //opcodes: valuepick, rangepick
break;
//TODO
//String2MRInstructionType.put( "valuepick" , MRINSTRUCTION_TYPE.PickByCount); // for quantile()
//String2MRInstructionType.put( "rangepick" , MRINSTRUCTION_TYPE.PickByCount); // for interQuantile()
case RangeReIndex: //opcodes: rangeReIndex, rangeReIndexForLeft
//TODO: requires category consolidation
if( optype.equals("rangeReIndex") )
return DEFAULT_NFLOP_CP * ((leftSparse)? d2m*d2n*d2s : d2m*d2n );
else //rangeReIndexForLeft
return DEFAULT_NFLOP_CP * ((leftSparse)? d1m*d1n*d1s : d1m*d1n)
+ DEFAULT_NFLOP_CP * ((rightSparse)? d2m*d2n*d2s : d2m*d2n );
case ZeroOut: //opcodes: zeroOut
return DEFAULT_NFLOP_CP * ((leftSparse)? d1m*d1n*d1s : d1m*d1n)
+ DEFAULT_NFLOP_CP * ((rightSparse)? d2m*d2n*d2s : d2m*d2n );
default:
return 0;
}
}
else
{
throw new DMLRuntimeException("CostEstimator: unsupported instruction type: "+optype);
}
//TODO Parameterized Builtin Functions
//String2CPFileInstructionType.put( "rmempty" , CPINSTRUCTION_TYPE.ParameterizedBuiltin);
return -1; //should never get here
}
}