/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.hops;

import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.Hop.MultiThreadedHop;
import org.apache.sysml.lops.Aggregate;
import org.apache.sysml.lops.DataPartition;
import org.apache.sysml.lops.Group;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.lops.LopsException;
import org.apache.sysml.lops.RepMat;
import org.apache.sysml.lops.Transform;
import org.apache.sysml.lops.Unary;
import org.apache.sysml.lops.UnaryCP;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.lops.PartialAggregate.CorrectionLocationType;
import org.apache.sysml.lops.WeightedCrossEntropy;
import org.apache.sysml.lops.WeightedCrossEntropyR;
import org.apache.sysml.lops.WeightedDivMM;
import org.apache.sysml.lops.WeightedCrossEntropy.WCeMMType;
import org.apache.sysml.lops.WeightedDivMM.WDivMMType;
import org.apache.sysml.lops.WeightedDivMMR;
import org.apache.sysml.lops.WeightedSigmoid;
import org.apache.sysml.lops.WeightedSigmoid.WSigmoidType;
import org.apache.sysml.lops.WeightedSigmoidR;
import org.apache.sysml.lops.WeightedSquaredLoss;
import org.apache.sysml.lops.WeightedSquaredLoss.WeightsType;
import org.apache.sysml.lops.WeightedSquaredLossR;
import org.apache.sysml.lops.WeightedUnaryMM;
import org.apache.sysml.lops.WeightedUnaryMM.WUMMType;
import org.apache.sysml.lops.WeightedUnaryMMR;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput;

/**
 * Note: this hop should be called AggQuaternaryOp for consistency with AggUnaryOp and AggBinaryOp;
 * however, since a real QuaternaryOp does not exist yet, we can leave it as is for now.
 */
public class QuaternaryOp extends Hop implements MultiThreadedHop
{
    //config influencing mr operator selection (for testing purposes only)
    public static boolean FORCE_REPLICATION = false;

    private OpOp4 _op = null;
    private int _maxNumThreads = -1; //-1 for unlimited

    //wsloss-specific attributes
    private boolean _postWeights = false;

    //wsigmoid-specific attributes
    private boolean _logout = false;
    private boolean _minusin = false;

    //wdivmm-specific attributes
    private int _baseType = -1;
    private boolean _mult = false;
    private boolean _minus = false;

    //wumm-specific attributes
    private boolean _umult = false;
    private OpOp1 _uop = null;
    private OpOp2 _sop = null;

    private QuaternaryOp() {
        //default constructor for clone
    }
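    /*
     * Informal overview of the fused operator patterns (only a rough guide; the exact
     * variants are defined by the respective weighted lop types):
     *   WSLOSS   e.g., sum(W * (X - U %*% t(V))^2)     -> scalar output
     *   WSIGMOID e.g., X * sigmoid(U %*% t(V))         -> matrix output
     *   WDIVMM   e.g., t(t(U) %*% (W / (U %*% t(V))))  -> matrix output (left/right variants)
     *   WCEMM    e.g., sum(X * log(U %*% t(V)))        -> scalar output
     *   WUMM     e.g., X * uop(U %*% t(V))             -> matrix output
     */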
    /**
     * Constructor for wsloss.
     *
     * @param l hop name
     * @param dt data type
     * @param vt value type
     * @param o the Hop.OpOp4
     * @param inX high-level operator X
     * @param inU high-level operator U
     * @param inV high-level operator V
     * @param inW high-level operator W
     * @param post post weights
     */
    public QuaternaryOp(String l, DataType dt, ValueType vt, Hop.OpOp4 o,
            Hop inX, Hop inU, Hop inV, Hop inW, boolean post)
    {
        this(l, dt, vt, o, inX, inU, inV);
        getInput().add(3, inW);
        inW.getParent().add(this);

        _postWeights = post;
    }

    /**
     * Constructor for wsigmoid.
     *
     * @param l hop name
     * @param dt data type
     * @param vt value type
     * @param o the Hop.OpOp4
     * @param inX high-level operator X
     * @param inU high-level operator U
     * @param inV high-level operator V
     * @param flag1 logout
     * @param flag2 minusin
     */
    public QuaternaryOp(String l, DataType dt, ValueType vt, Hop.OpOp4 o,
            Hop inX, Hop inU, Hop inV, boolean flag1, boolean flag2)
    {
        this(l, dt, vt, o, inX, inU, inV);

        _logout = flag1;
        _minusin = flag2;
    }

    /**
     * Constructor for wdivmm (the fourth input is optional, depending on the wdivmm variant).
     */
    public QuaternaryOp(String l, DataType dt, ValueType vt, Hop.OpOp4 o,
            Hop inX, Hop inU, Hop inV, Hop inW, int baseType, boolean flag1, boolean flag2)
    {
        this(l, dt, vt, o, inX, inU, inV);

        if( inW != null ) { //four inputs
            getInput().add(3, inW);
            inW.getParent().add(this);
        }

        _baseType = baseType;
        _mult = flag1;
        _minus = flag2;
    }

    /**
     * Constructor for wumm (note: the first input is the matrix W).
     */
    public QuaternaryOp(String l, DataType dt, ValueType vt, Hop.OpOp4 o,
            Hop inW, Hop inU, Hop inV, boolean umult, OpOp1 uop, OpOp2 sop)
    {
        this(l, dt, vt, o, inW, inU, inV);

        _umult = umult;
        _uop = uop;
        _sop = sop;
    }

    public QuaternaryOp(String l, DataType dt, ValueType vt, Hop.OpOp4 o,
            Hop inX, Hop inU, Hop inV)
    {
        super(l, dt, vt);
        _op = o;
        getInput().add(0, inX);
        getInput().add(1, inU);
        getInput().add(2, inV);
        inX.getParent().add(this);
        inU.getParent().add(this);
        inV.getParent().add(this);
    }

    public OpOp4 getOp(){
        return _op;
    }

    @Override
    public void setMaxNumThreads( int k ) {
        _maxNumThreads = k;
    }

    @Override
    public int getMaxNumThreads() {
        return _maxNumThreads;
    }

    @Override
    public Lop constructLops()
        throws HopsException, LopsException
    {
        //return already created lops
        if( getLops() != null )
            return getLops();

        try
        {
            ExecType et = optFindExecType();

            switch( _op ) {
                case WSLOSS: {
                    WeightsType wtype = checkWeightsType();

                    if( et == ExecType.CP )
                        constructCPLopsWeightedSquaredLoss(wtype);
                    else if( et == ExecType.MR )
                        constructMRLopsWeightedSquaredLoss(wtype);
                    else if( et == ExecType.SPARK )
                        constructSparkLopsWeightedSquaredLoss(wtype);
                    else
                        throw new HopsException("Unsupported quaternaryop-wsloss exec type: "+et);
                    break;
                }

                case WSIGMOID: {
                    WSigmoidType wtype = checkWSigmoidType();

                    if( et == ExecType.CP )
                        constructCPLopsWeightedSigmoid(wtype);
                    else if( et == ExecType.MR )
                        constructMRLopsWeightedSigmoid(wtype);
                    else if( et == ExecType.SPARK )
                        constructSparkLopsWeightedSigmoid(wtype);
                    else
                        throw new HopsException("Unsupported quaternaryop-wsigmoid exec type: "+et);
                    break;
                }

                case WDIVMM: {
                    WDivMMType wtype = checkWDivMMType();

                    if( et == ExecType.CP )
                        constructCPLopsWeightedDivMM(wtype);
                    else if( et == ExecType.MR )
                        constructMRLopsWeightedDivMM(wtype);
                    else if( et == ExecType.SPARK )
                        constructSparkLopsWeightedDivMM(wtype);
                    else
                        throw new HopsException("Unsupported quaternaryop-wdivmm exec type: "+et);
                    break;
                }

                case WCEMM: {
                    WCeMMType wtype = checkWCeMMType();

                    if( et == ExecType.CP )
                        constructCPLopsWeightedCeMM(wtype);
                    else if( et == ExecType.MR )
                        constructMRLopsWeightedCeMM(wtype);
                    else if( et == ExecType.SPARK )
                        constructSparkLopsWeightedCeMM(wtype);
                    else
                        throw new HopsException("Unsupported quaternaryop-wcemm exec type: "+et);
                    break;
                }

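                //note: wumm derives its type inline from the _umult flag (mult vs div)
                //instead of a dedicated check method, as only this flag distinguishes variants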
                case WUMM: {
                    WUMMType wtype = _umult ? WUMMType.MULT : WUMMType.DIV;

                    if( et == ExecType.CP )
                        constructCPLopsWeightedUMM(wtype);
                    else if( et == ExecType.MR )
                        constructMRLopsWeightedUMM(wtype);
                    else if( et == ExecType.SPARK )
                        constructSparkLopsWeightedUMM(wtype);
                    else
                        throw new HopsException("Unsupported quaternaryop-wumm exec type: "+et);
                    break;
                }

                default:
                    throw new HopsException(this.printErrorLocation() + "Unknown QuaternaryOp (" + _op + ") while constructing Lops");
            }
        }
        catch(LopsException e) {
            throw new HopsException(this.printErrorLocation() + "error constructing lops for QuaternaryOp.", e);
        }

        //add reblock/checkpoint lops if necessary
        constructAndSetLopsDataFlowProperties();

        return getLops();
    }

    @Override
    public String getOpString() {
        return "q(" + HopsOpOp4String.get(_op) + ")";
    }

    @Override
    public boolean allowsAllExecTypes() {
        return true;
    }

    private void constructCPLopsWeightedSquaredLoss(WeightsType wtype)
        throws HopsException, LopsException
    {
        WeightedSquaredLoss wsloss = new WeightedSquaredLoss(
            getInput().get(0).constructLops(),
            getInput().get(1).constructLops(),
            getInput().get(2).constructLops(),
            getInput().get(3).constructLops(),
            getDataType(), getValueType(), wtype, ExecType.CP);

        //set degree of parallelism
        int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
        wsloss.setNumThreads(k);

        setOutputDimensions( wsloss );
        setLineNumbers( wsloss );
        setLops( wsloss );
    }

    private void constructMRLopsWeightedSquaredLoss(WeightsType wtype)
        throws HopsException, LopsException
    {
        //NOTE: the common case for wsloss is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted Squared Loss only if this constraint holds.
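        //rough sizing example (not authoritative): for rank-100 factors over 10^6 rows,
        //U and V are each 10^6 x 100, i.e., ~0.8GB dense, so m1Size+m2Size is ~1.6GB and
        //map-side wsloss is chosen only if this fits into the remote map memory budget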
        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);
        Hop W = getInput().get(3);

        //MR operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWsloss = (!wtype.hasFourInputs() && m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetMap(true));

        if( !FORCE_REPLICATION && isMapWsloss ) //broadcast
        {
            //partitioning of U
            boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lU = U.constructLops();
            if( needPartU ){ //requires partitioning
                lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                setLineNumbers(lU);
            }

            //partitioning of V
            boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lV = V.constructLops();
            if( needPartV ){ //requires partitioning
                lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                setLineNumbers(lV);
            }

            //map-side wsloss always with broadcast
            Lop wsloss = new WeightedSquaredLoss( X.constructLops(), lU, lV, W.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.MR);
            wsloss.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(wsloss);

            Group grp = new Group(wsloss, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grp.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(grp);

            Aggregate agg1 = new Aggregate(grp, HopsAgg2Lops.get(AggOp.SUM), DataType.MATRIX, ValueType.DOUBLE, ExecType.MR);
            agg1.setupCorrectionLocation(CorrectionLocationType.NONE); //aggregation uses kahanSum
            agg1.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(agg1);

            UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR), getDataType(), getValueType());
            unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1);
            setLineNumbers(unary1);
            setLops(unary1);
        }
        else //general case
        {
            //MR operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < OptimizerUtils.getRemoteMemBudgetReduce());
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < OptimizerUtils.getRemoteMemBudgetReduce())
                    || (cacheU && m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetReduce()));

            Group grpX = new Group(X.constructLops(), Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grpX.getOutputParameters().setDimensions(X.getDim1(), X.getDim2(), X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(grpX);

            Lop grpW = W.constructLops();
            if( grpW.getDataType()==DataType.MATRIX ) {
                grpW = new Group(W.constructLops(), Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpW.getOutputParameters().setDimensions(W.getDim1(), W.getDim2(), W.getRowsInBlock(), W.getColsInBlock(), -1);
                setLineNumbers(grpW);
            }

            Lop lU = null;
            if( cacheU ) {
                //partitioning of U for read through distributed cache
                boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lU = U.constructLops();
                if( needPartU ){ //requires partitioning
                    lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                    setLineNumbers(lU);
                }
            }
            else {
                //replication of U for shuffle to target block
                Lop offset = createOffsetLop(V, false); //ncol of t(V) -> nrow of V determines num replicates
                lU = new RepMat(U.constructLops(), offset, true, V.getDataType(), V.getValueType());
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), U.getNnz());
                setLineNumbers(lU);

                Group grpU = new Group(lU, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), -1);
                setLineNumbers(grpU);
                lU = grpU;
            }

            Lop lV = null;
            if( cacheV ) {
                //partitioning of V for read through distributed cache
                boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lV = V.constructLops();
                if( needPartV ){ //requires partitioning
                    lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                    setLineNumbers(lV);
                }
            }
            else {
                //replication of t(V) for shuffle to target block
                Transform ltV = new Transform( V.constructLops(), HopsTransf2Lops.get(ReOrgOp.TRANSPOSE), getDataType(), getValueType(), ExecType.MR);
                ltV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(ltV);

                Lop offset = createOffsetLop(U, false); //nrow of U determines num replicates
                lV = new RepMat(ltV, offset, false, V.getDataType(), V.getValueType());
                lV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(lV);

                Group grpV = new Group(lV, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), -1);
                setLineNumbers(grpV);
                lV = grpV;
            }

            //reduce-side wsloss w/ or without broadcast
            Lop wsloss = new WeightedSquaredLossR( grpX, lU, lV, grpW,
                DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.MR);
            wsloss.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(wsloss);

            Group grp = new Group(wsloss, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grp.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(grp);

            Aggregate agg1 = new Aggregate(grp, HopsAgg2Lops.get(AggOp.SUM), DataType.MATRIX, ValueType.DOUBLE, ExecType.MR);
            agg1.setupCorrectionLocation(CorrectionLocationType.NONE); //aggregation uses kahanSum
            agg1.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(agg1);

            UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR), getDataType(), getValueType());
            unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1);
            setLineNumbers(unary1);
            setLops(unary1);
        }
    }
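    /**
     * Spark lop construction for wsloss; mirrors the MR operator selection, but checks the
     * executor broadcast budget and twice the local budget, because broadcast inputs are
     * partitioned in CP before being broadcast.
     */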
    private void constructSparkLopsWeightedSquaredLoss(WeightsType wtype)
        throws HopsException, LopsException
    {
        //NOTE: the common case for wsloss is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted Squared Loss only if this constraint holds.

        //Notes: Any broadcast needs to fit twice in local memory because we partition the input in cp,
        //and needs to fit once in executor broadcast memory. The 2GB broadcast constraint is no longer
        //required because the max_int byte buffer constraint has been fixed in Spark 1.4.
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);
        Hop W = getInput().get(3);

        //Spark operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWsloss = (!wtype.hasFourInputs() && m1Size+m2Size < memBudgetExec
                && 2*m1Size < memBudgetLocal && 2*m2Size < memBudgetLocal);

        if( !FORCE_REPLICATION && isMapWsloss ) //broadcast
        {
            //map-side wsloss always with broadcast
            Lop wsloss = new WeightedSquaredLoss( X.constructLops(), U.constructLops(), V.constructLops(), W.constructLops(),
                DataType.SCALAR, ValueType.DOUBLE, wtype, ExecType.SPARK);
            setOutputDimensions(wsloss);
            setLineNumbers(wsloss);
            setLops(wsloss);
        }
        else //general case
        {
            //Spark operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < memBudgetExec && 2*m1Size < memBudgetLocal);
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < memBudgetExec)
                    || (cacheU && m1Size+m2Size < memBudgetExec)) && 2*m2Size < memBudgetLocal;

            //reduce-side wsloss w/ or without broadcast
            Lop wsloss = new WeightedSquaredLossR( X.constructLops(), U.constructLops(), V.constructLops(), W.constructLops(),
                DataType.SCALAR, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.SPARK);
            setOutputDimensions(wsloss);
            setLineNumbers(wsloss);
            setLops(wsloss);
        }
    }

    private void constructCPLopsWeightedSigmoid(WSigmoidType wtype)
        throws HopsException, LopsException
    {
        WeightedSigmoid wsig = new WeightedSigmoid(
            getInput().get(0).constructLops(),
            getInput().get(1).constructLops(),
            getInput().get(2).constructLops(),
            getDataType(), getValueType(), wtype, ExecType.CP);

        //set degree of parallelism
        int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
        wsig.setNumThreads(k);

        setOutputDimensions( wsig );
        setLineNumbers( wsig );
        setLops( wsig );
    }

    private void constructMRLopsWeightedSigmoid( WSigmoidType wtype )
        throws HopsException, LopsException
    {
        //NOTE: the common case for wsigmoid is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted Sigmoid only if this constraint holds.
        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);

        //MR operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWsig = (m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetMap(true));

        if( !FORCE_REPLICATION && isMapWsig ) //broadcast
        {
            //partitioning of U
            boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lU = U.constructLops();
            if( needPartU ){ //requires partitioning
                lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                setLineNumbers(lU);
            }

            //partitioning of V
            boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lV = V.constructLops();
            if( needPartV ){ //requires partitioning
                lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                setLineNumbers(lV);
            }

            //map-side wsig always with broadcast
            Lop wsigmoid = new WeightedSigmoid( X.constructLops(), lU, lV,
                DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.MR);
            setOutputDimensions(wsigmoid);
            setLineNumbers(wsigmoid);
            setLops( wsigmoid ); //in contrast to wsloss no aggregation required
        }
        else //general case
        {
            //MR operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < OptimizerUtils.getRemoteMemBudgetReduce());
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < OptimizerUtils.getRemoteMemBudgetReduce())
                    || (cacheU && m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetReduce()));

            Group grpX = new Group(X.constructLops(), Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grpX.getOutputParameters().setDimensions(X.getDim1(), X.getDim2(), X.getRowsInBlock(), X.getColsInBlock(), X.getNnz());
            setLineNumbers(grpX);

            Lop lU = null;
            if( cacheU ) {
                //partitioning of U for read through distributed cache
                boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lU = U.constructLops();
                if( needPartU ){ //requires partitioning
                    lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                    setLineNumbers(lU);
                }
            }
            else {
                //replication of U for shuffle to target block
                Lop offset = createOffsetLop(V, false); //ncol of t(V) -> nrow of V determines num replicates
                lU = new RepMat(U.constructLops(), offset, true, V.getDataType(), V.getValueType());
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), U.getNnz());
                setLineNumbers(lU);

                Group grpU = new Group(lU, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), -1);
                setLineNumbers(grpU);
                lU = grpU;
            }

            Lop lV = null;
            if( cacheV ) {
                //partitioning of V for read through distributed cache
                boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lV = V.constructLops();
                if( needPartV ){ //requires partitioning
                    lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                    setLineNumbers(lV);
                }
            }
            else {
                //replication of t(V) for shuffle to target block
                Transform ltV = new Transform( V.constructLops(), HopsTransf2Lops.get(ReOrgOp.TRANSPOSE), getDataType(), getValueType(), ExecType.MR);
                ltV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(ltV);

                Lop offset = createOffsetLop(U, false); //nrow of U determines num replicates
                lV = new RepMat(ltV, offset, false, V.getDataType(), V.getValueType());
                lV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(lV);

                Group grpV = new Group(lV, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), -1);
                setLineNumbers(grpV);
                lV = grpV;
            }

            //reduce-side wsig w/ or without broadcast
            Lop wsigmoid = new WeightedSigmoidR( grpX, lU, lV,
                DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.MR);
            setOutputDimensions(wsigmoid);
            setLineNumbers(wsigmoid);
            setLops(wsigmoid); //in contrast to wsloss no aggregation required
        }
    }

    private void constructSparkLopsWeightedSigmoid( WSigmoidType wtype )
        throws HopsException, LopsException
    {
        //NOTE: the common case for wsigmoid is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted Sigmoid only if this constraint holds.

        //Notes: Any broadcast needs to fit twice in local memory because we partition the input in cp,
        //and needs to fit once in executor broadcast memory.
        //The 2GB broadcast constraint is no longer required because the max_int byte buffer
        //constraint has been fixed in Spark 1.4.
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);

        //Spark operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWsig = (m1Size+m2Size < memBudgetExec && 2*m1Size < memBudgetLocal && 2*m2Size < memBudgetLocal);

        if( !FORCE_REPLICATION && isMapWsig ) //broadcast
        {
            //map-side wsig always with broadcast
            Lop wsigmoid = new WeightedSigmoid( X.constructLops(), U.constructLops(), V.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.SPARK);
            setOutputDimensions(wsigmoid);
            setLineNumbers(wsigmoid);
            setLops( wsigmoid );
        }
        else //general case
        {
            //Spark operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < memBudgetExec && 2*m1Size < memBudgetLocal);
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < memBudgetExec)
                    || (cacheU && m1Size+m2Size < memBudgetExec)) && 2*m2Size < memBudgetLocal;

            //reduce-side wsig w/ or without broadcast
            Lop wsigmoid = new WeightedSigmoidR( X.constructLops(), U.constructLops(), V.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.SPARK);
            setOutputDimensions(wsigmoid);
            setLineNumbers(wsigmoid);
            setLops(wsigmoid);
        }
    }

    private void constructCPLopsWeightedDivMM(WDivMMType wtype)
        throws HopsException, LopsException
    {
        WeightedDivMM wdiv = new WeightedDivMM(
            getInput().get(0).constructLops(),
            getInput().get(1).constructLops(),
            getInput().get(2).constructLops(),
            getInput().get(3).constructLops(),
            getDataType(), getValueType(), wtype, ExecType.CP);

        //set degree of parallelism
        int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
        wdiv.setNumThreads(k);

        setOutputDimensions( wdiv );
        setLineNumbers( wdiv );
        setLops( wdiv );
    }

    private void constructMRLopsWeightedDivMM( WDivMMType wtype )
        throws HopsException, LopsException
    {
        //NOTE: the common case for wdivmm is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted DivMM only if this constraint holds.
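        //in contrast to wsloss/wcemm (scalar outputs), wdivmm produces a matrix output
        //(dims of V for left variants, dims of U for right variants), hence the final
        //grouped partial aggregation at the end of this method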
        Hop W = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);
        Hop X = getInput().get(3);

        //MR operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWdivmm = ((!wtype.hasFourInputs() || wtype.hasScalar())
                && m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetMap(true));

        if( !FORCE_REPLICATION && isMapWdivmm ) //broadcast
        {
            //partitioning of U
            boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lU = U.constructLops();
            if( needPartU ){ //requires partitioning
                lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                setLineNumbers(lU);
            }

            //partitioning of V
            boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lV = V.constructLops();
            if( needPartV ){ //requires partitioning
                lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                setLineNumbers(lV);
            }

            //map-side wdivmm always with broadcast
            Lop wdivmm = new WeightedDivMM( W.constructLops(), lU, lV, X.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.MR);
            setOutputDimensions(wdivmm);
            setLineNumbers(wdivmm);
            setLops(wdivmm);
        }
        else //general case
        {
            //MR operator selection part 2 (both cannot happen for wdivmm, otherwise mapwdivmm)
            boolean cacheU = !FORCE_REPLICATION && (m1Size < OptimizerUtils.getRemoteMemBudgetReduce());
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < OptimizerUtils.getRemoteMemBudgetReduce())
                    || (cacheU && m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetReduce()));

            Group grpW = new Group(W.constructLops(), Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grpW.getOutputParameters().setDimensions(W.getDim1(), W.getDim2(), W.getRowsInBlock(), W.getColsInBlock(), W.getNnz());
            setLineNumbers(grpW);

            Lop grpX = X.constructLops();
            if( wtype.hasFourInputs() && (X.getDataType() != DataType.SCALAR) )
                grpX = new Group(grpX, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grpX.getOutputParameters().setDimensions(X.getDim1(), X.getDim2(), X.getRowsInBlock(), X.getColsInBlock(), X.getNnz());
            setLineNumbers(grpX);

            Lop lU = null;
            if( cacheU ) {
                //partitioning of U for read through distributed cache
                boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lU = U.constructLops();
                if( needPartU ){ //requires partitioning
                    lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                    setLineNumbers(lU);
                }
            }
            else {
                //replication of U for shuffle to target block
                Lop offset = createOffsetLop(V, false); //ncol of t(V) -> nrow of V determines num replicates
                lU = new RepMat(U.constructLops(), offset, true, V.getDataType(), V.getValueType());
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), U.getNnz());
                setLineNumbers(lU);

                Group grpU = new Group(lU, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), -1);
                setLineNumbers(grpU);
                lU = grpU;
            }

            Lop lV = null;
            if( cacheV ) {
                //partitioning of V for read through distributed cache
                boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lV = V.constructLops();
                if( needPartV ){ //requires partitioning
                    lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                    setLineNumbers(lV);
                }
            }
            else {
                //replication of t(V) for shuffle to target block
                Transform ltV = new Transform( V.constructLops(), HopsTransf2Lops.get(ReOrgOp.TRANSPOSE), getDataType(), getValueType(), ExecType.MR);
                ltV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(ltV);

                Lop offset = createOffsetLop(U, false); //nrow of U determines num replicates
                lV = new RepMat(ltV, offset, false, V.getDataType(), V.getValueType());
                lV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(lV);

                Group grpV = new Group(lV, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), -1);
                setLineNumbers(grpV);
                lV = grpV;
            }

            //reduce-side wdivmm w/ or without broadcast
            Lop wdivmm = new WeightedDivMMR( grpW, lU, lV, grpX,
                DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.MR);
            setOutputDimensions(wdivmm);
            setLineNumbers(wdivmm);
            setLops(wdivmm);
        }

        //in contrast to wsloss/wsigmoid, wdivmm requires partial aggregation (for the final mm)
        Group grp = new Group(getLops(), Group.OperationTypes.Sort, getDataType(), getValueType());
        setOutputDimensions(grp);
        setLineNumbers(grp);

        Aggregate agg1 = new Aggregate(grp, HopsAgg2Lops.get(AggOp.SUM), getDataType(), getValueType(), ExecType.MR);
        //aggregation uses kahanSum but the inputs do not have correction values
        agg1.setupCorrectionLocation(CorrectionLocationType.NONE);
        setOutputDimensions(agg1);
        setLineNumbers(agg1);
        setLops(agg1);
    }

    private void constructSparkLopsWeightedDivMM( WDivMMType wtype )
        throws HopsException, LopsException
    {
        //NOTE: the common case for wdivmm is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted DivMM only if this constraint holds.

        //Notes: Any broadcast needs to fit twice in local memory because we partition the input in cp,
        //and needs to fit once in executor broadcast memory.
        //The 2GB broadcast constraint is no longer required because the max_int byte buffer
        //constraint has been fixed in Spark 1.4.
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        Hop W = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);
        Hop X = getInput().get(3);

        //Spark operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWdivmm = ((!wtype.hasFourInputs() || wtype.hasScalar())
                && m1Size+m2Size < memBudgetExec && 2*m1Size < memBudgetLocal && 2*m2Size < memBudgetLocal);

        if( !FORCE_REPLICATION && isMapWdivmm ) //broadcast
        {
            //map-side wdivmm always with broadcast
            Lop wdivmm = new WeightedDivMM( W.constructLops(), U.constructLops(), V.constructLops(), X.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.SPARK);
            setOutputDimensions(wdivmm);
            setLineNumbers(wdivmm);
            setLops( wdivmm );
        }
        else //general case
        {
            //Spark operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < memBudgetExec && 2*m1Size < memBudgetLocal);
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < memBudgetExec)
                    || (cacheU && m1Size+m2Size < memBudgetExec)) && 2*m2Size < memBudgetLocal;

            //reduce-side wdivmm w/ or without broadcast
            Lop wdivmm = new WeightedDivMMR( W.constructLops(), U.constructLops(), V.constructLops(), X.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.SPARK);
            setOutputDimensions(wdivmm);
            setLineNumbers(wdivmm);
            setLops(wdivmm);
        }
    }

    private void constructCPLopsWeightedCeMM(WCeMMType wtype)
        throws HopsException, LopsException
    {
        WeightedCrossEntropy wcemm = new WeightedCrossEntropy(
            getInput().get(0).constructLops(),
            getInput().get(1).constructLops(),
            getInput().get(2).constructLops(),
            getInput().get(3).constructLops(),
            getDataType(), getValueType(), wtype, ExecType.CP);

        //set degree of parallelism
        int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
        wcemm.setNumThreads(k);

        setOutputDimensions( wcemm );
        setLineNumbers( wcemm );
        setLops( wcemm );
    }

    private void constructMRLopsWeightedCeMM(WCeMMType wtype)
        throws HopsException, LopsException
    {
        //NOTE: the common case for wcemm is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted Cross Entropy only if this constraint holds.
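        //like wsloss, wcemm yields a scalar, so both branches below end with
        //group, sum-aggregate (kahanSum), and a final cast-as-scalar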
        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);
        Hop eps = getInput().get(3);

        //MR operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWcemm = (m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetMap(true));

        if( !FORCE_REPLICATION && isMapWcemm ) //broadcast
        {
            //partitioning of U
            boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lU = U.constructLops();
            if( needPartU ){ //requires partitioning
                lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                setLineNumbers(lU);
            }

            //partitioning of V
            boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lV = V.constructLops();
            if( needPartV ){ //requires partitioning
                lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                setLineNumbers(lV);
            }

            //map-side wcemm always with broadcast
            Lop wcemm = new WeightedCrossEntropy( X.constructLops(), lU, lV, eps.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.MR);
            wcemm.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(wcemm);

            Group grp = new Group(wcemm, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grp.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(grp);

            Aggregate agg1 = new Aggregate(grp, HopsAgg2Lops.get(AggOp.SUM), DataType.MATRIX, ValueType.DOUBLE, ExecType.MR);
            agg1.setupCorrectionLocation(CorrectionLocationType.NONE); //aggregation uses kahanSum
            agg1.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(agg1);

            UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR), getDataType(), getValueType());
            unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1);
            setLineNumbers(unary1);
            setLops(unary1);
        }
        else //general case
        {
            //MR operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < OptimizerUtils.getRemoteMemBudgetReduce());
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < OptimizerUtils.getRemoteMemBudgetReduce())
                    || (cacheU && m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetReduce()));

            Group grpX = new Group(X.constructLops(), Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grpX.getOutputParameters().setDimensions(X.getDim1(), X.getDim2(), X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(grpX);

            Lop lU = null;
            if( cacheU ) {
                //partitioning of U for read through distributed cache
                boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lU = U.constructLops();
                if( needPartU ){ //requires partitioning
                    lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                    setLineNumbers(lU);
                }
            }
            else {
                //replication of U for shuffle to target block
                Lop offset = createOffsetLop(V, false); //ncol of t(V) -> nrow of V determines num replicates
                lU = new RepMat(U.constructLops(), offset, true, V.getDataType(), V.getValueType());
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), U.getNnz());
                setLineNumbers(lU);

                Group grpU = new Group(lU, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), -1);
                setLineNumbers(grpU);
                lU = grpU;
            }

            Lop lV = null;
            if( cacheV ) {
                //partitioning of V for read through distributed cache
                boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lV = V.constructLops();
                if( needPartV ){ //requires partitioning
                    lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                    setLineNumbers(lV);
                }
            }
            else {
                //replication of t(V) for shuffle to target block
                Transform ltV = new Transform( V.constructLops(), HopsTransf2Lops.get(ReOrgOp.TRANSPOSE), getDataType(), getValueType(), ExecType.MR);
                ltV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(ltV);

                Lop offset = createOffsetLop(U, false); //nrow of U determines num replicates
                lV = new RepMat(ltV, offset, false, V.getDataType(), V.getValueType());
                lV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(lV);

                Group grpV = new Group(lV, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), -1);
                setLineNumbers(grpV);
                lV = grpV;
            }

            //reduce-side wcemm w/ or without broadcast
            Lop wcemm = new WeightedCrossEntropyR( grpX, lU, lV, eps.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.MR);
            wcemm.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(wcemm);

            Group grp = new Group(wcemm, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grp.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(grp);

            Aggregate agg1 = new Aggregate(grp, HopsAgg2Lops.get(AggOp.SUM), DataType.MATRIX, ValueType.DOUBLE, ExecType.MR);
            agg1.setupCorrectionLocation(CorrectionLocationType.NONE); //aggregation uses kahanSum
            agg1.getOutputParameters().setDimensions(1, 1, X.getRowsInBlock(), X.getColsInBlock(), -1);
            setLineNumbers(agg1);

            UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR), getDataType(), getValueType());
            unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1);
            setLineNumbers(unary1);
            setLops(unary1);
        }
    }

    private void constructSparkLopsWeightedCeMM(WCeMMType wtype)
        throws HopsException, LopsException
    {
        //NOTE: the common case for wcemm is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted Cross Entropy only if this constraint holds.
        //Notes: Any broadcast needs to fit twice in local memory because we partition the input in cp,
        //and needs to fit once in executor broadcast memory. The 2GB broadcast constraint is no longer
        //required because the max_int byte buffer constraint has been fixed in Spark 1.4.
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);
        Hop eps = getInput().get(3);

        //Spark operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWcemm = (m1Size+m2Size < memBudgetExec && 2*m1Size < memBudgetLocal && 2*m2Size < memBudgetLocal);

        if( !FORCE_REPLICATION && isMapWcemm ) //broadcast
        {
            //map-side wcemm always with broadcast
            Lop wcemm = new WeightedCrossEntropy( X.constructLops(), U.constructLops(), V.constructLops(), eps.constructLops(),
                DataType.SCALAR, ValueType.DOUBLE, wtype, ExecType.SPARK);
            setOutputDimensions(wcemm);
            setLineNumbers(wcemm);
            setLops(wcemm);
        }
        else //general case
        {
            //Spark operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < memBudgetExec && 2*m1Size < memBudgetLocal);
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < memBudgetExec)
                    || (cacheU && m1Size+m2Size < memBudgetExec)) && 2*m2Size < memBudgetLocal;

            //reduce-side wcemm w/ or without broadcast
            Lop wcemm = new WeightedCrossEntropyR( X.constructLops(), U.constructLops(), V.constructLops(), eps.constructLops(),
                DataType.SCALAR, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.SPARK);
            setOutputDimensions(wcemm);
            setLineNumbers(wcemm);
            setLops(wcemm);
        }
    }

    private void constructCPLopsWeightedUMM(WUMMType wtype)
        throws HopsException, LopsException
    {
        Unary.OperationTypes uop = _uop!=null ? HopsOpOp1LopsU.get(_uop) :
            _sop==OpOp2.POW ? Unary.OperationTypes.POW2 : Unary.OperationTypes.MULTIPLY2;

        WeightedUnaryMM wumm = new WeightedUnaryMM(
            getInput().get(0).constructLops(),
            getInput().get(1).constructLops(),
            getInput().get(2).constructLops(),
            getDataType(), getValueType(), wtype, uop, ExecType.CP);

        //set degree of parallelism
        int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
        wumm.setNumThreads(k);

        setOutputDimensions( wumm );
        setLineNumbers( wumm );
        setLops( wumm );
    }

    private void constructMRLopsWeightedUMM( WUMMType wtype )
        throws HopsException, LopsException
    {
        //NOTE: the common case for wumm is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted UnaryMM only if this constraint holds.

        Unary.OperationTypes uop = _uop!=null ? HopsOpOp1LopsU.get(_uop) :
            _sop==OpOp2.POW ? Unary.OperationTypes.POW2 : Unary.OperationTypes.MULTIPLY2;
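        //note on the uop mapping above: a named unary op maps directly via HopsOpOp1LopsU;
        //otherwise the scalar op is specialized to POW2 (for pow by 2) or MULTIPLY2 (for mult by 2)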
        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);

        //MR operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWumm = (m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetMap(true));

        if( !FORCE_REPLICATION && isMapWumm ) //broadcast
        {
            //partitioning of U
            boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lU = U.constructLops();
            if( needPartU ){ //requires partitioning
                lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                setLineNumbers(lU);
            }

            //partitioning of V
            boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
            Lop lV = V.constructLops();
            if( needPartV ){ //requires partitioning
                lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                setLineNumbers(lV);
            }

            //map-side wumm always with broadcast
            Lop wumm = new WeightedUnaryMM( X.constructLops(), lU, lV,
                DataType.MATRIX, ValueType.DOUBLE, wtype, uop, ExecType.MR);
            setOutputDimensions(wumm);
            setLineNumbers(wumm);
            setLops( wumm ); //in contrast to wsloss no aggregation required
        }
        else //general case
        {
            //MR operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < OptimizerUtils.getRemoteMemBudgetReduce());
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < OptimizerUtils.getRemoteMemBudgetReduce())
                    || (cacheU && m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetReduce()));

            Group grpX = new Group(X.constructLops(), Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
            grpX.getOutputParameters().setDimensions(X.getDim1(), X.getDim2(), X.getRowsInBlock(), X.getColsInBlock(), X.getNnz());
            setLineNumbers(grpX);

            Lop lU = null;
            if( cacheU ) {
                //partitioning of U for read through distributed cache
                boolean needPartU = !U.dimsKnown() || U.getDim1() * U.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lU = U.constructLops();
                if( needPartU ){ //requires partitioning
                    lU = new DataPartition(lU, DataType.MATRIX, ValueType.DOUBLE, (m1Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), getRowsInBlock(), getColsInBlock(), U.getNnz());
                    setLineNumbers(lU);
                }
            }
            else {
                //replication of U for shuffle to target block
                Lop offset = createOffsetLop(V, false); //ncol of t(V) -> nrow of V determines num replicates
                lU = new RepMat(U.constructLops(), offset, true, V.getDataType(), V.getValueType());
                lU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), U.getNnz());
                setLineNumbers(lU);

                Group grpU = new Group(lU, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpU.getOutputParameters().setDimensions(U.getDim1(), U.getDim2(), U.getRowsInBlock(), U.getColsInBlock(), -1);
                setLineNumbers(grpU);
                lU = grpU;
            }

            Lop lV = null;
            if( cacheV ) {
                //partitioning of V for read through distributed cache
                boolean needPartV = !V.dimsKnown() || V.getDim1() * V.getDim2() > DistributedCacheInput.PARTITION_SIZE;
                lV = V.constructLops();
                if( needPartV ){ //requires partitioning
                    lV = new DataPartition(lV, DataType.MATRIX, ValueType.DOUBLE, (m2Size>OptimizerUtils.getLocalMemBudget())?ExecType.MR:ExecType.CP, PDataPartitionFormat.ROW_BLOCK_WISE_N);
                    lV.getOutputParameters().setDimensions(V.getDim1(), V.getDim2(), getRowsInBlock(), getColsInBlock(), V.getNnz());
                    setLineNumbers(lV);
                }
            }
            else {
                //replication of t(V) for shuffle to target block
                Transform ltV = new Transform( V.constructLops(), HopsTransf2Lops.get(ReOrgOp.TRANSPOSE), getDataType(), getValueType(), ExecType.MR);
                ltV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(ltV);

                Lop offset = createOffsetLop(U, false); //nrow of U determines num replicates
                lV = new RepMat(ltV, offset, false, V.getDataType(), V.getValueType());
                lV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), V.getNnz());
                setLineNumbers(lV);

                Group grpV = new Group(lV, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
                grpV.getOutputParameters().setDimensions(V.getDim2(), V.getDim1(), V.getColsInBlock(), V.getRowsInBlock(), -1);
                setLineNumbers(grpV);
                lV = grpV;
            }

            //reduce-side wumm w/ or without broadcast
            Lop wumm = new WeightedUnaryMMR( grpX, lU, lV,
                DataType.MATRIX, ValueType.DOUBLE, wtype, uop, cacheU, cacheV, ExecType.MR);
            setOutputDimensions(wumm);
            setLineNumbers(wumm);
            setLops(wumm); //in contrast to wsloss no aggregation required
        }
    }

    private void constructSparkLopsWeightedUMM( WUMMType wtype )
        throws HopsException, LopsException
    {
        //NOTE: the common case for wumm is factors U/V with a rank of 10s to 100s; the current runtime only
        //supports single block outer products (U/V rank <= blocksize, i.e., 1000 by default); we enforce this
        //by applying the hop rewrite for Weighted UnaryMM only if this constraint holds.

        Unary.OperationTypes uop = _uop!=null ? HopsOpOp1LopsU.get(_uop) :
            _sop==OpOp2.POW ? Unary.OperationTypes.POW2 : Unary.OperationTypes.MULTIPLY2;

        //Notes: Any broadcast needs to fit twice in local memory because we partition the input in cp,
        //and needs to fit once in executor broadcast memory.
        //The 2GB broadcast constraint is no longer required because the max_int byte buffer
        //constraint has been fixed in Spark 1.4.
        double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
        double memBudgetLocal = OptimizerUtils.getLocalMemBudget();

        Hop X = getInput().get(0);
        Hop U = getInput().get(1);
        Hop V = getInput().get(2);

        //Spark operator selection, part1
        double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
        double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
        boolean isMapWumm = (m1Size+m2Size < memBudgetExec && 2*m1Size < memBudgetLocal && 2*m2Size < memBudgetLocal);

        if( !FORCE_REPLICATION && isMapWumm ) //broadcast
        {
            //map-side wumm always with broadcast
            Lop wumm = new WeightedUnaryMM( X.constructLops(), U.constructLops(), V.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, uop, ExecType.SPARK);
            setOutputDimensions(wumm);
            setLineNumbers(wumm);
            setLops( wumm );
        }
        else //general case
        {
            //Spark operator selection part 2
            boolean cacheU = !FORCE_REPLICATION && (m1Size < memBudgetExec && 2*m1Size < memBudgetLocal);
            boolean cacheV = !FORCE_REPLICATION && ((!cacheU && m2Size < memBudgetExec)
                    || (cacheU && m1Size+m2Size < memBudgetExec)) && 2*m2Size < memBudgetLocal;

            //reduce-side wumm w/ or without broadcast
            Lop wumm = new WeightedUnaryMMR( X.constructLops(), U.constructLops(), V.constructLops(),
                DataType.MATRIX, ValueType.DOUBLE, wtype, uop, cacheU, cacheV, ExecType.SPARK);
            setOutputDimensions(wumm);
            setLineNumbers(wumm);
            setLops(wumm);
        }
    }

    private WeightsType checkWeightsType() {
        WeightsType ret = WeightsType.NONE;
        if( !(getInput().get(3) instanceof LiteralOp) ){
            if( _postWeights )
                ret = WeightsType.POST;
            else
                ret = WeightsType.PRE;
        }
        else if( _postWeights ){
            ret = WeightsType.POST_NZ;
        }
        return ret;
    }

    private WSigmoidType checkWSigmoidType() {
        if( _logout && _minusin )
            return WSigmoidType.LOG_MINUS;
        else if( _logout )
            return WSigmoidType.LOG;
        else if( _minusin )
            return WSigmoidType.MINUS;
        else
            return WSigmoidType.BASIC;
    }

    private WDivMMType checkWDivMMType() {
        switch( _baseType ) {
            case 0: //BASIC
                return WDivMMType.MULT_BASIC;
            case 1: //LEFT
                if( getInput().get(3).getDataType()==DataType.MATRIX )
                    return WDivMMType.MULT_MINUS_4_LEFT;
                else if( _minus )
                    return WDivMMType.MULT_MINUS_LEFT;
                else
                    return _mult ? WDivMMType.MULT_LEFT : WDivMMType.DIV_LEFT;
            case 2: //RIGHT
                if( getInput().get(3).getDataType()==DataType.MATRIX )
                    return WDivMMType.MULT_MINUS_4_RIGHT;
                else if( _minus )
                    return WDivMMType.MULT_MINUS_RIGHT;
                else
                    return _mult ? WDivMMType.MULT_RIGHT : WDivMMType.DIV_RIGHT;
            case 3: //LEFT w/ EPS
                return WDivMMType.DIV_LEFT_EPS;
            case 4: //RIGHT w/ EPS
                return WDivMMType.DIV_RIGHT_EPS;
        }
        return null;
    }
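    /**
     * Type selection for wcemm: baseType 1 presumably denotes the epsilon variant
     * (log over U %*% t(V) plus eps); otherwise the basic variant applies.
     */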
    private WCeMMType checkWCeMMType() {
        return _baseType == 1 ? WCeMMType.BASIC_EPS : WCeMMType.BASIC;
    }

    @Override
    protected double computeOutputMemEstimate( long dim1, long dim2, long nnz )
    {
        switch( _op ) {
            case WSLOSS: //always scalar output
            case WCEMM:
                return OptimizerUtils.DOUBLE_SIZE;

            case WSIGMOID:
            case WDIVMM:
            case WUMM:
                double sp = OptimizerUtils.getSparsity(dim1, dim2, nnz);
                return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, sp);

            default:
                return 0;
        }
    }

    @Override
    protected double computeIntermediateMemEstimate( long dim1, long dim2, long nnz )
    {
        //no intermediates
        return 0;
    }

    @Override
    protected long[] inferOutputCharacteristics( MemoTable memo )
    {
        long[] ret = null;

        switch( _op ) {
            case WSLOSS: //always scalar output
                ret = null;
                break;

            case WSIGMOID:
            case WUMM: {
                MatrixCharacteristics mcW = memo.getAllInputStats(getInput().get(0));
                ret = new long[]{mcW.getRows(), mcW.getCols(), mcW.getNonZeros()};
                break;
            }

            case WDIVMM: {
                if( _baseType == 0 ){ //basic
                    MatrixCharacteristics mcW = memo.getAllInputStats(getInput().get(0));
                    ret = new long[]{mcW.getRows(), mcW.getCols(), mcW.getNonZeros()};
                }
                else if( _baseType == 1 || _baseType == 3 ){ //left (w/ transpose or w/ epsilon)
                    MatrixCharacteristics mcV = memo.getAllInputStats(getInput().get(2));
                    ret = new long[]{mcV.getRows(), mcV.getCols(), -1};
                }
                else { //right
                    MatrixCharacteristics mcU = memo.getAllInputStats(getInput().get(1));
                    ret = new long[]{mcU.getRows(), mcU.getCols(), -1};
                }
                break;
            }

            default:
                throw new RuntimeException("Memory for operation (" + _op + ") can not be estimated.");
        }

        return ret;
    }

    @Override
    protected ExecType optFindExecType()
        throws HopsException
    {
        checkAndSetForcedPlatform();

        ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;

        if( _etypeForced != null ) {
            _etype = _etypeForced;
        }
        else
        {
            if ( OptimizerUtils.isMemoryBasedOptLevel() ) {
                _etype = findExecTypeByMemEstimate();
            }
            else if ( (getInput().get(0).areDimsBelowThreshold()
                    && getInput().get(1).areDimsBelowThreshold()
                    && getInput().get(2).areDimsBelowThreshold()
                    && getInput().get(3).areDimsBelowThreshold()) )
                _etype = ExecType.CP;
            else
                _etype = REMOTE;

            //check for valid CP dimensions and matrix size
            checkAndSetInvalidCPDimsAndSize();
        }

        //mark for recompile (forever)
        if( ConfigurationManager.isDynamicRecompilation() && !dimsKnown(true) && _etype==REMOTE )
            setRequiresRecompile();

        return _etype;
    }

    @Override
    public void refreshSizeInformation()
    {
        switch( _op ) {
            case WSLOSS:
                //do nothing: always scalar
                break;

            case WSIGMOID:
            case WUMM: {
                Hop inW = getInput().get(0);
                setDim1( inW.getDim1() );
                setDim2( inW.getDim2() );
                setNnz( inW.getNnz() );
                break;
            }

            case WDIVMM: {
                if( _baseType == 0 ) { //basic
                    Hop inW = getInput().get(0);
                    setDim1( inW.getDim1() );
                    setDim2( inW.getDim2() );
                    setNnz( inW.getNnz() );
                }
                else if( _baseType == 1 || _baseType == 3 ){ //left (w/ transpose or w/ epsilon)
                    Hop inV = getInput().get(2);
                    setDim1( inV.getDim1() );
                    setDim2( inV.getDim2() );
                }
                else { //right
                    Hop inU = getInput().get(1);
                    setDim1( inU.getDim1() );
                    setDim2( inU.getDim2() );
                }
                break;
            }

            default:
                break;
        }
    }

    @Override
    public Object clone() throws CloneNotSupportedException
    {
        QuaternaryOp ret = new QuaternaryOp();

        //copy generic attributes
        ret.clone(this, false);

        //copy specific attributes
        ret._op = _op;
        ret._postWeights = _postWeights;
        ret._logout = _logout;
        ret._minusin = _minusin;
        ret._baseType = _baseType;
        ret._mult = _mult;
        ret._minus = _minus;
        ret._umult = _umult;
        ret._uop = _uop;
        ret._sop = _sop;
        ret._maxNumThreads = _maxNumThreads;

        return ret;
    }
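    /**
     * Structural comparison: same op, same inputs (by identity), and identical
     * op-specific configuration, including the multi-threading constraint.
     */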
    @Override
    public boolean compare( Hop that )
    {
        if( !(that instanceof QuaternaryOp) )
            return false;

        QuaternaryOp that2 = (QuaternaryOp)that;

        //compare basic inputs and weights (always existing)
        boolean ret = (_op == that2._op
                && getInput().size() == that2.getInput().size()
                && getInput().get(0) == that2.getInput().get(0)
                && getInput().get(1) == that2.getInput().get(1)
                && getInput().get(2) == that2.getInput().get(2));

        //check for 4th argument if same size (see above)
        if( ret && getInput().size()==4 )
            ret &= (getInput().get(3) == that2.getInput().get(3));

        //compare specific parameters
        ret &= _postWeights == that2._postWeights;
        ret &= _logout == that2._logout;
        ret &= _minusin == that2._minusin;
        ret &= _baseType == that2._baseType;
        ret &= _mult == that2._mult;
        ret &= _minus == that2._minus;
        ret &= _umult == that2._umult;
        ret &= _uop == that2._uop;
        ret &= _sop == that2._sop;
        ret &= _maxNumThreads == that2._maxNumThreads;

        return ret;
    }
}