GPUInstruction.java example

Explorer
incubator-systemml-master
- dev
  - release
    - src
      - test
        java
        org
        apache
        sysml
        validation
        Constants.java
        Utility.java
        ValidateLicAndNotice.java
- src
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.instructions.gpu;

import org.apache.sysml.lops.runtime.RunMRJobs;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.instructions.GPUInstructionParser;
import org.apache.sysml.runtime.instructions.Instruction;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.utils.GPUStatistics;
import org.apache.sysml.utils.Statistics;

public abstract class GPUInstruction extends Instruction
{
	public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary, BuiltinBinary, Builtin };

	// Memory/conversions
	public final static String MISC_TIMER_HOST_TO_DEVICE =          "H2D";	// time spent in bringing data to gpu (from host)
	public final static String MISC_TIMER_DEVICE_TO_HOST =          "D2H"; 	// time spent in bringing data from gpu (to host)
	public final static String MISC_TIMER_DEVICE_TO_DEVICE =        "D2D"; 	// time spent in copying data from one region on the device to another
	public final static String MISC_TIMER_SPARSE_TO_DENSE =         "s2d";	// time spent in converting data from sparse to dense
	public final static String MISC_TIMER_DENSE_TO_SPARSE =         "d2s";	// time spent in converting data from dense to sparse
	public final static String MISC_TIMER_ROW_TO_COLUMN_MAJOR =     "r2c";	// time spent in converting data from row major to column major
	public final static String MISC_TIMER_COLUMN_TO_ROW_MAJOR =     "c2r";	// time spent in converting data from column major to row major
	public final static String MISC_TIMER_OBJECT_CLONE =            "clone";// time spent in cloning (deep copying) a GPUObject instance

	public final static String MISC_TIMER_CUDA_FREE =               "f";		// time spent in calling cudaFree
	public final static String MISC_TIMER_ALLOCATE =                "a";		// time spent to allocate memory on gpu
	public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT =   "ao";		// time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE)
	public final static String MISC_TIMER_SET_ZERO =                "az";		// time spent to allocate
	public final static String MISC_TIMER_REUSE =                   "r";		// time spent in reusing already allocated memory on GPU (mainly for the count)

	// Matmult instructions
	public final static String MISC_TIMER_SPARSE_ALLOCATE_LIB = 						"Msao";		// time spend in allocating for sparse matrix output
	public final static String MISC_TIMER_DENSE_DOT_LIB = 									"Mddot";	// time spent in dot product of 2 dense vectors
	public final static String MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB = 	"Mdvdm";	// time spent in matrix mult of dense vector and dense matrix
	public final static String MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB = 	"Mdmdv";	// time spent in matrix mult of dense matrix and dense vector
	public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = 	"Mdmdm";	// time spent in matrix mult of dense matrices
	public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = 	"Msmdv";	// time spent in matrix mult of sparse matrix and dense vector
	public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm";  // time spent in matrix mult of sparse matrices
	public final static String MISC_TIMER_SYRK_LIB = 												"Msyrk"; 	// time spent in symmetric rank-k update

	// Other BLAS instructions
	public final static String MISC_TIMER_DAXPY_LIB =   "daxpy";    // time spent in daxpy
	public final static String MISC_TIMER_QR_BUFFER =   "qr_buffer";// time spent in calculating buffer needed to perform QR
	public final static String MISC_TIMER_QR =          "qr";       // time spent in doing QR
	public final static String MISC_TIMER_ORMQR =       "ormqr";    // time spent in ormqr
	public final static String MISC_TIMER_TRSM =        "trsm";     // time spent in cublas Dtrsm

	// Transpose
	public final static String MISC_TIMER_SPARSE_DGEAM_LIB =    "sdgeaml";  // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
	public final static String MISC_TIMER_DENSE_DGEAM_LIB =     "ddgeaml";  // time spent in dense transpose (and other ops of type a*op(A) + b*op(B))
	public final static String MISC_TIMER_TRANSPOSE_LIB =       "dtl";      // time spent on dense transpose, this includes allocation of output

	// Custom kernels
	public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck";   // time spent in matrix-matrix cellwise operations
	public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL =           "cask";   // time spent in compareAndSet kernel
	public final static String MISC_TIMER_EXP_KERNEL =                       "expk";   // time spent in the exp kernel
	public final static String MISC_TIMER_SQRT_KERNEL =                      "sqrtk";   // time spent in the sqrt kernel
	public final static String MISC_TIMER_ROUND_KERNEL =                     "roundk";   // time spent in the round kernel
	public final static String MISC_TIMER_ABS_KERNEL =                       "absk";   // time spent in the abs kernel
	public final static String MISC_TIMER_LOG_KERNEL =                       "logk";   // time spent in the log kernel
	public final static String MISC_TIMER_FLOOR_KERNEL =                     "floork";  // time spent in the floor kernel
	public final static String MISC_TIMER_CEIL_KERNEL =                      "ceilk";   // time spent in the ceil kernel
	public final static String MISC_TIMER_SIN_KERNEL =                       "sink";   // time spent in the sin kernel
	public final static String MISC_TIMER_COS_KERNEL =                       "cosk";   // time spent in the cos kernel
	public final static String MISC_TIMER_TAN_KERNEL =                       "tank";   // time spent in the tan kernel
	public final static String MISC_TIMER_ASIN_KERNEL =                      "asink";   // time spent in the asin kernel
	public final static String MISC_TIMER_ACOS_KERNEL =                      "acosk";   // time spent in the acos kernel
	public final static String MISC_TIMER_ATAN_KERNEL =                      "atank";   // time spent in the atan kernel
	public final static String MISC_TIMER_SIGN_KERNEL =                      "signk";   // time spent in the sign kernel

	public final static String MISC_TIMER_DAXPY_MV_KERNEL =                  "daxpymv";// time spent in the daxpy_matrix_vector kernel
	public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL =   "u2lk";   // time spent in the copy_u2l_dense kernel
	public final static String MISC_TIMER_FILL_KERNEL =                      "fillk";  // time spent in the "fill" kernel
	public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL =          "msk";    // time spent in the matrix scalar kernel
	public final static String MISC_TIMER_REDUCE_ALL_KERNEL =                "rallk";  // time spent in reduce all kernel
	public final static String MISC_TIMER_REDUCE_ROW_KERNEL =                "rrowk";  // time spent in reduce row kernel
	public final static String MISC_TIMER_REDUCE_COL_KERNEL =                "rcolk";  // time spent in reduce column kernel

	// Deep learning operators
	public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB =         "nnaf";  // time spent in cudnnActivationForward
	public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB =        "nncf";  // time spent in cudnnConvolutionForward
	public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB ="nncbf"; // time spent in cudnnConvolutionBackwardFilter
	public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB =  "nncbd"; // time spent in cudnnConvolutionBackwardData
	public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB =         "nnmf";  // time spent in cudnnPoolingForward
	public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB =        "nnmb";  // time spent in cudnnPoolingBackward
	public final static String MISC_TIMER_BIAS_ADD_LIB =                   "nnba";  // time spent in bias_add cuda kernel
	public final static String MISC_TIMER_RELU_BACKWARD_KERNEL=            "nnrbk"; // time spent in relu_backward cuda kernel
	public final static String MISC_TIMER_RELU_KERNEL =                    "nnrk";  // time spent in the relu kernel
	public final static String MISC_TIMER_CUDNN_INIT =                     "nni";   // time spent in initializations for cudnn call
	public final static String MISC_TIMER_CUDNN_CLEANUP =                  "nnc";   // time spent in cleanup for cudnn call


	protected GPUINSTRUCTION_TYPE _gputype;
	protected Operator _optr;
	
	protected boolean _requiresLabelUpdate = false;
	
	public GPUInstruction(String opcode, String istr) {
		type = INSTRUCTION_TYPE.GPU;
		instString = istr;
		
		//prepare opcode and update requirement for repeated usage
		instOpcode = opcode;
		_requiresLabelUpdate = super.requiresLabelUpdate();
	}
	
	public GPUInstruction(Operator op, String opcode, String istr) {
		this(opcode, istr);
		_optr = op;
	}
	
	public GPUINSTRUCTION_TYPE getGPUInstructionType() {
		return _gputype;
	}
	
	@Override
	public boolean requiresLabelUpdate() {
		return _requiresLabelUpdate;
	}

	@Override
	public String getGraphString() {
		return getOpcode();
	}

	@Override
	public Instruction preprocessInstruction(ExecutionContext ec)
		throws DMLRuntimeException 
	{
		//default preprocess behavior (e.g., debug state)
		Instruction tmp = super.preprocessInstruction(ec);
		
		//instruction patching
		if( tmp.requiresLabelUpdate() ) { //update labels only if required
			//note: no exchange of updated instruction as labels might change in the general case
			String updInst = RunMRJobs.updateLabels(tmp.toString(), ec.getVariables());
			tmp = GPUInstructionParser.parseSingleInstruction(updInst);
		}

		return tmp;
	}

	@Override 
	public abstract void processInstruction(ExecutionContext ec)
			throws DMLRuntimeException;

	@Override
	public void postprocessInstruction(ExecutionContext ec)
					throws DMLRuntimeException
	{
		//JCuda.cudaDeviceSynchronize();
	}

	/**
	 * Helper method to get the input block (allocated on the GPU)
	 * Also records performance information into {@link Statistics}
	 * @param ec		active {@link ExecutionContext}
	 * @param name	name of input matrix (that the {@link ExecutionContext} is aware of)
	 * @return	the matrix object
	 * @throws DMLRuntimeException if an error occurs
	 */
	protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException {
		long t0 = System.nanoTime();
		Pair<MatrixObject, Boolean> mb = ec.getMatrixInputForGPUInstruction(name);
		if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t0);
		return mb.getKey();
	}

	/**
	 * Helper method to get the output block (allocated on the GPU)
	 * Also records performance information into {@link Statistics}
	 * @param ec		active {@link ExecutionContext}
	 * @param name	name of input matrix (that the {@link ExecutionContext} is aware of)
	 * @return	the matrix object
	 * @throws DMLRuntimeException	if an error occurs
	 */
	protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException {
		long t0 = System.nanoTime();
		Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name);
		if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
		return mb.getKey();
	}
}