/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.sysml.runtime.instructions.gpu; import org.apache.sysml.lops.runtime.RunMRJobs; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.GPUInstructionParser; import org.apache.sysml.runtime.instructions.Instruction; import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; public abstract class GPUInstruction extends Instruction { public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary, BuiltinBinary, Builtin }; // Memory/conversions public final static String MISC_TIMER_HOST_TO_DEVICE = "H2D"; // time spent in bringing data to gpu (from host) public final static String MISC_TIMER_DEVICE_TO_HOST = "D2H"; // time spent in bringing data from gpu (to host) public final static String MISC_TIMER_DEVICE_TO_DEVICE = "D2D"; // time spent in copying data from one region on the device to another public final static String MISC_TIMER_SPARSE_TO_DENSE = "s2d"; // time spent in converting data from sparse to dense public final static String MISC_TIMER_DENSE_TO_SPARSE = "d2s"; // time spent in converting data from dense to sparse public final static String MISC_TIMER_ROW_TO_COLUMN_MAJOR = "r2c"; // time spent in converting data from row major to column major public final static String MISC_TIMER_COLUMN_TO_ROW_MAJOR = "c2r"; // time spent in converting data from column major to row major public final static String MISC_TIMER_OBJECT_CLONE = "clone";// time spent in cloning (deep copying) a GPUObject instance public final static String MISC_TIMER_CUDA_FREE = "f"; // time spent in calling cudaFree public final static String MISC_TIMER_ALLOCATE = "a"; // time spent to allocate memory on gpu public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ao"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE) public final static String MISC_TIMER_SET_ZERO = "az"; // time spent to allocate public final static String MISC_TIMER_REUSE = "r"; // time spent in reusing already allocated memory on GPU (mainly for the count) // Matmult instructions public final static String MISC_TIMER_SPARSE_ALLOCATE_LIB = "Msao"; // time spend in allocating for sparse matrix output public final static String MISC_TIMER_DENSE_DOT_LIB = "Mddot"; // time spent in dot product of 2 dense vectors public final static String MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB = "Mdvdm"; // time spent in matrix mult of dense vector and dense matrix public final static String MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB = "Mdmdv"; // time spent in matrix mult of dense matrix and dense vector public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = "Mdmdm"; // time spent in matrix mult of dense matrices public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = "Msmdv"; // time spent in matrix mult of sparse matrix and dense vector public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm"; // time spent in matrix mult of sparse matrices public final static String MISC_TIMER_SYRK_LIB = "Msyrk"; // time spent in symmetric rank-k update // Other BLAS instructions public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy public final static String MISC_TIMER_QR_BUFFER = "qr_buffer";// time spent in calculating buffer needed to perform QR public final static String MISC_TIMER_QR = "qr"; // time spent in doing QR public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm // Transpose public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B)) public final static String MISC_TIMER_DENSE_DGEAM_LIB = "ddgeaml"; // time spent in dense transpose (and other ops of type a*op(A) + b*op(B)) public final static String MISC_TIMER_TRANSPOSE_LIB = "dtl"; // time spent on dense transpose, this includes allocation of output // Custom kernels public final static String MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL = "mmck"; // time spent in matrix-matrix cellwise operations public final static String MISC_TIMER_COMPARE_AND_SET_KERNEL = "cask"; // time spent in compareAndSet kernel public final static String MISC_TIMER_EXP_KERNEL = "expk"; // time spent in the exp kernel public final static String MISC_TIMER_SQRT_KERNEL = "sqrtk"; // time spent in the sqrt kernel public final static String MISC_TIMER_ROUND_KERNEL = "roundk"; // time spent in the round kernel public final static String MISC_TIMER_ABS_KERNEL = "absk"; // time spent in the abs kernel public final static String MISC_TIMER_LOG_KERNEL = "logk"; // time spent in the log kernel public final static String MISC_TIMER_FLOOR_KERNEL = "floork"; // time spent in the floor kernel public final static String MISC_TIMER_CEIL_KERNEL = "ceilk"; // time spent in the ceil kernel public final static String MISC_TIMER_SIN_KERNEL = "sink"; // time spent in the sin kernel public final static String MISC_TIMER_COS_KERNEL = "cosk"; // time spent in the cos kernel public final static String MISC_TIMER_TAN_KERNEL = "tank"; // time spent in the tan kernel public final static String MISC_TIMER_ASIN_KERNEL = "asink"; // time spent in the asin kernel public final static String MISC_TIMER_ACOS_KERNEL = "acosk"; // time spent in the acos kernel public final static String MISC_TIMER_ATAN_KERNEL = "atank"; // time spent in the atan kernel public final static String MISC_TIMER_SIGN_KERNEL = "signk"; // time spent in the sign kernel public final static String MISC_TIMER_DAXPY_MV_KERNEL = "daxpymv";// time spent in the daxpy_matrix_vector kernel public final static String MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL = "u2lk"; // time spent in the copy_u2l_dense kernel public final static String MISC_TIMER_FILL_KERNEL = "fillk"; // time spent in the "fill" kernel public final static String MISC_TIMER_MATRIX_SCALAR_OP_KERNEL = "msk"; // time spent in the matrix scalar kernel public final static String MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel // Deep learning operators public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward public final static String MISC_TIMER_CONVOLUTION_FORWARD_LIB = "nncf"; // time spent in cudnnConvolutionForward public final static String MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB ="nncbf"; // time spent in cudnnConvolutionBackwardFilter public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add cuda kernel public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= "nnrbk"; // time spent in relu_backward cuda kernel public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call public final static String MISC_TIMER_CUDNN_CLEANUP = "nnc"; // time spent in cleanup for cudnn call protected GPUINSTRUCTION_TYPE _gputype; protected Operator _optr; protected boolean _requiresLabelUpdate = false; public GPUInstruction(String opcode, String istr) { type = INSTRUCTION_TYPE.GPU; instString = istr; //prepare opcode and update requirement for repeated usage instOpcode = opcode; _requiresLabelUpdate = super.requiresLabelUpdate(); } public GPUInstruction(Operator op, String opcode, String istr) { this(opcode, istr); _optr = op; } public GPUINSTRUCTION_TYPE getGPUInstructionType() { return _gputype; } @Override public boolean requiresLabelUpdate() { return _requiresLabelUpdate; } @Override public String getGraphString() { return getOpcode(); } @Override public Instruction preprocessInstruction(ExecutionContext ec) throws DMLRuntimeException { //default preprocess behavior (e.g., debug state) Instruction tmp = super.preprocessInstruction(ec); //instruction patching if( tmp.requiresLabelUpdate() ) { //update labels only if required //note: no exchange of updated instruction as labels might change in the general case String updInst = RunMRJobs.updateLabels(tmp.toString(), ec.getVariables()); tmp = GPUInstructionParser.parseSingleInstruction(updInst); } return tmp; } @Override public abstract void processInstruction(ExecutionContext ec) throws DMLRuntimeException; @Override public void postprocessInstruction(ExecutionContext ec) throws DMLRuntimeException { //JCuda.cudaDeviceSynchronize(); } /** * Helper method to get the input block (allocated on the GPU) * Also records performance information into {@link Statistics} * @param ec active {@link ExecutionContext} * @param name name of input matrix (that the {@link ExecutionContext} is aware of) * @return the matrix object * @throws DMLRuntimeException if an error occurs */ protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException { long t0 = System.nanoTime(); Pair<MatrixObject, Boolean> mb = ec.getMatrixInputForGPUInstruction(name); if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t0); return mb.getKey(); } /** * Helper method to get the output block (allocated on the GPU) * Also records performance information into {@link Statistics} * @param ec active {@link ExecutionContext} * @param name name of input matrix (that the {@link ExecutionContext} is aware of) * @return the matrix object * @throws DMLRuntimeException if an error occurs */ protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException { long t0 = System.nanoTime(); Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name); if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); return mb.getKey(); } }