/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.instructions.gpu;

import java.util.ArrayList;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.ConvolutionUtils;
import org.apache.sysml.utils.GPUStatistics;

/**
 * GPU instruction covering the opcodes {@code conv2d}, {@code conv2d_bias_add},
 * {@code conv2d_backward_filter}, {@code conv2d_backward_data}, {@code maxpooling},
 * {@code maxpooling_backward}, {@code bias_add}, {@code bias_multiply} and
 * {@code relu_backward}. Each opcode is dispatched to the corresponding
 * {@link LibMatrixCUDA} routine.
 */
public class ConvolutionGPUInstruction extends GPUInstruction {
	private CPOperand _input1;
	private CPOperand _input2;
	// Only set for conv2d_bias_add (the single three-input opcode).
	private CPOperand _input3;
	private CPOperand _output;
	private ArrayList<CPOperand> _input_shape;
	private ArrayList<CPOperand> _filter_shape;
	private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
	private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();

	/**
	 * Creates an instruction for the two-input opcodes that carry no stride,
	 * padding or shape operands.
	 *
	 * @param in1 first input operand
	 * @param in2 second input operand
	 * @param out output operand
	 * @param opcode one of bias_add, bias_multiply or relu_backward (compared
	 *        case-insensitively, consistent with {@link #parseInstruction(String)})
	 * @param istr the full instruction string
	 * @throws DMLRuntimeException if the opcode is not one of the three listed above
	 */
	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr)
			throws DMLRuntimeException {
		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
		// parseInstruction and processInstruction match opcodes ignoring case,
		// so the validation here has to do the same.
		if (!(opcode.equalsIgnoreCase("bias_add") || opcode.equalsIgnoreCase("bias_multiply")
				|| opcode.equalsIgnoreCase("relu_backward"))) {
			throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode);
		}
		_input1 = in1;
		_input2 = in2;
		_gputype = GPUINSTRUCTION_TYPE.Convolution;
		_output = out;
	}

	/**
	 * Creates a three-input instruction (used by {@link #parseInstruction(String)}
	 * for conv2d_bias_add).
	 *
	 * @param in1 first input operand
	 * @param in2 second input operand
	 * @param in3 third input operand
	 * @param out output operand
	 * @param opcode the instruction opcode
	 * @param istr the full instruction string
	 * @param stride stride operands (index 0 and 1 are read at execution time)
	 * @param padding padding operands (index 0 and 1 are read at execution time)
	 * @param input_shape input shape operands (indices 0..3 are read at execution time)
	 * @param filter_shape filter shape operands (indices 0, 2 and 3 are read at execution time)
	 */
	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
			String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding,
			ArrayList<CPOperand> input_shape, ArrayList<CPOperand> filter_shape) {
		this(in1, in2, out, opcode, istr, stride, padding, input_shape, filter_shape);
		_input3 = in3;
	}

	/**
	 * Creates an instruction with up to two inputs plus stride, padding and
	 * shape operands. No opcode validation is performed here.
	 *
	 * @param in1 first input operand
	 * @param in2 second input operand; {@link #parseInstruction(String)} passes
	 *        {@code null} for maxpooling
	 * @param out output operand
	 * @param opcode the instruction opcode
	 * @param istr the full instruction string
	 * @param stride stride operands
	 * @param padding padding operands
	 * @param input_shape input shape operands
	 * @param filter_shape filter shape operands
	 */
	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr,
			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
			ArrayList<CPOperand> filter_shape) {
		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
		_gputype = GPUINSTRUCTION_TYPE.Convolution;

		_input1 = in1;
		_input2 = in2;
		_output = out;
		_stride = stride;
		_padding = padding;
		_input_shape = input_shape;
		_filter_shape = filter_shape;
	}

	/**
	 * Parses an instruction string into a {@code ConvolutionGPUInstruction}.
	 *
	 * The operand layout after the opcode depends on the opcode:
	 * <ul>
	 * <li>conv2d, conv2d_backward_filter, conv2d_backward_data, maxpooling_backward:
	 *     in1, in2, 2 stride, 2 padding, 4 input shape, 4 filter shape, out (15 fields)</li>
	 * <li>conv2d_bias_add: in1, in2, in3, then the same 12 operands, out (16 fields)</li>
	 * <li>maxpooling: in1, then the same 12 operands, out (14 fields)</li>
	 * <li>bias_add, relu_backward, bias_multiply: in1, in2, out (3 fields)</li>
	 * </ul>
	 *
	 * @param str the instruction string
	 * @return the parsed instruction
	 * @throws DMLRuntimeException if the opcode is unknown (field-count validation
	 *         is delegated to {@code InstructionUtils.checkNumFields})
	 */
	public static ConvolutionGPUInstruction parseInstruction(String str) throws DMLRuntimeException {
		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
		String opcode = parts[0];

		if (opcode.equalsIgnoreCase("conv2d")
				|| opcode.equalsIgnoreCase("conv2d_backward_filter")
				|| opcode.equalsIgnoreCase("conv2d_backward_data")
				|| opcode.equalsIgnoreCase("maxpooling_backward")) {
			InstructionUtils.checkNumFields(parts, 15);
			CPOperand in1 = new CPOperand(parts[1]);
			CPOperand in2 = new CPOperand(parts[2]);
			CPOperand out = new CPOperand(parts[15]);

			ArrayList<CPOperand> stride = readConvOperands(parts, 3, 2);
			ArrayList<CPOperand> padding = readConvOperands(parts, 5, 2);
			ArrayList<CPOperand> input_shape = readConvOperands(parts, 7, 4);
			ArrayList<CPOperand> filter_shape = readConvOperands(parts, 11, 4);

			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride,
					padding, input_shape, filter_shape);
		}
		else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
			InstructionUtils.checkNumFields(parts, 16);
			CPOperand in1 = new CPOperand(parts[1]);
			CPOperand in2 = new CPOperand(parts[2]);
			CPOperand in3 = new CPOperand(parts[3]);
			CPOperand out = new CPOperand(parts[16]);

			ArrayList<CPOperand> stride = readConvOperands(parts, 4, 2);
			ArrayList<CPOperand> padding = readConvOperands(parts, 6, 2);
			ArrayList<CPOperand> input_shape = readConvOperands(parts, 8, 4);
			ArrayList<CPOperand> filter_shape = readConvOperands(parts, 12, 4);

			return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
					padding, input_shape, filter_shape);
		}
		else if (opcode.equalsIgnoreCase("maxpooling")) {
			InstructionUtils.checkNumFields(parts, 14);
			CPOperand in1 = new CPOperand(parts[1]);
			CPOperand out = new CPOperand(parts[14]);

			ArrayList<CPOperand> stride = readConvOperands(parts, 2, 2);
			ArrayList<CPOperand> padding = readConvOperands(parts, 4, 2);
			ArrayList<CPOperand> input_shape = readConvOperands(parts, 6, 4);
			ArrayList<CPOperand> filter_shape = readConvOperands(parts, 10, 4);

			// maxpooling has a single matrix input, hence the null second operand.
			return new ConvolutionGPUInstruction(in1, null, out, opcode, str, stride,
					padding, input_shape, filter_shape);
		}
		else if (opcode.equalsIgnoreCase("bias_add")
				|| opcode.equalsIgnoreCase("relu_backward")
				|| opcode.equalsIgnoreCase("bias_multiply")) {
			InstructionUtils.checkNumFields(parts, 3);
			CPOperand in1 = new CPOperand(parts[1]);
			CPOperand in2 = new CPOperand(parts[2]);
			CPOperand out = new CPOperand(parts[3]);
			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str);
		}
		else {
			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);
		}
	}

	/**
	 * Builds a list of {@code count} operands from consecutive instruction parts.
	 *
	 * @param parts the split instruction string
	 * @param from index of the first part to convert
	 * @param count number of consecutive parts to convert
	 * @return a new list holding one {@link CPOperand} per part, in order
	 */
	private static ArrayList<CPOperand> readConvOperands(String[] parts, int from, int count) {
		ArrayList<CPOperand> operands = new ArrayList<CPOperand>(count);
		for (int i = from; i < from + count; i++) {
			operands.add(new CPOperand(parts[i]));
		}
		return operands;
	}

	/**
	 * Executes bias_add or bias_multiply. The output gets the same number of
	 * rows and columns as the first input. For any other opcode no kernel is
	 * invoked, but inputs and output are still acquired and released.
	 *
	 * @param instOpcode the opcode to execute
	 * @param ec the execution context
	 * @throws DMLRuntimeException if acquiring operands or the GPU call fails
	 */
	public void processBiasInstruction(String instOpcode, ExecutionContext ec) throws DMLRuntimeException {
		GPUStatistics.incrementNoOfExecutedGPUInst();
		MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName());
		MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input2.getName());

		ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());

		if (instOpcode.equalsIgnoreCase("bias_add"))
			LibMatrixCUDA.biasAdd(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);
		else if (instOpcode.equalsIgnoreCase("bias_multiply"))
			LibMatrixCUDA.biasMultiply(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);

		// release inputs/outputs
		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
	}

	/**
	 * Executes relu_backward. The output gets the same number of rows and
	 * columns as the first input.
	 *
	 * @param ec the execution context
	 * @throws DMLRuntimeException if acquiring operands or the GPU call fails
	 */
	public void processReLUBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
		GPUStatistics.incrementNoOfExecutedGPUInst();
		MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName());
		MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());

		// Set the output dimensions before acquiring the output, as every other
		// operator in this class does. NOTE(review): presumably the dense output
		// is sized from this metadata when acquired - confirm against ExecutionContext.
		ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());

		LibMatrixCUDA.reluBackward(ec.getGPUContext(), getExtendedOpcode(), input, dout, out);

		// release inputs/outputs
		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
	}

	/**
	 * Executes this instruction. bias_add, bias_multiply and relu_backward are
	 * delegated to their dedicated methods. For the remaining opcodes the scalar
	 * stride/padding/shape operands are resolved, the input dimensions are
	 * validated against them, the output metadata is set, and the matching
	 * {@link LibMatrixCUDA} routine is invoked.
	 *
	 * @param ec the execution context
	 * @throws DMLRuntimeException if an input has unexpected dimensions or the
	 *         opcode is not supported
	 */
	@Override
	public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
		if (instOpcode.equalsIgnoreCase("bias_add") || instOpcode.equalsIgnoreCase("bias_multiply")) {
			processBiasInstruction(instOpcode, ec);
			return;
		}
		else if (instOpcode.equalsIgnoreCase("relu_backward")) {
			processReLUBackwardInstruction(ec);
			return;
		}

		GPUStatistics.incrementNoOfExecutedGPUInst();

		int pad_h = getScalarInput(ec, _padding, 0);
		int pad_w = getScalarInput(ec, _padding, 1);
		int stride_h = getScalarInput(ec, _stride, 0);
		int stride_w = getScalarInput(ec, _stride, 1);

		// NOTE(review): names suggest batch size, channels, height, width - confirm.
		int N = getScalarInput(ec, _input_shape, 0);
		int C = getScalarInput(ec, _input_shape, 1);
		int H = getScalarInput(ec, _input_shape, 2);
		int W = getScalarInput(ec, _input_shape, 3);

		// Index 1 of the filter shape is not read here.
		int K = getScalarInput(ec, _filter_shape, 0);
		int R = getScalarInput(ec, _filter_shape, 2);
		int S = getScalarInput(ec, _filter_shape, 3);

		// NOTE(review): presumably the output height and width - confirm against ConvolutionUtils.
		int P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
		int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);

		if (instOpcode.equalsIgnoreCase("conv2d")) {
			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
			MatrixObject filter = getMatrixInputForGPUInstruction(ec, _input2.getName());

			if (image.getNumRows() != N || image.getNumColumns() != C*H*W)
				throw new DMLRuntimeException("Incorrect dimensions for image in conv2d");
			if (filter.getNumRows() != K || filter.getNumColumns() != C*R*S)
				throw new DMLRuntimeException("Incorrect dimensions for filter in conv2d");

			ec.setMetaData(_output.getName(), N, K * P * Q);

			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
			LibMatrixCUDA.conv2d(ec.getGPUContext(), getExtendedOpcode(), image, filter, out, N, C, H, W,
					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
		}
		else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
			// Operand order for this opcode is image, bias, filter.
			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
			MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input2.getName());
			MatrixObject filter = getMatrixInputForGPUInstruction(ec, _input3.getName());

			if (image.getNumRows() != N || image.getNumColumns() != C*H*W)
				throw new DMLRuntimeException("Incorrect dimensions for image in conv2d");
			if (filter.getNumRows() != K || filter.getNumColumns() != C*R*S)
				throw new DMLRuntimeException("Incorrect dimensions for filter in conv2d");

			ec.setMetaData(_output.getName(), N, K * P * Q);

			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
			LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
		}
		else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
			MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());

			if (image.getNumRows() != N || image.getNumColumns() != C*H*W)
				throw new DMLRuntimeException("Incorrect dimensions for image in conv2d_backward_filter");
			if (dout.getNumRows() != N || dout.getNumColumns() != K*P*Q)
				throw new DMLRuntimeException("Incorrect dimensions for dout in conv2d_backward_filter: " +
						dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q);

			ec.setMetaData(_output.getName(), K, C * R * S);

			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
			LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
			// TODO: For now always copy the device data to host
			// ec.gpuCtx.copyDeviceToHost(outputBlock);
		}
		else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
			MatrixObject filter = getMatrixInputForGPUInstruction(ec, _input1.getName());
			MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());

			if (filter.getNumRows() != K || filter.getNumColumns() != C*R*S)
				throw new DMLRuntimeException("Incorrect dimensions for filter in convolution_backward_data");
			if (dout.getNumRows() != N || dout.getNumColumns() != K*P*Q)
				throw new DMLRuntimeException("Incorrect dimensions for dout in conv2d_backward_data: " +
						dout.getNumRows() + " != " + N + " || " + dout.getNumColumns() + " != " + K*P*Q);

			ec.setMetaData(_output.getName(), N, C * H * W);

			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
			LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
		}
		else if (instOpcode.equalsIgnoreCase("maxpooling")) {
			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());

			if (image.getNumRows() != N || image.getNumColumns() != C*H*W)
				throw new DMLRuntimeException("Incorrect dimensions for image in maxpooling: " +
						image.getNumRows() + " != " + N + " || " + image.getNumColumns() + " != " + C*H*W);

			ec.setMetaData(_output.getName(), N, C * P * Q);
			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
			LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
		}
		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
			MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());

			if (dout.getNumRows() != N || dout.getNumColumns() != C*P*Q)
				throw new DMLRuntimeException("Incorrect dimensions for dout in maxpooling_backward");
			// The message reports C*H*W, the value the check actually compares against.
			if (image.getNumRows() != N || image.getNumColumns() != C*H*W)
				throw new DMLRuntimeException("Incorrect dimensions for image in maxpooling_backward: " +
						image.getNumRows() + " != " + N + " || " + image.getNumColumns() + " != " + C*H*W);

			ec.setMetaData(_output.getName(), N, C * H * W);

			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
			LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
		}
		else {
			throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
		}

		// release inputs/outputs
		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
		// maxpooling is the only opcode reaching this point without a second input.
		if (!instOpcode.equalsIgnoreCase("maxpooling"))
			ec.releaseMatrixInputForGPUInstruction(_input2.getName());
		if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))
			ec.releaseMatrixInputForGPUInstruction(_input3.getName());
		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
	}

	/**
	 * Resolves one scalar operand from a list and narrows its long value to int.
	 *
	 * @param ec the execution context used to resolve the scalar
	 * @param aL the operand list
	 * @param index position of the operand within {@code aL}
	 * @return the operand's long value cast to {@code int}
	 * @throws DMLRuntimeException if the scalar cannot be resolved
	 */
	private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index)
			throws DMLRuntimeException {
		CPOperand operand = aL.get(index);
		return (int) ec.getScalarInput(operand.getName(), operand.getValueType(), operand.isLiteral())
				.getLongValue();
	}
}