/* * Copyright (c) 2009-2016 jMonkeyEngine * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name of 'jMonkeyEngine' nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.jme3.opencl; import com.jme3.math.*; import com.jme3.util.TempVars; import java.nio.ByteBuffer; import java.util.Arrays; /** * Wrapper for an OpenCL kernel, a piece of executable code on the GPU. * <p> * Terminology:<br> * A Kernel is executed in parallel. In total number of parallel threads, * called work items, are specified by the <i>global work size</i> (of type * {@link WorkSize}. These threads are organized in a 1D, 2D or 3D grid * (of coarse, this is only a logical view). Inside each kernel, * the id of each thread (i.e. the index inside this grid) can be requested * by {@code get_global_id(dimension)} with {@code dimension=0,1,2}. * <br> * Not all threads can always be executed in parallel because there simply might * not be enough processor cores. * Therefore, the concept of a <i>work group</i> is introduced. The work group * specifies the actual number of threads that are executed in parallel. * The maximal size of it can be queried by {@link Device#getMaxiumWorkItemsPerGroup() }. * Again, the threads inside the work group can be organized in a 1D, 2D or 3D * grid, but this is also just a logical view (specifying how the threads are * indexed). * The work group is imporatant for another concept: <i> shared memory</i> * Unlike the normal global or constant memory (passing a {@link Buffer} object * as argument), shared memory can't be set from outside. Shared memory is * allocated by the kernel and is only valid within the kernel. It is used * to quickly share data between threads within a work group. * The size of the shared memory is specified by setting an instance of * {@link LocalMem} or {@link LocalMemPerElement} as argument.<br> * Due to heavy register usage or other reasons, a kernel might not be able * to utilize a whole work group. Therefore, the actual number of threads * that can be executed in a work group can be queried by * {@link #getMaxWorkGroupSize(com.jme3.opencl.Device) }, which might differ from the * value returned from the Device. * * <p> * There are two ways to launch a kernel:<br> * First, arguments and the work group sizes can be set in advance * ({@code setArg(index, ...)}, {@code setGlobalWorkSize(...)} and {@code setWorkGroupSize(...)}. * Then a kernel is launched by {@link #Run(com.jme3.opencl.CommandQueue) }.<br> * Second, two convenient functions are provided that set the arguments * and work sizes in one call: * {@link #Run1(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) } * and {@link #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }. * * @author shaman * @see Program#createKernel(java.lang.String) */ public abstract class Kernel extends AbstractOpenCLObject { /** * The current global work size */ protected final WorkSize globalWorkSize; /** * The current local work size */ protected final WorkSize workGroupSize; protected Kernel(ObjectReleaser releaser) { super(releaser); this.globalWorkSize = new WorkSize(0); this.workGroupSize = new WorkSize(0); } @Override public Kernel register() { super.register(); return this; } /** * @return the name of the kernel as defined in the program source code */ public abstract String getName(); /** * @return the number of arguments */ public abstract int getArgCount(); /** * @return the current global work size */ public WorkSize getGlobalWorkSize() { return globalWorkSize; } /** * Sets the global work size. * @param ws the work size to set */ public void setGlobalWorkSize(WorkSize ws) { globalWorkSize.set(ws); } /** * Sets the global work size to a 1D grid * @param size the size in 1D */ public void setGlobalWorkSize(int size) { globalWorkSize.set(1, size); } /** * Sets the global work size to be a 2D grid * @param width the width * @param height the height */ public void setGlobalWorkSize(int width, int height) { globalWorkSize.set(2, width, height); } /** * Sets the global work size to be a 3D grid * @param width the width * @param height the height * @param depth the depth */ public void setGlobalWorkSize(int width, int height, int depth) { globalWorkSize.set(3, width, height, depth); } /** * @return the current work group size */ public WorkSize getWorkGroupSize() { return workGroupSize; } /** * Sets the work group size * @param ws the work group size to set */ public void setWorkGroupSize(WorkSize ws) { workGroupSize.set(ws); } /** * Sets the work group size to be a 1D grid * @param size the size to set */ public void setWorkGroupSize(int size) { workGroupSize.set(1, size); } /** * Sets the work group size to be a 2D grid * @param width the width * @param height the height */ public void setWorkGroupSize(int width, int height) { workGroupSize.set(2, width, height); } /** * Sets the work group size to be a 3D grid * @param width the width * @param height the height * @param depth the depth */ public void setWorkGroupSdize(int width, int height, int depth) { workGroupSize.set(3, width, height, depth); } /** * Tells the driver to figure out the work group size on their own. * Use this if you do not rely on specific work group layouts, i.e. * because shared memory is not used. * {@link #Run1(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) } * implicetly calls this mehtod. */ public void setWorkGroupSizeToNull() { workGroupSize.set(1, 0, 0, 0); } /** * Returns the maximal work group size when this kernel is executed on * the specified device * @param device the device * @return the maximal work group size */ public abstract long getMaxWorkGroupSize(Device device); public abstract void setArg(int index, LocalMemPerElement t); public abstract void setArg(int index, LocalMem t); public abstract void setArg(int index, Buffer t); public abstract void setArg(int index, Image i); public abstract void setArg(int index, byte b); public abstract void setArg(int index, short s); public abstract void setArg(int index, int i); public abstract void setArg(int index, long l); public abstract void setArg(int index, float f); public abstract void setArg(int index, double d); public abstract void setArg(int index, Vector2f v); public abstract void setArg(int index, Vector4f v); public abstract void setArg(int index, Quaternion q); public abstract void setArg(int index, Matrix4f mat); public void setArg(int index, Matrix3f mat) { TempVars vars = TempVars.get(); try { Matrix4f m = vars.tempMat4; m.zero(); for (int i=0; i<3; ++i) { for (int j=0; j<3; ++j) { m.set(i, j, mat.get(i, j)); } } setArg(index, m); } finally { vars.release(); } } /** * Raw version to set an argument. * {@code size} bytes of the provided byte buffer are copied to the kernel * argument. The size in bytes must match exactly the argument size * as defined in the kernel code. * Use this method to send custom structures to the kernel * @param index the index of the argument * @param buffer the raw buffer * @param size the size in bytes */ public abstract void setArg(int index, ByteBuffer buffer, long size); /** * Sets the kernel argument at the specified index.<br> * The argument must be a known type: * {@code LocalMemPerElement, LocalMem, Image, Buffer, byte, short, int, * long, float, double, Vector2f, Vector4f, Quaternion, Matrix3f, Matrix4f}. * <br> * Note: Matrix3f and Matrix4f will be mapped to a {@code float16} (row major). * @param index the index of the argument, from 0 to {@link #getArgCount()}-1 * @param arg the argument * @throws IllegalArgumentException if the argument type is not one of the listed ones */ public void setArg(int index, Object arg) { if (arg instanceof Byte) { setArg(index, (byte) arg); } else if (arg instanceof Short) { setArg(index, (short) arg); } else if (arg instanceof Integer) { setArg(index, (int) arg); } else if (arg instanceof Long) { setArg(index, (long) arg); } else if (arg instanceof Float) { setArg(index, (float) arg); } else if (arg instanceof Double) { setArg(index, (double) arg); } else if (arg instanceof Vector2f) { setArg(index, (Vector2f) arg); } else if (arg instanceof Vector4f) { setArg(index, (Vector4f) arg); } else if (arg instanceof Quaternion) { setArg(index, (Quaternion) arg); } else if (arg instanceof Matrix3f) { setArg(index, (Matrix3f) arg); } else if (arg instanceof Matrix4f) { setArg(index, (Matrix4f) arg); } else if (arg instanceof LocalMemPerElement) { setArg(index, (LocalMemPerElement) arg); } else if (arg instanceof LocalMem) { setArg(index, (LocalMem) arg); } else if (arg instanceof Buffer) { setArg(index, (Buffer) arg); } else if (arg instanceof Image) { setArg(index, (Image) arg); } else { throw new IllegalArgumentException("unknown kernel argument type: " + arg); } } private void setArgs(Object... args) { for (int i = 0; i < args.length; ++i) { setArg(i, args[i]); } } /** * Launches the kernel with the current global work size, work group size * and arguments. * If the returned event object is not needed and would otherwise be * released immediately, {@link #RunNoEvent(com.jme3.opencl.CommandQueue) } * might bring a better performance. * @param queue the command queue * @return an event object indicating when the kernel is finished * @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize) * @see #setWorkGroupSize(com.jme3.opencl.Kernel.WorkSize) * @see #setArg(int, java.lang.Object) */ public abstract Event Run(CommandQueue queue); /** * Launches the kernel with the current global work size, work group size * and arguments without returning an event object. * The generated event is directly released. Therefore, the performance * is better, but there is no way to detect when the kernel execution * has finished. For this purpose, use {@link #Run(com.jme3.opencl.CommandQueue) }. * @param queue the command queue * @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize) * @see #setWorkGroupSize(com.jme3.opencl.Kernel.WorkSize) * @see #setArg(int, java.lang.Object) */ public void RunNoEvent(CommandQueue queue) { //Default implementation, overwrite to not allocate the event object Run(queue).release(); } /** * Sets the work sizes and arguments in one call and launches the kernel. * The global work size is set to the specified size. The work group * size is automatically determined by the driver. * Each object in the argument array is sent to the kernel by * {@link #setArg(int, java.lang.Object) }. * @param queue the command queue * @param globalWorkSize the global work size * @param args the kernel arguments * @return an event object indicating when the kernel is finished * @see #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) */ public Event Run1(CommandQueue queue, WorkSize globalWorkSize, Object... args) { setGlobalWorkSize(globalWorkSize); setWorkGroupSizeToNull(); setArgs(args); return Run(queue); } /** * Sets the work sizes and arguments in one call and launches the kernel. * The global work size is set to the specified size. The work group * size is automatically determined by the driver. * Each object in the argument array is sent to the kernel by * {@link #setArg(int, java.lang.Object) }. * The generated event is directly released. Therefore, the performance * is better, but there is no way to detect when the kernel execution * has finished. For this purpose, use * {@link #Run1(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }. * @param queue the command queue * @param globalWorkSize the global work size * @param args the kernel arguments * @see #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) */ public void Run1NoEvent(CommandQueue queue, WorkSize globalWorkSize, Object... args) { setGlobalWorkSize(globalWorkSize); setWorkGroupSizeToNull(); setArgs(args); RunNoEvent(queue); } /** * Sets the work sizes and arguments in one call and launches the kernel. * @param queue the command queue * @param globalWorkSize the global work size * @param workGroupSize the work group size * @param args the kernel arguments * @return an event object indicating when the kernel is finished */ public Event Run2(CommandQueue queue, WorkSize globalWorkSize, WorkSize workGroupSize, Object... args) { setGlobalWorkSize(globalWorkSize); setWorkGroupSize(workGroupSize); setArgs(args); return Run(queue); } /** * Sets the work sizes and arguments in one call and launches the kernel. * The generated event is directly released. Therefore, the performance * is better, but there is no way to detect when the kernel execution * has finished. For this purpose, use * {@link #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }. * @param queue the command queue * @param globalWorkSize the global work size * @param workGroupSize the work group size * @param args the kernel arguments */ public void Run2NoEvent(CommandQueue queue, WorkSize globalWorkSize, WorkSize workGroupSize, Object... args) { setGlobalWorkSize(globalWorkSize); setWorkGroupSize(workGroupSize); setArgs(args); RunNoEvent(queue); } @Override public String toString() { return "Kernel (" + getName() + ")"; } /** * A placeholder for kernel arguments representing local kernel memory. * This defines the size of available shared memory of a {@code __shared} kernel * argument */ public static final class LocalMem { private int size; /** * Creates a new LocalMem instance * @param size the size of the available shared memory in bytes */ public LocalMem(int size) { super(); this.size = size; } public int getSize() { return size; } @Override public int hashCode() { int hash = 3; hash = 79 * hash + this.size; return hash; } @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final LocalMem other = (LocalMem) obj; if (this.size != other.size) { return false; } return true; } @Override public String toString() { return "LocalMem (" + size + "B)"; } } /** * A placeholder for a kernel argument representing local kernel memory per thread. * This effectively computes {@code SharedMemoryPerElement * WorkGroupSize} * and uses this value as the size of shared memory available in the kernel. * Therefore, an instance of this class must be set as an argument AFTER * the work group size has been specified. This is * ensured by {@link #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }. * This argument can't be used when no work group size was defined explicetly * (e.g. by {@link #setWorkGroupSizeToNull()} or {@link #Run1(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }. */ public static final class LocalMemPerElement { private int size; /** * Creates a new LocalMemPerElement instance * @param size the number of bytes available for each thread within * a work group */ public LocalMemPerElement(int size) { super(); this.size = size; } public int getSize() { return size; } @Override public int hashCode() { int hash = 3; hash = 79 * hash + this.size; return hash; } @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final LocalMemPerElement other = (LocalMemPerElement) obj; if (this.size != other.size) { return false; } return true; } @Override public String toString() { return "LocalMemPerElement (" + size + "B)"; } } /** * The work size (global and local) for executing a kernel * @author shaman */ public static final class WorkSize { private int dimension; private long[] sizes; /** * Creates a new work size object * @param dimension the dimension (1,2,3) * @param sizes the sizes in each dimension, the length must match the specified dimension */ public WorkSize(int dimension, long... sizes) { super(); set(dimension, sizes); } /** * Creates a work size of dimension 1 and extend 1,1,1 (only one thread). */ public WorkSize() { this(1, 1, 1, 1); } /** * Creates a 1D work size of the specified extend * @param size the size */ public WorkSize(long size) { this(1, size, 1, 1); } /** * Creates a 2D work size of the specified extend * @param width the width * @param height the height */ public WorkSize(long width, long height) { this(2, width, height, 1); } /** * Creates a 3D work size of the specified extend. * @param width the width * @param height the height * @param depth the depth */ public WorkSize(long width, long height, long depth) { this(3, width, height, depth); } public int getDimension() { return dimension; } public long[] getSizes() { return sizes; } public void set(int dimension, long... sizes) { if (sizes == null || sizes.length != 3) { throw new IllegalArgumentException("sizes must be an array of length 3"); } if (dimension <= 0 || dimension > 3) { throw new IllegalArgumentException("dimension must be between 1 and 3"); } this.dimension = dimension; this.sizes = sizes; } public void set(WorkSize ws) { this.dimension = ws.dimension; this.sizes = ws.sizes; } @Override public int hashCode() { int hash = 5; hash = 47 * hash + this.dimension; hash = 47 * hash + Arrays.hashCode(this.sizes); return hash; } @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final WorkSize other = (WorkSize) obj; if (this.dimension != other.dimension) { return false; } if (!Arrays.equals(this.sizes, other.sizes)) { return false; } return true; } @Override public String toString() { StringBuilder str = new StringBuilder(); str.append("WorkSize["); for (int i=0; i<dimension; ++i) { if (i>0) { str.append(", "); } str.append(sizes[i]); } str.append(']'); return str.toString(); } } }