/*******************************************************************************
* TurtleKit 3 - Agent Based and Artificial Life Simulation Platform
* Copyright (C) 2011-2016 Fabien Michel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package turtlekit.cuda;
import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemFree;
import static jcuda.driver.JCudaDriver.cuMemFreeHost;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import jcuda.CudaException;
import jcuda.Pointer;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.CUstream;
import jcuda.driver.CUstream_flags;
import jcuda.driver.JCudaDriver;
import jcuda.utils.KernelLauncher;
public class CudaEngine {
/**
 * Native libs have to be loaded from the file system.
 * This is the dir where native libs are extracted (in the default io dir of the OS).
 * In this class it is also the directory into which kernel source files are copied
 * before they are compiled.
 */
public static final String ioTmpDir = System.getProperty("java.io.tmpdir");
/** Number of devices reported by the driver during {@code init}; 0 until a device count succeeded. */
private static int availableDevicesNb = 0;
/** Pool running the engine creation job of {@code init}; {@code getCudaEngine} waits on its termination. */
private static ExecutorService initialization;
/** Engines indexed by device ordinal; also used as the lock guarding init, stop and engine lookup. */
final static Map<Integer,CudaEngine> cudaEngines = new HashMap<>();
/** NOTE(review): only mentioned in commented-out code within this file; looks unused — confirm before removing. */
private static int NB_OF_DEVICE_TO_USE = 1;
/** Class logger; assigned by {@code init}, so it is null until {@code init} has been called. */
private static Logger logger;
/**
 * Initializes the CUDA driver and creates one {@link CudaEngine} per detected device.
 * <p>
 * The engines are created by a job submitted to a dedicated pool; this method blocks until
 * that job is done, then shuts the pool down (also when the job fails). On success a JVM
 * shutdown hook calling {@link #stop()} is registered.
 * <p>
 * When initialization fails, the device counter is reset to 0 so that
 * {@link #isCudaAvailable()} does not report devices that have no usable engine.
 *
 * @param logLevel a level name or integer value, as accepted by {@link Level#parse(String)}
 * @return {@code true} if at least one device was found and every engine was created,
 *         {@code false} if no device exists or the initialization failed
 * @throws IllegalArgumentException if {@code logLevel} cannot be parsed
 */
public static boolean init(String logLevel) {
    logger = Logger.getLogger(CudaEngine.class.getSimpleName());
    // Honor the requested level: it used to be overwritten by Level.ALL right after being set.
    logger.setLevel(Level.parse(logLevel));
    synchronized (cudaEngines) {
        logger.finer("---------Initializing Cuda----------------");
        try {
            JCudaDriver.setExceptionsEnabled(true);
            JCudaDriver.cuInit(0);
            // Obtain the number of devices
            int deviceCountArray[] = { 0 };
            JCudaDriver.cuDeviceGetCount(deviceCountArray);
            availableDevicesNb = deviceCountArray[0];
            if (availableDevicesNb == 0)
                return false;
            initialization = Executors.newCachedThreadPool();
            logger.finer("Found " + availableDevicesNb + " GPU devices");
            try {
                initialization.submit(() -> {
                    for (int i = 0; i < availableDevicesNb; i++) {
                        logger.finer("Initializing device n°" + i);
                        cudaEngines.put(i, new CudaEngine(i));
                    }
                }).get();
            } finally {
                // Always release the pool, otherwise a failed init leaves it running and
                // getCudaEngine() would wait on a pool that never terminates.
                initialization.shutdown();
            }
        } catch (InterruptedException | ExecutionException | CudaException | UnsatisfiedLinkError e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            // No usable engine set: do not advertise any device.
            availableDevicesNb = 0;
            logger.log(Level.WARNING, "---------Cannot initialize Cuda !!! ----------------", e);
            return false;
        }
        Runtime.getRuntime().addShutdownHook(new Thread(CudaEngine::stop));
        logger.fine("---------Cuda Initialized----------------");
        return true;
    }
}
//
// public Pointer getPointerToFloat(float f){
// return Pointer.to(new float[]{f});
// }
/**
 * Gives the number of devices counted by the driver during initialization.
 *
 * @return the current value of the static device counter (0 when no device was counted)
 */
public int cuDeviceGetCount() {
    final int countedDevices = CudaEngine.availableDevicesNb;
    return countedDevices;
}
/** Counter incremented for every newly bound CudaObject; its value modulo the device count selects the engine. */
private static AtomicInteger cudaObjectID = new AtomicInteger(0);
/** Remembers which engine each CudaObject was bound to, so repeated lookups return the same engine. */
private static Map<CudaObject, CudaEngine> engineBinds = new HashMap<>();
/** Single-thread executor on which every driver call of this engine is run (one cuda thread per context). */
private ExecutorService exe;
/** NOTE(review): not read or written anywhere in this file as shown — confirm whether other classes use it. */
protected CUfunction f;
/** CudaObjects bound to this engine; their memory is released by freeCUObjectsMemory(). */
private List<CudaObject> cudaObjects = new ArrayList<CudaObject>();
/** Square root of the device's max threads per block, used as the edge length of a 2D block. */
private int maxThreads;
/** Ordinal of the device this engine drives; -1 until the constructor assigns it. */
private int Id = -1;
/** Driver context created for this engine's device on the executor thread. */
protected CUcontext context;
/** Module handle into which compiled kernel files are loaded. */
protected CUmodule myModule;
/** Cache of kernel functions, keyed by kernel name concatenated with the source file path. */
private Map<String, CUfunction> functions = new HashMap<>();
/**
 * Creates the engine driving the device with the given ordinal.
 * <p>
 * A single daemon thread is dedicated to the engine and the device setup (attribute query,
 * context creation) is executed on that thread; the constructor blocks until it is done.
 *
 * @param deviceId ordinal of the device, as understood by {@code cuDeviceGet}
 * @throws RuntimeException if the device setup fails or the calling thread is interrupted;
 *         the original exception is attached as the cause
 */
private CudaEngine(final int deviceId) {
    // mandatory: Only one cuda thread per context
    exe = Executors.newSingleThreadExecutor(task -> {
        Thread thread = new Thread(task);
        thread.setDaemon(true);
        return thread;
    });
    Id = deviceId;
    try {
        exe.submit(() -> {
            CUdevice device = new CUdevice();
            JCudaDriver.cuDeviceGet(device, deviceId);
            int array[] = { 0 };
            JCudaDriver.cuDeviceGetAttribute(array,
                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
            // Blocks are two-dimensional: keep the edge length so that edge * edge fits the limit.
            maxThreads = (int) Math.sqrt(array[0]);
            context = new CUcontext();
            JCudaDriver.cuCtxCreate(context, 0, device);
            myModule = new CUmodule();
        }).get();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        // The engine is unusable: do not leave its thread behind.
        exe.shutdownNow();
        throw new RuntimeException("Interrupted while initializing cuda device " + deviceId, e);
    } catch (ExecutionException e) {
        exe.shutdownNow();
        // Same message as before, but the cause is kept so the real failure stays visible.
        throw new RuntimeException(e.getMessage(), e);
    }
}
/**
 * Tells whether the device counter holds a non-zero value.
 *
 * @return {@code true} unless the device counter is 0
 */
public static boolean isCudaAvailable() {
    final boolean nothingCounted = (availableDevicesNb == 0);
    return !nothingCounted;
}
/**
 * Describes this engine by the ordinal of its device.
 *
 * @return a label of the form {@code cudaEngine device #<id>}
 */
@Override
public String toString() {
    final StringBuilder label = new StringBuilder("cudaEngine device #");
    label.append(Id);
    return label.toString();
}
/**
 * Currently does nothing: the body is empty and the argument is ignored.
 * NOTE(review): looks like a placeholder for registering kernel sources — confirm intent.
 *
 * @param aPathInTheClassPath unused
 */
public static void addKernelSourceFile(String aPathInTheClassPath){
    // intentionally empty
}
/**
 * Builds a kernel configuration covering a 2D data set with square blocks.
 * <p>
 * The block edge is {@code maxThreads}; the grid size along each axis is the data size
 * divided by that edge, rounded up. A new stream is requested from this engine for the
 * configuration.
 *
 * @param dataSizeX extent of the data along X
 * @param dataSizeY extent of the data along Y
 * @return the configuration built from the computed grid, the block edge and a new stream
 */
public KernelConfiguration getDefaultKernelConfiguration(int dataSizeX, int dataSizeY){
    // ceil(dataSize / maxThreads) using integer arithmetic
    final int blocksAlongX = (dataSizeX + maxThreads - 1) / maxThreads;
    final int blocksAlongY = (dataSizeY + maxThreads - 1) / maxThreads;
    final CUstream stream = getNewCudaStream();
    final KernelConfiguration configuration =
            new KernelConfiguration(blocksAlongX, blocksAlongY, maxThreads, maxThreads, stream);
    return configuration;
}
/**
 * Creates a kernel wrapper for a function of a {@code .cu} source file.
 * <p>
 * The work is done on this engine's executor thread. The function handle is looked up in
 * the cache first and built through {@code updateCuSourceFile} when missing.
 *
 * @param kernelFunctionName name of the kernel function inside the source file
 * @param cuSourceFilePath class path location of the {@code .cu} file
 * @param kc launch configuration given to the created kernel
 * @return the kernel, or {@code null} if the task failed or the calling thread was interrupted
 */
public CudaKernel getKernel(final String kernelFunctionName, final String cuSourceFilePath, final KernelConfiguration kc){
    try {
        return exe.submit(() -> {
            CUfunction function = functions.computeIfAbsent(""+kernelFunctionName+cuSourceFilePath, k -> updateCuSourceFile(kernelFunctionName,cuSourceFilePath));
            return new CudaKernel(function, CudaEngine.this, kernelFunctionName, cuSourceFilePath, kc);
        }).get();
    } catch (InterruptedException e) {
        // Keep the interruption visible to the caller instead of silently consuming it.
        Thread.currentThread().interrupt();
        logger.log(Level.SEVERE, "Interrupted while creating kernel " + kernelFunctionName, e);
    } catch (ExecutionException e) {
        logger.log(Level.SEVERE, "Cannot create kernel " + kernelFunctionName + " from " + cuSourceFilePath, e);
    }
    return null;
}
/**
 * Allocates device memory for a grid of {@code wdith * height} elements of the given type.
 * <p>
 * The allocation runs on this engine's executor thread and this method waits for it.
 * If the allocation fails, the failure is logged and the returned pointer is left as
 * created, i.e. without the allocation having succeeded.
 *
 * @param wdith number of columns of the grid
 * @param height number of rows of the grid
 * @param dataType element type, used to compute the size of one element
 * @return the device pointer handed to {@code cuMemAlloc}
 * @throws IllegalStateException if this engine has already been shut down
 */
public <T> CUdeviceptr createDeviceDataGrid(int wdith, int height, Class<T> dataType){
    final CUdeviceptr tmpPtr = new CUdeviceptr();
    try {
        final Future<?> allocation = submit(() -> cuMemAlloc(tmpPtr, getRequiredMemorySize(dataType, wdith, height)));
        if (allocation == null) {
            // submit() answers null once the executor is shut down; fail with a clear message
            // rather than with a NullPointerException.
            throw new IllegalStateException(this + " is shut down: cannot allocate device memory");
        }
        allocation.get();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        logger.log(Level.SEVERE, "Interrupted while allocating device memory on " + this, e);
    } catch (ExecutionException | IllegalArgumentException | SecurityException e) {
        logger.log(Level.SEVERE, "Cannot allocate device memory on " + this, e);
    }
    return tmpPtr;
}
/**
 * Allocates host memory with {@code cuMemHostAlloc} (flag {@code CU_MEMHOSTALLOC_DEVICEMAP}),
 * fetches the matching device pointer and returns a typed view on the host memory.
 * <p>
 * The driver calls run on this engine's executor thread. The typed view is obtained by
 * calling the {@code asXxxBuffer()} method of {@link ByteBuffer} matching {@code dataType}.
 *
 * @param hostData pointer receiving the host allocation
 * @param deviceData pointer receiving the device address of the host allocation
 * @param dataType element type of the requested view, e.g. {@code Float.class} or {@code int.class}
 * @param wdith number of columns of the grid
 * @param height number of rows of the grid
 * @return the typed buffer view, or {@code null} if the allocation failed, the calling
 *         thread was interrupted, or {@link ByteBuffer} has no view for {@code dataType}
 */
public <T> Buffer getUnifiedBufferBetweenPointer(Pointer hostData, CUdeviceptr deviceData, Class<T> dataType, int wdith, int height){
    try {
        final int size = getRequiredMemorySize(dataType, wdith, height);
        final ByteBuffer byteBuffer = exe.submit(() -> {
            JCudaDriver.cuMemHostAlloc(hostData, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP);
            final ByteBuffer mapped = hostData.getByteBuffer(0, size);
            mapped.order(ByteOrder.nativeOrder());
            JCudaDriver.cuMemHostGetDevicePointer(deviceData, hostData, 0);
            return mapped;
        }).get();
        // Resolve the method on the public ByteBuffer type, not on the runtime class of the
        // buffer: that class is not public, so reaching into it needs setAccessible(true),
        // which is rejected by JDKs that strongly encapsulate their internals.
        final Method viewFactory = ByteBuffer.class.getMethod("as" + bufferViewName(dataType) + "Buffer");
        return (Buffer) viewFactory.invoke(byteBuffer);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        logger.log(Level.SEVERE, "Interrupted while creating a unified buffer on " + this, e);
    } catch (ExecutionException | IllegalArgumentException | IllegalAccessException | SecurityException | InvocationTargetException | NoSuchMethodException e) {
        logger.log(Level.SEVERE, "Cannot create a unified buffer of " + dataType + " on " + this, e);
    }
    return null;
}

/**
 * Gives the middle part of the {@code ByteBuffer.asXxxBuffer} method name for a type,
 * accepting both wrapper classes and primitive class tokens.
 */
private static String bufferViewName(Class<?> dataType) {
    final String simpleName = dataType.getSimpleName();
    switch (simpleName) {
    case "Integer":
    case "int":
        return "Int";
    case "Character":
    case "char":
        return "Char";
    default:
        if (simpleName.isEmpty()) {
            return simpleName;
        }
        // float -> Float, double -> Double, ...; wrapper names are already capitalized
        return Character.toUpperCase(simpleName.charAt(0)) + simpleName.substring(1);
    }
}
/**
 * Computes the number of bytes needed for a grid of {@code wdith * height} elements.
 * <p>
 * The element size is read from the static {@code SIZE} field (a bit count) of the wrapper
 * class of {@code dataType}; primitive class tokens are mapped to their wrapper first.
 *
 * @param dataType element type, either a numeric/character wrapper class or a primitive class token
 * @param wdith number of columns of the grid
 * @param height number of rows of the grid
 * @return the size in bytes, or 0 if {@code dataType} exposes no readable {@code SIZE} field
 * @throws ArithmeticException if the size does not fit in an {@code int}
 */
private <T> int getRequiredMemorySize(Class<T> dataType, int wdith, int height) {
    Class<?> sizedType = dataType;
    switch (dataType.getSimpleName()) {
    case "int":
        sizedType = Integer.class;
        break;
    case "char":
        sizedType = Character.class;
        break;
    case "float":
        sizedType = Float.class;
        break;
    case "double":
        sizedType = Double.class;
        break;
    case "long":
        sizedType = Long.class;
        break;
    case "short":
        sizedType = Short.class;
        break;
    case "byte":
        sizedType = Byte.class;
        break;
    default:
        break;
    }
    try {
        final int bitsPerElement = sizedType.getField("SIZE").getInt(null);
        // Computed with long arithmetic: width * height * bits exceeds the int range for
        // large grids well before the byte count itself does.
        final long bytes = (long) wdith * height * bitsPerElement / 8;
        return Math.toIntExact(bytes);
    } catch (IllegalArgumentException | IllegalAccessException | NoSuchFieldException | SecurityException e) {
        logger.log(Level.SEVERE, "Cannot determine the element size of " + dataType, e);
    }
    return 0;
}
/**
 * Gives the engine a CudaObject is bound to, binding it first when needed.
 * <p>
 * Waits up to 10 seconds for the initialization pool to terminate. A new object is bound
 * to the engine whose device ordinal equals the object counter modulo the device count,
 * and is added to that engine's list of objects.
 *
 * @param co the object asking for an engine
 * @return the engine bound to {@code co}
 * @throws CudaException if no device is available or if no engine exists for the selected device
 */
public static CudaEngine getCudaEngine(CudaObject co) {
    synchronized (cudaEngines) {
        if (!isCudaAvailable())
            throw new CudaException("No cuda device found");
        try {
            initialization.awaitTermination(10, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            // Do not lose the interruption; the lookup below does not block.
            Thread.currentThread().interrupt();
        }
        synchronized (engineBinds) {
            return engineBinds.computeIfAbsent(co, v -> {
                final int pheroID = cudaObjectID.incrementAndGet();
                // floorMod keeps the index non-negative even if the counter wraps around.
                final int deviceIndex = Math.floorMod(pheroID, availableDevicesNb);
                final CudaEngine ce = cudaEngines.get(deviceIndex);
                if (ce == null) {
                    // e.g. engines already removed by stop()
                    throw new CudaException("No cuda engine available for device " + deviceIndex);
                }
                ce.cudaObjects.add(co);
                logger.finer(co + "ID " + pheroID + " getting cuda engine Id " + ce.Id);
                return ce;
            });
        }
    }
}
/**
 * Allocates host memory through {@code cuMemHostAlloc} with the
 * {@code CU_MEMHOSTALLOC_DEVICEMAP} flag, asks the driver for the corresponding device
 * pointer and exposes the host memory as a float buffer in native byte order.
 * <p>
 * The driver calls are issued on the calling thread.
 *
 * @param pinnedMemory pointer receiving the host allocation
 * @param devicePtr pointer receiving the device address of the allocation
 * @param size size handed to the allocation and to the byte buffer view
 * @return a float view on the allocated host memory
 */
static FloatBuffer getUnifiedFloatBuffer(Pointer pinnedMemory,
        CUdeviceptr devicePtr, long size) {
    JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP);
    final ByteBuffer hostView = pinnedMemory.getByteBuffer(0, size).order(ByteOrder.nativeOrder());
    JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0);
    final FloatBuffer floats = hostView.asFloatBuffer();
    return floats;
}
/**
 * Allocates host memory through {@code cuMemHostAlloc} with the
 * {@code CU_MEMHOSTALLOC_DEVICEMAP} flag, asks the driver for the corresponding device
 * pointer and exposes the host memory as an int buffer in native byte order.
 * <p>
 * The driver calls are issued on the calling thread.
 *
 * @param pinnedMemory pointer receiving the host allocation
 * @param devicePtr pointer receiving the device address of the allocation
 * @param size size handed to the allocation and to the byte buffer view
 * @return an int view on the allocated host memory
 */
public static IntBuffer getUnifiedIntBuffer(Pointer pinnedMemory,
        CUdeviceptr devicePtr,
        int size) {
    JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP);
    final ByteBuffer hostView = pinnedMemory.getByteBuffer(0, size).order(ByteOrder.nativeOrder());
    JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0);
    final IntBuffer ints = hostView.asIntBuffer();
    return ints;
}
/**
 * Allocates host memory through {@code cuMemHostAlloc} with the
 * {@code CU_MEMHOSTALLOC_DEVICEMAP} flag and asks the driver for the corresponding device
 * pointer, then returns a freshly created {@code int[size]}.
 * <p>
 * NOTE(review): the returned array is a plain Java array created before the allocation;
 * nothing in this method connects it to the allocated memory, and the byte buffer view
 * built here is discarded. Also, {@code size} is used both as the array length and as the
 * size passed to the allocation. Confirm whether callers rely on this.
 *
 * @param pinnedMemory pointer receiving the host allocation
 * @param devicePtr pointer receiving the device address of the allocation
 * @param size length of the returned array and size handed to the allocation
 * @return a new zero-filled array of {@code size} ints
 */
public static int[] getUnifiedIntArray(Pointer pinnedMemory,
        CUdeviceptr devicePtr, int size) {
    final int[] heapArray = new int[size];
    JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP);
    // The view is created and ordered as before, but it is not handed out.
    pinnedMemory.getByteBuffer(0, size).order(ByteOrder.nativeOrder());
    JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0);
    return heapArray;
}
/**
 * Allocates host memory through {@code cuMemHostAlloc} with the
 * {@code CU_MEMHOSTALLOC_DEVICEMAP} flag, asks the driver for the corresponding device
 * pointer and returns the host memory as a byte buffer in native byte order.
 * <p>
 * The driver calls are issued on the calling thread.
 *
 * @param pinnedMemory pointer receiving the host allocation
 * @param devicePtr pointer receiving the device address of the allocation
 * @param size size handed to the allocation and to the byte buffer view
 * @return the byte buffer view on the allocated host memory
 */
public static ByteBuffer getUnifiedByteBuffer(Pointer pinnedMemory,
        CUdeviceptr devicePtr, int size) {
    JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP);
    final ByteBuffer hostView = pinnedMemory.getByteBuffer(0, size);
    hostView.order(ByteOrder.nativeOrder());
    JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0);
    return hostView;
}
/**
 * Synchronizes every engine, then shuts each one down and removes it from the registry.
 * The whole operation holds the lock on the engine registry.
 */
public static void stop() {
    synchronized (cudaEngines) {
        cuCtxSynchronizeAll();
        // Shut down and unregister in a single pass over the registry.
        cudaEngines.values().removeIf(engine -> {
            engine.shutdown();
            return true;
        });
    }
}
/**
 * Asks every registered engine to release the memory of its CudaObjects.
 * <p>
 * This delegates to {@link #freeCUObjectsMemory()}, which also destroys the engine's
 * context, so the engines are not usable for driver work afterwards.
 */
public static synchronized void freeMemory() {
    final Iterator<CudaEngine> engines = cudaEngines.values().iterator();
    while (engines.hasNext()) {
        engines.next().freeCUObjectsMemory();
    }
}
/**
 * Frees the memory of the currently registered CudaObjects and destroys the context.
 * <p>
 * The work is queued on this engine's executor thread and this method returns without
 * waiting for it. The context is destroyed even if the synchronization or the release of
 * an object fails.
 * <p>
 * NOTE(review): {@code CudaObject.freeMemory()} is not visible here; if it defers work to
 * this engine's executor, that work would run after the context has been destroyed —
 * verify against its implementation.
 */
public void freeCUObjectsMemory() {
    exe.submit(() -> {
        try {
            // This task already runs on the engine's only executor thread, so the driver is
            // called directly. Going through this.cuCtxSynchronize() would queue a second
            // task behind this one and wait for it, which can never complete.
            JCudaDriver.cuCtxSynchronize();
            for (CudaObject co : cudaObjects) {
                try {
                    co.freeMemory();
                } catch (RuntimeException e) {
                    // One failing object must not prevent the others from being released.
                    logger.log(Level.WARNING, "Could not free the memory of " + co, e);
                }
            }
        } finally {
            JCudaDriver.cuCtxDestroy(context);
        }
    });
}
/**
 * Shuts this engine down: queues the release of its objects unless the executor is already
 * shut down, stops accepting tasks, then waits up to 10 seconds for the executor to
 * terminate and reports the outcome on the standard error stream.
 */
private synchronized void shutdown() {
    final boolean alreadyStopped = exe.isShutdown();
    if (!alreadyStopped) {
        freeCUObjectsMemory();
    }
    exe.shutdown();
    try {
        final boolean terminated = exe.awaitTermination(10, TimeUnit.SECONDS);
        System.err.println("cuda device "+Id+" freed ? " + terminated);
    } catch (InterruptedException interrupted) {
        interrupted.printStackTrace();
    }
}
/**
 * Creates a new stream with the {@code CU_STREAM_NON_BLOCKING} flag on this engine's
 * executor thread and waits for the result.
 *
 * @return the new stream, or {@code null} if the creation failed or the calling thread
 *         was interrupted
 */
public CUstream getNewCudaStream(){
    try {
        return exe.submit(() -> {
            final CUstream cudaStream = new CUstream();
            JCudaDriver.cuStreamCreate(cudaStream, CUstream_flags.CU_STREAM_NON_BLOCKING);
            return cudaStream;
        }).get();
    } catch (InterruptedException e) {
        // Keep the interruption visible to the caller instead of silently consuming it.
        Thread.currentThread().interrupt();
        logger.log(Level.SEVERE, "Interrupted while creating a cuda stream on " + this, e);
    } catch (ExecutionException e) {
        logger.log(Level.SEVERE, "Cannot create a cuda stream on " + this, e);
    }
    return null;
}
/**
 * Makes a kernel function available from a {@code .cu} file found on the class path.
 * <p>
 * The source is copied into {@link #ioTmpDir} when the copy is missing or outdated, the
 * file is handed to {@code KernelLauncher.create} (forcing a rebuild when the copy was
 * refreshed), the resulting {@code .ptx} file is loaded into this engine's module and the
 * function is fetched from it.
 * <p>
 * The source may be a plain file or an entry of an archive on the class path.
 *
 * @param kernelFunctionName name of the kernel function to fetch
 * @param dotCuSourceFilePath class path location of the {@code .cu} file
 * @return the function handle, or {@code null} if the source cannot be found or copied
 */
private CUfunction updateCuSourceFile(String kernelFunctionName, String dotCuSourceFilePath) {
    try {
        final URL resource = CudaEngine.class.getResource(dotCuSourceFilePath);
        if(resource == null)
            throw new FileNotFoundException(dotCuSourceFilePath+" not found on the class path");
        // Only "file" URLs can be turned into a File; an archive entry has no file of its
        // own, so its name is taken from the requested path instead.
        final String sourceName = "file".equals(resource.getProtocol())
                ? new File(resource.toURI()).getName()
                : dotCuSourceFilePath.substring(dotCuSourceFilePath.lastIndexOf('/') + 1);
        final Path localCopy = Paths.get(CudaEngine.ioTmpDir, sourceName);
        final boolean rebuildNeeded = refreshLocalCopy(resource, localCopy);
        final String cuFile = localCopy.toString();
        if(rebuildNeeded){
            System.err.println("--------------- Compiling ptx from "+cuFile);
        }
        KernelLauncher.create(
                cuFile,
                kernelFunctionName, rebuildNeeded, "--use_fast_math","--prec-div=false");
        JCudaDriver.cuModuleLoad(myModule, cuFile.substring(0, cuFile.lastIndexOf('.')) + ".ptx");
        System.err.println("initializing kernel "+ kernelFunctionName);
        final CUfunction function = new CUfunction();
        JCudaDriver.cuModuleGetFunction(function, myModule, kernelFunctionName);
        return function;
    } catch (URISyntaxException | IOException e) {
        logger.log(Level.SEVERE, "Cannot prepare kernel source " + dotCuSourceFilePath, e);
    }
    return null;
}

/**
 * Brings the local copy of a kernel source up to date.
 * A plain file is compared by modification time, any other resource by content.
 *
 * @return {@code true} if the local copy was written, i.e. a rebuild is needed
 */
private static boolean refreshLocalCopy(URL resource, Path localCopy) throws IOException, URISyntaxException {
    if ("file".equals(resource.getProtocol())) {
        final File source = new File(resource.toURI());
        final File copy = localCopy.toFile();
        if (copy.exists() && copy.lastModified() >= source.lastModified()) {
            return false;
        }
        Files.copy(source.toPath(), localCopy, StandardCopyOption.REPLACE_EXISTING);
        return true;
    }
    // No reliable timestamp for an archive entry: stage it and compare the bytes.
    final Path staged = Files.createTempFile("turtlekit-kernel-", ".cu");
    try {
        try (java.io.InputStream in = resource.openStream()) {
            Files.copy(in, staged, StandardCopyOption.REPLACE_EXISTING);
        }
        if (Files.exists(localCopy)
                && java.util.Arrays.equals(Files.readAllBytes(staged), Files.readAllBytes(localCopy))) {
            return false;
        }
        Files.copy(staged, localCopy, StandardCopyOption.REPLACE_EXISTING);
        return true;
    } finally {
        Files.deleteIfExists(staged);
    }
}
/**
 * Gives the block edge length used by this engine, i.e. the square root of the device's
 * maximum number of threads per block as computed by the constructor.
 *
 * @return the block edge length
 */
public int getMaxThreads() {
    return this.maxThreads;
}
/**
 * Synchronizes the context of every registered engine, one engine after the other.
 */
public static synchronized void cuCtxSynchronizeAll() {
    final Iterator<CudaEngine> engines = cudaEngines.values().iterator();
    while (engines.hasNext()) {
        final CudaEngine engine = engines.next();
        engine.cuCtxSynchronize();
    }
}
/**
 * Runs {@code JCudaDriver.cuCtxSynchronize()} on this engine's executor thread and waits
 * for it to finish.
 * <p>
 * Failures are logged and not propagated. Because the call is queued on the engine's
 * single executor thread and waited for, it must not be invoked from a task that is itself
 * running on that thread.
 */
public void cuCtxSynchronize() {
    try {
        exe.submit(() -> JCudaDriver.cuCtxSynchronize()).get();
    } catch (InterruptedException e) {
        // Keep the interruption visible to the caller instead of silently consuming it.
        Thread.currentThread().interrupt();
        logger.log(Level.WARNING, "Interrupted while synchronizing " + this, e);
    } catch (ExecutionException e) {
        logger.log(Level.WARNING, "Synchronization failed on " + this, e);
    }
}
/**
 * Queues a task on this engine's executor thread.
 *
 * @param runnable the task to run
 * @return the future of the task, or {@code null} if the executor has been shut down
 */
public Future<?> submit(Runnable runnable) {
    return exe.isShutdown() ? null : exe.submit(runnable);
}
/**
 * Queues a {@code cuMemFreeHost} call for the given pointer on this engine's executor
 * thread; the call is not waited for.
 *
 * @param p the host pointer to release
 */
public void freeCudaMemory(Pointer p){
    final Callable<Integer> release = () -> cuMemFreeHost(p);
    exe.submit(release);
}
/**
 * Queues a {@code cuMemFree} call for the given device pointer on this engine's executor
 * thread; the call is not waited for.
 *
 * @param p the device pointer to release
 */
public void freeCudaMemory(CUdeviceptr p){
    final Callable<Integer> release = () -> cuMemFree(p);
    exe.submit(release);
}
/**
 * Implements a little test that initializes the engines, builds a kernel on the first
 * device and then cleans up.
 * <p>
 * The engine registered for device 0 by {@link #init(String)} is used rather than a second,
 * unregistered engine, so that {@link #stop()} releases everything that was created.
 *
 * @param args unused
 */
public static void main(String[] args) {
    if (!init(Level.ALL.toString())) {
        System.err.println("Cuda could not be initialized: nothing to test");
        return;
    }
    try {
        final CudaEngine cudaEngine = cudaEngines.get(0);
        KernelConfiguration kernelConfiguration = cudaEngine.getDefaultKernelConfiguration(100, 100);
        cudaEngine.getKernel("EVAPORATION", "/turtlekit/cuda/kernels/Evaporation_2D.cu", kernelConfiguration);
    } finally {
        stop();
    }
}
}