CudaEngine.java example

Explorer
TurtleKit-master
- demos
  - turtlekit
    - digitalart
      - BlackAndWhiteViewing.java
      - LangtonAnt.java
    - epidemic
      - Virus.java
    - flocking
    - galaxy
      - BlackHole.java
      - Star.java
    - langtonAnts
      - LangtonAnt.java
      - RLR_LangtonAnt.java
    - mle
    - preypredator
      - Predator.java
      - Prey.java
    - pvequalsnrt
    - termites
      - Termite.java
      - TermiteViewer.java
    - toys
- src
  - turtlekit
/*******************************************************************************
 * TurtleKit 3 - Agent Based and Artificial Life Simulation Platform
 * Copyright (C) 2011-2016 Fabien Michel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package turtlekit.cuda;

import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemFree;
import static jcuda.driver.JCudaDriver.cuMemFreeHost;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import jcuda.CudaException;
import jcuda.Pointer;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.CUstream;
import jcuda.driver.CUstream_flags;
import jcuda.driver.JCudaDriver;
import jcuda.utils.KernelLauncher;

/**
 * Gives access to the CUDA devices of the machine through the JCuda driver API.
 * <p>
 * {@link #init(String)} creates one engine per detected device. Each engine owns one
 * CUDA context and a single-threaded executor: the driver calls issued by the instance
 * methods of this class are submitted to that executor so that they all run on the
 * thread which created the context.
 * <p>
 * The static registry {@link #cudaEngines} is guarded by its own monitor.
 */
public class CudaEngine {

	/**
	 * Native libs have to be loaded from the file system. 
	 * This is the dir where native libs are extracted (in the default io dir of the OS)
	 */
	public static final String ioTmpDir = System.getProperty("java.io.tmpdir");

	/** Number of devices reported by the driver; 0 until {@link #init(String)} succeeds. */
	private static int availableDevicesNb = 0;

	/** Pool used by {@link #init(String)} to create the engines. */
	private static ExecutorService initialization;

	/** Engines indexed by device number. Guarded by its own monitor. */
	final static Map<Integer,CudaEngine> cudaEngines = new HashMap<>();

	/** Currently not used by this class. */
	private static int NB_OF_DEVICE_TO_USE = 1;

	/** Created eagerly so that logging never fails when {@link #init(String)} was not called. */
	private static Logger logger = Logger.getLogger(CudaEngine.class.getSimpleName());

	/**
	 * Initializes the CUDA driver and creates one engine per available device.
	 * A JVM shutdown hook calling {@link #stop()} is registered on success.
	 * Calling this method again while engines exist only updates the log level.
	 * 
	 * @param logLevel a level name or number accepted by {@link Level#parse(String)}
	 * @return <code>true</code> if at least one device was found and initialized,
	 *         <code>false</code> if there is no device or if the initialization failed
	 * @throws IllegalArgumentException if <code>logLevel</code> is not a valid level
	 */
	public static boolean init(String logLevel) {
		// The original code then forced Level.ALL, which made the parameter useless.
		logger.setLevel(Level.parse(logLevel));
		synchronized (cudaEngines) {
			if (isCudaAvailable() && !cudaEngines.isEmpty()) {
				// Already initialized: do not leak the running engines nor add a second hook.
				return true;
			}
			logger.finer("---------Initializing Cuda----------------");
			try {
				JCudaDriver.setExceptionsEnabled(true);
				JCudaDriver.cuInit(0);
				// Obtain the number of devices
				int deviceCountArray[] = { 0 };
				JCudaDriver.cuDeviceGetCount(deviceCountArray);
				availableDevicesNb = deviceCountArray[0];
				if (availableDevicesNb == 0)
					return false;
				initialization = Executors.newCachedThreadPool();
				logger.finer("Found " + availableDevicesNb + " GPU devices");
				final Runnable createEngines = () -> {
					for (int index = 0; index < availableDevicesNb; index++) {
						logger.finer("Initializing device n°" + index);
						cudaEngines.put(index, new CudaEngine(index));
					}
				};
				initialization.submit(createEngines).get();
			} catch (InterruptedException | ExecutionException | CudaException | UnsatisfiedLinkError e) {
				if (e instanceof InterruptedException) {
					Thread.currentThread().interrupt();
				}
				logger.log(Level.WARNING, "---------Cannot initialize Cuda !!! ----------------", e);
				// Without this reset isCudaAvailable() would keep answering true.
				availableDevicesNb = 0;
				// Releases the engines that were created before the failure, if any.
				stop();
				return false;
			} finally {
				if (initialization != null) {
					initialization.shutdown();
				}
			}
			Runtime.getRuntime().addShutdownHook(new Thread() {
				@Override
				public void run() {
					CudaEngine.stop();
				}
			});
			logger.fine("---------Cuda Initialized----------------");
			return true;
		}
	}

	/**
	 * @return the number of devices found by the last call to {@link #init(String)}
	 */
	public int cuDeviceGetCount() {
		return availableDevicesNb;
	}

	/** Counter used to spread the {@link CudaObject}s over the engines. */
	private static AtomicInteger cudaObjectID = new AtomicInteger(0);

	/** Engine assigned to each {@link CudaObject}. Guarded by its own monitor. */
	private static Map<CudaObject, CudaEngine> engineBinds = new HashMap<>();

	/** Single thread on which every driver call of this engine is executed. */
	private final ExecutorService exe;

	/** Not used within this class. */
	protected CUfunction f;

	/** Objects bound to this engine; their memory is released by {@link #freeCUObjectsMemory()}. */
	private List<CudaObject> cudaObjects = new ArrayList<CudaObject>();

	/** Integer square root of the device's maximum number of threads per block. */
	private int maxThreads;

	/** Device number of this engine. */
	private int id = -1;

	protected CUcontext context;

	protected CUmodule myModule;

	/** Loaded kernel functions, keyed by function name + source path. Only used on the engine thread. */
	private Map<String, CUfunction> functions = new HashMap<>();

	/** Set once the context has been destroyed. Only read and written on the engine thread. */
	private boolean contextDestroyed = false;

	/**
	 * Creates the engine thread and, on that thread, the CUDA context of the given device.
	 * 
	 * @param deviceId the device number
	 * @throws RuntimeException if the context cannot be created; the cause is preserved
	 */
	private CudaEngine(final int deviceId) {
		// mandatory: Only one cuda thread per context
		exe = Executors.newSingleThreadExecutor(r -> {
			Thread thread = new Thread(r);
			thread.setDaemon(true);
			return thread;
		});
		id = deviceId;
		final Runnable createContext = () -> {
			CUdevice device = new CUdevice();
			JCudaDriver.cuDeviceGet(device, deviceId);
			int array[] = { 0 };
			JCudaDriver.cuDeviceGetAttribute(array,
					CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
			maxThreads = (int) Math.sqrt(array[0]);
			context = new CUcontext();
			JCudaDriver.cuCtxCreate(context, 0, device);
			myModule = new CUmodule();
		};
		try {
			exe.submit(createContext).get();
		} catch (InterruptedException | ExecutionException e) {
			if (e instanceof InterruptedException) {
				Thread.currentThread().interrupt();
			}
			exe.shutdown();
			throw new RuntimeException("Cannot create the CUDA context of device #" + deviceId, e);
		}
	}

	/**
	 * @return <code>true</code> if {@link #init(String)} found at least one device
	 */
	public static boolean isCudaAvailable() {
		return availableDevicesNb != 0;
	}

	@Override
	public String toString() {
		return "cudaEngine device #"+id;
	}

	/**
	 * Does nothing for now.
	 * 
	 * @param aPathInTheClassPath ignored
	 */
	public static void addKernelSourceFile(String aPathInTheClassPath){
	}

	/**
	 * Builds a configuration whose grid sizes are <code>ceil(dataSize / maxThreads)</code>
	 * in each dimension, with a newly created stream. {@link #getMaxThreads()} is passed
	 * twice to the configuration (presumably as the 2D block size; see
	 * <code>KernelConfiguration</code>).
	 * 
	 * @param dataSizeX the width of the data
	 * @param dataSizeY the height of the data
	 * @return the corresponding configuration
	 */
	public KernelConfiguration getDefaultKernelConfiguration(int dataSizeX, int dataSizeY){
		int gridSizeX = (dataSizeX + maxThreads - 1) / maxThreads;
		int gridSizeY = (dataSizeY + maxThreads - 1) / maxThreads;
		return new KernelConfiguration(gridSizeX, gridSizeY, maxThreads, maxThreads, getNewCudaStream());
	}

	/**
	 * Returns a kernel for the given function, compiling and loading its source file
	 * the first time this function/file pair is requested on this engine.
	 * 
	 * @param kernelFunctionName the name of the kernel function
	 * @param cuSourceFilePath the class path location of the .cu file
	 * @param kc the configuration given to the kernel
	 * @return the kernel, or <code>null</code> if it could not be created
	 */
	public CudaKernel getKernel(final String kernelFunctionName, final String cuSourceFilePath, final KernelConfiguration kc){
		try {
			return exe.submit(() -> {
				CUfunction function = functions.computeIfAbsent(""+kernelFunctionName+cuSourceFilePath, k -> updateCuSourceFile(kernelFunctionName,cuSourceFilePath));
				return new CudaKernel(function, CudaEngine.this, kernelFunctionName, cuSourceFilePath, kc);
			}).get();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			logger.log(Level.SEVERE, "Interrupted while creating kernel " + kernelFunctionName, e);
		} catch (ExecutionException e) {
			logger.log(Level.SEVERE, "Cannot create kernel " + kernelFunctionName, e);
		}
		return null;
	}

	/**
	 * Allocates device memory for a <code>wdith x height</code> grid of the given type.
	 * 
	 * @param wdith the width of the grid
	 * @param height the height of the grid
	 * @param dataType the element type: a primitive numeric/char class or its wrapper
	 * @return the device pointer; it is left unallocated if the allocation failed
	 */
	public <T> CUdeviceptr createDeviceDataGrid(int wdith, int height, Class<T> dataType){
		CUdeviceptr tmpPtr = new CUdeviceptr();
		try {
			final long size = getRequiredMemorySize(dataType, wdith, height);
			final Future<?> allocation = submit(() -> cuMemAlloc(tmpPtr, size));
			if (allocation == null) {
				// submit() answers null once the engine is shut down
				logger.warning(this + " is shut down: device memory not allocated");
			} else {
				allocation.get();
			}
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			logger.log(Level.SEVERE, "Interrupted while allocating device memory", e);
		} catch (ExecutionException | IllegalArgumentException e) {
			logger.log(Level.SEVERE, "Cannot allocate device memory", e);
		}
		return tmpPtr;
	}

	/**
	 * Allocates host memory mapped into the device address space
	 * (<code>CU_MEMHOSTALLOC_DEVICEMAP</code>) for a <code>wdith x height</code> grid
	 * and returns a typed view on it, in native byte order.
	 * 
	 * @param hostData receives the host pointer
	 * @param deviceData receives the corresponding device pointer
	 * @param dataType the element type: a primitive numeric/char class or its wrapper
	 * @param wdith the width of the grid
	 * @param height the height of the grid
	 * @return the typed buffer (the {@link ByteBuffer} itself for bytes),
	 *         or <code>null</code> if the type is not supported or the allocation failed
	 */
	public <T> Buffer getUnifiedBufferBetweenPointer(Pointer hostData, CUdeviceptr deviceData, Class<T> dataType, int wdith, int height){
		try {
			// Computed first: an unsupported type is rejected before anything is allocated.
			final long size = getRequiredMemorySize(dataType, wdith, height);
			final ByteBuffer bytes = exe.submit(() -> {
				JCudaDriver.cuMemHostAlloc(hostData, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP);
				final ByteBuffer byteBuffer = hostData.getByteBuffer(0, size);
				byteBuffer.order(ByteOrder.nativeOrder());
				JCudaDriver.cuMemHostGetDevicePointer(deviceData, hostData, 0);
				return byteBuffer;
			}).get();
			return typedView(bytes, dataType);
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			logger.log(Level.SEVERE, "Interrupted while allocating unified memory", e);
		} catch (ExecutionException | IllegalArgumentException e) {
			logger.log(Level.SEVERE, "Cannot allocate unified memory", e);
		}
		return null;
	}

	/**
	 * Returns the view of <code>bytes</code> matching the given element type.
	 * 
	 * @throws IllegalArgumentException if the type is not supported
	 */
	private static Buffer typedView(ByteBuffer bytes, Class<?> dataType) {
		if (dataType == Float.class || dataType == float.class)
			return bytes.asFloatBuffer();
		if (dataType == Integer.class || dataType == int.class)
			return bytes.asIntBuffer();
		if (dataType == Double.class || dataType == double.class)
			return bytes.asDoubleBuffer();
		if (dataType == Long.class || dataType == long.class)
			return bytes.asLongBuffer();
		if (dataType == Short.class || dataType == short.class)
			return bytes.asShortBuffer();
		if (dataType == Character.class || dataType == char.class)
			return bytes.asCharBuffer();
		if (dataType == Byte.class || dataType == byte.class)
			return bytes;
		throw new IllegalArgumentException("Unsupported CUDA data type: " + dataType);
	}

	/**
	 * Returns the size in bytes of one element of the given type.
	 * 
	 * @throws IllegalArgumentException if the type is not supported
	 */
	private static int elementSizeInBytes(Class<?> dataType) {
		if (dataType == Float.class || dataType == float.class)
			return Float.BYTES;
		if (dataType == Integer.class || dataType == int.class)
			return Integer.BYTES;
		if (dataType == Double.class || dataType == double.class)
			return Double.BYTES;
		if (dataType == Long.class || dataType == long.class)
			return Long.BYTES;
		if (dataType == Short.class || dataType == short.class)
			return Short.BYTES;
		if (dataType == Character.class || dataType == char.class)
			return Character.BYTES;
		if (dataType == Byte.class || dataType == byte.class)
			return Byte.BYTES;
		throw new IllegalArgumentException("Unsupported CUDA data type: " + dataType);
	}

	/**
	 * Computes the number of bytes required by a <code>wdith x height</code> grid.
	 * The product is computed as a <code>long</code> so that large grids do not overflow.
	 * 
	 * @throws IllegalArgumentException if a dimension is negative or the type is not supported
	 */
	private <T> long getRequiredMemorySize(Class<T> dataType, int wdith, int height) {
		if (wdith < 0 || height < 0) {
			throw new IllegalArgumentException("Negative grid dimension: " + wdith + " x " + height);
		}
		return (long) wdith * height * elementSizeInBytes(dataType);
	}

	/**
	 * Returns the engine bound to the given object, binding it to one of the
	 * available engines (chosen in turn) on the first call.
	 * 
	 * @param co the object requesting an engine
	 * @return the engine bound to <code>co</code>
	 * @throws CudaException if no device is available or if the engines have been stopped
	 */
	public static CudaEngine getCudaEngine(CudaObject co) {
		synchronized (cudaEngines) {
			if (!isCudaAvailable())
				throw new CudaException("No cuda device found");
			try {
				initialization.awaitTermination(10, TimeUnit.SECONDS);
			} 
			catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				logger.log(Level.WARNING, "Interrupted while waiting for the end of the initialization", e);
			}
			synchronized (engineBinds) {
				return engineBinds.computeIfAbsent(co, v -> {
					final int pheroID = cudaObjectID.incrementAndGet();
					// floorMod keeps the index valid even if the counter overflows
					final CudaEngine ce = cudaEngines.get(Math.floorMod(pheroID, availableDevicesNb));
					if (ce == null) {
						throw new CudaException("No running cuda engine: CudaEngine.stop() has been called");
					}
					ce.cudaObjects.add(co);
					logger.finer(co + "ID " + pheroID + " getting cuda engine Id " + ce.id);
					return ce;
				});
			}
		}
	}

	/**
	 * Allocates <code>size</code> bytes of host memory mapped into the device address
	 * space and returns them as a byte buffer in native order. The driver is called
	 * directly on the calling thread.
	 */
	private static ByteBuffer mapPinnedMemory(Pointer pinnedMemory, CUdeviceptr devicePtr, long size) {
		JCudaDriver.cuMemHostAlloc(pinnedMemory, size, JCudaDriver.CU_MEMHOSTALLOC_DEVICEMAP);
		final ByteBuffer byteBuffer = pinnedMemory.getByteBuffer(0, size);
		byteBuffer.order(ByteOrder.nativeOrder());
		JCudaDriver.cuMemHostGetDevicePointer(devicePtr, pinnedMemory, 0);
		return byteBuffer;
	}

	/**
	 * Allocates mapped host memory on the calling thread and views it as floats.
	 * 
	 * @param pinnedMemory receives the host pointer
	 * @param devicePtr receives the corresponding device pointer
	 * @param size the size of the allocation, in bytes
	 * @return a float view of the allocated memory
	 */
	static FloatBuffer getUnifiedFloatBuffer(Pointer pinnedMemory,
			CUdeviceptr devicePtr, long size) {
		return mapPinnedMemory(pinnedMemory, devicePtr, size).asFloatBuffer();
	}

	/**
	 * Allocates mapped host memory on the calling thread and views it as ints.
	 * 
	 * @param pinnedMemory receives the host pointer
	 * @param devicePtr receives the corresponding device pointer
	 * @param size the size of the allocation, in bytes
	 * @return an int view of the allocated memory
	 */
	public static IntBuffer getUnifiedIntBuffer(Pointer pinnedMemory,
			CUdeviceptr devicePtr, 
			int size) {
		return mapPinnedMemory(pinnedMemory, devicePtr, size).asIntBuffer();
	}

	/**
	 * Allocates <code>size</code> bytes of mapped host memory on the calling thread
	 * and returns a new array of <code>size</code> ints.
	 * <p>
	 * NOTE(review): the returned array is a plain Java array which is not backed by
	 * the allocated memory, and <code>size</code> is used both as a byte count and as
	 * an element count. Behavior kept as is; callers probably want
	 * {@link #getUnifiedIntBuffer(Pointer, CUdeviceptr, int)} instead. To be confirmed.
	 * 
	 * @param pinnedMemory receives the host pointer
	 * @param devicePtr receives the corresponding device pointer
	 * @param size the size of the allocation in bytes and the length of the array
	 * @return a zero-filled array of length <code>size</code>
	 */
	public static int[] getUnifiedIntArray(Pointer pinnedMemory,
			CUdeviceptr devicePtr, int size) {
		int[] values = new int[size];
		mapPinnedMemory(pinnedMemory, devicePtr, size);
		return values;
	}

	/**
	 * Allocates mapped host memory on the calling thread.
	 * 
	 * @param pinnedMemory receives the host pointer
	 * @param devicePtr receives the corresponding device pointer
	 * @param size the size of the allocation, in bytes
	 * @return the allocated memory as a byte buffer in native order
	 */
	public static ByteBuffer getUnifiedByteBuffer(Pointer pinnedMemory,
			CUdeviceptr devicePtr, int size) {
		return mapPinnedMemory(pinnedMemory, devicePtr, size);
	}

	/**
	 * Stop the executors and clean memory on registered CUObject
	 */
	public static void stop() {
		synchronized (cudaEngines) {
			cuCtxSynchronizeAll();
			for (Iterator<CudaEngine> iterator = cudaEngines.values()
					.iterator(); iterator.hasNext();) {
				iterator.next().shutdown();
				iterator.remove();
			}
		}
	}

	/**
	 * Calls {@link #freeCUObjectsMemory()} on every engine.
	 */
	public static void freeMemory() {
		// Same monitor as stop() and init(): the registry is never iterated while it changes.
		synchronized (cudaEngines) {
			for (CudaEngine ce : cudaEngines.values()) {
				ce.freeCUObjectsMemory();
			}
		}
	}

	/**
	 * Free memory from the currently registered CUObjects and destroys the context
	 * of this engine. The work is queued on the engine thread and this method returns
	 * immediately. Nothing is done if the engine is shut down or if the context has
	 * already been destroyed.
	 */
	public void freeCUObjectsMemory() {
		submit(() -> {
			if (contextDestroyed) {
				return;
			}
			try {
				// Direct driver call: this code already runs on the engine thread, so going
				// through this.cuCtxSynchronize() would wait for a task queued behind itself.
				JCudaDriver.cuCtxSynchronize();
				for (CudaObject co : cudaObjects) {
					co.freeMemory();
				}
				JCudaDriver.cuCtxDestroy(context);
				contextDestroyed = true;
			} catch (RuntimeException e) {
				logger.log(Level.WARNING, "Cannot free the memory of " + CudaEngine.this, e);
			}
		});
	}

	/**
	 * Frees the memory of this engine, then stops its thread, waiting at most 10 seconds.
	 */
	private synchronized void shutdown() {
		if (! exe.isShutdown()) {
			freeCUObjectsMemory();
		}
		exe.shutdown();
		try {
			System.err.println("cuda device "+id+" freed ? " +exe.awaitTermination(10, TimeUnit.SECONDS));
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			logger.log(Level.WARNING, "Interrupted while shutting down " + this, e);
		}
	}

	/**
	 * Creates a new non blocking stream on the engine thread.
	 * 
	 * @return the stream, or <code>null</code> if it could not be created
	 */
	public CUstream getNewCudaStream(){
		try {
			return exe.submit(() -> {
				final CUstream cudaStream = new CUstream();
				JCudaDriver.cuStreamCreate(cudaStream, CUstream_flags.CU_STREAM_NON_BLOCKING);
				return cudaStream;
			}).get();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			logger.log(Level.SEVERE, "Interrupted while creating a cuda stream", e);
		} catch (ExecutionException e) {
			logger.log(Level.SEVERE, "Cannot create a cuda stream", e);
		}
		return null;
	}

	/**
	 * Copies the .cu file into {@link #ioTmpDir} if the copy is missing or older,
	 * compiles it when it was copied, loads the resulting .ptx into the module of
	 * this engine and looks the function up.
	 * <p>
	 * NOTE(review): <code>new File(URI)</code> only accepts <code>file:</code> URIs, so a
	 * source file packaged in a jar makes this method throw an IllegalArgumentException.
	 * 
	 * @param kernelFunctionName the name of the kernel function
	 * @param dotCuSourceFilePath the class path location of the .cu file
	 * @return the function, or <code>null</code> if the source file could not be found or copied
	 */
	private CUfunction updateCuSourceFile(String kernelFunctionName, String dotCuSourceFilePath) {
		try {
			final URL resource = CudaEngine.class.getResource(dotCuSourceFilePath);
			if(resource == null)
				throw new FileNotFoundException(dotCuSourceFilePath+" not found on the class path");
			final File source = new File(resource.toURI());
			final Path target = Paths.get(CudaEngine.ioTmpDir, source.getName());
			final File targetFile = target.toFile();
			final boolean rebuildNeeded = ! targetFile.exists() || targetFile.lastModified() < source.lastModified();
			final String cuFile = target.toString();
			if(rebuildNeeded){
				Files.copy(source.toPath(), target, StandardCopyOption.REPLACE_EXISTING);
				System.err.println("--------------- Compiling ptx from "+cuFile);
			}
			KernelLauncher.create(
					cuFile,
					kernelFunctionName, rebuildNeeded, "--use_fast_math","--prec-div=false");
			JCudaDriver.cuModuleLoad(myModule, cuFile.substring(0, cuFile.lastIndexOf('.')) + ".ptx");
			System.err.println("initializing kernel "+ kernelFunctionName);

			CUfunction function = new CUfunction();
			JCudaDriver.cuModuleGetFunction(function, myModule, kernelFunctionName);
			return function;
		} catch (URISyntaxException | IOException e) {
			logger.log(Level.SEVERE, "Cannot prepare the kernel source " + dotCuSourceFilePath, e);
		}
		return null;
	}

	/**
	 * @return the integer square root of the device's maximum number of threads per block
	 */
	public int getMaxThreads() {
		return maxThreads;
	}

	/**
	 * Calls {@link #cuCtxSynchronize()} on every engine.
	 */
	public static void cuCtxSynchronizeAll() {
		// Same monitor as stop(), which calls this method while holding it.
		synchronized (cudaEngines) {
			for (CudaEngine ce : cudaEngines.values()) {
				ce.cuCtxSynchronize();
			}
		}
	}

	/**
	 * Runs <code>JCudaDriver.cuCtxSynchronize()</code> on the engine thread and waits
	 * for its completion. Failures are logged and nothing is done if the engine is
	 * shut down. Must not be called from the engine thread itself: the call would
	 * wait for a task that cannot start.
	 */
	public void cuCtxSynchronize() {
		final Future<?> pending = submit(() -> JCudaDriver.cuCtxSynchronize());
		if (pending == null) {
			return;
		}
		try {
			pending.get();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			logger.log(Level.WARNING, "Interrupted while synchronizing " + this, e);
		} catch (ExecutionException e) {
			logger.log(Level.WARNING, "Cannot synchronize " + this, e);
		}
	}

	/**
	 * Queues a task on the engine thread.
	 * 
	 * @param runnable the task
	 * @return the future of the task, or <code>null</code> if the engine is shut down
	 */
	public Future<?> submit(Runnable runnable) {
		if (! exe.isShutdown()) {
			return exe.submit(runnable);
		}
		return null;
	}

	/**
	 * Queues the release of host memory on the engine thread.
	 * Nothing is done if the engine is shut down.
	 * 
	 * @param p the host pointer to free
	 */
	public void freeCudaMemory(Pointer p){
		submit(() -> cuMemFreeHost(p));
	}

	/**
	 * Queues the release of device memory on the engine thread.
	 * Nothing is done if the engine is shut down.
	 * 
	 * @param p the device pointer to free
	 */
	public void freeCudaMemory(CUdeviceptr p){
		submit(() -> cuMemFree(p));
	}

	/**
	 * Implements a little test that initializes the CudaEngine and builds a kernel
	 * on the first device. The cleanup is done by the shutdown hook.
	 * 
	 * @param args
	 */
	public static void main(String[] args) {
		if (! init(Level.ALL.toString())) {
			System.err.println("No usable cuda device found");
			return;
		}
		final CudaEngine cudaEngine;
		synchronized (cudaEngines) {
			// Uses the engine made by init() instead of creating a second context on device 0.
			cudaEngine = cudaEngines.get(0);
		}
		KernelConfiguration kernelConfiguration = cudaEngine.getDefaultKernelConfiguration(100, 100);
		cudaEngine.getKernel("EVAPORATION", "/turtlekit/cuda/kernels/Evaporation_2D.cu", kernelConfiguration);
	}

}