package com.nativelibs4java.opencl; import static com.nativelibs4java.opencl.CLTestUtils.avgError; import static com.nativelibs4java.opencl.CLTestUtils.fillBuffersWithSomeDatad; import static com.nativelibs4java.opencl.CLTestUtils.fillBuffersWithSomeDataf; import static com.nativelibs4java.test.BenchmarkUtils.gc; import static com.nativelibs4java.util.NIOUtils.directBytes; import org.bridj.Pointer; import static org.bridj.Pointer.*; import java.util.Arrays; import java.nio.FloatBuffer; import java.nio.DoubleBuffer; import org.junit.BeforeClass; import org.junit.Test; import com.nativelibs4java.opencl.CLTestUtils.Action2; import com.nativelibs4java.opencl.CLTestUtils.ExecResult; //import com.nativelibs4java.scalacl.*; /// @see http://ati.amd.com/technology/streamcomputing/intro_opencl.html#simple public class OpenCL4JavaBenchmarkTest { static final boolean warmup = true; static ExecResult<FloatBuffer> testJava_float_aSinB(int loops, int dataSize) throws CLBuildException { FloatBuffer aBuffer = FloatBuffer.allocate(dataSize); FloatBuffer bBuffer = FloatBuffer.allocate(dataSize); FloatBuffer outputBuffer = FloatBuffer.allocate(dataSize); float[] a = aBuffer.array(), b = bBuffer.array(), output = outputBuffer.array(); if (warmup) { System.out.print("Warming up Java operations..."); for (int i = 0; i < 3000; i++) { java_aSinB(a, b, output, 100); } System.out.println(); } fillBuffersWithSomeDataf(aBuffer, bBuffer); gc(); long start = System.nanoTime(); for (int i = 0; i < loops; i++) { java_aSinB(a, b, output, dataSize); } long time = System.nanoTime() - start; System.out.println("Java operations : " + time + "ns"); return new ExecResult<FloatBuffer>(outputBuffer, time / (loops * (double) dataSize)); } static ExecResult<DoubleBuffer> testJava_double_aSinB(int loops, int dataSize) throws CLBuildException { DoubleBuffer aBuffer = DoubleBuffer.allocate(dataSize); DoubleBuffer bBuffer = DoubleBuffer.allocate(dataSize); DoubleBuffer outputBuffer = DoubleBuffer.allocate(dataSize); double[] a = aBuffer.array(), b = bBuffer.array(), output = outputBuffer.array(); if (warmup) { System.out.print("Warming up Java operations..."); for (int i = 0; i < 3000; i++) { java_aSinB(a, b, output, 100); } System.out.println(); } fillBuffersWithSomeDatad(aBuffer, bBuffer); gc(); long start = System.nanoTime(); for (int i = 0; i < loops; i++) { java_aSinB(a, b, output, dataSize); } long time = System.nanoTime() - start; System.out.println("Java operations : " + time + "ns"); return new ExecResult<DoubleBuffer>(outputBuffer, time / (loops * (double) dataSize)); } static ExecResult<Pointer<Float>> testOpenCL_float_aSinB(CLContext context, int loops, int dataSize, boolean hostInOpenCL) throws CLBuildException { ExecResult<Pointer<Byte>> er = testOpenCL_aSinB(context, Prim.Float, loops, dataSize, hostInOpenCL, new Action2<Pointer<Byte>, Pointer<Byte>>() { public void call(Pointer<Byte> a, Pointer<Byte> b) { fillBuffersWithSomeDataf(a.as(Float.class), b.as(Float.class)); } }); return new ExecResult<Pointer<Float>>(er.buffer.as(Float.class), er.unitTimeNano); } static ExecResult<Pointer<Double>> testOpenCL_double_aSinB(CLContext context, int loops, int dataSize, boolean hostInOpenCL) throws CLBuildException { ExecResult<Pointer<Byte>> er = testOpenCL_aSinB(context, Prim.Double, loops, dataSize, hostInOpenCL, new Action2<Pointer<Byte>, Pointer<Byte>>() { public void call(Pointer<Byte> a, Pointer<Byte> b) { fillBuffersWithSomeDatad(a.as(Double.class), b.as(Double.class)); } }); return new ExecResult<Pointer<Double>>(er.buffer.as(Double.class), er.unitTimeNano); } static ExecResult<Pointer<Byte>> testOpenCL_aSinB(CLContext context, Prim nativePrim, int loops, int dataSize, boolean hostInOpenCL, Action2<Pointer<Byte>, Pointer<Byte>> fillBuffersWithSomeData) throws CLBuildException { CLKernel kernel = setupASinB(nativePrim, context); CLQueue queue = context.createDefaultQueue(); Pointer<Byte> input1 = null, input2 = null, output = null; CLBuffer<Byte> memIn1, memIn2, memOut; if (hostInOpenCL) { memIn1 = kernel.program.context.createBuffer(CLMem.Usage.Input, Byte.class, dataSize * nativePrim.sizeof()); memIn2 = kernel.program.context.createBuffer(CLMem.Usage.Input, Byte.class, dataSize * nativePrim.sizeof()); memOut = kernel.program.context.createBuffer(CLMem.Usage.Output, Byte.class, dataSize * nativePrim.sizeof()); } else { input1 = allocateBytes(dataSize * nativePrim.sizeof()).order(context.getByteOrder()); input2 = allocateBytes(dataSize * nativePrim.sizeof()).order(context.getByteOrder()); output = allocateBytes(dataSize * nativePrim.sizeof()).order(context.getByteOrder()); memIn1 = kernel.program.context.createBuffer(CLMem.Usage.Input, input1, false); memIn2 = kernel.program.context.createBuffer(CLMem.Usage.Input, input2, false); memOut = kernel.program.context.createBuffer(CLMem.Usage.Output, output, false); } kernel.setArgs(memIn1, memIn2, memOut); if (warmup) { for (int i = 0; i < 3000; i++) { kernel.enqueueNDRange(queue, new int[]{dataSize}); } queue.finish(); } if (hostInOpenCL) { input1 = memIn1.map(queue, CLMem.MapFlags.Write); input2 = memIn2.map(queue, CLMem.MapFlags.Write); } fillBuffersWithSomeData.call(input1, input2); if (hostInOpenCL) { memIn1.unmap(queue, input1); memIn2.unmap(queue, input2); } queue.finish(); gc(); //if (dataSize < workItemSize) { // System.err.println("dataSize = " + dataSize + " is lower than max workItemSize for first dim = " + workItemSize + " !!!"); // workItemSize = 1; //} long start = System.nanoTime(); for (int i = 0; i < loops; i++) { kernel.enqueueNDRange(queue, new int[]{dataSize});//, new int[]{workItemSize}); } queue.finish(); long time = System.nanoTime() - start; //System.out.println("OpenCL operations(" + target + ") : " + time + "ns"); if (hostInOpenCL) { // Copy the OpenCL-hosted array back to RAM output = memOut.map(queue, CLMem.MapFlags.Read); // System.out.println("memOut.map = " + Long.toHexString(output.getPeer())); //queue.finish(); Pointer<Byte> b = allocateBytes(dataSize * nativePrim.sizeof()).order(context.getByteOrder()); output.copyTo(b, dataSize); memOut.unmap(queue, output); output = b; } return new ExecResult<Pointer<Byte>>(output, time / (loops * (double) dataSize)); } static CLKernel setupASinB(Prim nativeType, CLContext context) throws CLBuildException { String src = "\n" //+ "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n" + (nativeType == Prim.Double ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" : "") + "__kernel void aSinB( \n" + " __global const " + nativeType + "* a, \n" + " __global const " + nativeType + "* b, \n" + " __global " + nativeType + "* output) \n" + "{ \n" + " int i = get_global_id(0); \n" + " float ai = a[i], bi = b[i]; \n" + " output[i] = ai * sin(bi);// + atan2(ai, bi); \n" + "} \n"; CLProgram program = context.createProgram(src).build(); CLKernel kernel = program.createKernel("aSinB"); return kernel; } public static void java_aSinB(float[] a, float[] b, float[] output, int dataSize) throws CLBuildException { for (int i = 0; i < dataSize; i++) { float ai = a[i], bi = b[i]; output[i] = ai * (float) Math.sin(bi);// + (float)Math.atan2(ai, bi); } } public static void java_aSinB(double[] a, double[] b, double[] output, int dataSize) throws CLBuildException { for (int i = 0; i < dataSize; i++) { double ai = a[i], bi = b[i]; output[i] = ai * Math.sin(bi);// + Math.atan2(ai, bi); } } enum Prim { Float(4), Double(8), Int(4), Short(2), Half(2); final int sizeof; Prim(int sizeof) { this.sizeof = sizeof; } public int sizeof() { return sizeof; } public String toString() { return name().toLowerCase(); } } @Test public void testBenchmark() { /*for (String s : new String[] { "C:\\Program Files (x86)\\ATI Stream\\bin\\x86_64\\OpenCL.dll", "C:\\Program Files (x86)\\ATI Stream\\bin\\x86\\OpenCL.dll", "C:\\Program Files\\ATI Stream\\bin\\x86\\OpenCL.dll" }) if ((f = new File(s)).exists()) break; if (f.exists()) System.setProperty("OpenCL.library", f.toString()); //sss */ try { System.out.println("#\n# " + OpenCL4JavaBenchmarkTest.class.getName() + "\n#"); System.out.println("java.vm.name = " + System.getProperty("java.vm.name")); System.out.println("java.vm.version = " + System.getProperty("java.runtime.version")); System.out.println("Found platforms : " + Arrays.asList(JavaCL.listPlatforms())); CLPlatform platform = JavaCL.listPlatforms()[0]; String v = platform.getVendor(); System.out.println("Platform Vendor: " + v); boolean isAMD = v.equals("Advanced Micro Devices, Inc."); int loops = 10; int dataSize = isAMD ? 1024 : 1024 * 1024; CLContext context = JavaCL.createBestContext(); boolean hasDoubleSupport = context.isDoubleSupported(); if (!hasDoubleSupport) { System.out.println("OpenCL context does not support double precision computations : skipping second part of the test"); } else { System.out.println("#\n# [Double Operations]\n#"); ExecResult<DoubleBuffer> nsByJavaOp = testJava_double_aSinB(loops, dataSize); ExecResult<Pointer<Double>> nsByCLHostedOp = testOpenCL_double_aSinB(context, loops, dataSize, true); ExecResult<Pointer<Double>> nsByNativeHostedCLOp = testOpenCL_double_aSinB(context, loops, dataSize, false); double errCLHosted = avgError(nsByJavaOp.buffer, nsByCLHostedOp.buffer, dataSize); double errNativeHosted = avgError(nsByJavaOp.buffer, nsByNativeHostedCLOp.buffer, dataSize); System.out.println(" Avg relative error (hosted in CL) = " + errCLHosted); System.out.println("Avg relative error (hosted in RAM) = " + errNativeHosted); System.out.println(); System.out.println(" java op\t= " + nsByJavaOp.unitTimeNano + " ns"); System.out.println(); System.out.println(" opencl (hosted in CL) op\t= " + nsByCLHostedOp.unitTimeNano + " ns"); System.out.println(" times slower than Java = " + (nsByCLHostedOp.unitTimeNano / nsByJavaOp.unitTimeNano)); System.out.println(" times faster than Java = " + (nsByJavaOp.unitTimeNano / nsByCLHostedOp.unitTimeNano)); System.out.println(); System.out.println("opencl (hosted in RAM) op\t= " + nsByNativeHostedCLOp.unitTimeNano + " ns"); System.out.println(" times slower than Java = " + (nsByNativeHostedCLOp.unitTimeNano / nsByJavaOp.unitTimeNano)); System.out.println(" times faster than Java = " + (nsByJavaOp.unitTimeNano / nsByNativeHostedCLOp.unitTimeNano)); } if (true) { System.out.println("#\n# [Float Operations]\n#"); ExecResult<FloatBuffer> nsByJavaOp = testJava_float_aSinB(loops, dataSize); ExecResult<Pointer<Float>> nsByCLHostedOp = testOpenCL_float_aSinB(context, loops, dataSize, true); ExecResult<Pointer<Float>> nsByNativeHostedCLOp = testOpenCL_float_aSinB(context, loops, dataSize, false); double errCLHosted = avgError(nsByJavaOp.buffer, nsByCLHostedOp.buffer, dataSize); double errNativeHosted = avgError(nsByJavaOp.buffer, nsByNativeHostedCLOp.buffer, dataSize); /*for (int i = 0; i < 10; i++) { System.out.print("i\t = " + i + ",\t"); System.out.print("buf \t=" + nsByJavaOp.buffer.get(i) +",\t"); System.out.print("nat \t=" + nsByNativeHostedCLOp.buffer.get(i) +",\t"); System.out.print("ocl \t=" + nsByCLHostedOp.buffer.get(i) +",\t"); System.out.println(); }*/ System.out.println(" Avg relative error (hosted in CL) = " + errCLHosted); System.out.println("Avg relative error (hosted in RAM) = " + errNativeHosted); System.out.println(); System.out.println(" java op\t= " + nsByJavaOp.unitTimeNano + " ns"); System.out.println(); System.out.println(" opencl (hosted in CL) op\t= " + nsByCLHostedOp.unitTimeNano + " ns"); System.out.println(" times slower than Java = " + (nsByCLHostedOp.unitTimeNano / nsByJavaOp.unitTimeNano)); System.out.println(" times faster than Java = " + (nsByJavaOp.unitTimeNano / nsByCLHostedOp.unitTimeNano)); System.out.println(); System.out.println("opencl (hosted in RAM) op\t= " + nsByNativeHostedCLOp.unitTimeNano + " ns"); System.out.println(" times slower than Java = " + (nsByNativeHostedCLOp.unitTimeNano / nsByJavaOp.unitTimeNano)); System.out.println(" times faster than Java = " + (nsByJavaOp.unitTimeNano / nsByNativeHostedCLOp.unitTimeNano)); } } catch (CLBuildException e) { e.printStackTrace(); } } }