/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.nativelibs4java.opencl;
import com.nativelibs4java.opencl.library.*;
import java.util.Map;
import static org.junit.Assert.*;
import org.junit.*;
import org.bridj.Pointer;
import org.bridj.Platform;
import static org.bridj.Pointer.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.nio.ByteBuffer;
import java.util.List;
import org.junit.runners.Parameterized;
/**
*
* @author Kazo Csaba
*/
@Ignore
@SuppressWarnings("unchecked")
public class OverheadTest extends AbstractCommon {
public OverheadTest(CLDevice device) {
super(device);
}
@Parameterized.Parameters
public static List<Object[]> getDeviceParameters() {
return AbstractCommon.getDeviceParameters();
}
static void gc() {
try {
System.gc();
Thread.sleep(100);
System.gc();
Thread.sleep(100);
} catch (InterruptedException ex) {}
}
static long time(String title, int n, Runnable payload, Runnable finalizer) {
gc();
long start = System.nanoTime();
for (int i = 0; i < n; i++) {
payload.run();
}
if (finalizer != null)
finalizer.run();
long timeMillis = (System.nanoTime() - start) / 1000000;
if (title != null)
System.out.println("Time[ " + title + " ; n = " + n + "] = " + timeMillis + " ms");
return timeMillis;
}
@Test
public void compareVariousSetArgsPerformance() throws CLBuildException {
if (!context.getCacheBinaries()) {
System.out.println("Skipping binaries caching test");
return;
}
CLProgram program = context.createProgram(
"__kernel void copy(__global int* a, __global int* b, short s, int i, char c, float f) {\n" +
" int idx = get_global_id(0);\n" +
" b[idx] = a[idx];\n" +
"} ");
program.build();
final CLKernel kernel = program.createKernel("copy");
final CLBuffer<Integer> a=context.createBuffer(CLMem.Usage.Input, Integer.class, 4);
final CLBuffer<Integer> b=context.createBuffer(CLMem.Usage.Output, Integer.class, 4);
int nArgs = kernel.getNumArgs();
Runnable setWithSetArgs = new Runnable() { public void run() {
kernel.setArgs(a, b, (short)1, 1, (byte)1, 1.0f);
}};
Runnable setWithSpecializedSetArg = new Runnable() { public void run() {
kernel.setArg(0, a);
kernel.setArg(1, b);
kernel.setArg(2, (short)1);
kernel.setArg(3, 1);
kernel.setArg(4, (byte)1);
kernel.setArg(5, 1.0f);
}};
/*
Runnable setWithCLAPI = new Runnable() {
private final OpenCLLibrary CL = new OpenCLLibrary();
private final Pointer<?> tmp = allocateBytes(8);
public void run() {
CL.clSetKernelArg(kernel.getEntity(), 0, Pointer.SIZE, a.getEntity());
CL.clSetKernelArg(kernel.getEntity(), 1, Pointer.SIZE, b.getEntity());
CL.clSetKernelArg(kernel.getEntity(), 2, 2L, tmp.setShort((short)1));
CL.clSetKernelArg(kernel.getEntity(), 3, 4L, tmp.setInt(1));
CL.clSetKernelArg(kernel.getEntity(), 4, 1L, tmp.setByte((byte)1));
CL.clSetKernelArg(kernel.getEntity(), 5, 4L, tmp.setFloat(1));
}
};
*/
Runnable setWithRawCLAPI = new Runnable() {
private final long aPeer = a.getEntity();
private final long bPeer = b.getEntity();
private final long kEntity = kernel.getEntity();
private final Pointer<?> tmp = allocateBytes(8);//.withoutValidityInformation();
private final ByteBuffer tmpBuf = tmp.getByteBuffer();
private final long tPeer = getPeer(tmp);
private final long pointerSize = Pointer.SIZE;
private final OpenCLLibrary CL = new OpenCLLibrary();
public void run() {
CL.clSetKernelArg(kEntity, 0, pointerSize, aPeer);
CL.clSetKernelArg(kEntity, 1, pointerSize, bPeer);
tmpBuf.putShort(0, (short)1);
CL.clSetKernelArg(kEntity, 2, 2L, tPeer);
tmpBuf.putInt(0, 1);
CL.clSetKernelArg(kEntity, 3, 4L, tPeer);
tmpBuf.put(0, (byte)1);
CL.clSetKernelArg(kEntity, 4, 1L, tPeer);
tmpBuf.putFloat(0, 1);
CL.clSetKernelArg(kEntity, 5, 4L, tPeer);
}
};
int nWarmup = 8000, nTest = 50000;
//time(null, nWarmup, setWithCLAPI, null);
time(null, nWarmup, setWithSetArgs, null);
time(null, nWarmup, setWithSpecializedSetArg, null);
time(null, nWarmup, setWithRawCLAPI, null);
int nSamples = 10;
double totSetArgs = 0, totCLSetKernelArg = 0, totSetArg = 0, totCLSetKernelArgRaw = 0;
for (int i = 0; i < nSamples; i++) {
//totCLSetKernelArg += time("clSetKernelArg pointers", nTest, setWithCLAPI, null);
totSetArgs += time("CLKernel.setArgs", nTest, setWithSetArgs, null);
totSetArg += time("CLKernel.setArg", nTest, setWithSpecializedSetArg, null);
totCLSetKernelArgRaw += time("clSetKernelArg raw", nTest, setWithRawCLAPI, null);
System.out.println();
}
final double maxSlower = 1.4;
double slowerSetArg = totSetArg / totCLSetKernelArgRaw;
double slowerSetArgs = totSetArgs / totCLSetKernelArgRaw;
System.out.println("CLKernel.setArg is " + slowerSetArg + "x slower than hand-optimized.");
System.out.println("CLKernel.setArgs is " + slowerSetArgs + "x slower than hand-optimized.");
final double maxMilliSecondsPerCall = Platform.is64Bits() ? 0.0025 : 0.007;
double setArgAvg = (totSetArg / (double)nTest) / nArgs;
System.out.println("CLKernel.setArg took " + setArgAvg + " ms per call in average.");
assertTrue("CLKernel.setArg was supposed to last at most " + maxMilliSecondsPerCall + " ms in average, but was " + setArgAvg + " ms", setArgAvg < maxMilliSecondsPerCall);
assertTrue("CLKernel.setArg was supposed not to be more than " + maxSlower + "x slower than hand-optimized version, was " + slowerSetArg + "x slower.", slowerSetArg <= maxSlower);
assertTrue("CLKernel.setArgs was supposed not to be more than " + maxSlower + "x slower than hand-optimized version, was " + slowerSetArgs + "x slower.", slowerSetArgs <= maxSlower);
}
}