package wcet.dsvmfp.model.smo.kernel;
import util.Dbg;
import com.jopdesign.sys.Const;
import com.jopdesign.sys.Native;
/**
 * Kernel evaluations for the SMO support vector machine trainer, float
 * version (a fixed-point sibling exists for resolution experiments).
 * All state is static: call {@link #setData}, {@link #setKernelType}
 * and (for the Gaussian kernel) {@link #setSigma2} before evaluating
 * kernels, and {@link #setX} before any of the *X variants.
 *
 * @author rup &amp; ms
 */
public class KFloat {
  // Mask used in the fixed-point dot kernels to experiment with reduced
  // multiplier resolution (#sv and test error vs. resolution). Unused in
  // this float version; kept for reference alongside the table below.
  // 0xFF:1111 1111   0x3F:0011 1111   0x0F:0000 1111   0x03:0000 0011
  // 0x00:0000 0000   0xC0:1100 0000   0xF0:1111 0000   0xFC:1111 1100
  // 16:16 0xFFFFFFFF  12:12 0x0FFFFFF0  8:8 0x00FFFF00  4:4 0x000FF000
  final static int mask = 0xffffffff;

  /** Plain dot-product kernel: k(a,b) = a.b */
  public final static int DOTKERNEL = 1;
  /** Gaussian (RBF) kernel: k(a,b) = exp(-|a-b|^2 / (2*sigma^2)) */
  public final static int GAUSSIANKERNEL = 2;

  private static float[][] data; // training data, m rows x n columns
  private static int m; // number of rows (observations)
  private static int n; // number of columns (dimensions)
  private static float[] x; // current test point, length n
  private static float[] kernelCache; // cached diagonal: kernelDot(i,i)
  private static float sigma2; // sigma squared (Gaussian kernel width)
  private static float gaussConst; // -0.5/(sigma*sigma), set by setSigma2
  private static int kernelType; // GAUSSIANKERNEL or DOTKERNEL

  // Scratch variables kept static to avoid too-deep calls (and stack
  // overflows) on the embedded target.
  private static float tmp1;
  private static float tmp2;

  // Gauss k(data[i1],data[i2]). The squared distance |a-b|^2 is expanded
  // as k(a,a) + k(b,b) - 2*k(a,b) using the cached diagonal entries.
  private static float kernelGauss(int i1, int i2) {
    tmp1 = FloatUtil.mul(2.0f, kernelDot(i1, i2));
    tmp2 = FloatUtil.sub((kernelCache[i1] + kernelCache[i2]), tmp1);
    tmp1 = FloatUtil.mul(gaussConst, tmp2);
    // A WCET-analyzable exp() is still missing, so the Gaussian kernel
    // cannot be evaluated yet; the exponent is computed above so only
    // exp(tmp1) remains to be returned once available.
    throw new Error("Fixme exp missing in KFloat");
  }

  // Gauss k(data[i1],x) -- blocked on the same missing exp() as
  // kernelGauss above.
  private static int kernelGaussX(int i1) {
    throw new Error("Fixme exp missing in KFloat");
  }

  /*
   * Original fixed-point code, kept for the WCET measurements quoted in
   * the paper: 4-5us for the MAC unit and 8us for the SW version
   * (SW: 485 cycles, HW: 273 cycles - measured with n=1). BTW: the SW
   * version is invoked more often - does learning take longer with the
   * less resolution multiplication? A point in the paper!
   *
   * // Dot k(data[i1],data[i2]) private static int kernelDot(int i1, int i2) {
   * int r = 0; int t;
   *
   * t = Native.rd(Const.IO_CNT); if (USEMAC) { for (int j = 0; j < n; j++) {
   * Native.wrMem(data[i1][j], Const.IO_MAC_A); Native.wrMem(data[i2][j],
   * Const.IO_MAC_B); } r = Native.rdMem(Const.IO_MAC_A);
   * Native.rdMem(Const.IO_MAC_B); } else { for (int j = 0; j < n; j++) { r =
   * ABC.add(r, ABC.mul(data[i1][j], data[i2][j])); } } t =
   * Native.rd(Const.IO_CNT)-t; System.out.print("time in cycles: ");
   * System.out.println(t); return r; }
   */

  /*
   * Optimizations: Baseline: HW 273 cycles j as local 2: 269 n as local: 244
   * avoid if, r=0: 244 (not part of measurement) single array access: 146 BTW:
   * These numbers are with n=1!!! now n=2: Baseline: HW: 499, SW: 939 HW: same
   * opt. as above: 258 HW: only n, no j: 244
   *
   * SW: same as above: 686 SW: inline add: 486 SW: inline >>8 mul: 286 SW:
   * 'correct' mul: 650
   */

  // Dot k(data[i1],data[i2]). This and an optimized version of
  // kernelDotX should be inlined!
  private static float kernelDot(int i1, int i2) {
    float r;
    // local copies avoid repeated static-field/array dereferences
    // (see the optimization measurements above)
    int n = KFloat.n;
    float a[] = data[i1];
    float b[] = data[i2];
    // SW version
    r = 0;
    while (n != 0) { //@WCA loop=6
      n = n - 1;
      // the mask is for experimenting with #sv and test err vs. resolution
      // (fixed-point version only):
      // r += ((a[n] & mask) >> 8) * ((b[n] & mask) >> 8);
      r += a[n] * b[n];
    }
    return r;
  }

  // Dot k(data[i1],x).
  // Fixed: the HW MAC writes in the loop had been commented out, leaving
  // a loop that did nothing and a result of a[0] instead of the dot
  // product. Now computes a.x with the same SW loop shape as kernelDot.
  public static float kernelDotX(int i1) {
    float r;
    int n = KFloat.n;
    float a[] = data[i1];
    r = 0;
    while (n != 0) { //@WCA loop=6
      n = n - 1;
      r += a[n] * x[n];
    }
    return r;
  }

  // Dot k(x,x).
  // NOTE(review): uses FloatUtil.mul while kernelDot uses the native
  // float multiply - presumably intentional for the SW-float WCET
  // comparison; confirm before unifying.
  private static float kernelDotXX() {
    float r = 0;
    for (int j = 0; j < n; j++) {
      r = r + FloatUtil.mul(x[j], x[j]);
    }
    return r;
  }

  // TRAINING //

  // Kernel of the configured type between training rows i1 and i2.
  // Returns -1 if no valid kernel type has been set.
  public static float kernel(int i1, int i2) {
    if (kernelType == DOTKERNEL)
      return kernelDot(i1, i2);
    if (kernelType == GAUSSIANKERNEL)
      return kernelGauss(i1, i2);
    return -1;
  }

  // Sum over i of k(data[i1],data[i]).
  public static float kernelArray(int i1) {
    float s = 0;
    for (int i = 0; i < m; i++) {
      s = s + kernel(i1, i);
    }
    return s;
  }

  // Kernel of the configured type between training row i1 and the test
  // point x. Returns -1 if no valid kernel type has been set.
  public static float kernelX(int i1) {
    if (kernelType == DOTKERNEL)
      return kernelDotX(i1);
    if (kernelType == GAUSSIANKERNEL)
      return kernelGaussX(i1);
    return -1;
  }

  // Sum over i of k(data[i],x).
  public static float kernelXArray() {
    float s = 0;
    for (int i = 0; i < m; i++) {
      s = s + kernelX(i);
    }
    return s;
  }

  // SETUP //

  // Installs the training data and pre-computes the dot-kernel diagonal
  // kernelDot(i,i) used by the Gaussian kernel. An empty data set is
  // now handled without throwing (n defaults to 0).
  public static void setData(float[][] data) {
    KFloat.data = data;
    m = data.length;
    n = (m == 0) ? 0 : data[0].length;
    kernelCache = new float[m];
    for (int i = 0; i < m; i++) {
      kernelCache[i] = kernelDot(i, i);
    }
  }

  // Installs the test point used by the *X kernel methods.
  public static void setX(float[] x) {
    KFloat.x = x;
  }

  // Selects DOTKERNEL or GAUSSIANKERNEL.
  public static void setKernelType(int kernelType) {
    KFloat.kernelType = kernelType;
  }

  // Sets the Gaussian width and pre-computes gaussConst = -0.5/sigma^2.
  public static void setSigma2(float sigma2) {
    KFloat.sigma2 = sigma2;
    gaussConst = FloatUtil.div(-FloatUtil.HALF, sigma2);
  }
}