package wcet.dsvmfp.model.smo.kernel; import util.Dbg; import com.jopdesign.sys.Const; import com.jopdesign.sys.Native; /** * @author rup & ms */ public class KFP { // mask to test a resolution in the dot kernel functions // 0xFF:1111 1111 // 0x3F:0011 1111 // 0x0F:0000 1111 // 0x03:0000 0011 // 0x00:0000 0000 // 0xC0:1100 0000 // 0xF0:1111 0000 // 0xFC:1111 1100 // 0xFF:1111 1111 // 16:16 0xFFFFFFFF // 12:12 0x0FFFFFF0 // 8:8 0x00FFFF00 // 4:4 0x000FF000 final static int mask = 0xffffffff; public final static int DOTKERNEL = 1; public final static int GAUSSIANKERNEL = 2; // All decimal numbers are FP private static int[][] data;// data private static int m; // number of rows (observations) private static int n; // number of columns (dimensions) private static int[] x; // FP test data private static int[] kernelCache; // kernelDot(i,i) private static int sigma2; // sigma squared private static int gaussConst;// -0.5/(sigma*sigma) private static int kernelType;// GAUSSIANKERNEL or DOTKERNEL; // A few variables to avoid too deep calls (and stack overflows) private static int tmp1; private static int tmp2; // Gauss k(data[i1],data[i2]) private static int kernelGauss(int i1, int i2) { // Dbg.wr("K-SP: ", Native.getSP()); tmp1 = FP.mul(FP.TWO, kernelDot(i1, i2)); // Dbg.wr("K-SP1: ", Native.getSP()); tmp2 = FP.sub(FP.add(kernelCache[i1], kernelCache[i2]), tmp1); // Dbg.wr("K-SP2: ", Native.getSP()); tmp1 = FP.mul(gaussConst, tmp2); // Dbg.wr("K-SP3: ", Native.getSP()); // Dbg.wr("tmp1: ", tmp1); tmp1 = FP.exp(tmp1); // Dbg.wr("K-SP4: ", Native.getSP()); // Dbg.wr("tmp1: ", tmp1); return tmp1; } // Gauss k(data[i1],x) private static int kernelGaussX(int i1) { return FP.exp(FP.mul(gaussConst, FP.sub(FP.add(kernelCache[i1], kernelDotXX()), FP.mul(FP.TWO, kernelDotX(i1))))); } /* * this is the original code and takes 4-5us for MAC unit and 8us for the SW * version. SW: 485 cycles HW: 273 cycles BUT: this was measured with n=1 BTW: * The SW version is invoked more often - does learning take longer with the * less resolution multiplication. A point in the paper! * // Dot k(data[i1],data[i2]) private static int kernelDot(int i1, int i2) { * int r = 0; int t; * * t = Native.rd(Const.IO_CNT); if (USEMAC) { for (int j = 0; j < n; j++) { * Native.wrMem(data[i1][j], Const.IO_MAC_A); Native.wrMem(data[i2][j], * Const.IO_MAC_B); } r = Native.rdMem(Const.IO_MAC_A); * Native.rdMem(Const.IO_MAC_B); } else { for (int j = 0; j < n; j++) { r = * FP.add(r, FP.mul(data[i1][j], data[i2][j])); } } t = * Native.rd(Const.IO_CNT)-t; System.out.print("time in cycles: "); * System.out.println(t); return r; } */ /* * Optimizations: Baseline: HW 273 cycles j as local 2: 269 n as local: 244 * avoid if, r=0: 244 (not part of measurement) single array access: 146 BTW: * These numbers are with n=1!!! now n=2: Baseline: HW: 499, SW: 939 HW: same * opt. as above: 258 HW: only n, no j: 244 * * SW: same as above: 686 SW: inline add: 486 SW: inline >>8 mul: 286 SW: * 'correct' mul: 650 */ // this and an optimized version of kernelDotX should be inlined! private static int kernelDot(int i1, int i2) { int r; int n = KFP.n; // int t; //t = Native.rd(Const.IO_CNT); int a[] = data[i1]; int b[] = data[i2]; // t = Native.rd(Const.IO_CNT); // long lr = 0; // HW version // while (n != 0) { // n = n - 1; // // lr += ((long) a[n]) * ((long) b[n]); // Native.wrMem(a[n], Const.IO_MAC_A); // Native.wrMem(b[n], Const.IO_MAC_B); // } // r = a[0]; // we need this time for the MAC to finish! // r = Native.rdMem(Const.IO_MAC_A); // check the MAC unit // if (r != ((int) (lr>>16))) { // System.out.print("result="); // System.out.println(r); // System.out.print("long result="); // System.out.println((int) (lr>>16)); // } // SW version r = 0; while (n != 0) { n = n - 1; // the mask is for experimenting with #sv and test err vs. resolution //r += ((a[n] & mask) >> 8) * ((b[n] & mask) >> 8); r += (a[n] >> 8) * (b[n] >> 8); } // SW 2 // // int f1 = a[n]; int f2 = b[n]; // // r += ((f1 >> 16) * (f2 >> 16)) << 16; //AH*BH // // r += ((f1 >> 16) * (f2 & 0x0000FFFF)) ; //AH*BL // // r += ((f1 & 0x0000FFFF) * (f2 >> 16)) ; //AL*BH // // r += ((f1 & 0x0000FFFF) * (f2 & 0x0000FFFF)) >> 16; //AL*BL // // } // t = Native.rd(Const.IO_CNT) - t; // System.out.print("time in cycles: "); // System.out.println(t); return r; } // Dot k(data[i1],x) public static int kernelDotX(int i1) { int r; int n = KFP.n; // int t = Native.rd(Const.IO_CNT); int a[] = data[i1]; // HW version while(n!=0){ n=n-1; Native.wrMem(a[n], Const.IO_MAC_A); Native.wrMem(x[n], Const.IO_MAC_B); } r = a[0]; // we need this time for the MAC to finish! r = Native.rdMem(Const.IO_MAC_A); // SW version // r = 0; // while (n != 0) { // n = n - 1; // //r += ((a[n] & mask) >> 8) * ((x[n] & mask) >> 8); // r += (a[n] >> 8) * (x[n] >> 8); // } // t = Native.rd(Const.IO_CNT) - t; // System.out.print("dotX time in cycles: "); // System.out.println(t); // System.exit(-1); return r; } // Dot k(x,x) private static int kernelDotXX() { int r = 0; for (int j = 0; j < n; j++) { r = FP.add(r, FP.mul(x[j], x[j])); } return r; } // TRAINING // // kernelType: k(data[i1],data[i1] public static int kernel(int i1, int i2) { if (kernelType == DOTKERNEL) return kernelDot(i1, i2); if (kernelType == GAUSSIANKERNEL) return kernelGauss(i1, i2); return -1; } // Sum over i k(data[i1],data[i]) public static int kernelArray(int i1) { int s = 0; for (int i = 0; i < m; i++) { s = FP.add(s, kernel(i1, i)); } return s; } // TESTING // // kernelType k(data[i1],x) public static int kernelX(int i1) { if (kernelType == DOTKERNEL) return kernelDotX(i1); if (kernelType == GAUSSIANKERNEL) return kernelGaussX(i1); return -1; } // Sum over i kernelType k(data[i],x) public static int kernelXArray() { int s = 0; for (int i = 0; i < m; i++) { s = FP.add(s, kernelX(i)); } return s; } // SETUP // public static void setData(int[][] data) { KFP.data = data; m = data.length; n = data[0].length; kernelCache = new int[m]; for (int i = 0; i < m; i++) { kernelCache[i] = kernelDot(i, i); } } public static void setX(int[] x) { KFP.x = x; } public static void setKernelType(int kernelType) { KFP.kernelType = kernelType; } public static void setSigma2(int sigma2) { KFP.sigma2 = sigma2; gaussConst = FP.div(-FP.HALF, sigma2); } }