package wcet.dsvmfp.model.smo.kernel;

import util.Dbg;

import com.jopdesign.sys.Const;
import com.jopdesign.sys.Native;

/**
 * @author rup & ms
 */
public class KFloat {

  // mask to test a resolution in the dot kernel functions
  // 0xFF:1111 1111
  // 0x3F:0011 1111
  // 0x0F:0000 1111
  // 0x03:0000 0011
  // 0x00:0000 0000
  // 0xC0:1100 0000
  // 0xF0:1111 0000
  // 0xFC:1111 1100
  // 0xFF:1111 1111
  // 16:16 0xFFFFFFFF
  // 12:12 0x0FFFFFF0
  // 8:8   0x00FFFF00
  // 4:4   0x000FF000
  final static int mask = 0xffffffff;

  public final static int DOTKERNEL = 1;
  public final static int GAUSSIANKERNEL = 2;

  // All decimal numbers are FP
  private static float[][] data; // training data
  private static int m; // number of rows (observations)
  private static int n; // number of columns (dimensions)
  private static float[] x; // FP test data
  private static float[] kernelCache; // kernelDot(i,i)
  private static float sigma2; // sigma squared
  private static float gaussConst; // -0.5/(sigma*sigma)
  private static int kernelType; // GAUSSIANKERNEL or DOTKERNEL

  // A few variables to avoid too deep calls (and stack overflows)
  private static float tmp1;
  private static float tmp2;

  // Gauss k(data[i1],data[i2]) = exp(-||a-b||^2/(2*sigma^2)), computed via
  // ||a-b||^2 = k(a,a) + k(b,b) - 2*k(a,b) using the cached self products
  private static float kernelGauss(int i1, int i2) {
    tmp1 = FloatUtil.mul(2.0f, kernelDot(i1, i2));
    tmp2 = FloatUtil.sub(kernelCache[i1] + kernelCache[i2], tmp1);
    tmp1 = FloatUtil.mul(gaussConst, tmp2);
    throw new Error("Fixme exp missing in KFloat");
  }

  // Gauss k(data[i1],x); also blocked on the missing exp
  private static float kernelGaussX(int i1) {
    throw new Error("Fixme exp missing in KFloat");
  }
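  // The two Gaussian kernels above stop short because no exp() is available
  // on this target. The following is only a sketch of the missing piece, not
  // part of the original code: a truncated Taylor series, assuming |v| stays
  // small (i.e. sigma2 is large relative to the squared distances).
  // kernelGauss could then end with "return expTaylor(tmp1);" instead of the
  // throw; expTaylor is a hypothetical name.
  private static float expTaylor(float v) {
    float term = 1.0f; // v^k / k!
    float sum = 1.0f; // partial sum of 1 + v + v^2/2! + ...
    for (int k = 1; k <= 8; k++) { //@WCA loop=8
      term = FloatUtil.mul(term, FloatUtil.div(v, (float) k));
      sum = sum + term;
    }
    return sum;
  }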
  /*
   * This is the original code and takes 4-5 us for the MAC unit and 8 us for
   * the SW version. SW: 485 cycles, HW: 273 cycles. BUT: this was measured
   * with n=1. BTW: the SW version is invoked more often - does learning take
   * longer with the lower-resolution multiplication? A point for the paper!
   *
   * // Dot k(data[i1],data[i2])
   * private static int kernelDot(int i1, int i2) {
   *   int r = 0;
   *   int t;
   *
   *   t = Native.rd(Const.IO_CNT);
   *   if (USEMAC) {
   *     for (int j = 0; j < n; j++) {
   *       Native.wrMem(data[i1][j], Const.IO_MAC_A);
   *       Native.wrMem(data[i2][j], Const.IO_MAC_B);
   *     }
   *     r = Native.rdMem(Const.IO_MAC_A);
   *     Native.rdMem(Const.IO_MAC_B);
   *   } else {
   *     for (int j = 0; j < n; j++) {
   *       r = ABC.add(r, ABC.mul(data[i1][j], data[i2][j]));
   *     }
   *   }
   *   t = Native.rd(Const.IO_CNT) - t;
   *   System.out.print("time in cycles: ");
   *   System.out.println(t);
   *   return r;
   * }
   */

  /*
   * Optimizations (baseline: HW 273 cycles):
   *   j as local 2:        269
   *   n as local:          244
   *   avoid if, r=0:       244 (not part of measurement)
   *   single array access: 146
   * BTW: these numbers are with n=1! Now n=2:
   *   Baseline:               HW: 499, SW: 939
   *   HW: same opt. as above: 258
   *   HW: only n, no j:       244
   *
   *   SW: same as above:  686
   *   SW: inline add:     486
   *   SW: inline >>8 mul: 286
   *   SW: 'correct' mul:  650
   */

  // this and an optimized version of kernelDotX should be inlined!
  // Dot k(data[i1],data[i2])
  private static float kernelDot(int i1, int i2) {
    float r;
    int n = KFloat.n;
    // int t;
    // t = Native.rd(Const.IO_CNT);
    float a[] = data[i1];
    float b[] = data[i2];
    // t = Native.rd(Const.IO_CNT);
    // SW version
    r = 0;
    while (n != 0) { //@WCA loop=6
      n = n - 1;
      // the mask is for experimenting with #sv and test err vs. resolution
      //r += ((a[n] & mask) >> 8) * ((b[n] & mask) >> 8);
      r += a[n] * b[n];
    }
    return r;
  }

  // Dot k(data[i1],x)
  public static float kernelDotX(int i1) {
    float r;
    int n = KFloat.n;
    // int t = Native.rd(Const.IO_CNT);
    float a[] = data[i1];
    // HW version; the MAC writes are disabled in this float version,
    // so the loop only burns the cycles the MAC would need
    while (n != 0) {
      n = n - 1;
      //Native.wrMem(a[n], Const.IO_MAC_A);
      //Native.wrMem(x[n], Const.IO_MAC_B);
    }
    r = a[0]; // we need this time for the MAC to finish!
    //r = Native.rdMem(Const.IO_MAC_A);
    // SW version
    // r = 0;
    // while (n != 0) {
    //   n = n - 1;
    //   //r += ((a[n] & mask) >> 8) * ((x[n] & mask) >> 8);
    //   r += (a[n] >> 8) * (x[n] >> 8);
    // }
    // t = Native.rd(Const.IO_CNT) - t;
    // System.out.print("dotX time in cycles: ");
    // System.out.println(t);
    // System.exit(-1);
    return r;
  }

  // Dot k(x,x)
  private static float kernelDotXX() {
    float r = 0;
    for (int j = 0; j < n; j++) {
      r = r + FloatUtil.mul(x[j], x[j]);
    }
    return r;
  }

  // TRAINING //

  // kernelType k(data[i1],data[i2])
  public static float kernel(int i1, int i2) {
    if (kernelType == DOTKERNEL)
      return kernelDot(i1, i2);
    if (kernelType == GAUSSIANKERNEL)
      return kernelGauss(i1, i2);
    return -1;
  }

  // Sum over i of k(data[i1],data[i])
  public static float kernelArray(int i1) {
    float s = 0;
    for (int i = 0; i < m; i++) {
      s = s + kernel(i1, i);
    }
    return s;
  }

  // kernelType k(data[i1],x)
  public static float kernelX(int i1) {
    if (kernelType == DOTKERNEL)
      return kernelDotX(i1);
    if (kernelType == GAUSSIANKERNEL)
      return kernelGaussX(i1);
    return -1;
  }

  // Sum over i of kernelType k(data[i],x)
  public static float kernelXArray() {
    float s = 0;
    for (int i = 0; i < m; i++) {
      s = s + kernelX(i);
    }
    return s;
  }

  // SETUP //

  public static void setData(float[][] data) {
    KFloat.data = data;
    m = data.length;
    n = data[0].length;
    kernelCache = new float[m];
    for (int i = 0; i < m; i++) {
      kernelCache[i] = kernelDot(i, i);
    }
  }

  public static void setX(float[] x) {
    KFloat.x = x;
  }

  public static void setKernelType(int kernelType) {
    KFloat.kernelType = kernelType;
  }

  public static void setSigma2(float sigma2) {
    KFloat.sigma2 = sigma2;
    gaussConst = FloatUtil.div(-FloatUtil.HALF, sigma2);
  }
}
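// Usage sketch, not part of the original sources: how a caller might wire up
// KFloat for the dot-product kernel. The training matrix and the expected
// values in the comments are made-up illustration data, and KFloatExample is
// a hypothetical name.
class KFloatExample {

  public static void main(String[] args) {
    float[][] train = { { 1.0f, 2.0f }, { 3.0f, 4.0f } }; // m=2 rows, n=2 dims
    KFloat.setKernelType(KFloat.DOTKERNEL);
    KFloat.setData(train); // also fills kernelCache with kernelDot(i,i)
    System.out.println(KFloat.kernel(0, 1)); // 1*3 + 2*4 = 11.0
    System.out.println(KFloat.kernelArray(0)); // k(0,0) + k(0,1) = 5 + 11 = 16.0
  }
}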