// MatrixApp.java example
//
// Source tree (Explorer): rootbeer1-master
package rootbeer.examples.gtc2013;

import org.trifort.rootbeer.runtime.util.Stopwatch;
import org.trifort.rootbeer.runtime.Rootbeer;
import org.trifort.rootbeer.runtime.Kernel;
import org.trifort.rootbeer.runtime.StatsRow;
import org.trifort.rootbeer.runtime.ThreadConfig;
import java.util.List;
import java.util.ArrayList;

//see: http://www.shodor.org/media/content//petascale/materials/UPModules/matrixMultiplication/moduleDocument.pdf
/**
 * Driver that runs the same computation once on CPU worker threads
 * ({@code MatrixCpuThread}) and once on the GPU ({@code MatrixKernel} launched
 * through Rootbeer), prints the time reported by each stopwatch, and then
 * checks that both output arrays are element-for-element equal.
 *
 * <p>Memory note: with the sizes set in {@link #init()} each of the six large
 * arrays holds 256*256*14*256 = 234,881,024 floats (roughly 0.9 GB apiece),
 * and they are reallocated on every pass of {@link #run()}.
 */
public class MatrixApp {

  /** First input, {@code m_blockSize * m_blockSize} elements; given to both runs. */
  private float[] m_a;
  /** Second input in the layout handed to {@code MatrixKernel}. */
  private float[] m_bgpu;
  /** Transposed copy of {@link #m_bgpu}; handed to {@code MatrixCpuThread}. */
  private float[] m_bcpu;
  /** Same contents as {@link #m_bgpu}; only referenced by the disabled CPU pass. */
  private float[] m_bcpu2;
  /** Output array passed to {@code MatrixKernel}. */
  private float[] m_cgpu;
  /** Output array passed to {@code MatrixCpuThread}. */
  private float[] m_ccpu;
  /** Output array of the disabled non-transposed CPU pass; never written while it is disabled. */
  private float[] m_ccpu2;
  private int m_blockSize;
  private int m_gridSize;
  private int m_blockIters;
  private Stopwatch m_cpuWatch;
  private Stopwatch m_gpuWatch;
  private Stopwatch m_transposeWatch;

  /** Creates the three stopwatches; the arrays are allocated later by {@link #init()}. */
  public MatrixApp(){
    this.m_cpuWatch = new Stopwatch();
    this.m_gpuWatch = new Stopwatch();
    this.m_transposeWatch = new Stopwatch();
  }

  /**
   * Sets the problem dimensions, allocates every array, fills the inputs with
   * the repeating pattern 0, 1, 2 (index modulo 3) and builds the transposed
   * copy of the second input, printing the time the transpose stopwatch reports.
   */
  public void init(){
    m_blockIters = 256;
    m_blockSize = 256;
    m_gridSize = 14;

    final int smallLength = m_blockSize*m_blockSize;
    final int largeLength = smallLength*m_gridSize*m_blockIters;

    m_a = new float[smallLength];
    m_bcpu = new float[largeLength];
    m_bcpu2 = new float[largeLength];
    m_bgpu = new float[largeLength];
    m_ccpu = new float[largeLength];
    m_ccpu2 = new float[largeLength];
    m_cgpu = new float[largeLength];

    for(int idx = 0; idx < m_a.length; ++idx){
      m_a[idx] = idx % 3;
    }

    for(int idx = 0; idx < m_bgpu.length; ++idx){
      final float pattern = idx % 3;
      m_bgpu[idx] = pattern;
      m_bcpu2[idx] = pattern;
    }

    m_transposeWatch.start();
    // m_bgpu is read as m_blockSize rows of rowLength elements each; element
    // (row, col) is written to position col * m_blockSize + row of m_bcpu.
    final int rowLength = m_blockSize*m_gridSize*m_blockIters;
    int src = 0;
    for(int row = 0; row < m_blockSize; ++row){
      for(int col = 0; col < rowLength; ++col){
        m_bcpu[col * m_blockSize + row] = m_bgpu[src];
        ++src;
      }
    }
    m_transposeWatch.stop();

    System.out.println("transpose time: "+m_transposeWatch.getAverageTime()+" ms");
  }

  /**
   * Debug helper: prints {@code heading}, then every element followed by a
   * space, breaking the line after each {@code block_size} elements.
   *
   * @param matrix     values to print
   * @param block_size number of values per printed line
   * @param heading    line printed before the values
   */
  private void printMatrix(int[] matrix, int block_size, String heading){
    System.out.println(heading);
    int printedOnLine = 0;
    for(int value : matrix){
      System.out.print(value+" ");
      if(++printedOnLine == block_size){
        printedOnLine = 0;
        System.out.println();
      }
    }
  }

  /**
   * Debug helper: prints the {@code block_size} consecutive elements starting
   * at {@code row * block_size} on one line, with no separator between them.
   *
   * @param matrix     values to read from
   * @param block_size number of elements per row
   * @param row        zero-based row to print
   */
  private void printRow(int[] matrix, int block_size, int row){
    System.out.println("row: "+row);
    final int base = row * block_size;
    int offset = 0;
    while(offset < block_size){
      System.out.print(matrix[base + offset]);
      ++offset;
    }
    System.out.println();
  }

  /**
   * Debug helper: prints {@code block_size} elements taken at a stride of
   * {@code block_size} starting from index {@code col}, on one line with no
   * separator between them.
   *
   * @param matrix     values to read from
   * @param block_size stride between elements, and the number of elements printed
   * @param col        zero-based column to print
   */
  private void printCol(int[] matrix, int block_size, int col){
    System.out.println("col: "+col);
    for(int step = 0, idx = col; step < block_size; ++step, idx += block_size){
      System.out.print(matrix[idx]);
    }
    System.out.println();
  }

  /**
   * Creates one {@code MatrixCpuThread} per available processor over
   * {@code m_a}, the transposed {@code m_bcpu} and the output {@code m_ccpu},
   * joins them in creation order and prints the CPU stopwatch reading.
   */
  private void cpuRun(){
    final int coreCount = Runtime.getRuntime().availableProcessors();
    m_cpuWatch.start();

    final int gridTimesIters = m_gridSize*m_blockIters;
    List<MatrixCpuThread> workers = new ArrayList<MatrixCpuThread>();
    for(int id = 0; id < coreCount; ++id){
      workers.add(new MatrixCpuThread(m_a, m_bcpu, m_ccpu, id,
        m_blockSize, gridTimesIters, coreCount, true));
    }
    for(MatrixCpuThread worker : workers){
      worker.join();
    }

    m_cpuWatch.stop();
    System.out.println("avg cpu time: "+m_cpuWatch.getAverageTime()+" ms");

    // Disabled: second CPU pass over the non-transposed input (m_bcpu2 ->
    // m_ccpu2). verifyCpuTranspose() is only meaningful when this is enabled.
    //threads = new ArrayList<MatrixCpuThread>();
    //for(int i = 0; i < num_cores; ++i){
    //  MatrixCpuThread thread = new MatrixCpuThread(m_a, m_bcpu2, m_ccpu2, i,
    //    m_blockSize, m_gridSize*m_blockIters, num_cores, false);
    //  threads.add(thread);
    //}
    //for(int i = 0; i < num_cores; ++i){
    //  MatrixCpuThread thread = threads.get(i);
    //  thread.join();
    //}
  }

  /**
   * Builds a {@code MatrixKernel} over {@code m_a}, {@code m_bgpu} and
   * {@code m_cgpu}, runs it through a fresh {@code Rootbeer} instance, prints
   * the GPU stopwatch reading, and then prints every non-null entry of the
   * kernel's calculation list.
   */
  private void gpuRun(){
    m_gpuWatch.start();

    MatrixKernel kernel = new MatrixKernel(m_a, m_bgpu, m_cgpu, m_blockSize,
      m_gridSize, m_blockIters);
    Rootbeer gpu = new Rootbeer();
    // NOTE(review): 1024 is presumably the threads-per-block count and the
    // third argument the total thread count -- confirm against ThreadConfig.
    final int threads = 1024;
    ThreadConfig launch = new ThreadConfig(threads, m_gridSize, threads * m_gridSize);
    gpu.run(kernel, launch);

    m_gpuWatch.stop();
    System.out.println("avg gpu time: "+m_gpuWatch.getAverageTime()+" ms");

    List<Calculation> calculations = kernel.m_calcList.getList();
    for(Calculation entry : calculations){
      if(entry != null){
        System.out.println(entry.toString());
      }
    }

    // Disabled: per-launch statistics dump.
    //List<StatsRow> stats = rootbeer.getStats();
    //for(StatsRow row : stats){
    //  System.out.println("  StatsRow:");
    //  System.out.println("    init time: "+row.getInitTime());
    //  System.out.println("    serial time: "+row.getSerializationTime());
    //  System.out.println("    exec time: "+row.getExecutionTime());
    //  System.out.println("    deserial time: "+row.getDeserializationTime());
    //  System.out.println("    num blocks: "+row.getNumBlocks());
    //  System.out.println("    num threads: "+row.getNumThreads());
    //}
  }

  /**
   * Compares {@code m_ccpu} with {@code m_ccpu2} element by element; on the
   * first difference prints it and exits the JVM with status 1.
   */
  private void verifyCpuTranspose(){
    compareOrExit(m_ccpu, m_ccpu2, "  cpu_value2: ");
  }

  /**
   * Compares {@code m_ccpu} with {@code m_cgpu} element by element; on the
   * first difference prints it and exits the JVM with status 1.
   */
  private void verify(){
    compareOrExit(m_ccpu, m_cgpu, "  gpu_value: ");
  }

  /**
   * Shared comparison loop for the two verify methods. Walks every index of
   * {@code reference}, comparing with exact float (in)equality. The first
   * mismatch is reported and terminates the process via {@code System.exit(1)};
   * otherwise "Verify PASSED!" is printed.
   *
   * @param reference   CPU result; its length bounds the loop
   * @param candidate   array compared against {@code reference}
   * @param otherPrefix text printed in front of the mismatching candidate value
   */
  private static void compareOrExit(float[] reference, float[] candidate, String otherPrefix){
    for(int idx = 0; idx < reference.length; ++idx){
      final float expected = reference[idx];
      final float actual = candidate[idx];
      if(expected == actual){
        continue;
      }
      System.out.println("Verify Failed.");
      System.out.println("  cpu_value: "+expected);
      System.out.println(otherPrefix+actual);
      System.out.println("  index: "+idx);
      System.exit(1);
      return;
    }
    System.out.println("Verify PASSED!");
  }

  /**
   * Runs 50 passes; each pass re-initialises the data, runs the CPU version,
   * runs the GPU version and verifies that their outputs match.
   */
  public void run(){
    int remaining = 50;
    while(remaining > 0){
      init();
      cpuRun();
      //verifyCpuTranspose();
      gpuRun();
      verify();
      --remaining;
    }
  }

  /** Entry point: creates the app and runs it; {@code args} is ignored. */
  public static void main(String[] args){
    new MatrixApp().run();
  }
}