package rootbeer.examples.gtc2013;

import org.trifort.rootbeer.runtime.Kernel;
import org.trifort.rootbeer.runtime.RootbeerGpu;
import org.trifort.rootbeer.runtimegpu.GpuException;

import java.util.List;
import java.util.ArrayList;

public class MatrixKernel implements Kernel {

  private float[] m_a;
  private float[] m_b;
  private float[] m_c;
  private int m_blockSize;
  private int m_gridSize;
  private int m_blockIters;

  public CalcList m_calcList;

  public MatrixKernel(float[] a, float[] b, float[] c, int block_size,
      int grid_size, int block_iters){
    m_a = a;
    m_b = b;
    m_c = c;
    m_blockSize = block_size;
    m_gridSize = grid_size;
    m_blockIters = block_iters;
    m_calcList = new CalcList();
  }

  public void gpuMethod(){
    //save the fields into local variables. each read from a field hits global
    //ram, while a local variable most likely lives in a register.
    int block_size = m_blockSize;
    int grid_size = m_gridSize;
    int block_iters = m_blockIters;

    //getBlockIdxx is CUDA blockIdx.x
    //getThreadIdxx is CUDA threadIdx.x
    int block_idxx = RootbeerGpu.getBlockIdxx();
    int thread_idxx = RootbeerGpu.getThreadIdxx();

    //the Rootbeer thread index is one-dimensional right now. convert it to a
    //two-dimensional index within a 32x32 tile.
    int thread_row = thread_idxx / 32;
    int thread_col = thread_idxx % 32;

    //save the arrays into local variables. the array elements still live in
    //global ram, but at least the pointers are now local.
    float[] a = m_a;
    float[] b = m_b;
    float[] c = m_c;

    //number of 32x32 sub-matrices in a block_size x block_size block
    int sub_matrix_size = block_size / 32;
    sub_matrix_size *= sub_matrix_size;

    //number of 32x32 sub-matrices along one edge of the block
    int m_size = block_size / 32;

    for(int block_iter = 0; block_iter < block_iters; ++block_iter){
      for(int sub_matrix = 0; sub_matrix < sub_matrix_size; ++sub_matrix){
        float sum = 0;
        int sub_matrix_row = sub_matrix / m_size;
        int sub_matrix_col = sub_matrix % m_size;
        int dest_row = (32 * sub_matrix_row) + thread_row;
        int dest_col = (32 * sub_matrix_col) + thread_col;
        int dest_index = (block_iter * block_size * block_size * grid_size) +
            (block_idxx * block_size * block_size) +
            (dest_row * block_size) + dest_col;
        for(int m = 0; m < m_size; ++m){
          int a_src_row = (sub_matrix_row * 32) + thread_row;
          int a_src_col = (m * 32) + thread_col;
          int a_src = (a_src_row * block_size) + a_src_col;

          int b_src_row = (m * 32) + thread_row;
          int b_src_col = (sub_matrix_col * 32) + thread_col;
          int b_src = (b_src_row * block_size) + b_src_col;

          float a_value = a[a_src];
          float b_value = b[b_src];

          //store a_value into shared memory at location shared_a[threadIdx.x].
          //each thread loads a single value from global ram into shared ram,
          //and later in the loop every thread reads all of the values placed
          //in shared ram. fetches from global ram take 200-300 clock cycles,
          //while fetches from shared ram take 2-3 clock cycles. if each thread
          //fetches a single value from global memory and all of the values are
          //staged in shared memory, most of the reads take 2-3 clock cycles
          //rather than 200-300.
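          //shared-memory layout used below (inferred from the byte offsets,
          //assuming a launch with 32 * 32 = 1024 threads per block):
          //  bytes 0..4095    hold shared_a, 1024 floats, one per thread
          //  bytes 4096..8191 hold shared_b, 1024 floats, one per thread
          //setSharedFloat/getSharedFloat take byte offsets, hence the * 4.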
          RootbeerGpu.setSharedFloat(thread_idxx * 4, a_value);

          //store b_value into shared memory at location shared_b[threadIdx.x]
          RootbeerGpu.setSharedFloat((1024 + thread_idxx) * 4, b_value);

          //sync the threads within the block
          RootbeerGpu.syncthreads();

          //loop over all of shared_a[] and shared_b[]
          for(int k = 0; k < 32; ++k){
            //read a_value from shared_a[thread_row][k]
            a_value = RootbeerGpu.getSharedFloat((thread_row * 32 + k) * 4);

            //read b_value from shared_b[k][thread_col]
            b_value = RootbeerGpu.getSharedFloat((1024 + k * 32 + thread_col) * 4);

            //multiply a_value and b_value and accumulate
            sum += a_value * b_value;
          }

          //sync the threads within the block
          RootbeerGpu.syncthreads();
        }

        //increment c[dest_index] by the sum
        c[dest_index] += sum;
      }
    }
  }
}
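/*
 * Host-side launch sketch (not part of the original example; a minimal
 * illustration only). It assumes the simple list-of-kernels interface from
 * the Rootbeer README, where each Kernel instance maps to one GPU thread;
 * the exact host API varies across Rootbeer versions. It also assumes
 * MatrixApp-style sizing: a and b each hold block_size * block_size floats,
 * and c holds block_iters * grid_size * block_size * block_size floats,
 * matching the dest_index computation in gpuMethod above.
 *
 *   //one Kernel instance per GPU thread: grid_size blocks of 32*32 threads
 *   List<Kernel> jobs = new ArrayList<Kernel>();
 *   for(int i = 0; i < grid_size * 32 * 32; ++i){
 *     jobs.add(new MatrixKernel(a, b, c, block_size, grid_size, block_iters));
 *   }
 *   Rootbeer rootbeer = new Rootbeer();
 *   rootbeer.run(jobs);
 */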