package rootbeer.examples.gtc2013;

import org.trifort.rootbeer.runtime.Kernel;
import org.trifort.rootbeer.runtime.RootbeerGpu;
import org.trifort.rootbeer.runtimegpu.GpuException;

import java.util.List;
import java.util.ArrayList;

public class MatrixKernel implements Kernel {

  private float[] m_a;
  private float[] m_b;
  private float[] m_c;
  private int m_blockSize;
  private int m_gridSize;
  private int m_blockIters;

  public CalcList m_calcList;

  public MatrixKernel(float[] a, float[] b, float[] c, int block_size,
      int grid_size, int block_iters){
    m_a = a;
    m_b = b;
    m_c = c;
    m_blockSize = block_size;
    m_gridSize = grid_size;
    m_blockIters = block_iters;
    m_calcList = new CalcList();
  }

  public void gpuMethod(){
    //save the fields into local variables. each read from a field hits global
    //ram, while a local variable most likely lives in a register.
    int block_size = m_blockSize;
    int grid_size = m_gridSize;
    int block_iters = m_blockIters;

    //getBlockIdxx is CUDA blockIdx.x
    //getThreadIdxx is CUDA threadIdx.x
    int block_idxx = RootbeerGpu.getBlockIdxx();
    int thread_idxx = RootbeerGpu.getThreadIdxx();

    //the Rootbeer thread index is one-dimensional right now. convert it to a
    //two-dimensional index within a 32x32 tile.
    int thread_row = thread_idxx / 32;
    int thread_col = thread_idxx % 32;

    //save the arrays into local variables. the array elements still live in
    //global ram, but at least the pointers are now local.
    float[] a = m_a;
    float[] b = m_b;
    float[] c = m_c;

    //number of 32x32 sub-matrices in a block_size x block_size block
    int sub_matrix_size = block_size / 32;
    sub_matrix_size *= sub_matrix_size;

    //number of 32x32 sub-matrices along one edge of the block
    int m_size = block_size / 32;

    for(int block_iter = 0; block_iter < block_iters; ++block_iter){
      for(int sub_matrix = 0; sub_matrix < sub_matrix_size; ++sub_matrix){
        float sum = 0;
        int sub_matrix_row = sub_matrix / m_size;
        int sub_matrix_col = sub_matrix % m_size;
        int dest_row = (32 * sub_matrix_row) + thread_row;
        int dest_col = (32 * sub_matrix_col) + thread_col;
        int dest_index = (block_iter * block_size * block_size * grid_size) +
            (block_idxx * block_size * block_size) +
            (dest_row * block_size) + dest_col;
        for(int m = 0; m < m_size; ++m){
          int a_src_row = (sub_matrix_row * 32) + thread_row;
          int a_src_col = (m * 32) + thread_col;
          int a_src = (a_src_row * block_size) + a_src_col;

          int b_src_row = (m * 32) + thread_row;
          int b_src_col = (sub_matrix_col * 32) + thread_col;
          int b_src = (b_src_row * block_size) + b_src_col;

          float a_value = a[a_src];
          float b_value = b[b_src];

          //store a_value into shared memory at location shared_a[threadIdx.x].
          //each thread loads a single value from global ram into shared ram,
          //and later in the loop every thread reads all of the values placed
          //in shared ram. fetches from global ram take 200-300 clock cycles,
          //while fetches from shared ram take 2-3 clock cycles. if each thread
          //fetches a single value from global memory and all of the values are
          //staged in shared memory, most of the reads take 2-3 clock cycles
          //rather than 200-300.
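          //shared-memory layout used below (inferred from the byte offsets,
          //assuming a launch with 32 * 32 = 1024 threads per block):
          //  bytes 0..4095    hold shared_a, 1024 floats, one per thread
          //  bytes 4096..8191 hold shared_b, 1024 floats, one per thread
          //setSharedFloat/getSharedFloat take byte offsets, hence the * 4.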
          RootbeerGpu.setSharedFloat(thread_idxx * 4, a_value);

          //store b_value into shared memory at location shared_b[threadIdx.x]
          RootbeerGpu.setSharedFloat((1024 + thread_idxx) * 4, b_value);

          //sync the threads within the block
          RootbeerGpu.syncthreads();

          //loop over all of shared_a[] and shared_b[]
          for(int k = 0; k < 32; ++k){
            //read a_value from shared_a[thread_row][k]
            a_value = RootbeerGpu.getSharedFloat((thread_row * 32 + k) * 4);

            //read b_value from shared_b[k][thread_col]
            b_value = RootbeerGpu.getSharedFloat((1024 + k * 32 + thread_col) * 4);

            //multiply a_value and b_value and accumulate
            sum += a_value * b_value;
          }

          //sync the threads within the block
          RootbeerGpu.syncthreads();
        }

        //increment c[dest_index] by the sum
        c[dest_index] += sum;
      }
    }
  }
}
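/*
 * Host-side launch sketch (not part of the original example; a minimal
 * illustration only). It assumes the simple list-of-kernels interface from
 * the Rootbeer README, where each Kernel instance maps to one GPU thread;
 * the exact host API varies across Rootbeer versions. It also assumes
 * MatrixApp-style sizing: a and b each hold block_size * block_size floats,
 * and c holds block_iters * grid_size * block_size * block_size floats,
 * matching the dest_index computation in gpuMethod above.
 *
 *   //one Kernel instance per GPU thread: grid_size blocks of 32*32 threads
 *   List<Kernel> jobs = new ArrayList<Kernel>();
 *   for(int i = 0; i < grid_size * 32 * 32; ++i){
 *     jobs.add(new MatrixKernel(a, b, c, block_size, grid_size, block_iters));
 *   }
 *   Rootbeer rootbeer = new Rootbeer();
 *   rootbeer.run(jobs);
 */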