package rootbeer.examples.gtc2013; import org.trifort.rootbeer.runtime.util.Stopwatch; import org.trifort.rootbeer.runtime.Rootbeer; import org.trifort.rootbeer.runtime.Kernel; import org.trifort.rootbeer.runtime.StatsRow; import org.trifort.rootbeer.runtime.ThreadConfig; import java.util.List; import java.util.ArrayList; //see: http://www.shodor.org/media/content//petascale/materials/UPModules/matrixMultiplication/moduleDocument.pdf public class MatrixApp { private float m_a[]; private float m_bgpu[]; private float m_bcpu[]; private float m_bcpu2[]; private float m_cgpu[]; private float m_ccpu[]; private float m_ccpu2[]; private int m_blockSize; private int m_gridSize; private int m_blockIters; private Stopwatch m_cpuWatch; private Stopwatch m_gpuWatch; private Stopwatch m_transposeWatch; public MatrixApp(){ m_cpuWatch = new Stopwatch(); m_gpuWatch = new Stopwatch(); m_transposeWatch = new Stopwatch(); } public void init(){ m_blockIters = 256; m_blockSize = 256; m_gridSize = 14; m_a = new float[m_blockSize*m_blockSize]; m_bcpu = new float[m_blockSize*m_blockSize*m_gridSize*m_blockIters]; m_bcpu2 = new float[m_blockSize*m_blockSize*m_gridSize*m_blockIters]; m_bgpu = new float[m_blockSize*m_blockSize*m_gridSize*m_blockIters]; m_ccpu = new float[m_blockSize*m_blockSize*m_gridSize*m_blockIters]; m_ccpu2 = new float[m_blockSize*m_blockSize*m_gridSize*m_blockIters]; m_cgpu = new float[m_blockSize*m_blockSize*m_gridSize*m_blockIters]; for(int i = 0; i < m_a.length; ++i){ m_a[i] = i % 3; } for(int i = 0; i < m_bgpu.length; ++i){ m_bgpu[i] = i % 3; m_bcpu2[i] = i % 3; } m_transposeWatch.start(); for(int i = 0; i < m_bgpu.length; ++i){ int row = i / (m_blockSize*m_gridSize*m_blockIters); int col = i % (m_blockSize*m_gridSize*m_blockIters); int dest = col * m_blockSize + row; m_bcpu[dest] = m_bgpu[i]; } m_transposeWatch.stop(); System.out.println("transpose time: "+m_transposeWatch.getAverageTime()+" ms"); } private void printMatrix(int[] matrix, int block_size, String heading){ System.out.println(heading); int row_count = 0; for(int i = 0; i < matrix.length; ++i){ System.out.print(matrix[i]+" "); row_count++; if(row_count == block_size){ row_count = 0; System.out.println(); } } } private void printRow(int[] matrix, int block_size, int row){ System.out.println("row: "+row); int start = row * block_size; for(int i = 0; i < block_size; ++i){ System.out.print(matrix[start+i]); } System.out.println(); } private void printCol(int[] matrix, int block_size, int col){ System.out.println("col: "+col); for(int i = 0; i < block_size; ++i){ System.out.print(matrix[(i * block_size) + col]); } System.out.println(); } private void cpuRun(){ int num_cores = Runtime.getRuntime().availableProcessors(); m_cpuWatch.start(); List<MatrixCpuThread> threads = new ArrayList<MatrixCpuThread>(); for(int i = 0; i < num_cores; ++i){ MatrixCpuThread thread = new MatrixCpuThread(m_a, m_bcpu, m_ccpu, i, m_blockSize, m_gridSize*m_blockIters, num_cores, true); threads.add(thread); } for(int i = 0; i < num_cores; ++i){ MatrixCpuThread thread = threads.get(i); thread.join(); } m_cpuWatch.stop(); System.out.println("avg cpu time: "+m_cpuWatch.getAverageTime()+" ms"); //runs on cpu without transpose //threads = new ArrayList<MatrixCpuThread>(); //for(int i = 0; i < num_cores; ++i){ // MatrixCpuThread thread = new MatrixCpuThread(m_a, m_bcpu2, m_ccpu2, i, // m_blockSize, m_gridSize*m_blockIters, num_cores, false); // threads.add(thread); //} //for(int i = 0; i < num_cores; ++i){ // MatrixCpuThread thread = threads.get(i); // thread.join(); //} } private void gpuRun(){ m_gpuWatch.start(); MatrixKernel matrix_kernel = new MatrixKernel(m_a, m_bgpu, m_cgpu, m_blockSize, m_gridSize, m_blockIters); Rootbeer rootbeer = new Rootbeer(); ThreadConfig thread_config = new ThreadConfig(1024, m_gridSize, 1024 * m_gridSize); rootbeer.run(matrix_kernel, thread_config); m_gpuWatch.stop(); System.out.println("avg gpu time: "+m_gpuWatch.getAverageTime()+" ms"); List<Calculation> calc_list = matrix_kernel.m_calcList.getList(); for(Calculation calc : calc_list){ if(calc == null){ continue; } System.out.println(calc.toString()); } //List<StatsRow> stats = rootbeer.getStats(); //for(StatsRow row : stats){ // System.out.println(" StatsRow:"); // System.out.println(" init time: "+row.getInitTime()); // System.out.println(" serial time: "+row.getSerializationTime()); // System.out.println(" exec time: "+row.getExecutionTime()); // System.out.println(" deserial time: "+row.getDeserializationTime()); // System.out.println(" num blocks: "+row.getNumBlocks()); // System.out.println(" num threads: "+row.getNumThreads()); //} } private void verifyCpuTranspose(){ for(int i = 0; i < m_ccpu.length; ++i){ float cpu_value = m_ccpu[i]; float cpu_value2 = m_ccpu2[i]; if(cpu_value != cpu_value2){ System.out.println("Verify Failed."); System.out.println(" cpu_value: "+cpu_value); System.out.println(" cpu_value2: "+cpu_value2); System.out.println(" index: "+i); System.exit(1); return; } } System.out.println("Verify PASSED!"); } private void verify(){ for(int i = 0; i < m_ccpu.length; ++i){ float cpu_value = m_ccpu[i]; float gpu_value = m_cgpu[i]; if(cpu_value != gpu_value){ System.out.println("Verify Failed."); System.out.println(" cpu_value: "+cpu_value); System.out.println(" gpu_value: "+gpu_value); System.out.println(" index: "+i); System.exit(1); return; } } System.out.println("Verify PASSED!"); } public void run(){ for(int i = 0; i < 50; ++i){ init(); cpuRun(); //verifyCpuTranspose(); gpuRun(); verify(); } } public static void main(String[] args){ MatrixApp app = new MatrixApp(); app.run(); } }