package jcuda.jcublas.ops;

import org.apache.commons.math3.util.Pair;
import org.junit.Assume;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.nd4j.jita.allocator.impl.AllocationPoint;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.jita.conf.CudaEnvironment;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.util.DataTypeUtil;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;

/**
 * Tests for {@code Nd4j.averageAndPropagate} on the CUDA backend, both with all
 * inputs created from one thread and with inputs created from separate threads.
 *
 * @author raver119@gmail.com
 */
public class AveragingTests {

    /** Number of input arrays (and, in the multi-device test, of creator threads). */
    private static final int THREADS = 16;

    /** Number of elements in each input array. */
    private static final int LENGTH = 512000 * 4;

    /** Tolerance used when comparing every element against the propagated mean. */
    private static final float PROPAGATION_DELTA = 0.01f;

    /**
     * Switches the context data type to FLOAT and configures the CUDA environment
     * (multi-GPU, cross-device access, debug/verbose output, grid/block limits)
     * before each test.
     */
    @Before
    public void setUp() {
        DataTypeUtil.setDTypeForContext(DataBuffer.Type.FLOAT);

        CudaEnvironment.getInstance().getConfiguration()
                .allowMultiGPU(true)
                .allowCrossDeviceAccess(true)
                .enableDebug(true)
                .setMaximumGridSize(512)
                .setMaximumBlockSize(256)
                .setVerbose(true);
    }

    /**
     * Compares {@code dup().reshape(...)} of a permuted array against assigning the
     * same permuted array into a freshly created 'c'-ordered array of the target
     * shape, printing the wall-clock time of both paths. Ignored by default; it
     * allocates 100,000,000 elements.
     */
    @Test
    @Ignore
    public void testReshape() {
        INDArray a = Nd4j.linspace(0, 1000, 100000000).reshape(1000, 1000, 100).permutei(0, 2, 1);

        long startDup = System.nanoTime();
        INDArray a2 = a.dup().reshape(500, 2000, 100);
        System.out.println(String.format("Dup time: %.3f s", (System.nanoTime() - startDup) / 1000000000.));

        int[] newShape = new int[] {500, 2000, 100};

        long startTime = System.nanoTime();
        INDArray b = Nd4j.createUninitialized(newShape, 'c').assign(a);
        System.out.println(String.format("Assign: %.3f s", (System.nanoTime() - startTime) / 1000000000.));

        assertEquals(a2, b);
    }

    /**
     * Averages sixteen arrays filled with 1.0 .. 16.0, all created from the test
     * thread, and verifies that both the returned array and the inputs hold the
     * mean (8.5) afterwards.
     *
     * @throws Exception if the averaging call fails
     */
    @Test
    public void testSingleDeviceAveraging() throws Exception {
        // Input i is filled with the constant (i + 1), so the expected mean is (1 + 16) / 2 = 8.5.
        INDArray[] inputs = new INDArray[THREADS];
        for (int i = 0; i < THREADS; i++) {
            inputs[i] = Nd4j.valueArrayOf(LENGTH, (double) (i + 1));
        }

        long time1 = System.currentTimeMillis();
        INDArray arrayMean = Nd4j.averageAndPropagate(inputs);
        long time2 = System.currentTimeMillis();
        System.out.println("Execution time: " + (time2 - time1));

        assertNotEquals(null, arrayMean);

        assertEquals(8.5f, arrayMean.getFloat(12), 0.1f);
        assertEquals(8.5f, arrayMean.getFloat(150), 0.1f);
        assertEquals(8.5f, arrayMean.getFloat(475), 0.1f);

        // The mean must have been written back into every input array.
        for (int i = 0; i < THREADS; i++) {
            assertEquals("Input array [" + i + "] was not updated", 8.5f, inputs[i].getFloat(475), 0.1f);
        }
    }

    /**
     * Creates one array per thread, checks that at least one of them was allocated
     * on a device other than device 0, then averages them and verifies that every
     * element of every input equals the mean.
     *
     * <p>This test is only meaningful on a multi-GPU system; when fewer than two
     * devices are reported it is skipped instead of failing.
     *
     * @throws Exception if a worker thread cannot be joined or averaging fails
     */
    @Test
    public void testMultiDeviceAveraging() throws Exception {
        int numDevices = Nd4j.getAffinityManager().getNumberOfDevices();
        Assume.assumeTrue(numDevices > 1);

        final List<Pair<INDArray, INDArray>> pairs = new ArrayList<>();
        AtomicAllocator allocator = AtomicAllocator.getInstance();

        // Threads are started and joined one at a time, so the plain ArrayList is
        // never mutated concurrently and join() publishes each write to this thread.
        for (int i = 0; i < THREADS; i++) {
            final int order = i;
            Thread thread = new Thread(new Runnable() {
                @Override
                public void run() {
                    pairs.add(new Pair<INDArray, INDArray>(Nd4j.valueArrayOf(LENGTH, (double) order), null));

                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        // Preserve the interrupt status instead of silently dropping it.
                        Thread.currentThread().interrupt();
                    }
                }
            });

            thread.start();
            thread.join();
        }

        assertEquals(THREADS, pairs.size());

        final List<INDArray> arrays = new ArrayList<>();
        AtomicBoolean hasNonZero = new AtomicBoolean(false);

        for (int i = 0; i < THREADS; i++) {
            INDArray array = pairs.get(i).getKey();
            AllocationPoint point = allocator.getAllocationPoint(array.data());

            if (point.getDeviceId() != 0) {
                hasNonZero.set(true);
            }

            arrays.add(array);
        }

        assertEquals("Expected at least one array on a device other than 0", true, hasNonZero.get());

        // Note: this used to be done by summing into a fresh array (addi) and dividing
        // (divi) by THREADS, which did not propagate the result back to the inputs.
        long time1 = System.currentTimeMillis();
        INDArray z = Nd4j.averageAndPropagate(arrays);
        long time2 = System.currentTimeMillis();
        System.out.println("Execution time: " + (time2 - time1));

        // Inputs are filled with 0 .. 15, so the mean is 7.5.
        assertEquals(7.5f, z.getFloat(0), 0.01f);
        assertEquals(7.5f, z.getFloat(10), 0.01f);

        final float expected = z.getFloat(0);
        for (int i = 0; i < THREADS; i++) {
            INDArray current = arrays.get(i);
            for (int x = 0; x < LENGTH; x++) {
                float actual = current.getFloat(x);
                // Fast path: exactly equal values always satisfy the tolerance check, so the
                // failure message is only built for elements that actually need assertEquals.
                if (expected != actual) {
                    assertEquals("Failed on array [" + i + "], element [" + x + "]",
                            expected, actual, PROPAGATION_DELTA);
                }
            }
        }
    }
}