/*- * #%L * Fiji distribution of ImageJ for the life sciences. * %% * Copyright (C) 2007 - 2017 Fiji developers. * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ package spim.process.interestpointdetection; import java.util.List; import mpicbg.imglib.algorithm.OutputAlgorithm; import mpicbg.imglib.image.Image; import mpicbg.imglib.type.numeric.real.FloatType; import mpicbg.imglib.util.Util; import mpicbg.imglib.wrapper.ImgLib2; import mpicbg.spim.io.IOFunctions; import net.imglib2.Interval; import net.imglib2.RandomAccessible; import net.imglib2.img.Img; import net.imglib2.img.array.ArrayImg; import net.imglib2.img.array.ArrayImgs; import net.imglib2.img.basictypeaccess.array.FloatArray; import net.imglib2.view.Views; import spim.process.cuda.Block; import spim.process.cuda.BlockGenerator; import spim.process.cuda.BlockGeneratorVariableSizePrecise; import spim.process.cuda.BlockGeneratorVariableSizeSimple; import spim.process.cuda.CUDADevice; import spim.process.cuda.CUDASeparableConvolution; import spim.process.cuda.CUDASeparableConvolutionFunctions; import spim.process.cuda.CUDASeparableConvolutionFunctions.OutOfBounds; public class DifferenceOfGaussianCUDA extends DifferenceOfGaussianNewPeakFinder { final Img< net.imglib2.type.numeric.real.FloatType > img2; final List< CUDADevice > devList; final CUDASeparableConvolution cuda; final boolean accurate; final double percentGPUMem; final CUDADevice cudaDev1, cudaDev2; public DifferenceOfGaussianCUDA( final CUDASeparableConvolution cuda, final double percentGPUMem, final List< CUDADevice > devList, final Image< FloatType > img1, final Img< net.imglib2.type.numeric.real.FloatType > img2, final boolean accurate, double[] sigma1, double[] sigma2, double minPeakValue, double normalizationFactor) { super( img1, null, sigma1, sigma2, minPeakValue, normalizationFactor ); this.img2 = img2; this.percentGPUMem = percentGPUMem; this.devList = devList; this.cuda = cuda; this.accurate = accurate; if ( devList.size() > 1 ) { this.setComputeConvolutionsParalell( true ); this.cudaDev1 = devList.get( 0 ); this.cudaDev2 = devList.get( 1 ); } else { this.setComputeConvolutionsParalell( false ); this.cudaDev1 = this.cudaDev2 = devList.get( 0 ); } } int countCUDA = 0; @Override protected OutputAlgorithm< FloatType > getGaussianConvolution( final double[] sigma, final int numThreads ) { if ( countCUDA == 0 ) { countCUDA = 1; return new CUDAOutput( img2, percentGPUMem, cudaDev1, cuda, accurate, sigma ); } else { countCUDA = 0; return new CUDAOutput( img2, percentGPUMem, cudaDev2, cuda, accurate, sigma ); } } public static class CUDAOutput implements OutputAlgorithm< FloatType > { final Img< net.imglib2.type.numeric.real.FloatType > img, result; final CUDADevice cudaDevice; final CUDASeparableConvolutionFunctions cudaconvolve; final boolean accurate; final double[] sigma; final double percentGPUMem; public CUDAOutput( final Img< net.imglib2.type.numeric.real.FloatType > img, final double percentGPUMem, final CUDADevice cudaDevice, final CUDASeparableConvolution cuda, final boolean accurate, final double[] sigma ) { this.img = img; this.percentGPUMem = percentGPUMem; this.result = img.factory().create( img, new net.imglib2.type.numeric.real.FloatType() ); this.cudaDevice = cudaDevice; this.accurate = accurate; this.sigma = sigma; this.cudaconvolve = new CUDASeparableConvolutionFunctions( cuda, cudaDevice.getDeviceId() ); } @Override public boolean checkInput() { return true; } @Override public boolean process() { // do not operate at the edge, 80% of the memory is a good idea I think final long memAvail = Math.round( cudaDevice.getFreeDeviceMemory() * ( percentGPUMem / 100.0 ) ); final long imgBytes = numPixels() * 4 * 2; // float, two images on the card at once final long[] numBlocksDim = net.imglib2.util.Util.int2long( computeNumBlocksDim( memAvail, imgBytes, percentGPUMem, img.numDimensions(), "CUDA-Device " + cudaDevice.getDeviceId() ) ); final BlockGenerator< Block > generator; if ( accurate ) generator = new BlockGeneratorVariableSizePrecise( numBlocksDim ); else generator = new BlockGeneratorVariableSizeSimple( numBlocksDim ); final Block[] blocks = generator.divideIntoBlocks( getImgSize( img ), getKernelSize( sigma ) ); if ( !accurate && blocks.length == 1 && ArrayImg.class.isInstance( img ) ) { IOFunctions.println( "Conovlving image as one single block." ); long time = System.currentTimeMillis(); // copy the only directly into the result blocks[ 0 ].copyBlock( img, result ); long copy = System.currentTimeMillis(); IOFunctions.println( "Copying data took " + ( copy - time ) + "ms" ); // convolve final float[] resultF = ((FloatArray)((ArrayImg< net.imglib2.type.numeric.real.FloatType, ? > )result).update( null ) ).getCurrentStorageArray(); cudaconvolve.gauss( resultF, getImgSizeInt( result ), sigma, OutOfBounds.EXTEND_BORDER_PIXELS, 0 ); IOFunctions.println( "Convolution took " + ( System.currentTimeMillis() - copy ) + "ms using device=" + cudaDevice.getDeviceName() + " (id=" + cudaDevice.getDeviceId() + ")" ); // no copy back required } else { final RandomAccessible< net.imglib2.type.numeric.real.FloatType > input; if ( accurate ) input = Views.extendMirrorSingle( img ); else input = img; for( final Block block : blocks ) { //long time = System.currentTimeMillis(); final ArrayImg< net.imglib2.type.numeric.real.FloatType, FloatArray > imgBlock = ArrayImgs.floats( block.getBlockSize() ); // copy the block block.copyBlock( input, imgBlock ); //long copy = System.currentTimeMillis(); //IOFunctions.println( "Copying block took " + ( copy - time ) + "ms" ); // convolve final float[] imgBlockF = ((FloatArray)((ArrayImg< net.imglib2.type.numeric.real.FloatType, ? > )imgBlock).update( null ) ).getCurrentStorageArray(); cudaconvolve.gauss( imgBlockF, getImgSizeInt( imgBlock ), sigma, OutOfBounds.EXTEND_BORDER_PIXELS, 0 ); //long convolve = System.currentTimeMillis(); //IOFunctions.println( "Convolution took " + ( convolve - copy ) + "ms using device=" + cudaDevice.getDeviceName() + " (id=" + cudaDevice.getDeviceId() + ")" ); // no copy back required block.pasteBlock( result, imgBlock ); //IOFunctions.println( "Pasting block took " + ( System.currentTimeMillis() - convolve ) + "ms" ); } } return true; } @Override public String getErrorMessage() { return ""; } @Override public Image<FloatType> getResult() { return ImgLib2.wrapFloatToImgLib1( result ); } protected static long[] getKernelSize( final double[] sigma ) { final long[] dim = new long[ sigma.length ]; for ( int d = 0; d < sigma.length; ++d ) dim[ d ] = Util.createGaussianKernel1DDouble( sigma[ d ], false ).length; return dim; } public static long[] getImgSize( final Interval img ) { final long[] dim = new long[ img.numDimensions() ]; for ( int d = 0; d < img.numDimensions(); ++d ) dim[ d ] = img.dimension( d ); return dim; } protected static int[] getKernelSizeInt( final double[] sigma ) { final int[] dim = new int[ sigma.length ]; for ( int d = 0; d < sigma.length; ++d ) dim[ d ] = Util.createGaussianKernel1DDouble( sigma[ d ], false ).length; return dim; } public static int[] getImgSizeInt( final Interval img ) { final int[] dim = new int[ img.numDimensions() ]; for ( int d = 0; d < img.numDimensions(); ++d ) dim[ d ] = (int)img.dimension( d ); return dim; } public static int[] computeNumBlocksDim( final long memAvail, final long memReq, final double percentGPUMem, final int n, final String start ) { final int numBlocks = (int)( memReq / memAvail + Math.min( 1, memReq % memAvail ) ); final double blocksPerDim = Math.pow( numBlocks, 1 / n ); final int[] numBlocksDim = new int[ n ]; for ( int d = 0; d < numBlocksDim.length; ++d ) numBlocksDim[ d ] = (int)Math.round( Math.floor( blocksPerDim ) ) + 1; int numBlocksCurrent; do { numBlocksCurrent = numBlocks( numBlocksDim ); for ( int d = 0; d < numBlocksDim.length; ++d ) { ++numBlocksDim[ d ]; reduceBlockNumbers( numBlocksDim, numBlocks ); } } while ( numBlocks( numBlocksDim ) < numBlocksCurrent ); if ( start != null ) { String out = start + ", mem=" + memAvail / (1024*1024) + "MB (" + Math.round( percentGPUMem / 100 ) + "%), required mem=" + memReq / (1024*1024) + "MB, need to split up into " + numBlocks + " blocks: "; for ( int d = 0; d < numBlocksDim.length; ++d ) { out += numBlocksDim[ d ]; if ( d != numBlocksDim.length - 1 ) out += "x"; } IOFunctions.println( out ); } return numBlocksDim; } protected static void reduceBlockNumbers( final int[] numBlocksDim, final int numBlocks ) { boolean reduced; do { reduced = false; for ( int d = numBlocksDim.length - 1; d >= 0 ; --d ) { if ( numBlocksDim[ d ] > 1 ) { --numBlocksDim[ d ]; if ( numBlocks( numBlocksDim ) < numBlocks ) ++numBlocksDim[ d ]; else reduced = true; } } } while ( reduced ); } protected static int numBlocks( final int[] numBlocksDim ) { int numBlocks = 1; for ( int d = 0; d < numBlocksDim.length; ++d ) numBlocks *= numBlocksDim[ d ]; return numBlocks; } protected long numPixels() { if ( accurate ) { long size = 1; for ( int d = 0; d < img.numDimensions(); ++d ) size *= img.dimension( d ) + Util.createGaussianKernel1DDouble( sigma[ d ], false ).length - 1; return size; } else { return img.size(); } } } public static void main( String[] args ) { for ( int i = 1; i < 20; ++i ) CUDAOutput.computeNumBlocksDim( 1024l * 1024l*1024l, i * 1000l * 1024l*1024l, 80, 3, "" ); } }