/*-
* #%L
* Fiji distribution of ImageJ for the life sciences.
* %%
* Copyright (C) 2007 - 2017 Fiji developers.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
package spim.process.interestpointdetection;
import java.util.List;
import mpicbg.imglib.algorithm.OutputAlgorithm;
import mpicbg.imglib.image.Image;
import mpicbg.imglib.type.numeric.real.FloatType;
import mpicbg.imglib.util.Util;
import mpicbg.imglib.wrapper.ImgLib2;
import mpicbg.spim.io.IOFunctions;
import net.imglib2.Interval;
import net.imglib2.RandomAccessible;
import net.imglib2.img.Img;
import net.imglib2.img.array.ArrayImg;
import net.imglib2.img.array.ArrayImgs;
import net.imglib2.img.basictypeaccess.array.FloatArray;
import net.imglib2.view.Views;
import spim.process.cuda.Block;
import spim.process.cuda.BlockGenerator;
import spim.process.cuda.BlockGeneratorVariableSizePrecise;
import spim.process.cuda.BlockGeneratorVariableSizeSimple;
import spim.process.cuda.CUDADevice;
import spim.process.cuda.CUDASeparableConvolution;
import spim.process.cuda.CUDASeparableConvolutionFunctions;
import spim.process.cuda.CUDASeparableConvolutionFunctions.OutOfBounds;
public class DifferenceOfGaussianCUDA extends DifferenceOfGaussianNewPeakFinder
{
final Img< net.imglib2.type.numeric.real.FloatType > img2;
final List< CUDADevice > devList;
final CUDASeparableConvolution cuda;
final boolean accurate;
final double percentGPUMem;
final CUDADevice cudaDev1, cudaDev2;
public DifferenceOfGaussianCUDA(
final CUDASeparableConvolution cuda,
final double percentGPUMem,
final List< CUDADevice > devList,
final Image< FloatType > img1,
final Img< net.imglib2.type.numeric.real.FloatType > img2,
final boolean accurate,
double[] sigma1, double[] sigma2, double minPeakValue,
double normalizationFactor)
{
super( img1, null, sigma1, sigma2, minPeakValue, normalizationFactor );
this.img2 = img2;
this.percentGPUMem = percentGPUMem;
this.devList = devList;
this.cuda = cuda;
this.accurate = accurate;
if ( devList.size() > 1 )
{
this.setComputeConvolutionsParalell( true );
this.cudaDev1 = devList.get( 0 );
this.cudaDev2 = devList.get( 1 );
}
else
{
this.setComputeConvolutionsParalell( false );
this.cudaDev1 = this.cudaDev2 = devList.get( 0 );
}
}
int countCUDA = 0;
@Override
protected OutputAlgorithm< FloatType > getGaussianConvolution( final double[] sigma, final int numThreads )
{
if ( countCUDA == 0 )
{
countCUDA = 1;
return new CUDAOutput( img2, percentGPUMem, cudaDev1, cuda, accurate, sigma );
}
else
{
countCUDA = 0;
return new CUDAOutput( img2, percentGPUMem, cudaDev2, cuda, accurate, sigma );
}
}
public static class CUDAOutput implements OutputAlgorithm< FloatType >
{
final Img< net.imglib2.type.numeric.real.FloatType > img, result;
final CUDADevice cudaDevice;
final CUDASeparableConvolutionFunctions cudaconvolve;
final boolean accurate;
final double[] sigma;
final double percentGPUMem;
public CUDAOutput(
final Img< net.imglib2.type.numeric.real.FloatType > img,
final double percentGPUMem,
final CUDADevice cudaDevice,
final CUDASeparableConvolution cuda,
final boolean accurate,
final double[] sigma )
{
this.img = img;
this.percentGPUMem = percentGPUMem;
this.result = img.factory().create( img, new net.imglib2.type.numeric.real.FloatType() );
this.cudaDevice = cudaDevice;
this.accurate = accurate;
this.sigma = sigma;
this.cudaconvolve = new CUDASeparableConvolutionFunctions( cuda, cudaDevice.getDeviceId() );
}
@Override
public boolean checkInput() { return true; }
@Override
public boolean process()
{
// do not operate at the edge, 80% of the memory is a good idea I think
final long memAvail = Math.round( cudaDevice.getFreeDeviceMemory() * ( percentGPUMem / 100.0 ) );
final long imgBytes = numPixels() * 4 * 2; // float, two images on the card at once
final long[] numBlocksDim = net.imglib2.util.Util.int2long( computeNumBlocksDim( memAvail, imgBytes, percentGPUMem, img.numDimensions(), "CUDA-Device " + cudaDevice.getDeviceId() ) );
final BlockGenerator< Block > generator;
if ( accurate )
generator = new BlockGeneratorVariableSizePrecise( numBlocksDim );
else
generator = new BlockGeneratorVariableSizeSimple( numBlocksDim );
final Block[] blocks = generator.divideIntoBlocks( getImgSize( img ), getKernelSize( sigma ) );
if ( !accurate && blocks.length == 1 && ArrayImg.class.isInstance( img ) )
{
IOFunctions.println( "Conovlving image as one single block." );
long time = System.currentTimeMillis();
// copy the only directly into the result
blocks[ 0 ].copyBlock( img, result );
long copy = System.currentTimeMillis();
IOFunctions.println( "Copying data took " + ( copy - time ) + "ms" );
// convolve
final float[] resultF = ((FloatArray)((ArrayImg< net.imglib2.type.numeric.real.FloatType, ? > )result).update( null ) ).getCurrentStorageArray();
cudaconvolve.gauss( resultF, getImgSizeInt( result ), sigma, OutOfBounds.EXTEND_BORDER_PIXELS, 0 );
IOFunctions.println( "Convolution took " + ( System.currentTimeMillis() - copy ) + "ms using device=" + cudaDevice.getDeviceName() + " (id=" + cudaDevice.getDeviceId() + ")" );
// no copy back required
}
else
{
final RandomAccessible< net.imglib2.type.numeric.real.FloatType > input;
if ( accurate )
input = Views.extendMirrorSingle( img );
else
input = img;
for( final Block block : blocks )
{
//long time = System.currentTimeMillis();
final ArrayImg< net.imglib2.type.numeric.real.FloatType, FloatArray > imgBlock = ArrayImgs.floats( block.getBlockSize() );
// copy the block
block.copyBlock( input, imgBlock );
//long copy = System.currentTimeMillis();
//IOFunctions.println( "Copying block took " + ( copy - time ) + "ms" );
// convolve
final float[] imgBlockF = ((FloatArray)((ArrayImg< net.imglib2.type.numeric.real.FloatType, ? > )imgBlock).update( null ) ).getCurrentStorageArray();
cudaconvolve.gauss( imgBlockF, getImgSizeInt( imgBlock ), sigma, OutOfBounds.EXTEND_BORDER_PIXELS, 0 );
//long convolve = System.currentTimeMillis();
//IOFunctions.println( "Convolution took " + ( convolve - copy ) + "ms using device=" + cudaDevice.getDeviceName() + " (id=" + cudaDevice.getDeviceId() + ")" );
// no copy back required
block.pasteBlock( result, imgBlock );
//IOFunctions.println( "Pasting block took " + ( System.currentTimeMillis() - convolve ) + "ms" );
}
}
return true;
}
@Override
public String getErrorMessage() { return ""; }
@Override
public Image<FloatType> getResult() { return ImgLib2.wrapFloatToImgLib1( result ); }
protected static long[] getKernelSize( final double[] sigma )
{
final long[] dim = new long[ sigma.length ];
for ( int d = 0; d < sigma.length; ++d )
dim[ d ] = Util.createGaussianKernel1DDouble( sigma[ d ], false ).length;
return dim;
}
public static long[] getImgSize( final Interval img )
{
final long[] dim = new long[ img.numDimensions() ];
for ( int d = 0; d < img.numDimensions(); ++d )
dim[ d ] = img.dimension( d );
return dim;
}
protected static int[] getKernelSizeInt( final double[] sigma )
{
final int[] dim = new int[ sigma.length ];
for ( int d = 0; d < sigma.length; ++d )
dim[ d ] = Util.createGaussianKernel1DDouble( sigma[ d ], false ).length;
return dim;
}
public static int[] getImgSizeInt( final Interval img )
{
final int[] dim = new int[ img.numDimensions() ];
for ( int d = 0; d < img.numDimensions(); ++d )
dim[ d ] = (int)img.dimension( d );
return dim;
}
public static int[] computeNumBlocksDim( final long memAvail, final long memReq, final double percentGPUMem, final int n, final String start )
{
final int numBlocks = (int)( memReq / memAvail + Math.min( 1, memReq % memAvail ) );
final double blocksPerDim = Math.pow( numBlocks, 1 / n );
final int[] numBlocksDim = new int[ n ];
for ( int d = 0; d < numBlocksDim.length; ++d )
numBlocksDim[ d ] = (int)Math.round( Math.floor( blocksPerDim ) ) + 1;
int numBlocksCurrent;
do
{
numBlocksCurrent = numBlocks( numBlocksDim );
for ( int d = 0; d < numBlocksDim.length; ++d )
{
++numBlocksDim[ d ];
reduceBlockNumbers( numBlocksDim, numBlocks );
}
}
while ( numBlocks( numBlocksDim ) < numBlocksCurrent );
if ( start != null )
{
String out =
start + ", mem=" + memAvail / (1024*1024) +
"MB (" + Math.round( percentGPUMem / 100 ) + "%), required mem=" + memReq / (1024*1024) + "MB, need to split up into " + numBlocks + " blocks: ";
for ( int d = 0; d < numBlocksDim.length; ++d )
{
out += numBlocksDim[ d ];
if ( d != numBlocksDim.length - 1 )
out += "x";
}
IOFunctions.println( out );
}
return numBlocksDim;
}
protected static void reduceBlockNumbers( final int[] numBlocksDim, final int numBlocks )
{
boolean reduced;
do
{
reduced = false;
for ( int d = numBlocksDim.length - 1; d >= 0 ; --d )
{
if ( numBlocksDim[ d ] > 1 )
{
--numBlocksDim[ d ];
if ( numBlocks( numBlocksDim ) < numBlocks )
++numBlocksDim[ d ];
else
reduced = true;
}
}
}
while ( reduced );
}
protected static int numBlocks( final int[] numBlocksDim )
{
int numBlocks = 1;
for ( int d = 0; d < numBlocksDim.length; ++d )
numBlocks *= numBlocksDim[ d ];
return numBlocks;
}
protected long numPixels()
{
if ( accurate )
{
long size = 1;
for ( int d = 0; d < img.numDimensions(); ++d )
size *= img.dimension( d ) + Util.createGaussianKernel1DDouble( sigma[ d ], false ).length - 1;
return size;
}
else
{
return img.size();
}
}
}
public static void main( String[] args )
{
for ( int i = 1; i < 20; ++i )
CUDAOutput.computeNumBlocksDim( 1024l * 1024l*1024l, i * 1000l * 1024l*1024l, 80, 3, "" );
}
}