package com.compomics.util.math.statistics.distributions; import java.util.ArrayList; import java.util.Arrays; import umontreal.iro.lecuyer.gof.KernelDensity; import umontreal.iro.lecuyer.probdist.EmpiricalDist; import umontreal.iro.lecuyer.probdist.NormalDist; import umontreal.iro.lecuyer.randvar.KernelDensityGen; import umontreal.iro.lecuyer.randvar.NormalGen; import umontreal.iro.lecuyer.rng.MRG31k3p; import umontreal.iro.lecuyer.rng.RandomStream; /** * This class makes use of "SSJ: Stochastic Simulation in Java" library from * iro.umontreal.ca to estimate probability density function of an array of * double. It first generates independent and identically distributed random * variables from the dataset, at which the density needs to be computed and * then generates the vector of density estimates at the corresponding * variables. * * The KernelDensityGen class from the same library is used: the class * implements random variate generators for distributions obtained via kernel * density estimation methods from a set of n individual observations x1,..., * xn. The basic idea is to center a copy of the same symmetric density at each * observation and take an equally weighted mixture of the n copies as an * estimator of the density from which the observations come. The resulting * kernel density has the general form: fn(x) = (1/nh)?i=1nk((x - xi)/h). K is * the kernel (here a Gaussian is chosen) and h is the bandwidth (smoothing * factor). * * @author Paola Masuzzo */ public class NormalKernelDensityEstimator { /** * N, estimation precision, is set to a default of 512, as in most KDE * algorithms default values, i.e. R "density"function, OmicSoft, Matlab * algorithm. */ private final int n = 4096; /** * The empirical distribution. */ private EmpiricalDist empiricalDist; /** * The kernel density generator. */ private KernelDensityGen kernelDensityGen; /** * The dataset size. */ private double datasetSize; /** * This method initiates the KDE, i.e. sort values in ascending order, * compute an empirical distribution out of it, makes use of a NormalGen to * generate random variates from the normal distribution, and then use these * variates to generate a kernel density generator of the empirical * distribution. * * @param data the data */ private void init(double[] data) { datasetSize = (double) data.length; Arrays.sort(data); empiricalDist = new EmpiricalDist(data); // new Stream to randomly generate numbers // combined multiple recursive generator (CMRG) RandomStream stream = new MRG31k3p(); NormalGen normalKernelDensityGen = new NormalGen(stream); kernelDensityGen = new KernelDensityGen(stream, empiricalDist, normalKernelDensityGen); } /** * Estimate the density function. * * @param data the data * @return the estimated density function */ public ArrayList estimateDensityFunction(Double[] data) { init(excludeNullValues(data)); // init the KDE with a normal generator return estimateDensityFunction(); } /** * Estimate the density function. * * @param data the data * @return the estimated density function */ public ArrayList estimateDensityFunction(double[] data) { init(data); // init the KDE with a normal generator return estimateDensityFunction(); } /** * Estimate the density function. * * @return the estimated density function */ private ArrayList estimateDensityFunction() { ArrayList densityFunction = new ArrayList(); // array for random samples double[] randomSamples = new double[n]; // compute x values for (int i = 0; i < n; i++) { double nextDouble = kernelDensityGen.nextDouble(); randomSamples[i] = nextDouble; } Arrays.sort(randomSamples); densityFunction.add(randomSamples); // compute y values // use normal default kernel NormalDist kern = new NormalDist(); // calculate optimal bandwidth with the (ROBUST) Silverman's "rule of thumb" (Scott Variation uses factor = 1.06) double bandWidth = 0.99 * Math.min(empiricalDist.getSampleStandardDeviation(), (empiricalDist.getInterQuartileRange() / 1.34)) / Math.pow(datasetSize, 0.2); // estimate density and store values in a vector double[] estimatedDensityValues = KernelDensity.computeDensity(empiricalDist, kern, bandWidth, randomSamples); densityFunction.add(estimatedDensityValues); return densityFunction; } /** * Exclude null values from an array of double. * * @param data the data * @return another double array with no longer null values */ public double[] excludeNullValues(Double[] data) { ArrayList<Double> list = new ArrayList<Double>(); for (Double value : data) { if (value != null) { list.add(value); } } double[] newArray = new double[list.size()]; for (int i = 0; i < list.size(); i++) { newArray[i] = list.get(i); } return newArray; } }